
Python Scrapy Fake Headers Integration

The following are two examples of how to integrate the Fake Browser Headers API and the Fake User-Agent API into your Python Scrapy web scrapers.


Python Scrapy Fake Browser Headers Middleware

The best way to integrate the Fake Browser Headers API is to create a Downloader middleware that adds fake browser headers to every request. Here is an example middleware you can use:

## middlewares.py

from urllib.parse import urlencode
from random import randint
import requests

class ScrapeOpsFakeBrowserHeadersMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.scrapeops_api_key = settings.get('SCRAPEOPS_API_KEY')
        self.scrapeops_endpoint = settings.get('SCRAPEOPS_FAKE_HEADERS_ENDPOINT', 'http://headers.scrapeops.io/v1/browser-headers?')
        self.scrapeops_fake_headers_active = settings.get('SCRAPEOPS_FAKE_HEADERS_ENABLED', False)
        self.scrapeops_num_results = settings.get('SCRAPEOPS_NUM_RESULTS')
        self.headers_list = []
        self._get_headers_list()
        self._scrapeops_fake_headers_enabled()

    def _get_headers_list(self):
        ## Fetch a list of fake browser header sets from the ScrapeOps API.
        payload = {'api_key': self.scrapeops_api_key}
        if self.scrapeops_num_results is not None:
            payload['num_results'] = self.scrapeops_num_results
        response = requests.get(self.scrapeops_endpoint, params=urlencode(payload))
        json_response = response.json()
        self.headers_list = json_response.get('result', [])

    def _get_random_header(self):
        random_index = randint(0, len(self.headers_list) - 1)
        return self.headers_list[random_index]

    def _scrapeops_fake_headers_enabled(self):
        ## Only stay active if an API key is set and the feature is enabled in settings.
        if self.scrapeops_api_key is None or self.scrapeops_api_key == '' or self.scrapeops_fake_headers_active == False:
            self.scrapeops_fake_headers_active = False
        else:
            self.scrapeops_fake_headers_active = True

    def process_request(self, request, spider):
        ## Do nothing if the middleware is disabled.
        if not self.scrapeops_fake_headers_active:
            return
        random_header = self._get_random_header()
        for key, val in random_header.items():
            request.headers[key] = val


Note: This middleware example requires the installation of Python Requests via pip install requests.
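If you want to sanity-check your API key and see the header sets the middleware will work with, you can call the same endpoint directly from a short standalone script. This is a minimal sketch; it reuses the endpoint and the 'result' response key that the middleware above relies on, and the exact header fields returned may vary:

## test_fake_headers.py

from urllib.parse import urlencode
import requests

API_KEY = 'YOUR_API_KEY'

response = requests.get(
    'http://headers.scrapeops.io/v1/browser-headers?',
    params=urlencode({'api_key': API_KEY, 'num_results': 2}),
)
response.raise_for_status()

## Each item in 'result' is a dict of header name -> value pairs,
## which the middleware copies onto request.headers one by one.
for header_set in response.json().get('result', []):
    print(header_set)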

Then enable it in your project's settings.py file, remembering to swap YOUR_PROJECT_NAME for the name of your project (the BOT_NAME in your settings.py file):

## settings.py

SCRAPEOPS_API_KEY = 'YOUR_API_KEY'
SCRAPEOPS_FAKE_HEADERS_ENABLED = True

DOWNLOADER_MIDDLEWARES = {
    'YOUR_PROJECT_NAME.middlewares.ScrapeOpsFakeBrowserHeadersMiddleware': 400,
}

Or enable it in the spider itself using the custom_settings attribute:

## your_spider.py

import scrapy
from demo.items import QuoteItem


class QuotesSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["http://quotes.toscrape.com/"]

    ## Enable ScrapeOps Fake Browser API Here
    custom_settings = {
        'SCRAPEOPS_API_KEY': 'YOUR_API_KEY',
        'SCRAPEOPS_FAKE_HEADERS_ENABLED': True,
        'DOWNLOADER_MIDDLEWARES': {
            'YOUR_PROJECT_NAME.middlewares.ScrapeOpsFakeBrowserHeadersMiddleware': 400,
        }
    }

    def parse(self, response):
        pass
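Once the middleware is enabled, you can confirm the fake browser headers are actually being attached by logging the outgoing request headers from the parse callback instead of leaving it empty. A minimal sketch, intended as a drop-in replacement for the parse method above, using Scrapy's standard response.request.headers API:

    def parse(self, response):
        ## Log the headers Scrapy actually sent for this request, including
        ## the fake browser headers injected by the middleware.
        self.logger.info(response.request.headers.to_unicode_dict())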


Python Scrapy Fake User-Agent Middleware

The best way to integrate the Fake User-Agent API is to create a Downloader middleware that adds a fake user-agent to every request. Here is an example middleware you can use:

## middlewares.py

from urllib.parse import urlencode
from random import randint
import requests

class ScrapeOpsFakeUserAgentMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.scrapeops_api_key = settings.get('SCRAPEOPS_API_KEY')
        self.scrapeops_endpoint = settings.get('SCRAPEOPS_FAKE_USER_AGENT_ENDPOINT', 'http://headers.scrapeops.io/v1/user-agents?')
        self.scrapeops_fake_user_agents_active = settings.get('SCRAPEOPS_FAKE_USER_AGENT_ENABLED', False)
        self.scrapeops_num_results = settings.get('SCRAPEOPS_NUM_RESULTS')
        self.user_agents_list = []
        self._get_user_agents_list()
        self._scrapeops_fake_user_agents_enabled()

    def _get_user_agents_list(self):
        ## Fetch a list of fake user-agents from the ScrapeOps API.
        payload = {'api_key': self.scrapeops_api_key}
        if self.scrapeops_num_results is not None:
            payload['num_results'] = self.scrapeops_num_results
        response = requests.get(self.scrapeops_endpoint, params=urlencode(payload))
        json_response = response.json()
        self.user_agents_list = json_response.get('result', [])

    def _get_random_user_agent(self):
        random_index = randint(0, len(self.user_agents_list) - 1)
        return self.user_agents_list[random_index]

    def _scrapeops_fake_user_agents_enabled(self):
        ## Only stay active if an API key is set and the feature is enabled in settings.
        if self.scrapeops_api_key is None or self.scrapeops_api_key == '' or self.scrapeops_fake_user_agents_active == False:
            self.scrapeops_fake_user_agents_active = False
        else:
            self.scrapeops_fake_user_agents_active = True

    def process_request(self, request, spider):
        ## Do nothing if the middleware is disabled.
        if not self.scrapeops_fake_user_agents_active:
            return
        random_user_agent = self._get_random_user_agent()
        request.headers['User-Agent'] = random_user_agent

Note: This middleware example requires the installation of Python Requests via pip install requests.
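You can check the Fake User-Agent API in the same way with a short standalone script. Note that, unlike the browser headers endpoint, each entry in 'result' here is a single user-agent string, which is why the middleware assigns it directly to request.headers['User-Agent']. A minimal sketch reusing the endpoint and response key from the middleware above:

## test_fake_user_agents.py

from urllib.parse import urlencode
import requests

API_KEY = 'YOUR_API_KEY'

response = requests.get(
    'http://headers.scrapeops.io/v1/user-agents?',
    params=urlencode({'api_key': API_KEY, 'num_results': 2}),
)
response.raise_for_status()

## 'result' is a list of user-agent strings; the middleware picks one
## at random and assigns it to request.headers['User-Agent'].
for user_agent in response.json().get('result', []):
    print(user_agent)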

Then enable it in your project's settings.py file, remembering to swap YOUR_PROJECT_NAME for the name of your project (the BOT_NAME in your settings.py file):

## settings.py

SCRAPEOPS_API_KEY = 'YOUR_API_KEY'
SCRAPEOPS_FAKE_USER_AGENT_ENABLED = True

DOWNLOADER_MIDDLEWARES = {
    'YOUR_PROJECT_NAME.middlewares.ScrapeOpsFakeUserAgentMiddleware': 400,
}

Or enable it in the spider itself using the custom_settings attribute:

## your_spider.py

import scrapy
from demo.items import QuoteItem


class QuotesSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["http://quotes.toscrape.com/"]

    ## Enable ScrapeOps Fake User Agent API Here
    custom_settings = {
        'SCRAPEOPS_API_KEY': 'YOUR_API_KEY',
        'SCRAPEOPS_FAKE_USER_AGENT_ENABLED': True,
        'DOWNLOADER_MIDDLEWARES': {
            'YOUR_PROJECT_NAME.middlewares.ScrapeOpsFakeUserAgentMiddleware': 400,
        }
    }

    def parse(self, response):
        pass


API Parameters

The following is a list of API parameters that you can include with your requests to customise the header list response.

Parameter      Description
api_key        This is a required parameter. You can get your free API key here.
num_results    By default the API returns a list of 10 results; you can increase that number by setting num_results. The maximum is 100.
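Both middlewares above also read the optional SCRAPEOPS_NUM_RESULTS Scrapy setting and forward it as the num_results query parameter, so you can control the size of the header or user-agent pool from your settings.py without editing the middleware. A minimal sketch:

## settings.py

SCRAPEOPS_API_KEY = 'YOUR_API_KEY'
SCRAPEOPS_FAKE_HEADERS_ENABLED = True
SCRAPEOPS_NUM_RESULTS = 50  ## fetch a pool of 50 fake headers (max 100)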