Python Scrapy Fake Headers Integration
The following are two examples of how to integrate the Fake Browser Headers API and the Fake User-Agent API into your Python Scrapy-based web scrapers.
Python Scrapy Fake Browser Headers Middleware
The best way to integrate the Fake Browser Headers API is to create a Downloader middleware and have fake browser headers be added to every request. Here is an example middleware you can use:
## middlewares.py

from urllib.parse import urlencode
from random import randint
import requests


class ScrapeOpsFakeBrowserHeadersMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.scrapeops_api_key = settings.get('SCRAPEOPS_API_KEY')
        self.scrapeops_endpoint = settings.get('SCRAPEOPS_FAKE_HEADERS_ENDPOINT', 'http://headers.scrapeops.io/v1/browser-headers?')
        self.scrapeops_fake_headers_active = settings.get('SCRAPEOPS_FAKE_HEADERS_ENABLED', False)
        self.scrapeops_num_results = settings.get('SCRAPEOPS_NUM_RESULTS')
        self.headers_list = []
        self._get_headers_list()
        self._scrapeops_fake_headers_enabled()

    def _get_headers_list(self):
        ## Fetch a list of fake browser header sets from the ScrapeOps API.
        payload = {'api_key': self.scrapeops_api_key}
        if self.scrapeops_num_results is not None:
            payload['num_results'] = self.scrapeops_num_results
        response = requests.get(self.scrapeops_endpoint, params=urlencode(payload))
        json_response = response.json()
        self.headers_list = json_response.get('result', [])

    def _get_random_header(self):
        random_index = randint(0, len(self.headers_list) - 1)
        return self.headers_list[random_index]

    def _scrapeops_fake_headers_enabled(self):
        ## Disable the middleware if no API key is set or the setting is turned off.
        if self.scrapeops_api_key is None or self.scrapeops_api_key == '' or self.scrapeops_fake_headers_active == False:
            self.scrapeops_fake_headers_active = False
        else:
            self.scrapeops_fake_headers_active = True

    def process_request(self, request, spider):
        ## Attach a randomly chosen set of fake browser headers to every outgoing request.
        random_header = self._get_random_header()
        for key, val in random_header.items():
            request.headers[key] = val
Note: This middleware example requires the installation of Python Requests via pip install requests.

Then enable it in your project in the settings.py file, remembering to swap YOUR_PROJECT_NAME for the name of your project (the BOT_NAME in your settings.py file):
## settings.py

SCRAPEOPS_API_KEY = 'YOUR_API_KEY'
SCRAPEOPS_FAKE_HEADERS_ENABLED = True

DOWNLOADER_MIDDLEWARES = {
    'YOUR_PROJECT_NAME.middlewares.ScrapeOpsFakeBrowserHeadersMiddleware': 400,
}
Or in the spider itself, using the custom_settings attribute:
## your_spider.py

import scrapy
from demo.items import QuoteItem


class QuotesSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["http://quotes.toscrape.com/"]

    ## Enable ScrapeOps Fake Browser API Here
    custom_settings = {
        'SCRAPEOPS_API_KEY': 'YOUR_API_KEY',
        'SCRAPEOPS_FAKE_HEADERS_ENABLED': True,
        'DOWNLOADER_MIDDLEWARES': {
            'YOUR_PROJECT_NAME.middlewares.ScrapeOpsFakeBrowserHeadersMiddleware': 400,
        }
    }

    def parse(self, response):
        pass
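If you want to sanity-check your API key and see the shape of the data the middleware consumes before wiring it into a project, you can call the endpoint directly. Below is a minimal standalone sketch, using the same endpoint and 'result' key as the middleware above (the file name is just illustrative):

## check_headers_api.py

import requests

API_KEY = 'YOUR_API_KEY'

response = requests.get(
    'http://headers.scrapeops.io/v1/browser-headers',
    params={'api_key': API_KEY},
)
response.raise_for_status()

## The middleware reads the list of fake browser header sets from the 'result' key.
for header_set in response.json().get('result', []):
    print(header_set)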
Python Scrapy Fake User-Agent Middleware
The best way to integrate the Fake User-Agent API is to create a Downloader middleware and have a fake user-agent be added to every request. Here is an example middleware you can use:
## middlewares.py

from urllib.parse import urlencode
from random import randint
import requests


class ScrapeOpsFakeUserAgentMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.scrapeops_api_key = settings.get('SCRAPEOPS_API_KEY')
        self.scrapeops_endpoint = settings.get('SCRAPEOPS_FAKE_USER_AGENT_ENDPOINT', 'http://headers.scrapeops.io/v1/user-agents?')
        self.scrapeops_fake_user_agents_active = settings.get('SCRAPEOPS_FAKE_USER_AGENT_ENABLED', False)
        self.scrapeops_num_results = settings.get('SCRAPEOPS_NUM_RESULTS')
        self.user_agents_list = []
        self._get_user_agents_list()
        self._scrapeops_fake_user_agents_enabled()

    def _get_user_agents_list(self):
        ## Fetch a list of fake user-agents from the ScrapeOps API.
        payload = {'api_key': self.scrapeops_api_key}
        if self.scrapeops_num_results is not None:
            payload['num_results'] = self.scrapeops_num_results
        response = requests.get(self.scrapeops_endpoint, params=urlencode(payload))
        json_response = response.json()
        self.user_agents_list = json_response.get('result', [])

    def _get_random_user_agent(self):
        random_index = randint(0, len(self.user_agents_list) - 1)
        return self.user_agents_list[random_index]

    def _scrapeops_fake_user_agents_enabled(self):
        ## Disable the middleware if no API key is set or the setting is turned off.
        if self.scrapeops_api_key is None or self.scrapeops_api_key == '' or self.scrapeops_fake_user_agents_active == False:
            self.scrapeops_fake_user_agents_active = False
        else:
            self.scrapeops_fake_user_agents_active = True

    def process_request(self, request, spider):
        ## Attach a randomly chosen fake user-agent to every outgoing request.
        random_user_agent = self._get_random_user_agent()
        request.headers['User-Agent'] = random_user_agent
Note: This middleware example requires the installation of Python Requests via pip install requests.

Then enable it in your project in the settings.py file, remembering to swap YOUR_PROJECT_NAME for the name of your project (the BOT_NAME in your settings.py file):
## settings.py

SCRAPEOPS_API_KEY = 'YOUR_API_KEY'
SCRAPEOPS_FAKE_USER_AGENT_ENABLED = True

DOWNLOADER_MIDDLEWARES = {
    'YOUR_PROJECT_NAME.middlewares.ScrapeOpsFakeUserAgentMiddleware': 400,
}
Or in the spider itself, using the custom_settings attribute:
## your_spider.py

import scrapy
from demo.items import QuoteItem


class QuotesSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["http://quotes.toscrape.com/"]

    ## Enable ScrapeOps Fake User Agent API Here
    custom_settings = {
        'SCRAPEOPS_API_KEY': 'YOUR_API_KEY',
        'SCRAPEOPS_FAKE_USER_AGENT_ENABLED': True,
        'DOWNLOADER_MIDDLEWARES': {
            'YOUR_PROJECT_NAME.middlewares.ScrapeOpsFakeUserAgentMiddleware': 400,
        }
    }

    def parse(self, response):
        pass
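To check that the fake user-agents are actually being applied, one option (a minimal sketch, not part of the ScrapeOps example) is to log the User-Agent header Scrapy attached to the outgoing request from inside the parse callback:

    def parse(self, response):
        ## response.request is the request that produced this response, so its
        ## headers include the User-Agent injected by the middleware.
        self.logger.info(response.request.headers.get('User-Agent'))

Each crawled page should then log the user-agent string that was sent for that request.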
API Parameters
The following is a list of API parameters that you can include with your requests to customise the header list response.
| Parameter | Description |
| --- | --- |
| api_key | This is a required parameter. You can get your free API key here. |
| num_results | By default the API returns a list of 10 results; you can increase that number by changing num_results. The maximum is 100. |
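For example, a direct request that combines both parameters might look like the following sketch (num_results=50 is just an arbitrary value within the 100 maximum; the same parameters apply to the browser-headers endpoint):

## fetch_user_agents.py

import requests

response = requests.get(
    'http://headers.scrapeops.io/v1/user-agents',
    params={
        'api_key': 'YOUR_API_KEY',  ## required
        'num_results': 50,          ## optional, defaults to 10, max 100
    },
)
print(response.json().get('result', []))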