I'm following the "The freeCodeCamp Python Scrapy Beginners Course" using yt & the guide below, this also including the scrapeops.io guide as well.
I've run into a problem with the below;
Article Guide: Part-8
Section:
Fake Browser Headers vs Fake User-Agents
Both article guide & yt guide fail, I'll start with the article guide first; there are a number of fixes in the comment section at the bottom of the page which I've tried to implement to no success... so neither the original article or the amended article works; amended article results below.
Files being used are settings.py, bookspider.py & middlewares.py
settings.py
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "bookscraper.middlewares.BookscraperSpiderMiddleware": 543,
#}

SCRAPEOPS_API_KEY = 'MY_API_KEY'
SCRAPEOPS_FAKE_USER_AGENT_ENDPOINT = 'http://headers.scrapeops.io/v1/user-agents'
SCRAPEOPS_FAKE_USER_AGENT_ENABLED = True
#SCRAPEOPS_FAKE_BROWSER_HEADER_ENDPOINT = 'http://headers.scrapeops.io/v1/browser-headers'
#SCRAPEOPS_FAKE_BROWSER_HEADER_ENABLED = True
SCRAPEOPS_NUM_RESULTS = 50

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # "bookscraper.middlewares.BookscraperDownloaderMiddleware": 543,
    # "bookscraper.middlewares.ScrapeOpsFakeUserAgentMiddleware": 400,
    "bookscraper.middlewares.ScrapeOpsFakeBrowserHeaderAgentMiddleware": 400,
}
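As a sanity check that the API itself is reachable, the headers endpoint can be queried directly with requests. This is a minimal sketch, with the endpoint URL assumed from the ScrapeOps docs and MY_API_KEY as a placeholder:

# Standalone check of the ScrapeOps browser-headers endpoint.
# Endpoint URL assumed from the ScrapeOps docs; replace MY_API_KEY with a real key.
import requests

response = requests.get(
    'http://headers.scrapeops.io/v1/browser-headers',
    params={'api_key': 'MY_API_KEY', 'num_results': 2},
)
response.raise_for_status()
# The middleware expects a JSON object with a 'result' list of header dicts
print(response.json().get('result', []))

If this prints a couple of complete browser-header dicts, the API call itself isn't the problem.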
bookspider.py
import scrapy
from bookscraper.items import BookItem


class BookspiderSpider(scrapy.Spider):
    name = "bookspider"
    allowed_domains = ["books.toscrape.com"]
    start_urls = ["https://books.toscrape.com"]

    def parse(self, response):
        books = response.css('article.product_pod')
        for book in books:
            relative_url = book.css('h3 a ::attr(href)').get()
            if 'catalogue/' in relative_url:
                book_url = 'https://books.toscrape.com/' + relative_url
            else:
                book_url = 'https://books.toscrape.com/catalogue/' + relative_url
            yield response.follow(book_url, callback=self.parse_book_page)

        next_page = response.css('li.next a ::attr(href)').get()
        if next_page is not None:
            if 'catalogue/' in next_page:
                next_page_url = 'https://books.toscrape.com/' + next_page
            else:
                next_page_url = 'https://books.toscrape.com/catalogue/' + next_page
            yield response.follow(next_page_url, callback=self.parse)

    def parse_book_page(self, response):
        book = response.css("div.product_main")[0]
        table_rows = response.css("table tr")
        book_item = BookItem()
        book_item['url'] = response.url
        book_item['title'] = book.css("h1 ::text").get()
        book_item['upc'] = table_rows[0].css("td ::text").get()
        book_item['product_type'] = table_rows[1].css("td ::text").get()
        book_item['price_excl_tax'] = table_rows[2].css("td ::text").get()
        book_item['price_incl_tax'] = table_rows[3].css("td ::text").get()
        book_item['tax'] = table_rows[4].css("td ::text").get()
        book_item['availability'] = table_rows[5].css("td ::text").get()
        book_item['num_reviews'] = table_rows[6].css("td ::text").get()
        book_item['stars'] = book.css("p.star-rating").attrib['class']
        book_item['category'] = book.xpath("//ul[@class='breadcrumb']/li[@class='active']/preceding-sibling::li[1]/a/text()").get()
        book_item['description'] = book.xpath("//div[@id='product_description']/following-sibling::p/text()").get()
        book_item['price'] = book.css('p.price_color ::text').get()
        yield book_item
middlewares.py
from scrapy import signals
from urllib.parse import urlencode
from scrapy.http import Headers
from random import randint
import requests

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class ScrapeOpsFakeUserAgentMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.scrapeops_api_key = settings.get('SCRAPEOPS_API_KEY')
        self.scrapeops_endpoint = settings.get('SCRAPEOPS_FAKE_USER_AGENT_ENDPOINT', 'http://headers.scrapeops.io/v1/user-agents?')
        self.scrapeops_fake_user_agents_active = settings.get('SCRAPEOPS_FAKE_USER_AGENT_ENABLED', False)
        self.scrapeops_num_results = settings.get('SCRAPEOPS_NUM_RESULTS')
        self.headers_list = []
        self._get_user_agents_list()
        self._scrapeops_fake_user_agents_enabled()

    def _get_user_agents_list(self):
        payload = {'api_key': self.scrapeops_api_key}
        if self.scrapeops_num_results is not None:
            payload['num_results'] = self.scrapeops_num_results
        response = requests.get(self.scrapeops_endpoint, params=urlencode(payload))
        json_response = response.json()
        self.user_agents_list = json_response.get('result', [])

    def _get_random_user_agent(self):
        random_index = randint(0, len(self.user_agents_list) - 1)
        return self.user_agents_list[random_index]

    def _scrapeops_fake_user_agents_enabled(self):
        if self.scrapeops_api_key is None or self.scrapeops_api_key == '' or self.scrapeops_fake_user_agents_active == False:
            self.scrapeops_fake_user_agents_active = False
        else:
            self.scrapeops_fake_user_agents_active = True

    def process_request(self, request, spider):
        random_user_agent = self._get_random_user_agent()
        request.headers['User-Agent'] = random_user_agent
        #print("########### NEW USER AGENT ############")
        #print(request.headers['User-Agent'])


class ScrapeOpsFakeBrowserHeaderAgentMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.scrapeops_api_key = settings.get('SCRAPEOPS_API_KEY')
        self.scrapeops_endpoint = settings.get('SCRAPEOPS_FAKE_BROWSER_HEADER_ENDPOINT', 'http://headers.scrapeops.io/v1/browser-headers?')
        self.scrapeops_fake_browser_headers_active = settings.get('SCRAPEOPS_FAKE_BROWSER_HEADER_ENABLED', True)
        self.scrapeops_num_results = settings.get('SCRAPEOPS_NUM_RESULTS')
        self.headers_list = []
        self._get_headers_list()
        self._scrapeops_fake_browser_headers_enabled()

    def _get_headers_list(self):
        payload = {'api_key': self.scrapeops_api_key}
        if self.scrapeops_num_results is not None:
            payload['num_results'] = self.scrapeops_num_results
        response = requests.get(self.scrapeops_endpoint, params=urlencode(payload))
        json_response = response.json()
        self.headers_list = json_response.get('result', [])

    def _get_random_browser_header(self):
        random_index = randint(0, len(self.headers_list) - 1)
        return self.headers_list[random_index]

    def _scrapeops_fake_browser_headers_enabled(self):
        if self.scrapeops_api_key is None or self.scrapeops_api_key == '' or self.scrapeops_fake_browser_headers_active == False:
            self.scrapeops_fake_browser_headers_active = False
        else:
            self.scrapeops_fake_browser_headers_active = True

    def process_request(self, request, spider):
        random_browser_header = self._get_random_browser_header()
        #request.headers = random_browser_header
        request.headers = Headers(random_browser_header)
        #request.headers['accept-language'] = random_browser_header['accept-language']
        #request.headers['accept-encoding'] = random_browser_header['accept-encoding']
        #request.headers['sec-fetch-user'] = random_browser_header['sec-fetch-user']
        #request.headers['sec-fetch-mod'] = random_browser_header['sec-fetch-mod']
        #request.headers['sec-fetch-site'] = random_browser_header['sec-fetch-site']
        #request.headers['sec-ch-ua-platform'] = random_browser_header['sec-ch-ua-platform']
        #request.headers['sec-ch-ua-mobile'] = random_browser_header['sec-ch-ua-mobile']
        #request.headers['sec-ch-ua'] = random_browser_header['sec-ch-ua']
        #request.headers['accept'] = random_browser_header['accept']
        #request.headers['user-agent'] = random_browser_header['user-agent']
        #request.headers['upgrade-insecure-requests'] = random_browser_header.get('upgrade-insecure-requests')
        print("########### NEW USER AGENT ############")
        print(request.headers)
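(Side note: the header-fetching part of this middleware can also be exercised outside of a crawl. A minimal sketch of that idea, assuming the bookscraper package is importable and the docs endpoint URL is right:)

# Offline check of the middleware's header fetch and random pick.
# My own debugging sketch, not from either guide; MY_API_KEY is a placeholder.
from scrapy.settings import Settings
from bookscraper.middlewares import ScrapeOpsFakeBrowserHeaderAgentMiddleware

settings = Settings({
    'SCRAPEOPS_API_KEY': 'MY_API_KEY',
    'SCRAPEOPS_FAKE_BROWSER_HEADER_ENDPOINT': 'http://headers.scrapeops.io/v1/browser-headers?',
    'SCRAPEOPS_NUM_RESULTS': 5,
})
middleware = ScrapeOpsFakeBrowserHeaderAgentMiddleware(settings)
print(middleware._get_random_browser_header())

If this prints a full header dict, the fetch and random pick are working, and the problem must be in how the headers get applied to the request.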
Here are the results. It fails at line 12 of bookspider.py, and I have no idea where to start when it comes to resolving issues in bookspider.py or in "/site-packages/".
RESULTS - FULL ERROR.
2025-03-31 11:42:49 [scrapy.core.scraper] ERROR: Spider error processing <GET https://books.toscrape.com> (referer: None)
Traceback (most recent call last):
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/utils/defer.py", line 327, in iter_errback
    yield next(it)
          ~~~~^^^^
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/utils/python.py", line 368, in __next__
    return next(self.data)
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/utils/python.py", line 368, in __next__
    return next(self.data)
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/core/spidermw.py", line 106, in process_sync
    yield from iterable
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/spidermiddlewares/referer.py", line 379, in <genexpr>
    return (self._set_referer(r, response) for r in result)
           ^^^^^^
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/core/spidermw.py", line 106, in process_sync
    yield from iterable
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/spidermiddlewares/urllength.py", line 57, in <genexpr>
    return (r for r in result if self._filter(r, spider))
           ^^^^^^
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/core/spidermw.py", line 106, in process_sync
    yield from iterable
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/spidermiddlewares/depth.py", line 54, in <genexpr>
    return (r for r in result if self._filter(r, response, spider))
           ^^^^^^
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/core/spidermw.py", line 106, in process_sync
    yield from iterable
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/project/bookscraper/bookscraper/spiders/bookspider.py", line 12, in parse
    books = response.css('article.product_pod')
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/http/response/__init__.py", line 170, in css
    raise NotSupported("Response content isn't text")
scrapy.exceptions.NotSupported: Response content isn't text
2025-03-31 11:42:49 [scrapy.core.engine] INFO: Closing spider (finished)
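Since the error says the response body isn't text, one idea for narrowing it down is to look at what the server actually sends back when the fake browser headers are attached, instead of trying to parse it. A minimal sketch of that (my own debugging idea, not from either guide; run it inside the same project so the middleware stays enabled):

# debugspider.py - log what the raw response looks like instead of parsing it.
import scrapy

class DebugSpider(scrapy.Spider):
    name = "debugspider"
    start_urls = ["https://books.toscrape.com"]

    def parse(self, response):
        # response.headers and response.body are available even when the
        # body can't be decoded as text, so these may explain the failure
        self.logger.info("Content-Type: %s", response.headers.get('Content-Type'))
        self.logger.info("Content-Encoding: %s", response.headers.get('Content-Encoding'))
        self.logger.info("First bytes: %r", response.body[:60])

My guess (and it is only a guess) is that if the forwarded accept-encoding advertises a compression format Scrapy can't decode, the body would stay compressed and wouldn't count as text, but I don't know enough to confirm that.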
I've also followed the YouTube step-by-step guide. It differs from the article guide, but it still fails.
These are the files being used: settings.py and bookspider.py are the same as above (no changes), and middlewares.py is amended as follows;
middlewares.py
from scrapy import signals
from urllib.parse import urlencode
#from scrapy.http import Headers
from random import randint
import requests

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class ScrapeOpsFakeUserAgentMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.scrapeops_api_key = settings.get('SCRAPEOPS_API_KEY')
        self.scrapeops_endpoint = settings.get('SCRAPEOPS_FAKE_USER_AGENT_ENDPOINT', 'http://headers.scrapeops.io/v1/user-agents?')
        self.scrapeops_fake_user_agents_active = settings.get('SCRAPEOPS_FAKE_USER_AGENT_ENABLED', False)
        self.scrapeops_num_results = settings.get('SCRAPEOPS_NUM_RESULTS')
        self.headers_list = []
        self._get_user_agents_list()
        self._scrapeops_fake_user_agents_enabled()

    def _get_user_agents_list(self):
        payload = {'api_key': self.scrapeops_api_key}
        if self.scrapeops_num_results is not None:
            payload['num_results'] = self.scrapeops_num_results
        response = requests.get(self.scrapeops_endpoint, params=urlencode(payload))
        json_response = response.json()
        self.user_agents_list = json_response.get('result', [])

    def _get_random_user_agent(self):
        random_index = randint(0, len(self.user_agents_list) - 1)
        return self.user_agents_list[random_index]

    def _scrapeops_fake_user_agents_enabled(self):
        if self.scrapeops_api_key is None or self.scrapeops_api_key == '' or self.scrapeops_fake_user_agents_active == False:
            self.scrapeops_fake_user_agents_active = False
        else:
            self.scrapeops_fake_user_agents_active = True

    def process_request(self, request, spider):
        random_user_agent = self._get_random_user_agent()
        request.headers['User-Agent'] = random_user_agent
        #print("########### NEW USER AGENT ############")
        #print(request.headers['User-Agent'])


class ScrapeOpsFakeBrowserHeaderAgentMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.scrapeops_api_key = settings.get('SCRAPEOPS_API_KEY')
        self.scrapeops_endpoint = settings.get('SCRAPEOPS_FAKE_BROWSER_HEADER_ENDPOINT', 'http://headers.scrapeops.io/v1/browser-headers?')
        self.scrapeops_fake_browser_headers_active = settings.get('SCRAPEOPS_FAKE_BROWSER_HEADER_ENABLED', True)
        self.scrapeops_num_results = settings.get('SCRAPEOPS_NUM_RESULTS')
        self.headers_list = []
        self._get_headers_list()
        self._scrapeops_fake_browser_headers_enabled()

    def _get_headers_list(self):
        payload = {'api_key': self.scrapeops_api_key}
        if self.scrapeops_num_results is not None:
            payload['num_results'] = self.scrapeops_num_results
        response = requests.get(self.scrapeops_endpoint, params=urlencode(payload))
        json_response = response.json()
        self.headers_list = json_response.get('result', [])

    def _get_random_browser_header(self):
        random_index = randint(0, len(self.headers_list) - 1)
        return self.headers_list[random_index]

    def _scrapeops_fake_browser_headers_enabled(self):
        if self.scrapeops_api_key is None or self.scrapeops_api_key == '' or self.scrapeops_fake_browser_headers_active == False:
            self.scrapeops_fake_browser_headers_active = False
        else:
            self.scrapeops_fake_browser_headers_active = True

    def process_request(self, request, spider):
        random_browser_header = self._get_random_browser_header()
        #request.headers = random_browser_header
        #request.headers = Headers(random_browser_header)
        request.headers['accept-language'] = random_browser_header['accept-language']
        request.headers['accept-encoding'] = random_browser_header['accept-encoding']
        #request.headers['sec-fetch-user'] = random_browser_header['sec-fetch-user']
        #request.headers['sec-fetch-mod'] = random_browser_header['sec-fetch-mod']
        request.headers['sec-fetch-site'] = random_browser_header['sec-fetch-site']
        request.headers['sec-ch-ua-platform'] = random_browser_header['sec-ch-ua-platform']
        request.headers['sec-ch-ua-mobile'] = random_browser_header['sec-ch-ua-mobile']
        request.headers['sec-ch-ua'] = random_browser_header['sec-ch-ua']
        request.headers['accept'] = random_browser_header['accept']
        request.headers['user-agent'] = random_browser_header['user-agent']
        request.headers['upgrade-insecure-requests'] = random_browser_header.get('upgrade-insecure-requests')
        print("########### NEW USER AGENT ############")
        print(request.headers)
RESULTS - FULL ERROR.
2025-03-31 12:32:12 [scrapy.core.scraper] ERROR: Spider error processing <GET https://books.toscrape.com> (referer: None)
Traceback (most recent call last):
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/utils/defer.py", line 327, in iter_errback
    yield next(it)
          ~~~~^^^^
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/utils/python.py", line 368, in __next__
    return next(self.data)
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/utils/python.py", line 368, in __next__
    return next(self.data)
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/core/spidermw.py", line 106, in process_sync
    yield from iterable
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/spidermiddlewares/referer.py", line 379, in <genexpr>
    return (self._set_referer(r, response) for r in result)
           ^^^^^^
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/core/spidermw.py", line 106, in process_sync
    yield from iterable
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/spidermiddlewares/urllength.py", line 57, in <genexpr>
    return (r for r in result if self._filter(r, spider))
           ^^^^^^
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/core/spidermw.py", line 106, in process_sync
    yield from iterable
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/spidermiddlewares/depth.py", line 54, in <genexpr>
    return (r for r in result if self._filter(r, response, spider))
           ^^^^^^
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/core/spidermw.py", line 106, in process_sync
    yield from iterable
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/project/bookscraper/bookscraper/spiders/bookspider.py", line 12, in parse
    books = response.css('article.product_pod')
  File "/home/USERNAME/.local/share/pipx/venvs/ipykernel/lib/python3.13/site-packages/scrapy/http/response/__init__.py", line 170, in css
    raise NotSupported("Response content isn't text")
scrapy.exceptions.NotSupported: Response content isn't text
2025-03-31 12:32:12 [scrapy.core.engine] INFO: Closing spider (finished)
I've also tried the guide from scrapeops.io/docs/fake-user-agent-headers-api/fake-browser-headers/, and that errors out as well. However, I'm not going to upload those results unless asked to.
To add to this: as soon as I switch to the "ScrapeOps Fake User-Agent API", everything works as expected:
DOWNLOADER_MIDDLEWARES = {
    'bookscraper.middlewares.ScrapeOpsFakeUserAgentMiddleware': 400,
}
It only fails with the "ScrapeOps Fake Browser Header API":
DOWNLOADER_MIDDLEWARES = {
    'bookscraper.middlewares.ScrapeOpsFakeBrowserHeaderAgentMiddleware': 400,
}
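For what it's worth, the two endpoints can be queried side by side to see how their payloads differ. A sketch of that comparison, with both endpoint URLs assumed from the ScrapeOps docs and MY_API_KEY as a placeholder:

# Compare the payloads of the two ScrapeOps endpoints.
import requests

for endpoint in (
    'http://headers.scrapeops.io/v1/user-agents',
    'http://headers.scrapeops.io/v1/browser-headers',
):
    r = requests.get(endpoint, params={'api_key': 'MY_API_KEY', 'num_results': 1})
    print(endpoint)
    print(r.json().get('result'))

As the middleware code above shows, the user-agents endpoint gives plain user-agent strings while the browser-headers endpoint gives full header dicts, so the two middlewares apply their results to the request quite differently.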
I'm stumped, and as I said, I'm very new to coding in general. What can I try next?