I'm using Playwright to crawl websites. I have a scraping function built on Playwright, and a Python object that uses this function to crawl sites in a breadth-first-search manner.
Below is the scraping function:
import logging
from collections import deque

from playwright.async_api import Browser, BrowserContext, async_playwright


async def fetch_page_content(
    url: str,
    browser: Browser = None,
    context: BrowserContext = None,
    open_pages: deque = None,
    max_open_pages: int = 100,
    timeout: int = 60000,
    headless: bool = True,
    logger: logging.Logger = None,
) -> str | None:
    # Only tear down what this call created: if a browser/context was passed
    # in, the caller owns it and it stays open.
    should_close_browser = browser is None
    should_close_context = context is None
    if should_close_browser:
        p = await async_playwright().start()
        browser = await p.chromium.launch(headless=headless)
    if should_close_context:
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
        )
    # Evict the oldest tracked page once the cap is reached.
    if open_pages is not None:
        if len(open_pages) >= max_open_pages:
            old_page = open_pages.popleft()
            await old_page.close()
    page = await context.new_page()
    if open_pages is not None:
        open_pages.append(page)
    try:
        response = await page.goto(url, timeout=timeout, wait_until="load")
        if not response or response.status >= 400:
            if logger:
                logger.error(f"Failed to fetch {url}")
            return None
        html = await page.content()
        return html
    except Exception as e:
        if logger:
            logger.warning(f"Error fetching {url}: {e}")
        return None
    finally:
        await page.close()
        if open_pages is not None and page in open_pages:
            open_pages.remove(page)
        if should_close_context:
            await context.close()
        if should_close_browser:
            await browser.close()
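For context, a minimal standalone call (outside the crawler) would look roughly like this; the URL and the asyncio.run wrapper are just illustrative:

import asyncio

async def main():
    # No browser or context is passed in, so fetch_page_content launches and
    # tears down its own Chromium instance for this single fetch.
    html = await fetch_page_content(url="https://example.com", timeout=30000)
    print(len(html) if html else "fetch failed")

asyncio.run(main())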
Inside my crawler, the part that uses this function looks like this:
async with async_playwright() as p:
    browser = await p.chromium.launch(headless=True)
    context = await browser.new_context()
    total_pages = 0
    while self.queue:
        # Pop up to parallel_requests URLs for this batch, skipping anything
        # already visited, too deep, or filtered out.
        batch = []
        for _ in range(self.parallel_requests):
            if not self.queue:
                break
            url, depth = self.queue.popleft()
            if url in self.visited_urls or depth > self.max_depth:
                continue
            should_skip_url_, reason = should_skip_url(url=url)
            if should_skip_url_:
                self.logger.info(f"Skipping {url}: {reason}")
                continue
            total_pages += 1
            self.logger.info(f"[{total_pages}] Crawling: {url} (Depth: {depth})")
            self.visited_urls.add(url)
            batch.append((url, depth))
        self.logger.info(f"open_pages size before fetching batch: {len(self.open_pages)}")
        tasks = [
            fetch_page_content(
                url=url,
                context=context,
                open_pages=self.open_pages,
                max_open_pages=self.max_open_pages,
                logger=self.logger,
            )
            for url, depth in batch
        ]
        html_results = await asyncio.gather(*tasks, return_exceptions=True)
        self.logger.info(f"open_pages size after fetching batch: {len(self.open_pages)}")
        for (url, depth), html_result in zip(batch, html_results):
            processing_successful = await self.process_and_save_fetched_html(url=url, html=html_result)
            if not processing_successful:
                continue
            links = await self.extract_links(html=html_result, base_url=url)
            await self.validate_and_enqueue_links(url=url, links=links, depth=depth)
        # Periodically recycle the browser and context to release resources.
        if total_pages % self.restart_interval == 0 and total_pages != 0:
            self.logger.info("Restarting browser and context...")
            await context.close()
            await browser.close()
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
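For completeness, the two helper methods referenced above are roughly as follows. This is a simplified sketch: the use of BeautifulSoup and the minimal filtering here are illustrative, and the real methods do more validation.

from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

async def extract_links(self, html: str, base_url: str) -> list[str]:
    # Collect absolute http(s) links from every anchor tag on the page.
    soup = BeautifulSoup(html, "html.parser")
    links = []
    for anchor in soup.find_all("a", href=True):
        absolute = urljoin(base_url, anchor["href"])
        if urlparse(absolute).scheme in ("http", "https"):
            links.append(absolute)
    return links

async def validate_and_enqueue_links(self, url: str, links: list[str], depth: int) -> None:
    # Enqueue unseen links one level deeper than the page they came from.
    for link in links:
        if link not in self.visited_urls:
            self.queue.append((link, depth + 1))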
What I've tried so far:
- Limiting concurrency by lowering parallel_requests.
- Manually closing pages via the open_pages deque.
- Restarting the browser and context every restart_interval pages (how these knobs are set up is sketched below).
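For reference, these knobs all live on the crawler object. Here's a trimmed sketch of its constructor; the class name and the concrete values are illustrative, not my exact settings:

import logging
from collections import deque

class Crawler:
    def __init__(self, start_url: str, max_depth: int = 3):
        self.queue = deque([(start_url, 0)])  # (url, depth) pairs, processed in BFS order
        self.visited_urls = set()
        self.open_pages = deque()             # Page objects handed to fetch_page_content
        self.max_open_pages = 20              # oldest page gets closed past this cap
        self.parallel_requests = 5            # batch size per asyncio.gather() call
        self.restart_interval = 200           # recycle browser/context every N pages
        self.max_depth = max_depth
        self.logger = logging.getLogger(__name__)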
Despite all of this, Playwright still throws an Errno 24 (too many open files) and fails to fetch the page.
I'm at a dead end and not sure what else to try. If possible, I'd rather handle this at the code level than manually raise the system's limit on open files.