I'm trying to use crawl4ai to extract data from a website whose content loads dynamically. Since the URL doesn't change, I'm using JavaScript to load different content in between extractions.
The class below is my first attempt at doing this and is just a test:
from datetime import datetime

from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

# CHROME_PROFILE_PATH is defined elsewhere in my script.

class AuthenticatedCrawler:
    async def crawl(self, start_url):
        results = []
        browser_config = BrowserConfig(
            headless=False,
            use_persistent_context=True,
            browser_type="chromium",
            user_data_dir=CHROME_PROFILE_PATH,
            extra_args=[
                "--profile-directory=Profile 3",
                "--disable-blink-features=AutomationControlled",
                "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            ]
        )
        crawler = AsyncWebCrawler(config=browser_config)
        try:
            # Start the browser manually
            await crawler.start()
            categories = ["MLA5725", "MLA4711", "MLA11090"]
            count = 0
            for category in categories:
                crawl_result = await crawler.arun(
                    url=start_url,
                    timeout=120000,
                    config=CrawlerRunConfig(
                        bypass_cache=True,
                        session_id="1",
                        word_count_threshold=30,
                        js_code=[
                            f"""
                            // 1. Click target element
                            await new Promise(r => setTimeout(r, 3000));
                            const target = document.querySelector('[data-categ-id={category}] .wrap-rollover');
                            if (target) {{
                                target.click();
                                await new Promise(r => setTimeout(r, 2000)); // Wait 2s after click
                            }}
                            // 2. Progressive scroll
                            for (let i = 0; i < 3; i++) {{
                                window.scrollTo(0, document.body.scrollHeight * (i / 4));
                                await new Promise(r => setTimeout(r, 2000 + (i * 1000)));
                            }}
                            // 3. Final stabilization wait
                            await new Promise(r => setTimeout(r, 3000));
                            """
                        ],
                        js_only=count > 0,
                        screenshot=True,
                        verbose=True
                    )
                )
                count = count + 1
                if crawl_result.success:
                    soup = BeautifulSoup(crawl_result.html, 'html.parser')
                    # Extract and process ALL .wrap-viz elements
                    for wrap_viz in soup.select(".wrap-viz"):
                        results.append({
                            'url': start_url,
                            'titulo': wrap_viz.select_one('.title-viz').get_text(strip=True) if wrap_viz.select_one('.title-viz') else '',
                            'facturacion': wrap_viz.select_one('.side-sold-quant-viz.first .sold-quant-viz-numb.sellers-quant-viz-numb').get_text(strip=True) if wrap_viz.select_one('.side-sold-quant-viz.first .sold-quant-viz-numb.sellers-quant-viz-numb') else '',
                            'u_vendidas': wrap_viz.select_one('.gmv-quant-viz-numb').get_text(strip=True) if wrap_viz.select_one('.gmv-quant-viz-numb') else '',
                            'publicaciones': wrap_viz.select_one('.listings-viz-numb').get_text(strip=True) if wrap_viz.select_one('.listings-viz-numb') else '',
                            'vendedores': wrap_viz.select_one('.sold-quant-viz .sold-quant-viz-numb.sellers-quant-viz-numb').get_text(strip=True) if wrap_viz.select_one('.sold-quant-viz .sold-quant-viz-numb.sellers-quant-viz-numb') else '',
                            #'participacion': wrap_viz.select_one('.gmv-quant-viz-numb').get_text(strip=True) if wrap_viz.select_one('.gmv-quant-viz-numb') else '',
                            'timestamp': datetime.now().isoformat()
                        })
                else:
                    print("crawl_result NOT success!")
        finally:
            # Close manually only when EVERYTHING is done
            await crawler.close()
        return results
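
For completeness, this is roughly how I call the class; the URL below is just a placeholder for the real dashboard I'm crawling:

import asyncio

START_URL = "https://example.com/dashboard"  # placeholder, not the real site

async def main():
    crawler = AuthenticatedCrawler()
    rows = await crawler.crawl(START_URL)
    print(f"Extracted {len(rows)} rows")

if __name__ == "__main__":
    asyncio.run(main())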
The for loop inside the crawl method should load different content to extract on each iteration, but it only extracts data on the first iteration; the other two throw errors:
┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ × Unexpected error in _crawl_web at line 528 in wrap_api_call (venv\Lib\site- │
│ packages\playwright\_impl\_connection.py): │
│ Error: Page.evaluate: Target page, context or browser has been closed │
│ │
│ Code context: │
│ 523 parsed_st = _extract_stack_trace_information_from_stack(st, is_internal) │
│ 524 self._api_zone.set(parsed_st) │
│ 525 try: │
│ 526 return await cb() │
│ 527 except Exception as error: │
│ 528 → raise rewrite_error(error, f"{parsed_st['apiName']}: {error}") from None │
│ 529 finally: │
│ 530 self._api_zone.set(None) │
│ 531 │
│ 532 def wrap_api_call_sync( │
│ 533 self, cb: Callable[[], Any], is_internal: bool = False │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
crawl_result NOT success!
[FETCH]... ↓ /... | Status: True | Time: 0.08s
[ERROR]... × /... | Error:
┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ × Unexpected error in _crawl_web at line 528 in wrap_api_call (venv\Lib\site- │
│ packages\playwright\_impl\_connection.py): │
│ Error: Page.evaluate: Target page, context or browser has been closed │
│ │
│ Code context: │
│ 523 parsed_st = _extract_stack_trace_information_from_stack(st, is_internal) │
│ 524 self._api_zone.set(parsed_st) │
│ 525 try: │
│ 526 return await cb() │
│ 527 except Exception as error: │
│ 528 → raise rewrite_error(error, f"{parsed_st['apiName']}: {error}") from None │
│ 529 finally: │
│ 530 self._api_zone.set(None) │
│ 531 │
│ 532 def wrap_api_call_sync( │
│ 533 self, cb: Callable[[], Any], is_internal: bool = False │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
crawl_result NOT success!
The error says "context or browser has been closed", but I can't find a way to make every iteration run in the same browser without it being closed. Using the same session_id and setting use_persistent_context=True doesn't do the trick. Does anyone know how to do this?
Sorry if my English is not good.
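
To narrow the question down, this is the bare pattern I expected to keep one page alive across iterations (same session_id for every call, js_only=True after the first call); the extraction code is stripped out and the JS is reduced to just the category click from the snippet above:

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def repro(start_url):
    crawler = AsyncWebCrawler(config=BrowserConfig(headless=False, use_persistent_context=True))
    await crawler.start()
    try:
        for i, category in enumerate(["MLA5725", "MLA4711", "MLA11090"]):
            result = await crawler.arun(
                url=start_url,
                config=CrawlerRunConfig(
                    session_id="1",   # reuse the same browser tab for every call
                    js_only=i > 0,    # after the first call, only run JS on the already-loaded page
                    bypass_cache=True,
                    js_code=[f"document.querySelector('[data-categ-id={category}] .wrap-rollover')?.click();"],
                ),
            )
            print(category, result.success)
    finally:
        await crawler.close()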