
python - How do I keep the browser crawl4ai is using from closing before the second and third iterations? - Stack Overflow


I'm trying to use crawl4ai to extract some data from a website with dynamic content loading. Since the URL doesn't change, I'm trying to use JavaScript to load different content in between extractions.

The class I'm providing is my first attempt at doing this and is just a test:

from datetime import datetime

from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

# CHROME_PROFILE_PATH is a constant defined elsewhere in my project
# (the path to the Chrome user-data directory I want to reuse).

class AuthenticatedCrawler:
    async def crawl(self, start_url):
        results = []
        
        browser_config = BrowserConfig(
            headless=False,
            use_persistent_context=True,
            browser_type="chromium",
            user_data_dir=CHROME_PROFILE_PATH,
            extra_args=[
                "--profile-directory=Profile 3",
                "--disable-blink-features=AutomationControlled",
                f"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            ]
        )
        crawler = AsyncWebCrawler(config=browser_config)

        try:
            # Start the browser manually
            await crawler.start()
            result = ""
            categories = ["MLA5725", "MLA4711", "MLA11090"]
            count = 0
            for category in categories:
                crawl_result = await crawler.arun(
                    url=start_url,
                    timeout=120000,
                    config=CrawlerRunConfig(
                        bypass_cache=True,
                        session_id="1",
                        word_count_threshold=30,
                        js_code=[
                            f"""
                                // 1. Click target element
                                await new Promise(r => setTimeout(r, 3000));
                                const target = document.querySelector('[data-categ-id={category}] .wrap-rollover');
                                if(target) {{
                                    target.click();
                                    await new Promise(r => setTimeout(r, 2000));  // Wait 2s after click
                                }}

                                // 2. Progressive scroll
                                for(let i = 0; i < 3; i++) {{
                                    window.scrollTo(0, document.body.scrollHeight * (i/4));
                                    await new Promise(r => setTimeout(r, 2000 + (i * 1000)));
                                }}
                                
                                // 3. Final stabilization wait
                                await new Promise(r => setTimeout(r, 3000));
                            """
                        ],
                        js_only=count > 0,
                        screenshot=True,
                        verbose=True
                    )
                )
            
                count += 1
                if crawl_result.success:
                    soup = BeautifulSoup(crawl_result.html, 'html.parser')
                    
                    # Extract and process ALL the wrap-viz elements
                    for wrap_viz in soup.select(".wrap-viz"):
                        results.append({
                            'url': start_url,
                            'titulo': wrap_viz.select_one('.title-viz').get_text(strip=True) if wrap_viz.select_one('.title-viz') else '',
                            'facturacion': wrap_viz.select_one('.side-sold-quant-viz.first .sold-quant-viz-numb.sellers-quant-viz-numb').get_text(strip=True) if wrap_viz.select_one('.side-sold-quant-viz.first .sold-quant-viz-numb.sellers-quant-viz-numb') else '',
                            'u_vendidas': wrap_viz.select_one('.gmv-quant-viz-numb').get_text(strip=True) if wrap_viz.select_one('.gmv-quant-viz-numb') else '',
                            'publicaciones': wrap_viz.select_one('.listings-viz-numb').get_text(strip=True) if wrap_viz.select_one('.listings-viz-numb') else '',
                            'vendedores': wrap_viz.select_one('.sold-quant-viz .sold-quant-viz-numb.sellers-quant-viz-numb').get_text(strip=True) if wrap_viz.select_one('.sold-quant-viz .sold-quant-viz-numb.sellers-quant-viz-numb') else '',
                            #'participacion': wrap_viz.select_one('.gmv-quant-viz-numb').get_text(strip=True) if wrap_viz.select_one('.gmv-quant-viz-numb') else '',
                            'timestamp': datetime.now().isoformat()
                        })
                else:
                    print("crawl_result NOT success!")
        finally:
            # Close manually only once EVERYTHING is finished
            await crawler.close()
        
        return results
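
For reference, this is the pattern I believe the loop should follow, based on my reading of the crawl4ai session docs (the URL and session name below are placeholders, not my real ones): the first arun() with a given session_id performs a real navigation and opens the session's page, and every later arun() that passes the same session_id with js_only=True should execute its js_code against that already-open page instead of navigating again.

import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def demo():
    # One browser for the whole loop; the context manager closes it at the end.
    async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler:
        session = "demo_session"  # placeholder session name

        # First call: real navigation, creates the page tied to the session.
        first = await crawler.arun(
            url="https://example.com",  # placeholder URL
            config=CrawlerRunConfig(session_id=session),
        )
        print("first ok:", first.success)

        # Second call: js_only=True should reuse the session's page and
        # just run the JS, without a fresh navigation.
        second = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(
                session_id=session,
                js_only=True,
                js_code=["window.scrollTo(0, document.body.scrollHeight);"],
            ),
        )
        print("second ok:", second.success)

asyncio.run(demo())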

The for loop inside the crawl method should load different content to extract between iterations, but it only extracts data on the first iteration; the other two iterations throw errors:

┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ × Unexpected error in _crawl_web at line 528 in wrap_api_call (venv\Lib\site-                                         │
│ packages\playwright\_impl\_connection.py):                                                                            │
│   Error: Page.evaluate: Target page, context or browser has been closed                                               │
│                                                                                                                       │
│   Code context:                                                                                                       │
│   523           parsed_st = _extract_stack_trace_information_from_stack(st, is_internal)                              │
│   524           self._api_zone.set(parsed_st)                                                                         │
│   525           try:                                                                                                  │
│   526               return await cb()                                                                                 │
│   527           except Exception as error:                                                                            │
│   528 →             raise rewrite_error(error, f"{parsed_st['apiName']}: {error}") from None                          │
│   529           finally:                                                                                              │
│   530               self._api_zone.set(None)                                                                          │
│   531                                                                                                                 │
│   532       def wrap_api_call_sync(                                                                                   │
│   533           self, cb: Callable[[], Any], is_internal: bool = False                                                │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘

crawl_result NOT success!
[FETCH]... ↓ /... | Status: True | Time: 0.08s
[ERROR]... × /... | Error: 
(same "Page.evaluate: Target page, context or browser has been closed" traceback as above)

crawl_result NOT success!

The error says "context or browser has been closed", but I can't find a way to make all iterations run in the same browser without it closing. Using the same session_id and setting use_persistent_context=True is not doing the trick. Does anyone know how to do this?
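
In case it helps debug: my failure branch only prints a flag. Assuming the result object exposes an error_message attribute (the crawl4ai docs suggest CrawlResult does), a small tweak would surface the underlying reason directly:

if crawl_result.success:
    ...  # parse crawl_result.html with BeautifulSoup as before
else:
    # assumes CrawlResult.error_message exists
    print(f"crawl_result NOT success: {crawl_result.error_message}")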

Sorry if my English is not good.
