javascript - Why can't I extract listings information

I am trying to extract the EPC rating from each listings. you can only get the EPC rating when you click on the listing. each time i run my script it it's keeps timing out, what could be the issue ? though i tried increasing the waiting time for the main content to load but I still encounter the same issue. could it be the headless browser not been able to load?

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdrivermon.by import By
from undetected_chromedriver import Chrome
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.remote.webdriver import WebDriver
from seleniummon.exceptions import TimeoutException, NoSuchElementException
from selenium.webdrivermon.action_chains import ActionChains
from typing import Iterator
import pandas as pd

# Constants
URL = "/?new_homes=include&q=england+&orig_q=united+kingdom&view_type=list&pn=1"
TIMEOUT = 5

# Helper function to extract text from a WebElement
def etext(e: WebElement) -> str:
    if e:
        if t := e.text.strip():
            return t
        if (p := e.get_property("textContent")) and isinstance(p, str):
            return p.strip()
    return ""

# Click a WebElement
def click(driver: WebDriver, e: WebElement) -> None:
    ActionChains(driver).click(e).perform()

# Get all WebElements that match the given CSS selector
def get_all(driver: WebDriver, css: str) -> Iterator[WebElement]:
    wait = WebDriverWait(driver, TIMEOUT)
    sel = (By.CSS_SELECTOR, css)
    try:
        yield from wait.until(EC.presence_of_all_elements_located(sel))
    except TimeoutException:
        pass  # Return empty if elements are not found

# Click the "Next" button for pagination
def click_next(driver: WebDriver) -> None:
    for a in get_all(driver, "a[aria-live=polite] > div > div:nth-child(2)"):
        if etext(a) == "Next":
            click(driver, a)
            break

# Handle cookie consent popup
def click_through(driver: WebDriver) -> None:
    try:
        wait = WebDriverWait(driver, TIMEOUT)
        shadow_root = driver.find_element(By.ID, "usercentrics-root").shadow_root
        button = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "button[data-testid=uc-deny-all-button]")
        ))
        click(driver, button)
    except Exception:
        pass  # Ignore if cookie popup isn't present

# Scrape EPC Rating from individual listing
def get_epc_rating(driver: WebDriver, listing_url: str) -> str:
    driver.get(listing_url)  # Open property details page
    try:
        epc_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.main-content .z3kgis3 ._1vhryas0 ._8lgu4x1 div:nth-child(3) div'))
        )
        return etext(epc_element)  # Extract EPC rating text
    except TimeoutException:
        return "N/A"  # Return "N/A" if EPC Rating is missing

# Scrape data from the search results page
def scrape_page(driver: WebDriver) -> list[dict]:
    result = []
    for house in get_all(driver, "div[data-testid=result-item]"):
        try:
            listing_url = house.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
            address = etext(house.find_element(By.CSS_SELECTOR, "h2"))
            date_sold = etext(house.find_element(By.CSS_SELECTOR, "._1hzil3o9._1hzil3o8._194zg6t7"))
            house_type = etext(house.find_element(By.CSS_SELECTOR, "div._1pbf8i52 p"))
            num_rooms = etext(house.find_element(By.CSS_SELECTOR, "._1pbf8i51 div:nth-child(2) p"))
            tenure = etext(house.find_element(By.CSS_SELECTOR, ".agepcz0 div:nth-child(1) div"))
            square_foot = etext(house.find_element(By.CSS_SELECTOR, ".agepcz0 div:nth-child(2) div"))

            # Get EPC Rating from listing page
            epc_rating = get_epc_rating(driver, listing_url)

            result.append({
                "Address": address,
                "Date Last Sold": date_sold,
                "Property Type": house_type,
                "Number of Rooms": num_rooms,
                "Tenure": tenure,
                "Square Foot": square_foot,
                "EPC Rating": epc_rating,
                "Listing URL": listing_url
            })
        except NoSuchElementException:
            continue  # Skip missing elements
    return result

# Main script execution
if __name__ == "__main__":
    with Chrome() as driver:
        driver.get(URL)
        click_through(driver)  # Handle cookies

        all_results = []
        prev_url = ""
        npages = 0

        while prev_url != driver.current_url:  # Stop if pagination stops working (e.g., Cloudflare blocks)
            prev_url = driver.current_url
            all_results.extend(scrape_page(driver))
            click_next(driver)
            npages += 1

        # Convert results to DataFrame
        df = pd.DataFrame(all_results)

        # Display results
        print(df)
        print(f"Processed {npages} pages")

        # Save to CSV
        df.to_csv("zoopla_data.csv", index=False)

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from undetected_chromedriver import Chrome
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from typing import Iterator
import pandas as pd

# Constants
URL = "https://www.zoopla.co.uk/house-prices/england/?new_homes=include&q=england+&orig_q=united+kingdom&view_type=list&pn=1"
TIMEOUT = 5

# Helper function to extract text from a WebElement
def etext(e: WebElement) -> str:
    if e:
        if t := e.text.strip():
            return t
        if (p := e.get_property("textContent")) and isinstance(p, str):
            return p.strip()
    return ""

# Click a WebElement
def click(driver: WebDriver, e: WebElement) -> None:
    ActionChains(driver).click(e).perform()

# Get all WebElements that match the given CSS selector
def get_all(driver: WebDriver, css: str) -> Iterator[WebElement]:
    wait = WebDriverWait(driver, TIMEOUT)
    sel = (By.CSS_SELECTOR, css)
    try:
        yield from wait.until(EC.presence_of_all_elements_located(sel))
    except TimeoutException:
        pass  # Return empty if elements are not found

# Click the "Next" button for pagination
def click_next(driver: WebDriver) -> None:
    for a in get_all(driver, "a[aria-live=polite] > div > div:nth-child(2)"):
        if etext(a) == "Next":
            click(driver, a)
            break

# Handle cookie consent popup
def click_through(driver: WebDriver) -> None:
    try:
        wait = WebDriverWait(driver, TIMEOUT)
        shadow_root = driver.find_element(By.ID, "usercentrics-root").shadow_root
        button = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "button[data-testid=uc-deny-all-button]")
        ))
        click(driver, button)
    except Exception:
        pass  # Ignore if cookie popup isn't present

# Scrape EPC Rating from individual listing
def get_epc_rating(driver: WebDriver, listing_url: str) -> str:
    driver.get(listing_url)  # Open property details page
    try:
        epc_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.main-content .z3kgis3 ._1vhryas0 ._8lgu4x1 div:nth-child(3) div'))
        )
        return etext(epc_element)  # Extract EPC rating text
    except TimeoutException:
        return "N/A"  # Return "N/A" if EPC Rating is missing

# Scrape data from the search results page
def scrape_page(driver: WebDriver) -> list[dict]:
    result = []
    for house in get_all(driver, "div[data-testid=result-item]"):
        try:
            listing_url = house.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
            address = etext(house.find_element(By.CSS_SELECTOR, "h2"))
            date_sold = etext(house.find_element(By.CSS_SELECTOR, "._1hzil3o9._1hzil3o8._194zg6t7"))
            house_type = etext(house.find_element(By.CSS_SELECTOR, "div._1pbf8i52 p"))
            num_rooms = etext(house.find_element(By.CSS_SELECTOR, "._1pbf8i51 div:nth-child(2) p"))
            tenure = etext(house.find_element(By.CSS_SELECTOR, ".agepcz0 div:nth-child(1) div"))
            square_foot = etext(house.find_element(By.CSS_SELECTOR, ".agepcz0 div:nth-child(2) div"))

            # Get EPC Rating from listing page
            epc_rating = get_epc_rating(driver, listing_url)

            result.append({
                "Address": address,
                "Date Last Sold": date_sold,
                "Property Type": house_type,
                "Number of Rooms": num_rooms,
                "Tenure": tenure,
                "Square Foot": square_foot,
                "EPC Rating": epc_rating,
                "Listing URL": listing_url
            })
        except NoSuchElementException:
            continue  # Skip missing elements
    return result

# Main script execution
if __name__ == "__main__":
    with Chrome() as driver:
        driver.get(URL)
        click_through(driver)  # Handle cookies

        all_results = []
        prev_url = ""
        npages = 0

        while prev_url != driver.current_url:  # Stop if pagination stops working (e.g., Cloudflare blocks)
            prev_url = driver.current_url
            all_results.extend(scrape_page(driver))
            click_next(driver)
            npages += 1

        # Convert results to DataFrame
        df = pd.DataFrame(all_results)

        # Display results
        print(df)
        print(f"Processed {npages} pages")

        # Save to CSV
        df.to_csv("zoopla_data.csv", index=False)

Share Improve this question edited Feb 7 at 12:52 asked Feb 6 at 11:08 Chioma Okoroafor 471 silver badge6 bronze badges New contributor Chioma Okoroafor is a new contributor to this site. Take care in asking for clarification, commenting, and answering. Check out our Code of Conduct.

There are no EPC ratings shown on the page that's rendered by the URL in your question – Adon Bilivit Commented Feb 6 at 12:23
@AdonBilivit The EPC ratings can be seen only when you click on a listing – Chioma Okoroafor Commented Feb 6 at 23:32
Your code gets blocked by cloudfare. – Automator Commented Feb 7 at 5:13
@Automator the updated code above takes care of cloudfare. – Chioma Okoroafor Commented Feb 7 at 11:17
thank you once again @AdonBilivit for your help with cloudfare implementation. how do you think i can approach to extract the EPC rating – Chioma Okoroafor Commented Feb 7 at 11:18

Add a comment |

1 Answer 1

Sorted by: Reset to default 0

There's quite a bit of errors in your logic in the code here, so I'll offer you the solution to get the EPC rating, but you are going to need to go back and rework your logic of how you are navigating the site.

Your CSS Selector is off. Those classes also appear to be dynamically generated. Try avoiding hard coding these and think of robust ways to get the data when you encounter obviously randomized classes or ids.

So what you can do instead is just find the content in the html that starts with EPC rating. That will find the specific element then:

Code:

def get_epc_rating(driver: WebDriver, listing_url: str) -> str:
    driver.get(listing_url)  # Open property details page
    try:
        xpath_expression = (
            "//*[starts-with(translate(text(), 'abcdefghijklmnopqrstuvwxyz', 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 'EPC RATING')]"
        )
        
        epc_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, xpath_expression))
        )
        return etext(epc_element)  # Extract EPC rating text
    except TimeoutException:
        return "N/A"  # Return "N/A" if EPC Rating is missing

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

javascript - Why can't I extract listings information - Stack Overflow

1 Answer 1

与本文相关的文章

评论列表(0)