
python - I tried scraping a website using selenium but I was unsuccessful due to its inability to detect its web element - Stack Overflow

Problem Description: I tried scraping a website using Selenium, but I was unsuccessful because Selenium could not detect the web elements. I need to collect product information from 6 to 10 different websites on a daily basis. This is the website: https://thriveil/casey-rec-menu/?dtche%5Bpath%5D=products.

Code Snippet:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
import time  # To manage delays

# Set up Selenium WebDriver (using Microsoft Edge)
edge_options = Options()

# Initialize WebDriver
service = Service("C:\\Users\\iyush\\Documents\\VS Code\\Selenium\\msedgedriver.exe")  
driver = webdriver.Edge(service=service, options=edge_options)  

# URL of the website to scrape
url = "https://thriveil/casey-rec-menu/?dtche%5Bpath%5D=products"
driver.get(url)  

# WebDriver Wait
wait = WebDriverWait(driver, 30)

# Handle ad close button
try:
    close_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "terpli-close")))
    close_button.click()
except TimeoutException:
    print("Ad close button not found. Continuing...")

# Handle cookie consent
try:
    accept_button = wait.until(EC.element_to_be_clickable((By.ID, "wt-cli-accept-all-btn")))
    accept_button.click()
except TimeoutException:
    print("Cookie consent button not found. Continuing...")

# Initialize storage
product_list = []
current_page = 1

# Lists to store scraped data
names = []
brand_names = []
brand_links = []
strains = []
potencys = []
prices = []
effects = []

while True:  # Loop through pages
    products = []
    
    # Get all product elements
    product_elements = driver.find_elements(By.CLASS_NAME, "full-card_Wrapper-sc-11z5u35-0")

    for product in product_elements:
        try:
            product_url = product.find_element(By.TAG_NAME, "a").get_attribute("href")
            products.append(product_url)
        except Exception as e:
            print(f"Error extracting product link: {e}")
            continue 

    # Open each product in a new tab, scrape details, then close tab
    for product_url in products:
        driver.execute_script(f"window.open('{product_url}', '_blank');")
        driver.switch_to.window(driver.window_handles[-1])  # Switch to new tab

        try:
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))  # Ensure page loads
            product_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Scrape the product data
            name = driver.find_element(By.CSS_SELECTOR, "h1[data-testid='product-name']")
            names.append(name.text.strip())

            brand_element = driver.find_element(By.CSS_SELECTOR, "a[href]")
            brand = driver.find_element(By.CSS_SELECTOR, ".typography__Brand-sc-1q7gvs8-2.fyoohd")
            brand_names.append(brand.text.strip())
            brand_links.append(brand_element.get_attribute('href'))

            strain = driver.find_element(By.CSS_SELECTOR, "span[data-testid='info-chip']")
            strains.append(strain.text.strip())

            potencies = driver.find_elements(By.CSS_SELECTOR, "span.info-chip__InfoChipText-sc-11n9ujc-0")
            # Extract text and remove anything before ':'
            potency_values = [p.text.split(":")[-1].strip() for p in potencies]
            # Join them as a single string (optional, useful for CSV)
            potency_text = potency_values[1]
            potencys.append(potency_text)

            price = driver.find_element(By.CSS_SELECTOR, "div.price__PriceText-sc-diymzm-2")
            prices.append(price.text.strip())

            effect_elements = driver.find_elements(By.CSS_SELECTOR, "span.effect-tile__Text-sc-1as4rkm-1")
            effects.append(", ".join([e.text.strip() for e in effect_elements]))

            product_data = {
                "name": names,
                "brand_name": brand_names,
                "brand_link": brand_links,
                "strain": strains,
                "potency": potencys,
                "price": prices,
                "effects": effects
            }
            product_list.append(product_data)

        except Exception as e:
            print(f"Error scraping product details: {e}")

        driver.close()
        driver.switch_to.window(driver.window_handles[0])  # Switch back to main tab

    print(f"Page {current_page} scraped successfully.")

    # Click the next page button if available
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, 'button[aria-label*="next page"]')
        next_button.click()
        current_page += 1
        time.sleep(5)  # Allow the next page to load
    except NoSuchElementException:
        print("No more pages found. Exiting loop.")
        break

# Check if data was scraped
if product_list:
    # Initialize DataFrame with the scraped data
    df = pd.DataFrame(product_list)  # Wrap the dict in a list
    
    # Save DataFrame to CSV (append mode if the file exists)
    df.to_csv("thriveil_products.csv", mode='a', header=not pd.iomon.file_exists("thriveil_products.csv"), index=False)
    print("Scraping completed. Data saved to 'thriveil_products.csv'.")
else:
    print("No data to save.")
    
# Save DataFrame to Excel
#df = pd.DataFrame(product_list)
#df.to_excel("thriveil_products.xlsx", index=False)
#print("Scraping completed. Data saved to 'thriveil_products.xlsx'.")

driver.quit()
Asked Feb 1 at 13:12 by Mubaraq Onipede; edited Feb 2 at 6:02 by Ajeet Verma. 1 comment:
  • First of all, you should keep the scraping process as simple as possible. If it is possible to make a list of URLs and then open and scrape them one by one (as in this case), that's the way to go. Opening a new tab and then switching between tabs can cause errors. I also see that you parse the HTML with BeautifulSoup but never use it. Why? And which web elements do you think Selenium is unable to detect? – Artem Fedorov, Feb 7 at 18:03
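
For illustration, a minimal sketch of the pattern suggested in the comment: collect the product links first, then visit them one by one in the same window instead of opening and switching tabs. The listing URL and the CSS selector here are hypothetical placeholders, not the actual thriveil markup.

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()
driver.get("https://example.com/products")  # hypothetical listing page

# 1) Collect all product links up front
product_urls = [a.get_attribute("href")
                for a in driver.find_elements(By.CSS_SELECTOR, "a.product-link")]  # hypothetical selector

# 2) Visit each URL in the same window; no tab switching required
rows = []
for url in product_urls:
    driver.get(url)
    rows.append({
        "url": url,
        "title": driver.find_element(By.TAG_NAME, "h1").text.strip(),
    })

driver.quit()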

1 Answer


I made the following modifications. For reference, the ids of the modifications (#mod-1 etc.) are also marked at the corresponding points in the code.

mod-0: Added "scrollIntoView" for the paging button.
mod-1: Added "switch_to" into the iframe (a short sketch of just this step follows this list).
mod-2: Searching for the URLs across the entire DOM returned an empty list. Narrowing the search to the scope of the "main" tag solved this problem.
mod-3: The first product raised an "element not found" exception because its class attribute didn't match.
mod-4: Separated the "ad close button" handling into a function.
mod-5: The resulting CSV repeated the data of the first product on the first page for every row. This was because the whole lists (names, brand_names and so on) were used as the values when building "product_data".
mod-6: Wrapped every property lookup in its own "try" so that the other properties are not skipped when one of them fails. Some product attributes still fail to be scraped; you can inspect the HTML structure of those products.
mod-7: Instead of "driver.switch_to.window", a 2nd driver is used for the details of each product. This is simpler because there is no need to manage the timing of switching in and out.
mod-8: Used Firefox as the browser. Please switch back to Edge if necessary.
mod-9: For testing, page iteration is limited to two pages.
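
Since the iframe is the reason the original script could not detect any elements, here is a focused sketch of mod-1 alone. It waits for the frame with EC.frame_to_be_available_and_switch_to_it instead of switching by name immediately; the frame id "dutchie--embed__iframe" and the "#main-content" scope are taken from the full code below.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("https://thriveil/casey-rec-menu/?dtche%5Bpath%5D=products")
wait = WebDriverWait(driver, 30)

# Wait for the Dutchie embed iframe and move the driver's context into it (mod-1)
wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, "dutchie--embed__iframe")))

# Product links only become reachable inside the iframe, scoped to the main content (mod-2)
links = driver.find_elements(By.CSS_SELECTOR, "#main-content a")
print(len(links))

# Return to the top-level document when done
driver.switch_to.default_content()
driver.quit()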

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
#from bs4 import BeautifulSoup
import pandas as pd
import time  # To manage delays

# Set up Selenium WebDriver (using Microsoft Edge)
#edge_options = Options()

# Initialize WebDriver
#service = Service("C:\\Users\\iyush\\Documents\\VS Code\\Selenium\\msedgedriver.exe")  
#driver = webdriver.Edge(service=service, options=edge_options)  
driver = webdriver.Firefox()# mod-8
driver.implicitly_wait(1)
# WebDriver Wait
wait = WebDriverWait(driver, 30)
# open 2nd window for each product
driver2 = webdriver.Firefox()#mod-7
driver2.implicitly_wait(1)
wait2 = WebDriverWait(driver2, 30)


# URL of the website to scrape
url = "https://thriveil/casey-rec-menu/?dtche%5Bpath%5D=products"
driver.get(url)  

def ad_close():#mod-4
    # Handle ad close button
    try:
        close_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CLASS_NAME, "terpli-close")))
        close_button.click()
    except TimeoutException:
        print("Ad close button not found. Continuing...")

ad_close()#mod-4
# Handle cookie consent
try:
    accept_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "wt-cli-accept-all-btn")))
    accept_button.click()
except TimeoutException:
    print("Cookie consent button not found. Continuing...")


# Initialize storage
product_list = []
current_page = 1

# Lists to store scraped data
#mod-5
"""
names = []
brand_names = []
brand_links = []
strains = []
potencys = []
prices = []
effects = []
"""

#iframe= driver.find_elements(By.CSS_SELECTOR,"#dutchie--embed__iframe")#mod-1
#iframe= driver.find_elements(By.CSS_SELECTOR,"iframe")#mod-1
#driver.switch_to.frame(iframe)#failed
driver.switch_to.frame("dutchie--embed__iframe")#mod-1

while True:  # Loop through pages
    products = []
    ad_close()#mod-4

    # Get all product elements
    main = driver.find_element(By.CSS_SELECTOR,"#main-content")#mod-2
    #product_elements = driver.find_elements(By.CLASS_NAME, "full-card_Wrapper-sc-11z5u35-0")
    #product_elements = driver.find_elements(By.CLASS_NAME, "full-card__Wrapper-sc-11z5u35-0 gwMuEZ")
    product_elements = main.find_elements(By.TAG_NAME, "a")#mod-2
    print('len(product_elements): ',len(product_elements),'\n')
    for product in product_elements:
        product_url = product.get_attribute("href")
        #print(product_url)
        products.append(product_url)
        
    # Open each product in a new tab, scrape details, then close tab
    for product_url in products:
        #driver.execute_script(f"window.open('{product_url}', '_blank');")
        #driver.switch_to.window(driver.window_handles[-1])  # Switch to new tab
        driver2.get(product_url)#mod-7

        try:
            wait2.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))  # Ensure page loads
            #product_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Scrape the product data
            name = driver2.find_element(By.CSS_SELECTOR, "h1[data-testid='product-name']")
            #names.append(name.text.strip())
            print('\n',name.text.strip())

            try:#mod-6
                brand_element = driver2.find_element(By.CSS_SELECTOR, "a[href]")
                brand = driver2.find_element(By.CSS_SELECTOR, ".typography__Brand-sc-1q7gvs8-2.fyoohd")
                #brand_names.append(brand.text.strip())
                brand_name = brand.text.strip()
                #brand_links.append(brand_element.get_attribute('href'))
                brand_link= brand_element.get_attribute('href')
            except:
                print('error at getting brand')
                print('product_url: ',product_url)
                brand_name = ''
                brand_link = ''

            try:#mod-6
                strain = driver2.find_element(By.CSS_SELECTOR, "span[data-testid='info-chip']")
                #strains.append(strain.text.strip())
                strain_text = strain.text.strip()
            except:
                print('error at getting strain')
                print('product_url: ',product_url)
                strain_text = ''

            try:#mod-6
                potencies = driver2.find_elements(By.CSS_SELECTOR, "span.info-chip__InfoChipText-sc-11n9ujc-0")
                # Extract text and remove anything before ':'
                potency_values = [p.text.split(":")[-1].strip() for p in potencies]
                # Join them as a single string (optional, useful for CSV)
                potency_text = potency_values[1]
                #potencys.append(potency_text)
            except:
                print('error at getting potency')
                print('product_url: ',product_url)
                potency_text = ''

            try:#mod-6
                #price = driver2.find_element(By.CSS_SELECTOR, "div.price__PriceText-sc-diymzm-2")#mod-3
                price = driver2.find_element(By.CSS_SELECTOR, 'div[class^="price__PriceText-sc-diymzm-2"]')#mod-3
                #prices.append(price.text.strip())
                price_text = price.text.strip()
                print('price: ',price.text.strip())
            except:
                print('error at getting price')
                print('product_url: ',product_url)
                price_text = ''

            try:#mod-6
                effect_elements = driver2.find_elements(By.CSS_SELECTOR, "span.effect-tile__Text-sc-1as4rkm-1")
                #effects.append(", ".join([e.text.strip() for e in effect_elements]))
                effect = ", ".join([e.text.strip() for e in effect_elements])
            except:
                print('error at getting effect')
                print('product_url: ',product_url)
                effect = ''
                
            product_data = {
                "name": name.text.strip(),
                "brand_name": brand_name,
                "brand_link": brand_link,
                "strain": strain_text,
                "potency": potency_text,
                "price": price_text,
                "effects": effect
            }#mod-5
            product_list.append(product_data)

        except Exception as e:
            print(f"Error scraping product details: {e}")
            print('product_url: ',product_url,'\n')

        #driver.close()
        #driver.switch_to.window(driver.window_handles[0])  # Switch back to main tab

    print(f"\nPage {current_page} scraped successfully.")

    # Click the next page button if available
    
    next_button = driver.find_element(By.CSS_SELECTOR, 'button[aria-label="go to next page"]')
    driver.execute_script('arguments[0].scrollIntoView({behavior: "smooth", block: "end"});', next_button)#mod-0
    next_button.click()
    current_page += 1
    time.sleep(5)  # Allow the next page to load
    
    if current_page > 2:  # temporary, for testing #mod-9
        break

driver.switch_to.default_content()#mod-1

# Check if data was scraped
if product_list:
    # Initialize DataFrame with the scraped data
    df = pd.DataFrame(product_list)  # Wrap the dict in a list
    
    # Save DataFrame to CSV (append mode if the file exists)
    df.to_csv("thriveil_products.csv", mode='a', header=not pd.iomon.file_exists("thriveil_products.csv"), index=False)
    print("Scraping completed. Data saved to 'thriveil_products.csv'.")
else:
    print("No data to save.")
    
# Save DataFrame to Excel
#df = pd.DataFrame(product_list)
#df.to_excel("thriveil_products.xlsx", index=False)
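
As a side note, pd.io.common.file_exists is a pandas-internal helper, so it may change between versions. A minimal sketch of the same header check using only the standard library; the placeholder row is only for illustration, and in practice product_list comes from the loop above.

import os
import pandas as pd

product_list = [{"name": "example", "price": "$10"}]  # placeholder; use the scraped rows from the loop above
df = pd.DataFrame(product_list)
df.to_csv("thriveil_products.csv", mode="a",
          header=not os.path.exists("thriveil_products.csv"), index=False)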
