Problem Description: I tried scraping a website using Selenium, but it was unable to detect the web elements. I need to collect product information from 6 to 10 different websites on a daily basis. This is the website: https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products.
Code Snippet:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
import time  # To manage delays

# Set up Selenium WebDriver (using Microsoft Edge)
edge_options = Options()

# Initialize WebDriver
service = Service("C:\\Users\\iyush\\Documents\\VS Code\\Selenium\\msedgedriver.exe")
driver = webdriver.Edge(service=service, options=edge_options)

# URL of the website to scrape
url = "https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products"
driver.get(url)

# WebDriver Wait
wait = WebDriverWait(driver, 30)

# Handle ad close button
try:
    close_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "terpli-close")))
    close_button.click()
except TimeoutException:
    print("Ad close button not found. Continuing...")

# Handle cookie consent
try:
    accept_button = wait.until(EC.element_to_be_clickable((By.ID, "wt-cli-accept-all-btn")))
    accept_button.click()
except TimeoutException:
    print("Cookie consent button not found. Continuing...")

# Initialize storage
product_list = []
current_page = 1

# Lists to store scraped data
names = []
brand_names = []
brand_links = []
strains = []
potencys = []
prices = []
effects = []

while True:  # Loop through pages
    products = []

    # Get all product elements
    product_elements = driver.find_elements(By.CLASS_NAME, "full-card_Wrapper-sc-11z5u35-0")
    for product in product_elements:
        try:
            product_url = product.find_element(By.TAG_NAME, "a").get_attribute("href")
            products.append(product_url)
        except Exception as e:
            print(f"Error extracting product link: {e}")
            continue

    # Open each product in a new tab, scrape details, then close tab
    for product_url in products:
        driver.execute_script(f"window.open('{product_url}', '_blank');")
        driver.switch_to.window(driver.window_handles[-1])  # Switch to new tab

        try:
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))  # Ensure page loads
            product_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Scrape the product data
            name = driver.find_element(By.CSS_SELECTOR, "h1[data-testid='product-name']")
            names.append(name.text.strip())
            brand_element = driver.find_element(By.CSS_SELECTOR, "a[href]")
            brand = driver.find_element(By.CSS_SELECTOR, ".typography__Brand-sc-1q7gvs8-2.fyoohd")
            brand_names.append(brand.text.strip())
            brand_links.append(brand_element.get_attribute('href'))
            strain = driver.find_element(By.CSS_SELECTOR, "span[data-testid='info-chip']")
            strains.append(strain.text.strip())
            potencies = driver.find_elements(By.CSS_SELECTOR, "span.info-chip__InfoChipText-sc-11n9ujc-0")
            # Extract text and remove anything before ':'
            potency_values = [p.text.split(":")[-1].strip() for p in potencies]
            # Join them as a single string (optional, useful for CSV)
            potency_text = potency_values[1]
            potencys.append(potency_text)
            price = driver.find_element(By.CSS_SELECTOR, "div.price__PriceText-sc-diymzm-2")
            prices.append(price.text.strip())
            effect_elements = driver.find_elements(By.CSS_SELECTOR, "span.effect-tile__Text-sc-1as4rkm-1")
            effects.append(", ".join([e.text.strip() for e in effect_elements]))

            product_data = {
                "name": names,
                "brand_name": brand_names,
                "brand_link": brand_links,
                "strain": strains,
                "potency": potencys,
                "price": prices,
                "effects": effects
            }
            product_list.append(product_data)
        except Exception as e:
            print(f"Error scraping product details: {e}")

        driver.close()
        driver.switch_to.window(driver.window_handles[0])  # Switch back to main tab

    print(f"Page {current_page} scraped successfully.")

    # Click the next page button if available
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, 'button[aria-label*="next page"]')
        next_button.click()
        current_page += 1
        time.sleep(5)  # Allow the next page to load
    except NoSuchElementException:
        print("No more pages found. Exiting loop.")
        break

# Check if data was scraped
if product_list:
    # Initialize DataFrame with the scraped data
    df = pd.DataFrame(product_list)  # Wrap the dict in a list
    # Save DataFrame to CSV (append mode if the file exists)
    df.to_csv("thriveil_products.csv", mode='a', header=not pd.io.common.file_exists("thriveil_products.csv"), index=False)
    print("Scraping completed. Data saved to 'thriveil_products.csv'.")
else:
    print("No data to save.")

# Save DataFrame to Excel
#df = pd.DataFrame(product_list)
#df.to_excel("thriveil_products.xlsx", index=False)
#print("Scraping completed. Data saved to 'thriveil_products.xlsx'.")

driver.quit()
- First of all, you should keep the scraping process as simple as possible. If it is possible to make a list of URLs and then open and scrape each one by one (as in this case), that's the way to go. Opening a new tab and then switching between tabs can cause errors. I also see you parse the HTML with BeautifulSoup but don't use it. Why? And what are the web elements you think Selenium is unable to detect? – Artem Fedorov, Feb 7 at 18:03
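A minimal sketch of the simpler flow the comment suggests: collect the product URLs first, then visit each one sequentially with the same driver, with no extra tabs. The CSS selector here is a placeholder, not taken from the site.

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()
driver.get("https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products")

# 1) Collect all product URLs first (selector is illustrative)
product_urls = [a.get_attribute("href")
                for a in driver.find_elements(By.CSS_SELECTOR, "a.product-link")]

# 2) Visit each URL one by one with the same driver
for product_url in product_urls:
    driver.get(product_url)
    # ... scrape the detail page here ...

driver.quit()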
1 Answer
The following modifications have been made. For reference, the IDs of the modifications (#mod-1, etc.) are also marked at the corresponding points in the code.
mod-0: Added "scrollIntoView" for the paging button.
mod-1: Added "switch_to" for the iframe.
mod-2: Searching for the product URLs across the entire DOM returned an empty list. Narrowing the search to the scope of the "main" element solved this problem.
mod-3: The first product raised an "element not found" exception because the class attribute didn't match.
mod-4: Separated the "ad close button" handling into a function.
mod-5: In the resulting CSV, every row contained the data of the first product on the first page. This happened because the accumulating lists (names, brand_names, and so on) were used as the values of "product_data"; see the short sketch after this list.
mod-6: Wrapped each property lookup in its own "try" block so that the remaining properties are not skipped when one of them fails. Some product attributes still fail to be scraped; you can inspect the HTML structure of those products.
mod-7: Instead of "driver.switch_to.window", a second driver is used for the details of each product. This is simpler because there is no need to manage the timing of switching in and out.
mod-8: Used Firefox as the browser. Switch back to Edge if necessary.
mod-9: For testing, page iteration is limited to two pages.
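As a focused illustration of mod-5 (with made-up field values, not data from the site), this shows the difference between storing the accumulating lists as dictionary values and appending one flat dictionary per product; pandas builds one row per dictionary only in the second form.

import pandas as pd

# Buggy shape (mod-5): the same ever-growing lists end up as the values of
# every appended dict, so each "row" carries the whole columns again.
names, prices = ["Product A"], ["$10"]
rows_buggy = [{"name": names, "price": prices}]

# Fixed shape: one flat dict per product, appended once per product.
rows_fixed = [
    {"name": "Product A", "price": "$10"},
    {"name": "Product B", "price": "$12"},
]

df = pd.DataFrame(rows_fixed)  # one row per product, as expected

The full modified script follows.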
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
#from bs4 import BeautifulSoup
import pandas as pd
import time  # To manage delays

# Set up Selenium WebDriver (using Microsoft Edge)
#edge_options = Options()

# Initialize WebDriver
#service = Service("C:\\Users\\iyush\\Documents\\VS Code\\Selenium\\msedgedriver.exe")
#driver = webdriver.Edge(service=service, options=edge_options)
driver = webdriver.Firefox()  # mod-8
driver.implicitly_wait(1)

# WebDriver Wait
wait = WebDriverWait(driver, 30)

# Open a 2nd browser window for the product detail pages
driver2 = webdriver.Firefox()  # mod-7
driver2.implicitly_wait(1)
wait2 = WebDriverWait(driver2, 30)

# URL of the website to scrape
url = "https://thriveil.com/casey-rec-menu/?dtche%5Bpath%5D=products"
driver.get(url)

def ad_close():  # mod-4
    # Handle ad close button
    try:
        close_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CLASS_NAME, "terpli-close")))
        close_button.click()
    except TimeoutException:
        print("Ad close button not found. Continuing...")

ad_close()  # mod-4

# Handle cookie consent
try:
    accept_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "wt-cli-accept-all-btn")))
    accept_button.click()
except TimeoutException:
    print("Cookie consent button not found. Continuing...")

# Initialize storage
product_list = []
current_page = 1

# Lists to store scraped data
# mod-5
"""
names = []
brand_names = []
brand_links = []
strains = []
potencys = []
prices = []
effects = []
"""

#iframe = driver.find_elements(By.CSS_SELECTOR, "#dutchie--embed__iframe")  # mod-1
#iframe = driver.find_elements(By.CSS_SELECTOR, "iframe")  # mod-1
#driver.switch_to.frame(iframe)  # failed
driver.switch_to.frame("dutchie--embed__iframe")  # mod-1

while True:  # Loop through pages
    products = []
    ad_close()  # mod-4

    # Get all product elements
    main = driver.find_element(By.CSS_SELECTOR, "#main-content")  # mod-2
    #product_elements = driver.find_elements(By.CLASS_NAME, "full-card_Wrapper-sc-11z5u35-0")
    #product_elements = driver.find_elements(By.CLASS_NAME, "full-card__Wrapper-sc-11z5u35-0 gwMuEZ")
    product_elements = main.find_elements(By.TAG_NAME, "a")  # mod-2
    print('len(product_elements): ', len(product_elements), '\n')

    for product in product_elements:
        product_url = product.get_attribute("href")
        #print(product_url)
        products.append(product_url)

    # Open each product in the 2nd browser window and scrape its details
    for product_url in products:
        #driver.execute_script(f"window.open('{product_url}', '_blank');")
        #driver.switch_to.window(driver.window_handles[-1])  # Switch to new tab
        driver2.get(product_url)  # mod-7
        try:
            wait2.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))  # Ensure page loads
            #product_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Scrape the product data
            name = driver2.find_element(By.CSS_SELECTOR, "h1[data-testid='product-name']")
            #names.append(name.text.strip())
            print('\n', name.text.strip())

            try:  # mod-6
                brand_element = driver2.find_element(By.CSS_SELECTOR, "a[href]")
                brand = driver2.find_element(By.CSS_SELECTOR, ".typography__Brand-sc-1q7gvs8-2.fyoohd")
                #brand_names.append(brand.text.strip())
                brand_name = brand.text.strip()
                #brand_links.append(brand_element.get_attribute('href'))
                brand_link = brand_element.get_attribute('href')
            except:
                print('error at getting brand')
                print('product_url: ', product_url)
                brand_name = ''
                brand_link = ''

            try:  # mod-6
                strain = driver2.find_element(By.CSS_SELECTOR, "span[data-testid='info-chip']")
                #strains.append(strain.text.strip())
                strain_text = strain.text.strip()
            except:
                print('error at getting strain')
                print('product_url: ', product_url)
                strain_text = ''

            try:  # mod-6
                potencies = driver2.find_elements(By.CSS_SELECTOR, "span.info-chip__InfoChipText-sc-11n9ujc-0")
                # Extract text and remove anything before ':'
                potency_values = [p.text.split(":")[-1].strip() for p in potencies]
                # Join them as a single string (optional, useful for CSV)
                potency_text = potency_values[1]
                #potencys.append(potency_text)
            except:
                print('error at getting potency')
                print('product_url: ', product_url)
                potency_text = ''

            try:  # mod-6
                #price = driver2.find_element(By.CSS_SELECTOR, "div.price__PriceText-sc-diymzm-2")  # mod-3
                price = driver2.find_element(By.CSS_SELECTOR, 'div[class^="price__PriceText-sc-diymzm-2"]')  # mod-3
                #prices.append(price.text.strip())
                price_text = price.text.strip()
                print('price: ', price.text.strip())
            except:
                print('error at getting price')
                print('product_url: ', product_url)
                price_text = ''

            try:  # mod-6
                effect_elements = driver2.find_elements(By.CSS_SELECTOR, "span.effect-tile__Text-sc-1as4rkm-1")
                #effects.append(", ".join([e.text.strip() for e in effect_elements]))
                effect = ", ".join([e.text.strip() for e in effect_elements])
            except:
                print('error at getting effect')
                print('product_url: ', product_url)
                effect = ''

            product_data = {
                "name": name.text.strip(),
                "brand_name": brand_name,
                "brand_link": brand_link,
                "strain": strain_text,
                "potency": potency_text,
                "price": price_text,
                "effects": effect
            }  # mod-5
            product_list.append(product_data)
        except Exception as e:
            print(f"Error scraping product details: {e}")
            print('product_url: ', product_url, '\n')

        #driver.close()
        #driver.switch_to.window(driver.window_handles[0])  # Switch back to main tab

    print(f"\nPage {current_page} scraped successfully.")

    # Click the next page button if available
    next_button = driver.find_element(By.CSS_SELECTOR, 'button[aria-label="go to next page"]')
    driver.execute_script('arguments[0].scrollIntoView({behavior: "smooth", block: "end"});', next_button)  # mod-0
    next_button.click()
    current_page += 1
    time.sleep(5)  # Allow the next page to load
    if current_page > 2:  # temporary, for testing  # mod-9
        break

driver.switch_to.default_content()  # mod-1

# Check if data was scraped
if product_list:
    # Initialize DataFrame with the scraped data
    df = pd.DataFrame(product_list)
    # Save DataFrame to CSV (append mode if the file exists)
    df.to_csv("thriveil_products.csv", mode='a', header=not pd.io.common.file_exists("thriveil_products.csv"), index=False)
    print("Scraping completed. Data saved to 'thriveil_products.csv'.")
else:
    print("No data to save.")

# Save DataFrame to Excel
#df = pd.DataFrame(product_list)
#df.to_excel("thriveil_products.xlsx", index=False)
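One note on the CSV append at the end: pd.io.common.file_exists is an internal pandas helper, so a more conventional header check uses os.path.exists. A minimal sketch, with a stand-in list in place of product_list:

import os
import pandas as pd

csv_path = "thriveil_products.csv"
rows = [{"name": "Product A", "price": "$10"}]  # stand-in for product_list

df = pd.DataFrame(rows)
# Write the header only on the first run, when the file does not exist yet
df.to_csv(csv_path, mode='a', header=not os.path.exists(csv_path), index=False)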