I received a homework assignment from an internship, and it goes something like this: By Monday, please make me a program in Python that will run in a Docker container and that, at start-up, will serve a web page where the following happens:
- take the main car brands from autovit.ro
- allow the selection of a brand and, after selection, display the minimum and maximum prices of that brand's cars on Autovit. You are allowed to get help from ChatGPT.
I am not a very skilled coder, but I tried my best with the help of ChatGPT. Below is the Python code for the scraper:
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

app = FastAPI()

# Initialise the Chrome driver with Selenium
def get_selenium_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")  # Run headless (no GUI); options.headless was removed in Selenium 4.10+
    options.add_argument("--no-sandbox")  # Commonly required when Chrome runs as root inside Docker
    options.add_argument("--disable-dev-shm-usage")  # Avoid /dev/shm crashes in containers
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver
# Fetch the car brands from autovit.ro
def get_car_brands():
    url = "https://www.autovit.ro/"
    driver = get_selenium_driver()
    driver.get(url)
    # Wait for the elements to load (adjust the delay as needed)
    time.sleep(5)
    # Grab the page source
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Try to find the brands using the right selector (this one needs updating)
    brands = []
    try:
        for brand in soup.find_all('a', class_="e4zbkti0 ooa-17iu0re"):  # Update with the correct selector
            brands.append(brand.text.strip())
    except Exception as e:
        print(f"Error while extracting brands: {e}")
    driver.quit()
    return brands
# Get the minimum and maximum prices for a brand
def get_price_range(brand):
    url = f"https://www.autovit.ro/{brand}"
    driver = get_selenium_driver()
    driver.get(url)
    # Wait for the elements to load (adjust the delay as needed)
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Find the prices using the right selector
    prices = []
    try:
        for price in soup.find_all('span', class_="ooa-ejqwvc"):  # Update with the correct selector
            try:
                price_value = int(price.text.replace("€", "").replace(",", "").strip())
                prices.append(price_value)
            except ValueError:
                pass  # Skip prices that cannot be converted to a number
    except Exception as e:
        print(f"Error while extracting prices: {e}")
    driver.quit()
    # Parenthesise the tuple: the original ternary applied only to max(prices),
    # so min(prices) still crashed on an empty list
    return (min(prices), max(prices)) if prices else (0, 0)
@app.get("/", response_class=HTMLResponse)
def read_root():
brands = get_car_brands()
brand_options = "".join([f"<option value='{brand}'>{brand}</option>" for brand in brands])
html_content = f"""
<html>
<body>
<h2>Select a car brand</h2>
<form action="/price_range" method="get">
<select name="brand">
{brand_options}
</select>
<input type="submit" value="Get Price Range">
</form>
</body>
</html>
"""
return html_content
@app.get("/price_range", response_class=HTMLResponse)
def price_range(brand: str = Form(...)):
min_price, max_price = get_price_range(brand)
html_content = f"""
<html>
<body>
<h2>Price Range for {brand}</h2>
<p>Min Price: €{min_price}</p>
<p>Max Price: €{max_price}</p>
</body>
</html>
"""
return html_content
The problem: the Docker container runs, I can open the browser at localhost:8000 and the HTML page loads, but I cannot find any car brand or price. I asked ChatGPT and also concluded that the classes are wrong, but I cannot find anything better. I tried with XPath as well, but with no success. Do you have any ideas?
Thanks!
I have to assume those classes are dynamically generated. You need to try a few things:
- make the scraper robust so it is not reliant on a fixed class (for example class_="ooa-ejqwvc")
- make sure the content is loaded - is it possible you are trying to find content that has not been rendered yet? (see the explicit-wait sketch after this list)
- the content is embedded in the HTML as JSON under a <script> tag; you could just parse that
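For the second point, here is a minimal sketch of an explicit wait with Selenium's WebDriverWait, replacing the fixed time.sleep(5) in your code; the CSS selector here is only a placeholder assumption that you would need to verify against the live page:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = get_selenium_driver()  # your existing helper
driver.get("https://www.autovit.ro/")
# Block for up to 15 seconds until at least one matching element exists,
# rather than sleeping a fixed 5 seconds and hoping the page has rendered.
# "a[href*='/autoturisme/']" is a guess at a stable attribute-based selector.
WebDriverWait(driver, 15).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a[href*='/autoturisme/']"))
)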
Here's the solution for the third point:
Code:
import requests
from bs4 import BeautifulSoup
import json

url = 'https://www.autovit.ro/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

# The Next.js payload with all the page data lives in this script tag
data = soup.find('script', {'id': '__NEXT_DATA__'})
jsonData = json.loads(data.text)

# 'offerOfTheDay' is a single ad, 'promotedAds' is a list of ads
offerOfTheDay = [jsonData['props']['pageProps']['offerOfTheDay']]
ads = jsonData['props']['pageProps']['promotedAds'] + offerOfTheDay

for each in ads:
    print(f'{each["price"]["value"]} - {each["title"]}')
Output:
112 000 - BMW X6 M M60i xDrive AT MHEV
129 710 - Ferrari California
23 990 - Mazda MX-30
20 990 - Renault Captur E-Tech Plug-In-Hybrid Intens
29 035 - Skoda Kodiaq 2.0 TDI 4X4 DSG Style
45 815 - Mercedes-Benz E 300e 9G-TRONIC
13 300 - Mercedes-Benz C 220 (BlueTEC) d 7G-TRONIC Avantgarde
32 000 - BMW Seria 5 540i xDrive Touring Aut. Sport Line
6 990 - Nissan Juke 1.2 DIG-T N-Connecta
6 700 - BMW Seria 1 120d
79 718,1 - Porsche Cayenne Coupe E-Hybrid Tiptronic S Platinum Edition
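From there, getting a minimum and maximum is a one-liner. A sketch, under the assumption that the price values come back as strings formatted like the output above (spaces as thousands separators, comma as decimal separator):
# Assumption: values look like '112 000' or '79 718,1', as in the output above
prices = [float(str(each['price']['value']).replace(' ', '').replace(',', '.'))
          for each in ads]
print(f'Min: {min(prices)}, Max: {max(prices)}')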
To get the full list, use the graphql endpoint. Keep in mind it pulls 32 records at a time out of 42k+, so it will take a little while.
Code:
import requests
import pandas as pd
import time
import random

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36'}

outputData = []
page = 1
max_retries = 10  # Max retry attempts per failed request
pullData = True

while pullData:
    url = f'https://www.autovit.ro/graphql?operationName=listingScreen&variables=%7B%22click2BuyExperimentId%22%3A%22%22%2C%22click2BuyExperimentVariant%22%3A%22%22%2C%22experiments%22%3A%5B%7B%22key%22%3A%22MCTA-1414%22%2C%22variant%22%3A%22a%22%7D%2C%7B%22key%22%3A%22MCTA-1617%22%2C%22variant%22%3A%22b%22%7D%2C%7B%22key%22%3A%22MCTA-1660%22%2C%22variant%22%3A%22a%22%7D%2C%7B%22key%22%3A%22MCTA-1661%22%2C%22variant%22%3A%22a%22%7D%2C%7B%22key%22%3A%22MCTA-1733%22%2C%22variant%22%3A%22a%22%7D%2C%7B%22key%22%3A%22MCTA-1715%22%2C%22variant%22%3A%22a%22%7D%2C%7B%22key%22%3A%22MCTA-1721%22%2C%22variant%22%3A%22a%22%7D%5D%2C%22filters%22%3A%5B%7B%22name%22%3A%22category_id%22%2C%22value%22%3A%2229%22%7D%5D%2C%22includeCepik%22%3Afalse%2C%22includeClick2Buy%22%3Afalse%2C%22includeFiltersCounters%22%3Afalse%2C%22includeNewPromotedAds%22%3Afalse%2C%22includePriceEvaluation%22%3Atrue%2C%22includePromotedAds%22%3Afalse%2C%22includeRatings%22%3Afalse%2C%22includeSortOptions%22%3Afalse%2C%22includeSuggestedFilters%22%3Afalse%2C%22maxAge%22%3A60%2C%22page%22%3A{page}%2C%22parameters%22%3A%5B%22make%22%2C%22vat%22%2C%22fuel_type%22%2C%22mileage%22%2C%22engine_capacity%22%2C%22engine_code%22%2C%22engine_power%22%2C%22first_registration_year%22%2C%22model%22%2C%22version%22%2C%22year%22%5D%2C%22promotedInput%22%3A%7B%7D%2C%22searchTerms%22%3A%5B%5D%7D&extensions=%7B%22persistedQuery%22%3A%7B%22sha256Hash%22%3A%222cd92d3c6edae023213334eb68d1fc7804216d8decd1b4ee89842c7470aa3ea6%22%2C%22version%22%3A1%7D%7D'
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # Raise an error for bad status
            data = response.json()
            ads = data['data']['advertSearch']['edges']
            totalCount = data['data']['advertSearch']['totalCount']
            for each in ads:
                temp_data = {
                    'id': each['node']['id'],
                    'value': each['node']['price']['amount']['value']
                }
                for i in each['node']['parameters']:
                    temp_data[i['key']] = i['value']
                outputData.append(temp_data)
            print(f'Collected {len(outputData)} of {totalCount}')
            # Stop when we reach total count
            if len(outputData) >= totalCount:
                pullData = False
                break
            # Sleep every 100 pages to avoid rate limits
            if page % 100 == 0:
                sleep_time = random.randint(5, 20)
                print(f'Pausing for {sleep_time} seconds...')
                time.sleep(sleep_time)
            page += 1
            break  # Exit retry loop on success
        except (requests.RequestException, KeyError, ValueError) as e:
            retries += 1
            wait_time = 2 ** retries  # Exponential backoff (2, 4, 8, 16... seconds)
            print(f'Error fetching page {page}: {e}. Retrying in {wait_time} seconds...')
            time.sleep(wait_time)
    else:
        print(f'Failed to fetch page {page} after {max_retries} retries. Skipping...')
        page += 1  # Move on, so the outer loop does not retry the same page forever

output = pd.DataFrame(outputData)
output = output.drop_duplicates()
Output:
print(output.head().to_string())
id value make vat fuel_type mileage engine_capacity engine_power model version year
0 7055969127 9790 volkswagen 1 diesel 247477 1968 140 tiguan ver-2-0-tdi-dpf-bluemotion-technology-team 2012
1 7054386352 29600 audi NaN diesel 165000 1968 190 a5 ver-sportback-2-0-tdi-clean-diesel-quattro-s-tronic 2017
2 7055926560 19999 mercedes-benz 1 diesel 212565 1461 116 cla ver-180-d-shooting-brake-7g--dct-progressive 2019
3 7055212106 35998 bmw 1 diesel 148587 1995 190 x4 ver-xdrive20d-aut- 2021
4 7049240606 88989 mercedes-benz 1 petrol 19900 2999 435 gle ver-amg-53-mhev-4maticplus 2021
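Since the original task only needs the minimum and maximum price per brand, once the DataFrame is built the answer falls out of a groupby. A sketch against the columns shown above:
# Ensure prices are numeric (the API may return them as strings)
output['value'] = pd.to_numeric(output['value'], errors='coerce')
# Min and max price per brand, using the 'make' and 'value' columns
price_range = output.groupby('make')['value'].agg(['min', 'max'])
print(price_range)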