I have written this web scraper that reads real estate listings, but for some weird reason it fails when I run it on my Ubuntu 24.04 server while it works on my Win 10 machine. I am on Python 3.12.3 on both Ubuntu and Win 10.
If I hit Ctrl-C to see where in the script it dies, it seems to hang on the HTTP part:
Both machines are on the same subnet, the U24 box can ping the Centris site, and curl gets a bunch of content back, so it's not networking related. UFW is off as well.
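For reference, the connectivity checks were roughly these (exact flags may differ; the point is that both succeed):

ping -c 3 www.centris.ca
curl -sI https://www.centris.ca/    # response headers come back, so DNS/TCP/TLS to the site work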
bloom@bloom:~/centris_scrap/webScrap_Selenium$ python3 main.py -tp 1
^CTraceback (most recent call last):
File "/home/bloom/centris_scrap/webScrap_Selenium/main.py", line 210, in <module>
driver = webdriver.Chrome(service=service, options=chrome_options)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3/dist-packages/selenium/webdriver/chrome/webdriver.py", line 45, in __init__
super().__init__(
File "/usr/lib/python3/dist-packages/selenium/webdriver/chromium/webdriver.py", line 61, in __init__
super().__init__(command_executor=executor, options=options)
File "/usr/lib/python3/dist-packages/selenium/webdriver/remote/webdriver.py", line 208, in __init__
self.start_session(capabilities)
File "/usr/lib/python3/dist-packages/selenium/webdriver/remote/webdriver.py", line 292, in start_session
response = self.execute(Command.NEW_SESSION, caps)["value"]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3/dist-packages/selenium/webdriver/remote/webdriver.py", line 345, in execute
response = self.command_executor.execute(driver_command, params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3/dist-packages/selenium/webdriver/remote/remote_connection.py", line 302, in execute
return self._request(command_info[0], url, body=data)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3/dist-packages/selenium/webdriver/remote/remote_connection.py", line 322, in _request
response = self._conn.request(method, url, body=body, headers=headers)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3/dist-packages/urllib3/_request_methods.py", line 118, in request
return self.request_encode_body(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3/dist-packages/urllib3/_request_methods.py", line 217, in request_encode_body
return self.urlopen(method, url, **extra_kw)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3/dist-packages/urllib3/poolmanager.py", line 443, in urlopen
response = conn.urlopen(method, u.request_uri, **kw)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 791, in urlopen
response = self._make_request(
^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 537, in _make_request
response = conn.getresponse()
^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3/dist-packages/urllib3/connection.py", line 461, in getresponse
httplib_response = super().getresponse()
^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/http/client.py", line 1428, in getresponse
response.begin()
File "/usr/lib/python3.12/http/client.py", line 331, in begin
version, status, reason = self._read_status()
^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/http/client.py", line 292, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/socket.py", line 707, in readinto
return self._sock.recv_into(b)
^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
EDIT: I have made a simplified version that doesn't go into the logic of the scrape itself; it seems to be stuck trying to instantiate the browser. The error above hasn't changed, but I have commented out everything from driver.get(url) onward.
The entire script can be found on GitHub or below:
import logging
from selenium import webdriver
from selenium.common import ElementClickInterceptedException, NoSuchElementException
import argparse
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import pandas as pd
from datetime import datetime
import uuid
import glob
import os
import os.path
from jinja2 import Environment, FileSystemLoader


def scrap_pages(driver):
    sqft = 0
    year = 0
    parking = 0
    listings = driver.find_elements(By.CLASS_NAME, 'description')
    # drop the trailing empty card that the results grid appends
    if listings[-1].text.split('\n')[0] == '':
        del listings[-1]
    for listing in listings:
        # placeholder values while the parsing logic is stubbed out
        price = 12333
        mls = '12333'
        prop_type = 'test'
        addr = 'test'
        city = 'test'
        sector = 'test'
        bedrooms = 1
        bathrooms = 1
        listing_item = {
            'mls': mls,
            'price': price,
            'address': addr,
            'property type': prop_type,
            'city': city,
            'bedrooms': bedrooms,
            'bathrooms': bathrooms,
            'sector': sector,
            'living sqft': sqft,
            'lot sqft': sqft,
            'year': year,
            'parking': parking
        }
        centris_list.append(listing_item)


if __name__ == '__main__':
    today = datetime.now()
    today = today.strftime("%Y%m%d")
    start_time = time.time()
    UUID = str(uuid.uuid4())[-4:]
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--skip_scrape", action='store_true', help="don't scrape the webpage")
    parser.add_argument("-tp", "--total_pages", type=int, help='number of pages to scrape')
    args = parser.parse_args()
    filename = f"centris_{today}_{UUID}_app.log"
    logging.basicConfig(
        filename=filename,
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M",
        force=True
    )
    logging.info("We are starting the app")
    logging.info(f"We are scraping : {args.total_pages}")
    if not args.skip_scrape:
        chrome_options = Options()
        chrome_options.add_experimental_option("detach", True)
        # headless, with a spoofed user agent to dodge anti-headless checks
        chrome_options.add_argument('--headless')
        user_agent_win = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.6943.53 Safari/537.36'
        user_agent_u24 = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.6943.53 Safari/537.36'
        driver_path_win = 'C:\\WebDriver\\bin\\chromedriver132\\chromedriver.exe'
        driver_path_u24 = r'/usr/lib/chromium-browser/chromedriver'
        # pick the Windows or Ubuntu driver depending on which path exists
        if os.path.exists(driver_path_win):
            user_agent = user_agent_win
        else:
            user_agent = user_agent_u24
        chrome_options.add_argument(f'user-agent={user_agent}')
        if os.path.exists(driver_path_win):
            service = ChromeService(executable_path=driver_path_win)
        else:
            service = ChromeService(executable_path=driver_path_u24)
        driver = webdriver.Chrome(service=service, options=chrome_options)
        centris_list = []
        url = 'https://www.centris.ca/en/properties~for-sale~brossard?view=Thumbnail'
        '''
        driver.get(url)
        time.sleep(5)
        driver.find_element(By.ID, 'didomi-notice-agree-button').click()
        total_pages = driver.find_element(By.CLASS_NAME, 'pager-current').text.split('/')[1].strip()
        if args.total_pages is not None:
            total = args.total_pages
        else:
            total = int(total_pages)
        for i in range(0, total):
            try:
                scrap_pages(driver)
                driver.find_element(By.CSS_SELECTOR, 'li.next > a').click()
                time.sleep(3)
            except ElementClickInterceptedException as initial_error:
                try:
                    # close the newsletter pop-up if it is blocking the next-page link
                    if len(driver.find_elements(By.XPATH, ".//div[@class='DialogInsightLightBoxCloseButton']")) > 0:
                        driver.find_element(By.XPATH, ".//div[@class='DialogInsightLightBoxCloseButton']").click()
                        time.sleep(3)
                        print('pop-up closed')
                    scrap_pages(driver)
                    driver.find_element(By.CSS_SELECTOR, 'li.next > a').click()
                    time.sleep(3)
                except NoSuchElementException:
                    raise initial_error
        '''
        driver.close()
    end_time = time.time()
    elapsed_seconds = end_time - start_time
    elapsed_time = elapsed_seconds / 60
    logging.info(f"execution time is {elapsed_time:.2f}")
1 Answer
Make sure that both Chrome and ChromeDriver versions match. Run the following on Ubuntu:
google-chrome --version
chromedriver --version
Try launching Chrome directly:
google-chrome --headless --no-sandbox --disable-dev-shm-usage --remote-debugging-port=9222
chromedriver --verbose
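If the versions line up and Chrome launches fine on its own, try passing the same flags through Selenium and capturing chromedriver's log so the hang leaves a trace. Here is a minimal sketch against your script (the --no-sandbox and --disable-dev-shm-usage flags are common fixes for headless Chrome hanging on Linux servers; the log_output parameter needs Selenium 4.13+, older releases use log_path instead):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService

options = Options()
options.add_argument('--headless=new')           # current headless mode; plain --headless works too
options.add_argument('--no-sandbox')             # often required when running as root or in a container
options.add_argument('--disable-dev-shm-usage')  # avoids /dev/shm exhaustion on small servers

# Make chromedriver log verbosely; if the browser never comes up, the log
# usually says why (version mismatch, missing libraries, sandbox errors).
service = ChromeService(
    executable_path='/usr/lib/chromium-browser/chromedriver',
    service_args=['--verbose'],
    log_output='chromedriver.log',
)

driver = webdriver.Chrome(service=service, options=options)
print(driver.capabilities['browserVersion'])     # confirms the session actually started
driver.quit()

If this also hangs, chromedriver.log should at least show how far startup got before it stalled.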