I'm encountering an issue on my development server, which is deployed on an EC2 c5.xlarge instance and running a parallel web scraping process. The scraping tasks frequently fail with the following error:
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
I'm looking for possible ways to handle this error more gracefully—whether by implementing retries with exponential backoff, adjusting request concurrency, or optimizing network configurations on my EC2 instance.
Screenshot of logs from server
def scrape_services(master_list, excluded_list, master_list_filename, request_id, max_workers=3):
    """Scrape every row of ``master_list`` in a thread pool and save the results.

    Args:
        master_list: Table of items to look up. It must support ``len()`` and
            ``iterrows()`` (presumably a pandas DataFrame -- confirm against
            the caller).
        excluded_list: Passed through unchanged to ``process_row``.
        master_list_filename: Passed to ``save_results_to_csv`` together with
            the collected results.
        request_id: Passed through unchanged to ``process_row``.
        max_workers: Number of worker threads in the pool.

    Returns:
        The ``jsonify`` response holding a completion message and the CSV path
        (``None`` when there were no results to save).

    Raises:
        Exception: Anything a worker raised is re-raised by ``Future.result()``.
    """
    results = []
    output_csv_path = None
    total_items = len(master_list)

    # NOTE(review): ``queue_status`` is not a parameter of this function; it is
    # resolved from an enclosing/module scope that is not visible here.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Keep the futures in a list, in submission order. Reading results
        # from the set returned by ``concurrent.futures.wait`` would yield
        # them in arbitrary order, so the saved rows would no longer line up
        # with ``master_list``.
        futures = [
            executor.submit(
                process_row, index, row, request_id, excluded_list, queue_status, total_items
            )
            for index, row in master_list.iterrows()
        ]
        # ``result()`` blocks until the corresponding task has finished, so
        # this also waits for every task to complete.
        results = [future.result() for future in futures]

    # Save results
    if results:
        output_csv_path = save_results_to_csv(results, master_list_filename)

    return jsonify({"message": "Scraping completed", "csv_file": output_csv_path})
def process_row(index, row, request_id, excluded_list, queue_status, total_items):
    """Worker function that processes a single item.

    Builds a search keyword from the row, runs ``scrape_google_search`` with a
    freshly created Chrome driver, and returns a result record. Failures while
    starting the driver, scraping, or closing the driver are reported in the
    record (or logged) instead of escaping the worker, so one bad item does not
    abort the whole batch.

    Args:
        index: Zero-based position of the row; used only for progress output.
        row: Mapping-like row that supports ``get(key, default)``.
        request_id: Not used inside this function.
        excluded_list: Passed through unchanged to ``scrape_google_search``.
        queue_status: Not used inside this function.
        total_items: Total number of rows; used only for progress output.

    Returns:
        A dict with the four identifying columns plus ``url`` and ``top_url``.
        On failure both are ``None`` and an ``error`` key holds the message.
    """
    # Build everything the error handler needs *before* the try block so the
    # handler can never hit an unbound local.
    record = {
        "業種_検索用_": row.get("業種_検索用_", ""),
        "顧客名": row.get("顧客名", ""),
        "住所": row.get("住所", ""),
        "電話番号": row.get("電話番号", ""),
    }
    keyword = f"{row.get('業種_検索用_', '')} {row.get('顧客名', '')} {row.get('住所', '')} {row.get('電話番号', '')}".strip()

    driver = None
    try:
        # Driver start-up is inside the try so that a failure here produces an
        # error record rather than an exception from the worker.
        with driver_lock:  # Ensure only one thread initializes the driver at a time
            driver = get_chrome_driver()

        # Add sleep to prevent Google from blocking
        time.sleep(random.uniform(2, 5))

        print(f"[{index+1}/{total_items}] Searching for: {keyword}")

        # Introduce a random delay to prevent being blocked
        time.sleep(random.uniform(2, 5))  # Wait between 2 to 5 seconds

        url, top_url = scrape_google_search(driver, keyword, excluded_list)
        return {**record, "url": url, "top_url": top_url}
    except Exception as e:
        print(f"Error processing {keyword}: {e}")
        return {**record, "url": None, "top_url": None, "error": str(e)}
    finally:
        # ``driver`` is still None when start-up itself failed.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                # An exception raised in ``finally`` would replace the record
                # being returned, so a failed shutdown is only logged.
                print(f"Error closing driver for {keyword}: {quit_error}")