I'm trying to get the data from a simple table on the following website (https://bvmf.bmfbovespa.com.br/clube-de-investimento/clube-de-investimento.aspx?Idioma=pt-br). I was able to get the data from the first page, but the pagination isn't reflected in the URL, so I couldn't reach the other pages. I did find the buttons at the bottom of the page ("ProximoPaginacao" and "MeioPaginacao"), but I couldn't work out how to drive them. Any ideas?
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_table_data(url, table_id):
    try:
        response = requests.get(url, verify=False)
        response.raise_for_status()
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')
        table = soup.find('table', id=table_id)
        if not table:
            print(f"Table with ID '{table_id}' not found.")
            return None
        # Extract header row
        header_row = [th.get_text(strip=True) for th in table.find_all('th')]
        # Extract data rows
        data_rows = []
        for row in table.find('tbody').find_all('tr'):
            data_rows.append([td.get_text(strip=True) for td in row.find_all('td')])
        # Create DataFrame
        df = pd.DataFrame(data_rows, columns=header_row)
        return df
    except requests.exceptions.RequestException as e:
        print(f"Error during requests: {e}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Example usage
url = "https://bvmf.bmfbovespa.com.br/clube-de-investimento/clube-de-investimento.aspx?Idioma=pt-br"  # Replace with the actual URL
table_id = "ctl00_contentPlaceHolderConteudo_grdAtivo_ctl01"  # Replace with the actual table ID
table_data = extract_table_data(url, table_id)
if table_data is not None:
    print(table_data)
- BeautifulSoup can't handle dynamic pages. The right way of doing this would be getting API access. I know the website you're trying to access, and they have an API for developers (if you are working for a company you may get access to it). However, if you are not, you could try using selenium. But be aware that B3 could block your IP if they detect that you are a bot. I'd advise testing your code on other pages before going to their website. – Iran Ribeiro Commented Jan 20 at 19:37
- I've discussed this with B3, since I do have an open chat with them, and due to the old layout used for this specific data they couldn't share it with me in an easy way. I suppose that even in up2data they may not have an endpoint for this particular archive. I'm not a heavy user of selenium, but I'm going to try something different. Many thanks. – André F Commented Jan 20 at 19:46
- I see. Feel free to contact me if you need any help with selenium. I had to use it in my job to automate some tasks because the website we need to access doesn't have an API yet. I struggled a lot at the beginning, but now we have something that works. – Iran Ribeiro Commented Jan 20 at 22:47
2 Answers
As mentioned in the comments, the content is dynamically reloaded. To access it via requests you would have to replicate that mechanism, but it is easier to do it via an official API or selenium.
Here is an approach that loads the page initially and keeps paging as long as a next button is available. At the end, the tables are concatenated into a final dataframe:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException

driver = webdriver.Chrome()
url = 'https://bvmf.bmfbovespa.com.br/clube-de-investimento/clube-de-investimento.aspx?Idioma=pt-br'
driver.get(url)

data = []
while True:
    try:
        # Make sure the "Administradores" tab is active before reading the table
        WebDriverWait(driver, 30).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[id="ctl00_contentPlaceHolderConteudo_tabIbovespa_TabAdmnistradores"]'))
        ).click()
        table = pd.read_html(driver.page_source)[0]
        data.append(table)
        # Trigger the postback behind the "next page" button
        next_button = WebDriverWait(driver, 30).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'input[id="ctl00_contentPlaceHolderConteudo_grdAtivo_ctl01_ctl03_ctl01_ctl00_lnkNext"]'))
        )
        driver.execute_script("arguments[0].click();", next_button)
    except TimeoutException:
        print("no next page element")
        break
    except StaleElementReferenceException:
        print("DOM changed, trying again")
        continue
driver.close()

result = pd.concat(data, ignore_index=True)
# Drop the pager rows that pd.read_html picks up along with the data
result[(~result['Administrador'].str.endswith('...')) & (~result['Administrador'].str.startswith('...'))]
|  | Clube de Investimento | Administrador | CNPJ | Nº Registro | Registrado em |
|---|---|---|---|---|---|
| 1 | A40 Clube de Investimento | XP INVESTIMENTOS CCTVM S/A | 53.086.076/0001-48 | 9058 | 19/10/2023 |
| 2 | Abaetê Clube de Investimento | XP INVESTIMENTOS CCTVM S/A | 52.143.244/0001-27 | 9006 | 04/05/2023 |
| ... | ... | ... | ... | ... | ... |
| 4968 | Zodiacus Clube de Investimentos | BTG PACTUAL SERVICOS FINANCEIROS S/A DTVM | 43.193.144/0001-14 | 8742 | 11/08/2021 |
| 4969 | ZTA CLUBE DE INVESTIMENTO | GUIDE INVESTIMENTOS S.A. CV | 58.848.525/0001-61 | 9171 | 17/10/2024 |
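Since the result runs to nearly 5,000 rows, the loop visits a lot of pages, and you may not want a visible browser window open for the whole run. A minimal sketch of the same setup with headless Chrome (assuming a recent Selenium 4 / Chrome build; the rest of the loop stays unchanged):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless=new')  # run Chrome without opening a window
driver = webdriver.Chrome(options=options)
# ... same scraping loop as above ...
driver.quit()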
You can get all the pages using requests & BeautifulSoup, without selenium:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_all_tables(url):
    requests.packages.urllib3.disable_warnings()
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36',
    }
    data = {}
    tables = []
    while True:
        print(f'Scraping table #{len(tables) + 1}')
        response = requests.post(url, headers=headers, data=data, verify=False)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.select_one('table#ctl00_contentPlaceHolderConteudo_grdAtivo_ctl01')
        header_row = [th.get_text(strip=True) for th in table.thead.select('th')]
        data_rows = [[td.get_text(strip=True) for td in tr.select('td')] for tr in table.tbody.select('tr')]
        df = pd.DataFrame(data_rows, columns=header_row)
        tables.append(df)
        next_button = table.tfoot.select_one('td.ProximoPaginacao > input')
        if not next_button:
            break
        # Replicate the ASP.NET postback: resend the current __VIEWSTATE and
        # point __EVENTTARGET at the "next page" control from the footer
        data['__VIEWSTATE'] = soup.select_one('input#__VIEWSTATE').get('value')
        data['__EVENTTARGET'] = next_button.get('name').replace('$', ':')
    return tables

url = 'https://bvmf.bmfbovespa.com.br/clube-de-investimento/clube-de-investimento.aspx?idioma=pt-br'
tables = extract_all_tables(url)
print(f'{len(tables) = }')
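If you want a single DataFrame rather than a list of per-page tables, you can concatenate them just as in the selenium answer; a minimal sketch (the output filename is only an example):

result = pd.concat(tables, ignore_index=True)
print(result.shape)
result.to_csv('clubes_de_investimento.csv', index=False)  # example output path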