VBA guy trying to learn Python. I am about 90% through this but have been unable to work out the last stage.
- The code scrapes a table of data.
- After data manipulation, every second row of text contains a header.
- The odd rows contain a comma-delimited list of maps (after a regex substitution), so these would be split out into as many rows as there are delimited entries.
- The first row would be excluded for being too short in terms of characters (I tried but failed to completely remove empty strings).
I am stuck on how to write the headers and data to my 2D array. TIA
import requests
import re
from bs4 import BeautifulSoup

url = 'https://modernarmor.worldoftanks/en/cms/guides/map-labels-tiers-eras/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

matrix = [[]]
table = soup.find('table')
table_body = table.find('tbody')
rows = table.find_all('tr')
for row in rows:
    cols = row.find_all(['td', 'th'])
    coldata = [ele.text.strip() for ele in cols]
    coldata = str(coldata).replace('[MERGE]', "")
    # failed attempt to remove empty strings from each line
    #coldata = filter(None, coldata)
    x = re.sub(r'([a-z])([A-Z])', r'\1,\2', coldata).strip()
    lst = x.split(",")
    stringCount = len(lst)
    print(stringCount)
    # would be cleaner to remove empty strings and write odd lines as headers, even lines as data
    if stringCount == 2:
        print("header" + x)
        # add headers here
    elif stringCount > 2:
        print(lst)
        # list data here
    else:
        print("failed to remove single empty string")
#data.append([ele for ele in coldata if ele])
#print(data)
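For the missing last stage, the pairing logic on its own can be sketched like this. This is only a sketch of the header/data bookkeeping, independent of the scraping; the `lines` sample data and the literal "header" marker are made up to stand in for the rows the loop above produces:

```python
# Sketch: pair alternating header/data lines into a 2D structure,
# dropping empty strings along the way. `lines` is hypothetical
# sample data standing in for the scraped rows.
lines = [
    "",                        # short/empty first row, to be skipped
    "header,Battle Tiers 1-2",
    "Mines,Liberty Falls",
    "header,Battle Tiers 3-4",
    "Himmelsdorf,Mines,Cliff",
]

matrix = []            # rows of [header, map]
current_header = None
for line in lines:
    parts = [p for p in line.split(",") if p]  # remove empty strings
    if not parts:                              # skip the empty first row
        continue
    if parts[0] == "header":
        current_header = parts[1]
    else:
        for map_name in parts:
            matrix.append([current_header, map_name])

print(matrix)
```

Each map gets its own row paired with the most recent header, which matches the "split out for as many rows as there are delimited entries" requirement.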
edited Feb 17 at 10:28 by Christoph Rackwitz
asked Feb 17 at 9:15 by Bob
- Please clarify your intention by adding the required output (just an extract is fine) to your question. – Adon Bilivit, Feb 17 at 9:19
- Appreciate Adon Bilivit being able to solve it as asked. – Bob, Feb 17 at 13:02
1 Answer
It's not entirely clear (to me) what you're trying to achieve.
I think you're trying to scrape a table and consolidate its contents into a header/detail arrangement. If that's the case then a Python dictionary is ideal.
You may find requests and BeautifulSoup unsuitable given that the website you're scraping is heavily reliant on JavaScript.
selenium might be a better option.
Something like this:
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import json  # only used for data presentation

URL = "https://modernarmor.worldoftanks/en/cms/guides/map-labels-tiers-eras/"

def reject_cookies(driver):
    wait = WebDriverWait(driver, 5)
    ec = EC.element_to_be_clickable
    sel = (By.ID, "onetrust-reject-all-handler")
    try:
        wait.until(ec(sel)).click()
    except Exception:
        pass

def gen_tr(driver):
    result = dict()
    wait = WebDriverWait(driver, 5)
    ec = EC.presence_of_all_elements_located
    sel = (By.CSS_SELECTOR, "tr.news_post-widget-table-tr")
    for i, tr in enumerate(wait.until(ec(sel))):
        if i % 2 == 0:
            key = tr.text.strip()
        else:
            result[key] = tr.text.split("\n")
    return result

with Chrome() as driver:
    driver.get(URL)
    reject_cookies(driver)
    result = gen_tr(driver)
    print(json.dumps(result, indent=2))
Output (partial):
{
"World War II: Battle Tiers 1-2": [
"Mines",
"Liberty Falls"
],
"World War II: Battle Tiers 3-4": [
"Himmelsdorf",
"Mines",
"Cliff",
"Abbey",
"Mountain Pass",
"Port",
"Ghost Town",
"Pilsen",
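If you still want the 2D-array layout from the original question, the dictionary flattens into one easily. A sketch, using a small stand-in dict shaped like the scraper's result above:

```python
# Flatten a {header: [maps]} dict into rows of [header, map],
# matching the 2D-array layout the question asked for.
# `result` is a stand-in for the dict returned by gen_tr().
result = {
    "World War II: Battle Tiers 1-2": ["Mines", "Liberty Falls"],
    "World War II: Battle Tiers 3-4": ["Himmelsdorf", "Mines", "Cliff"],
}

rows = [[header, map_name]
        for header, maps in result.items()
        for map_name in maps]

print(rows)
```

Each map becomes its own row alongside its header, so maps that appear under several tiers (like "Mines" here) show up once per tier.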