I have an excel file with different sections, each one with its header, of which i provide its names (they also have a number attached to it on second column). On a first approach, i want to read and print on screen each section's cells. Here is the code I used:
section_headers = [ 'LIBROS', 'CAPÍTULOS DE LIBROS','ARTÍCULOS CIENTÍFICOS (ORIGINALES Y DE REVISIÓN)', 'ARTÍCULOS DE DIVULGACIÓN','PONENCIAS PRESENTADAS','CARTELES PRESENTADOS EN CONGRESOS', 'CURSOS IMPARTIDOS','SEMINARIOS IMPARTIDOS','PROGRAMAS DE RADIO Y TV','PODCASTS','SOFTWARE', 'PATENTES', 'MODELOS DE UTILIDAD', 'PROTOTIPO', 'PRÁCTICAS PROFESIONALES', 'SERVICIO SOCIAL', 'TESIS','OTROS (Ej. Manuales, procesos)']
def process_excel(file_path, section_headers):
df = pd.read_excel(file_path, skiprows=2)
sections = {}
current_section = None
section_data = []
for idx, row in df.iterrows():
if any(header in str(row[0]) for header in section_headers) and isinstance(row[1], (int, float)):
if current_section is not None:
sections[current_section] = pd.DataFrame(section_data)
current_section = row[0] # The first column is the section name
section_data = [] # Reset data for the new section
else:
# Add row data to current section
section_data.append(row)
if current_section is not None:
sections[current_section] = pd.DataFrame(section_data)
for section, data in sections.items():
print(f"\nSection: {section}")
print(data)
Now, when I call the function:
process_excel(file_path='wherever the file is", section_headers)
i obtain nothing. I am running it on google colab. I already charged pandas and openpyxl. Any ideas on why i don't see anything (not even a error message)?
I have an excel file with different sections, each one with its header, of which i provide its names (they also have a number attached to it on second column). On a first approach, i want to read and print on screen each section's cells. Here is the code I used:
section_headers = [ 'LIBROS', 'CAPÍTULOS DE LIBROS','ARTÍCULOS CIENTÍFICOS (ORIGINALES Y DE REVISIÓN)', 'ARTÍCULOS DE DIVULGACIÓN','PONENCIAS PRESENTADAS','CARTELES PRESENTADOS EN CONGRESOS', 'CURSOS IMPARTIDOS','SEMINARIOS IMPARTIDOS','PROGRAMAS DE RADIO Y TV','PODCASTS','SOFTWARE', 'PATENTES', 'MODELOS DE UTILIDAD', 'PROTOTIPO', 'PRÁCTICAS PROFESIONALES', 'SERVICIO SOCIAL', 'TESIS','OTROS (Ej. Manuales, procesos)']
def process_excel(file_path, section_headers):
df = pd.read_excel(file_path, skiprows=2)
sections = {}
current_section = None
section_data = []
for idx, row in df.iterrows():
if any(header in str(row[0]) for header in section_headers) and isinstance(row[1], (int, float)):
if current_section is not None:
sections[current_section] = pd.DataFrame(section_data)
current_section = row[0] # The first column is the section name
section_data = [] # Reset data for the new section
else:
# Add row data to current section
section_data.append(row)
if current_section is not None:
sections[current_section] = pd.DataFrame(section_data)
for section, data in sections.items():
print(f"\nSection: {section}")
print(data)
Now, when I call the function:
process_excel(file_path='wherever the file is", section_headers)
i obtain nothing. I am running it on google colab. I already charged pandas and openpyxl. Any ideas on why i don't see anything (not even a error message)?
Share Improve this question asked Nov 20, 2024 at 15:40 John DoeJohn Doe 1058 bronze badges 2 |2 Answers
Reset to default 0import pandas as pd
section_headers = [
'LIBROS', 'CAPÍTULOS DE LIBROS', 'ARTÍCULOS CIENTÍFICOS (ORIGINALES Y DE REVISIÓN)',
'ARTÍCULOS DE DIVULGACIÓN', 'PONENCIAS PRESENTADAS', 'CARTELES PRESENTADOS EN CONGRESOS',
'CURSOS IMPARTIDOS', 'SEMINARIOS IMPARTIDOS', 'PROGRAMAS DE RADIO Y TV', 'PODCASTS',
'SOFTWARE', 'PATENTES', 'MODELOS DE UTILIDAD', 'PROTOTIPO', 'PRÁCTICAS PROFESIONALES',
'SERVICIO SOCIAL', 'TESIS', 'OTROS (Ej. Manuales, procesos)'
]
def process_excel(file_path, section_headers):
# Read the Excel file, skipping unnecessary rows
df = pd.read_excel(file_path, skiprows=2)
# Initialize storage for sections
sections = {}
current_section = None
section_data = []
for idx, row in df.iterrows():
# Check if the row corresponds to a section header
if any(header in str(row[0]) for header in section_headers) and isinstance(row[1], (int, float)):
# Save the previous section's data
if current_section is not None and section_data:
sections[current_section] = pd.DataFrame(section_data, columns=df.columns)
# Start a new section
current_section = row[0]
section_data = [] # Reset for new section
else:
# Add the row to the current section's data
if not row.isnull().all(): # Skip completely empty rows
section_data.append(row.tolist())
# Save the last section
if current_section is not None and section_data:
sections[current_section] = pd.DataFrame(section_data, columns=df.columns)
# Print each section
for section, data in sections.items():
print(f"\nSection: {section}")
print(data)
# Example Usage
file_path = "your_excel_file.xlsx"
process_excel(file_path, section_headers)
df = pd.read_excel(file_path, skiprows=2)
for section in section headers:
print(f"\nSection: {section}")
# Remove print(df.filter(like=section, axis=0)
# Update requires data frame column name Sections for section headers
#Can add in excel row 2 and change from skiprows=2 to skiprows =1
filter = df['Sections'].str.contains(section)
print(df[filter])
This should simplify and achieve same goal
def process_excel(file_path, section_headers):
looks incorrect. – JonSG Commented Nov 20, 2024 at 17:33