I am conducting research on p-hacking, which requires accurately extracting tables from published academic papers. I have downloaded a large number of PDF files for this purpose.
So far, I have tried several approaches to extract the tables. Unfortunately, Python packages like fitz and camelot are unable to directly extract the tables from these PDFs. As a workaround, I used a YOLO-based layout detection model to identify the locations of each table. While this approach allows me to extract the text content of the tables, I am losing the structural information (e.g., the alignment of rows and columns), and the OCR model I’m using is not always accurate.
Could you suggest better methods or tools to improve the extraction process, ensuring both the content and structure of the tables are preserved?
P.S. I apologize, but due to copyright restrictions, I am unable to upload the PDF files I am working on.
import json
import os
import fitz
def get_tables_loc(layout_json: dict) -> list:
    """Collect (page_index, bbox) pairs for every usable table in the layout JSON.

    Args:
        layout_json: Parsed layout JSON; expected to hold a ``pdf_info`` list
            with one entry per page, each optionally carrying a ``tables`` list.

    Returns:
        list: ``(page, bbox)`` tuples for tables that have both a bounding box
        and at least one ``table_body`` block.
    """
    # .get avoids a KeyError when the JSON has no 'pdf_info' at all.
    pdf_info = layout_json.get('pdf_info', [])
    tables_loc = []
    for page, page_info in enumerate(pdf_info):
        # .get avoids a KeyError on pages without a 'tables' entry.
        for table in page_info.get('tables') or []:
            try:
                # A table is usable only if it has at least one body block
                # and a non-empty bounding box.
                has_body = any(block['type'] == 'table_body'
                               for block in table['blocks'])
                if not table.get('bbox') or not has_body:
                    continue
                tables_loc.append((page, table['bbox']))
            except (KeyError, TypeError) as e:
                # Malformed table entries are skipped, not fatal; narrowed
                # from bare `except Exception` so real bugs still surface.
                print(e)
    return tables_loc
def extract_tables(path_paper):
    """Extract tables from one paper directory (layout.json + origin.pdf).

    NOTE(review): this snippet appears truncated in the question —
    table_result / table_text are assigned but never collected or returned.
    """
    path_layout = os.path.join(path_paper, "layout.json")
    path_origin = os.path.join(path_paper, "origin.pdf")
    # Load the layout JSON produced by the YOLO-based detection model.
    with open(path_layout, "r", encoding="utf-8") as f:
        layout_json = json.load(f)
    tables_loc = get_tables_loc(layout_json)
    doc = fitz.open(path_origin)
    for page, table_loc in tables_loc:
        rect = fitz.Rect(*table_loc)
        # Try PyMuPDF's structured table detection inside the detected bbox.
        table_finder = doc[page].find_tables(clip=rect)
        if table_finder.tables:
            table_result = table_finder.tables[0].to_pandas()
        else:
            # Fallback: plain text only — row/column structure is lost here.
            table_text = doc[page].get_text("text", clip=rect)
I am conducting research on p-hacking, which requires accurately extracting tables from published academic papers. I have downloaded a large number of PDF files for this purpose.
So far, I have tried several approaches to extract the tables. Unfortunately, Python packages like fitz and camelot are unable to directly extract the tables from these PDFs. As a workaround, I used a YOLO-based layout detection model to identify the locations of each table. While this approach allows me to extract the text content of the tables, I am losing the structural information (e.g., the alignment of rows and columns), and the OCR model I’m using is not always accurate.
Could you suggest better methods or tools to improve the extraction process, ensuring both the content and structure of the tables are preserved?
P.S. I apologize, but due to copyright restrictions, I am unable to upload the PDF files I am working on.
import json
import os
import fitz
def get_tables_loc(layout_json: dict) -> list:
    """Return (page_index, bbox) for each table recorded in the layout JSON."""
    pdf_info = layout_json['pdf_info']
    # Collect the tables of every page up front, keeping only pages that
    # actually contain tables.
    tables_per_page = {}
    for page_no in range(len(pdf_info)):
        page_tables = pdf_info[page_no]['tables']
        if page_tables:
            tables_per_page[page_no] = page_tables
    locations = []
    for page_no, page_tables in tables_per_page.items():
        for entry in page_tables:
            try:
                body_blocks = [b for b in entry['blocks'] if b['type'] == 'table_body']
                # Keep the table only when it has a body and a bounding box.
                if entry['bbox'] and body_blocks:
                    locations.append((page_no, entry['bbox']))
            except Exception as e:
                print(e)
    return locations
def extract_tables(path_paper):
    """Extract tables from one paper directory (layout.json + origin.pdf).

    NOTE(review): this snippet appears truncated in the question —
    table_result / table_text are assigned but never collected or returned.
    """
    path_layout = os.path.join(path_paper, "layout.json")
    path_origin = os.path.join(path_paper, "origin.pdf")
    # Load the layout JSON produced by the YOLO-based detection model.
    with open(path_layout, "r", encoding="utf-8") as f:
        layout_json = json.load(f)
    tables_loc = get_tables_loc(layout_json)
    doc = fitz.open(path_origin)
    for page, table_loc in tables_loc:
        rect = fitz.Rect(*table_loc)
        # Try PyMuPDF's structured table detection inside the detected bbox.
        table_finder = doc[page].find_tables(clip=rect)
        if table_finder.tables:
            table_result = table_finder.tables[0].to_pandas()
        else:
            # Fallback: plain text only — row/column structure is lost here.
            table_text = doc[page].get_text("text", clip=rect)
Share
Improve this question
asked Feb 16 at 12:41
Buoyant XuBuoyant Xu
777 bronze badges
2
- 2 "ensuring both the content and structure of the tables are preserved" - You cannot ensure that in general. You can merely improve the share of correctly extracted content and structure. – mkl Commented Feb 18 at 13:56
- You are correct. Specifically, for the data I am working on, I need to accurately extract regression result information from the empirical analysis of research papers, including regression coefficients, significance levels, and statistics (values of standard errors or t-statistics) for subsequent analysis. Generally, the content of such tables in PDFs can be read horizontally, but the three pieces of information I need are often vertically split across two rows. Therefore, it is necessary to accurately identify the columns in the table. – Buoyant Xu Commented Feb 19 at 2:25
2 Answers
Start by experimenting with libraries like Layout Parser and models such as CascadeTabNet. If you're open to cloud solutions, services like Amazon Textract or Google Document AI might be worth evaluating as well.
For this problem, I tried to implement a simple algorithm, which can accurately extract tables from well-formatted academic papers and meet my needs. Of course, there is still room for optimization, and I look forward to better solutions in the future.
import json
import math
import os
import fitz
import pandas as pd
def get_tables_loc(layout_json: dict) -> list:
    """
    Locate every table recorded in the layout JSON.

    Args:
        layout_json (dict): Layout description holding a 'pdf_info' page list.

    Returns:
        list: (page_index, bbox) tuples for tables that have both a
        'table_body' block and a bounding box.
    """
    pdf_info = layout_json.get('pdf_info', {})
    tables_loc = []
    for page_no in range(len(pdf_info)):
        # Pages without a 'tables' entry simply contribute nothing.
        for entry in pdf_info[page_no].get('tables') or []:
            try:
                # Require at least one body block and a bounding box.
                body = [b for b in entry['blocks'] if b['type'] == 'table_body']
                if entry.get('bbox') and body:
                    tables_loc.append((page_no, entry['bbox']))
            except Exception as e:
                print(f"Error processing table on page {page_no}: {e}")
    return tables_loc
def merge_text_blocks(words):
    """
    Combine horizontally adjacent word boxes into single text blocks.

    Args:
        words (list): Word tuples of the form (x0, y0, x1, y1, text, ...),
            as produced by PyMuPDF's ``get_text("words")``.

    Returns:
        list: ((x0, y0, x1, y1), text) tuples with rounded-out integer
        coordinates, where neighbouring words on one line are merged.
    """
    # Round every box outward to integers: floor the top-left corner,
    # ceil the bottom-right corner.
    boxes = [((math.floor(w[0]), math.floor(w[1]),
               math.ceil(w[2]), math.ceil(w[3])), w[4]) for w in words]
    # Reading order: top-to-bottom first, then left-to-right.
    boxes.sort(key=lambda item: (item[0][1], item[0][0]))

    def mergeable(prev, cur):
        # Same text line: top edges within 5px of each other.
        if abs(cur[0][1] - prev[0][1]) > 5:
            return False
        # Gap between prev's right edge and cur's left edge must be small.
        gap = cur[0][0] - prev[0][2]
        if not (0 <= gap < 15):
            return False
        # Loose horizontal proximity check with a 50px margin.
        return cur[0][0] < prev[0][2] + 50 and cur[0][2] > prev[0][0] - 50

    merged = []
    idx = 0
    total = len(boxes)
    while idx < total:
        # Grow a run of consecutive mergeable boxes starting at idx.
        group = [boxes[idx]]
        scan = idx + 1
        while scan < total and mergeable(group[-1], boxes[scan]):
            group.append(boxes[scan])
            scan += 1
        if len(group) == 1:
            merged.append(group[0])
        else:
            # Union of the member boxes plus space-joined text.
            bbox = (min(g[0][0] for g in group),
                    min(g[0][1] for g in group),
                    max(g[0][2] for g in group),
                    max(g[0][3] for g in group))
            merged.append((bbox, " ".join(g[1] for g in group)))
        idx = scan
    return merged
def determine_rows_and_columns(words_cleaned):
    """
    Derive row and column anchor coordinates from merged text blocks.

    Blocks are clustered into rows by vertical overlap and into columns by
    horizontal overlap (> 50% of the smaller extent in each case).

    Args:
        words_cleaned (list): ((x0, y0, x1, y1), text) tuples.

    Returns:
        tuple: (sorted row y-coordinates, sorted column x-coordinates).
    """
    def overlaps(a_min, a_max, b_min, b_max, threshold=0.5):
        # True when the shared span exceeds half of the smaller extent.
        shared = min(a_max, b_max) - max(a_min, b_min)
        smaller = min(a_max - a_min, b_max - b_min)
        return shared > smaller * threshold

    def v_overlap(b1, b2):
        return overlaps(b1[0][1], b1[0][3], b2[0][1], b2[0][3])

    def h_overlap(b1, b2):
        return overlaps(b1[0][0], b1[0][2], b2[0][0], b2[0][2])

    def cluster(blocks, same_group, sort_key):
        # Greedy single-pass clustering: seed a group with the first
        # remaining block, then sweep the rest, absorbing anything that
        # overlaps any member of the group as it grows.
        groups = []
        pending = blocks.copy()
        while pending:
            group = [pending.pop(0)]
            k = 0
            while k < len(pending):
                if any(same_group(member, pending[k]) for member in group):
                    group.append(pending.pop(k))
                else:
                    k += 1
            groups.append(sorted(group, key=sort_key))
        return groups

    rows = cluster(words_cleaned, v_overlap, lambda b: b[0][0])
    cols = cluster(words_cleaned, h_overlap, lambda b: b[0][1])
    # Each group is anchored at its minimal top / left coordinate.
    row_coords = sorted(min(b[0][1] for b in row) for row in rows)
    col_coords = sorted(min(b[0][0] for b in col) for col in cols)
    return row_coords, col_coords
def assign_text_to_cells(text_blocks, rows, cols):
    """
    Place each text block into a cell of a rows x cols grid.

    Args:
        text_blocks (list): ((x0, y0, x1, y1), text) tuples.
        rows (list): Sorted row anchor y-coordinates.
        cols (list): Sorted column anchor x-coordinates.

    Returns:
        list: 2-D list of cell strings; blocks landing in the same cell
        are joined with spaces.
    """
    grid = [["" for _ in cols] for _ in rows]

    def best_index(anchors, pos, tail_span):
        # Candidate anchors whose band [anchor-5, next_anchor+5] contains
        # pos; the last anchor's band extends tail_span past it. The
        # candidate whose anchor lies closest to pos wins; ties go to the
        # earlier anchor.
        candidates = []
        for idx, anchor in enumerate(anchors):
            upper = anchors[idx + 1] if idx + 1 < len(anchors) else anchor + tail_span
            if anchor - 5 <= pos <= upper + 5:
                candidates.append((abs(anchor - pos), idx))
        return min(candidates)[1] if candidates else None

    for (x0, y0, _x1, _y1), text in text_blocks:
        r = best_index(rows, y0, 20)   # last row band extends 20px down
        c = best_index(cols, x0, 50)   # last column band extends 50px right
        if r is None or c is None:
            continue
        # Concatenate when the cell already holds text.
        grid[r][c] = grid[r][c] + " " + text if grid[r][c] else text
    return grid
def create_table_from_text_blocks(text_blocks):
    """
    Build a pandas DataFrame from positioned text blocks.

    Args:
        text_blocks (list): ((x0, y0, x1, y1), text) tuples.

    Returns:
        pd.DataFrame: Grid of cell texts with generated "Col N" headers.
    """
    # Infer the grid geometry, then drop each block into its cell.
    row_coords, col_coords = determine_rows_and_columns(text_blocks)
    cells = assign_text_to_cells(text_blocks, row_coords, col_coords)
    headers = [f"Col {idx + 1}" for idx in range(len(col_coords))]
    return pd.DataFrame(cells, columns=headers)
def extract_tables(path_paper):
    """
    Extract every detected table from one paper directory.

    The directory must contain ``layout.json`` (table bounding boxes) and
    ``origin.pdf``. PyMuPDF's native table finder is tried first inside
    each bounding box; when it finds nothing, the table is reconstructed
    from raw word positions instead.

    Args:
        path_paper (str): Path to the directory with the PDF and layout JSON.

    Returns:
        list: Dicts with keys "title", "page" and "table" (a DataFrame).
    """
    paper_title = os.path.basename(path_paper)
    # Load the layout description produced by the detection model.
    with open(os.path.join(path_paper, "layout.json"), "r", encoding="utf-8") as f:
        layout_json = json.load(f)
    tables_loc = get_tables_loc(layout_json)

    results = []
    doc = fitz.open(os.path.join(path_paper, "origin.pdf"))
    for page_no, bbox in tables_loc:
        clip = fitz.Rect(*bbox)
        finder = doc[page_no].find_tables(clip=clip)
        if finder.tables:
            # Prefer PyMuPDF's structured extraction when it succeeds.
            df_table = finder.tables[0].to_pandas()
        else:
            # Fall back to word-level reconstruction of the grid.
            word_boxes = merge_text_blocks(doc[page_no].get_text("words", clip=clip))
            df_table = create_table_from_text_blocks(word_boxes)
        results.append({
            "title": paper_title,
            "page": page_no,
            "table": df_table
        })
    return results
if __name__ == '__main__':
    # Placeholder path — point this at a directory containing layout.json
    # and origin.pdf before running.
    path_paper = r"***"
    tables = extract_tables(path_paper)
    print("All tables extracted.")