I’m trying to extract data from a PDF form that includes text fields, checkboxes and other input elements. The form looks like a structured intake form.
PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in a PDF, concatenated in page order.

    Only the text layer is read via ``page.get_text()``; interactive form
    widgets (text fields, checkboxes) are not inspected here.

    Args:
        pdf_path: Path of the PDF to open with ``fitz.open``.

    Returns:
        A single string holding the text of all pages, in order.
    """
    with fitz.open(pdf_path) as pdf_file:
        # Iterate the document directly and join once, instead of indexing
        # by page number and growing a string with repeated ``+=``.
        return "".join(page.get_text() for page in pdf_file)
This extracts the raw text, but it provides no information about form fields or checkboxes, and it also loses the structure of the form.
OpenCV
def convert_pdf_to_images(pdf_path):
    """Render a PDF to page images by delegating to ``convert_from_path``.

    Args:
        pdf_path: Path of the PDF to render.

    Returns:
        Whatever ``convert_from_path`` returns; the caller below treats it
        as an iterable of PIL images, one per page.
    """
    return convert_from_path(pdf_path)


def preprocess_image(image):
    """Binarize and dilate a page image for better contour detection.

    Args:
        image: Page image in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2GRAY``).

    Returns:
        A single-channel image: inverse-thresholded at 150 and then dilated
        once with a 5x5 rectangular kernel.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Inverse threshold: pixels at or below 150 become 255 (foreground),
    # brighter pixels become 0.
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    return cv2.dilate(binary, kernel, iterations=1)


def detect_and_extract_text(
    image,
    original_image,
    *,
    min_width=50,
    min_height=50,
    max_height=1000,
):
    """Detect main and sub boxes and OCR the content of each one.

    Contours are found on the preprocessed ``image``; every contour whose
    bounding rectangle passes the size filter is cropped from
    ``original_image`` and passed to Tesseract.

    The box outlines are drawn on a *copy* of ``original_image``. Drawing on
    the original before cropping would put the green outlines into the OCR
    input of that box, of any overlapping box processed later, and of any
    later OCR pass over the same array.

    Args:
        image: Preprocessed single-channel image used for contour detection.
        original_image: Page image the regions are cropped from. It is not
            modified.
        min_width: A box is kept only if its width is strictly greater.
        min_height: A box is kept only if its height is strictly greater.
        max_height: A box is kept only if its height is strictly smaller.

    Returns:
        A tuple ``(extracted_data, annotated_image)``. ``extracted_data`` is
        a list of ``{"text": str, "bbox": (x, y, w, h)}`` dicts in contour
        order; ``annotated_image`` is the copy with the kept boxes drawn.
    """
    # NOTE(review): two-value unpacking matches the OpenCV 4.x return shape
    # of findContours - confirm the installed version.
    contours, _ = cv2.findContours(image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    annotated_image = original_image.copy()
    extracted_data = []

    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)

        # Filter out boxes that are too small (or implausibly tall).
        if not (w > min_width and min_height < h < max_height):
            continue

        # Crop from the untouched page so no drawn outline reaches the OCR.
        roi = original_image[y:y + h, x:x + w]
        text = pytesseract.image_to_string(roi, config="--psm 6").strip()
        extracted_data.append({"text": text, "bbox": (x, y, w, h)})

        cv2.rectangle(annotated_image, (x, y), (x + w, y + h), (0, 255, 0), 2)

    return extracted_data, annotated_image


def extract_remaining_text_excluding_boxes(image, bounding_boxes):
    """Run a global OCR pass over the page with the given boxes masked out.

    Args:
        image: Page image in BGR channel order.
        bounding_boxes: Iterable of ``(x, y, w, h)`` rectangles to exclude.

    Returns:
        The stripped text Tesseract reports for the masked page.
    """
    # Start with an all-white (255) mask and black out every bounding box.
    mask = np.ones(image.shape[:2], dtype="uint8") * 255
    for x, y, w, h in bounding_boxes:
        cv2.rectangle(mask, (x, y), (x + w, y + h), 0, -1)

    # NOTE(review): masked regions come out black, not white - confirm this
    # does not add noise to the Tesseract output on light-background pages.
    masked_image = cv2.bitwise_and(image, image, mask=mask)
    gray_image = cv2.cvtColor(masked_image, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(gray_image, config="--psm 6").strip()


def process_pdf_and_detect_boxes(pdf_path, output_image_path):
    """Run box-wise OCR plus a global OCR pass on every page of a PDF.

    For each page an annotated image is written to
    ``f"{output_image_path}_page_{n}.png"`` with ``n`` starting at 1.

    Args:
        pdf_path: Path of the PDF to process.
        output_image_path: Filename prefix for the annotated page images.

    Returns:
        A list of ``{"text": ..., "bbox": ...}`` dicts for all pages.
        Box entries carry an ``(x, y, w, h)`` tuple; the per-page entry for
        text outside the boxes has ``"bbox": None`` and is added only when
        that text is non-empty.
    """
    # Step 1: Convert PDF to images.
    pages = convert_pdf_to_images(pdf_path)
    all_extracted_data = []

    for page_number, page in enumerate(pages, start=1):
        # Convert PIL image (RGB) to OpenCV format (BGR).
        original_image = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)

        # Step 2: Preprocess the image.
        preprocessed_image = preprocess_image(original_image)

        # Step 3: Detect and extract text box by box.
        extracted_data, image_with_boxes = detect_and_extract_text(
            preprocessed_image, original_image
        )
        all_extracted_data.extend(extracted_data)

        # Step 4: Global OCR for the text outside the detected boxes.
        bounding_boxes = [entry["bbox"] for entry in extracted_data]
        remaining_text = extract_remaining_text_excluding_boxes(
            original_image, bounding_boxes
        )
        if remaining_text:
            all_extracted_data.append({"text": remaining_text, "bbox": None})

        # Step 5: Save image with bounding boxes.
        cv2.imwrite(f"{output_image_path}_page_{page_number}.png", image_with_boxes)

    # Hand the collected results back to the caller; previously they were
    # built up and then discarded.
    return all_extracted_data
This works well for extracting text from images, but it is unable to detect checkboxes accurately and it fails to maintain field-value pairs.
- pdf2docx: this lost the form structure and the checkboxes, and the resulting data was difficult to parse.
So, how can I extract form field values and checkbox states from a PDF using Python? Are there any libraries or techniques that handle checkboxes and structured form data effectively?
I have also attached the PDF for your reference. PDF: PDF_file