
How to Extract Data (Including Checkboxes) from PDF Form Using Python (PyPDF2, OpenCV Not Working) - Stack Overflow


I’m trying to extract data from a PDF form that includes text fields, checkboxes and other input elements. The form looks like a structured intake form.

  1. PyMuPDF

     import fitz  # PyMuPDF

     def extract_text_from_pdf(pdf_path):
         all_text = ""
         with fitz.open(pdf_path) as pdf_file:
             for page in pdf_file:
                 all_text += page.get_text()
         return all_text
    

This extracts the raw text, but it gives no information about form fields or checkboxes, and it loses the structure of the form.
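Since the question title mentions PyPDF2: if the PDF still contained live AcroForm fields, I'd expect something like PyPDF2's `get_fields()` to expose them, including checkbox states (`/Yes` vs `/Off`). A minimal sketch of that route, assuming a non-flattened form (`is_checked` is my own helper, not part of the library):

```python
def is_checked(value):
    # A checkbox value of /Off (or no value at all) means unchecked;
    # anything else (/Yes, /On, a custom "on" state) counts as checked.
    return value is not None and str(value) not in ("/Off", "")

def extract_form_fields(pdf_path):
    # Imported here so is_checked stays usable without PyPDF2 installed.
    from PyPDF2 import PdfReader

    reader = PdfReader(pdf_path)
    fields = reader.get_fields() or {}
    result = {}
    for name, field in fields.items():
        value = field.get("/V")
        if field.get("/FT") == "/Btn":  # button fields include checkboxes
            result[name] = is_checked(value)
        else:
            result[name] = value
    return result
```

If `get_fields()` returns `None`, the form has presumably been flattened (drawn as graphics rather than stored as interactive fields), which would explain why this route failed for me.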

  2. OpenCV

     import cv2
     import numpy as np
     import pytesseract
     from pdf2image import convert_from_path

     def convert_pdf_to_images(pdf_path):
         return convert_from_path(pdf_path)

     # Preprocess image for better contour detection
     def preprocess_image(image):
         gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
         _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
         kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
         dilated = cv2.dilate(binary, kernel, iterations=1)
         return dilated
    
     # Detect main and sub boxes and extract text immediately
     def detect_and_extract_text(image, original_image):
         contours, _ = cv2.findContours(image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
         extracted_data = []
    
         # print("\nExtracting text from bounding boxes:\n" + "-"*50)
         for contour in contours:
             x, y, w, h = cv2.boundingRect(contour)
             # Filter small boxes
             if w > 50 and 50 < h < 1000:
                 # Draw the box on the original image
                 cv2.rectangle(original_image, (x, y), (x + w, y + h), (0, 255, 0), 2)
    
                 # Extract the region of interest (ROI)
                 roi = original_image[y:y + h, x:x + w]
    
                 # Extract text using Tesseract
                 text = pytesseract.image_to_string(roi, config="--psm 6").strip()
    
                 # Append data to the list
                 extracted_data.append({"text": text, "bbox": (x, y, w, h)})
    
                 # Print the text for this bounding box
                 # print(f"Text: {text}")
                 # print(f"Bounding Box: {x, y, w, h}")
                 # print("-" * 50)
    
         return extracted_data, original_image
    
     # Global OCR for remaining text excluding bounding box areas
     def extract_remaining_text_excluding_boxes(image, bounding_boxes):
         # Create a mask to exclude bounding box areas
         mask = np.ones(image.shape[:2], dtype="uint8") * 255  # Start with a white mask
         for box in bounding_boxes:
             x, y, w, h = box
             cv2.rectangle(mask, (x, y), (x + w, y + h), 0, -1)  # Black out bounding box area
    
         # Apply the mask to the image
         masked_image = cv2.bitwise_and(image, image, mask=mask)
    
         # Convert the masked image to grayscale
         gray_image = cv2.cvtColor(masked_image, cv2.COLOR_BGR2GRAY)
    
         # Perform OCR on the masked image
         text = pytesseract.image_to_string(gray_image, config="--psm 6").strip()
    
         # print("\nExtracting text from the remaining areas (global OCR):\n" + "-"*50)
         # print(f"Remaining Text:\n{text}")
         # print("-" * 50)
    
         return text
    
     # Main execution function
     def process_pdf_and_detect_boxes(pdf_path, output_image_path):
         # Step 1: Convert PDF to images
         pages = convert_pdf_to_images(pdf_path)
         all_extracted_data = []
    
         for i, page in enumerate(pages):
             # print(f"\nProcessing page {i + 1}...\n" + "="*50)
    
             # Convert PIL Image to OpenCV format
             original_image = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
    
             # Step 2: Preprocess the image
             preprocessed_image = preprocess_image(original_image)
    
             # Step 3: Detect and extract text box by box
             extracted_data, image_with_boxes = detect_and_extract_text(preprocessed_image, original_image)
             all_extracted_data.extend(extracted_data)
    
             # Collect bounding boxes for exclusion in global OCR
             bounding_boxes = [entry["bbox"] for entry in extracted_data]
    
             # Step 4: Perform global OCR to extract remaining text
             remaining_text = extract_remaining_text_excluding_boxes(original_image, bounding_boxes)
             if remaining_text:
                 all_extracted_data.append({"text": remaining_text, "bbox": None})
    
              # Step 5: Save image with bounding boxes
              cv2.imwrite(f"{output_image_path}_page_{i + 1}.png", image_with_boxes)
              # print(f"Saved image with boxes: {output_image_path}_page_{i + 1}.png")

          return all_extracted_data

This works well for extracting text from images, but it cannot reliably detect checkboxes, and it fails to keep field-value pairs together.
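One heuristic I'm considering for the checkbox problem: once a small, roughly square contour is found, classify it as checked or unchecked by the fraction of "ink" pixels inside it. A dependency-free sketch over a binary image stored as nested lists (1 = ink); the 0.15 threshold and 2-pixel margin are guesses I'd still need to tune:

```python
def fill_ratio(binary, x, y, w, h, margin=2):
    # Fraction of ink pixels inside the box, ignoring a small margin
    # so the border strokes of the checkbox itself are not counted.
    x0, y0 = x + margin, y + margin
    x1, y1 = x + w - margin, y + h - margin
    if x1 <= x0 or y1 <= y0:
        return 0.0
    inked = sum(binary[r][c] for r in range(y0, y1) for c in range(x0, x1))
    return inked / ((x1 - x0) * (y1 - y0))

def classify_checkbox(binary, bbox, threshold=0.15):
    # True if the interior of the box contains enough ink to look checked.
    x, y, w, h = bbox
    return fill_ratio(binary, x, y, w, h) >= threshold
```

The same idea would apply to the binarized OpenCV image above by indexing the NumPy array instead of nested lists; whether the threshold separates ticked from empty boxes on my scans is exactly what I'm unsure about.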

  3. pdf2docx

This lost the form structure and the checkboxes, and the resulting data was difficult to parse.

So, how can I extract form field values and checkbox states from a PDF using Python? Are there any libraries or techniques that handle checkboxes and structured form data effectively?
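To make the goal concrete, this is roughly the output shape I'm after (the field names and values below are invented examples, not taken from the actual PDF):

```python
# Hypothetical target structure - field names/values are illustrative only.
expected_output = {
    "patient_name": "Jane Doe",  # text field -> string
    "consent_given": True,       # checkbox  -> boolean state
    "smoker": False,             # checkbox  -> boolean state
}

for field, value in expected_output.items():
    print(f"{field}: {value}")
```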

I've also attached the PDF for reference. PDF: PDF_flile
