I'm building a model that can adaptively interact with webpages. Right now I'm creating a basic pipeline that attempts to log in to a randomly generated HTML page and then documents the results, so I can build a dataset to train the actual model. My code uses OpenCV to group text regions and TrOCR to read them.
Here is what the login page looks like, with the only variation being the text labels:
The model then uses very basic fuzzy matching to sort the detected text into three categories: username, password, and button. The username and password text is detected pretty reliably, but OCR has a lot of trouble with the button text: it duplicates it. For example, if the button says "enter", the output is "enter enter". My guess was that low contrast was the cause, so I used OpenCV to detect low-contrast regions and invert them, but that still isn't working well. I also tried shrinking the bounding boxes in case they were overlapping, but that threw off the entire pipeline. I'm a little lost here.
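The fuzzy-matching step itself isn't shown in the code below; it's roughly along these lines (a simplified sketch, not my exact code; the keyword lists and threshold are placeholders):

from difflib import SequenceMatcher

# Rough sketch of the categorization step (keyword lists/threshold are placeholders)
CATEGORY_KEYWORDS = {
    "username": ["username", "user", "email", "login id"],
    "password": ["password", "pass", "passcode"],
    "button": ["enter", "submit", "login", "sign in"],
}

def categorize(text, threshold=0.6):
    # Pick the category whose keyword best matches the OCR'd text
    text = text.lower().strip()
    best_category, best_score = None, 0.0
    for category, keywords in CATEGORY_KEYWORDS.items():
        for kw in keywords:
            score = SequenceMatcher(None, text, kw).ratio()
            if score > best_score:
                best_category, best_score = category, score
    return best_category if best_score >= threshold else None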
My relevant functions:
import cv2
import numpy as np
import torch
from PIL import ImageDraw, ImageOps

def merge_contours(contours):
    # Group contour bounding boxes that sit on the same line into one text region
    boxes = [cv2.boundingRect(c) for c in contours]
    boxes.sort(key=lambda b: (b[1], b[0]))  # top-to-bottom, then left-to-right
    merged = []
    used = [False] * len(boxes)
    for i in range(len(boxes)):
        if used[i]:
            continue
        x1, y1, w1, h1 = boxes[i]
        x2, y2 = x1 + w1, y1 + h1
        group = [boxes[i]]
        used[i] = True
        for j in range(i + 1, len(boxes)):
            xj, yj, wj, hj = boxes[j]
            if used[j]:
                continue
            # Same baseline (within 15 px), similar height, horizontal gap under 60 px
            if abs(yj - y1) < 15 and abs(hj - h1) < 10 and 0 < xj - x2 < 60:
                group.append(boxes[j])
                x2 = max(x2, xj + wj)
                used[j] = True
        merged.append(group)
    return merged
def extract_text_with_trocr(image):
    # Detect text regions with OpenCV, then read each region with TrOCR
    ocr_lines = []
    image_np = np.array(image)
    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    # Inverted Otsu threshold so dark text becomes white blobs for contour detection
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    groups = merge_contours(contours)
    draw = ImageDraw.Draw(image)
    line_id = 0
    for group in groups:
        # Union bounding box of all boxes in the group
        x1 = min([b[0] for b in group])
        y1 = min([b[1] for b in group])
        x2 = max([b[0] + b[2] for b in group])
        y2 = max([b[1] + b[3] for b in group])
        roi = image.crop((x1, y1, x2, y2)).convert("RGB")
        if is_low_contrast(roi):  # helper defined elsewhere
            roi = ImageOps.invert(roi)
            print("Inverted low contrast region")
        roi_resized = roi.resize((384, 384))
        pixel_values = processor(images=roi_resized, return_tensors="pt").pixel_values
        with torch.no_grad():
            generated_ids = model.generate(pixel_values)
        output_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        print(f"Detected [{output_text}] at ({x1},{y1},{x2 - x1},{y2 - y1})")
        # Keep the line unless it's empty or just dots, then label it on the debug image
        if output_text and not all(c == '.' for c in output_text):
            ocr_lines.append(output_text)
            draw.rectangle([(x1, y1), (x2, y2)], outline="red", width=2)
            draw.text((x1, y1 - 10), f"line_{line_id}", fill="red")
            line_id += 1
    image.save("debug_labeled.png")
    return ocr_lines
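For context, processor and model are the standard Hugging Face TrOCR objects and is_low_contrast is a small helper of mine; the setup is roughly this (the checkpoint name and contrast threshold shown here are typical values, not necessarily exactly what I run):

from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Typical TrOCR setup; the checkpoint name here is illustrative
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
model.eval()

def is_low_contrast(pil_img, threshold=0.35):
    # Crude contrast check: intensity range of the grayscale crop below a cutoff
    gray = np.array(pil_img.convert("L"), dtype=np.float32) / 255.0
    return float(gray.max() - gray.min()) < threshold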