Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Read, create, merge, split, watermark, encrypt, OCR, and fill PDF files using Python and CLI tools
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/extract_form_structure.py
"""
Extract form structure from a non-fillable PDF.

This script analyzes the PDF to find:
- Text labels with their exact coordinates
- Horizontal lines (row boundaries)
- Checkboxes (small rectangles)

Output: A JSON file with the form structure that can be used to generate
accurate field coordinates for filling.

Usage: python extract_form_structure.py <input.pdf> <output.json>
"""

import json
import sys

import pdfplumber

# A line is kept only when its horizontal extent exceeds this fraction of
# the page width.
_MIN_LINE_WIDTH_RATIO = 0.5

# A rectangle is reported as a checkbox when both sides fall inside
# [_CHECKBOX_MIN_SIDE, _CHECKBOX_MAX_SIDE] and the sides differ by less than
# _CHECKBOX_MAX_SIDE_DIFF (i.e. it is small and roughly square).
_CHECKBOX_MIN_SIDE = 5
_CHECKBOX_MAX_SIDE = 15
_CHECKBOX_MAX_SIDE_DIFF = 2


def _bbox(obj):
    """Return the ``x0``/``top``/``x1``/``bottom`` of *obj* rounded to 0.1."""
    return {
        "x0": round(float(obj["x0"]), 1),
        "top": round(float(obj["top"]), 1),
        "x1": round(float(obj["x1"]), 1),
        "bottom": round(float(obj["bottom"]), 1),
    }


def _is_checkbox(rect):
    """Return True when *rect* is small and roughly square."""
    width = float(rect["x1"]) - float(rect["x0"])
    height = float(rect["bottom"]) - float(rect["top"])
    return (
        _CHECKBOX_MIN_SIDE <= width <= _CHECKBOX_MAX_SIDE
        and _CHECKBOX_MIN_SIDE <= height <= _CHECKBOX_MAX_SIDE
        and abs(width - height) < _CHECKBOX_MAX_SIDE_DIFF
    )


def _row_boundaries(lines):
    """Build row records from consecutive distinct line ``y`` values per page.

    *lines* is the list of line records collected by
    ``extract_form_structure`` (dicts with ``page`` and ``y`` keys).
    """
    ys_by_page = {}
    for line in lines:
        ys_by_page.setdefault(line["page"], []).append(line["y"])

    rows = []
    for page, ys in ys_by_page.items():
        # Duplicate y values would otherwise produce zero-height rows.
        ys = sorted(set(ys))
        for row_top, row_bottom in zip(ys, ys[1:]):
            rows.append({
                "page": page,
                "row_top": row_top,
                "row_bottom": row_bottom,
                "row_height": round(row_bottom - row_top, 1),
            })
    return rows


def extract_form_structure(pdf_path):
    """Collect labels, wide lines, checkboxes and row boundaries from a PDF.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A JSON-serializable dict with these keys:

        - ``pages``: page number (1-based), width and height of every page.
        - ``labels``: one entry per word from ``page.extract_words()`` with
          its text and bounding box.
        - ``lines``: lines whose x-extent exceeds half the page width, with
          their ``y`` (the line's ``top``) and x-range.
        - ``checkboxes``: small, roughly square rectangles with bounding box
          and center point.
        - ``row_boundaries``: per page, each pair of consecutive distinct
          ``y`` values from ``lines``, with the row height.

        All coordinates are rounded to one decimal place.
    """
    structure = {
        "pages": [],
        "labels": [],
        "lines": [],
        "checkboxes": [],
        "row_boundaries": [],
    }

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            # Cast once: some pdfplumber releases return Decimal coordinates,
            # and Decimal cannot be multiplied by a float.
            page_width = float(page.width)
            structure["pages"].append({
                "page_number": page_num,
                "width": page_width,
                "height": float(page.height),
            })

            for word in page.extract_words():
                structure["labels"].append({
                    "page": page_num,
                    "text": word["text"],
                    **_bbox(word),
                })

            min_line_width = page_width * _MIN_LINE_WIDTH_RATIO
            for line in page.lines:
                x0 = float(line["x0"])
                x1 = float(line["x1"])
                if abs(x1 - x0) > min_line_width:
                    structure["lines"].append({
                        "page": page_num,
                        "y": round(float(line["top"]), 1),
                        "x0": round(x0, 1),
                        "x1": round(x1, 1),
                    })

            for rect in page.rects:
                if not _is_checkbox(rect):
                    continue
                center_x = (float(rect["x0"]) + float(rect["x1"])) / 2
                center_y = (float(rect["top"]) + float(rect["bottom"])) / 2
                structure["checkboxes"].append({
                    "page": page_num,
                    **_bbox(rect),
                    "center_x": round(center_x, 1),
                    "center_y": round(center_y, 1),
                })

    structure["row_boundaries"] = _row_boundaries(structure["lines"])
    return structure


def main():
    """Command-line entry point: extract the structure and write it as JSON.

    Expects exactly two arguments (input PDF path, output JSON path); prints
    a usage message and exits with status 1 otherwise.
    """
    if len(sys.argv) != 3:
        print("Usage: extract_form_structure.py <input.pdf> <output.json>")
        sys.exit(1)

    pdf_path = sys.argv[1]
    output_path = sys.argv[2]

    print(f"Extracting structure from {pdf_path}...")
    structure = extract_form_structure(pdf_path)

    # Explicit encoding keeps the output independent of the locale default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(structure, f, indent=2)

    print("Found:")
    print(f" - {len(structure['pages'])} pages")
    print(f" - {len(structure['labels'])} text labels")
    print(f" - {len(structure['lines'])} horizontal lines")
    print(f" - {len(structure['checkboxes'])} checkboxes")
    print(f" - {len(structure['row_boundaries'])} row boundaries")
    print(f"Saved to {output_path}")


if __name__ == "__main__":
    main()