Source from repo

PDF Processing Guide

Read, create, merge, split, watermark, encrypt, OCR, and fill PDF files using Python and CLI tools

anthropicsGitHub anthropicsOfficialSource repo Original GitHub link Publisher page

Files

Skill

n/a

Size

57.3 KB

Entrypoint

SKILL.md

Format

git-repo

Open file

scripts/extract_form_structure.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code116 linesFree

scripts/extract_form_structure.py

1"""
2Extract form structure from a non-fillable PDF.
3 
4This script analyzes the PDF to find:
5- Text labels with their exact coordinates
6- Horizontal lines (row boundaries)
7- Checkboxes (small rectangles)
8 
9Output: A JSON file with the form structure that can be used to generate
10accurate field coordinates for filling.
11 
12Usage: python extract_form_structure.py <input.pdf> <output.json>
13"""
14 
15import json
16import sys
17import pdfplumber
18 
19 
def extract_form_structure(pdf_path):
    """Collect layout landmarks from every page of the PDF at ``pdf_path``.

    Returns a dict holding five lists:

    - ``pages``: 1-based page number plus page width and height as floats.
    - ``labels``: one entry per word returned by ``page.extract_words()``,
      with its text and bounding box rounded to one decimal place.
    - ``lines``: entries of ``page.lines`` whose x-extent exceeds half the
      page width, stored as ``y`` (the line's ``top``), ``x0`` and ``x1``.
    - ``checkboxes``: entries of ``page.rects`` measuring 5 to 15 units on
      each side with width and height differing by less than 2, stored with
      their bounding box and center point.
    - ``row_boundaries``: per page, every pair of consecutive distinct line
      ``y`` values together with the gap between them (``row_height``).
    """

    def tenth(value):
        # All coordinates in the output are floats rounded to one decimal.
        return round(float(value), 1)

    pages, labels, lines, checkboxes, row_boundaries = [], [], [], [], []

    with pdfplumber.open(pdf_path) as document:
        for number, page in enumerate(document.pages, start=1):
            pages.append({
                "page_number": number,
                "width": float(page.width),
                "height": float(page.height),
            })

            labels.extend(
                {
                    "page": number,
                    "text": token["text"],
                    "x0": tenth(token["x0"]),
                    "top": tenth(token["top"]),
                    "x1": tenth(token["x1"]),
                    "bottom": tenth(token["bottom"]),
                }
                for token in page.extract_words()
            )

            # Keep only lines spanning more than half of the page width.
            lines.extend(
                {
                    "page": number,
                    "y": tenth(rule["top"]),
                    "x0": tenth(rule["x0"]),
                    "x1": tenth(rule["x1"]),
                }
                for rule in page.lines
                if abs(float(rule["x1"]) - float(rule["x0"])) > page.width * 0.5
            )

            for box in page.rects:
                left, right = float(box["x0"]), float(box["x1"])
                upper, lower = float(box["top"]), float(box["bottom"])
                side_x = right - left
                side_y = lower - upper
                # A checkbox candidate is a small, roughly square rectangle.
                if not (
                    5 <= side_x <= 15
                    and 5 <= side_y <= 15
                    and abs(side_x - side_y) < 2
                ):
                    continue
                checkboxes.append({
                    "page": number,
                    "x0": tenth(left),
                    "top": tenth(upper),
                    "x1": tenth(right),
                    "bottom": tenth(lower),
                    "center_x": tenth((left + right) / 2),
                    "center_y": tenth((upper + lower) / 2),
                })

    # Group the recorded line positions by page, in first-seen page order.
    ys_per_page = {}
    for entry in lines:
        ys_per_page.setdefault(entry["page"], []).append(entry["y"])

    # Each pair of neighbouring distinct y values delimits one row.
    for number, ys in ys_per_page.items():
        ordered = sorted(set(ys))
        row_boundaries.extend(
            {
                "page": number,
                "row_top": upper,
                "row_bottom": lower,
                "row_height": round(lower - upper, 1),
            }
            for upper, lower in zip(ordered, ordered[1:])
        )

    return {
        "pages": pages,
        "labels": labels,
        "lines": lines,
        "checkboxes": checkboxes,
        "row_boundaries": row_boundaries,
    }
89 
90 
def main():
    """Command-line entry point: extract the structure and save it as JSON.

    Expects exactly two command-line arguments, the input PDF path and the
    output JSON path. With any other argument count it prints a usage line
    and exits with status 1. Otherwise it writes the extracted structure to
    the output path (indented by 2) and prints a count for each category.
    """
    arguments = sys.argv[1:]
    if len(arguments) != 2:
        print("Usage: extract_form_structure.py <input.pdf> <output.json>")
        sys.exit(1)

    source_pdf, target_json = arguments

    print(f"Extracting structure from {source_pdf}...")
    result = extract_form_structure(source_pdf)

    with open(target_json, "w") as handle:
        json.dump(result, handle, indent=2)

    # (structure key, wording used in the summary line)
    summary_rows = (
        ("pages", "pages"),
        ("labels", "text labels"),
        ("lines", "horizontal lines"),
        ("checkboxes", "checkboxes"),
        ("row_boundaries", "row boundaries"),
    )
    print("Found:")
    for key, wording in summary_rows:
        print(f"  - {len(result[key])} {wording}")
    print(f"Saved to {target_json}")
112 
113 
# Run the command-line entry point only when this file is executed directly.
if __name__ == "__main__":
    main()
116

Marketplace

Source from repo

PDF Processing Guide

Read, create, merge, split, watermark, encrypt, OCR, and fill PDF files using Python and CLI tools

anthropicsGitHub anthropicsOfficialSource repo Original GitHub link Publisher page

Files

Skill

n/a

Size

57.3 KB

Entrypoint

SKILL.md

Format

git-repo

Open file

scripts/extract_form_structure.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code116 linesFree

scripts/extract_form_structure.py

1"""
2Extract form structure from a non-fillable PDF.
3 
4This script analyzes the PDF to find:
5- Text labels with their exact coordinates
6- Horizontal lines (row boundaries)
7- Checkboxes (small rectangles)
8 
9Output: A JSON file with the form structure that can be used to generate
10accurate field coordinates for filling.
11 
12Usage: python extract_form_structure.py <input.pdf> <output.json>
13"""
14 
15import json
16import sys
17import pdfplumber
18 
19 
def extract_form_structure(pdf_path):
    """Collect layout landmarks from every page of the PDF at ``pdf_path``.

    Returns a dict holding five lists:

    - ``pages``: 1-based page number plus page width and height as floats.
    - ``labels``: one entry per word returned by ``page.extract_words()``,
      with its text and bounding box rounded to one decimal place.
    - ``lines``: entries of ``page.lines`` whose x-extent exceeds half the
      page width, stored as ``y`` (the line's ``top``), ``x0`` and ``x1``.
    - ``checkboxes``: entries of ``page.rects`` measuring 5 to 15 units on
      each side with width and height differing by less than 2, stored with
      their bounding box and center point.
    - ``row_boundaries``: per page, every pair of consecutive distinct line
      ``y`` values together with the gap between them (``row_height``).
    """

    def tenth(value):
        # All coordinates in the output are floats rounded to one decimal.
        return round(float(value), 1)

    pages, labels, lines, checkboxes, row_boundaries = [], [], [], [], []

    with pdfplumber.open(pdf_path) as document:
        for number, page in enumerate(document.pages, start=1):
            pages.append({
                "page_number": number,
                "width": float(page.width),
                "height": float(page.height),
            })

            labels.extend(
                {
                    "page": number,
                    "text": token["text"],
                    "x0": tenth(token["x0"]),
                    "top": tenth(token["top"]),
                    "x1": tenth(token["x1"]),
                    "bottom": tenth(token["bottom"]),
                }
                for token in page.extract_words()
            )

            # Keep only lines spanning more than half of the page width.
            lines.extend(
                {
                    "page": number,
                    "y": tenth(rule["top"]),
                    "x0": tenth(rule["x0"]),
                    "x1": tenth(rule["x1"]),
                }
                for rule in page.lines
                if abs(float(rule["x1"]) - float(rule["x0"])) > page.width * 0.5
            )

            for box in page.rects:
                left, right = float(box["x0"]), float(box["x1"])
                upper, lower = float(box["top"]), float(box["bottom"])
                side_x = right - left
                side_y = lower - upper
                # A checkbox candidate is a small, roughly square rectangle.
                if not (
                    5 <= side_x <= 15
                    and 5 <= side_y <= 15
                    and abs(side_x - side_y) < 2
                ):
                    continue
                checkboxes.append({
                    "page": number,
                    "x0": tenth(left),
                    "top": tenth(upper),
                    "x1": tenth(right),
                    "bottom": tenth(lower),
                    "center_x": tenth((left + right) / 2),
                    "center_y": tenth((upper + lower) / 2),
                })

    # Group the recorded line positions by page, in first-seen page order.
    ys_per_page = {}
    for entry in lines:
        ys_per_page.setdefault(entry["page"], []).append(entry["y"])

    # Each pair of neighbouring distinct y values delimits one row.
    for number, ys in ys_per_page.items():
        ordered = sorted(set(ys))
        row_boundaries.extend(
            {
                "page": number,
                "row_top": upper,
                "row_bottom": lower,
                "row_height": round(lower - upper, 1),
            }
            for upper, lower in zip(ordered, ordered[1:])
        )

    return {
        "pages": pages,
        "labels": labels,
        "lines": lines,
        "checkboxes": checkboxes,
        "row_boundaries": row_boundaries,
    }
89 
90 
def main():
    """Command-line entry point: extract the structure and save it as JSON.

    Expects exactly two command-line arguments, the input PDF path and the
    output JSON path. With any other argument count it prints a usage line
    and exits with status 1. Otherwise it writes the extracted structure to
    the output path (indented by 2) and prints a count for each category.
    """
    arguments = sys.argv[1:]
    if len(arguments) != 2:
        print("Usage: extract_form_structure.py <input.pdf> <output.json>")
        sys.exit(1)

    source_pdf, target_json = arguments

    print(f"Extracting structure from {source_pdf}...")
    result = extract_form_structure(source_pdf)

    with open(target_json, "w") as handle:
        json.dump(result, handle, indent=2)

    # (structure key, wording used in the summary line)
    summary_rows = (
        ("pages", "pages"),
        ("labels", "text labels"),
        ("lines", "horizontal lines"),
        ("checkboxes", "checkboxes"),
        ("row_boundaries", "row boundaries"),
    )
    print("Found:")
    for key, wording in summary_rows:
        print(f"  - {len(result[key])} {wording}")
    print(f"Saved to {target_json}")
112 
113 
# Run the command-line entry point only when this file is executed directly.
if __name__ == "__main__":
    main()
116

PDF Processing Guide

scripts/extract_form_structure.py

Preparing the source view

PDF Processing Guide

scripts/extract_form_structure.py