Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Read, create, merge, split, watermark, encrypt, OCR, and fill PDF files using Python and CLI tools
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/extract_form_field_info.py
"""Extract fillable form-field metadata from a PDF and write it as JSON.

Usage: extract_form_field_info.py [input pdf] [output json]
"""

import json
import sys

from pypdf import PdfReader


def get_full_annotation_field_id(annotation):
    """Return the dotted, fully qualified field name for an annotation.

    Walks up the ``/Parent`` chain, collecting every ``/T`` entry found on
    the way, and joins them root-first with ``.``.

    Args:
        annotation: A mapping-like object supporting ``.get()``.

    Returns:
        The joined name, or ``None`` when no ``/T`` entry exists anywhere in
        the chain.
    """
    components = []
    while annotation:
        field_name = annotation.get('/T')
        if field_name:
            components.append(field_name)
        annotation = annotation.get('/Parent')
    # Names were collected leaf-first; reverse so the root comes first.
    return ".".join(reversed(components)) if components else None


def _make_choice_option(state):
    """Convert one choice-field state entry into a value/text dict."""
    # A state is either a bare string (export value and display text are the
    # same) or a pair of [export value, display text]. Indexing a bare string
    # would wrongly yield its first two characters.
    if isinstance(state, str):
        return {"value": state, "text": state}
    return {"value": state[0], "text": state[1]}


def make_field_dict(field, field_id):
    """Build the JSON-serializable description of a single form field.

    Args:
        field: Mapping-like field object; ``/FT`` and ``/_States_`` are read.
            NOTE(review): ``/_States_`` is presumably the synthetic entry
            pypdf adds in ``get_fields()`` -- confirm against pypdf.
        field_id: Fully qualified field name, stored under ``"field_id"``.

    Returns:
        A dict with ``"field_id"`` and ``"type"``, plus
        ``"checked_value"``/``"unchecked_value"`` for two-state checkboxes
        or ``"choice_options"`` for choice fields.
    """
    field_dict = {"field_id": field_id}
    ft = field.get('/FT')
    if ft == "/Tx":
        field_dict["type"] = "text"
    elif ft == "/Btn":
        field_dict["type"] = "checkbox"
        states = field.get("/_States_", [])
        # Checked/unchecked values are only derived for exactly two states.
        if len(states) == 2:
            if "/Off" in states:
                # The checked value is whichever state is not "/Off".
                field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
                field_dict["unchecked_value"] = "/Off"
            else:
                print(f"Unexpected state values for checkbox `{field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.")
                field_dict["checked_value"] = states[0]
                field_dict["unchecked_value"] = states[1]
    elif ft == "/Ch":
        field_dict["type"] = "choice"
        states = field.get("/_States_", [])
        field_dict["choice_options"] = [_make_choice_option(state) for state in states]
    else:
        field_dict["type"] = f"unknown ({ft})"
    return field_dict


def get_field_info(reader: PdfReader):
    """Collect every locatable form field of a PDF, sorted in page order.

    Args:
        reader: An open ``PdfReader``.

    Returns:
        A list of field dicts (see ``make_field_dict``) extended with
        ``"page"`` (1-based) and ``"rect"``, plus one ``"radio_group"`` dict
        per detected radio group. The list is sorted by page, then by
        descending ``rect[1]``, then by ascending ``rect[0]``. Fields with
        no matching page annotation are reported on stdout and omitted.
    """
    # get_fields() is documented to return None when the PDF has no form
    # data; treat that as "no fields" instead of crashing on .items().
    fields = reader.get_fields() or {}

    field_info_by_id = {}
    possible_radio_names = set()

    for field_id, field in fields.items():
        # Container fields (with /Kids) get no entry of their own; button
        # containers are remembered as radio-group candidates.
        if field.get("/Kids"):
            if field.get("/FT") == "/Btn":
                possible_radio_names.add(field_id)
            continue
        field_info_by_id[field_id] = make_field_dict(field, field_id)

    radio_fields_by_id = {}

    for page_index, page in enumerate(reader.pages):
        annotations = page.get('/Annots', [])
        for ann in annotations:
            field_id = get_full_annotation_field_id(ann)
            if field_id in field_info_by_id:
                field_info_by_id[field_id]["page"] = page_index + 1
                field_info_by_id[field_id]["rect"] = ann.get('/Rect')
            elif field_id in possible_radio_names:
                try:
                    on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
                except KeyError:
                    # No /AP or /N entry: cannot determine this option.
                    continue
                # Only annotations with exactly one non-"/Off" appearance
                # state are recorded as radio options.
                if len(on_values) == 1:
                    rect = ann.get("/Rect")
                    if field_id not in radio_fields_by_id:
                        # The group's page is that of its first option seen.
                        radio_fields_by_id[field_id] = {
                            "field_id": field_id,
                            "type": "radio_group",
                            "page": page_index + 1,
                            "radio_options": [],
                        }
                    radio_fields_by_id[field_id]["radio_options"].append({
                        "value": on_values[0],
                        "rect": rect,
                    })

    fields_with_location = []
    for field_info in field_info_by_id.values():
        if "page" in field_info:
            fields_with_location.append(field_info)
        else:
            print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")

    def sort_key(f):
        # Radio groups are positioned by their first option's rectangle.
        if "radio_options" in f:
            rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]
        else:
            rect = f.get("rect") or [0, 0, 0, 0]
        # Negate rect[1] so larger values sort first, then order by rect[0].
        # NOTE(review): assumes rect is [x0, y0, x1, y1] with y growing
        # upward (PDF default), giving top-to-bottom order -- confirm.
        adjusted_position = [-rect[1], rect[0]]
        return [f.get("page"), adjusted_position]

    sorted_fields = fields_with_location + list(radio_fields_by_id.values())
    sorted_fields.sort(key=sort_key)

    return sorted_fields


def write_field_info(pdf_path: str, json_output_path: str):
    """Read ``pdf_path`` and write its field info to ``json_output_path``.

    The output is the list returned by ``get_field_info``, serialized as
    indented JSON. Prints the number of fields written.
    """
    reader = PdfReader(pdf_path)
    field_info = get_field_info(reader)
    with open(json_output_path, "w", encoding="utf-8") as f:
        json.dump(field_info, f, indent=2)
    print(f"Wrote {len(field_info)} fields to {json_output_path}")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: extract_form_field_info.py [input pdf] [output json]")
        sys.exit(1)
    write_field_info(sys.argv[1], sys.argv[2])