Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Create, read, edit, and manipulate Word (.docx) documents with formatting, tables, and tracked changes
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/office/unpack.py
"""Unpack Office files (DOCX, PPTX, XLSX) for editing.

Extracts the ZIP archive, pretty-prints XML files, and optionally:
- Merges adjacent runs with identical formatting (DOCX only)
- Simplifies adjacent tracked changes from same author (DOCX only)

Usage:
    python unpack.py <office_file> <output_dir> [options]

Examples:
    python unpack.py document.docx unpacked/
    python unpack.py presentation.pptx unpacked/
    python unpack.py document.docx unpacked/ --merge-runs false
"""

import argparse
import sys
import zipfile
from pathlib import Path

import defusedxml.minidom

from helpers.merge_runs import merge_runs as do_merge_runs
from helpers.simplify_redlines import simplify_redlines as do_simplify_redlines

# Smart quotes are rewritten as XML numeric character references so the
# unpacked files can be edited without typing the literal characters.
SMART_QUOTE_REPLACEMENTS = {
    "\u201c": "&#x201C;",
    "\u201d": "&#x201D;",
    "\u2018": "&#x2018;",
    "\u2019": "&#x2019;",
}

# Translation table built once so each file is escaped in a single pass.
_SMART_QUOTE_TABLE = str.maketrans(SMART_QUOTE_REPLACEMENTS)


def unpack(
    input_file: str,
    output_directory: str,
    merge_runs: bool = True,
    simplify_redlines: bool = True,
) -> tuple[None, str]:
    """Extract an Office file into a directory and prepare its XML for editing.

    Args:
        input_file: Path to a .docx, .pptx, or .xlsx file.
        output_directory: Directory to extract into; created if missing.
        merge_runs: For .docx input, run the run-merging helper afterwards.
        simplify_redlines: For .docx input, run the tracked-change
            simplification helper afterwards.

    Returns:
        A ``(None, message)`` tuple. On failure the message starts with
        ``"Error"``; on success it summarizes what was done. Failures are
        reported through the message rather than raised.
    """
    input_path = Path(input_file)
    output_path = Path(output_directory)
    suffix = input_path.suffix.lower()

    if not input_path.exists():
        return None, f"Error: {input_file} does not exist"

    if suffix not in {".docx", ".pptx", ".xlsx"}:
        return None, f"Error: {input_file} must be a .docx, .pptx, or .xlsx file"

    try:
        output_path.mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(input_path, "r") as zf:
            zf.extractall(output_path)

        xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels"))
        for xml_file in xml_files:
            _pretty_print_xml(xml_file)

        message = f"Unpacked {input_file} ({len(xml_files)} XML files)"

        if suffix == ".docx":
            if simplify_redlines:
                simplify_count, _ = do_simplify_redlines(str(output_path))
                message += f", simplified {simplify_count} tracked changes"

            if merge_runs:
                merge_count, _ = do_merge_runs(str(output_path))
                message += f", merged {merge_count} runs"

        # Escape last, after every step that may rewrite the XML files.
        for xml_file in xml_files:
            _escape_smart_quotes(xml_file)

        return None, message

    except zipfile.BadZipFile:
        return None, f"Error: {input_file} is not a valid Office file"
    except Exception as e:
        return None, f"Error unpacking: {e}"


def _pretty_print_xml(xml_file: Path) -> None:
    """Rewrite ``xml_file`` in place as indented UTF-8 XML.

    Best effort: a file that cannot be read or parsed is left untouched.
    """
    try:
        content = xml_file.read_text(encoding="utf-8")
        dom = defusedxml.minidom.parseString(content)
        xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="utf-8"))
    except Exception:
        # Deliberately silent so one bad part does not abort the whole unpack.
        pass


def _escape_smart_quotes(xml_file: Path) -> None:
    """Replace smart quotes in ``xml_file`` with XML character references.

    Best effort: a file that cannot be read or written is left untouched.
    """
    try:
        content = xml_file.read_text(encoding="utf-8")
        xml_file.write_text(content.translate(_SMART_QUOTE_TABLE), encoding="utf-8")
    except Exception:
        # Deliberately silent so one bad part does not abort the whole unpack.
        pass


def _parse_bool_flag(value: str) -> bool:
    """Return True only for the text ``true`` (case-insensitive)."""
    return value.lower() == "true"


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Unpack an Office file (DOCX, PPTX, XLSX) for editing"
    )
    parser.add_argument("input_file", help="Office file to unpack")
    parser.add_argument("output_directory", help="Output directory")
    parser.add_argument(
        "--merge-runs",
        type=_parse_bool_flag,
        default=True,
        metavar="true|false",
        help="Merge adjacent runs with identical formatting (DOCX only, default: true)",
    )
    parser.add_argument(
        "--simplify-redlines",
        type=_parse_bool_flag,
        default=True,
        metavar="true|false",
        help="Merge adjacent tracked changes from same author (DOCX only, default: true)",
    )
    args = parser.parse_args()

    _, message = unpack(
        args.input_file,
        args.output_directory,
        merge_runs=args.merge_runs,
        simplify_redlines=args.simplify_redlines,
    )
    print(message)

    # Every failure message begins with "Error"; a substring test would also
    # trip on a successful unpack whose file name contains "Error".
    if message.startswith("Error"):
        sys.exit(1)