Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Create, edit, and inspect PowerPoint presentations with professional design and automated visual QA
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/office/helpers/simplify_redlines.py
"""Simplify tracked changes by merging adjacent w:ins or w:del elements.

Merges adjacent <w:ins> elements from the same author into a single element.
Same for <w:del> elements. This makes heavily-redlined documents easier to
work with by reducing the number of tracked change wrappers.

Rules:
- Only merges w:ins with w:ins, w:del with w:del (same element type)
- Only merges if same author (ignores timestamp differences)
- Only merges if truly adjacent (only whitespace between them)
"""

import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path

import defusedxml.minidom

WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"


def simplify_redlines(input_dir: str) -> tuple[int, str]:
    """Merge adjacent tracked changes in ``<input_dir>/word/document.xml``.

    The document is rewritten in place (serialized as UTF-8) whenever it
    parses successfully, even if nothing was merged.

    Args:
        input_dir: Directory of an unpacked document containing
            ``word/document.xml``.

    Returns:
        ``(merge_count, message)``. On any failure the count is 0 and the
        message starts with ``"Error: "``; no exception is propagated.
    """
    doc_xml = Path(input_dir) / "word" / "document.xml"

    if not doc_xml.exists():
        return 0, f"Error: {doc_xml} not found"

    try:
        # Hand the parser raw bytes so it honours the encoding named in the
        # XML declaration instead of assuming the file is UTF-8 text.
        dom = defusedxml.minidom.parseString(doc_xml.read_bytes())
        root = dom.documentElement

        merge_count = 0

        # Tracked changes are merged among the direct children of each
        # paragraph (p) and table cell (tc) element.
        containers = _find_elements(root, "p") + _find_elements(root, "tc")

        for container in containers:
            merge_count += _merge_tracked_changes_in(container, "ins")
            merge_count += _merge_tracked_changes_in(container, "del")

        doc_xml.write_bytes(dom.toxml(encoding="UTF-8"))
        return merge_count, f"Simplified {merge_count} tracked changes"

    except Exception as e:
        # Deliberate best-effort boundary: callers get an error message
        # rather than an exception.
        return 0, f"Error: {e}"


def _merge_tracked_changes_in(container, tag: str) -> int:
    """Merge adjacent ``tag`` elements that are direct children of ``container``.

    Args:
        container: DOM element whose direct children are examined.
        tag: Local element name to merge, e.g. ``"ins"`` or ``"del"``.

    Returns:
        The number of elements that were folded into a preceding sibling.
    """
    merge_count = 0

    tracked = [
        child
        for child in container.childNodes
        if child.nodeType == child.ELEMENT_NODE and _is_element(child, tag)
    ]

    if len(tracked) < 2:
        return 0

    i = 0
    while i < len(tracked) - 1:
        curr = tracked[i]
        next_elem = tracked[i + 1]

        if _can_merge_tracked(curr, next_elem):
            _merge_tracked_content(curr, next_elem)
            container.removeChild(next_elem)
            tracked.pop(i + 1)
            merge_count += 1
            # Do not advance: the grown ``curr`` may also absorb the element
            # that now follows it.
        else:
            i += 1

    return merge_count


def _is_element(node, tag: str) -> bool:
    """Return True if ``node`` is named ``tag``, with or without a prefix."""
    name = node.localName or node.tagName
    return name == tag or name.endswith(f":{tag}")


def _get_author(elem) -> str:
    """Return the author attribute of a tracked-change element.

    Looks up ``w:author`` first, then falls back to any attribute whose local
    name is ``author`` (or whose qualified name ends in ``:author``).
    Returns an empty string when no such attribute is found.
    """
    author = elem.getAttribute("w:author")
    if not author:
        for attr in elem.attributes.values():
            if attr.localName == "author" or attr.name.endswith(":author"):
                return attr.value
    return author


def _can_merge_tracked(elem1, elem2) -> bool:
    """Return True if ``elem2`` may be merged into ``elem1``.

    Both elements must have the same author, and only whitespace-only text
    (no elements, no other text) may sit between them.
    """
    if _get_author(elem1) != _get_author(elem2):
        return False

    node = elem1.nextSibling
    while node and node is not elem2:
        if node.nodeType == node.ELEMENT_NODE:
            return False
        if node.nodeType == node.TEXT_NODE and node.data.strip():
            return False
        node = node.nextSibling

    return True


def _merge_tracked_content(target, source):
    """Move every child of ``source`` to the end of ``target``, in order."""
    while source.firstChild:
        child = source.firstChild
        source.removeChild(child)
        target.appendChild(child)


def _find_elements(root, tag: str) -> list:
    """Return all elements named ``tag`` under ``root`` in document order.

    ``root`` itself is included when it matches. The walk uses an explicit
    stack rather than recursion so deeply nested XML cannot hit Python's
    recursion limit.
    """
    results = []
    stack = [root]

    while stack:
        node = stack.pop()
        if node.nodeType != node.ELEMENT_NODE:
            continue
        if _is_element(node, tag):
            results.append(node)
        # Push children reversed so they are popped left to right, which
        # preserves pre-order (document) order in ``results``.
        stack.extend(reversed(node.childNodes))

    return results


def _count_authors(root) -> dict[str, int]:
    """Count w:ins and w:del elements per author under an ElementTree root.

    Elements with a missing or empty author attribute are ignored.
    """
    namespaces = {"w": WORD_NS}
    author_attr = f"{{{WORD_NS}}}author"

    authors: dict[str, int] = {}
    for tag in ("ins", "del"):
        for elem in root.findall(f".//w:{tag}", namespaces):
            author = elem.get(author_attr)
            if author:
                authors[author] = authors.get(author, 0) + 1

    return authors


def get_tracked_change_authors(doc_xml_path: Path) -> dict[str, int]:
    """Count tracked changes per author in a ``document.xml`` file.

    Args:
        doc_xml_path: Path to the XML file to inspect.

    Returns:
        Mapping of author name to the number of w:ins/w:del elements carrying
        that author. Empty if the file does not exist or is not well-formed.
    """
    if not doc_xml_path.exists():
        return {}

    # NOTE(review): this uses the stdlib ElementTree parser, whereas
    # simplify_redlines parses with defusedxml; consider the defused parser
    # here too if the input may be untrusted.
    try:
        root = ET.parse(doc_xml_path).getroot()
    except ET.ParseError:
        return {}

    return _count_authors(root)


def _get_authors_from_docx(docx_path: Path) -> dict[str, int]:
    """Count tracked changes per author inside a packed .docx archive.

    Returns an empty mapping when the archive is not a valid zip, has no
    ``word/document.xml`` member, or that member is not well-formed XML.
    """
    try:
        with zipfile.ZipFile(docx_path, "r") as zf:
            if "word/document.xml" not in zf.namelist():
                return {}
            with zf.open("word/document.xml") as f:
                root = ET.parse(f).getroot()
    except (zipfile.BadZipFile, ET.ParseError):
        return {}

    return _count_authors(root)


def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude") -> str:
    """Infer which author added tracked changes relative to the original.

    Per-author tracked-change counts in the modified, unpacked document are
    compared with those in the original .docx; an author whose count grew is
    considered to have added new changes.

    Args:
        modified_dir: Unpacked document directory containing
            ``word/document.xml``.
        original_docx: Path to the original packed .docx file.
        default: Name returned when no author can be inferred.

    Returns:
        The single author with new changes, or ``default`` when the modified
        document has no tracked changes or no author's count increased.

    Raises:
        ValueError: If more than one author added new changes.
    """
    modified_xml = modified_dir / "word" / "document.xml"
    modified_authors = get_tracked_change_authors(modified_xml)

    if not modified_authors:
        return default

    original_authors = _get_authors_from_docx(original_docx)

    new_changes: dict[str, int] = {}
    for author, count in modified_authors.items():
        original_count = original_authors.get(author, 0)
        diff = count - original_count
        if diff > 0:
            new_changes[author] = diff

    if not new_changes:
        return default

    if len(new_changes) == 1:
        return next(iter(new_changes))

    raise ValueError(
        f"Multiple authors added new changes: {new_changes}. "
        "Cannot infer which author to validate."
    )