Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Create, read, edit, and format Excel (.xlsx) spreadsheets with formulas, color coding, and financial model standards
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/office/helpers/simplify_redlines.py
"""Simplify tracked changes by merging adjacent w:ins or w:del elements.

Merges adjacent <w:ins> elements from the same author into a single element.
Same for <w:del> elements. This makes heavily-redlined documents easier to
work with by reducing the number of tracked change wrappers.

Rules:
- Only merges w:ins with w:ins, w:del with w:del (same element type)
- Only merges if same author (ignores timestamp differences)
- Only merges if truly adjacent (only whitespace between them)
"""

import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path

import defusedxml.minidom

WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"


def simplify_redlines(input_dir: str) -> tuple[int, str]:
    """Merge adjacent same-author tracked changes in ``word/document.xml``.

    The file ``<input_dir>/word/document.xml`` is parsed, every paragraph
    (``p``) and table cell (``tc``) element is scanned for mergeable
    ``ins`` / ``del`` children, and the document is rewritten in place as
    UTF-8.

    Args:
        input_dir: Directory that contains ``word/document.xml``.

    Returns:
        ``(merge_count, message)``. On any failure (missing file, parse
        error, write error, ...) the count is 0 and the message starts
        with ``"Error: "``; no exception is propagated to the caller.
    """
    doc_xml = Path(input_dir) / "word" / "document.xml"

    if not doc_xml.exists():
        return 0, f"Error: {doc_xml} not found"

    try:
        dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8"))
        root = dom.documentElement

        merge_count = 0

        containers = _find_elements(root, "p") + _find_elements(root, "tc")

        for container in containers:
            merge_count += _merge_tracked_changes_in(container, "ins")
            merge_count += _merge_tracked_changes_in(container, "del")

        # Written only after every merge succeeded, so a failure above
        # leaves the file on disk untouched.
        doc_xml.write_bytes(dom.toxml(encoding="UTF-8"))
        return merge_count, f"Simplified {merge_count} tracked changes"

    except Exception as e:
        # Deliberate best-effort boundary: report the problem as a message
        # instead of raising.
        return 0, f"Error: {e}"


def _merge_tracked_changes_in(container, tag: str) -> int:
    """Merge runs of mergeable ``tag`` elements that are direct children.

    Args:
        container: DOM element whose direct children are examined.
        tag: Local element name to merge (``"ins"`` or ``"del"``).

    Returns:
        Number of elements that were folded into a preceding sibling.
    """
    tracked = [
        child
        for child in container.childNodes
        if child.nodeType == child.ELEMENT_NODE and _is_element(child, tag)
    ]

    if len(tracked) < 2:
        return 0

    merge_count = 0
    # ``current`` is the element that keeps absorbing its followers; it only
    # advances when a follower cannot be merged into it.
    current = tracked[0]
    for candidate in tracked[1:]:
        if _can_merge_tracked(current, candidate):
            _merge_tracked_content(current, candidate)
            container.removeChild(candidate)
            merge_count += 1
        else:
            current = candidate

    return merge_count


def _is_element(node, tag: str) -> bool:
    """Return True if ``node`` is named ``tag``, with or without a prefix."""
    name = node.localName or node.tagName
    return name == tag or name.endswith(f":{tag}")


def _get_author(elem) -> str:
    """Return the author attribute of a tracked-change element.

    Looks up ``w:author`` first, then falls back to any attribute whose
    local name is ``author`` (covers documents using a different prefix).
    Returns an empty string when no author attribute is present.
    """
    author = elem.getAttribute("w:author")
    if not author:
        for attr in elem.attributes.values():
            if attr.localName == "author" or attr.name.endswith(":author"):
                return attr.value
    return author


def _can_merge_tracked(elem1, elem2) -> bool:
    """Return True if ``elem2`` may be merged into ``elem1``.

    Both must have the same author, and nothing but whitespace-only text
    (or non-element, non-text nodes) may sit between them as siblings.
    """
    if _get_author(elem1) != _get_author(elem2):
        return False

    node = elem1.nextSibling
    while node and node != elem2:
        if node.nodeType == node.ELEMENT_NODE:
            return False
        if node.nodeType == node.TEXT_NODE and node.data.strip():
            return False
        node = node.nextSibling

    return True


def _merge_tracked_content(target, source):
    """Move every child of ``source`` to the end of ``target``, in order."""
    while source.firstChild:
        child = source.firstChild
        source.removeChild(child)
        target.appendChild(child)


def _find_elements(root, tag: str) -> list:
    """Collect ``root`` and all descendants named ``tag``, in document order."""
    results = []

    def traverse(node):
        if node.nodeType == node.ELEMENT_NODE:
            if _is_element(node, tag):
                results.append(node)
            for child in node.childNodes:
                traverse(child)

    traverse(root)
    return results


def _count_authors(root: ET.Element) -> dict[str, int]:
    """Count ``w:ins`` / ``w:del`` descendants of ``root`` per author.

    Elements without a (non-empty) ``w:author`` attribute are ignored.
    """
    namespaces = {"w": WORD_NS}
    author_attr = f"{{{WORD_NS}}}author"

    authors: dict[str, int] = {}
    for tag in ("ins", "del"):
        for elem in root.findall(f".//w:{tag}", namespaces):
            author = elem.get(author_attr)
            if author:
                authors[author] = authors.get(author, 0) + 1

    return authors


def get_tracked_change_authors(doc_xml_path: Path) -> dict[str, int]:
    """Return tracked-change counts per author for a ``document.xml`` file.

    Args:
        doc_xml_path: Path to a WordprocessingML ``document.xml``.

    Returns:
        Mapping of author name to number of ``w:ins`` / ``w:del`` elements.
        Empty if the file does not exist or is not well-formed XML.
    """
    if not doc_xml_path.exists():
        return {}

    # NOTE(review): parsed with the stdlib ElementTree, not defusedxml;
    # consider hardening if this can be fed untrusted documents.
    try:
        root = ET.parse(doc_xml_path).getroot()
    except ET.ParseError:
        return {}

    return _count_authors(root)


def _get_authors_from_docx(docx_path: Path) -> dict[str, int]:
    """Return tracked-change counts per author read from a packed ``.docx``.

    Returns an empty mapping if the archive is not a valid zip, has no
    ``word/document.xml`` member, or that member is not well-formed XML.
    Other errors (e.g. a path that cannot be opened) are not caught here.
    """
    # NOTE(review): parsed with the stdlib ElementTree, not defusedxml;
    # consider hardening if this can be fed untrusted documents.
    try:
        with zipfile.ZipFile(docx_path, "r") as zf:
            if "word/document.xml" not in zf.namelist():
                return {}
            with zf.open("word/document.xml") as f:
                root = ET.parse(f).getroot()
    except (zipfile.BadZipFile, ET.ParseError):
        return {}

    return _count_authors(root)


def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude") -> str:
    """Infer which author added tracked changes relative to the original.

    Compares per-author tracked-change counts in
    ``<modified_dir>/word/document.xml`` against those in ``original_docx``.

    Args:
        modified_dir: Directory containing the modified ``word/document.xml``.
        original_docx: Path to the original ``.docx`` archive.
        default: Name returned when no author can be inferred.

    Returns:
        The single author whose count increased, or ``default`` when the
        modified document has no tracked-change authors or no author's
        count increased.

    Raises:
        ValueError: If more than one author's count increased.
    """
    modified_xml = modified_dir / "word" / "document.xml"
    modified_authors = get_tracked_change_authors(modified_xml)

    if not modified_authors:
        return default

    original_authors = _get_authors_from_docx(original_docx)

    new_changes: dict[str, int] = {}
    for author, count in modified_authors.items():
        original_count = original_authors.get(author, 0)
        diff = count - original_count
        if diff > 0:
            new_changes[author] = diff

    if not new_changes:
        return default

    if len(new_changes) == 1:
        return next(iter(new_changes))

    raise ValueError(
        f"Multiple authors added new changes: {new_changes}. "
        "Cannot infer which author to validate."
    )