Source from repo
PPTX Skill

Create, edit, and inspect PowerPoint presentations with professional design and automated visual QA
anthropics · GitHub · anthropics · Official · Source repo · Original GitHub link · Publisher page
Files
Skill
n/a
Size
1.1 MB
Entrypoint
SKILL.md
Format
git-repo
Open file
scripts/office/helpers/simplify_redlines.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code · 198 lines · Free
scripts/office/helpers/simplify_redlines.py
1"""Simplify tracked changes by merging adjacent w:ins or w:del elements.
2 
3Merges adjacent <w:ins> elements from the same author into a single element.
4Same for <w:del> elements. This makes heavily-redlined documents easier to
5work with by reducing the number of tracked change wrappers.
6 
7Rules:
8- Only merges w:ins with w:ins, w:del with w:del (same element type)
9- Only merges if same author (ignores timestamp differences)
10- Only merges if truly adjacent (only whitespace between them)
11"""
12 
13import xml.etree.ElementTree as ET
14import zipfile
15from pathlib import Path
16 
17import defusedxml.minidom
18 
19WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
20 
21 
def simplify_redlines(input_dir: str) -> tuple[int, str]:
    """Merge adjacent tracked changes in ``word/document.xml`` in place.

    The file is looked up under ``input_dir``, parsed, rewritten with
    adjacent same-author ``ins``/``del`` wrappers merged, and saved back to
    the same path as UTF-8.

    Args:
        input_dir: Directory that contains the ``word/document.xml`` file.

    Returns:
        A ``(merge_count, message)`` tuple. When the file is missing or any
        exception occurs while processing, the count is 0 and the message
        starts with ``"Error: "``.
    """
    doc_xml = Path(input_dir) / "word" / "document.xml"
    if not doc_xml.exists():
        return 0, f"Error: {doc_xml} not found"

    try:
        xml_text = doc_xml.read_text(encoding="utf-8")
        dom = defusedxml.minidom.parseString(xml_text)
        root = dom.documentElement

        # Collect every container up front: paragraphs first, then table cells.
        containers = [*_find_elements(root, "p"), *_find_elements(root, "tc")]

        # For each container, insertions are merged before deletions.
        merge_count = sum(
            _merge_tracked_changes_in(container, tag)
            for container in containers
            for tag in ("ins", "del")
        )

        doc_xml.write_bytes(dom.toxml(encoding="UTF-8"))
    except Exception as e:
        # Any failure is reported through the return value, not raised.
        return 0, f"Error: {e}"

    return merge_count, f"Simplified {merge_count} tracked changes"
45 
46 
def _merge_tracked_changes_in(container, tag: str) -> int:
    """Merge mergeable runs of ``tag`` elements that are direct children.

    Args:
        container: DOM node whose direct children are examined.
        tag: Local element name to look for (for example ``"ins"``).

    Returns:
        The number of elements that were folded into a preceding sibling and
        removed from ``container``.
    """
    candidates = [
        node
        for node in container.childNodes
        if node.nodeType == node.ELEMENT_NODE and _is_element(node, tag)
    ]

    merged = 0
    anchor = 0
    # With fewer than two candidates the loop body never runs.
    while anchor + 1 < len(candidates):
        keeper = candidates[anchor]
        follower = candidates[anchor + 1]

        if not _can_merge_tracked(keeper, follower):
            anchor += 1
            continue

        # Absorb the follower, then stay on the same keeper so that it is
        # compared against whatever element now comes next.
        _merge_tracked_content(keeper, follower)
        container.removeChild(follower)
        del candidates[anchor + 1]
        merged += 1

    return merged
73 
74 
75def _is_element(node, tag: str) -> bool:
76    name = node.localName or node.tagName
77    return name == tag or name.endswith(f":{tag}")
78 
79 
80def _get_author(elem) -> str:
81    author = elem.getAttribute("w:author")
82    if not author:
83        for attr in elem.attributes.values():
84            if attr.localName == "author" or attr.name.endswith(":author"):
85                return attr.value
86    return author
87 
88 
def _can_merge_tracked(elem1, elem2) -> bool:
    """Decide whether ``elem2`` may be folded into ``elem1``.

    The two elements must report the same author, and the siblings that sit
    between them must contain no element node and no text node with
    non-whitespace content.
    """
    if _get_author(elem1) != _get_author(elem2):
        return False

    between = elem1.nextSibling
    while between and between != elem2:
        blocks_as_element = between.nodeType == between.ELEMENT_NODE
        # ``data`` is read only for text nodes, thanks to short-circuiting.
        blocks_as_text = (
            between.nodeType == between.TEXT_NODE and between.data.strip()
        )
        if blocks_as_element or blocks_as_text:
            return False
        between = between.nextSibling

    return True
102 
103 
104def _merge_tracked_content(target, source):
105    while source.firstChild:
106        child = source.firstChild
107        source.removeChild(child)
108        target.appendChild(child)
109 
110 
111def _find_elements(root, tag: str) -> list:
112    results = []
113 
114    def traverse(node):
115        if node.nodeType == node.ELEMENT_NODE:
116            name = node.localName or node.tagName
117            if name == tag or name.endswith(f":{tag}"):
118                results.append(node)
119            for child in node.childNodes:
120                traverse(child)
121 
122    traverse(root)
123    return results
124 
125 
def get_tracked_change_authors(doc_xml_path: Path) -> dict[str, int]:
    """Count tracked-change elements per author in a document XML file.

    Args:
        doc_xml_path: Path to the XML file to inspect.

    Returns:
        A mapping from author name to the number of ``w:ins`` and ``w:del``
        descendants of the root carrying that author. Elements without an
        author value are not counted. The mapping is empty when the file
        does not exist or cannot be parsed as XML.
    """
    if not doc_xml_path.exists():
        return {}

    try:
        root = ET.parse(doc_xml_path).getroot()
    except ET.ParseError:
        return {}

    ns_map = {"w": WORD_NS}
    author_key = f"{{{WORD_NS}}}author"

    tally: dict[str, int] = {}
    # Insertions are scanned before deletions; this fixes the key order.
    for kind in ("ins", "del"):
        for change in root.findall(f".//w:{kind}", ns_map):
            name = change.get(author_key)
            if not name:
                continue
            tally[name] = tally.get(name, 0) + 1

    return tally
147 
148 
def _get_authors_from_docx(docx_path: Path) -> dict[str, int]:
    """Count tracked-change elements per author inside a .docx archive.

    Reads ``word/document.xml`` straight from the zip file at ``docx_path``.

    Returns:
        A mapping from author name to the number of ``w:ins`` and ``w:del``
        descendants of the root carrying that author. The mapping is empty
        when the archive is not a valid zip, has no ``word/document.xml``
        member, or that member cannot be parsed as XML.
    """
    try:
        with zipfile.ZipFile(docx_path, "r") as archive:
            if "word/document.xml" not in archive.namelist():
                return {}
            with archive.open("word/document.xml") as stream:
                # The tree is fully read here, before the archive closes.
                root = ET.parse(stream).getroot()
    except (zipfile.BadZipFile, ET.ParseError):
        return {}

    ns_map = {"w": WORD_NS}
    author_key = f"{{{WORD_NS}}}author"

    tally: dict[str, int] = {}
    # Insertions are scanned before deletions; this fixes the key order.
    for kind in ("ins", "del"):
        for change in root.findall(f".//w:{kind}", ns_map):
            name = change.get(author_key)
            if not name:
                continue
            tally[name] = tally.get(name, 0) + 1

    return tally
170 
171 
def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude") -> str:
    """Infer which author added tracked changes relative to the original.

    Per-author tracked-change counts from ``modified_dir/word/document.xml``
    are compared with the counts from ``original_docx``. An author counts as
    having added changes when their count in the modified document is higher
    than in the original.

    Args:
        modified_dir: Directory containing the modified ``word/document.xml``.
        original_docx: Path to the original .docx archive.
        default: Name returned when no author can be singled out.

    Returns:
        The single author with new changes, or ``default`` when the modified
        document has no authored tracked changes or no author's count grew.

    Raises:
        ValueError: If more than one author has added changes.
    """
    modified_authors = get_tracked_change_authors(
        modified_dir / "word" / "document.xml"
    )
    if not modified_authors:
        return default

    original_authors = _get_authors_from_docx(original_docx)

    # Keep only the authors whose count grew, mapped to the size of the growth.
    new_changes: dict[str, int] = {
        name: total - original_authors.get(name, 0)
        for name, total in modified_authors.items()
        if total - original_authors.get(name, 0) > 0
    }

    if not new_changes:
        return default

    if len(new_changes) > 1:
        raise ValueError(
            f"Multiple authors added new changes: {new_changes}. "
            "Cannot infer which author to validate."
        )

    return next(iter(new_changes))
198
Preparing the source view

PPTX Skill

scripts/office/helpers/simplify_redlines.py