Source from repo

Requirements for Outputs

Create, read, edit, and format Excel (.xlsx) spreadsheets with formulas, color coding, and financial model standards

anthropicsGitHub anthropicsOfficialSource repo Original GitHub link Publisher page

Files

Skill

n/a

Size

1.0 MB

Entrypoint

SKILL.md

Format

git-repo

Open file

scripts/office/helpers/simplify_redlines.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code198 linesFree

scripts/office/helpers/simplify_redlines.py

1"""Simplify tracked changes by merging adjacent w:ins or w:del elements.
2 
3Merges adjacent <w:ins> elements from the same author into a single element.
4Same for <w:del> elements. This makes heavily-redlined documents easier to
5work with by reducing the number of tracked change wrappers.
6 
7Rules:
8- Only merges w:ins with w:ins, w:del with w:del (same element type)
9- Only merges if same author (ignores timestamp differences)
10- Only merges if truly adjacent (only whitespace between them)
11"""
12 
13import xml.etree.ElementTree as ET
14import zipfile
15from pathlib import Path
16 
17import defusedxml.minidom
18 
19WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
20 
21 
def simplify_redlines(input_dir: str) -> tuple[int, str]:
    """Merge adjacent same-author tracked changes in an unpacked document.

    Reads ``word/document.xml`` below *input_dir*, merges neighbouring
    ``ins`` elements (and, separately, neighbouring ``del`` elements) that
    are direct children of each paragraph or table-cell element, then
    overwrites the file with the result encoded as UTF-8.

    Returns:
        A ``(merge_count, message)`` pair. If the file is missing, or any
        step raises, the count is 0 and the message starts with "Error: ".
    """
    doc_xml = Path(input_dir) / "word" / "document.xml"
    if not doc_xml.exists():
        return 0, f"Error: {doc_xml} not found"

    try:
        xml_text = doc_xml.read_text(encoding="utf-8")
        dom = defusedxml.minidom.parseString(xml_text)
        root = dom.documentElement

        # Both container lists are gathered before the tree is mutated.
        containers = [*_find_elements(root, "p"), *_find_elements(root, "tc")]

        merge_count = 0
        for container in containers:
            for change_tag in ("ins", "del"):
                merge_count += _merge_tracked_changes_in(container, change_tag)

        doc_xml.write_bytes(dom.toxml(encoding="UTF-8"))
    except Exception as e:
        # Failures are reported through the return value, not raised.
        return 0, f"Error: {e}"

    return merge_count, f"Simplified {merge_count} tracked changes"
45 
46 
def _merge_tracked_changes_in(container, tag: str) -> int:
    """Merge runs of mergeable *tag* elements among *container*'s children.

    Only direct children of *container* are considered. Each element that
    can be merged is folded into the closest preceding element that was
    kept, and is then removed from *container*.

    Returns:
        The number of elements that were merged away.
    """
    candidates = [
        node
        for node in container.childNodes
        if node.nodeType == node.ELEMENT_NODE and _is_element(node, tag)
    ]
    if len(candidates) < 2:
        return 0

    merged = 0
    anchor = candidates[0]
    for following in candidates[1:]:
        if _can_merge_tracked(anchor, following):
            # Absorb the later element; the anchor keeps collecting.
            _merge_tracked_content(anchor, following)
            container.removeChild(following)
            merged += 1
        else:
            # Not mergeable: the later element starts a new run.
            anchor = following

    return merged
73 
74 
def _is_element(node, tag: str) -> bool:
    """Return True if *node* is named *tag*, with or without a prefix.

    The local name is used when the node has one; otherwise the full tag
    name is compared, accepting either ``tag`` or ``<prefix>:tag``.
    """
    name = node.localName if node.localName else node.tagName
    if name == tag:
        return True
    return name.endswith(":" + tag)
78 
79 
def _get_author(elem) -> str:
    """Return the author recorded on a tracked-change element.

    ``w:author`` is looked up first. If that is empty, the first attribute
    whose local name is ``author`` (or whose qualified name ends in
    ``:author``) is used. An empty string is returned when none is found.
    """
    direct = elem.getAttribute("w:author")
    if direct:
        return direct

    # Fall back to any author attribute, whatever its prefix.
    for attribute in elem.attributes.values():
        if attribute.localName == "author" or attribute.name.endswith(":author"):
            return attribute.value

    return direct
87 
88 
def _can_merge_tracked(elem1, elem2) -> bool:
    """Decide whether *elem2* may be folded into *elem1*.

    The two elements must report the same author, and the siblings between
    them must contain no element and no text other than whitespace. Other
    node types found in between are ignored.
    """
    if _get_author(elem1) != _get_author(elem2):
        return False

    gap = elem1.nextSibling
    while gap and gap != elem2:
        is_element = gap.nodeType == gap.ELEMENT_NODE
        is_visible_text = gap.nodeType == gap.TEXT_NODE and bool(gap.data.strip())
        if is_element or is_visible_text:
            return False
        gap = gap.nextSibling

    return True
102 
103 
def _merge_tracked_content(target, source):
    """Move every child of *source* to the end of *target*, keeping order.

    *source* is left in place but without children.
    """
    # Iterate over a snapshot because the live child list shrinks as we go.
    for child in list(source.childNodes):
        source.removeChild(child)
        target.appendChild(child)
109 
110 
def _find_elements(root, tag: str) -> list:
    """Collect *root* and its descendants that are elements named *tag*.

    A node matches when its local name (or, lacking one, its tag name)
    equals *tag* or ends with ``:tag``. Matches are returned in document
    order. Children of non-element nodes are not visited, so a *root* that
    is not an element yields an empty list.

    The walk uses an explicit stack rather than recursion, so deeply
    nested trees cannot exhaust the interpreter's recursion limit.
    """
    results = []
    pending = [root]

    while pending:
        node = pending.pop()
        if node.nodeType != node.ELEMENT_NODE:
            continue

        name = node.localName or node.tagName
        if name == tag or name.endswith(f":{tag}"):
            results.append(node)

        # Push children reversed so the first child is popped first,
        # which keeps the pre-order (document order) of the results.
        pending.extend(reversed(node.childNodes))

    return results
124 
125 
def _count_change_authors(root) -> dict[str, int]:
    """Count ``w:ins`` and ``w:del`` descendants of *root* per author.

    Elements whose ``w:author`` attribute is missing or empty are skipped.
    Insertions are counted before deletions.
    """
    namespaces = {"w": WORD_NS}
    author_attr = f"{{{WORD_NS}}}author"

    authors: dict[str, int] = {}
    for tag in ("ins", "del"):
        for elem in root.findall(f".//w:{tag}", namespaces):
            author = elem.get(author_attr)
            if author:
                authors[author] = authors.get(author, 0) + 1
    return authors


def get_tracked_change_authors(doc_xml_path: Path) -> dict[str, int]:
    """Count tracked changes per author in a document XML file.

    Returns:
        A mapping of author name to the number of ``w:ins``/``w:del``
        elements carrying that author. The mapping is empty when the file
        does not exist or cannot be parsed as XML.
    """
    if not doc_xml_path.exists():
        return {}

    # NOTE(review): this uses the stdlib ElementTree parser, whereas
    # simplify_redlines parses with defusedxml; confirm that is intended.
    try:
        root = ET.parse(doc_xml_path).getroot()
    except ET.ParseError:
        return {}

    return _count_change_authors(root)


def _get_authors_from_docx(docx_path: Path) -> dict[str, int]:
    """Count tracked changes per author inside a packed .docx archive.

    Returns:
        The same kind of mapping as ``get_tracked_change_authors``, read
        from the ``word/document.xml`` archive member. The mapping is empty
        when that member is absent, the archive is not a valid zip file,
        or the XML cannot be parsed.
    """
    try:
        with zipfile.ZipFile(docx_path, "r") as zf:
            if "word/document.xml" not in zf.namelist():
                return {}
            with zf.open("word/document.xml") as f:
                root = ET.parse(f).getroot()
    except (zipfile.BadZipFile, ET.ParseError):
        return {}

    return _count_change_authors(root)
170 
171 
def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude") -> str:
    """Infer which author added tracked changes to the modified document.

    Per-author tracked-change counts in ``modified_dir/word/document.xml``
    are compared with those in *original_docx*. An author counts as having
    added changes when their count is higher in the modified document.

    Returns:
        The single author with added changes, or *default* when the
        modified document has no tracked-change authors or no author's
        count increased.

    Raises:
        ValueError: If more than one author's count increased.
    """
    modified_authors = get_tracked_change_authors(
        modified_dir / "word" / "document.xml"
    )
    if not modified_authors:
        return default

    original_authors = _get_authors_from_docx(original_docx)

    # Keep only authors whose count grew, mapped to the size of the growth.
    new_changes: dict[str, int] = {
        author: count - original_authors.get(author, 0)
        for author, count in modified_authors.items()
        if count > original_authors.get(author, 0)
    }

    if len(new_changes) > 1:
        raise ValueError(
            f"Multiple authors added new changes: {new_changes}. "
            "Cannot infer which author to validate."
        )

    return next(iter(new_changes), default)
198

Marketplace

Source from repo

Requirements for Outputs

Create, read, edit, and format Excel (.xlsx) spreadsheets with formulas, color coding, and financial model standards

anthropicsGitHub anthropicsOfficialSource repo Original GitHub link Publisher page

Files

Skill

n/a

Size

1.0 MB

Entrypoint

SKILL.md

Format

git-repo

Open file

scripts/office/helpers/simplify_redlines.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code198 linesFree

scripts/office/helpers/simplify_redlines.py

1"""Simplify tracked changes by merging adjacent w:ins or w:del elements.
2 
3Merges adjacent <w:ins> elements from the same author into a single element.
4Same for <w:del> elements. This makes heavily-redlined documents easier to
5work with by reducing the number of tracked change wrappers.
6 
7Rules:
8- Only merges w:ins with w:ins, w:del with w:del (same element type)
9- Only merges if same author (ignores timestamp differences)
10- Only merges if truly adjacent (only whitespace between them)
11"""
12 
13import xml.etree.ElementTree as ET
14import zipfile
15from pathlib import Path
16 
17import defusedxml.minidom
18 
19WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
20 
21 
def simplify_redlines(input_dir: str) -> tuple[int, str]:
    """Merge adjacent same-author tracked changes in an unpacked document.

    Reads ``word/document.xml`` below *input_dir*, merges neighbouring
    ``ins`` elements (and, separately, neighbouring ``del`` elements) that
    are direct children of each paragraph or table-cell element, then
    overwrites the file with the result encoded as UTF-8.

    Returns:
        A ``(merge_count, message)`` pair. If the file is missing, or any
        step raises, the count is 0 and the message starts with "Error: ".
    """
    doc_xml = Path(input_dir) / "word" / "document.xml"
    if not doc_xml.exists():
        return 0, f"Error: {doc_xml} not found"

    try:
        xml_text = doc_xml.read_text(encoding="utf-8")
        dom = defusedxml.minidom.parseString(xml_text)
        root = dom.documentElement

        # Both container lists are gathered before the tree is mutated.
        containers = [*_find_elements(root, "p"), *_find_elements(root, "tc")]

        merge_count = 0
        for container in containers:
            for change_tag in ("ins", "del"):
                merge_count += _merge_tracked_changes_in(container, change_tag)

        doc_xml.write_bytes(dom.toxml(encoding="UTF-8"))
    except Exception as e:
        # Failures are reported through the return value, not raised.
        return 0, f"Error: {e}"

    return merge_count, f"Simplified {merge_count} tracked changes"
45 
46 
def _merge_tracked_changes_in(container, tag: str) -> int:
    """Merge runs of mergeable *tag* elements among *container*'s children.

    Only direct children of *container* are considered. Each element that
    can be merged is folded into the closest preceding element that was
    kept, and is then removed from *container*.

    Returns:
        The number of elements that were merged away.
    """
    candidates = [
        node
        for node in container.childNodes
        if node.nodeType == node.ELEMENT_NODE and _is_element(node, tag)
    ]
    if len(candidates) < 2:
        return 0

    merged = 0
    anchor = candidates[0]
    for following in candidates[1:]:
        if _can_merge_tracked(anchor, following):
            # Absorb the later element; the anchor keeps collecting.
            _merge_tracked_content(anchor, following)
            container.removeChild(following)
            merged += 1
        else:
            # Not mergeable: the later element starts a new run.
            anchor = following

    return merged
73 
74 
def _is_element(node, tag: str) -> bool:
    """Return True if *node* is named *tag*, with or without a prefix.

    The local name is used when the node has one; otherwise the full tag
    name is compared, accepting either ``tag`` or ``<prefix>:tag``.
    """
    name = node.localName if node.localName else node.tagName
    if name == tag:
        return True
    return name.endswith(":" + tag)
78 
79 
def _get_author(elem) -> str:
    """Return the author recorded on a tracked-change element.

    ``w:author`` is looked up first. If that is empty, the first attribute
    whose local name is ``author`` (or whose qualified name ends in
    ``:author``) is used. An empty string is returned when none is found.
    """
    direct = elem.getAttribute("w:author")
    if direct:
        return direct

    # Fall back to any author attribute, whatever its prefix.
    for attribute in elem.attributes.values():
        if attribute.localName == "author" or attribute.name.endswith(":author"):
            return attribute.value

    return direct
87 
88 
def _can_merge_tracked(elem1, elem2) -> bool:
    """Decide whether *elem2* may be folded into *elem1*.

    The two elements must report the same author, and the siblings between
    them must contain no element and no text other than whitespace. Other
    node types found in between are ignored.
    """
    if _get_author(elem1) != _get_author(elem2):
        return False

    gap = elem1.nextSibling
    while gap and gap != elem2:
        is_element = gap.nodeType == gap.ELEMENT_NODE
        is_visible_text = gap.nodeType == gap.TEXT_NODE and bool(gap.data.strip())
        if is_element or is_visible_text:
            return False
        gap = gap.nextSibling

    return True
102 
103 
def _merge_tracked_content(target, source):
    """Move every child of *source* to the end of *target*, keeping order.

    *source* is left in place but without children.
    """
    # Iterate over a snapshot because the live child list shrinks as we go.
    for child in list(source.childNodes):
        source.removeChild(child)
        target.appendChild(child)
109 
110 
def _find_elements(root, tag: str) -> list:
    """Collect *root* and its descendants that are elements named *tag*.

    A node matches when its local name (or, lacking one, its tag name)
    equals *tag* or ends with ``:tag``. Matches are returned in document
    order. Children of non-element nodes are not visited, so a *root* that
    is not an element yields an empty list.

    The walk uses an explicit stack rather than recursion, so deeply
    nested trees cannot exhaust the interpreter's recursion limit.
    """
    results = []
    pending = [root]

    while pending:
        node = pending.pop()
        if node.nodeType != node.ELEMENT_NODE:
            continue

        name = node.localName or node.tagName
        if name == tag or name.endswith(f":{tag}"):
            results.append(node)

        # Push children reversed so the first child is popped first,
        # which keeps the pre-order (document order) of the results.
        pending.extend(reversed(node.childNodes))

    return results
124 
125 
def _count_change_authors(root) -> dict[str, int]:
    """Count ``w:ins`` and ``w:del`` descendants of *root* per author.

    Elements whose ``w:author`` attribute is missing or empty are skipped.
    Insertions are counted before deletions.
    """
    namespaces = {"w": WORD_NS}
    author_attr = f"{{{WORD_NS}}}author"

    authors: dict[str, int] = {}
    for tag in ("ins", "del"):
        for elem in root.findall(f".//w:{tag}", namespaces):
            author = elem.get(author_attr)
            if author:
                authors[author] = authors.get(author, 0) + 1
    return authors


def get_tracked_change_authors(doc_xml_path: Path) -> dict[str, int]:
    """Count tracked changes per author in a document XML file.

    Returns:
        A mapping of author name to the number of ``w:ins``/``w:del``
        elements carrying that author. The mapping is empty when the file
        does not exist or cannot be parsed as XML.
    """
    if not doc_xml_path.exists():
        return {}

    # NOTE(review): this uses the stdlib ElementTree parser, whereas
    # simplify_redlines parses with defusedxml; confirm that is intended.
    try:
        root = ET.parse(doc_xml_path).getroot()
    except ET.ParseError:
        return {}

    return _count_change_authors(root)


def _get_authors_from_docx(docx_path: Path) -> dict[str, int]:
    """Count tracked changes per author inside a packed .docx archive.

    Returns:
        The same kind of mapping as ``get_tracked_change_authors``, read
        from the ``word/document.xml`` archive member. The mapping is empty
        when that member is absent, the archive is not a valid zip file,
        or the XML cannot be parsed.
    """
    try:
        with zipfile.ZipFile(docx_path, "r") as zf:
            if "word/document.xml" not in zf.namelist():
                return {}
            with zf.open("word/document.xml") as f:
                root = ET.parse(f).getroot()
    except (zipfile.BadZipFile, ET.ParseError):
        return {}

    return _count_change_authors(root)
170 
171 
def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude") -> str:
    """Infer which author added tracked changes to the modified document.

    Per-author tracked-change counts in ``modified_dir/word/document.xml``
    are compared with those in *original_docx*. An author counts as having
    added changes when their count is higher in the modified document.

    Returns:
        The single author with added changes, or *default* when the
        modified document has no tracked-change authors or no author's
        count increased.

    Raises:
        ValueError: If more than one author's count increased.
    """
    modified_authors = get_tracked_change_authors(
        modified_dir / "word" / "document.xml"
    )
    if not modified_authors:
        return default

    original_authors = _get_authors_from_docx(original_docx)

    # Keep only authors whose count grew, mapped to the size of the growth.
    new_changes: dict[str, int] = {
        author: count - original_authors.get(author, 0)
        for author, count in modified_authors.items()
        if count > original_authors.get(author, 0)
    }

    if len(new_changes) > 1:
        raise ValueError(
            f"Multiple authors added new changes: {new_changes}. "
            "Cannot infer which author to validate."
        )

    return next(iter(new_changes), default)
198

Requirements for Outputs

scripts/office/helpers/simplify_redlines.py

Preparing the source view

Requirements for Outputs

scripts/office/helpers/simplify_redlines.py