Source from repo

DOCX creation, editing, and analysis

Create, read, edit, and manipulate Word (.docx) documents with formatting, tables, and tracked changes

anthropics · GitHub · anthropics · Official · Source repo · Original GitHub link · Publisher page

Files

Skill

n/a

Size

1.1 MB

Entrypoint

SKILL.md

Format

git-repo

Open file

scripts/office/helpers/merge_runs.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code · 200 lines · Free

scripts/office/helpers/merge_runs.py

1"""Merge adjacent runs with identical formatting in DOCX.
2 
3Merges adjacent <w:r> elements that have identical <w:rPr> properties.
4Works on runs in paragraphs and inside tracked changes (<w:ins>, <w:del>).
5 
6Also:
7- Removes rsid attributes from runs (revision metadata that doesn't affect rendering)
8- Removes proofErr elements (spell/grammar markers that block merging)
9"""
10 
11from pathlib import Path
12 
13import defusedxml.minidom
14 
15 
def merge_runs(input_dir: str) -> tuple[int, str]:
    """Merge adjacent, identically formatted runs in ``word/document.xml``.

    The file is rewritten in place (serialized as UTF-8) after ``proofErr``
    elements are removed and rsid attributes are stripped from runs.

    Args:
        input_dir: Directory that contains ``word/document.xml``.

    Returns:
        ``(merge_count, message)``. On any failure the count is 0 and the
        message starts with ``"Error: "``; no exception is raised, and the
        file is only written after all merging has succeeded.
    """
    doc_xml = Path(input_dir) / "word" / "document.xml"

    if not doc_xml.exists():
        return 0, f"Error: {doc_xml} not found"

    try:
        # Hand the parser raw bytes so it honours the encoding named in the
        # XML declaration instead of assuming the file decodes as UTF-8.
        dom = defusedxml.minidom.parseString(doc_xml.read_bytes())
        root = dom.documentElement

        _remove_elements(root, "proofErr")
        _strip_run_rsid_attrs(root)

        # dict.fromkeys de-duplicates parents while keeping document order,
        # so containers are processed deterministically.
        containers = dict.fromkeys(
            run.parentNode for run in _find_elements(root, "r")
        )

        merge_count = sum(_merge_runs_in(container) for container in containers)

        doc_xml.write_bytes(dom.toxml(encoding="UTF-8"))
        return merge_count, f"Merged {merge_count} runs"

    except Exception as e:
        # Top-level boundary: callers receive the failure as a message.
        return 0, f"Error: {e}"
40 
41 
42 
43 
44def _find_elements(root, tag: str) -> list:
45    results = []
46 
47    def traverse(node):
48        if node.nodeType == node.ELEMENT_NODE:
49            name = node.localName or node.tagName
50            if name == tag or name.endswith(f":{tag}"):
51                results.append(node)
52            for child in node.childNodes:
53                traverse(child)
54 
55    traverse(root)
56    return results
57 
58 
59def _get_child(parent, tag: str):
60    for child in parent.childNodes:
61        if child.nodeType == child.ELEMENT_NODE:
62            name = child.localName or child.tagName
63            if name == tag or name.endswith(f":{tag}"):
64                return child
65    return None
66 
67 
68def _get_children(parent, tag: str) -> list:
69    results = []
70    for child in parent.childNodes:
71        if child.nodeType == child.ELEMENT_NODE:
72            name = child.localName or child.tagName
73            if name == tag or name.endswith(f":{tag}"):
74                results.append(child)
75    return results
76 
77 
78def _is_adjacent(elem1, elem2) -> bool:
79    node = elem1.nextSibling
80    while node:
81        if node == elem2:
82            return True
83        if node.nodeType == node.ELEMENT_NODE:
84            return False
85        if node.nodeType == node.TEXT_NODE and node.data.strip():
86            return False
87        node = node.nextSibling
88    return False
89 
90 
91 
92 
def _remove_elements(root, tag: str):
    """Detach every element named *tag* found at or below *root*.

    Each match is removed from its parent together with its subtree; a match
    that has no parent is left untouched.
    """
    for doomed in _find_elements(root, tag):
        owner = doomed.parentNode
        if owner:
            owner.removeChild(doomed)
97 
98 
def _strip_run_rsid_attrs(root):
    """Remove every attribute whose name contains "rsid" from each run.

    The name check is case-insensitive. Names are collected first so the
    attribute map is not modified while it is being read.
    """
    for run in _find_elements(root, "r"):
        rsid_names = [
            attr.name
            for attr in run.attributes.values()
            if "rsid" in attr.name.lower()
        ]
        for attr_name in rsid_names:
            run.removeAttribute(attr_name)
104 
105 
106 
107 
def _merge_runs_in(container) -> int:
    """Merge mergeable neighbouring runs among *container*'s direct children.

    Each run absorbs the runs that immediately follow it (as element
    siblings) for as long as they are mergeable, then has its text elements
    consolidated. Returns the number of runs that were absorbed and removed.
    """
    merged_total = 0
    current = _first_child_run(container)

    while current:
        # Keep swallowing the next element while it is a compatible run;
        # after each removal the "next element" is re-read from the DOM.
        neighbour = _next_element_sibling(current)
        while neighbour and _is_run(neighbour) and _can_merge(current, neighbour):
            _merge_run_content(current, neighbour)
            container.removeChild(neighbour)
            merged_total += 1
            neighbour = _next_element_sibling(current)

        _consolidate_text(current)
        current = _next_sibling_run(current)

    return merged_total
126 
127 
def _first_child_run(container):
    """Return the first direct child of *container* that is a run, else None."""
    return next(
        (
            node
            for node in container.childNodes
            if node.nodeType == node.ELEMENT_NODE and _is_run(node)
        ),
        None,
    )
133 
134 
135def _next_element_sibling(node):
136    sibling = node.nextSibling
137    while sibling:
138        if sibling.nodeType == sibling.ELEMENT_NODE:
139            return sibling
140        sibling = sibling.nextSibling
141    return None
142 
143 
def _next_sibling_run(node):
    """Return the next following sibling that is a run element, or None.

    Unlike ``_next_element_sibling`` this skips over non-run elements too.
    """
    candidate = node.nextSibling
    while candidate:
        if candidate.nodeType == candidate.ELEMENT_NODE and _is_run(candidate):
            return candidate
        candidate = candidate.nextSibling
    return None
152 
153 
154def _is_run(node) -> bool:
155    name = node.localName or node.tagName
156    return name == "r" or name.endswith(":r")
157 
158 
def _can_merge(run1, run2) -> bool:
    """Decide whether two runs carry identical run properties.

    Runs are mergeable when neither has an ``rPr`` child, or when both have
    one and the two serialize to exactly the same XML string.
    """
    props1 = _get_child(run1, "rPr")
    props2 = _get_child(run2, "rPr")

    if props1 is None and props2 is None:
        return True
    if props1 is None or props2 is None:
        return False
    return props1.toxml() == props2.toxml()
168 
169 
170def _merge_run_content(target, source):
171    for child in list(source.childNodes):
172        if child.nodeType == child.ELEMENT_NODE:
173            name = child.localName or child.tagName
174            if name != "rPr" and not name.endswith(":rPr"):
175                target.appendChild(child)
176 
177 
def _consolidate_text(run):
    """Fold neighbouring ``t`` children of *run* into a single element.

    Two ``t`` elements are combined only when ``_is_adjacent`` says nothing
    but blank text separates them, so any element sitting between two text
    elements keeps its position. After each fold the surviving element's
    ``xml:space`` attribute is set to ``"preserve"`` when the combined text
    starts or ends with XML whitespace, and removed otherwise.
    """
    xml_whitespace = " \t\r\n"
    t_elements = _get_children(run, "t")

    # Walk right-to-left: each element has already absorbed everything that
    # follows it by the time it is folded into its own predecessor.
    for curr, prev in zip(reversed(t_elements), reversed(t_elements[:-1])):
        if not _is_adjacent(prev, curr):
            continue

        # Move *all* of curr's child nodes (not just the first) so no text is
        # dropped when an element holds more than one node, then let
        # normalize() join adjacent text nodes.
        for piece in list(curr.childNodes):
            prev.appendChild(piece)
        prev.normalize()

        merged = "".join(
            piece.data
            for piece in prev.childNodes
            if piece.nodeType in (piece.TEXT_NODE, piece.CDATA_SECTION_NODE)
        )

        # Check every XML whitespace character, not only " ", so text with a
        # tab or newline at either edge keeps xml:space="preserve".
        edge_whitespace = bool(merged) and (
            merged[0] in xml_whitespace or merged[-1] in xml_whitespace
        )
        if edge_whitespace:
            prev.setAttribute("xml:space", "preserve")
        elif prev.hasAttribute("xml:space"):
            prev.removeAttribute("xml:space")

        run.removeChild(curr)
200

Marketplace

Source from repo

DOCX creation, editing, and analysis

Create, read, edit, and manipulate Word (.docx) documents with formatting, tables, and tracked changes

anthropics · GitHub · anthropics · Official · Source repo · Original GitHub link · Publisher page

Files

Skill

n/a

Size

1.1 MB

Entrypoint

SKILL.md

Format

git-repo

Open file

scripts/office/helpers/merge_runs.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code · 200 lines · Free

scripts/office/helpers/merge_runs.py

1"""Merge adjacent runs with identical formatting in DOCX.
2 
3Merges adjacent <w:r> elements that have identical <w:rPr> properties.
4Works on runs in paragraphs and inside tracked changes (<w:ins>, <w:del>).
5 
6Also:
7- Removes rsid attributes from runs (revision metadata that doesn't affect rendering)
8- Removes proofErr elements (spell/grammar markers that block merging)
9"""
10 
11from pathlib import Path
12 
13import defusedxml.minidom
14 
15 
def merge_runs(input_dir: str) -> tuple[int, str]:
    """Merge adjacent, identically formatted runs in ``word/document.xml``.

    The file is rewritten in place (serialized as UTF-8) after ``proofErr``
    elements are removed and rsid attributes are stripped from runs.

    Args:
        input_dir: Directory that contains ``word/document.xml``.

    Returns:
        ``(merge_count, message)``. On any failure the count is 0 and the
        message starts with ``"Error: "``; no exception is raised, and the
        file is only written after all merging has succeeded.
    """
    doc_xml = Path(input_dir) / "word" / "document.xml"

    if not doc_xml.exists():
        return 0, f"Error: {doc_xml} not found"

    try:
        # Hand the parser raw bytes so it honours the encoding named in the
        # XML declaration instead of assuming the file decodes as UTF-8.
        dom = defusedxml.minidom.parseString(doc_xml.read_bytes())
        root = dom.documentElement

        _remove_elements(root, "proofErr")
        _strip_run_rsid_attrs(root)

        # dict.fromkeys de-duplicates parents while keeping document order,
        # so containers are processed deterministically.
        containers = dict.fromkeys(
            run.parentNode for run in _find_elements(root, "r")
        )

        merge_count = sum(_merge_runs_in(container) for container in containers)

        doc_xml.write_bytes(dom.toxml(encoding="UTF-8"))
        return merge_count, f"Merged {merge_count} runs"

    except Exception as e:
        # Top-level boundary: callers receive the failure as a message.
        return 0, f"Error: {e}"
40 
41 
42 
43 
44def _find_elements(root, tag: str) -> list:
45    results = []
46 
47    def traverse(node):
48        if node.nodeType == node.ELEMENT_NODE:
49            name = node.localName or node.tagName
50            if name == tag or name.endswith(f":{tag}"):
51                results.append(node)
52            for child in node.childNodes:
53                traverse(child)
54 
55    traverse(root)
56    return results
57 
58 
59def _get_child(parent, tag: str):
60    for child in parent.childNodes:
61        if child.nodeType == child.ELEMENT_NODE:
62            name = child.localName or child.tagName
63            if name == tag or name.endswith(f":{tag}"):
64                return child
65    return None
66 
67 
68def _get_children(parent, tag: str) -> list:
69    results = []
70    for child in parent.childNodes:
71        if child.nodeType == child.ELEMENT_NODE:
72            name = child.localName or child.tagName
73            if name == tag or name.endswith(f":{tag}"):
74                results.append(child)
75    return results
76 
77 
78def _is_adjacent(elem1, elem2) -> bool:
79    node = elem1.nextSibling
80    while node:
81        if node == elem2:
82            return True
83        if node.nodeType == node.ELEMENT_NODE:
84            return False
85        if node.nodeType == node.TEXT_NODE and node.data.strip():
86            return False
87        node = node.nextSibling
88    return False
89 
90 
91 
92 
def _remove_elements(root, tag: str):
    """Detach every element named *tag* found at or below *root*.

    Each match is removed from its parent together with its subtree; a match
    that has no parent is left untouched.
    """
    for doomed in _find_elements(root, tag):
        owner = doomed.parentNode
        if owner:
            owner.removeChild(doomed)
97 
98 
def _strip_run_rsid_attrs(root):
    """Remove every attribute whose name contains "rsid" from each run.

    The name check is case-insensitive. Names are collected first so the
    attribute map is not modified while it is being read.
    """
    for run in _find_elements(root, "r"):
        rsid_names = [
            attr.name
            for attr in run.attributes.values()
            if "rsid" in attr.name.lower()
        ]
        for attr_name in rsid_names:
            run.removeAttribute(attr_name)
104 
105 
106 
107 
def _merge_runs_in(container) -> int:
    """Merge mergeable neighbouring runs among *container*'s direct children.

    Each run absorbs the runs that immediately follow it (as element
    siblings) for as long as they are mergeable, then has its text elements
    consolidated. Returns the number of runs that were absorbed and removed.
    """
    merged_total = 0
    current = _first_child_run(container)

    while current:
        # Keep swallowing the next element while it is a compatible run;
        # after each removal the "next element" is re-read from the DOM.
        neighbour = _next_element_sibling(current)
        while neighbour and _is_run(neighbour) and _can_merge(current, neighbour):
            _merge_run_content(current, neighbour)
            container.removeChild(neighbour)
            merged_total += 1
            neighbour = _next_element_sibling(current)

        _consolidate_text(current)
        current = _next_sibling_run(current)

    return merged_total
126 
127 
def _first_child_run(container):
    """Return the first direct child of *container* that is a run, else None."""
    return next(
        (
            node
            for node in container.childNodes
            if node.nodeType == node.ELEMENT_NODE and _is_run(node)
        ),
        None,
    )
133 
134 
135def _next_element_sibling(node):
136    sibling = node.nextSibling
137    while sibling:
138        if sibling.nodeType == sibling.ELEMENT_NODE:
139            return sibling
140        sibling = sibling.nextSibling
141    return None
142 
143 
def _next_sibling_run(node):
    """Return the next following sibling that is a run element, or None.

    Unlike ``_next_element_sibling`` this skips over non-run elements too.
    """
    candidate = node.nextSibling
    while candidate:
        if candidate.nodeType == candidate.ELEMENT_NODE and _is_run(candidate):
            return candidate
        candidate = candidate.nextSibling
    return None
152 
153 
154def _is_run(node) -> bool:
155    name = node.localName or node.tagName
156    return name == "r" or name.endswith(":r")
157 
158 
def _can_merge(run1, run2) -> bool:
    """Decide whether two runs carry identical run properties.

    Runs are mergeable when neither has an ``rPr`` child, or when both have
    one and the two serialize to exactly the same XML string.
    """
    props1 = _get_child(run1, "rPr")
    props2 = _get_child(run2, "rPr")

    if props1 is None and props2 is None:
        return True
    if props1 is None or props2 is None:
        return False
    return props1.toxml() == props2.toxml()
168 
169 
170def _merge_run_content(target, source):
171    for child in list(source.childNodes):
172        if child.nodeType == child.ELEMENT_NODE:
173            name = child.localName or child.tagName
174            if name != "rPr" and not name.endswith(":rPr"):
175                target.appendChild(child)
176 
177 
def _consolidate_text(run):
    """Fold neighbouring ``t`` children of *run* into a single element.

    Two ``t`` elements are combined only when ``_is_adjacent`` says nothing
    but blank text separates them, so any element sitting between two text
    elements keeps its position. After each fold the surviving element's
    ``xml:space`` attribute is set to ``"preserve"`` when the combined text
    starts or ends with XML whitespace, and removed otherwise.
    """
    xml_whitespace = " \t\r\n"
    t_elements = _get_children(run, "t")

    # Walk right-to-left: each element has already absorbed everything that
    # follows it by the time it is folded into its own predecessor.
    for curr, prev in zip(reversed(t_elements), reversed(t_elements[:-1])):
        if not _is_adjacent(prev, curr):
            continue

        # Move *all* of curr's child nodes (not just the first) so no text is
        # dropped when an element holds more than one node, then let
        # normalize() join adjacent text nodes.
        for piece in list(curr.childNodes):
            prev.appendChild(piece)
        prev.normalize()

        merged = "".join(
            piece.data
            for piece in prev.childNodes
            if piece.nodeType in (piece.TEXT_NODE, piece.CDATA_SECTION_NODE)
        )

        # Check every XML whitespace character, not only " ", so text with a
        # tab or newline at either edge keeps xml:space="preserve".
        edge_whitespace = bool(merged) and (
            merged[0] in xml_whitespace or merged[-1] in xml_whitespace
        )
        if edge_whitespace:
            prev.setAttribute("xml:space", "preserve")
        elif prev.hasAttribute("xml:space"):
            prev.removeAttribute("xml:space")

        run.removeChild(curr)
200

DOCX creation, editing, and analysis

scripts/office/helpers/merge_runs.py

Preparing the source view

DOCX creation, editing, and analysis

scripts/office/helpers/merge_runs.py