Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Create, read, edit, and manipulate Word (.docx) documents with formatting, tables, and tracked changes
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/office/helpers/merge_runs.py
"""Merge adjacent runs with identical formatting in DOCX.

Merges adjacent <w:r> elements that have identical <w:rPr> properties.
Works on runs in paragraphs and inside tracked changes (<w:ins>, <w:del>).

Also:
- Removes rsid attributes from runs (revision metadata that doesn't affect rendering)
- Removes proofErr elements (spell/grammar markers that block merging)
"""

from pathlib import Path

import defusedxml.minidom


def merge_runs(input_dir: str) -> tuple[int, str]:
    """Merge mergeable adjacent runs in ``<input_dir>/word/document.xml``.

    The document is rewritten in place (UTF-8) after proofErr elements and
    run-level rsid attributes have been removed and adjacent runs with equal
    run properties have been merged.

    Args:
        input_dir: Directory of an unpacked DOCX; the file
            ``word/document.xml`` is looked up beneath it.

    Returns:
        ``(merge_count, message)``. ``merge_count`` is the number of runs that
        were folded into a preceding run. If the file is missing or any step
        raises, ``(0, "Error: ...")`` is returned instead of raising.
    """
    doc_xml = Path(input_dir) / "word" / "document.xml"

    if not doc_xml.exists():
        return 0, f"Error: {doc_xml} not found"

    try:
        dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8"))
        root = dom.documentElement

        # Both clean-ups must happen before merging: proofErr elements sit
        # between runs and rsid attributes live on the runs themselves.
        _remove_elements(root, "proofErr")
        _strip_run_rsid_attrs(root)

        # Every distinct parent of a run is processed exactly once.
        containers = {run.parentNode for run in _find_elements(root, "r")}

        merge_count = 0
        for container in containers:
            merge_count += _merge_runs_in(container)

        doc_xml.write_bytes(dom.toxml(encoding="UTF-8"))
        return merge_count, f"Merged {merge_count} runs"

    except Exception as e:
        # Deliberate catch-all: callers receive failures through the returned
        # message rather than through an exception.
        return 0, f"Error: {e}"


def _matches_tag(node, tag: str) -> bool:
    """Return True if the element's local name is *tag*.

    Falls back to the qualified ``tagName`` (accepting any ``prefix:tag``)
    when the node carries no ``localName``.
    """
    name = node.localName or node.tagName
    return name == tag or name.endswith(f":{tag}")


def _find_elements(root, tag: str) -> list:
    """Return *root* and all of its descendants named *tag*, in document order."""
    results = []

    def traverse(node):
        if node.nodeType == node.ELEMENT_NODE:
            if _matches_tag(node, tag):
                results.append(node)
            for child in node.childNodes:
                traverse(child)

    traverse(root)
    return results


def _get_child(parent, tag: str):
    """Return the first direct child element named *tag*, or None."""
    for child in parent.childNodes:
        if child.nodeType == child.ELEMENT_NODE and _matches_tag(child, tag):
            return child
    return None


def _get_children(parent, tag: str) -> list:
    """Return every direct child element named *tag*, in order."""
    return [
        child
        for child in parent.childNodes
        if child.nodeType == child.ELEMENT_NODE and _matches_tag(child, tag)
    ]


def _is_adjacent(elem1, elem2) -> bool:
    """Return True if *elem2* follows *elem1* with nothing significant between.

    Another element, or a text node containing non-whitespace, between the two
    makes them non-adjacent; other node types are skipped.
    """
    node = elem1.nextSibling
    while node:
        if node == elem2:
            return True
        if node.nodeType == node.ELEMENT_NODE:
            return False
        if node.nodeType == node.TEXT_NODE and node.data.strip():
            return False
        node = node.nextSibling
    return False


def _remove_elements(root, tag: str):
    """Detach every element named *tag* found under *root* from its parent."""
    for elem in _find_elements(root, tag):
        if elem.parentNode:
            elem.parentNode.removeChild(elem)


def _strip_run_rsid_attrs(root):
    """Remove every attribute whose name contains "rsid" from all runs."""
    for run in _find_elements(root, "r"):
        # Snapshot the attributes: the map is mutated while we iterate.
        for attr in list(run.attributes.values()):
            if "rsid" in attr.name.lower():
                run.removeAttribute(attr.name)


def _merge_runs_in(container) -> int:
    """Merge mergeable neighbouring runs among *container*'s children.

    Returns the number of runs that were absorbed into a preceding run.
    """
    merge_count = 0
    run = _first_child_run(container)

    while run:
        # Keep absorbing the immediately following element for as long as it
        # is a run that may be merged into the current one.
        while True:
            next_elem = _next_element_sibling(run)
            if not (next_elem and _is_run(next_elem) and _can_merge(run, next_elem)):
                break
            _merge_run_content(run, next_elem)
            container.removeChild(next_elem)
            merge_count += 1

        _consolidate_text(run)
        run = _next_sibling_run(run)

    return merge_count


def _first_child_run(container):
    """Return the first direct child of *container* that is a run, or None."""
    for child in container.childNodes:
        if child.nodeType == child.ELEMENT_NODE and _is_run(child):
            return child
    return None


def _next_element_sibling(node):
    """Return the next sibling that is an element (of any name), or None."""
    sibling = node.nextSibling
    while sibling:
        if sibling.nodeType == sibling.ELEMENT_NODE:
            return sibling
        sibling = sibling.nextSibling
    return None


def _next_sibling_run(node):
    """Return the next sibling that is a run, skipping other nodes, or None."""
    sibling = node.nextSibling
    while sibling:
        if sibling.nodeType == sibling.ELEMENT_NODE and _is_run(sibling):
            return sibling
        sibling = sibling.nextSibling
    return None


def _is_run(node) -> bool:
    """Return True if *node* is an element whose local name is ``r``."""
    return _matches_tag(node, "r")


def _can_merge(run1, run2) -> bool:
    """Return True if the two runs carry identical run properties.

    All ``rPr`` children are compared, not only the first one: a run may carry
    more than one properties element (for example a math run with both
    ``m:rPr`` and ``w:rPr``), and the merge discards every ``rPr`` of the
    second run. Runs from different namespaces are never merged.
    """
    if run1.namespaceURI != run2.namespaceURI:
        return False
    props1 = [rpr.toxml() for rpr in _get_children(run1, "rPr")]
    props2 = [rpr.toxml() for rpr in _get_children(run2, "rPr")]
    return props1 == props2


def _merge_run_content(target, source):
    """Move every element child of *source* except ``rPr`` to the end of *target*."""
    # Snapshot the children: appendChild detaches them from *source*.
    for child in list(source.childNodes):
        if child.nodeType == child.ELEMENT_NODE and not _matches_tag(child, "rPr"):
            target.appendChild(child)


def _consolidate_text(run):
    """Collapse adjacent ``t`` children of *run* into a single ``t`` element.

    The surviving element gets ``xml:space="preserve"`` whenever the combined
    text starts or ends with whitespace of any kind (space, tab, newline, ...),
    and loses the attribute otherwise.
    """
    t_elements = _get_children(run, "t")

    # Walk the neighbouring pairs back to front so that a chain of adjacent
    # elements folds step by step into the first element of the chain.
    for prev, curr in reversed(list(zip(t_elements, t_elements[1:]))):
        if not _is_adjacent(prev, curr):
            continue

        prev_text = prev.firstChild.data if prev.firstChild else ""
        curr_text = curr.firstChild.data if curr.firstChild else ""
        merged = prev_text + curr_text

        if prev.firstChild:
            prev.firstChild.data = merged
        else:
            prev.appendChild(run.ownerDocument.createTextNode(merged))

        # Any leading/trailing whitespace needs the attribute, not only a
        # literal space character.
        if merged != merged.strip():
            prev.setAttribute("xml:space", "preserve")
        elif prev.hasAttribute("xml:space"):
            prev.removeAttribute("xml:space")

        run.removeChild(curr)