Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Create, read, edit, and format Excel (.xlsx) spreadsheets with formulas, color coding, and financial model standards
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/office/helpers/merge_runs.py
"""Merge adjacent runs with identical formatting in DOCX.

Merges adjacent <w:r> elements that have identical <w:rPr> properties.
Works on runs in paragraphs and inside tracked changes (<w:ins>, <w:del>).

Also:
- Removes rsid attributes from runs (revision metadata that doesn't affect rendering)
- Removes proofErr elements (spell/grammar markers that block merging)
"""

from pathlib import Path

import defusedxml.minidom


def merge_runs(input_dir: str) -> tuple[int, str]:
    """Merge mergeable adjacent runs in ``<input_dir>/word/document.xml`` in place.

    Args:
        input_dir: Directory of an unpacked DOCX; the file read and rewritten
            is ``word/document.xml`` beneath it.

    Returns:
        ``(merge_count, message)``. On success ``merge_count`` is the number of
        runs folded into a preceding run. If the file is missing or any
        exception occurs, returns ``(0, "Error: ...")`` instead of raising.
    """
    doc_xml = Path(input_dir) / "word" / "document.xml"

    if not doc_xml.exists():
        return 0, f"Error: {doc_xml} not found"

    try:
        dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8"))
        root = dom.documentElement

        # Clean-up first: proofErr elements sit between runs and rsid
        # attributes are stripped, as described in the module docstring.
        _remove_elements(root, "proofErr")
        _strip_run_rsid_attrs(root)

        # Every distinct parent of a run is processed once; merging only
        # touches a container's direct children, so order does not matter.
        containers = {run.parentNode for run in _find_elements(root, "r")}
        merge_count = sum(_merge_runs_in(container) for container in containers)

        # Pass the parsed declaration's standalone flag through so that
        # standalone="yes" is not silently dropped when re-serializing.
        doc_xml.write_bytes(dom.toxml(encoding="UTF-8", standalone=dom.standalone))
        return merge_count, f"Merged {merge_count} runs"

    except Exception as e:
        # Deliberate best-effort boundary: report the failure as a message.
        return 0, f"Error: {e}"


def _matches(node, tag: str) -> bool:
    """Return True if element *node* is named *tag*, with or without a prefix."""
    name = node.localName or node.tagName
    return name == tag or name.endswith(f":{tag}")


def _find_elements(root, tag: str) -> list:
    """Return all elements named *tag* at or below *root*, in document order."""
    results = []
    # Explicit stack instead of recursion so deep trees cannot hit the
    # interpreter's recursion limit. Children are pushed reversed so that
    # they pop in their original order (pre-order traversal).
    stack = [root]
    while stack:
        node = stack.pop()
        if node.nodeType != node.ELEMENT_NODE:
            continue
        if _matches(node, tag):
            results.append(node)
        stack.extend(reversed(node.childNodes))
    return results


def _get_child(parent, tag: str):
    """Return the first direct child element of *parent* named *tag*, or None."""
    for child in parent.childNodes:
        if child.nodeType == child.ELEMENT_NODE and _matches(child, tag):
            return child
    return None


def _get_children(parent, tag: str) -> list:
    """Return every direct child element of *parent* named *tag*, in order."""
    return [
        child
        for child in parent.childNodes
        if child.nodeType == child.ELEMENT_NODE and _matches(child, tag)
    ]


def _is_adjacent(elem1, elem2) -> bool:
    """Return True if *elem2* follows *elem1* with only blank text in between.

    Any intervening element, or a text node with non-whitespace content,
    makes the two non-adjacent. Other node types are skipped.
    """
    node = elem1.nextSibling
    while node:
        if node == elem2:
            return True
        if node.nodeType == node.ELEMENT_NODE:
            return False
        if node.nodeType == node.TEXT_NODE and node.data.strip():
            return False
        node = node.nextSibling
    return False


def _remove_elements(root, tag: str):
    """Detach every element named *tag* at or below *root* from its parent."""
    for elem in _find_elements(root, tag):
        if elem.parentNode:
            elem.parentNode.removeChild(elem)


def _strip_run_rsid_attrs(root):
    """Remove every attribute whose name contains "rsid" from all runs."""
    for run in _find_elements(root, "r"):
        # Snapshot the attributes: the map is mutated while we iterate.
        for attr in list(run.attributes.values()):
            if "rsid" in attr.name.lower():
                run.removeAttribute(attr.name)


def _merge_runs_in(container) -> int:
    """Merge mergeable neighbouring runs among *container*'s direct children.

    Returns:
        The number of runs that were folded into a preceding run and removed.
    """
    merge_count = 0
    run = _first_child_run(container)

    while run:
        # Keep absorbing the next element sibling while it is a compatible run.
        while True:
            next_elem = _next_element_sibling(run)
            if next_elem and _is_run(next_elem) and _can_merge(run, next_elem):
                _merge_run_content(run, next_elem)
                container.removeChild(next_elem)
                merge_count += 1
            else:
                break

        _consolidate_text(run)
        run = _next_sibling_run(run)

    return merge_count


def _first_child_run(container):
    """Return the first direct child of *container* that is a run, or None."""
    for child in container.childNodes:
        if child.nodeType == child.ELEMENT_NODE and _is_run(child):
            return child
    return None


def _next_element_sibling(node):
    """Return the next sibling of *node* that is an element, or None."""
    sibling = node.nextSibling
    while sibling:
        if sibling.nodeType == sibling.ELEMENT_NODE:
            return sibling
        sibling = sibling.nextSibling
    return None


def _next_sibling_run(node):
    """Return the next sibling of *node* that is a run element, or None."""
    sibling = node.nextSibling
    while sibling:
        if sibling.nodeType == sibling.ELEMENT_NODE and _is_run(sibling):
            return sibling
        sibling = sibling.nextSibling
    return None


def _is_run(node) -> bool:
    """Return True if element *node* is named ``r`` (with or without prefix)."""
    return _matches(node, "r")


def _can_merge(run1, run2) -> bool:
    """Return True if the two runs carry identical run properties.

    All ``rPr`` children are compared, in order, by their serialized XML, and
    the runs must share a namespace. A run may hold more than one ``rPr``
    child (a math run can carry a math ``rPr`` followed by a WordprocessingML
    ``rPr``); comparing only the first would merge runs whose remaining
    properties differ, and the merged-in run's properties are discarded.
    Runs with no ``rPr`` at all are mergeable with each other.
    """
    if run1.namespaceURI != run2.namespaceURI:
        return False
    props1 = [rpr.toxml() for rpr in _get_children(run1, "rPr")]
    props2 = [rpr.toxml() for rpr in _get_children(run2, "rPr")]
    return props1 == props2


def _merge_run_content(target, source):
    """Move every element child of *source* except ``rPr`` to the end of *target*.

    Non-element children (such as whitespace text nodes) stay in *source*.
    """
    # Snapshot the children: appendChild re-parents nodes during iteration.
    for child in list(source.childNodes):
        if child.nodeType == child.ELEMENT_NODE and not _matches(child, "rPr"):
            target.appendChild(child)


def _consolidate_text(run):
    """Collapse adjacent ``t`` children of *run* into a single element each.

    Walks the ``t`` elements from last to first so that a chain of adjacent
    elements accumulates into the earliest one. ``xml:space="preserve"`` is
    set on the surviving element when the merged text starts or ends with a
    space, and removed otherwise.
    """
    t_elements = _get_children(run, "t")

    for i in range(len(t_elements) - 1, 0, -1):
        curr, prev = t_elements[i], t_elements[i - 1]

        if not _is_adjacent(prev, curr):
            continue

        prev_text = prev.firstChild.data if prev.firstChild else ""
        curr_text = curr.firstChild.data if curr.firstChild else ""
        merged = prev_text + curr_text

        if prev.firstChild:
            prev.firstChild.data = merged
        else:
            prev.appendChild(run.ownerDocument.createTextNode(merged))

        if merged.startswith(" ") or merged.endswith(" "):
            prev.setAttribute("xml:space", "preserve")
        elif prev.hasAttribute("xml:space"):
            prev.removeAttribute("xml:space")

        run.removeChild(curr)