Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Create, read, edit, and format Excel (.xlsx) spreadsheets with formulas, color coding, and financial model standards
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/office/validators/docx.py
"""
Validator for Word document XML files against XSD schemas.
"""

import random
import re
import tempfile
import zipfile

import defusedxml.minidom
import lxml.etree

from .base import BaseSchemaValidator


class DOCXSchemaValidator(BaseSchemaValidator):
    """Validate and repair the XML parts of an unpacked Word document.

    Besides the checks defined here, ``validate`` calls several methods that
    are not defined in this file (``validate_xml``, ``validate_namespaces``,
    ``validate_unique_ids``, ...). The class also reads ``self.xml_files``,
    ``self.unpacked_dir``, ``self.original_file``, ``self.verbose`` and
    ``self.XML_NAMESPACE``; all of these are presumably supplied by
    ``BaseSchemaValidator`` -- confirm against the base class.
    """

    WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    W14_NAMESPACE = "http://schemas.microsoft.com/office/word/2010/wordml"
    W16CID_NAMESPACE = "http://schemas.microsoft.com/office/word/2016/wordml/cid"

    ELEMENT_RELATIONSHIP_TYPES = {}

    # Exclusive upper bounds enforced by validate_id_constraints/repair_durableId.
    _PARA_ID_LIMIT = 0x80000000
    _DURABLE_ID_LIMIT = 0x7FFFFFFF

    # Maximum number of repr() characters of element text shown in a message.
    _PREVIEW_LENGTH = 50

    # Matches text whose first or last character is a space, tab, LF or CR.
    _EDGE_WHITESPACE_RE = re.compile(r"^[ \t\n\r]|[ \t\n\r]$")

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @classmethod
    def _preview(cls, text):
        """Return ``repr(text)``, cut to ``_PREVIEW_LENGTH`` chars plus "..." if longer."""
        shown = repr(text)
        if len(shown) > cls._PREVIEW_LENGTH:
            return shown[: cls._PREVIEW_LENGTH] + "..."
        return shown

    def _document_parts(self):
        """Return the entries of ``self.xml_files`` that are named ``document.xml``."""
        return [xml_file for xml_file in self.xml_files if xml_file.name == "document.xml"]

    def _report_results(self, errors, failure_header, success_message):
        """Print the outcome of one check and return whether it passed.

        On failure the header and every collected error are printed
        unconditionally; the success message is printed only in verbose mode.
        """
        if errors:
            print(failure_header)
            for error in errors:
                print(error)
            return False
        if self.verbose:
            print(success_message)
        return True

    def _id_violation(self, label, val, base, limit, invalid_hint):
        """Describe why ``val`` breaks an ID constraint, or return None if it is fine.

        ``val`` is parsed in ``base``; it is a violation when it cannot be
        parsed at all or when the parsed number is ``>= limit``.
        """
        try:
            too_large = self._parse_id_value(val, base=base) >= limit
        except ValueError:
            return f"{label}={val} {invalid_hint}"
        if too_large:
            return f"{label}={val} >= 0x{limit:08X}"
        return None

    def _marker_ids(self, root, tag):
        """Return the set of ``w:id`` values of all ``w:<tag>`` descendants of ``root``.

        Elements without a ``w:id`` attribute contribute ``None`` to the set.
        """
        id_attr = f"{{{self.WORD_2006_NAMESPACE}}}id"
        namespaces = {"w": self.WORD_2006_NAMESPACE}
        return {elem.get(id_attr) for elem in root.xpath(f".//w:{tag}", namespaces=namespaces)}

    @staticmethod
    def _comment_id_sort_key(comment_id):
        """Sort key: numeric value for decimal ids, 0 for anything else (incl. None)."""
        # isdecimal() rather than isdigit(): the latter accepts characters such
        # as superscript digits that int() cannot convert.
        if comment_id and comment_id.isdecimal():
            return int(comment_id)
        return 0

    # ------------------------------------------------------------------
    # Validation entry point
    # ------------------------------------------------------------------

    def validate(self):
        """Run every check and return True only if all of them pass.

        If ``validate_xml`` fails, nothing else is run and False is returned.
        Otherwise all remaining checks are executed (none is skipped because
        an earlier one failed) and the paragraph-count comparison is printed
        before returning.
        """
        if not self.validate_xml():
            return False

        checks = (
            self.validate_namespaces,
            self.validate_unique_ids,
            self.validate_file_references,
            self.validate_content_types,
            self.validate_against_xsd,
            self.validate_whitespace_preservation,
            self.validate_deletions,
            self.validate_insertions,
            self.validate_all_relationship_ids,
            self.validate_id_constraints,
            self.validate_comment_markers,
        )
        # A list, not a generator: every check must run and print its report
        # even after an earlier one has failed.
        results = [check() for check in checks]

        self.compare_paragraph_counts()

        return all(results)

    # ------------------------------------------------------------------
    # Individual checks
    # ------------------------------------------------------------------

    def validate_whitespace_preservation(self):
        """Check that ``w:t`` text with edge whitespace sets ``xml:space="preserve"``.

        Only files named ``document.xml`` are inspected. Returns True when no
        violation (and no processing error) was found.
        """
        errors = []
        text_tag = f"{{{self.WORD_2006_NAMESPACE}}}t"

        for xml_file in self._document_parts():
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                for elem in root.iter(text_tag):
                    text = elem.text
                    if not text or not self._EDGE_WHITESPACE_RE.search(text):
                        continue

                    xml_space_attr = f"{{{self.XML_NAMESPACE}}}space"
                    if elem.get(xml_space_attr) == "preserve":
                        continue

                    errors.append(
                        f" {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {self._preview(text)}"
                    )

            except Exception as e:
                # Any failure while processing the file is reported as a violation.
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        return self._report_results(
            errors,
            f"FAILED - Found {len(errors)} whitespace preservation violations:",
            "PASSED - All whitespace is properly preserved",
        )

    def validate_deletions(self):
        """Check that ``w:del`` contains no non-empty ``w:t`` and no ``w:instrText``.

        Only files named ``document.xml`` are inspected. Returns True when no
        violation (and no processing error) was found.
        """
        errors = []
        namespaces = {"w": self.WORD_2006_NAMESPACE}

        for xml_file in self._document_parts():
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces):
                    if t_elem.text:
                        errors.append(
                            f" {xml_file.relative_to(self.unpacked_dir)}: "
                            f"Line {t_elem.sourceline}: <w:t> found within <w:del>: {self._preview(t_elem.text)}"
                        )

                # Unlike w:t, an empty w:instrText is reported as well.
                for instr_elem in root.xpath(
                    ".//w:del//w:instrText", namespaces=namespaces
                ):
                    errors.append(
                        f" {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {instr_elem.sourceline}: <w:instrText> found within <w:del> (use <w:delInstrText>): {self._preview(instr_elem.text or '')}"
                    )

            except Exception as e:
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        return self._report_results(
            errors,
            f"FAILED - Found {len(errors)} deletion validation violations:",
            "PASSED - No w:t elements found within w:del elements",
        )

    def count_paragraphs_in_unpacked(self):
        """Return the number of ``w:p`` elements in the unpacked ``document.xml``.

        Returns 0 when no such file exists or it cannot be parsed (the error
        is printed). If several files are named ``document.xml``, the count of
        the last one processed wins.
        """
        count = 0
        paragraph_path = f".//{{{self.WORD_2006_NAMESPACE}}}p"

        for xml_file in self._document_parts():
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                count = len(root.findall(paragraph_path))
            except Exception as e:
                print(f"Error counting paragraphs in unpacked document: {e}")

        return count

    def count_paragraphs_in_original(self):
        """Return the number of ``w:p`` elements in the original file's ``word/document.xml``.

        ``self.original_file`` is opened as a zip archive. Returns 0 when it is
        None or when anything goes wrong (the error is printed).
        """
        original = self.original_file
        if original is None:
            return 0

        count = 0

        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                with zipfile.ZipFile(original, "r") as zip_ref:
                    # Extract only the part that is needed; extract() returns
                    # the path of the file it created.
                    doc_xml_path = zip_ref.extract("word/document.xml", temp_dir)

                root = lxml.etree.parse(doc_xml_path).getroot()
                count = len(root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p"))

        except Exception as e:
            print(f"Error counting paragraphs in original document: {e}")

        return count

    def validate_insertions(self):
        """Check that ``w:ins`` contains no ``w:delText`` outside of a ``w:del``.

        Only files named ``document.xml`` are inspected. Returns True when no
        violation (and no processing error) was found.
        """
        errors = []
        namespaces = {"w": self.WORD_2006_NAMESPACE}

        for xml_file in self._document_parts():
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                invalid_elements = root.xpath(
                    ".//w:ins//w:delText[not(ancestor::w:del)]", namespaces=namespaces
                )

                for elem in invalid_elements:
                    errors.append(
                        f" {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {elem.sourceline}: <w:delText> within <w:ins>: {self._preview(elem.text or '')}"
                    )

            except Exception as e:
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        return self._report_results(
            errors,
            f"FAILED - Found {len(errors)} insertion validation violations:",
            "PASSED - No w:delText elements within w:ins elements",
        )

    def compare_paragraph_counts(self):
        """Print the paragraph count of the original and unpacked document and their difference."""
        original_count = self.count_paragraphs_in_original()
        new_count = self.count_paragraphs_in_unpacked()

        diff = new_count - original_count
        # Only positive differences get an explicit sign; zero prints as "0".
        diff_str = f"+{diff}" if diff > 0 else str(diff)
        print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})")

    def _parse_id_value(self, val: str, base: int = 16) -> int:
        """Convert an ID attribute string to an int; raises ValueError if it is malformed."""
        return int(val, base)

    def validate_id_constraints(self):
        """Check ``w14:paraId`` and ``w16cid:durableId`` values in every XML file.

        * ``paraId`` is parsed as hexadecimal and must be ``< 0x80000000``.
        * ``durableId`` must be ``< 0x7FFFFFFF``; it is parsed as decimal in
          ``numbering.xml`` and as hexadecimal everywhere else.

        A value that cannot be parsed is reported as a violation instead of
        silently ending the scan of that file. Returns True when no violation
        was found.
        """
        errors = []
        para_id_attr = f"{{{self.W14_NAMESPACE}}}paraId"
        durable_id_attr = f"{{{self.W16CID_NAMESPACE}}}durableId"

        for xml_file in self.xml_files:
            if xml_file.name == "numbering.xml":
                durable_base, durable_hint = 10, "must be decimal in numbering.xml"
            else:
                durable_base, durable_hint = 16, "must be hexadecimal"

            # (attribute, label, base, exclusive limit, message for unparseable values)
            rules = (
                (para_id_attr, "paraId", 16, self._PARA_ID_LIMIT, "must be hexadecimal"),
                (durable_id_attr, "durableId", durable_base, self._DURABLE_ID_LIMIT, durable_hint),
            )

            try:
                for elem in lxml.etree.parse(str(xml_file)).iter():
                    for attr, label, base, limit, hint in rules:
                        val = elem.get(attr)
                        if not val:
                            continue
                        problem = self._id_violation(label, val, base, limit, hint)
                        if problem:
                            errors.append(
                                f" {xml_file.name}:{elem.sourceline}: {problem}"
                            )
            except Exception as e:
                # Best effort: a file that cannot be processed is skipped and
                # does not count as a violation of this check.
                if self.verbose:
                    print(f" Skipped ID constraint check for {xml_file.name}: {e}")

        if errors:
            print(f"FAILED - {len(errors)} ID constraint violations:")
            for error in errors:
                print(error)
        elif self.verbose:
            print("PASSED - All paraId/durableId values within constraints")
        return not errors

    def validate_comment_markers(self):
        """Check that comment range markers are paired and refer to existing comments.

        Uses the ``document.xml`` whose path contains "word" and, if present,
        ``comments.xml``. Reports ``commentRangeEnd`` without a matching
        ``commentRangeStart`` (and vice versa), and any marker id that has no
        ``w:comment`` in ``comments.xml``. Returns True when there is no
        ``document.xml`` or no violation was found.
        """
        errors = []

        document_xml = None
        comments_xml = None
        for xml_file in self.xml_files:
            if xml_file.name == "document.xml" and "word" in str(xml_file):
                document_xml = xml_file
            elif xml_file.name == "comments.xml":
                comments_xml = xml_file

        if not document_xml:
            if self.verbose:
                print("PASSED - No document.xml found (skipping comment validation)")
            return True

        sort_key = self._comment_id_sort_key

        try:
            doc_root = lxml.etree.parse(str(document_xml)).getroot()

            range_starts = self._marker_ids(doc_root, "commentRangeStart")
            range_ends = self._marker_ids(doc_root, "commentRangeEnd")
            references = self._marker_ids(doc_root, "commentReference")

            for comment_id in sorted(range_ends - range_starts, key=sort_key):
                errors.append(
                    f' document.xml: commentRangeEnd id="{comment_id}" has no matching commentRangeStart'
                )

            for comment_id in sorted(range_starts - range_ends, key=sort_key):
                errors.append(
                    f' document.xml: commentRangeStart id="{comment_id}" has no matching commentRangeEnd'
                )

            # Without a comments.xml every marker id counts as dangling.
            comment_ids = set()
            if comments_xml and comments_xml.exists():
                comments_root = lxml.etree.parse(str(comments_xml)).getroot()
                comment_ids = self._marker_ids(comments_root, "comment")

            marker_ids = range_starts | range_ends | references
            for comment_id in sorted(marker_ids - comment_ids, key=sort_key):
                # Markers without an id (None / empty) are not reported here.
                if comment_id:
                    errors.append(
                        f' document.xml: marker id="{comment_id}" references non-existent comment'
                    )

        except Exception as e:
            errors.append(f" Error parsing XML: {e}")

        return self._report_results(
            errors,
            f"FAILED - {len(errors)} comment marker violations:",
            "PASSED - All comment markers properly paired",
        )

    # ------------------------------------------------------------------
    # Repair
    # ------------------------------------------------------------------

    def repair(self) -> int:
        """Run the inherited repairs plus ``repair_durableId``; return the total repair count."""
        repairs = super().repair()
        repairs += self.repair_durableId()
        return repairs

    def repair_durableId(self) -> int:
        """Replace out-of-range or unparseable ``w16cid:durableId`` values in place.

        A value needs repair when it cannot be parsed (decimal in
        ``numbering.xml``, hexadecimal elsewhere) or is ``>= 0x7FFFFFFF``. It
        is replaced with a random value in ``[1, 0x7FFFFFFE]``, written as
        decimal in ``numbering.xml`` and as 8 upper-case hex digits elsewhere.
        Files are rewritten only when something changed.

        Returns:
            The number of attributes repaired in files that were successfully
            written back.
        """
        repairs = 0

        for xml_file in self.xml_files:
            try:
                content = xml_file.read_text(encoding="utf-8")
                dom = defusedxml.minidom.parseString(content)

                is_numbering = xml_file.name == "numbering.xml"
                base = 10 if is_numbering else 16
                changes = []  # (old value, new value) per repaired attribute

                for elem in dom.getElementsByTagName("*"):
                    if not elem.hasAttribute("w16cid:durableId"):
                        continue

                    durable_id = elem.getAttribute("w16cid:durableId")
                    try:
                        needs_repair = (
                            self._parse_id_value(durable_id, base=base)
                            >= self._DURABLE_ID_LIMIT
                        )
                    except ValueError:
                        needs_repair = True

                    if not needs_repair:
                        continue

                    value = random.randint(1, self._DURABLE_ID_LIMIT - 1)
                    new_id = str(value) if is_numbering else f"{value:08X}"
                    elem.setAttribute("w16cid:durableId", new_id)
                    changes.append((durable_id, new_id))

                if changes:
                    xml_file.write_bytes(dom.toxml(encoding="UTF-8"))
                    # Announce and count only after the file has been written,
                    # so the result reflects repairs that were persisted.
                    for old_id, new_id in changes:
                        print(
                            f" Repaired: {xml_file.name}: durableId {old_id} → {new_id}"
                        )
                    repairs += len(changes)

            except Exception as e:
                # Best effort: a file that cannot be read, parsed or written
                # is left as it is and the remaining files are still processed.
                if self.verbose:
                    print(f" Skipped durableId repair for {xml_file.name}: {e}")

        return repairs


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")