Source from repo
Requirements for Outputs

Create, read, edit, and format Excel (.xlsx) spreadsheets with formulas, color coding, and financial model standards
anthropics · GitHub · anthropics · Official · Source repo · Original GitHub link · Publisher page
Files
Skill
n/a
Size
1.0 MB
Entrypoint
SKILL.md
Format
git-repo
Open file
scripts/office/validators/docx.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code · 447 lines · Free
scripts/office/validators/docx.py
1"""
2Validator for Word document XML files against XSD schemas.
3"""
4 
5import random
6import re
7import tempfile
8import zipfile
9 
10import defusedxml.minidom
11import lxml.etree
12 
13from .base import BaseSchemaValidator
14 
15 
class DOCXSchemaValidator(BaseSchemaValidator):
    """Validate, and partially repair, the XML parts of an unpacked Word document.

    Adds Word-specific checks on top of ``BaseSchemaValidator``: whitespace
    preservation on ``w:t``, the structure of tracked deletions/insertions,
    range constraints on ``paraId``/``durableId`` and pairing of comment markers.
    """

    # WordprocessingML main namespace; bound to the ``w`` prefix in the XPath
    # queries used by the methods of this class.
    WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    # Word 2010 extension namespace; ``paraId`` attributes are looked up under it.
    W14_NAMESPACE = "http://schemas.microsoft.com/office/word/2010/wordml"
    # Word 2016 "cid" extension namespace; ``durableId`` attributes are looked up under it.
    W16CID_NAMESPACE = "http://schemas.microsoft.com/office/word/2016/wordml/cid"

    # Left empty for DOCX. NOTE(review): presumably read by BaseSchemaValidator
    # when checking relationships - confirm against base.py.
    ELEMENT_RELATIONSHIP_TYPES = {}
23 
24    def validate(self):
25        if not self.validate_xml():
26            return False
27 
28        all_valid = True
29        if not self.validate_namespaces():
30            all_valid = False
31 
32        if not self.validate_unique_ids():
33            all_valid = False
34 
35        if not self.validate_file_references():
36            all_valid = False
37 
38        if not self.validate_content_types():
39            all_valid = False
40 
41        if not self.validate_against_xsd():
42            all_valid = False
43 
44        if not self.validate_whitespace_preservation():
45            all_valid = False
46 
47        if not self.validate_deletions():
48            all_valid = False
49 
50        if not self.validate_insertions():
51            all_valid = False
52 
53        if not self.validate_all_relationship_ids():
54            all_valid = False
55 
56        if not self.validate_id_constraints():
57            all_valid = False
58 
59        if not self.validate_comment_markers():
60            all_valid = False
61 
62        self.compare_paragraph_counts()
63 
64        return all_valid
65 
66    def validate_whitespace_preservation(self):
67        errors = []
68 
69        for xml_file in self.xml_files:
70            if xml_file.name != "document.xml":
71                continue
72 
73            try:
74                root = lxml.etree.parse(str(xml_file)).getroot()
75 
76                for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"):
77                    if elem.text:
78                        text = elem.text
79                        if re.search(r"^[ \t\n\r]", text) or re.search(
80                            r"[ \t\n\r]$", text
81                        ):
82                            xml_space_attr = f"{{{self.XML_NAMESPACE}}}space"
83                            if (
84                                xml_space_attr not in elem.attrib
85                                or elem.attrib[xml_space_attr] != "preserve"
86                            ):
87                                text_preview = (
88                                    repr(text)[:50] + "..."
89                                    if len(repr(text)) > 50
90                                    else repr(text)
91                                )
92                                errors.append(
93                                    f"  {xml_file.relative_to(self.unpacked_dir)}: "
94                                    f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}"
95                                )
96 
97            except (lxml.etree.XMLSyntaxError, Exception) as e:
98                errors.append(
99                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
100                )
101 
102        if errors:
103            print(f"FAILED - Found {len(errors)} whitespace preservation violations:")
104            for error in errors:
105                print(error)
106            return False
107        else:
108            if self.verbose:
109                print("PASSED - All whitespace is properly preserved")
110            return True
111 
112    def validate_deletions(self):
113        errors = []
114 
115        for xml_file in self.xml_files:
116            if xml_file.name != "document.xml":
117                continue
118 
119            try:
120                root = lxml.etree.parse(str(xml_file)).getroot()
121                namespaces = {"w": self.WORD_2006_NAMESPACE}
122 
123                for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces):
124                    if t_elem.text:
125                        text_preview = (
126                            repr(t_elem.text)[:50] + "..."
127                            if len(repr(t_elem.text)) > 50
128                            else repr(t_elem.text)
129                        )
130                        errors.append(
131                            f"  {xml_file.relative_to(self.unpacked_dir)}: "
132                            f"Line {t_elem.sourceline}: <w:t> found within <w:del>: {text_preview}"
133                        )
134 
135                for instr_elem in root.xpath(
136                    ".//w:del//w:instrText", namespaces=namespaces
137                ):
138                    text_preview = (
139                        repr(instr_elem.text or "")[:50] + "..."
140                        if len(repr(instr_elem.text or "")) > 50
141                        else repr(instr_elem.text or "")
142                    )
143                    errors.append(
144                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
145                        f"Line {instr_elem.sourceline}: <w:instrText> found within <w:del> (use <w:delInstrText>): {text_preview}"
146                    )
147 
148            except (lxml.etree.XMLSyntaxError, Exception) as e:
149                errors.append(
150                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
151                )
152 
153        if errors:
154            print(f"FAILED - Found {len(errors)} deletion validation violations:")
155            for error in errors:
156                print(error)
157            return False
158        else:
159            if self.verbose:
160                print("PASSED - No w:t elements found within w:del elements")
161            return True
162 
163    def count_paragraphs_in_unpacked(self):
164        count = 0
165 
166        for xml_file in self.xml_files:
167            if xml_file.name != "document.xml":
168                continue
169 
170            try:
171                root = lxml.etree.parse(str(xml_file)).getroot()
172                paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
173                count = len(paragraphs)
174            except Exception as e:
175                print(f"Error counting paragraphs in unpacked document: {e}")
176 
177        return count
178 
179    def count_paragraphs_in_original(self):
180        original = self.original_file
181        if original is None:
182            return 0
183 
184        count = 0
185 
186        try:
187            with tempfile.TemporaryDirectory() as temp_dir:
188                with zipfile.ZipFile(original, "r") as zip_ref:
189                    zip_ref.extractall(temp_dir)
190 
191                doc_xml_path = temp_dir + "/word/document.xml"
192                root = lxml.etree.parse(doc_xml_path).getroot()
193 
194                paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
195                count = len(paragraphs)
196 
197        except Exception as e:
198            print(f"Error counting paragraphs in original document: {e}")
199 
200        return count
201 
202    def validate_insertions(self):
203        errors = []
204 
205        for xml_file in self.xml_files:
206            if xml_file.name != "document.xml":
207                continue
208 
209            try:
210                root = lxml.etree.parse(str(xml_file)).getroot()
211                namespaces = {"w": self.WORD_2006_NAMESPACE}
212 
213                invalid_elements = root.xpath(
214                    ".//w:ins//w:delText[not(ancestor::w:del)]", namespaces=namespaces
215                )
216 
217                for elem in invalid_elements:
218                    text_preview = (
219                        repr(elem.text or "")[:50] + "..."
220                        if len(repr(elem.text or "")) > 50
221                        else repr(elem.text or "")
222                    )
223                    errors.append(
224                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
225                        f"Line {elem.sourceline}: <w:delText> within <w:ins>: {text_preview}"
226                    )
227 
228            except (lxml.etree.XMLSyntaxError, Exception) as e:
229                errors.append(
230                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
231                )
232 
233        if errors:
234            print(f"FAILED - Found {len(errors)} insertion validation violations:")
235            for error in errors:
236                print(error)
237            return False
238        else:
239            if self.verbose:
240                print("PASSED - No w:delText elements within w:ins elements")
241            return True
242 
243    def compare_paragraph_counts(self):
244        original_count = self.count_paragraphs_in_original()
245        new_count = self.count_paragraphs_in_unpacked()
246 
247        diff = new_count - original_count
248        diff_str = f"+{diff}" if diff > 0 else str(diff)
249        print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})")
250 
251    def _parse_id_value(self, val: str, base: int = 16) -> int:
252        return int(val, base)
253 
254    def validate_id_constraints(self):
255        errors = []
256        para_id_attr = f"{{{self.W14_NAMESPACE}}}paraId"
257        durable_id_attr = f"{{{self.W16CID_NAMESPACE}}}durableId"
258 
259        for xml_file in self.xml_files:
260            try:
261                for elem in lxml.etree.parse(str(xml_file)).iter():
262                    if val := elem.get(para_id_attr):
263                        if self._parse_id_value(val, base=16) >= 0x80000000:
264                            errors.append(
265                                f"  {xml_file.name}:{elem.sourceline}: paraId={val} >= 0x80000000"
266                            )
267 
268                    if val := elem.get(durable_id_attr):
269                        if xml_file.name == "numbering.xml":
270                            try:
271                                if self._parse_id_value(val, base=10) >= 0x7FFFFFFF:
272                                    errors.append(
273                                        f"  {xml_file.name}:{elem.sourceline}: "
274                                        f"durableId={val} >= 0x7FFFFFFF"
275                                    )
276                            except ValueError:
277                                errors.append(
278                                    f"  {xml_file.name}:{elem.sourceline}: "
279                                    f"durableId={val} must be decimal in numbering.xml"
280                                )
281                        else:
282                            if self._parse_id_value(val, base=16) >= 0x7FFFFFFF:
283                                errors.append(
284                                    f"  {xml_file.name}:{elem.sourceline}: "
285                                    f"durableId={val} >= 0x7FFFFFFF"
286                                )
287            except Exception:
288                pass
289 
290        if errors:
291            print(f"FAILED - {len(errors)} ID constraint violations:")
292            for e in errors:
293                print(e)
294        elif self.verbose:
295            print("PASSED - All paraId/durableId values within constraints")
296        return not errors
297 
298    def validate_comment_markers(self):
299        errors = []
300 
301        document_xml = None
302        comments_xml = None
303        for xml_file in self.xml_files:
304            if xml_file.name == "document.xml" and "word" in str(xml_file):
305                document_xml = xml_file
306            elif xml_file.name == "comments.xml":
307                comments_xml = xml_file
308 
309        if not document_xml:
310            if self.verbose:
311                print("PASSED - No document.xml found (skipping comment validation)")
312            return True
313 
314        try:
315            doc_root = lxml.etree.parse(str(document_xml)).getroot()
316            namespaces = {"w": self.WORD_2006_NAMESPACE}
317 
318            range_starts = {
319                elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
320                for elem in doc_root.xpath(
321                    ".//w:commentRangeStart", namespaces=namespaces
322                )
323            }
324            range_ends = {
325                elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
326                for elem in doc_root.xpath(
327                    ".//w:commentRangeEnd", namespaces=namespaces
328                )
329            }
330            references = {
331                elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
332                for elem in doc_root.xpath(
333                    ".//w:commentReference", namespaces=namespaces
334                )
335            }
336 
337            orphaned_ends = range_ends - range_starts
338            for comment_id in sorted(
339                orphaned_ends, key=lambda x: int(x) if x and x.isdigit() else 0
340            ):
341                errors.append(
342                    f'  document.xml: commentRangeEnd id="{comment_id}" has no matching commentRangeStart'
343                )
344 
345            orphaned_starts = range_starts - range_ends
346            for comment_id in sorted(
347                orphaned_starts, key=lambda x: int(x) if x and x.isdigit() else 0
348            ):
349                errors.append(
350                    f'  document.xml: commentRangeStart id="{comment_id}" has no matching commentRangeEnd'
351                )
352 
353            comment_ids = set()
354            if comments_xml and comments_xml.exists():
355                comments_root = lxml.etree.parse(str(comments_xml)).getroot()
356                comment_ids = {
357                    elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
358                    for elem in comments_root.xpath(
359                        ".//w:comment", namespaces=namespaces
360                    )
361                }
362 
363                marker_ids = range_starts | range_ends | references
364                invalid_refs = marker_ids - comment_ids
365                for comment_id in sorted(
366                    invalid_refs, key=lambda x: int(x) if x and x.isdigit() else 0
367                ):
368                    if comment_id:  
369                        errors.append(
370                            f'  document.xml: marker id="{comment_id}" references non-existent comment'
371                        )
372 
373        except (lxml.etree.XMLSyntaxError, Exception) as e:
374            errors.append(f"  Error parsing XML: {e}")
375 
376        if errors:
377            print(f"FAILED - {len(errors)} comment marker violations:")
378            for error in errors:
379                print(error)
380            return False
381        else:
382            if self.verbose:
383                print("PASSED - All comment markers properly paired")
384            return True
385 
386    def repair(self) -> int:
387        repairs = super().repair()
388        repairs += self.repair_durableId()
389        return repairs
390 
391    def repair_durableId(self) -> int:
392        repairs = 0
393 
394        for xml_file in self.xml_files:
395            try:
396                content = xml_file.read_text(encoding="utf-8")
397                dom = defusedxml.minidom.parseString(content)
398                modified = False
399 
400                for elem in dom.getElementsByTagName("*"):
401                    if not elem.hasAttribute("w16cid:durableId"):
402                        continue
403 
404                    durable_id = elem.getAttribute("w16cid:durableId")
405                    needs_repair = False
406 
407                    if xml_file.name == "numbering.xml":
408                        try:
409                            needs_repair = (
410                                self._parse_id_value(durable_id, base=10) >= 0x7FFFFFFF
411                            )
412                        except ValueError:
413                            needs_repair = True
414                    else:
415                        try:
416                            needs_repair = (
417                                self._parse_id_value(durable_id, base=16) >= 0x7FFFFFFF
418                            )
419                        except ValueError:
420                            needs_repair = True
421 
422                    if needs_repair:
423                        value = random.randint(1, 0x7FFFFFFE)
424                        if xml_file.name == "numbering.xml":
425                            new_id = str(value)  
426                        else:
427                            new_id = f"{value:08X}"  
428 
429                        elem.setAttribute("w16cid:durableId", new_id)
430                        print(
431                            f"  Repaired: {xml_file.name}: durableId {durable_id} → {new_id}"
432                        )
433                        repairs += 1
434                        modified = True
435 
436                if modified:
437                    xml_file.write_bytes(dom.toxml(encoding="UTF-8"))
438 
439            except Exception:
440                pass
441 
442        return repairs
443 
444 
# This module is package-internal library code (it relies on a relative
# import), so running it as a script is rejected with an explicit error.
if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")
447
Preparing the source view

Requirements for Outputs

scripts/office/validators/docx.py