Source from repo
DOCX creation, editing, and analysis

Create, read, edit, and manipulate Word (.docx) documents with formatting, tables, and tracked changes
anthropics · GitHub · anthropics · Official · Source repo · Original GitHub link · Publisher page
Files
Skill
n/a
Size
1.1 MB
Entrypoint
SKILL.md
Format
git-repo
Open file
scripts/office/validators/base.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code · 848 lines · Free
scripts/office/validators/base.py
1"""
2Base validator with common validation logic for document files.
3"""
4 
5import re
6from pathlib import Path
7 
8import defusedxml.minidom
9import lxml.etree
10 
11 
class BaseSchemaValidator:
    """Shared validation logic for unpacked Office document packages.

    The class-level tables below configure the checks implemented by the
    methods of this class; subclasses must implement ``validate``.
    """

    # Substrings of XSD error messages; any new error containing one of these
    # is dropped by validate_file_against_xsd.
    IGNORED_VALIDATION_ERRORS = [
        "hyphenationZone",
        "purl.org/dc/terms",
    ]

    # Lower-cased element local name -> (lower-cased attribute local name, scope).
    # Scope "file" means the value must be unique per element/attribute pair
    # within one file; "global" means unique across all files that are checked.
    UNIQUE_ID_REQUIREMENTS = {
        "comment": ("id", "file"),
        "commentrangestart": ("id", "file"),
        "commentrangeend": ("id", "file"),
        "bookmarkstart": ("id", "file"),
        "bookmarkend": ("id", "file"),
        "sldid": ("id", "file"),
        "sldmasterid": ("id", "global"),
        "sldlayoutid": ("id", "global"),
        "cm": ("authorid", "file"),
        "sheet": ("sheetid", "file"),
        "definedname": ("id", "file"),
        "cxnsp": ("id", "file"),
        "sp": ("id", "file"),
        "pic": ("id", "file"),
        "grpsp": ("id", "file"),
    }

    # Lower-cased local names of ancestor elements; elements nested inside one
    # of these are skipped by validate_unique_ids.
    EXCLUDED_ID_CONTAINERS = {
        "sectionlst",
    }

    # Lower-cased element name -> expected relationship type fragment. Empty in
    # this base class, which makes validate_all_relationship_ids skip the
    # relationship-type check.
    ELEMENT_RELATIONSHIP_TYPES = {}

    # Lookup keys used by _get_schema_path (file names, the ".rels" suffix,
    # content folder names, and part kinds) -> schema path relative to
    # self.schemas_dir.
    SCHEMA_MAPPINGS = {
        "word": "ISO-IEC29500-4_2016/wml.xsd",
        "ppt": "ISO-IEC29500-4_2016/pml.xsd",
        "xl": "ISO-IEC29500-4_2016/sml.xsd",
        "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
        "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
        "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
        "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
        ".rels": "ecma/fouth-edition/opc-relationships.xsd",
        "people.xml": "microsoft/wml-2012.xsd",
        "commentsIds.xml": "microsoft/wml-cid-2016.xsd",
        "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
        "commentsExtended.xml": "microsoft/wml-2012.xsd",
        "chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
        "theme": "ISO-IEC29500-4_2016/dml-main.xsd",
        "drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
    }

    # Namespace URIs referenced by the validation methods.
    MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
    XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"

    PACKAGE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/relationships"
    )
    OFFICE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    )
    CONTENT_TYPES_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/content-types"
    )

    # Top-level package folders: files under these get foreign-namespace
    # cleaning before XSD validation, and the folder name selects the schema
    # for files directly inside it.
    MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}

    # Namespaces kept by _clean_ignorable_namespaces; attributes and elements in
    # any other namespace are removed from the copy that is XSD-validated.
    OOXML_NAMESPACES = {
        "http://schemas.openxmlformats.org/officeDocument/2006/math",
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        "http://schemas.openxmlformats.org/schemaLibrary/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/diagram",
        "http://schemas.openxmlformats.org/drawingml/2006/picture",
        "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
        "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "http://schemas.openxmlformats.org/presentationml/2006/main",
        "http://schemas.openxmlformats.org/spreadsheetml/2006/main",
        "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
        "http://www.w3.org/XML/1998/namespace",
    }
93 
94    def __init__(self, unpacked_dir, original_file=None, verbose=False):
95        self.unpacked_dir = Path(unpacked_dir).resolve()
96        self.original_file = Path(original_file) if original_file else None
97        self.verbose = verbose
98 
99        self.schemas_dir = Path(__file__).parent.parent / "schemas"
100 
101        patterns = ["*.xml", "*.rels"]
102        self.xml_files = [
103            f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)
104        ]
105 
106        if not self.xml_files:
107            print(f"Warning: No XML files found in {self.unpacked_dir}")
108 
109    def validate(self):
110        raise NotImplementedError("Subclasses must implement the validate method")
111 
112    def repair(self) -> int:
113        return self.repair_whitespace_preservation()
114 
115    def repair_whitespace_preservation(self) -> int:
116        repairs = 0
117 
118        for xml_file in self.xml_files:
119            try:
120                content = xml_file.read_text(encoding="utf-8")
121                dom = defusedxml.minidom.parseString(content)
122                modified = False
123 
124                for elem in dom.getElementsByTagName("*"):
125                    if elem.tagName.endswith(":t") and elem.firstChild:
126                        text = elem.firstChild.nodeValue
127                        if text and (text.startswith((' ', '\t')) or text.endswith((' ', '\t'))):
128                            if elem.getAttribute("xml:space") != "preserve":
129                                elem.setAttribute("xml:space", "preserve")
130                                text_preview = repr(text[:30]) + "..." if len(text) > 30 else repr(text)
131                                print(f"  Repaired: {xml_file.name}: Added xml:space='preserve' to {elem.tagName}: {text_preview}")
132                                repairs += 1
133                                modified = True
134 
135                if modified:
136                    xml_file.write_bytes(dom.toxml(encoding="UTF-8"))
137 
138            except Exception:
139                pass
140 
141        return repairs
142 
143    def validate_xml(self):
144        errors = []
145 
146        for xml_file in self.xml_files:
147            try:
148                lxml.etree.parse(str(xml_file))
149            except lxml.etree.XMLSyntaxError as e:
150                errors.append(
151                    f"  {xml_file.relative_to(self.unpacked_dir)}: "
152                    f"Line {e.lineno}: {e.msg}"
153                )
154            except Exception as e:
155                errors.append(
156                    f"  {xml_file.relative_to(self.unpacked_dir)}: "
157                    f"Unexpected error: {str(e)}"
158                )
159 
160        if errors:
161            print(f"FAILED - Found {len(errors)} XML violations:")
162            for error in errors:
163                print(error)
164            return False
165        else:
166            if self.verbose:
167                print("PASSED - All XML files are well-formed")
168            return True
169 
170    def validate_namespaces(self):
171        errors = []
172 
173        for xml_file in self.xml_files:
174            try:
175                root = lxml.etree.parse(str(xml_file)).getroot()
176                declared = set(root.nsmap.keys()) - {None}  
177 
178                for attr_val in [
179                    v for k, v in root.attrib.items() if k.endswith("Ignorable")
180                ]:
181                    undeclared = set(attr_val.split()) - declared
182                    errors.extend(
183                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
184                        f"Namespace '{ns}' in Ignorable but not declared"
185                        for ns in undeclared
186                    )
187            except lxml.etree.XMLSyntaxError:
188                continue
189 
190        if errors:
191            print(f"FAILED - {len(errors)} namespace issues:")
192            for error in errors:
193                print(error)
194            return False
195        if self.verbose:
196            print("PASSED - All namespace prefixes properly declared")
197        return True
198 
199    def validate_unique_ids(self):
200        errors = []
201        global_ids = {}  
202 
203        for xml_file in self.xml_files:
204            try:
205                root = lxml.etree.parse(str(xml_file)).getroot()
206                file_ids = {}  
207 
208                mc_elements = root.xpath(
209                    ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
210                )
211                for elem in mc_elements:
212                    elem.getparent().remove(elem)
213 
214                for elem in root.iter():
215                    tag = (
216                        elem.tag.split("}")[-1].lower()
217                        if "}" in elem.tag
218                        else elem.tag.lower()
219                    )
220 
221                    if tag in self.UNIQUE_ID_REQUIREMENTS:
222                        in_excluded_container = any(
223                            ancestor.tag.split("}")[-1].lower() in self.EXCLUDED_ID_CONTAINERS
224                            for ancestor in elem.iterancestors()
225                        )
226                        if in_excluded_container:
227                            continue
228 
229                        attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]
230 
231                        id_value = None
232                        for attr, value in elem.attrib.items():
233                            attr_local = (
234                                attr.split("}")[-1].lower()
235                                if "}" in attr
236                                else attr.lower()
237                            )
238                            if attr_local == attr_name:
239                                id_value = value
240                                break
241 
242                        if id_value is not None:
243                            if scope == "global":
244                                if id_value in global_ids:
245                                    prev_file, prev_line, prev_tag = global_ids[
246                                        id_value
247                                    ]
248                                    errors.append(
249                                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
250                                        f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> "
251                                        f"already used in {prev_file} at line {prev_line} in <{prev_tag}>"
252                                    )
253                                else:
254                                    global_ids[id_value] = (
255                                        xml_file.relative_to(self.unpacked_dir),
256                                        elem.sourceline,
257                                        tag,
258                                    )
259                            elif scope == "file":
260                                key = (tag, attr_name)
261                                if key not in file_ids:
262                                    file_ids[key] = {}
263 
264                                if id_value in file_ids[key]:
265                                    prev_line = file_ids[key][id_value]
266                                    errors.append(
267                                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
268                                        f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> "
269                                        f"(first occurrence at line {prev_line})"
270                                    )
271                                else:
272                                    file_ids[key][id_value] = elem.sourceline
273 
274            except (lxml.etree.XMLSyntaxError, Exception) as e:
275                errors.append(
276                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
277                )
278 
279        if errors:
280            print(f"FAILED - Found {len(errors)} ID uniqueness violations:")
281            for error in errors:
282                print(error)
283            return False
284        else:
285            if self.verbose:
286                print("PASSED - All required IDs are unique")
287            return True
288 
289    def validate_file_references(self):
290        errors = []
291 
292        rels_files = list(self.unpacked_dir.rglob("*.rels"))
293 
294        if not rels_files:
295            if self.verbose:
296                print("PASSED - No .rels files found")
297            return True
298 
299        all_files = []
300        for file_path in self.unpacked_dir.rglob("*"):
301            if (
302                file_path.is_file()
303                and file_path.name != "[Content_Types].xml"
304                and not file_path.name.endswith(".rels")
305            ):  
306                all_files.append(file_path.resolve())
307 
308        all_referenced_files = set()
309 
310        if self.verbose:
311            print(
312                f"Found {len(rels_files)} .rels files and {len(all_files)} target files"
313            )
314 
315        for rels_file in rels_files:
316            try:
317                rels_root = lxml.etree.parse(str(rels_file)).getroot()
318 
319                rels_dir = rels_file.parent
320 
321                referenced_files = set()
322                broken_refs = []
323 
324                for rel in rels_root.findall(
325                    ".//ns:Relationship",
326                    namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE},
327                ):
328                    target = rel.get("Target")
329                    if target and not target.startswith(
330                        ("http", "mailto:")
331                    ):  
332                        if target.startswith("/"):
333                            target_path = self.unpacked_dir / target.lstrip("/")
334                        elif rels_file.name == ".rels":
335                            target_path = self.unpacked_dir / target
336                        else:
337                            base_dir = rels_dir.parent
338                            target_path = base_dir / target
339 
340                        try:
341                            target_path = target_path.resolve()
342                            if target_path.exists() and target_path.is_file():
343                                referenced_files.add(target_path)
344                                all_referenced_files.add(target_path)
345                            else:
346                                broken_refs.append((target, rel.sourceline))
347                        except (OSError, ValueError):
348                            broken_refs.append((target, rel.sourceline))
349 
350                if broken_refs:
351                    rel_path = rels_file.relative_to(self.unpacked_dir)
352                    for broken_ref, line_num in broken_refs:
353                        errors.append(
354                            f"  {rel_path}: Line {line_num}: Broken reference to {broken_ref}"
355                        )
356 
357            except Exception as e:
358                rel_path = rels_file.relative_to(self.unpacked_dir)
359                errors.append(f"  Error parsing {rel_path}: {e}")
360 
361        unreferenced_files = set(all_files) - all_referenced_files
362 
363        if unreferenced_files:
364            for unref_file in sorted(unreferenced_files):
365                unref_rel_path = unref_file.relative_to(self.unpacked_dir)
366                errors.append(f"  Unreferenced file: {unref_rel_path}")
367 
368        if errors:
369            print(f"FAILED - Found {len(errors)} relationship validation errors:")
370            for error in errors:
371                print(error)
372            print(
373                "CRITICAL: These errors will cause the document to appear corrupt. "
374                + "Broken references MUST be fixed, "
375                + "and unreferenced files MUST be referenced or removed."
376            )
377            return False
378        else:
379            if self.verbose:
380                print(
381                    "PASSED - All references are valid and all files are properly referenced"
382                )
383            return True
384 
385    def validate_all_relationship_ids(self):
386        import lxml.etree
387 
388        errors = []
389 
390        for xml_file in self.xml_files:
391            if xml_file.suffix == ".rels":
392                continue
393 
394            rels_dir = xml_file.parent / "_rels"
395            rels_file = rels_dir / f"{xml_file.name}.rels"
396 
397            if not rels_file.exists():
398                continue
399 
400            try:
401                rels_root = lxml.etree.parse(str(rels_file)).getroot()
402                rid_to_type = {}
403 
404                for rel in rels_root.findall(
405                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
406                ):
407                    rid = rel.get("Id")
408                    rel_type = rel.get("Type", "")
409                    if rid:
410                        if rid in rid_to_type:
411                            rels_rel_path = rels_file.relative_to(self.unpacked_dir)
412                            errors.append(
413                                f"  {rels_rel_path}: Line {rel.sourceline}: "
414                                f"Duplicate relationship ID '{rid}' (IDs must be unique)"
415                            )
416                        type_name = (
417                            rel_type.split("/")[-1] if "/" in rel_type else rel_type
418                        )
419                        rid_to_type[rid] = type_name
420 
421                xml_root = lxml.etree.parse(str(xml_file)).getroot()
422 
423                r_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE
424                rid_attrs_to_check = ["id", "embed", "link"]
425                for elem in xml_root.iter():
426                    for attr_name in rid_attrs_to_check:
427                        rid_attr = elem.get(f"{{{r_ns}}}{attr_name}")
428                        if not rid_attr:
429                            continue
430                        xml_rel_path = xml_file.relative_to(self.unpacked_dir)
431                        elem_name = (
432                            elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
433                        )
434 
435                        if rid_attr not in rid_to_type:
436                            errors.append(
437                                f"  {xml_rel_path}: Line {elem.sourceline}: "
438                                f"<{elem_name}> r:{attr_name} references non-existent relationship '{rid_attr}' "
439                                f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})"
440                            )
441                        elif attr_name == "id" and self.ELEMENT_RELATIONSHIP_TYPES:
442                            expected_type = self._get_expected_relationship_type(
443                                elem_name
444                            )
445                            if expected_type:
446                                actual_type = rid_to_type[rid_attr]
447                                if expected_type not in actual_type.lower():
448                                    errors.append(
449                                        f"  {xml_rel_path}: Line {elem.sourceline}: "
450                                        f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' "
451                                        f"but should point to a '{expected_type}' relationship"
452                                    )
453 
454            except Exception as e:
455                xml_rel_path = xml_file.relative_to(self.unpacked_dir)
456                errors.append(f"  Error processing {xml_rel_path}: {e}")
457 
458        if errors:
459            print(f"FAILED - Found {len(errors)} relationship ID reference errors:")
460            for error in errors:
461                print(error)
462            print("\nThese ID mismatches will cause the document to appear corrupt!")
463            return False
464        else:
465            if self.verbose:
466                print("PASSED - All relationship ID references are valid")
467            return True
468 
469    def _get_expected_relationship_type(self, element_name):
470        elem_lower = element_name.lower()
471 
472        if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES:
473            return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower]
474 
475        if elem_lower.endswith("id") and len(elem_lower) > 2:
476            prefix = elem_lower[:-2]  
477            if prefix.endswith("master"):
478                return prefix.lower()
479            elif prefix.endswith("layout"):
480                return prefix.lower()
481            else:
482                if prefix == "sld":
483                    return "slide"
484                return prefix.lower()
485 
486        if elem_lower.endswith("reference") and len(elem_lower) > 9:
487            prefix = elem_lower[:-9]  
488            return prefix.lower()
489 
490        return None
491 
492    def validate_content_types(self):
493        errors = []
494 
495        content_types_file = self.unpacked_dir / "[Content_Types].xml"
496        if not content_types_file.exists():
497            print("FAILED - [Content_Types].xml file not found")
498            return False
499 
500        try:
501            root = lxml.etree.parse(str(content_types_file)).getroot()
502            declared_parts = set()
503            declared_extensions = set()
504 
505            for override in root.findall(
506                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override"
507            ):
508                part_name = override.get("PartName")
509                if part_name is not None:
510                    declared_parts.add(part_name.lstrip("/"))
511 
512            for default in root.findall(
513                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default"
514            ):
515                extension = default.get("Extension")
516                if extension is not None:
517                    declared_extensions.add(extension.lower())
518 
519            declarable_roots = {
520                "sld",
521                "sldLayout",
522                "sldMaster",
523                "presentation",  
524                "document",  
525                "workbook",
526                "worksheet",  
527                "theme",  
528            }
529 
530            media_extensions = {
531                "png": "image/png",
532                "jpg": "image/jpeg",
533                "jpeg": "image/jpeg",
534                "gif": "image/gif",
535                "bmp": "image/bmp",
536                "tiff": "image/tiff",
537                "wmf": "image/x-wmf",
538                "emf": "image/x-emf",
539            }
540 
541            all_files = list(self.unpacked_dir.rglob("*"))
542            all_files = [f for f in all_files if f.is_file()]
543 
544            for xml_file in self.xml_files:
545                path_str = str(xml_file.relative_to(self.unpacked_dir)).replace(
546                    "\\", "/"
547                )
548 
549                if any(
550                    skip in path_str
551                    for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"]
552                ):
553                    continue
554 
555                try:
556                    root_tag = lxml.etree.parse(str(xml_file)).getroot().tag
557                    root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag
558 
559                    if root_name in declarable_roots and path_str not in declared_parts:
560                        errors.append(
561                            f"  {path_str}: File with <{root_name}> root not declared in [Content_Types].xml"
562                        )
563 
564                except Exception:
565                    continue  
566 
567            for file_path in all_files:
568                if file_path.suffix.lower() in {".xml", ".rels"}:
569                    continue
570                if file_path.name == "[Content_Types].xml":
571                    continue
572                if "_rels" in file_path.parts or "docProps" in file_path.parts:
573                    continue
574 
575                extension = file_path.suffix.lstrip(".").lower()
576                if extension and extension not in declared_extensions:
577                    if extension in media_extensions:
578                        relative_path = file_path.relative_to(self.unpacked_dir)
579                        errors.append(
580                            f'  {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: <Default Extension="{extension}" ContentType="{media_extensions[extension]}"/>'
581                        )
582 
583        except Exception as e:
584            errors.append(f"  Error parsing [Content_Types].xml: {e}")
585 
586        if errors:
587            print(f"FAILED - Found {len(errors)} content type declaration errors:")
588            for error in errors:
589                print(error)
590            return False
591        else:
592            if self.verbose:
593                print(
594                    "PASSED - All content files are properly declared in [Content_Types].xml"
595                )
596            return True
597 
598    def validate_file_against_xsd(self, xml_file, verbose=False):
599        xml_file = Path(xml_file).resolve()
600        unpacked_dir = self.unpacked_dir.resolve()
601 
602        is_valid, current_errors = self._validate_single_file_xsd(
603            xml_file, unpacked_dir
604        )
605 
606        if is_valid is None:
607            return None, set()  
608        elif is_valid:
609            return True, set()  
610 
611        original_errors = self._get_original_file_errors(xml_file)
612 
613        assert current_errors is not None
614        new_errors = current_errors - original_errors
615 
616        new_errors = {
617            e for e in new_errors
618            if not any(pattern in e for pattern in self.IGNORED_VALIDATION_ERRORS)
619        }
620 
621        if new_errors:
622            if verbose:
623                relative_path = xml_file.relative_to(unpacked_dir)
624                print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)")
625                for error in list(new_errors)[:3]:
626                    truncated = error[:250] + "..." if len(error) > 250 else error
627                    print(f"  - {truncated}")
628            return False, new_errors
629        else:
630            if verbose:
631                print(
632                    f"PASSED - No new errors (original had {len(current_errors)} errors)"
633                )
634            return True, set()
635 
636    def validate_against_xsd(self):
637        new_errors = []
638        original_error_count = 0
639        valid_count = 0
640        skipped_count = 0
641 
642        for xml_file in self.xml_files:
643            relative_path = str(xml_file.relative_to(self.unpacked_dir))
644            is_valid, new_file_errors = self.validate_file_against_xsd(
645                xml_file, verbose=False
646            )
647 
648            if is_valid is None:
649                skipped_count += 1
650                continue
651            elif is_valid and not new_file_errors:
652                valid_count += 1
653                continue
654            elif is_valid:
655                original_error_count += 1
656                valid_count += 1
657                continue
658 
659            new_errors.append(f"  {relative_path}: {len(new_file_errors)} new error(s)")
660            for error in list(new_file_errors)[:3]:  
661                new_errors.append(
662                    f"    - {error[:250]}..." if len(error) > 250 else f"    - {error}"
663                )
664 
665        if self.verbose:
666            print(f"Validated {len(self.xml_files)} files:")
667            print(f"  - Valid: {valid_count}")
668            print(f"  - Skipped (no schema): {skipped_count}")
669            if original_error_count:
670                print(f"  - With original errors (ignored): {original_error_count}")
671            print(
672                f"  - With NEW errors: {len(new_errors) > 0 and len([e for e in new_errors if not e.startswith('    ')]) or 0}"
673            )
674 
675        if new_errors:
676            print("\nFAILED - Found NEW validation errors:")
677            for error in new_errors:
678                print(error)
679            return False
680        else:
681            if self.verbose:
682                print("\nPASSED - No new XSD validation errors introduced")
683            return True
684 
685    def _get_schema_path(self, xml_file):
686        if xml_file.name in self.SCHEMA_MAPPINGS:
687            return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name]
688 
689        if xml_file.suffix == ".rels":
690            return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"]
691 
692        if "charts/" in str(xml_file) and xml_file.name.startswith("chart"):
693            return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"]
694 
695        if "theme/" in str(xml_file) and xml_file.name.startswith("theme"):
696            return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"]
697 
698        if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS:
699            return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name]
700 
701        return None
702 
703    def _clean_ignorable_namespaces(self, xml_doc):
704        xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
705        xml_copy = lxml.etree.fromstring(xml_string)
706 
707        for elem in xml_copy.iter():
708            attrs_to_remove = []
709 
710            for attr in elem.attrib:
711                if "{" in attr:
712                    ns = attr.split("}")[0][1:]
713                    if ns not in self.OOXML_NAMESPACES:
714                        attrs_to_remove.append(attr)
715 
716            for attr in attrs_to_remove:
717                del elem.attrib[attr]
718 
719        self._remove_ignorable_elements(xml_copy)
720 
721        return lxml.etree.ElementTree(xml_copy)
722 
723    def _remove_ignorable_elements(self, root):
724        elements_to_remove = []
725 
726        for elem in list(root):
727            if not hasattr(elem, "tag") or callable(elem.tag):
728                continue
729 
730            tag_str = str(elem.tag)
731            if tag_str.startswith("{"):
732                ns = tag_str.split("}")[0][1:]
733                if ns not in self.OOXML_NAMESPACES:
734                    elements_to_remove.append(elem)
735                    continue
736 
737            self._remove_ignorable_elements(elem)
738 
739        for elem in elements_to_remove:
740            root.remove(elem)
741 
742    def _preprocess_for_mc_ignorable(self, xml_doc):
743        root = xml_doc.getroot()
744 
745        if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib:
746            del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"]
747 
748        return xml_doc
749 
750    def _validate_single_file_xsd(self, xml_file, base_path):
751        schema_path = self._get_schema_path(xml_file)
752        if not schema_path:
753            return None, None  
754 
755        try:
756            with open(schema_path, "rb") as xsd_file:
757                parser = lxml.etree.XMLParser()
758                xsd_doc = lxml.etree.parse(
759                    xsd_file, parser=parser, base_url=str(schema_path)
760                )
761                schema = lxml.etree.XMLSchema(xsd_doc)
762 
763            with open(xml_file, "r") as f:
764                xml_doc = lxml.etree.parse(f)
765 
766            xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
767            xml_doc = self._preprocess_for_mc_ignorable(xml_doc)
768 
769            relative_path = xml_file.relative_to(base_path)
770            if (
771                relative_path.parts
772                and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS
773            ):
774                xml_doc = self._clean_ignorable_namespaces(xml_doc)
775 
776            if schema.validate(xml_doc):
777                return True, set()
778            else:
779                errors = set()
780                for error in schema.error_log:
781                    errors.add(error.message)
782                return False, errors
783 
784        except Exception as e:
785            return False, {str(e)}
786 
787    def _get_original_file_errors(self, xml_file):
788        if self.original_file is None:
789            return set()
790 
791        import tempfile
792        import zipfile
793 
794        xml_file = Path(xml_file).resolve()
795        unpacked_dir = self.unpacked_dir.resolve()
796        relative_path = xml_file.relative_to(unpacked_dir)
797 
798        with tempfile.TemporaryDirectory() as temp_dir:
799            temp_path = Path(temp_dir)
800 
801            with zipfile.ZipFile(self.original_file, "r") as zip_ref:
802                zip_ref.extractall(temp_path)
803 
804            original_xml_file = temp_path / relative_path
805 
806            if not original_xml_file.exists():
807                return set()
808 
809            is_valid, errors = self._validate_single_file_xsd(
810                original_xml_file, temp_path
811            )
812            return errors if errors else set()
813 
814    def _remove_template_tags_from_text_nodes(self, xml_doc):
815        warnings = []
816        template_pattern = re.compile(r"\{\{[^}]*\}\}")
817 
818        xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
819        xml_copy = lxml.etree.fromstring(xml_string)
820 
821        def process_text_content(text, content_type):
822            if not text:
823                return text
824            matches = list(template_pattern.finditer(text))
825            if matches:
826                for match in matches:
827                    warnings.append(
828                        f"Found template tag in {content_type}: {match.group()}"
829                    )
830                return template_pattern.sub("", text)
831            return text
832 
833        for elem in xml_copy.iter():
834            if not hasattr(elem, "tag") or callable(elem.tag):
835                continue
836            tag_str = str(elem.tag)
837            if tag_str.endswith("}t") or tag_str == "t":
838                continue
839 
840            elem.text = process_text_content(elem.text, "text content")
841            elem.tail = process_text_content(elem.tail, "tail content")
842 
843        return lxml.etree.ElementTree(xml_copy), warnings
844 
845 
# This module is import-only; executing it as a script raises immediately.
if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")
848
Preparing the source view

DOCX creation, editing, and analysis

scripts/office/validators/base.py