Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Create, read, edit, and manipulate Word (.docx) documents with formatting, tables, and tracked changes
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/office/validators/base.py
"""
Base validator with common validation logic for document files.
"""

import re
from pathlib import Path

import defusedxml.minidom
import lxml.etree


class BaseSchemaValidator:
    """Common validation and repair checks for an unpacked Office document.

    The validator works on a directory (``unpacked_dir``) containing the XML
    parts and ``.rels`` files of a package.  Every ``validate_*`` method prints
    its findings and returns ``True`` on success and ``False`` on failure.
    Subclasses must implement :meth:`validate`.
    """

    # Substrings of XSD error messages that are never reported as new errors.
    IGNORED_VALIDATION_ERRORS = [
        "hyphenationZone",
        "purl.org/dc/terms",
    ]

    # Lower-cased element local name -> (lower-cased attribute local name, scope).
    # Scope "file" requires uniqueness within one XML file, "global" across
    # every file that is scanned.
    UNIQUE_ID_REQUIREMENTS = {
        "comment": ("id", "file"),
        "commentrangestart": ("id", "file"),
        "commentrangeend": ("id", "file"),
        "bookmarkstart": ("id", "file"),
        "bookmarkend": ("id", "file"),
        "sldid": ("id", "file"),
        "sldmasterid": ("id", "global"),
        "sldlayoutid": ("id", "global"),
        "cm": ("authorid", "file"),
        "sheet": ("sheetid", "file"),
        "definedname": ("id", "file"),
        "cxnsp": ("id", "file"),
        "sp": ("id", "file"),
        "pic": ("id", "file"),
        "grpsp": ("id", "file"),
    }

    # Elements nested inside one of these (lower-cased) containers are exempt
    # from the unique-ID check.
    EXCLUDED_ID_CONTAINERS = {
        "sectionlst",
    }

    # Lower-cased element name -> expected relationship type.  Empty here;
    # presumably populated by subclasses (not visible in this file).
    ELEMENT_RELATIONSHIP_TYPES = {}

    # File name, folder name or category -> schema path relative to schemas_dir.
    SCHEMA_MAPPINGS = {
        "word": "ISO-IEC29500-4_2016/wml.xsd",
        "ppt": "ISO-IEC29500-4_2016/pml.xsd",
        "xl": "ISO-IEC29500-4_2016/sml.xsd",
        "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
        "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
        "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
        "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
        ".rels": "ecma/fouth-edition/opc-relationships.xsd",
        "people.xml": "microsoft/wml-2012.xsd",
        "commentsIds.xml": "microsoft/wml-cid-2016.xsd",
        "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
        "commentsExtended.xml": "microsoft/wml-2012.xsd",
        "chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
        "theme": "ISO-IEC29500-4_2016/dml-main.xsd",
        "drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
    }

    MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
    XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"

    PACKAGE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/relationships"
    )
    OFFICE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    )
    CONTENT_TYPES_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/content-types"
    )

    # Top-level folders whose parts get foreign-namespace content stripped
    # before XSD validation.
    MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}

    # Namespaces that are kept when cleaning a document for XSD validation;
    # elements and attributes in any other namespace are removed.
    OOXML_NAMESPACES = {
        "http://schemas.openxmlformats.org/officeDocument/2006/math",
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        "http://schemas.openxmlformats.org/schemaLibrary/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/diagram",
        "http://schemas.openxmlformats.org/drawingml/2006/picture",
        "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
        "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "http://schemas.openxmlformats.org/presentationml/2006/main",
        "http://schemas.openxmlformats.org/spreadsheetml/2006/main",
        "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
        "http://www.w3.org/XML/1998/namespace",
    }

    def __init__(self, unpacked_dir, original_file=None, verbose=False):
        """Collect the XML and .rels files found under ``unpacked_dir``.

        Args:
            unpacked_dir: Directory holding the unpacked package; resolved to
                an absolute path.
            original_file: Optional path to the original (zipped) document,
                used to discount XSD errors that were already present.
            verbose: When true, "PASSED" messages and summaries are printed.
        """
        self.unpacked_dir = Path(unpacked_dir).resolve()
        self.original_file = Path(original_file) if original_file else None
        self.verbose = verbose

        self.schemas_dir = Path(__file__).parent.parent / "schemas"

        patterns = ["*.xml", "*.rels"]
        self.xml_files = [
            f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)
        ]

        if not self.xml_files:
            print(f"Warning: No XML files found in {self.unpacked_dir}")

    @staticmethod
    def _local_name(qualified_name):
        """Return the part of a ``{namespace}local`` name after the namespace."""
        return qualified_name.split("}")[-1]

    def validate(self):
        """Run the validator; must be provided by subclasses."""
        raise NotImplementedError("Subclasses must implement the validate method")

    def repair(self) -> int:
        """Apply the automatic repairs and return how many were made."""
        return self.repair_whitespace_preservation()

    def repair_whitespace_preservation(self) -> int:
        """Add ``xml:space="preserve"`` to text elements that need it.

        An element whose tag name ends in ``:t`` and whose first child text
        starts or ends with a space or tab gets the attribute when it is not
        already set.  Modified files are rewritten in place as UTF-8.

        Returns:
            The number of elements that were repaired.
        """
        repairs = 0

        for xml_file in self.xml_files:
            try:
                content = xml_file.read_text(encoding="utf-8")
                dom = defusedxml.minidom.parseString(content)
                modified = False

                for elem in dom.getElementsByTagName("*"):
                    if not (elem.tagName.endswith(":t") and elem.firstChild):
                        continue

                    text = elem.firstChild.nodeValue
                    if not text:
                        continue
                    has_edge_whitespace = text.startswith(
                        (" ", "\t")
                    ) or text.endswith((" ", "\t"))
                    if not has_edge_whitespace:
                        continue
                    if elem.getAttribute("xml:space") == "preserve":
                        continue

                    elem.setAttribute("xml:space", "preserve")
                    text_preview = (
                        repr(text[:30]) + "..." if len(text) > 30 else repr(text)
                    )
                    print(
                        f" Repaired: {xml_file.name}: Added xml:space='preserve' to {elem.tagName}: {text_preview}"
                    )
                    repairs += 1
                    modified = True

                if modified:
                    xml_file.write_bytes(dom.toxml(encoding="UTF-8"))

            except Exception as exc:
                # Repair is best-effort: an unreadable or malformed file is
                # skipped here and reported by validate_xml instead.
                if self.verbose:
                    print(
                        f"Warning: Skipped whitespace repair for {xml_file.name}: {exc}"
                    )

        return repairs

    def validate_xml(self) -> bool:
        """Check that every collected file parses as well-formed XML."""
        errors = []

        for xml_file in self.xml_files:
            try:
                lxml.etree.parse(str(xml_file))
            except lxml.etree.XMLSyntaxError as e:
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: "
                    f"Line {e.lineno}: {e.msg}"
                )
            except Exception as e:
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: "
                    f"Unexpected error: {str(e)}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} XML violations:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print("PASSED - All XML files are well-formed")
        return True

    def validate_namespaces(self) -> bool:
        """Check that prefixes listed in root ``*Ignorable`` attributes are declared.

        Files that are not well-formed are skipped here.
        """
        errors = []

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                declared = set(root.nsmap.keys()) - {None}

                ignorable_values = [
                    v for k, v in root.attrib.items() if k.endswith("Ignorable")
                ]
                for attr_val in ignorable_values:
                    undeclared = set(attr_val.split()) - declared
                    errors.extend(
                        f" {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Namespace '{ns}' in Ignorable but not declared"
                        for ns in undeclared
                    )
            except lxml.etree.XMLSyntaxError:
                continue

        if errors:
            print(f"FAILED - {len(errors)} namespace issues:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print("PASSED - All namespace prefixes properly declared")
        return True

    def validate_unique_ids(self) -> bool:
        """Check the ID uniqueness rules listed in ``UNIQUE_ID_REQUIREMENTS``.

        ``mc:AlternateContent`` subtrees are removed from the parsed tree
        before checking, and elements below an ``EXCLUDED_ID_CONTAINERS``
        ancestor are skipped.
        """
        errors = []
        global_ids = {}

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                file_ids = {}

                mc_elements = root.xpath(
                    ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
                )
                for elem in mc_elements:
                    elem.getparent().remove(elem)

                for elem in root.iter():
                    # iter() also yields comments and processing instructions,
                    # whose .tag is not a string; they carry no IDs.
                    if not isinstance(elem.tag, str):
                        continue

                    tag = self._local_name(elem.tag).lower()
                    if tag not in self.UNIQUE_ID_REQUIREMENTS:
                        continue

                    in_excluded_container = any(
                        self._local_name(ancestor.tag).lower()
                        in self.EXCLUDED_ID_CONTAINERS
                        for ancestor in elem.iterancestors()
                    )
                    if in_excluded_container:
                        continue

                    attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]

                    # First attribute whose lower-cased local name matches.
                    id_value = None
                    for attr, value in elem.attrib.items():
                        if self._local_name(attr).lower() == attr_name:
                            id_value = value
                            break

                    if id_value is None:
                        continue

                    if scope == "global":
                        if id_value in global_ids:
                            prev_file, prev_line, prev_tag = global_ids[id_value]
                            errors.append(
                                f" {xml_file.relative_to(self.unpacked_dir)}: "
                                f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> "
                                f"already used in {prev_file} at line {prev_line} in <{prev_tag}>"
                            )
                        else:
                            global_ids[id_value] = (
                                xml_file.relative_to(self.unpacked_dir),
                                elem.sourceline,
                                tag,
                            )
                    elif scope == "file":
                        seen = file_ids.setdefault((tag, attr_name), {})
                        if id_value in seen:
                            prev_line = seen[id_value]
                            errors.append(
                                f" {xml_file.relative_to(self.unpacked_dir)}: "
                                f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> "
                                f"(first occurrence at line {prev_line})"
                            )
                        else:
                            seen[id_value] = elem.sourceline

            except Exception as e:
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} ID uniqueness violations:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print("PASSED - All required IDs are unique")
        return True

    def validate_file_references(self) -> bool:
        """Check that .rels targets exist and that every file is referenced.

        Every file other than ``[Content_Types].xml`` and the ``.rels`` files
        must be the target of at least one relationship.  Targets starting
        with ``http`` or ``mailto:`` and relationships marked
        ``TargetMode="External"`` are not resolved inside the package.
        """
        errors = []

        rels_files = list(self.unpacked_dir.rglob("*.rels"))

        if not rels_files:
            if self.verbose:
                print("PASSED - No .rels files found")
            return True

        all_files = [
            file_path.resolve()
            for file_path in self.unpacked_dir.rglob("*")
            if file_path.is_file()
            and file_path.name != "[Content_Types].xml"
            and not file_path.name.endswith(".rels")
        ]

        all_referenced_files = set()

        if self.verbose:
            print(
                f"Found {len(rels_files)} .rels files and {len(all_files)} target files"
            )

        for rels_file in rels_files:
            try:
                rels_root = lxml.etree.parse(str(rels_file)).getroot()
                rels_dir = rels_file.parent
                broken_refs = []

                for rel in rels_root.findall(
                    ".//ns:Relationship",
                    namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE},
                ):
                    target = rel.get("Target")
                    if not target or target.startswith(("http", "mailto:")):
                        continue
                    # External targets live outside the package and cannot be
                    # checked against the unpacked directory.
                    if rel.get("TargetMode") == "External":
                        continue

                    if target.startswith("/"):
                        target_path = self.unpacked_dir / target.lstrip("/")
                    elif rels_file.name == ".rels":
                        target_path = self.unpacked_dir / target
                    else:
                        # Part-level targets are relative to the folder that
                        # contains the _rels directory.
                        target_path = rels_dir.parent / target

                    try:
                        target_path = target_path.resolve()
                        if target_path.exists() and target_path.is_file():
                            all_referenced_files.add(target_path)
                        else:
                            broken_refs.append((target, rel.sourceline))
                    except (OSError, ValueError):
                        broken_refs.append((target, rel.sourceline))

                if broken_refs:
                    rel_path = rels_file.relative_to(self.unpacked_dir)
                    for broken_ref, line_num in broken_refs:
                        errors.append(
                            f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}"
                        )

            except Exception as e:
                rel_path = rels_file.relative_to(self.unpacked_dir)
                errors.append(f" Error parsing {rel_path}: {e}")

        unreferenced_files = set(all_files) - all_referenced_files

        for unref_file in sorted(unreferenced_files):
            unref_rel_path = unref_file.relative_to(self.unpacked_dir)
            errors.append(f" Unreferenced file: {unref_rel_path}")

        if errors:
            print(f"FAILED - Found {len(errors)} relationship validation errors:")
            for error in errors:
                print(error)
            print(
                "CRITICAL: These errors will cause the document to appear corrupt. "
                + "Broken references MUST be fixed, "
                + "and unreferenced files MUST be referenced or removed."
            )
            return False

        if self.verbose:
            print(
                "PASSED - All references are valid and all files are properly referenced"
            )
        return True

    def validate_all_relationship_ids(self) -> bool:
        """Check ``r:id``/``r:embed``/``r:link`` attributes against each part's .rels.

        Parts without a ``_rels/<name>.rels`` companion are skipped.  Duplicate
        relationship IDs are reported, as are references to IDs that do not
        exist.  When ``ELEMENT_RELATIONSHIP_TYPES`` is non-empty, ``r:id``
        references are also checked against the expected relationship type.
        """
        errors = []

        for xml_file in self.xml_files:
            if xml_file.suffix == ".rels":
                continue

            rels_file = xml_file.parent / "_rels" / f"{xml_file.name}.rels"
            if not rels_file.exists():
                continue

            try:
                rels_root = lxml.etree.parse(str(rels_file)).getroot()
                rid_to_type = {}

                for rel in rels_root.findall(
                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                ):
                    rid = rel.get("Id")
                    rel_type = rel.get("Type", "")
                    if not rid:
                        continue
                    if rid in rid_to_type:
                        rels_rel_path = rels_file.relative_to(self.unpacked_dir)
                        errors.append(
                            f" {rels_rel_path}: Line {rel.sourceline}: "
                            f"Duplicate relationship ID '{rid}' (IDs must be unique)"
                        )
                    # Only the last path segment of the type URI is kept.
                    rid_to_type[rid] = (
                        rel_type.split("/")[-1] if "/" in rel_type else rel_type
                    )

                xml_root = lxml.etree.parse(str(xml_file)).getroot()
                xml_rel_path = xml_file.relative_to(self.unpacked_dir)

                r_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE
                rid_attrs_to_check = ["id", "embed", "link"]
                for elem in xml_root.iter():
                    for attr_name in rid_attrs_to_check:
                        rid_attr = elem.get(f"{{{r_ns}}}{attr_name}")
                        if not rid_attr:
                            continue
                        elem_name = (
                            elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
                        )

                        if rid_attr not in rid_to_type:
                            errors.append(
                                f" {xml_rel_path}: Line {elem.sourceline}: "
                                f"<{elem_name}> r:{attr_name} references non-existent relationship '{rid_attr}' "
                                f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})"
                            )
                        elif attr_name == "id" and self.ELEMENT_RELATIONSHIP_TYPES:
                            expected_type = self._get_expected_relationship_type(
                                elem_name
                            )
                            if expected_type:
                                actual_type = rid_to_type[rid_attr]
                                if expected_type not in actual_type.lower():
                                    errors.append(
                                        f" {xml_rel_path}: Line {elem.sourceline}: "
                                        f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' "
                                        f"but should point to a '{expected_type}' relationship"
                                    )

            except Exception as e:
                xml_rel_path = xml_file.relative_to(self.unpacked_dir)
                errors.append(f" Error processing {xml_rel_path}: {e}")

        if errors:
            print(f"FAILED - Found {len(errors)} relationship ID reference errors:")
            for error in errors:
                print(error)
            print("\nThese ID mismatches will cause the document to appear corrupt!")
            return False

        if self.verbose:
            print("PASSED - All relationship ID references are valid")
        return True

    def _get_expected_relationship_type(self, element_name):
        """Return the relationship type expected for ``element_name``, or None.

        An explicit entry in ``ELEMENT_RELATIONSHIP_TYPES`` wins.  Otherwise
        the type is derived from the lower-cased name: ``<x>id`` maps to
        ``<x>`` (with ``sld`` expanded to ``slide``) and ``<x>reference`` maps
        to ``<x>``.
        """
        elem_lower = element_name.lower()

        if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES:
            return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower]

        if elem_lower.endswith("id") and len(elem_lower) > 2:
            prefix = elem_lower[:-2]
            return "slide" if prefix == "sld" else prefix

        if elem_lower.endswith("reference") and len(elem_lower) > 9:
            return elem_lower[:-9]

        return None

    def validate_content_types(self) -> bool:
        """Check that parts and media files are declared in [Content_Types].xml.

        XML parts whose root element is one of a fixed set of names must have
        an ``Override`` entry; non-XML files with a known media extension must
        have a ``Default`` entry for that extension.
        """
        errors = []

        content_types_file = self.unpacked_dir / "[Content_Types].xml"
        if not content_types_file.exists():
            print("FAILED - [Content_Types].xml file not found")
            return False

        try:
            root = lxml.etree.parse(str(content_types_file)).getroot()
            declared_parts = set()
            declared_extensions = set()

            for override in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override"
            ):
                part_name = override.get("PartName")
                if part_name is not None:
                    declared_parts.add(part_name.lstrip("/"))

            for default in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default"
            ):
                extension = default.get("Extension")
                if extension is not None:
                    declared_extensions.add(extension.lower())

            declarable_roots = {
                "sld",
                "sldLayout",
                "sldMaster",
                "presentation",
                "document",
                "workbook",
                "worksheet",
                "theme",
            }

            media_extensions = {
                "png": "image/png",
                "jpg": "image/jpeg",
                "jpeg": "image/jpeg",
                "gif": "image/gif",
                "bmp": "image/bmp",
                "tiff": "image/tiff",
                "wmf": "image/x-wmf",
                "emf": "image/x-emf",
            }

            all_files = [f for f in self.unpacked_dir.rglob("*") if f.is_file()]

            for xml_file in self.xml_files:
                path_str = str(xml_file.relative_to(self.unpacked_dir)).replace(
                    "\\", "/"
                )

                if any(
                    skip in path_str
                    for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"]
                ):
                    continue

                try:
                    root_tag = lxml.etree.parse(str(xml_file)).getroot().tag
                    root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag

                    if root_name in declarable_roots and path_str not in declared_parts:
                        errors.append(
                            f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml"
                        )

                except Exception:
                    # Unparseable parts are reported by validate_xml.
                    continue

            for file_path in all_files:
                if file_path.suffix.lower() in {".xml", ".rels"}:
                    continue
                if file_path.name == "[Content_Types].xml":
                    continue
                if "_rels" in file_path.parts or "docProps" in file_path.parts:
                    continue

                extension = file_path.suffix.lstrip(".").lower()
                if extension and extension not in declared_extensions:
                    if extension in media_extensions:
                        relative_path = file_path.relative_to(self.unpacked_dir)
                        errors.append(
                            f' {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: <Default Extension="{extension}" ContentType="{media_extensions[extension]}"/>'
                        )

        except Exception as e:
            errors.append(f" Error parsing [Content_Types].xml: {e}")

        if errors:
            print(f"FAILED - Found {len(errors)} content type declaration errors:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print(
                "PASSED - All content files are properly declared in [Content_Types].xml"
            )
        return True

    def validate_file_against_xsd(self, xml_file, verbose=False):
        """Validate one file against its schema, discounting pre-existing errors.

        Errors also produced by the same file inside ``original_file`` and
        errors matching ``IGNORED_VALIDATION_ERRORS`` are not counted.

        Returns:
            ``(None, set())`` when no schema applies, ``(True, set())`` when
            no new errors were found, otherwise ``(False, new_errors)``.
        """
        xml_file = Path(xml_file).resolve()
        unpacked_dir = self.unpacked_dir.resolve()

        is_valid, current_errors = self._validate_single_file_xsd(
            xml_file, unpacked_dir
        )

        if is_valid is None:
            return None, set()
        if is_valid:
            return True, set()

        if current_errors is None:
            current_errors = set()

        original_errors = self._get_original_file_errors(xml_file)
        new_errors = {
            e
            for e in current_errors - original_errors
            if not any(pattern in e for pattern in self.IGNORED_VALIDATION_ERRORS)
        }

        if new_errors:
            if verbose:
                relative_path = xml_file.relative_to(unpacked_dir)
                print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)")
                for error in list(new_errors)[:3]:
                    truncated = error[:250] + "..." if len(error) > 250 else error
                    print(f" - {truncated}")
            return False, new_errors

        if verbose:
            print(
                f"PASSED - No new errors (original had {len(current_errors)} errors)"
            )
        return True, set()

    def validate_against_xsd(self) -> bool:
        """Run XSD validation over every collected file and report new errors."""
        new_errors = []
        files_with_new_errors = 0
        original_error_count = 0
        valid_count = 0
        skipped_count = 0

        for xml_file in self.xml_files:
            relative_path = str(xml_file.relative_to(self.unpacked_dir))
            is_valid, new_file_errors = self.validate_file_against_xsd(
                xml_file, verbose=False
            )

            if is_valid is None:
                skipped_count += 1
                continue
            if is_valid:
                # validate_file_against_xsd in this class always pairs True
                # with an empty set; a non-empty set can only come from an
                # override and is counted as "original errors".
                if new_file_errors:
                    original_error_count += 1
                valid_count += 1
                continue

            # Count failing files directly instead of inferring the number
            # from the indentation of the formatted message lines.
            files_with_new_errors += 1
            new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)")
            for error in list(new_file_errors)[:3]:
                new_errors.append(
                    f" - {error[:250]}..." if len(error) > 250 else f" - {error}"
                )

        if self.verbose:
            print(f"Validated {len(self.xml_files)} files:")
            print(f" - Valid: {valid_count}")
            print(f" - Skipped (no schema): {skipped_count}")
            if original_error_count:
                print(f" - With original errors (ignored): {original_error_count}")
            print(f" - With NEW errors: {files_with_new_errors}")

        if new_errors:
            print("\nFAILED - Found NEW validation errors:")
            for error in new_errors:
                print(error)
            return False

        if self.verbose:
            print("\nPASSED - No new XSD validation errors introduced")
        return True

    def _get_schema_path(self, xml_file):
        """Return the XSD path that applies to ``xml_file``, or None.

        Lookup order: exact file name, ``.rels`` suffix, chart parts, theme
        parts, then the name of the containing folder.
        """
        if xml_file.name in self.SCHEMA_MAPPINGS:
            return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name]

        if xml_file.suffix == ".rels":
            return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"]

        # as_posix() keeps the "/"-based checks working with Windows paths.
        posix_path = xml_file.as_posix()

        if "charts/" in posix_path and xml_file.name.startswith("chart"):
            return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"]

        if "theme/" in posix_path and xml_file.name.startswith("theme"):
            return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"]

        if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS:
            return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name]

        return None

    def _clean_ignorable_namespaces(self, xml_doc):
        """Return a copy of ``xml_doc`` without content outside ``OOXML_NAMESPACES``.

        Namespaced attributes and elements whose namespace is not listed are
        dropped; the input document is left untouched.
        """
        xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
        xml_copy = lxml.etree.fromstring(xml_string)

        for elem in xml_copy.iter():
            attrs_to_remove = [
                attr
                for attr in elem.attrib
                if "{" in attr
                and attr.split("}")[0][1:] not in self.OOXML_NAMESPACES
            ]
            for attr in attrs_to_remove:
                del elem.attrib[attr]

        self._remove_ignorable_elements(xml_copy)

        return lxml.etree.ElementTree(xml_copy)

    def _remove_ignorable_elements(self, root):
        """Recursively remove descendants of ``root`` in non-OOXML namespaces."""
        elements_to_remove = []

        for elem in list(root):
            # Comments and processing instructions have a callable tag.
            if not hasattr(elem, "tag") or callable(elem.tag):
                continue

            tag_str = str(elem.tag)
            if tag_str.startswith("{"):
                ns = tag_str.split("}")[0][1:]
                if ns not in self.OOXML_NAMESPACES:
                    elements_to_remove.append(elem)
                    continue

            self._remove_ignorable_elements(elem)

        for elem in elements_to_remove:
            root.remove(elem)

    def _preprocess_for_mc_ignorable(self, xml_doc):
        """Delete the root ``mc:Ignorable`` attribute in place and return the doc."""
        root = xml_doc.getroot()

        ignorable_attr = f"{{{self.MC_NAMESPACE}}}Ignorable"
        if ignorable_attr in root.attrib:
            del root.attrib[ignorable_attr]

        return xml_doc

    def _validate_single_file_xsd(self, xml_file, base_path):
        """Validate ``xml_file`` against its schema.

        Args:
            xml_file: Path of the file to validate.
            base_path: Package root, used to decide whether the file lives
                under one of ``MAIN_CONTENT_FOLDERS``.

        Returns:
            ``(None, None)`` when no schema applies, ``(True, set())`` when
            valid, otherwise ``(False, messages)``.  Any exception is reported
            as a single-message failure.
        """
        schema_path = self._get_schema_path(xml_file)
        if not schema_path:
            return None, None

        try:
            with open(schema_path, "rb") as xsd_file:
                parser = lxml.etree.XMLParser()
                xsd_doc = lxml.etree.parse(
                    xsd_file, parser=parser, base_url=str(schema_path)
                )
                schema = lxml.etree.XMLSchema(xsd_doc)

            # Parse by path so the parser reads raw bytes and honours the
            # document's own encoding instead of the platform default.
            xml_doc = lxml.etree.parse(str(xml_file))

            xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
            xml_doc = self._preprocess_for_mc_ignorable(xml_doc)

            relative_path = xml_file.relative_to(base_path)
            if (
                relative_path.parts
                and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS
            ):
                xml_doc = self._clean_ignorable_namespaces(xml_doc)

            if schema.validate(xml_doc):
                return True, set()

            return False, {error.message for error in schema.error_log}

        except Exception as e:
            return False, {str(e)}

    def _get_original_file_errors(self, xml_file):
        """Return the XSD errors the same part has inside ``original_file``.

        Returns an empty set when there is no original file or the part does
        not exist in it.
        """
        if self.original_file is None:
            return set()

        import tempfile
        import zipfile

        xml_file = Path(xml_file).resolve()
        unpacked_dir = self.unpacked_dir.resolve()
        relative_path = xml_file.relative_to(unpacked_dir)

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Only the one part being compared is needed, so extract just it
            # rather than the whole archive.
            with zipfile.ZipFile(self.original_file, "r") as zip_ref:
                try:
                    zip_ref.extract(relative_path.as_posix(), temp_path)
                except KeyError:
                    return set()

            original_xml_file = temp_path / relative_path

            if not original_xml_file.exists():
                return set()

            _, errors = self._validate_single_file_xsd(original_xml_file, temp_path)
            return errors if errors else set()

    def _remove_template_tags_from_text_nodes(self, xml_doc):
        """Strip ``{{...}}`` template tags from a copy of ``xml_doc``.

        Text and tail content of every element is processed, except for
        elements whose tag is ``t`` (with or without a namespace), which are
        left unchanged.

        Returns:
            ``(cleaned_tree, warnings)`` where ``warnings`` has one message per
            template tag found.
        """
        warnings = []
        template_pattern = re.compile(r"\{\{[^}]*\}\}")

        xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
        xml_copy = lxml.etree.fromstring(xml_string)

        def process_text_content(text, content_type):
            if not text:
                return text
            matches = list(template_pattern.finditer(text))
            if not matches:
                return text
            for match in matches:
                warnings.append(
                    f"Found template tag in {content_type}: {match.group()}"
                )
            return template_pattern.sub("", text)

        for elem in xml_copy.iter():
            # Comments and processing instructions have a callable tag.
            if not hasattr(elem, "tag") or callable(elem.tag):
                continue
            tag_str = str(elem.tag)
            if tag_str.endswith("}t") or tag_str == "t":
                continue

            elem.text = process_text_content(elem.text, "text content")
            elem.tail = process_text_content(elem.tail, "tail content")

        return lxml.etree.ElementTree(xml_copy), warnings


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")