Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Create, read, edit, and format Excel (.xlsx) spreadsheets with formulas, color coding, and financial model standards
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/office/validators/base.py
"""
Base validator with common validation logic for document files.
"""

import re
import tempfile
import zipfile
from pathlib import Path

import defusedxml.minidom
import lxml.etree


class BaseSchemaValidator:
    """Shared validation and repair logic for an unpacked Office package.

    The validator works on a directory holding the extracted contents of a
    document archive. Every ``validate_*`` method prints its findings and
    returns ``True`` on success and ``False`` on failure. Subclasses must
    implement :meth:`validate` and may override the class-level tables below.
    """

    # Substrings of XSD error messages that are never reported as new errors.
    IGNORED_VALIDATION_ERRORS = [
        "hyphenationZone",
        "purl.org/dc/terms",
    ]

    # Lower-cased element local name -> (lower-cased id attribute, scope).
    # Scope "file" means unique per file; "global" means unique across files.
    UNIQUE_ID_REQUIREMENTS = {
        "comment": ("id", "file"),
        "commentrangestart": ("id", "file"),
        "commentrangeend": ("id", "file"),
        "bookmarkstart": ("id", "file"),
        "bookmarkend": ("id", "file"),
        "sldid": ("id", "file"),
        "sldmasterid": ("id", "global"),
        "sldlayoutid": ("id", "global"),
        "cm": ("authorid", "file"),
        "sheet": ("sheetid", "file"),
        "definedname": ("id", "file"),
        "cxnsp": ("id", "file"),
        "sp": ("id", "file"),
        "pic": ("id", "file"),
        "grpsp": ("id", "file"),
    }

    # Elements nested under these (lower-cased) containers are exempt from
    # the ID uniqueness check.
    EXCLUDED_ID_CONTAINERS = {
        "sectionlst",
    }

    # Lower-cased element name -> expected relationship type; empty here, so
    # relationship type checking is off unless a subclass fills it in.
    ELEMENT_RELATIONSHIP_TYPES = {}

    # File name / suffix / folder key -> schema path relative to schemas_dir.
    SCHEMA_MAPPINGS = {
        "word": "ISO-IEC29500-4_2016/wml.xsd",
        "ppt": "ISO-IEC29500-4_2016/pml.xsd",
        "xl": "ISO-IEC29500-4_2016/sml.xsd",
        "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
        "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
        "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
        "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
        ".rels": "ecma/fouth-edition/opc-relationships.xsd",
        "people.xml": "microsoft/wml-2012.xsd",
        "commentsIds.xml": "microsoft/wml-cid-2016.xsd",
        "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
        "commentsExtended.xml": "microsoft/wml-2012.xsd",
        "chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
        "theme": "ISO-IEC29500-4_2016/dml-main.xsd",
        "drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
    }

    MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
    XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"

    PACKAGE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/relationships"
    )
    OFFICE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    )
    CONTENT_TYPES_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/content-types"
    )

    MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}

    # Elements and attributes in any other namespace are stripped before XSD
    # validation of files under MAIN_CONTENT_FOLDERS.
    OOXML_NAMESPACES = {
        "http://schemas.openxmlformats.org/officeDocument/2006/math",
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        "http://schemas.openxmlformats.org/schemaLibrary/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/diagram",
        "http://schemas.openxmlformats.org/drawingml/2006/picture",
        "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
        "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "http://schemas.openxmlformats.org/presentationml/2006/main",
        "http://schemas.openxmlformats.org/spreadsheetml/2006/main",
        "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
        "http://www.w3.org/XML/1998/namespace",
    }

    # Matches ``{{ ... }}`` placeholders; compiled once for all instances.
    _TEMPLATE_TAG_PATTERN = re.compile(r"\{\{[^}]*\}\}")

    def __init__(self, unpacked_dir, original_file=None, verbose=False):
        """Collect the XML and .rels files found under *unpacked_dir*.

        Args:
            unpacked_dir: Directory containing the extracted package.
            original_file: Optional path to the original archive; when given,
                XSD errors already present in it are not reported as new.
            verbose: When true, also print "PASSED" and summary messages.
        """
        self.unpacked_dir = Path(unpacked_dir).resolve()
        self.original_file = Path(original_file) if original_file else None
        self.verbose = verbose

        self.schemas_dir = Path(__file__).parent.parent / "schemas"

        patterns = ["*.xml", "*.rels"]
        self.xml_files = [
            f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)
        ]

        if not self.xml_files:
            print(f"Warning: No XML files found in {self.unpacked_dir}")

    @staticmethod
    def _local_name(qualified_name):
        """Return *qualified_name* without its ``{namespace}`` prefix."""
        return qualified_name.split("}")[-1]

    def validate(self):
        """Run the validations for a concrete format (subclass hook)."""
        raise NotImplementedError("Subclasses must implement the validate method")

    def repair(self) -> int:
        """Apply the automatic repairs and return how many were made."""
        return self.repair_whitespace_preservation()

    def repair_whitespace_preservation(self) -> int:
        """Add ``xml:space="preserve"`` to text runs with edge whitespace.

        An element qualifies when its tag name ends in ``:t`` and its first
        child's value starts or ends with a space or tab. Files are rewritten
        only when something changed.

        Returns:
            The number of elements repaired in files that were written back.
        """
        repairs = 0

        for xml_file in self.xml_files:
            try:
                content = xml_file.read_text(encoding="utf-8")
                dom = defusedxml.minidom.parseString(content)
                file_repairs = 0

                for elem in dom.getElementsByTagName("*"):
                    if not (elem.tagName.endswith(":t") and elem.firstChild):
                        continue

                    text = elem.firstChild.nodeValue
                    if not text:
                        continue
                    has_edge_whitespace = text.startswith(
                        (" ", "\t")
                    ) or text.endswith((" ", "\t"))
                    if not has_edge_whitespace:
                        continue
                    if elem.getAttribute("xml:space") == "preserve":
                        continue

                    elem.setAttribute("xml:space", "preserve")
                    text_preview = (
                        repr(text[:30]) + "..." if len(text) > 30 else repr(text)
                    )
                    print(
                        f" Repaired: {xml_file.name}: Added xml:space='preserve' "
                        f"to {elem.tagName}: {text_preview}"
                    )
                    file_repairs += 1

                if file_repairs:
                    xml_file.write_bytes(dom.toxml(encoding="UTF-8"))
                    # Count only after the write succeeded so the total
                    # reflects repairs that actually reached disk.
                    repairs += file_repairs

            except Exception as exc:
                # Repair is best-effort: a file that cannot be read, parsed or
                # written is left alone, but the reason is no longer hidden.
                if self.verbose:
                    print(f" Warning: could not repair {xml_file.name}: {exc}")

        return repairs

    def validate_xml(self) -> bool:
        """Check that every collected file parses as well-formed XML."""
        errors = []

        for xml_file in self.xml_files:
            try:
                lxml.etree.parse(str(xml_file))
            except lxml.etree.XMLSyntaxError as e:
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: "
                    f"Line {e.lineno}: {e.msg}"
                )
            except Exception as e:
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: "
                    f"Unexpected error: {str(e)}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} XML violations:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print("PASSED - All XML files are well-formed")
        return True

    def validate_namespaces(self) -> bool:
        """Check that prefixes listed in ``*Ignorable`` root attributes are declared.

        Files that are not well-formed are skipped here.
        """
        errors = []

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
            except lxml.etree.XMLSyntaxError:
                continue

            declared = set(root.nsmap.keys()) - {None}
            ignorable_values = [
                v for k, v in root.attrib.items() if k.endswith("Ignorable")
            ]
            for attr_val in ignorable_values:
                undeclared = set(attr_val.split()) - declared
                errors.extend(
                    f" {xml_file.relative_to(self.unpacked_dir)}: "
                    f"Namespace '{ns}' in Ignorable but not declared"
                    for ns in undeclared
                )

        if errors:
            print(f"FAILED - {len(errors)} namespace issues:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print("PASSED - All namespace prefixes properly declared")
        return True

    def validate_unique_ids(self) -> bool:
        """Check the ID uniqueness rules from ``UNIQUE_ID_REQUIREMENTS``.

        ``mc:AlternateContent`` subtrees are dropped from the parsed tree
        before checking, and elements under ``EXCLUDED_ID_CONTAINERS`` are
        ignored.
        """
        errors = []
        global_ids = {}

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                file_ids = {}

                mc_elements = root.xpath(
                    ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
                )
                for elem in mc_elements:
                    elem.getparent().remove(elem)

                for elem in root.iter():
                    # iter() also yields comments and processing instructions,
                    # whose .tag is a callable rather than a string; they
                    # carry no IDs and must not reach the string handling.
                    if not isinstance(elem.tag, str):
                        continue

                    tag = self._local_name(elem.tag).lower()
                    if tag not in self.UNIQUE_ID_REQUIREMENTS:
                        continue

                    in_excluded_container = any(
                        self._local_name(ancestor.tag).lower()
                        in self.EXCLUDED_ID_CONTAINERS
                        for ancestor in elem.iterancestors()
                    )
                    if in_excluded_container:
                        continue

                    attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]

                    # First attribute whose local name matches, ignoring
                    # namespace and case.
                    id_value = next(
                        (
                            value
                            for attr, value in elem.attrib.items()
                            if self._local_name(attr).lower() == attr_name
                        ),
                        None,
                    )
                    if id_value is None:
                        continue

                    if scope == "global":
                        if id_value in global_ids:
                            prev_file, prev_line, prev_tag = global_ids[id_value]
                            errors.append(
                                f" {xml_file.relative_to(self.unpacked_dir)}: "
                                f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> "
                                f"already used in {prev_file} at line {prev_line} in <{prev_tag}>"
                            )
                        else:
                            global_ids[id_value] = (
                                xml_file.relative_to(self.unpacked_dir),
                                elem.sourceline,
                                tag,
                            )
                    elif scope == "file":
                        seen = file_ids.setdefault((tag, attr_name), {})
                        if id_value in seen:
                            prev_line = seen[id_value]
                            errors.append(
                                f" {xml_file.relative_to(self.unpacked_dir)}: "
                                f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> "
                                f"(first occurrence at line {prev_line})"
                            )
                        else:
                            seen[id_value] = elem.sourceline

            except Exception as e:
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} ID uniqueness violations:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print("PASSED - All required IDs are unique")
        return True

    def validate_file_references(self) -> bool:
        """Check .rels targets exist and every package file is referenced.

        Reports relationship targets that do not resolve to a file and files
        that no relationship points at. ``[Content_Types].xml`` and the
        ``.rels`` files themselves are not expected to be referenced.
        """
        errors = []

        rels_files = list(self.unpacked_dir.rglob("*.rels"))

        if not rels_files:
            if self.verbose:
                print("PASSED - No .rels files found")
            return True

        all_files = [
            file_path.resolve()
            for file_path in self.unpacked_dir.rglob("*")
            if file_path.is_file()
            and file_path.name != "[Content_Types].xml"
            and not file_path.name.endswith(".rels")
        ]

        all_referenced_files = set()

        if self.verbose:
            print(
                f"Found {len(rels_files)} .rels files and {len(all_files)} target files"
            )

        for rels_file in rels_files:
            try:
                rels_root = lxml.etree.parse(str(rels_file)).getroot()
                rels_dir = rels_file.parent
                broken_refs = []

                for rel in rels_root.findall(
                    ".//ns:Relationship",
                    namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE},
                ):
                    target = rel.get("Target")
                    if not target:
                        continue
                    # External targets (any scheme) live outside the package,
                    # so they cannot be resolved against the unpacked tree.
                    if rel.get("TargetMode") == "External" or target.startswith(
                        ("http", "mailto:")
                    ):
                        continue

                    if target.startswith("/"):
                        target_path = self.unpacked_dir / target.lstrip("/")
                    elif rels_file.name == ".rels":
                        target_path = self.unpacked_dir / target
                    else:
                        # Part-level rels sit in "<dir>/_rels/"; their targets
                        # are relative to "<dir>".
                        target_path = rels_dir.parent / target

                    try:
                        target_path = target_path.resolve()
                        if target_path.exists() and target_path.is_file():
                            all_referenced_files.add(target_path)
                        else:
                            broken_refs.append((target, rel.sourceline))
                    except (OSError, ValueError):
                        broken_refs.append((target, rel.sourceline))

                if broken_refs:
                    rel_path = rels_file.relative_to(self.unpacked_dir)
                    for broken_ref, line_num in broken_refs:
                        errors.append(
                            f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}"
                        )

            except Exception as e:
                rel_path = rels_file.relative_to(self.unpacked_dir)
                errors.append(f" Error parsing {rel_path}: {e}")

        unreferenced_files = set(all_files) - all_referenced_files

        for unref_file in sorted(unreferenced_files):
            unref_rel_path = unref_file.relative_to(self.unpacked_dir)
            errors.append(f" Unreferenced file: {unref_rel_path}")

        if errors:
            print(f"FAILED - Found {len(errors)} relationship validation errors:")
            for error in errors:
                print(error)
            print(
                "CRITICAL: These errors will cause the document to appear corrupt. "
                + "Broken references MUST be fixed, "
                + "and unreferenced files MUST be referenced or removed."
            )
            return False

        if self.verbose:
            print(
                "PASSED - All references are valid and all files are properly referenced"
            )
        return True

    def validate_all_relationship_ids(self) -> bool:
        """Check ``r:id`` / ``r:embed`` / ``r:link`` values against the part's .rels.

        For each XML file with a sibling ``_rels/<name>.rels`` file, reports
        duplicate relationship IDs, references to IDs that do not exist and,
        when ``ELEMENT_RELATIONSHIP_TYPES`` is non-empty, ``r:id`` references
        whose relationship type does not contain the expected type.
        """
        errors = []

        for xml_file in self.xml_files:
            if xml_file.suffix == ".rels":
                continue

            rels_file = xml_file.parent / "_rels" / f"{xml_file.name}.rels"
            if not rels_file.exists():
                continue

            try:
                rels_root = lxml.etree.parse(str(rels_file)).getroot()
                rid_to_type = {}

                for rel in rels_root.findall(
                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                ):
                    rid = rel.get("Id")
                    rel_type = rel.get("Type", "")
                    if not rid:
                        continue
                    if rid in rid_to_type:
                        rels_rel_path = rels_file.relative_to(self.unpacked_dir)
                        errors.append(
                            f" {rels_rel_path}: Line {rel.sourceline}: "
                            f"Duplicate relationship ID '{rid}' (IDs must be unique)"
                        )
                    # Keep only the last path segment of the type URI.
                    rid_to_type[rid] = rel_type.split("/")[-1]

                xml_root = lxml.etree.parse(str(xml_file)).getroot()

                r_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE
                rid_attrs_to_check = ["id", "embed", "link"]
                for elem in xml_root.iter():
                    for attr_name in rid_attrs_to_check:
                        rid_attr = elem.get(f"{{{r_ns}}}{attr_name}")
                        if not rid_attr:
                            continue
                        xml_rel_path = xml_file.relative_to(self.unpacked_dir)
                        elem_name = self._local_name(elem.tag)

                        if rid_attr not in rid_to_type:
                            valid_ids = ", ".join(sorted(rid_to_type.keys())[:5])
                            ellipsis = "..." if len(rid_to_type) > 5 else ""
                            errors.append(
                                f" {xml_rel_path}: Line {elem.sourceline}: "
                                f"<{elem_name}> r:{attr_name} references non-existent relationship '{rid_attr}' "
                                f"(valid IDs: {valid_ids}{ellipsis})"
                            )
                        elif attr_name == "id" and self.ELEMENT_RELATIONSHIP_TYPES:
                            expected_type = self._get_expected_relationship_type(
                                elem_name
                            )
                            if expected_type:
                                actual_type = rid_to_type[rid_attr]
                                if expected_type not in actual_type.lower():
                                    errors.append(
                                        f" {xml_rel_path}: Line {elem.sourceline}: "
                                        f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' "
                                        f"but should point to a '{expected_type}' relationship"
                                    )

            except Exception as e:
                xml_rel_path = xml_file.relative_to(self.unpacked_dir)
                errors.append(f" Error processing {xml_rel_path}: {e}")

        if errors:
            print(f"FAILED - Found {len(errors)} relationship ID reference errors:")
            for error in errors:
                print(error)
            print("\nThese ID mismatches will cause the document to appear corrupt!")
            return False

        if self.verbose:
            print("PASSED - All relationship ID references are valid")
        return True

    def _get_expected_relationship_type(self, element_name):
        """Return the relationship type expected for *element_name*, or None.

        An explicit entry in ``ELEMENT_RELATIONSHIP_TYPES`` wins. Otherwise
        the type is derived from the lower-cased name: ``<x>id`` gives ``<x>``
        (with ``sld`` spelled out as ``slide``) and ``<x>reference`` gives
        ``<x>``.
        """
        elem_lower = element_name.lower()

        if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES:
            return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower]

        if elem_lower.endswith("id") and len(elem_lower) > 2:
            prefix = elem_lower[:-2]
            return "slide" if prefix == "sld" else prefix

        if elem_lower.endswith("reference") and len(elem_lower) > 9:
            return elem_lower[:-9]

        return None

    def validate_content_types(self) -> bool:
        """Check that parts and media are declared in ``[Content_Types].xml``.

        XML files whose root element is one of a fixed set of names must have
        an ``Override`` entry, and files with a known media extension must
        have a ``Default`` entry for that extension.
        """
        errors = []

        content_types_file = self.unpacked_dir / "[Content_Types].xml"
        if not content_types_file.exists():
            print("FAILED - [Content_Types].xml file not found")
            return False

        try:
            root = lxml.etree.parse(str(content_types_file)).getroot()
            declared_parts = set()
            declared_extensions = set()

            for override in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override"
            ):
                part_name = override.get("PartName")
                if part_name is not None:
                    declared_parts.add(part_name.lstrip("/"))

            for default in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default"
            ):
                extension = default.get("Extension")
                if extension is not None:
                    declared_extensions.add(extension.lower())

            declarable_roots = {
                "sld",
                "sldLayout",
                "sldMaster",
                "presentation",
                "document",
                "workbook",
                "worksheet",
                "theme",
            }

            media_extensions = {
                "png": "image/png",
                "jpg": "image/jpeg",
                "jpeg": "image/jpeg",
                "gif": "image/gif",
                "bmp": "image/bmp",
                "tiff": "image/tiff",
                "wmf": "image/x-wmf",
                "emf": "image/x-emf",
            }

            all_files = [f for f in self.unpacked_dir.rglob("*") if f.is_file()]

            skip_markers = [".rels", "[Content_Types]", "docProps/", "_rels/"]
            for xml_file in self.xml_files:
                path_str = str(xml_file.relative_to(self.unpacked_dir)).replace(
                    "\\", "/"
                )

                if any(skip in path_str for skip in skip_markers):
                    continue

                try:
                    root_tag = lxml.etree.parse(str(xml_file)).getroot().tag
                    root_name = self._local_name(root_tag)

                    if root_name in declarable_roots and path_str not in declared_parts:
                        errors.append(
                            f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml"
                        )

                except Exception:
                    # Unparseable files are left to the other validations.
                    continue

            for file_path in all_files:
                if file_path.suffix.lower() in {".xml", ".rels"}:
                    continue
                if file_path.name == "[Content_Types].xml":
                    continue
                if "_rels" in file_path.parts or "docProps" in file_path.parts:
                    continue

                extension = file_path.suffix.lstrip(".").lower()
                if (
                    extension
                    and extension not in declared_extensions
                    and extension in media_extensions
                ):
                    relative_path = file_path.relative_to(self.unpacked_dir)
                    errors.append(
                        f" {relative_path}: File with extension '{extension}' "
                        "not declared in [Content_Types].xml - should add: "
                        f'<Default Extension="{extension}" '
                        f'ContentType="{media_extensions[extension]}"/>'
                    )

        except Exception as e:
            errors.append(f" Error parsing [Content_Types].xml: {e}")

        if errors:
            print(f"FAILED - Found {len(errors)} content type declaration errors:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print(
                "PASSED - All content files are properly declared in [Content_Types].xml"
            )
        return True

    def validate_file_against_xsd(self, xml_file, verbose=False):
        """Validate one file against its schema, ignoring pre-existing errors.

        Returns:
            ``(None, set())`` when no schema applies, ``(True, set())`` when
            the file is valid or has no new errors, and ``(False, errors)``
            with the new error messages otherwise. Errors also found in the
            original archive, or matching ``IGNORED_VALIDATION_ERRORS``, are
            not considered new.
        """
        xml_file = Path(xml_file).resolve()
        unpacked_dir = self.unpacked_dir.resolve()

        is_valid, current_errors = self._validate_single_file_xsd(
            xml_file, unpacked_dir
        )

        if is_valid is None:
            return None, set()
        elif is_valid:
            return True, set()

        original_errors = self._get_original_file_errors(xml_file)

        assert current_errors is not None
        new_errors = current_errors - original_errors

        new_errors = {
            e
            for e in new_errors
            if not any(pattern in e for pattern in self.IGNORED_VALIDATION_ERRORS)
        }

        if new_errors:
            if verbose:
                relative_path = xml_file.relative_to(unpacked_dir)
                print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)")
                # Sorted so the three previewed errors are stable across runs.
                for error in sorted(new_errors)[:3]:
                    truncated = error[:250] + "..." if len(error) > 250 else error
                    print(f" - {truncated}")
            return False, new_errors

        if verbose:
            print(
                f"PASSED - No new errors (original had {len(current_errors)} errors)"
            )
        return True, set()

    def validate_against_xsd(self) -> bool:
        """Validate every collected file against its XSD and print a summary."""
        new_errors = []
        files_with_new_errors = 0
        original_error_count = 0
        valid_count = 0
        skipped_count = 0

        for xml_file in self.xml_files:
            relative_path = str(xml_file.relative_to(self.unpacked_dir))
            is_valid, new_file_errors = self.validate_file_against_xsd(
                xml_file, verbose=False
            )

            if is_valid is None:
                skipped_count += 1
                continue
            elif is_valid and not new_file_errors:
                valid_count += 1
                continue
            elif is_valid:
                original_error_count += 1
                valid_count += 1
                continue

            # Counted explicitly instead of being inferred from the layout of
            # the message strings collected below.
            files_with_new_errors += 1
            new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)")
            for error in sorted(new_file_errors)[:3]:
                new_errors.append(
                    f" - {error[:250]}..." if len(error) > 250 else f" - {error}"
                )

        if self.verbose:
            print(f"Validated {len(self.xml_files)} files:")
            print(f" - Valid: {valid_count}")
            print(f" - Skipped (no schema): {skipped_count}")
            if original_error_count:
                print(f" - With original errors (ignored): {original_error_count}")
            print(f" - With NEW errors: {files_with_new_errors}")

        if new_errors:
            print("\nFAILED - Found NEW validation errors:")
            for error in new_errors:
                print(error)
            return False

        if self.verbose:
            print("\nPASSED - No new XSD validation errors introduced")
        return True

    def _get_schema_path(self, xml_file):
        """Return the XSD path that applies to *xml_file*, or None.

        Lookup order: exact file name, ``.rels`` suffix, chart and theme
        parts by directory and name prefix, then the name of the parent
        folder when it is one of ``MAIN_CONTENT_FOLDERS``.
        """
        if xml_file.name in self.SCHEMA_MAPPINGS:
            return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name]

        if xml_file.suffix == ".rels":
            return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"]

        # as_posix() keeps the "dir/" substring tests working on platforms
        # whose native separator is a backslash.
        posix_path = xml_file.as_posix()

        if "charts/" in posix_path and xml_file.name.startswith("chart"):
            return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"]

        if "theme/" in posix_path and xml_file.name.startswith("theme"):
            return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"]

        if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS:
            return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name]

        return None

    def _clean_ignorable_namespaces(self, xml_doc):
        """Return a copy of *xml_doc* without non-OOXML attributes and elements.

        The input document is not modified; the copy is made by serializing
        and re-parsing it.
        """
        xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
        xml_copy = lxml.etree.fromstring(xml_string)

        for elem in xml_copy.iter():
            attrs_to_remove = [
                attr
                for attr in elem.attrib
                if "{" in attr
                and attr.split("}")[0][1:] not in self.OOXML_NAMESPACES
            ]
            for attr in attrs_to_remove:
                del elem.attrib[attr]

        self._remove_ignorable_elements(xml_copy)

        return lxml.etree.ElementTree(xml_copy)

    def _remove_ignorable_elements(self, root):
        """Recursively remove, in place, descendants in non-OOXML namespaces."""
        elements_to_remove = []

        for elem in list(root):
            # Comments and processing instructions expose a callable tag.
            if not hasattr(elem, "tag") or callable(elem.tag):
                continue

            tag_str = str(elem.tag)
            if tag_str.startswith("{"):
                ns = tag_str.split("}")[0][1:]
                if ns not in self.OOXML_NAMESPACES:
                    elements_to_remove.append(elem)
                    continue

            self._remove_ignorable_elements(elem)

        for elem in elements_to_remove:
            root.remove(elem)

    def _preprocess_for_mc_ignorable(self, xml_doc):
        """Drop the ``mc:Ignorable`` attribute from the root, in place."""
        root = xml_doc.getroot()

        ignorable_attr = f"{{{self.MC_NAMESPACE}}}Ignorable"
        if ignorable_attr in root.attrib:
            del root.attrib[ignorable_attr]

        return xml_doc

    def _validate_single_file_xsd(self, xml_file, base_path):
        """Validate *xml_file* against its schema.

        Returns:
            ``(None, None)`` when no schema applies, ``(True, set())`` when
            valid, and ``(False, messages)`` when invalid or when loading or
            validating raised (the exception text is then the only message).
        """
        schema_path = self._get_schema_path(xml_file)
        if not schema_path:
            return None, None

        try:
            with open(schema_path, "rb") as xsd_file:
                parser = lxml.etree.XMLParser()
                xsd_doc = lxml.etree.parse(
                    xsd_file, parser=parser, base_url=str(schema_path)
                )
                schema = lxml.etree.XMLSchema(xsd_doc)

            # Parse by path so the parser reads bytes and honours the
            # encoding declared in the document, rather than decoding with
            # the platform's default text encoding first.
            xml_doc = lxml.etree.parse(str(xml_file))

            xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
            xml_doc = self._preprocess_for_mc_ignorable(xml_doc)

            relative_path = xml_file.relative_to(base_path)
            if (
                relative_path.parts
                and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS
            ):
                xml_doc = self._clean_ignorable_namespaces(xml_doc)

            if schema.validate(xml_doc):
                return True, set()

            return False, {error.message for error in schema.error_log}

        except Exception as e:
            return False, {str(e)}

    def _get_original_file_errors(self, xml_file):
        """Return the XSD errors the same file has in the original archive.

        Returns an empty set when no original archive was given or the file
        does not exist in it.
        """
        if self.original_file is None:
            return set()

        xml_file = Path(xml_file).resolve()
        unpacked_dir = self.unpacked_dir.resolve()
        relative_path = xml_file.relative_to(unpacked_dir)

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            with zipfile.ZipFile(self.original_file, "r") as zip_ref:
                try:
                    # Only this one member is needed; extracting the whole
                    # archive for every validated file is wasted work.
                    zip_ref.extract(relative_path.as_posix(), temp_path)
                except KeyError:
                    # Member stored under a different spelling: fall back to
                    # extracting everything and look for the file on disk.
                    zip_ref.extractall(temp_path)

            original_xml_file = temp_path / relative_path

            if not original_xml_file.exists():
                return set()

            _, errors = self._validate_single_file_xsd(original_xml_file, temp_path)
            return errors if errors else set()

    def _remove_template_tags_from_text_nodes(self, xml_doc):
        """Strip ``{{...}}`` placeholders from text and tail content in a copy.

        Elements whose local name is ``t`` are skipped, so their content is
        left untouched.
        NOTE(review): the method name suggests text nodes are the target,
        yet ``t`` elements are the ones excluded - confirm this is intended.

        Returns:
            ``(tree, warnings)``: the cleaned copy as an ElementTree and one
            warning string per placeholder found.
        """
        warnings = []
        template_pattern = self._TEMPLATE_TAG_PATTERN

        xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
        xml_copy = lxml.etree.fromstring(xml_string)

        def process_text_content(text, content_type):
            if not text:
                return text
            matches = list(template_pattern.finditer(text))
            if not matches:
                return text
            for match in matches:
                warnings.append(
                    f"Found template tag in {content_type}: {match.group()}"
                )
            return template_pattern.sub("", text)

        for elem in xml_copy.iter():
            if not hasattr(elem, "tag") or callable(elem.tag):
                continue
            tag_str = str(elem.tag)
            if tag_str.endswith("}t") or tag_str == "t":
                continue

            elem.text = process_text_content(elem.text, "text content")
            elem.tail = process_text_content(elem.tail, "tail content")

        return lxml.etree.ElementTree(xml_copy), warnings


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")