Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Create, read, edit, and manipulate Word (.docx) documents with formatting, tables, and tracked changes
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/office/validators/redlining.py
"""
Validator for tracked changes in Word documents.
"""

import subprocess
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
from typing import List, Optional, Union


class RedliningValidator:
    """Check that an author's edits to a .docx are all recorded as tracked changes.

    The modified ``word/document.xml`` (from an unpacked directory) and the one
    inside the original .docx are both stripped of the author's tracked changes:
    the author's ``<w:ins>`` elements are dropped and the author's ``<w:del>``
    elements are unwrapped so the deleted text counts again. If the remaining
    paragraph text of the two documents differs, validation fails.
    """

    def __init__(
        self,
        unpacked_dir: Union[str, Path],
        original_docx: Union[str, Path],
        verbose: bool = False,
        author: str = "Claude",
    ):
        """Store the inputs of the validation.

        Args:
            unpacked_dir: Directory holding the unpacked, modified document;
                ``word/document.xml`` is read from it.
            original_docx: Path of the original .docx archive to compare with.
            verbose: When true, ``validate`` also prints PASSED messages.
            author: Value of the ``w:author`` attribute whose tracked changes
                are validated.
        """
        self.unpacked_dir = Path(unpacked_dir)
        self.original_docx = Path(original_docx)
        self.verbose = verbose
        self.author = author
        self.namespaces = {
            "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
        }

    def repair(self) -> int:
        """Return 0; this validator performs no repairs."""
        return 0

    def validate(self) -> bool:
        """Validate the modified document against the original .docx.

        Failures are reported with a ``FAILED - ...`` message on stdout.

        Returns:
            True when the modified document contains no ``<w:ins>``/``<w:del>``
            attributed to the author (the comparison with the original is
            skipped in that case), or when both documents have the same text
            once the author's tracked changes are removed. False when a
            document.xml is missing, the original cannot be unpacked or parsed,
            or the texts differ.
        """
        modified_file = self.unpacked_dir / "word" / "document.xml"
        if not modified_file.exists():
            print(f"FAILED - Modified document.xml not found at {modified_file}")
            return False

        if self._lacks_author_changes(modified_file):
            if self.verbose:
                print(f"PASSED - No tracked changes by {self.author} found.")
            return True

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            try:
                with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
                    zip_ref.extractall(temp_path)
            except Exception as e:
                # Any unpacking problem is reported as a validation failure
                # rather than raised to the caller.
                print(f"FAILED - Error unpacking original docx: {e}")
                return False

            original_file = temp_path / "word" / "document.xml"
            if not original_file.exists():
                print(
                    f"FAILED - Original document.xml not found in {self.original_docx}"
                )
                return False

            try:
                modified_root = ET.parse(modified_file).getroot()
                original_root = ET.parse(original_file).getroot()
            except ET.ParseError as e:
                print(f"FAILED - Error parsing XML files: {e}")
                return False

            self._remove_author_tracked_changes(original_root)
            self._remove_author_tracked_changes(modified_root)

            modified_text = self._extract_text_content(modified_root)
            original_text = self._extract_text_content(original_root)

            if modified_text != original_text:
                print(self._generate_detailed_diff(original_text, modified_text))
                return False

            if self.verbose:
                print(f"PASSED - All changes by {self.author} are properly tracked")
            return True

    def _w(self, local_name: str) -> str:
        """Return *local_name* qualified with the ``w`` namespace as ``{uri}name``."""
        return f"{{{self.namespaces['w']}}}{local_name}"

    def _lacks_author_changes(self, document_xml: Path) -> bool:
        """Return True when *document_xml* has no ``<w:ins>``/``<w:del>`` by the author.

        A file that cannot be read or parsed yields False, so ``validate``
        continues to the full comparison, which parses the file again and
        reports the parse error.
        """
        try:
            root = ET.parse(document_xml).getroot()
        except (ET.ParseError, OSError):
            return False

        author_attr = self._w("author")
        return not any(
            elem.get(author_attr) == self.author
            for path in (".//w:del", ".//w:ins")
            for elem in root.findall(path, self.namespaces)
        )

    def _generate_detailed_diff(self, original_text: str, modified_text: str) -> str:
        """Build the multi-line failure message for mismatching texts.

        The message lists likely causes and the correct patterns for documents
        that already contain tracked changes, followed by a git word diff or a
        note that none could be generated.
        """
        error_parts = [
            f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes",
            "",
            "Likely causes:",
            " 1. Modified text inside another author's <w:ins> or <w:del> tags",
            " 2. Made edits without proper tracked changes",
            " 3. Didn't nest <w:del> inside <w:ins> when deleting another's insertion",
            "",
            "For pre-redlined documents, use correct patterns:",
            " - To reject another's INSERTION: Nest <w:del> inside their <w:ins>",
            " - To restore another's DELETION: Add new <w:ins> AFTER their <w:del>",
            "",
        ]

        git_diff = self._get_git_word_diff(original_text, modified_text)
        if git_diff:
            error_parts.extend(["Differences:", "============", git_diff])
        else:
            error_parts.append("Unable to generate word diff (git not available)")

        return "\n".join(error_parts)

    def _get_git_word_diff(
        self, original_text: str, modified_text: str
    ) -> Optional[str]:
        """Return a ``git diff --word-diff`` of the two texts, or None.

        A character-level diff (``--word-diff-regex=.``) is tried first; when it
        yields no hunk content the default word-level diff is used instead.
        The diff is only diagnostic, so failures to write the temporary files
        or to run git yield None instead of an exception.
        """
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)
                original_file = temp_path / "original.txt"
                modified_file = temp_path / "modified.txt"
                original_file.write_text(original_text, encoding="utf-8")
                modified_file.write_text(modified_text, encoding="utf-8")

                for granularity in (["--word-diff-regex=."], []):
                    hunks = self._run_word_diff(
                        original_file, modified_file, granularity
                    )
                    if hunks:
                        return hunks
        except (OSError, subprocess.SubprocessError, UnicodeError):
            # OSError includes FileNotFoundError, raised when git is not installed.
            pass

        return None

    @staticmethod
    def _run_word_diff(
        original_file: Path, modified_file: Path, extra_args: List[str]
    ) -> str:
        """Run one ``git diff --no-index`` and return the hunk bodies.

        Only non-blank lines after the first ``@@`` hunk header are kept; the
        hunk headers themselves are dropped. Returns "" when there are none.
        """
        result = subprocess.run(
            [
                "git",
                "diff",
                "--word-diff=plain",
                *extra_args,
                "-U0",
                "--no-index",
                str(original_file),
                str(modified_file),
            ],
            capture_output=True,
            # The inputs are written as UTF-8, so decode git's output the same
            # way instead of using the locale; "replace" keeps a diff readable
            # if a marker ever splits a multi-byte sequence.
            encoding="utf-8",
            errors="replace",
        )

        content_lines = []
        in_content = False
        for line in result.stdout.split("\n"):
            if line.startswith("@@"):
                in_content = True
                continue
            if in_content and line.strip():
                content_lines.append(line)
        return "\n".join(content_lines)

    def _remove_author_tracked_changes(self, root: ET.Element) -> None:
        """Remove the author's tracked changes from the tree under *root*, in place.

        Pass 1 removes every ``<w:ins>`` by the author together with its
        content. Pass 2 replaces every ``<w:del>`` by the author with its own
        children, renaming the ``<w:delText>`` elements inside to ``<w:t>``.
        Tracked changes of other authors are left untouched.
        """
        ins_tag = self._w("ins")
        del_tag = self._w("del")
        deltext_tag = self._w("delText")
        t_tag = self._w("t")
        author_attr = self._w("author")

        def by_author(elem: ET.Element, tag: str) -> bool:
            return elem.tag == tag and elem.get(author_attr) == self.author

        # Both passes walk a snapshot: ElementTree leaves the result of
        # iter() undefined when the tree is modified during iteration.
        # The passes must stay separate so that an author insertion nested in
        # an author deletion is removed before the deletion is unwrapped.
        for parent in list(root.iter()):
            for child in [c for c in parent if by_author(c, ins_tag)]:
                parent.remove(child)

        for parent in list(root.iter()):
            if not any(by_author(child, del_tag) for child in parent):
                continue

            rebuilt = []
            for child in parent:
                if by_author(child, del_tag):
                    for node in list(child.iter(deltext_tag)):
                        node.tag = t_tag
                    # The deletion's children take its place, in order.
                    rebuilt.extend(child)
                else:
                    rebuilt.append(child)
            parent[:] = rebuilt

    def _extract_text_content(self, root: ET.Element) -> str:
        """Return the text of all ``<w:p>`` descendants of *root*.

        Each paragraph contributes the concatenated text of its ``<w:t>``
        descendants; paragraphs without text are skipped and the rest are
        joined with newlines.
        """
        p_tag = self._w("p")
        t_tag = self._w("t")

        paragraphs = []
        for p_elem in root.findall(f".//{p_tag}"):
            paragraph_text = "".join(
                t_elem.text for t_elem in p_elem.findall(f".//{t_tag}") if t_elem.text
            )
            if paragraph_text:
                paragraphs.append(paragraph_text)

        return "\n".join(paragraphs)


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")