Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Create, edit, and inspect PowerPoint presentations with professional design and automated visual QA
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/clean.py
"""Remove unreferenced files from an unpacked PPTX directory.

Usage: python clean.py <unpacked_dir>

Example:
    python clean.py unpacked/

This script removes:
- Orphaned slides (not in sldIdLst) and their relationships
- [trash] directory (unreferenced files)
- Orphaned .rels files for deleted resources
- Unreferenced media, embeddings, charts, diagrams, tags, drawings, ink files
- Unreferenced theme files
- Unreferenced notes slides
- Content-Type overrides for deleted files
"""

import re
import shutil
import sys
from pathlib import Path

import defusedxml.minidom

# Matches the r:id attribute of every <p:sldId .../> element in presentation.xml.
# "<p:sldIdLst>" itself cannot match because it carries no r:id attribute.
_SLD_ID_RID_RE = re.compile(r'<p:sldId[^>]*r:id="([^"]+)"')


def _slide_name_from_target(target: str) -> str:
    """Return the slide file name for a presentation.xml.rels target.

    Accepts both the relative form (``slides/slide1.xml``) and the
    package-absolute form (``/ppt/slides/slide1.xml``). Returns an empty
    string when the target does not point into ``ppt/slides/``.
    """
    relative = target.removeprefix("/ppt/")
    if not relative.startswith("slides/"):
        return ""
    return relative.removeprefix("slides/")


def _package_relative_targets(rels_file: Path, package_root: Path) -> set:
    """Return the in-package files referenced by one ``.rels`` file.

    Args:
        rels_file: Path to a ``*.rels`` file located in a ``_rels`` directory.
        package_root: The *resolved* unpacked package directory.

    Returns:
        A set of ``Path`` objects relative to ``package_root``. Targets that
        are external or that resolve outside the package are omitted.
    """
    targets = set()
    # A .rels file lives in "<source dir>/_rels/"; relative targets are
    # interpreted against the directory that contains the source part.
    source_dir = rels_file.parent.parent

    dom = defusedxml.minidom.parse(str(rels_file))
    for rel in dom.getElementsByTagName("Relationship"):
        target = rel.getAttribute("Target")
        if not target:
            continue
        # External relationships (hyperlinks, linked media) do not name a
        # file inside the package, so they must not be mapped onto disk.
        if rel.getAttribute("TargetMode") == "External":
            continue

        if target.startswith("/"):
            # A leading "/" means "relative to the package root". Joining it
            # onto source_dir would produce a filesystem-absolute path and
            # the reference would be lost.
            candidate = package_root / target.lstrip("/")
        else:
            candidate = source_dir / target

        try:
            targets.add(candidate.resolve().relative_to(package_root))
        except ValueError:
            # The target escapes the package directory; nothing to track.
            continue

    return targets


def get_slides_in_sldidlst(unpacked_dir: Path) -> set[str]:
    """Return the file names of slides listed in presentation.xml.

    The r:id values found on ``<p:sldId>`` elements are mapped to slide file
    names through ``ppt/_rels/presentation.xml.rels``.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        Slide file names such as ``{"slide1.xml"}``. An empty set is returned
        when presentation.xml or its .rels file does not exist.
    """
    pres_path = unpacked_dir / "ppt" / "presentation.xml"
    pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels"

    if not pres_path.exists() or not pres_rels_path.exists():
        return set()

    rels_dom = defusedxml.minidom.parse(str(pres_rels_path))
    rid_to_slide = {}
    for rel in rels_dom.getElementsByTagName("Relationship"):
        # The Type check keeps out non-slide relationships; the target check
        # keeps out slideMasters/, slideLayouts/, etc.
        if "slide" not in rel.getAttribute("Type"):
            continue
        slide_name = _slide_name_from_target(rel.getAttribute("Target"))
        if slide_name:
            rid_to_slide[rel.getAttribute("Id")] = slide_name

    pres_content = pres_path.read_text(encoding="utf-8")
    referenced_rids = set(_SLD_ID_RID_RE.findall(pres_content))

    return {rid_to_slide[rid] for rid in referenced_rids if rid in rid_to_slide}


def remove_orphaned_slides(unpacked_dir: Path) -> list[str]:
    """Delete slides that are not listed in presentation.xml.

    For every removed slide its ``_rels/<slide>.rels`` file is removed too,
    and matching relationships are dropped from presentation.xml.rels.

    NOTE: when presentation.xml or its .rels file is missing,
    ``get_slides_in_sldidlst`` returns an empty set, so every ``slide*.xml``
    file is treated as orphaned and removed.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        Paths (relative to ``unpacked_dir``) of the files that were deleted.
    """
    slides_dir = unpacked_dir / "ppt" / "slides"
    slides_rels_dir = slides_dir / "_rels"
    pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels"

    if not slides_dir.exists():
        return []

    referenced_slides = get_slides_in_sldidlst(unpacked_dir)
    removed = []

    # sorted() snapshots the listing so files are not deleted while the
    # directory is still being scanned, and makes the output deterministic.
    for slide_file in sorted(slides_dir.glob("slide*.xml")):
        if slide_file.name in referenced_slides:
            continue

        removed.append(str(slide_file.relative_to(unpacked_dir)))
        slide_file.unlink()

        rels_file = slides_rels_dir / f"{slide_file.name}.rels"
        if rels_file.exists():
            rels_file.unlink()
            removed.append(str(rels_file.relative_to(unpacked_dir)))

    if removed and pres_rels_path.exists():
        rels_dom = defusedxml.minidom.parse(str(pres_rels_path))
        changed = False

        # list() because nodes are removed from the tree during the walk.
        for rel in list(rels_dom.getElementsByTagName("Relationship")):
            slide_name = _slide_name_from_target(rel.getAttribute("Target"))
            if not slide_name or slide_name in referenced_slides:
                continue
            if rel.parentNode:
                rel.parentNode.removeChild(rel)
                changed = True

        if changed:
            with open(pres_rels_path, "wb") as f:
                f.write(rels_dom.toxml(encoding="utf-8"))

    return removed


def remove_trash_directory(unpacked_dir: Path) -> list[str]:
    """Delete the ``[trash]`` directory and everything inside it.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        Paths (relative to ``unpacked_dir``) of the files that were deleted.
    """
    trash_dir = unpacked_dir / "[trash]"
    removed = []

    if trash_dir.exists() and trash_dir.is_dir():
        removed = [
            str(file_path.relative_to(unpacked_dir))
            for file_path in sorted(trash_dir.rglob("*"))
            if file_path.is_file()
        ]
        # rmtree also handles nested directories, which a plain rmdir()
        # after unlinking only the top-level files would choke on.
        shutil.rmtree(trash_dir)

    return removed


def get_slide_referenced_files(unpacked_dir: Path) -> set:
    """Collect the files referenced from ``ppt/slides/_rels/*.rels``.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        A set of ``Path`` objects relative to the resolved ``unpacked_dir``.
    """
    referenced = set()
    slides_rels_dir = unpacked_dir / "ppt" / "slides" / "_rels"

    if not slides_rels_dir.exists():
        return referenced

    package_root = unpacked_dir.resolve()
    for rels_file in slides_rels_dir.glob("*.rels"):
        referenced |= _package_relative_targets(rels_file, package_root)

    return referenced


def remove_orphaned_rels_files(unpacked_dir: Path) -> list[str]:
    """Delete .rels files of charts/diagrams/drawings that are no longer used.

    A .rels file is removed when the resource it belongs to is missing, or
    when that resource is not referenced by any slide relationship.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        Paths (relative to ``unpacked_dir``) of the files that were deleted.
    """
    resource_dirs = ["charts", "diagrams", "drawings"]
    removed = []
    slide_referenced = get_slide_referenced_files(unpacked_dir)
    package_root = unpacked_dir.resolve()

    for dir_name in resource_dirs:
        rels_dir = unpacked_dir / "ppt" / dir_name / "_rels"
        if not rels_dir.exists():
            continue

        for rels_file in sorted(rels_dir.glob("*.rels")):
            # "chart1.xml.rels" describes "../chart1.xml".
            resource_file = rels_dir.parent / rels_file.name.removesuffix(".rels")
            try:
                resource_rel_path = resource_file.resolve().relative_to(package_root)
            except ValueError:
                continue

            if not resource_file.exists() or resource_rel_path not in slide_referenced:
                rels_file.unlink()
                removed.append(str(rels_file.relative_to(unpacked_dir)))

    return removed


def get_referenced_files(unpacked_dir: Path) -> set:
    """Collect the files referenced from every .rels file in the package.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        A set of ``Path`` objects relative to the resolved ``unpacked_dir``.
    """
    referenced = set()
    package_root = unpacked_dir.resolve()

    for rels_file in unpacked_dir.rglob("*.rels"):
        referenced |= _package_relative_targets(rels_file, package_root)

    return referenced


def remove_orphaned_files(unpacked_dir: Path, referenced: set) -> list[str]:
    """Delete resource, theme and notes files that nothing references.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.
        referenced: Package-relative ``Path`` objects that must be kept, as
            produced by ``get_referenced_files``.

    Returns:
        Paths (relative to ``unpacked_dir``) of the files that were deleted.
    """
    resource_dirs = ["media", "embeddings", "charts", "diagrams", "tags", "drawings", "ink"]
    removed = []

    for dir_name in resource_dirs:
        dir_path = unpacked_dir / "ppt" / dir_name
        if not dir_path.exists():
            continue

        for file_path in sorted(dir_path.glob("*")):
            if not file_path.is_file():
                continue
            rel_path = file_path.relative_to(unpacked_dir)
            if rel_path not in referenced:
                file_path.unlink()
                removed.append(str(rel_path))

    theme_dir = unpacked_dir / "ppt" / "theme"
    if theme_dir.exists():
        for file_path in sorted(theme_dir.glob("theme*.xml")):
            rel_path = file_path.relative_to(unpacked_dir)
            if rel_path in referenced:
                continue
            file_path.unlink()
            removed.append(str(rel_path))
            # A theme's own relationships go away together with the theme.
            theme_rels = theme_dir / "_rels" / f"{file_path.name}.rels"
            if theme_rels.exists():
                theme_rels.unlink()
                removed.append(str(theme_rels.relative_to(unpacked_dir)))

    notes_dir = unpacked_dir / "ppt" / "notesSlides"
    if notes_dir.exists():
        for file_path in sorted(notes_dir.glob("*.xml")):
            if not file_path.is_file():
                continue
            rel_path = file_path.relative_to(unpacked_dir)
            if rel_path not in referenced:
                file_path.unlink()
                removed.append(str(rel_path))

        # Drop .rels files whose notes slide no longer exists (including the
        # ones deleted just above).
        notes_rels_dir = notes_dir / "_rels"
        if notes_rels_dir.exists():
            for file_path in sorted(notes_rels_dir.glob("*.rels")):
                notes_file = notes_dir / file_path.name.removesuffix(".rels")
                if not notes_file.exists():
                    file_path.unlink()
                    removed.append(str(file_path.relative_to(unpacked_dir)))

    return removed


def update_content_types(unpacked_dir: Path, removed_files: list[str]) -> None:
    """Remove ``<Override>`` entries for deleted files from [Content_Types].xml.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.
        removed_files: Paths relative to ``unpacked_dir`` of deleted files,
            using either the OS-native or the "/" separator.
    """
    ct_path = unpacked_dir / "[Content_Types].xml"
    if not ct_path.exists():
        return

    # PartName always uses "/", whereas removed_files holds str(Path) values
    # that use "\\" on Windows; normalise so the two can be compared. A set
    # also gives O(1) membership tests.
    removed_parts = {Path(name).as_posix() for name in removed_files}

    dom = defusedxml.minidom.parse(str(ct_path))
    changed = False

    # list() because nodes are removed from the tree during the walk.
    for override in list(dom.getElementsByTagName("Override")):
        part_name = override.getAttribute("PartName").lstrip("/")
        if part_name in removed_parts and override.parentNode:
            override.parentNode.removeChild(override)
            changed = True

    if changed:
        with open(ct_path, "wb") as f:
            f.write(dom.toxml(encoding="utf-8"))


def clean_unused_files(unpacked_dir: Path) -> list[str]:
    """Run every cleanup step and return all deleted paths.

    Orphaned slides and the ``[trash]`` directory are removed first. The
    .rels/resource sweep is then repeated until a pass deletes nothing,
    because deleting one file can leave the files it referenced orphaned.
    Finally [Content_Types].xml is updated for everything that was removed.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        Paths (relative to ``unpacked_dir``) of the files that were deleted.
    """
    all_removed = []

    all_removed.extend(remove_orphaned_slides(unpacked_dir))
    all_removed.extend(remove_trash_directory(unpacked_dir))

    while True:
        removed_rels = remove_orphaned_rels_files(unpacked_dir)
        referenced = get_referenced_files(unpacked_dir)
        removed_files = remove_orphaned_files(unpacked_dir, referenced)

        total_removed = removed_rels + removed_files
        if not total_removed:
            break

        all_removed.extend(total_removed)

    if all_removed:
        update_content_types(unpacked_dir, all_removed)

    return all_removed


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python clean.py <unpacked_dir>", file=sys.stderr)
        print("Example: python clean.py unpacked/", file=sys.stderr)
        sys.exit(1)

    unpacked_dir = Path(sys.argv[1])

    if not unpacked_dir.exists():
        print(f"Error: {unpacked_dir} not found", file=sys.stderr)
        sys.exit(1)

    removed = clean_unused_files(unpacked_dir)

    if removed:
        print(f"Removed {len(removed)} unreferenced files:")
        for f in removed:
            print(f" {f}")
    else:
        print("No unreferenced files found")