Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from bundle
Scans metapatterns.io pages for diagram images with weak alt text and exports exact update candidates in Markdown, JSON, or CSV.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/audit_diagram_alts.py
1#!/usr/bin/env python323from __future__ import annotations45import argparse6import csv7import json8import re9import sys10from collections import deque11from pathlib import PurePosixPath12from typing import Iterable13from urllib.parse import urljoin, urlparse1415import requests16from bs4 import BeautifulSoup, NavigableString, Tag171819GENERIC_ALT_WORDS = {20"diagram",21"image",22"figure",23"graphic",24"photo",25}2627UI_PATH_PARTS = (28"/icons/",29"/favicon",30"/logo",31"/menu",32"/toc",33"/cdn-cgi/",34)3536ALLOWED_HOST = "metapatterns.io"373839def build_parser() -> argparse.ArgumentParser:40parser = argparse.ArgumentParser(41description="Audit metapatterns.io pages for diagram images with weak alt text."42)43parser.add_argument("url", help="Page URL or section root to inspect.")44parser.add_argument(45"--crawl",46action="store_true",47help="Follow links under the same host and path prefix.",48)49parser.add_argument(50"--limit-pages",51type=int,52default=10,53help="Maximum number of pages to inspect when --crawl is set. Default: 10.",54)55parser.add_argument(56"--json",57action="store_true",58help="Emit JSON instead of Markdown.",59)60parser.add_argument(61"--csv",62action="store_true",63help="Emit CSV instead of Markdown.",64)65parser.add_argument(66"--timeout",67type=int,68default=20,69help="HTTP timeout in seconds. Default: 20.",70)71return parser727374def fetch_html(session: requests.Session, url: str, timeout: int) -> str:75response = session.get(url, timeout=timeout)76response.raise_for_status()77return response.text787980def normalize_url(url: str) -> str:81parsed = urlparse(url)82path = parsed.path or "/"83looks_like_file = bool(PurePosixPath(path).suffix)84if path != "/" and not path.endswith("/") and not looks_like_file:85path = f"{path}/"86return parsed._replace(fragment="", query="", path=path).geturl()878889def same_prefix(url: str, root: str) -> bool:90current = urlparse(url)91base = urlparse(root)92return current.netloc == base.netloc and current.path.startswith(base.path)939495def iter_page_urls(96session: requests.Session, start_url: str, crawl: bool, limit_pages: int, timeout: int97) -> list[str]:98start_url = normalize_url(start_url)99if not crawl:100return [start_url]101102visited: set[str] = set()103queue: deque[str] = deque([start_url])104ordered: list[str] = []105106while queue and len(ordered) < limit_pages:107current = queue.popleft()108if current in visited:109continue110visited.add(current)111ordered.append(current)112113try:114html = fetch_html(session, current, timeout)115except requests.RequestException:116continue117118soup = BeautifulSoup(html, "html.parser")119for link in soup.find_all("a", href=True):120href = urljoin(current, link["href"])121normalized = normalize_url(href)122parsed = urlparse(normalized)123if parsed.netloc != ALLOWED_HOST:124continue125if any(part in parsed.path.lower() for part in UI_PATH_PARTS):126continue127if PurePosixPath(parsed.path).suffix:128continue129if not same_prefix(normalized, start_url):130continue131if normalized not in visited:132queue.append(normalized)133134return ordered135136137def get_page_title(soup: BeautifulSoup) -> str:138title_tag = soup.find("h1")139if title_tag:140return normalize_space(title_tag.get_text(" ", strip=True))141if soup.title:142return normalize_space(soup.title.get_text(" ", strip=True))143return ""144145146def normalize_space(text: str | None) -> str:147if not text:148return ""149compact = re.sub(r"\s+", " ", text).strip()150compact = re.sub(r"\s+([,.:;!?])", r"\1", compact)151compact = re.sub(r"\(\s+", "(", compact)152compact = re.sub(r"\s+\)", ")", compact)153compact = re.sub(r"\s+#$", "", compact)154return compact155156157def filename_like(alt: str) -> bool:158if not alt:159return False160if re.fullmatch(r"[\w\s\-]+", alt) is None:161return False162parts = re.split(r"[\s\-]+", alt.lower())163return any(char.isdigit() for char in alt) or all(len(part) <= 12 for part in parts)164165166def classify_issue(img_url: str, alt: str) -> tuple[bool, str]:167alt_normalized = normalize_space(alt)168alt_lower = alt_normalized.lower()169170if not alt_normalized:171return True, "missing alt"172173if any(word == alt_lower for word in GENERIC_ALT_WORDS):174return True, "generic alt"175176if filename_like(alt_normalized):177return True, "filename-like alt"178179if "/diagrams/" in img_url and len(alt_normalized.split()) <= 3:180return True, "too short for a diagram"181182return False, "looks acceptable"183184185def is_ui_asset(img_url: str, alt: str) -> bool:186img_lower = img_url.lower()187alt_lower = normalize_space(alt).lower()188if any(part in img_lower for part in UI_PATH_PARTS):189return True190if alt_lower in {"logo", "menu", "table of contents", "+"}:191return True192return False193194195def find_content_root(soup: BeautifulSoup) -> Tag:196for selector in ("article", "main", ".book-page", ".content"):197node = soup.select_one(selector)198if node:199return node200return soup201202203def nearest_heading(tag: Tag) -> str:204for heading in tag.find_all_previous(["h1", "h2", "h3", "h4"], limit=12):205text = normalize_space(heading.get_text(" ", strip=True))206if text:207return text208return ""209210211def nearby_paragraph(tag: Tag) -> str:212for sibling in tag.next_elements:213if sibling is tag:214continue215if isinstance(sibling, Tag) and sibling.name == "p":216text = normalize_space(sibling.get_text(" ", strip=True))217if text:218return text219if isinstance(sibling, Tag) and sibling.name in {"img", "h1", "h2", "h3", "h4"}:220break221if isinstance(sibling, NavigableString):222continue223return ""224225226def analyze_page(session: requests.Session, url: str, timeout: int) -> dict:227html = fetch_html(session, url, timeout)228soup = BeautifulSoup(html, "html.parser")229root = find_content_root(soup)230page_title = get_page_title(soup)231232results = []233for img in root.find_all("img"):234src = img.get("src", "")235alt = img.get("alt", "")236if not src:237continue238img_url = urljoin(url, src)239if urlparse(img_url).netloc != ALLOWED_HOST:240continue241if is_ui_asset(img_url, alt):242continue243if "/diagrams/" not in img_url and img_url.lower().endswith(".svg"):244continue245246needs_update, issue = classify_issue(img_url, alt)247if "/diagrams/" not in img_url and not needs_update:248continue249250results.append(251{252"page_url": url,253"page_title": page_title,254"nearest_heading": nearest_heading(img),255"context_paragraph": nearby_paragraph(img),256"image_url": img_url,257"current_alt": alt,258"needs_update": needs_update,259"issue": issue,260}261)262263return {264"url": url,265"title": page_title,266"images": results,267}268269270def render_markdown(pages: Iterable[dict]) -> str:271lines: list[str] = []272for page in pages:273lines.append(f"## {page['title'] or page['url']}")274lines.append(page["url"])275lines.append("")276if not page["images"]:277lines.append("- No candidate diagram images found.")278lines.append("")279continue280for image in page["images"]:281lines.append(282f"- `needs_update={str(image['needs_update']).lower()}` | `{image['issue']}` | `{image['current_alt'] or '(empty)'}`"283)284lines.append(f" src: {image['image_url']}")285if image["nearest_heading"]:286lines.append(f" heading: {image['nearest_heading']}")287if image["context_paragraph"]:288lines.append(f" context: {image['context_paragraph']}")289lines.append("")290return "\n".join(lines).strip()291292293def emit_csv(pages: Iterable[dict]) -> None:294fieldnames = [295"page_url",296"page_title",297"nearest_heading",298"image_url",299"current_alt",300"needs_update",301"issue",302"context_paragraph",303]304writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)305writer.writeheader()306for page in pages:307for image in page["images"]:308writer.writerow(309{310"page_url": image["page_url"],311"page_title": image["page_title"],312"nearest_heading": image["nearest_heading"],313"image_url": image["image_url"],314"current_alt": image["current_alt"],315"needs_update": str(image["needs_update"]).lower(),316"issue": image["issue"],317"context_paragraph": image["context_paragraph"],318}319)320321322def main() -> int:323parser = build_parser()324args = parser.parse_args()325start_url = normalize_url(args.url)326parsed = urlparse(start_url)327if parsed.netloc != ALLOWED_HOST:328parser.error("Only metapatterns.io URLs are supported by this skill.")329330session = requests.Session()331session.headers.update(332{333"User-Agent": "metapatterns-diagram-alt-audit/1.0",334}335)336337try:338page_urls = iter_page_urls(339session=session,340start_url=start_url,341crawl=args.crawl,342limit_pages=args.limit_pages,343timeout=args.timeout,344)345pages = []346for page_url in page_urls:347try:348pages.append(analyze_page(session, page_url, args.timeout))349except requests.RequestException:350continue351except requests.RequestException as exc:352print(f"error: {exc}", file=sys.stderr)353return 1354355if args.json:356json.dump({"pages": pages}, sys.stdout, indent=2, ensure_ascii=False)357sys.stdout.write("\n")358elif args.csv:359emit_csv(pages)360else:361print(render_markdown(pages))362return 0363364365if __name__ == "__main__":366raise SystemExit(main())367