Source from bundle

Metapatterns Diagram Alt Audit

Scans metapatterns.io pages for diagram images with weak alt text and exports exact update candidates in Markdown, JSON, or CSV.

Костянтин@Latand

Files

Skill

0.6K

Size

28.6 KB

Entrypoint

SKILL.md

Format

folder

Open file

scripts/audit_diagram_alts.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code · 367 lines · Free

scripts/audit_diagram_alts.py

1#!/usr/bin/env python3
2 
3from __future__ import annotations
4 
5import argparse
6import csv
7import json
8import re
9import sys
10from collections import deque
11from pathlib import PurePosixPath
12from typing import Iterable
13from urllib.parse import urljoin, urlparse
14 
15import requests
16from bs4 import BeautifulSoup, NavigableString, Tag
17 
18 
# An alt text that is exactly one of these words is reported as "generic alt".
GENERIC_ALT_WORDS = {"diagram", "image", "figure", "graphic", "photo"}

# URL substrings used to skip site-chrome assets and links while scanning.
UI_PATH_PARTS = ("/icons/", "/favicon", "/logo", "/menu", "/toc", "/cdn-cgi/")

# Only pages and images served from this host are inspected.
ALLOWED_HOST = "metapatterns.io"
37 
38 
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the alt-text audit.

    Returns:
        A parser with one positional ``url`` argument and the optional
        ``--crawl``, ``--limit-pages``, ``--json``, ``--csv`` and
        ``--timeout`` flags.
    """
    cli = argparse.ArgumentParser(
        description="Audit metapatterns.io pages for diagram images with weak alt text."
    )
    cli.add_argument("url", help="Page URL or section root to inspect.")

    # Optional flags, registered in the order they should appear in --help.
    option_specs = (
        (
            "--crawl",
            {
                "action": "store_true",
                "help": "Follow links under the same host and path prefix.",
            },
        ),
        (
            "--limit-pages",
            {
                "type": int,
                "default": 10,
                "help": "Maximum number of pages to inspect when --crawl is set. Default: 10.",
            },
        ),
        (
            "--json",
            {"action": "store_true", "help": "Emit JSON instead of Markdown."},
        ),
        (
            "--csv",
            {"action": "store_true", "help": "Emit CSV instead of Markdown."},
        ),
        (
            "--timeout",
            {
                "type": int,
                "default": 20,
                "help": "HTTP timeout in seconds. Default: 20.",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli
72 
73 
def fetch_html(session: requests.Session, url: str, timeout: int) -> str:
    """GET *url* through *session* and return the response body as text.

    Nothing is caught here: errors raised by the request itself or by
    ``raise_for_status()`` propagate to the caller.
    """
    reply = session.get(url, timeout=timeout)
    reply.raise_for_status()
    body = reply.text
    return body
78 
79 
def normalize_url(url: str) -> str:
    """Return *url* without query or fragment and with a directory-style path.

    An empty path becomes ``/``. A path whose last segment has no file
    suffix gets a trailing slash; a path that looks like a file name
    (it has a suffix) is left as it is.
    """
    parts = urlparse(url)
    path = parts.path if parts.path else "/"
    has_suffix = PurePosixPath(path).suffix != ""
    needs_slash = not (path == "/" or path.endswith("/") or has_suffix)
    if needs_slash:
        path += "/"
    return parts._replace(path=path, query="", fragment="").geturl()
87 
88 
def same_prefix(url: str, root: str) -> bool:
    """Report whether *url* is on the same host as *root* and under its path.

    The path test respects segment boundaries: a root path of ``/patterns``
    covers ``/patterns`` and ``/patterns/x`` but not ``/patterns-old``. A
    root path that is empty or already ends in ``/`` is used as a plain
    prefix.

    Args:
        url: The URL being tested.
        root: The URL that defines the allowed host and path prefix.

    Returns:
        True when both the host and the path prefix match.
    """
    current = urlparse(url)
    base = urlparse(root)
    if current.netloc != base.netloc:
        return False

    base_path = base.path
    if not base_path or base_path.endswith("/"):
        return current.path.startswith(base_path)
    # Root without a trailing slash: accept the root itself or a real child,
    # never a sibling that merely shares the leading characters.
    return current.path == base_path or current.path.startswith(f"{base_path}/")
93 
94 
def iter_page_urls(
    session: requests.Session, start_url: str, crawl: bool, limit_pages: int, timeout: int
) -> list[str]:
    """Collect the page URLs to audit, beginning at *start_url*.

    Without *crawl* the result is just the normalized start URL. With
    *crawl*, pages are discovered breadth-first by following ``<a href>``
    links that stay on the allowed host, avoid UI paths and file-like
    targets, and share the start URL's prefix. At most *limit_pages* URLs
    are returned. A page that cannot be fetched is still listed; only its
    outgoing links are lost.
    """
    root_url = normalize_url(start_url)
    if not crawl:
        return [root_url]

    seen: set[str] = set()
    pending: deque[str] = deque((root_url,))
    found: list[str] = []

    while pending and len(found) < limit_pages:
        page = pending.popleft()
        if page in seen:
            continue
        seen.add(page)
        found.append(page)

        try:
            markup = fetch_html(session, page, timeout)
        except requests.RequestException:
            # Keep the page in the result, but there is nothing to expand.
            continue

        anchors = BeautifulSoup(markup, "html.parser").find_all("a", href=True)
        for anchor in anchors:
            candidate = normalize_url(urljoin(page, anchor["href"]))
            target = urlparse(candidate)
            followable = (
                target.netloc == ALLOWED_HOST
                and not any(part in target.path.lower() for part in UI_PATH_PARTS)
                and not PurePosixPath(target.path).suffix
                and same_prefix(candidate, root_url)
                and candidate not in seen
            )
            if followable:
                pending.append(candidate)

    return found
135 
136 
def get_page_title(soup: BeautifulSoup) -> str:
    """Return the page's title text, or an empty string when there is none.

    The first ``<h1>`` is preferred; the ``<title>`` element is the fallback.
    """
    for candidate in (soup.find("h1"), soup.title):
        if candidate:
            return normalize_space(candidate.get_text(" ", strip=True))
    return ""
144 
145 
def normalize_space(text: str | None) -> str:
    """Tidy whitespace in *text*; ``None`` and empty input give ``""``.

    Runs of whitespace collapse to one space and the ends are trimmed. Then,
    in order: whitespace before ``, . : ; ! ?`` is removed, whitespace just
    inside parentheses is removed, and a trailing ``#`` preceded by
    whitespace is dropped.
    """
    if not text:
        return ""

    # Applied in sequence after the initial collapse; order matters.
    rewrites = (
        (r"\s+([,.:;!?])", r"\1"),
        (r"\(\s+", "("),
        (r"\s+\)", ")"),
        (r"\s+#$", ""),
    )
    result = re.sub(r"\s+", " ", text).strip()
    for pattern, replacement in rewrites:
        result = re.sub(pattern, replacement, result)
    return result
155 
156 
def filename_like(alt: str) -> bool:
    """Heuristically decide whether *alt* reads like a file name, not a description.

    The text qualifies only if it consists solely of word characters,
    whitespace and hyphens. It is then considered filename-like when it
    contains a digit, or when every part between separators is at most 12
    characters long. Underscores count as separators alongside whitespace
    and hyphens, so ``snake_case_names`` are split into their words instead
    of being measured as one long part.

    Args:
        alt: The alt text to inspect.

    Returns:
        True when the text looks like a file name; False for empty text or
        text containing any other punctuation.
    """
    if not alt:
        return False
    if re.fullmatch(r"[\w\s\-]+", alt) is None:
        return False
    if any(char.isdigit() for char in alt):
        return True
    parts = re.split(r"[\s\-_]+", alt)
    return all(len(part) <= 12 for part in parts)
164 
165 
def classify_issue(img_url: str, alt: str) -> tuple[bool, str]:
    """Judge the alt text of one image.

    Returns:
        ``(needs_update, reason)``. The checks run in a fixed order and the
        first one that applies decides the reason: missing alt, generic alt,
        filename-like alt, then too short for a diagram (three words or
        fewer on a ``/diagrams/`` image). Otherwise the alt is accepted.
    """
    cleaned = normalize_space(alt)

    if not cleaned:
        verdict = (True, "missing alt")
    elif cleaned.lower() in GENERIC_ALT_WORDS:
        verdict = (True, "generic alt")
    elif filename_like(cleaned):
        verdict = (True, "filename-like alt")
    elif "/diagrams/" in img_url and len(cleaned.split()) <= 3:
        verdict = (True, "too short for a diagram")
    else:
        verdict = (False, "looks acceptable")
    return verdict
183 
184 
def is_ui_asset(img_url: str, alt: str) -> bool:
    """Tell whether an image is site chrome rather than page content.

    An image counts as a UI asset when its URL contains one of the
    ``UI_PATH_PARTS`` fragments (case-insensitively) or when its normalized,
    lowercased alt text is one of a few known UI labels.
    """
    lowered_url = img_url.lower()
    label = normalize_space(alt).lower()
    url_is_ui = any(fragment in lowered_url for fragment in UI_PATH_PARTS)
    return url_is_ui or label in {"logo", "menu", "table of contents", "+"}
193 
194 
def find_content_root(soup: BeautifulSoup) -> Tag:
    """Return the element holding the main page content.

    The selectors are tried in order and the first match wins; when none
    matches, the whole document is returned.
    """
    matches = (
        soup.select_one(css) for css in ("article", "main", ".book-page", ".content")
    )
    return next((node for node in matches if node), soup)
201 
202 
def nearest_heading(tag: Tag) -> str:
    """Return the text of the closest non-empty heading before *tag*.

    Only ``h1``-``h4`` elements are considered and at most 12 preceding
    headings are examined; ``""`` is returned when none has text.
    """
    headings = tag.find_all_previous(["h1", "h2", "h3", "h4"], limit=12)
    texts = (normalize_space(h.get_text(" ", strip=True)) for h in headings)
    return next((text for text in texts if text), "")
209 
210 
def nearby_paragraph(tag: Tag) -> str:
    """Return the text of the first non-empty ``<p>`` following *tag*.

    The search walks forward through the document and gives up, returning
    ``""``, as soon as it reaches another image or an ``h1``-``h4`` heading.
    """
    stop_names = {"img", "h1", "h2", "h3", "h4"}
    for node in tag.next_elements:
        # Text nodes (and the tag itself) carry no structure to act on.
        if node is tag or not isinstance(node, Tag):
            continue
        if node.name in stop_names:
            break
        if node.name == "p":
            text = normalize_space(node.get_text(" ", strip=True))
            if text:
                return text
    return ""
224 
225 
def analyze_page(session: requests.Session, url: str, timeout: int) -> dict:
    """Fetch one page and evaluate the alt text of its content images.

    Images are skipped when they have no ``src``, are hosted elsewhere, are
    UI assets, or are non-diagram SVGs. Images under ``/diagrams/`` are
    always reported; other images only when their alt text needs an update.

    Returns:
        A dict with the page ``url``, its ``title`` and the list of
        per-image records under ``images``.
    """
    soup = BeautifulSoup(fetch_html(session, url, timeout), "html.parser")
    root = find_content_root(soup)
    page_title = get_page_title(soup)

    findings = []
    for img in root.find_all("img"):
        src = img.get("src", "")
        alt = img.get("alt", "")
        if not src:
            continue

        img_url = urljoin(url, src)
        is_diagram = "/diagrams/" in img_url
        if urlparse(img_url).netloc != ALLOWED_HOST or is_ui_asset(img_url, alt):
            continue
        if not is_diagram and img_url.lower().endswith(".svg"):
            continue

        needs_update, issue = classify_issue(img_url, alt)
        if not (is_diagram or needs_update):
            continue

        # Key order is kept stable because it is the order JSON output uses.
        record = {
            "page_url": url,
            "page_title": page_title,
            "nearest_heading": nearest_heading(img),
            "context_paragraph": nearby_paragraph(img),
            "image_url": img_url,
            "current_alt": alt,
            "needs_update": needs_update,
            "issue": issue,
        }
        findings.append(record)

    return {"url": url, "title": page_title, "images": findings}
268 
269 
def render_markdown(pages: Iterable[dict]) -> str:
    """Render the audit results as a Markdown report.

    Each page becomes a ``##`` section headed by its title (or URL when the
    title is empty), followed by one bullet per image, or a single
    placeholder bullet when the page has no images. Surrounding whitespace
    is stripped from the final text.
    """
    out: list[str] = []
    for page in pages:
        heading = page["title"] or page["url"]
        out += [f"## {heading}", page["url"], ""]

        images = page["images"]
        if not images:
            out += ["- No candidate diagram images found.", ""]
            continue

        for image in images:
            out.append(
                f"- `needs_update={str(image['needs_update']).lower()}` | `{image['issue']}` | `{image['current_alt'] or '(empty)'}`"
            )
            out.append(f"  src: {image['image_url']}")
            # Heading and context lines are optional detail.
            nearest = image["nearest_heading"]
            if nearest:
                out.append(f"  heading: {nearest}")
            context = image["context_paragraph"]
            if context:
                out.append(f"  context: {context}")
        out.append("")
    return "\n".join(out).strip()
291 
292 
def emit_csv(pages: Iterable[dict]) -> None:
    """Write the audit results to standard output as CSV.

    A header row is written first, then one row per image across all pages.
    """
    columns = [
        "page_url",
        "page_title",
        "nearest_heading",
        "image_url",
        "current_alt",
        "needs_update",
        "issue",
        "context_paragraph",
    ]
    out = csv.DictWriter(sys.stdout, fieldnames=columns)
    out.writeheader()
    for page in pages:
        for image in page["images"]:
            row = {column: image[column] for column in columns}
            # The boolean is written in lowercase ("true" / "false").
            row["needs_update"] = str(image["needs_update"]).lower()
            out.writerow(row)
320 
321 
def main() -> int:
    """Run the audit from the command line and print the report.

    The report goes to standard output as JSON (``--json``), CSV (``--csv``)
    or Markdown (default); ``--json`` takes precedence when both are given.
    Pages that cannot be fetched are reported on standard error and skipped.

    Returns:
        0 on success, 1 when URL collection fails or when none of the
        requested pages could be fetched.
    """
    parser = build_parser()
    args = parser.parse_args()
    start_url = normalize_url(args.url)
    if urlparse(start_url).netloc != ALLOWED_HOST:
        parser.error("Only metapatterns.io URLs are supported by this skill.")

    # The context manager closes the session's connections on exit.
    with requests.Session() as session:
        session.headers.update(
            {
                "User-Agent": "metapatterns-diagram-alt-audit/1.0",
            }
        )

        try:
            page_urls = iter_page_urls(
                session=session,
                start_url=start_url,
                crawl=args.crawl,
                limit_pages=args.limit_pages,
                timeout=args.timeout,
            )
        except requests.RequestException as exc:
            print(f"error: {exc}", file=sys.stderr)
            return 1

        pages = []
        for page_url in page_urls:
            try:
                pages.append(analyze_page(session, page_url, args.timeout))
            except requests.RequestException as exc:
                # Best effort: one bad page must not abort the whole audit,
                # but the user should know it is missing from the report.
                print(f"warning: skipped {page_url}: {exc}", file=sys.stderr)

    if page_urls and not pages:
        # Every fetch failed; an empty report with exit code 0 would hide that.
        print("error: no pages could be fetched", file=sys.stderr)
        return 1

    if args.json:
        json.dump({"pages": pages}, sys.stdout, indent=2, ensure_ascii=False)
        sys.stdout.write("\n")
    elif args.csv:
        emit_csv(pages)
    else:
        print(render_markdown(pages))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
367

Marketplace

Source from bundle

Metapatterns Diagram Alt Audit

Scans metapatterns.io pages for diagram images with weak alt text and exports exact update candidates in Markdown, JSON, or CSV.

Костянтин@Latand

Files

Skill

0.6K

Size

28.6 KB

Entrypoint

SKILL.md

Format

folder

Open file

scripts/audit_diagram_alts.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code · 367 lines · Free

scripts/audit_diagram_alts.py

1#!/usr/bin/env python3
2 
3from __future__ import annotations
4 
5import argparse
6import csv
7import json
8import re
9import sys
10from collections import deque
11from pathlib import PurePosixPath
12from typing import Iterable
13from urllib.parse import urljoin, urlparse
14 
15import requests
16from bs4 import BeautifulSoup, NavigableString, Tag
17 
18 
# An alt text that is exactly one of these words is reported as "generic alt".
GENERIC_ALT_WORDS = {"diagram", "image", "figure", "graphic", "photo"}

# URL substrings used to skip site-chrome assets and links while scanning.
UI_PATH_PARTS = ("/icons/", "/favicon", "/logo", "/menu", "/toc", "/cdn-cgi/")

# Only pages and images served from this host are inspected.
ALLOWED_HOST = "metapatterns.io"
37 
38 
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the alt-text audit.

    Returns:
        A parser with one positional ``url`` argument and the optional
        ``--crawl``, ``--limit-pages``, ``--json``, ``--csv`` and
        ``--timeout`` flags.
    """
    cli = argparse.ArgumentParser(
        description="Audit metapatterns.io pages for diagram images with weak alt text."
    )
    cli.add_argument("url", help="Page URL or section root to inspect.")

    # Optional flags, registered in the order they should appear in --help.
    option_specs = (
        (
            "--crawl",
            {
                "action": "store_true",
                "help": "Follow links under the same host and path prefix.",
            },
        ),
        (
            "--limit-pages",
            {
                "type": int,
                "default": 10,
                "help": "Maximum number of pages to inspect when --crawl is set. Default: 10.",
            },
        ),
        (
            "--json",
            {"action": "store_true", "help": "Emit JSON instead of Markdown."},
        ),
        (
            "--csv",
            {"action": "store_true", "help": "Emit CSV instead of Markdown."},
        ),
        (
            "--timeout",
            {
                "type": int,
                "default": 20,
                "help": "HTTP timeout in seconds. Default: 20.",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli
72 
73 
def fetch_html(session: requests.Session, url: str, timeout: int) -> str:
    """GET *url* through *session* and return the response body as text.

    Nothing is caught here: errors raised by the request itself or by
    ``raise_for_status()`` propagate to the caller.
    """
    reply = session.get(url, timeout=timeout)
    reply.raise_for_status()
    body = reply.text
    return body
78 
79 
def normalize_url(url: str) -> str:
    """Return *url* without query or fragment and with a directory-style path.

    An empty path becomes ``/``. A path whose last segment has no file
    suffix gets a trailing slash; a path that looks like a file name
    (it has a suffix) is left as it is.
    """
    parts = urlparse(url)
    path = parts.path if parts.path else "/"
    has_suffix = PurePosixPath(path).suffix != ""
    needs_slash = not (path == "/" or path.endswith("/") or has_suffix)
    if needs_slash:
        path += "/"
    return parts._replace(path=path, query="", fragment="").geturl()
87 
88 
def same_prefix(url: str, root: str) -> bool:
    """Report whether *url* is on the same host as *root* and under its path.

    The path test respects segment boundaries: a root path of ``/patterns``
    covers ``/patterns`` and ``/patterns/x`` but not ``/patterns-old``. A
    root path that is empty or already ends in ``/`` is used as a plain
    prefix.

    Args:
        url: The URL being tested.
        root: The URL that defines the allowed host and path prefix.

    Returns:
        True when both the host and the path prefix match.
    """
    current = urlparse(url)
    base = urlparse(root)
    if current.netloc != base.netloc:
        return False

    base_path = base.path
    if not base_path or base_path.endswith("/"):
        return current.path.startswith(base_path)
    # Root without a trailing slash: accept the root itself or a real child,
    # never a sibling that merely shares the leading characters.
    return current.path == base_path or current.path.startswith(f"{base_path}/")
93 
94 
def iter_page_urls(
    session: requests.Session, start_url: str, crawl: bool, limit_pages: int, timeout: int
) -> list[str]:
    """Collect the page URLs to audit, beginning at *start_url*.

    Without *crawl* the result is just the normalized start URL. With
    *crawl*, pages are discovered breadth-first by following ``<a href>``
    links that stay on the allowed host, avoid UI paths and file-like
    targets, and share the start URL's prefix. At most *limit_pages* URLs
    are returned. A page that cannot be fetched is still listed; only its
    outgoing links are lost.
    """
    root_url = normalize_url(start_url)
    if not crawl:
        return [root_url]

    seen: set[str] = set()
    pending: deque[str] = deque((root_url,))
    found: list[str] = []

    while pending and len(found) < limit_pages:
        page = pending.popleft()
        if page in seen:
            continue
        seen.add(page)
        found.append(page)

        try:
            markup = fetch_html(session, page, timeout)
        except requests.RequestException:
            # Keep the page in the result, but there is nothing to expand.
            continue

        anchors = BeautifulSoup(markup, "html.parser").find_all("a", href=True)
        for anchor in anchors:
            candidate = normalize_url(urljoin(page, anchor["href"]))
            target = urlparse(candidate)
            followable = (
                target.netloc == ALLOWED_HOST
                and not any(part in target.path.lower() for part in UI_PATH_PARTS)
                and not PurePosixPath(target.path).suffix
                and same_prefix(candidate, root_url)
                and candidate not in seen
            )
            if followable:
                pending.append(candidate)

    return found
135 
136 
def get_page_title(soup: BeautifulSoup) -> str:
    """Return the page's title text, or an empty string when there is none.

    The first ``<h1>`` is preferred; the ``<title>`` element is the fallback.
    """
    for candidate in (soup.find("h1"), soup.title):
        if candidate:
            return normalize_space(candidate.get_text(" ", strip=True))
    return ""
144 
145 
def normalize_space(text: str | None) -> str:
    """Tidy whitespace in *text*; ``None`` and empty input give ``""``.

    Runs of whitespace collapse to one space and the ends are trimmed. Then,
    in order: whitespace before ``, . : ; ! ?`` is removed, whitespace just
    inside parentheses is removed, and a trailing ``#`` preceded by
    whitespace is dropped.
    """
    if not text:
        return ""

    # Applied in sequence after the initial collapse; order matters.
    rewrites = (
        (r"\s+([,.:;!?])", r"\1"),
        (r"\(\s+", "("),
        (r"\s+\)", ")"),
        (r"\s+#$", ""),
    )
    result = re.sub(r"\s+", " ", text).strip()
    for pattern, replacement in rewrites:
        result = re.sub(pattern, replacement, result)
    return result
155 
156 
def filename_like(alt: str) -> bool:
    """Heuristically decide whether *alt* reads like a file name, not a description.

    The text qualifies only if it consists solely of word characters,
    whitespace and hyphens. It is then considered filename-like when it
    contains a digit, or when every part between separators is at most 12
    characters long. Underscores count as separators alongside whitespace
    and hyphens, so ``snake_case_names`` are split into their words instead
    of being measured as one long part.

    Args:
        alt: The alt text to inspect.

    Returns:
        True when the text looks like a file name; False for empty text or
        text containing any other punctuation.
    """
    if not alt:
        return False
    if re.fullmatch(r"[\w\s\-]+", alt) is None:
        return False
    if any(char.isdigit() for char in alt):
        return True
    parts = re.split(r"[\s\-_]+", alt)
    return all(len(part) <= 12 for part in parts)
164 
165 
def classify_issue(img_url: str, alt: str) -> tuple[bool, str]:
    """Judge the alt text of one image.

    Returns:
        ``(needs_update, reason)``. The checks run in a fixed order and the
        first one that applies decides the reason: missing alt, generic alt,
        filename-like alt, then too short for a diagram (three words or
        fewer on a ``/diagrams/`` image). Otherwise the alt is accepted.
    """
    cleaned = normalize_space(alt)

    if not cleaned:
        verdict = (True, "missing alt")
    elif cleaned.lower() in GENERIC_ALT_WORDS:
        verdict = (True, "generic alt")
    elif filename_like(cleaned):
        verdict = (True, "filename-like alt")
    elif "/diagrams/" in img_url and len(cleaned.split()) <= 3:
        verdict = (True, "too short for a diagram")
    else:
        verdict = (False, "looks acceptable")
    return verdict
183 
184 
def is_ui_asset(img_url: str, alt: str) -> bool:
    """Tell whether an image is site chrome rather than page content.

    An image counts as a UI asset when its URL contains one of the
    ``UI_PATH_PARTS`` fragments (case-insensitively) or when its normalized,
    lowercased alt text is one of a few known UI labels.
    """
    lowered_url = img_url.lower()
    label = normalize_space(alt).lower()
    url_is_ui = any(fragment in lowered_url for fragment in UI_PATH_PARTS)
    return url_is_ui or label in {"logo", "menu", "table of contents", "+"}
193 
194 
def find_content_root(soup: BeautifulSoup) -> Tag:
    """Return the element holding the main page content.

    The selectors are tried in order and the first match wins; when none
    matches, the whole document is returned.
    """
    matches = (
        soup.select_one(css) for css in ("article", "main", ".book-page", ".content")
    )
    return next((node for node in matches if node), soup)
201 
202 
def nearest_heading(tag: Tag) -> str:
    """Return the text of the closest non-empty heading before *tag*.

    Only ``h1``-``h4`` elements are considered and at most 12 preceding
    headings are examined; ``""`` is returned when none has text.
    """
    headings = tag.find_all_previous(["h1", "h2", "h3", "h4"], limit=12)
    texts = (normalize_space(h.get_text(" ", strip=True)) for h in headings)
    return next((text for text in texts if text), "")
209 
210 
def nearby_paragraph(tag: Tag) -> str:
    """Return the text of the first non-empty ``<p>`` following *tag*.

    The search walks forward through the document and gives up, returning
    ``""``, as soon as it reaches another image or an ``h1``-``h4`` heading.
    """
    stop_names = {"img", "h1", "h2", "h3", "h4"}
    for node in tag.next_elements:
        # Text nodes (and the tag itself) carry no structure to act on.
        if node is tag or not isinstance(node, Tag):
            continue
        if node.name in stop_names:
            break
        if node.name == "p":
            text = normalize_space(node.get_text(" ", strip=True))
            if text:
                return text
    return ""
224 
225 
def analyze_page(session: requests.Session, url: str, timeout: int) -> dict:
    """Fetch one page and evaluate the alt text of its content images.

    Images are skipped when they have no ``src``, are hosted elsewhere, are
    UI assets, or are non-diagram SVGs. Images under ``/diagrams/`` are
    always reported; other images only when their alt text needs an update.

    Returns:
        A dict with the page ``url``, its ``title`` and the list of
        per-image records under ``images``.
    """
    soup = BeautifulSoup(fetch_html(session, url, timeout), "html.parser")
    root = find_content_root(soup)
    page_title = get_page_title(soup)

    findings = []
    for img in root.find_all("img"):
        src = img.get("src", "")
        alt = img.get("alt", "")
        if not src:
            continue

        img_url = urljoin(url, src)
        is_diagram = "/diagrams/" in img_url
        if urlparse(img_url).netloc != ALLOWED_HOST or is_ui_asset(img_url, alt):
            continue
        if not is_diagram and img_url.lower().endswith(".svg"):
            continue

        needs_update, issue = classify_issue(img_url, alt)
        if not (is_diagram or needs_update):
            continue

        # Key order is kept stable because it is the order JSON output uses.
        record = {
            "page_url": url,
            "page_title": page_title,
            "nearest_heading": nearest_heading(img),
            "context_paragraph": nearby_paragraph(img),
            "image_url": img_url,
            "current_alt": alt,
            "needs_update": needs_update,
            "issue": issue,
        }
        findings.append(record)

    return {"url": url, "title": page_title, "images": findings}
268 
269 
def render_markdown(pages: Iterable[dict]) -> str:
    """Render the audit results as a Markdown report.

    Each page becomes a ``##`` section headed by its title (or URL when the
    title is empty), followed by one bullet per image, or a single
    placeholder bullet when the page has no images. Surrounding whitespace
    is stripped from the final text.
    """
    out: list[str] = []
    for page in pages:
        heading = page["title"] or page["url"]
        out += [f"## {heading}", page["url"], ""]

        images = page["images"]
        if not images:
            out += ["- No candidate diagram images found.", ""]
            continue

        for image in images:
            out.append(
                f"- `needs_update={str(image['needs_update']).lower()}` | `{image['issue']}` | `{image['current_alt'] or '(empty)'}`"
            )
            out.append(f"  src: {image['image_url']}")
            # Heading and context lines are optional detail.
            nearest = image["nearest_heading"]
            if nearest:
                out.append(f"  heading: {nearest}")
            context = image["context_paragraph"]
            if context:
                out.append(f"  context: {context}")
        out.append("")
    return "\n".join(out).strip()
291 
292 
def emit_csv(pages: Iterable[dict]) -> None:
    """Write the audit results to standard output as CSV.

    A header row is written first, then one row per image across all pages.
    """
    columns = [
        "page_url",
        "page_title",
        "nearest_heading",
        "image_url",
        "current_alt",
        "needs_update",
        "issue",
        "context_paragraph",
    ]
    out = csv.DictWriter(sys.stdout, fieldnames=columns)
    out.writeheader()
    for page in pages:
        for image in page["images"]:
            row = {column: image[column] for column in columns}
            # The boolean is written in lowercase ("true" / "false").
            row["needs_update"] = str(image["needs_update"]).lower()
            out.writerow(row)
320 
321 
def main() -> int:
    """Run the audit from the command line and print the report.

    The report goes to standard output as JSON (``--json``), CSV (``--csv``)
    or Markdown (default); ``--json`` takes precedence when both are given.
    Pages that cannot be fetched are reported on standard error and skipped.

    Returns:
        0 on success, 1 when URL collection fails or when none of the
        requested pages could be fetched.
    """
    parser = build_parser()
    args = parser.parse_args()
    start_url = normalize_url(args.url)
    if urlparse(start_url).netloc != ALLOWED_HOST:
        parser.error("Only metapatterns.io URLs are supported by this skill.")

    # The context manager closes the session's connections on exit.
    with requests.Session() as session:
        session.headers.update(
            {
                "User-Agent": "metapatterns-diagram-alt-audit/1.0",
            }
        )

        try:
            page_urls = iter_page_urls(
                session=session,
                start_url=start_url,
                crawl=args.crawl,
                limit_pages=args.limit_pages,
                timeout=args.timeout,
            )
        except requests.RequestException as exc:
            print(f"error: {exc}", file=sys.stderr)
            return 1

        pages = []
        for page_url in page_urls:
            try:
                pages.append(analyze_page(session, page_url, args.timeout))
            except requests.RequestException as exc:
                # Best effort: one bad page must not abort the whole audit,
                # but the user should know it is missing from the report.
                print(f"warning: skipped {page_url}: {exc}", file=sys.stderr)

    if page_urls and not pages:
        # Every fetch failed; an empty report with exit code 0 would hide that.
        print("error: no pages could be fetched", file=sys.stderr)
        return 1

    if args.json:
        json.dump({"pages": pages}, sys.stdout, indent=2, ensure_ascii=False)
        sys.stdout.write("\n")
    elif args.csv:
        emit_csv(pages)
    else:
        print(render_markdown(pages))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
367

Metapatterns Diagram Alt Audit

scripts/audit_diagram_alts.py

Preparing the source view

Metapatterns Diagram Alt Audit

scripts/audit_diagram_alts.py