Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Create, test, and iteratively improve Claude skills with eval benchmarks and description optimization
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/generate_report.py
#!/usr/bin/env python3
"""Generate an HTML report from run_loop.py output.

Takes the JSON output from run_loop.py and generates a visual HTML report
showing each description attempt with check/x for each test case.
Distinguishes between train and test queries.
"""

import argparse
import html
import json
import sys
from pathlib import Path


def _aggregate_runs(results: list[dict]) -> tuple[int, int]:
    """Return (correct_runs, total_runs) aggregated across all retries.

    A run counts as correct when its trigger outcome matches the query's
    ``should_trigger`` expectation (missing ``should_trigger`` defaults
    to True, matching the rest of this script).
    """
    correct = 0
    total = 0
    for r in results:
        runs = r.get("runs", 0)
        triggers = r.get("triggers", 0)
        total += runs
        if r.get("should_trigger", True):
            correct += triggers
        else:
            # Negative query: a run is correct when it did NOT trigger.
            correct += runs - triggers
    return correct, total


def _score_class(correct: int, total: int) -> str:
    """Map a correct/total ratio to a CSS badge class.

    Returns an empty string when total == 0; the previous implicit
    fall-through returned None, which rendered the literal text "None"
    as a class attribute in the generated HTML.
    """
    if total <= 0:
        return ""
    ratio = correct / total
    if ratio >= 0.8:
        return "score-good"
    if ratio >= 0.5:
        return "score-ok"
    return "score-bad"


def _find_best_iteration(history: list[dict], has_test: bool):
    """Return the iteration number with the best score, or None if history is empty.

    Prefers held-out test score when test queries exist; falls back to the
    train score (older outputs store it under "passed").
    """
    if not history:
        # Guard: max() on an empty sequence raises ValueError, which would
        # crash auto-refresh reports generated before the first iteration.
        return None
    if has_test:
        return max(history, key=lambda h: h.get("test_passed") or 0).get("iteration")
    return max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration")


def _result_cell(r: dict, extra_class: str = "") -> str:
    """Render one per-query result <td> (check/cross plus trigger rate)."""
    did_pass = r.get("pass", False)
    icon = "✓" if did_pass else "✗"
    css_class = "pass" if did_pass else "fail"
    triggers = r.get("triggers", 0)
    runs = r.get("runs", 0)
    return (
        f'      <td class="result {extra_class}{css_class}">{icon}'
        f'<span class="rate">{triggers}/{runs}</span></td>\n'
    )


def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str:
    """Generate an HTML report from loop output data.

    Args:
        data: Parsed JSON produced by run_loop.py.
        auto_refresh: When True, adds a meta refresh tag so the page
            reloads every 5 seconds while the loop is still running.
        skill_name: Optional skill name prepended to the page title.

    Returns:
        The complete HTML document as a string.
    """
    history = data.get("history", [])
    title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else ""

    # Collect the unique train/test queries (with should_trigger polarity)
    # from the first iteration; older outputs stored train results under
    # "results" and may omit "test_results" entirely.
    train_queries: list[dict] = []
    test_queries: list[dict] = []
    if history:
        for r in history[0].get("train_results", history[0].get("results", [])):
            train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})
        for r in history[0].get("test_results") or []:
            test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})

    refresh_tag = '  <meta http-equiv="refresh" content="5">\n' if auto_refresh else ""

    html_parts = ["""<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
""" + refresh_tag + """  <title>""" + title_prefix + """Skill Description Optimization</title>
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
  <style>
    body {
      font-family: 'Lora', Georgia, serif;
      max-width: 100%;
      margin: 0 auto;
      padding: 20px;
      background: #faf9f5;
      color: #141413;
    }
    h1 { font-family: 'Poppins', sans-serif; color: #141413; }
    .explainer {
      background: white;
      padding: 15px;
      border-radius: 6px;
      margin-bottom: 20px;
      border: 1px solid #e8e6dc;
      color: #b0aea5;
      font-size: 0.875rem;
      line-height: 1.6;
    }
    .summary {
      background: white;
      padding: 15px;
      border-radius: 6px;
      margin-bottom: 20px;
      border: 1px solid #e8e6dc;
    }
    .summary p { margin: 5px 0; }
    .best { color: #788c5d; font-weight: bold; }
    .table-container {
      overflow-x: auto;
      width: 100%;
    }
    table {
      border-collapse: collapse;
      background: white;
      border: 1px solid #e8e6dc;
      border-radius: 6px;
      font-size: 12px;
      min-width: 100%;
    }
    th, td {
      padding: 8px;
      text-align: left;
      border: 1px solid #e8e6dc;
      white-space: normal;
      word-wrap: break-word;
    }
    th {
      font-family: 'Poppins', sans-serif;
      background: #141413;
      color: #faf9f5;
      font-weight: 500;
    }
    th.test-col {
      background: #6a9bcc;
    }
    th.query-col { min-width: 200px; }
    td.description {
      font-family: monospace;
      font-size: 11px;
      word-wrap: break-word;
      max-width: 400px;
    }
    td.result {
      text-align: center;
      font-size: 16px;
      min-width: 40px;
    }
    td.test-result {
      background: #f0f6fc;
    }
    .pass { color: #788c5d; }
    .fail { color: #c44; }
    .rate {
      font-size: 9px;
      color: #b0aea5;
      display: block;
    }
    tr:hover { background: #faf9f5; }
    .score {
      display: inline-block;
      padding: 2px 6px;
      border-radius: 4px;
      font-weight: bold;
      font-size: 11px;
    }
    .score-good { background: #eef2e8; color: #788c5d; }
    .score-ok { background: #fef3c7; color: #d97706; }
    .score-bad { background: #fceaea; color: #c44; }
    .train-label { color: #b0aea5; font-size: 10px; }
    .test-label { color: #6a9bcc; font-size: 10px; font-weight: bold; }
    .best-row { background: #f5f8f2; }
    th.positive-col { border-bottom: 3px solid #788c5d; }
    th.negative-col { border-bottom: 3px solid #c44; }
    th.test-col.positive-col { border-bottom: 3px solid #788c5d; }
    th.test-col.negative-col { border-bottom: 3px solid #c44; }
    .legend { font-family: 'Poppins', sans-serif; display: flex; gap: 20px; margin-bottom: 10px; font-size: 13px; align-items: center; }
    .legend-item { display: flex; align-items: center; gap: 6px; }
    .legend-swatch { width: 16px; height: 16px; border-radius: 3px; display: inline-block; }
    .swatch-positive { background: #141413; border-bottom: 3px solid #788c5d; }
    .swatch-negative { background: #141413; border-bottom: 3px solid #c44; }
    .swatch-test { background: #6a9bcc; }
    .swatch-train { background: #141413; }
  </style>
</head>
<body>
  <h1>""" + title_prefix + """Skill Description Optimization</h1>
  <div class="explainer">
    <strong>Optimizing your skill's description.</strong> This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill.
  </div>
"""]

    # Summary section
    best_test_score = data.get('best_test_score')
    html_parts.append(f"""
  <div class="summary">
    <p><strong>Original:</strong> {html.escape(data.get('original_description', 'N/A'))}</p>
    <p class="best"><strong>Best:</strong> {html.escape(data.get('best_description', 'N/A'))}</p>
    <p><strong>Best Score:</strong> {data.get('best_score', 'N/A')} {'(test)' if best_test_score else '(train)'}</p>
    <p><strong>Iterations:</strong> {data.get('iterations_run', 0)} | <strong>Train:</strong> {data.get('train_size', '?')} | <strong>Test:</strong> {data.get('test_size', '?')}</p>
  </div>
""")

    # Legend
    html_parts.append("""
  <div class="legend">
    <span style="font-weight:600">Query columns:</span>
    <span class="legend-item"><span class="legend-swatch swatch-positive"></span> Should trigger</span>
    <span class="legend-item"><span class="legend-swatch swatch-negative"></span> Should NOT trigger</span>
    <span class="legend-item"><span class="legend-swatch swatch-train"></span> Train</span>
    <span class="legend-item"><span class="legend-swatch swatch-test"></span> Test</span>
  </div>
""")

    # Table header
    html_parts.append("""
  <div class="table-container">
  <table>
    <thead>
      <tr>
        <th>Iter</th>
        <th>Train</th>
        <th>Test</th>
        <th class="query-col">Description</th>
""")

    # Column headers for train queries, then test queries (different color).
    for qinfo in train_queries:
        polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f'        <th class="{polarity}">{html.escape(qinfo["query"])}</th>\n')
    for qinfo in test_queries:
        polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f'        <th class="test-col {polarity}">{html.escape(qinfo["query"])}</th>\n')

    html_parts.append("""      </tr>
    </thead>
    <tbody>
""")

    # Find best iteration for highlighting (None when history is empty).
    best_iter = _find_best_iteration(history, bool(test_queries))

    # Add rows for each iteration
    for h in history:
        iteration = h.get("iteration", "?")
        description = h.get("description", "")
        train_results = h.get("train_results", h.get("results", []))
        test_results = h.get("test_results", [])

        # Lookups for per-query result cells.
        train_by_query = {r["query"]: r for r in train_results}
        test_by_query = {r["query"]: r for r in test_results} if test_results else {}

        # Aggregate correct/total runs across all retries.
        train_correct, train_runs = _aggregate_runs(train_results)
        test_correct, test_runs = _aggregate_runs(test_results)

        train_class = _score_class(train_correct, train_runs)
        test_class = _score_class(test_correct, test_runs)

        row_class = "best-row" if iteration == best_iter else ""

        html_parts.append(f"""      <tr class="{row_class}">
        <td>{iteration}</td>
        <td><span class="score {train_class}">{train_correct}/{train_runs}</span></td>
        <td><span class="score {test_class}">{test_correct}/{test_runs}</span></td>
        <td class="description">{html.escape(description)}</td>
""")

        # Result cell per train query, then per test query (shaded background).
        for qinfo in train_queries:
            html_parts.append(_result_cell(train_by_query.get(qinfo["query"], {})))
        for qinfo in test_queries:
            html_parts.append(_result_cell(test_by_query.get(qinfo["query"], {}), "test-result "))

        html_parts.append("      </tr>\n")

    html_parts.append("""    </tbody>
  </table>
  </div>
""")

    html_parts.append("""
</body>
</html>
""")

    return "".join(html_parts)


def main():
    """CLI entry point: read run_loop.py JSON and emit the HTML report."""
    parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output")
    parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)")
    parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)")
    parser.add_argument("--skill-name", default="", help="Skill name to include in the report title")
    # New, backward-compatible flag: generate_html already supported
    # auto_refresh but the CLI had no way to request it.
    parser.add_argument("--auto-refresh", action="store_true",
                        help="Add a meta refresh tag so the page reloads while the loop runs")
    args = parser.parse_args()

    if args.input == "-":
        data = json.load(sys.stdin)
    else:
        data = json.loads(Path(args.input).read_text())

    html_output = generate_html(data, auto_refresh=args.auto_refresh, skill_name=args.skill_name)

    if args.output:
        Path(args.output).write_text(html_output)
        print(f"Report written to {args.output}", file=sys.stderr)
    else:
        print(html_output)


if __name__ == "__main__":
    main()