Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Create, test, and iteratively improve Claude skills with eval benchmarks and description optimization
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/generate_report.py
#!/usr/bin/env python3
"""Generate an HTML report from run_loop.py output.

Takes the JSON output from run_loop.py and generates a visual HTML report
showing each description attempt with check/x for each test case.
Distinguishes between train and test queries.
"""

import argparse
import html
import json
import sys
from pathlib import Path


def _aggregate_runs(results: list[dict]) -> tuple[int, int]:
    """Return (correct_runs, total_runs) aggregated across all retries.

    A run counts as correct when its trigger outcome matches the query's
    ``should_trigger`` expectation (missing ``should_trigger`` defaults
    to True, matching the rest of this script).
    """
    correct = 0
    total = 0
    for r in results:
        runs = r.get("runs", 0)
        triggers = r.get("triggers", 0)
        total += runs
        if r.get("should_trigger", True):
            correct += triggers
        else:
            # Negative query: a run is correct when it did NOT trigger.
            correct += runs - triggers
    return correct, total


def _score_class(correct: int, total: int) -> str:
    """Map a correct/total ratio to a CSS badge class.

    Returns an empty string when total == 0; the previous implicit
    fall-through returned None, which rendered the literal text "None"
    as a class attribute in the generated HTML.
    """
    if total <= 0:
        return ""
    ratio = correct / total
    if ratio >= 0.8:
        return "score-good"
    if ratio >= 0.5:
        return "score-ok"
    return "score-bad"


def _find_best_iteration(history: list[dict], has_test: bool):
    """Return the iteration number with the best score, or None if history is empty.

    Prefers held-out test score when test queries exist; falls back to the
    train score (older outputs store it under "passed").
    """
    if not history:
        # Guard: max() on an empty sequence raises ValueError, which would
        # crash auto-refresh reports generated before the first iteration.
        return None
    if has_test:
        return max(history, key=lambda h: h.get("test_passed") or 0).get("iteration")
    return max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration")


def _result_cell(r: dict, extra_class: str = "") -> str:
    """Render one per-query result <td> (check/cross plus trigger rate)."""
    did_pass = r.get("pass", False)
    icon = "✓" if did_pass else "✗"
    css_class = "pass" if did_pass else "fail"
    triggers = r.get("triggers", 0)
    runs = r.get("runs", 0)
    return (
        f'      <td class="result {extra_class}{css_class}">{icon}'
        f'<span class="rate">{triggers}/{runs}</span></td>\n'
    )


def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str:
    """Generate an HTML report from loop output data.

    Args:
        data: Parsed JSON produced by run_loop.py.
        auto_refresh: When True, adds a meta refresh tag so the page
            reloads every 5 seconds while the loop is still running.
        skill_name: Optional skill name prepended to the page title.

    Returns:
        The complete HTML document as a string.
    """
    history = data.get("history", [])
    title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else ""

    # Collect the unique train/test queries (with should_trigger polarity)
    # from the first iteration; older outputs stored train results under
    # "results" and may omit "test_results" entirely.
    train_queries: list[dict] = []
    test_queries: list[dict] = []
    if history:
        for r in history[0].get("train_results", history[0].get("results", [])):
            train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})
        for r in history[0].get("test_results") or []:
            test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})

    refresh_tag = '  <meta http-equiv="refresh" content="5">\n' if auto_refresh else ""

    html_parts = ["""<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
""" + refresh_tag + """  <title>""" + title_prefix + """Skill Description Optimization</title>
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
  <style>
    body {
      font-family: 'Lora', Georgia, serif;
      max-width: 100%;
      margin: 0 auto;
      padding: 20px;
      background: #faf9f5;
      color: #141413;
    }
    h1 { font-family: 'Poppins', sans-serif; color: #141413; }
    .explainer {
      background: white;
      padding: 15px;
      border-radius: 6px;
      margin-bottom: 20px;
      border: 1px solid #e8e6dc;
      color: #b0aea5;
      font-size: 0.875rem;
      line-height: 1.6;
    }
    .summary {
      background: white;
      padding: 15px;
      border-radius: 6px;
      margin-bottom: 20px;
      border: 1px solid #e8e6dc;
    }
    .summary p { margin: 5px 0; }
    .best { color: #788c5d; font-weight: bold; }
    .table-container {
      overflow-x: auto;
      width: 100%;
    }
    table {
      border-collapse: collapse;
      background: white;
      border: 1px solid #e8e6dc;
      border-radius: 6px;
      font-size: 12px;
      min-width: 100%;
    }
    th, td {
      padding: 8px;
      text-align: left;
      border: 1px solid #e8e6dc;
      white-space: normal;
      word-wrap: break-word;
    }
    th {
      font-family: 'Poppins', sans-serif;
      background: #141413;
      color: #faf9f5;
      font-weight: 500;
    }
    th.test-col {
      background: #6a9bcc;
    }
    th.query-col { min-width: 200px; }
    td.description {
      font-family: monospace;
      font-size: 11px;
      word-wrap: break-word;
      max-width: 400px;
    }
    td.result {
      text-align: center;
      font-size: 16px;
      min-width: 40px;
    }
    td.test-result {
      background: #f0f6fc;
    }
    .pass { color: #788c5d; }
    .fail { color: #c44; }
    .rate {
      font-size: 9px;
      color: #b0aea5;
      display: block;
    }
    tr:hover { background: #faf9f5; }
    .score {
      display: inline-block;
      padding: 2px 6px;
      border-radius: 4px;
      font-weight: bold;
      font-size: 11px;
    }
    .score-good { background: #eef2e8; color: #788c5d; }
    .score-ok { background: #fef3c7; color: #d97706; }
    .score-bad { background: #fceaea; color: #c44; }
    .train-label { color: #b0aea5; font-size: 10px; }
    .test-label { color: #6a9bcc; font-size: 10px; font-weight: bold; }
    .best-row { background: #f5f8f2; }
    th.positive-col { border-bottom: 3px solid #788c5d; }
    th.negative-col { border-bottom: 3px solid #c44; }
    th.test-col.positive-col { border-bottom: 3px solid #788c5d; }
    th.test-col.negative-col { border-bottom: 3px solid #c44; }
    .legend { font-family: 'Poppins', sans-serif; display: flex; gap: 20px; margin-bottom: 10px; font-size: 13px; align-items: center; }
    .legend-item { display: flex; align-items: center; gap: 6px; }
    .legend-swatch { width: 16px; height: 16px; border-radius: 3px; display: inline-block; }
    .swatch-positive { background: #141413; border-bottom: 3px solid #788c5d; }
    .swatch-negative { background: #141413; border-bottom: 3px solid #c44; }
    .swatch-test { background: #6a9bcc; }
    .swatch-train { background: #141413; }
  </style>
</head>
<body>
  <h1>""" + title_prefix + """Skill Description Optimization</h1>
  <div class="explainer">
    <strong>Optimizing your skill's description.</strong> This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill.
  </div>
"""]

    # Summary section
    best_test_score = data.get('best_test_score')
    html_parts.append(f"""
  <div class="summary">
    <p><strong>Original:</strong> {html.escape(data.get('original_description', 'N/A'))}</p>
    <p class="best"><strong>Best:</strong> {html.escape(data.get('best_description', 'N/A'))}</p>
    <p><strong>Best Score:</strong> {data.get('best_score', 'N/A')} {'(test)' if best_test_score else '(train)'}</p>
    <p><strong>Iterations:</strong> {data.get('iterations_run', 0)} | <strong>Train:</strong> {data.get('train_size', '?')} | <strong>Test:</strong> {data.get('test_size', '?')}</p>
  </div>
""")

    # Legend
    html_parts.append("""
  <div class="legend">
    <span style="font-weight:600">Query columns:</span>
    <span class="legend-item"><span class="legend-swatch swatch-positive"></span> Should trigger</span>
    <span class="legend-item"><span class="legend-swatch swatch-negative"></span> Should NOT trigger</span>
    <span class="legend-item"><span class="legend-swatch swatch-train"></span> Train</span>
    <span class="legend-item"><span class="legend-swatch swatch-test"></span> Test</span>
  </div>
""")

    # Table header
    html_parts.append("""
  <div class="table-container">
  <table>
    <thead>
      <tr>
        <th>Iter</th>
        <th>Train</th>
        <th>Test</th>
        <th class="query-col">Description</th>
""")

    # Column headers for train queries, then test queries (different color).
    for qinfo in train_queries:
        polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f'        <th class="{polarity}">{html.escape(qinfo["query"])}</th>\n')
    for qinfo in test_queries:
        polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f'        <th class="test-col {polarity}">{html.escape(qinfo["query"])}</th>\n')

    html_parts.append("""      </tr>
    </thead>
    <tbody>
""")

    # Find best iteration for highlighting (None when history is empty).
    best_iter = _find_best_iteration(history, bool(test_queries))

    # Add rows for each iteration
    for h in history:
        iteration = h.get("iteration", "?")
        description = h.get("description", "")
        train_results = h.get("train_results", h.get("results", []))
        test_results = h.get("test_results", [])

        # Lookups for per-query result cells.
        train_by_query = {r["query"]: r for r in train_results}
        test_by_query = {r["query"]: r for r in test_results} if test_results else {}

        # Aggregate correct/total runs across all retries.
        train_correct, train_runs = _aggregate_runs(train_results)
        test_correct, test_runs = _aggregate_runs(test_results)

        train_class = _score_class(train_correct, train_runs)
        test_class = _score_class(test_correct, test_runs)

        row_class = "best-row" if iteration == best_iter else ""

        html_parts.append(f"""      <tr class="{row_class}">
        <td>{iteration}</td>
        <td><span class="score {train_class}">{train_correct}/{train_runs}</span></td>
        <td><span class="score {test_class}">{test_correct}/{test_runs}</span></td>
        <td class="description">{html.escape(description)}</td>
""")

        # Result cell per train query, then per test query (shaded background).
        for qinfo in train_queries:
            html_parts.append(_result_cell(train_by_query.get(qinfo["query"], {})))
        for qinfo in test_queries:
            html_parts.append(_result_cell(test_by_query.get(qinfo["query"], {}), "test-result "))

        html_parts.append("      </tr>\n")

    html_parts.append("""    </tbody>
  </table>
  </div>
""")

    html_parts.append("""
</body>
</html>
""")

    return "".join(html_parts)


def main():
    """CLI entry point: read run_loop.py JSON and emit the HTML report."""
    parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output")
    parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)")
    parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)")
    parser.add_argument("--skill-name", default="", help="Skill name to include in the report title")
    # New, backward-compatible flag: generate_html already supported
    # auto_refresh but the CLI had no way to request it.
    parser.add_argument("--auto-refresh", action="store_true",
                        help="Add a meta refresh tag so the page reloads while the loop runs")
    args = parser.parse_args()

    if args.input == "-":
        data = json.load(sys.stdin)
    else:
        data = json.loads(Path(args.input).read_text())

    html_output = generate_html(data, auto_refresh=args.auto_refresh, skill_name=args.skill_name)

    if args.output:
        Path(args.output).write_text(html_output)
        print(f"Report written to {args.output}", file=sys.stderr)
    else:
        print(html_output)


if __name__ == "__main__":
    main()