Source from repo
Create, test, and iteratively improve Claude skills with eval benchmarks and description optimization
scripts/aggregate_benchmark.py
#!/usr/bin/env python3
"""
Aggregate individual run results into benchmark summary statistics.

Reads grading.json files from run directories and produces:
- run_summary with mean, stddev, min, max for each metric
- delta between with_skill and without_skill configurations

Usage:
    python aggregate_benchmark.py <benchmark_dir>

Example:
    python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/

The script supports two directory layouts:

Workspace layout (from skill-creator iterations):
    <benchmark_dir>/
    └── eval-N/
        ├── with_skill/
        │   ├── run-1/grading.json
        │   └── run-2/grading.json
        └── without_skill/
            ├── run-1/grading.json
            └── run-2/grading.json

Legacy layout (with runs/ subdirectory):
    <benchmark_dir>/
    └── runs/
        └── eval-N/
            ├── with_skill/
            │   └── run-1/grading.json
            └── without_skill/
                └── run-1/grading.json
"""

import argparse
import json
import math
import sys
from datetime import datetime, timezone
from pathlib import Path


def calculate_stats(values: list[float]) -> dict:
    """Calculate mean, stddev, min, max for a list of values."""
    if not values:
        return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}

    n = len(values)
    mean = sum(values) / n

    if n > 1:
        variance = sum((x - mean) ** 2 for x in values) / (n - 1)
        stddev = math.sqrt(variance)
    else:
        stddev = 0.0

    return {
        "mean": round(mean, 4),
        "stddev": round(stddev, 4),
        "min": round(min(values), 4),
        "max": round(max(values), 4)
    }


def load_run_results(benchmark_dir: Path) -> dict:
    """
    Load all run results from a benchmark directory.

    Returns dict keyed by config name (e.g. "with_skill"/"without_skill",
    or "new_skill"/"old_skill"), each containing a list of run results.
    """
    # Support both layouts: eval dirs directly under benchmark_dir, or under runs/
    runs_dir = benchmark_dir / "runs"
    if runs_dir.exists():
        search_dir = runs_dir
    elif list(benchmark_dir.glob("eval-*")):
        search_dir = benchmark_dir
    else:
        print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}")
        return {}

    results: dict[str, list] = {}

    for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))):
        metadata_path = eval_dir / "eval_metadata.json"
        if metadata_path.exists():
            try:
                with open(metadata_path) as mf:
                    eval_id = json.load(mf).get("eval_id", eval_idx)
            except (json.JSONDecodeError, OSError):
                eval_id = eval_idx
        else:
            try:
                eval_id = int(eval_dir.name.split("-")[1])
            except ValueError:
                eval_id = eval_idx

        # Discover config directories dynamically rather than hardcoding names
        for config_dir in sorted(eval_dir.iterdir()):
            if not config_dir.is_dir():
                continue
            # Skip non-config directories (inputs, outputs, etc.)
            if not list(config_dir.glob("run-*")):
                continue
            config = config_dir.name
            if config not in results:
                results[config] = []

            for run_dir in sorted(config_dir.glob("run-*")):
                run_number = int(run_dir.name.split("-")[1])
                grading_file = run_dir / "grading.json"

                if not grading_file.exists():
                    print(f"Warning: grading.json not found in {run_dir}")
                    continue

                try:
                    with open(grading_file) as f:
                        grading = json.load(f)
                except json.JSONDecodeError as e:
                    print(f"Warning: Invalid JSON in {grading_file}: {e}")
                    continue

                # Extract metrics
                result = {
                    "eval_id": eval_id,
                    "run_number": run_number,
                    "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0),
                    "passed": grading.get("summary", {}).get("passed", 0),
                    "failed": grading.get("summary", {}).get("failed", 0),
                    "total": grading.get("summary", {}).get("total", 0),
                }

                # Extract timing — check grading.json first, then sibling timing.json
                timing = grading.get("timing", {})
                result["time_seconds"] = timing.get("total_duration_seconds", 0.0)
                timing_file = run_dir / "timing.json"
                if result["time_seconds"] == 0.0 and timing_file.exists():
                    try:
                        with open(timing_file) as tf:
                            timing_data = json.load(tf)
                            result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0)
                            result["tokens"] = timing_data.get("total_tokens", 0)
                    except json.JSONDecodeError:
                        pass

                # Extract metrics if available
                metrics = grading.get("execution_metrics", {})
                result["tool_calls"] = metrics.get("total_tool_calls", 0)
                if not result.get("tokens"):
                    result["tokens"] = metrics.get("output_chars", 0)
                result["errors"] = metrics.get("errors_encountered", 0)

                # Extract expectations — viewer requires fields: text, passed, evidence
                raw_expectations = grading.get("expectations", [])
                for exp in raw_expectations:
                    if "text" not in exp or "passed" not in exp:
                        print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}")
                result["expectations"] = raw_expectations

                # Extract notes from user_notes_summary
                notes_summary = grading.get("user_notes_summary", {})
                notes = []
                notes.extend(notes_summary.get("uncertainties", []))
                notes.extend(notes_summary.get("needs_review", []))
                notes.extend(notes_summary.get("workarounds", []))
                result["notes"] = notes

                results[config].append(result)

    return results


def aggregate_results(results: dict) -> dict:
    """
    Aggregate run results into summary statistics.

    Returns run_summary with stats for each configuration and delta.
    """
    run_summary = {}
    configs = list(results.keys())

    for config in configs:
        runs = results.get(config, [])

        if not runs:
            run_summary[config] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            }
            continue

        pass_rates = [r["pass_rate"] for r in runs]
        times = [r["time_seconds"] for r in runs]
        tokens = [r.get("tokens", 0) for r in runs]

        run_summary[config] = {
            "pass_rate": calculate_stats(pass_rates),
            "time_seconds": calculate_stats(times),
            "tokens": calculate_stats(tokens)
        }

    # Calculate delta between the first two configs (if two exist)
    if len(configs) >= 2:
        primary = run_summary.get(configs[0], {})
        baseline = run_summary.get(configs[1], {})
    else:
        primary = run_summary.get(configs[0], {}) if configs else {}
        baseline = {}

    delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0)
    delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0)
    delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0)

    run_summary["delta"] = {
        "pass_rate": f"{delta_pass_rate:+.2f}",
        "time_seconds": f"{delta_time:+.1f}",
        "tokens": f"{delta_tokens:+.0f}"
    }

    return run_summary


def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
    """
    Generate complete benchmark.json from run results.
    """
    results = load_run_results(benchmark_dir)
    run_summary = aggregate_results(results)

    # Build runs array for benchmark.json
    runs = []
    for config in results:
        for result in results[config]:
            runs.append({
                "eval_id": result["eval_id"],
                "configuration": config,
                "run_number": result["run_number"],
                "result": {
                    "pass_rate": result["pass_rate"],
                    "passed": result["passed"],
                    "failed": result["failed"],
                    "total": result["total"],
                    "time_seconds": result["time_seconds"],
                    "tokens": result.get("tokens", 0),
                    "tool_calls": result.get("tool_calls", 0),
                    "errors": result.get("errors", 0)
                },
                "expectations": result["expectations"],
                "notes": result["notes"]
            })

    # Determine eval IDs from results
    eval_ids = sorted(set(
        r["eval_id"]
        for config in results.values()
        for r in config
    ))

    benchmark = {
        "metadata": {
            "skill_name": skill_name or "<skill-name>",
            "skill_path": skill_path or "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            "runs_per_configuration": 3
        },
        "runs": runs,
        "run_summary": run_summary,
        "notes": []  # To be filled by analyzer
    }

    return benchmark


def generate_markdown(benchmark: dict) -> str:
    """Generate human-readable benchmark.md from benchmark data."""
    metadata = benchmark["metadata"]
    run_summary = benchmark["run_summary"]

    # Determine config names (excluding "delta")
    configs = [k for k in run_summary if k != "delta"]
    config_a = configs[0] if len(configs) >= 1 else "config_a"
    config_b = configs[1] if len(configs) >= 2 else "config_b"
    label_a = config_a.replace("_", " ").title()
    label_b = config_b.replace("_", " ").title()

    lines = [
        f"# Skill Benchmark: {metadata['skill_name']}",
        "",
        f"**Model**: {metadata['executor_model']}",
        f"**Date**: {metadata['timestamp']}",
        f"**Evals**: {', '.join(map(str, metadata['evals_run']))} ({metadata['runs_per_configuration']} runs each per configuration)",
        "",
        "## Summary",
        "",
        f"| Metric | {label_a} | {label_b} | Delta |",
        "|--------|------------|---------------|-------|",
    ]

    a_summary = run_summary.get(config_a, {})
    b_summary = run_summary.get(config_b, {})
    delta = run_summary.get("delta", {})

    # Format pass rate
    a_pr = a_summary.get("pass_rate", {})
    b_pr = b_summary.get("pass_rate", {})
    lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |")

    # Format time
    a_time = a_summary.get("time_seconds", {})
    b_time = b_summary.get("time_seconds", {})
    lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |")

    # Format tokens
    a_tokens = a_summary.get("tokens", {})
    b_tokens = b_summary.get("tokens", {})
    lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |")

    # Notes section
    if benchmark.get("notes"):
        lines.extend([
            "",
            "## Notes",
            ""
        ])
        for note in benchmark["notes"]:
            lines.append(f"- {note}")

    return "\n".join(lines)


def main():
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark run results into summary statistics"
    )
    parser.add_argument(
        "benchmark_dir",
        type=Path,
        help="Path to the benchmark directory"
    )
    parser.add_argument(
        "--skill-name",
        default="",
        help="Name of the skill being benchmarked"
    )
    parser.add_argument(
        "--skill-path",
        default="",
        help="Path to the skill being benchmarked"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)"
    )

    args = parser.parse_args()

    if not args.benchmark_dir.exists():
        print(f"Directory not found: {args.benchmark_dir}")
        sys.exit(1)

    # Generate benchmark
    benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)

    # Determine output paths
    output_json = args.output or (args.benchmark_dir / "benchmark.json")
    output_md = output_json.with_suffix(".md")

    # Write benchmark.json
    with open(output_json, "w") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Generated: {output_json}")

    # Write benchmark.md
    markdown = generate_markdown(benchmark)
    with open(output_md, "w") as f:
        f.write(markdown)
    print(f"Generated: {output_md}")

    # Print summary
    run_summary = benchmark["run_summary"]
    configs = [k for k in run_summary if k != "delta"]
    delta = run_summary.get("delta", {})

    print("\nSummary:")
    for config in configs:
        pr = run_summary[config]["pass_rate"]["mean"]
        label = config.replace("_", " ").title()
        print(f" {label}: {pr*100:.1f}% pass rate")
    print(f" Delta: {delta.get('pass_rate', '—')}")


if __name__ == "__main__":
    main()
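
Example usage (not part of the file above): a minimal sketch of the per-run grading.json this script consumes, followed by a programmatic call into the aggregator. The field names mirror the keys read in load_run_results; the directory names, metric values, and the assumption that aggregate_benchmark is importable as a module are illustrative only.

# Hypothetical example: write one run's grading.json, then aggregate it.
# Assumes scripts/aggregate_benchmark.py is importable from the current directory.
import json
from pathlib import Path

from aggregate_benchmark import generate_benchmark, generate_markdown

run_dir = Path("benchmarks/example/eval-1/with_skill/run-1")  # invented layout
run_dir.mkdir(parents=True, exist_ok=True)

grading = {
    "summary": {"pass_rate": 0.75, "passed": 3, "failed": 1, "total": 4},
    "timing": {"total_duration_seconds": 42.0},
    "execution_metrics": {"total_tool_calls": 12, "output_chars": 5800, "errors_encountered": 0},
    "expectations": [
        {"text": "Output file is created", "passed": True, "evidence": "found report.md"},
    ],
    "user_notes_summary": {"uncertainties": [], "needs_review": [], "workarounds": []},
}
(run_dir / "grading.json").write_text(json.dumps(grading, indent=2))

# Aggregate the example directory and print the markdown report it would generate.
benchmark = generate_benchmark(Path("benchmarks/example"), skill_name="example-skill")
print(generate_markdown(benchmark))

With a single configuration present, the delta row simply reflects that configuration against an empty baseline, matching the fallback branch in aggregate_results.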