Source from repo
Create, test, and iteratively improve Claude skills with eval benchmarks and description optimization
scripts/aggregate_benchmark.py
#!/usr/bin/env python3
"""
Aggregate individual run results into benchmark summary statistics.

Reads grading.json files from run directories and produces:
- run_summary with mean, stddev, min, max for each metric
- delta between with_skill and without_skill configurations

Usage:
    python aggregate_benchmark.py <benchmark_dir>

Example:
    python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/

The script supports two directory layouts:

Workspace layout (from skill-creator iterations):
    <benchmark_dir>/
    └── eval-N/
        ├── with_skill/
        │   ├── run-1/grading.json
        │   └── run-2/grading.json
        └── without_skill/
            ├── run-1/grading.json
            └── run-2/grading.json

Legacy layout (with runs/ subdirectory):
    <benchmark_dir>/
    └── runs/
        └── eval-N/
            ├── with_skill/
            │   └── run-1/grading.json
            └── without_skill/
                └── run-1/grading.json
"""

import argparse
import json
import math
import sys
from datetime import datetime, timezone
from pathlib import Path


def calculate_stats(values: list[float]) -> dict:
    """Calculate mean, stddev, min, max for a list of values."""
    if not values:
        return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}

    n = len(values)
    mean = sum(values) / n

    if n > 1:
        variance = sum((x - mean) ** 2 for x in values) / (n - 1)
        stddev = math.sqrt(variance)
    else:
        stddev = 0.0

    return {
        "mean": round(mean, 4),
        "stddev": round(stddev, 4),
        "min": round(min(values), 4),
        "max": round(max(values), 4)
    }


def load_run_results(benchmark_dir: Path) -> dict:
    """
    Load all run results from a benchmark directory.

    Returns dict keyed by config name (e.g. "with_skill"/"without_skill",
    or "new_skill"/"old_skill"), each containing a list of run results.
    """
    # Support both layouts: eval dirs directly under benchmark_dir, or under runs/
    runs_dir = benchmark_dir / "runs"
    if runs_dir.exists():
        search_dir = runs_dir
    elif list(benchmark_dir.glob("eval-*")):
        search_dir = benchmark_dir
    else:
        print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}")
        return {}

    results: dict[str, list] = {}

    for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))):
        metadata_path = eval_dir / "eval_metadata.json"
        if metadata_path.exists():
            try:
                with open(metadata_path) as mf:
                    eval_id = json.load(mf).get("eval_id", eval_idx)
            except (json.JSONDecodeError, OSError):
                eval_id = eval_idx
        else:
            try:
                eval_id = int(eval_dir.name.split("-")[1])
            except ValueError:
                eval_id = eval_idx

        # Discover config directories dynamically rather than hardcoding names
        for config_dir in sorted(eval_dir.iterdir()):
            if not config_dir.is_dir():
                continue
            # Skip non-config directories (inputs, outputs, etc.)
            if not list(config_dir.glob("run-*")):
                continue
            config = config_dir.name
            if config not in results:
                results[config] = []

            for run_dir in sorted(config_dir.glob("run-*")):
                run_number = int(run_dir.name.split("-")[1])
                grading_file = run_dir / "grading.json"

                if not grading_file.exists():
                    print(f"Warning: grading.json not found in {run_dir}")
                    continue

                try:
                    with open(grading_file) as f:
                        grading = json.load(f)
                except json.JSONDecodeError as e:
                    print(f"Warning: Invalid JSON in {grading_file}: {e}")
                    continue

                # Extract metrics
                result = {
                    "eval_id": eval_id,
                    "run_number": run_number,
                    "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0),
                    "passed": grading.get("summary", {}).get("passed", 0),
                    "failed": grading.get("summary", {}).get("failed", 0),
                    "total": grading.get("summary", {}).get("total", 0),
                }

                # Extract timing — check grading.json first, then sibling timing.json
                timing = grading.get("timing", {})
                result["time_seconds"] = timing.get("total_duration_seconds", 0.0)
                timing_file = run_dir / "timing.json"
                if result["time_seconds"] == 0.0 and timing_file.exists():
                    try:
                        with open(timing_file) as tf:
                            timing_data = json.load(tf)
                            result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0)
                            result["tokens"] = timing_data.get("total_tokens", 0)
                    except json.JSONDecodeError:
                        pass

                # Extract metrics if available
                metrics = grading.get("execution_metrics", {})
                result["tool_calls"] = metrics.get("total_tool_calls", 0)
                if not result.get("tokens"):
                    result["tokens"] = metrics.get("output_chars", 0)
                result["errors"] = metrics.get("errors_encountered", 0)

                # Extract expectations — viewer requires fields: text, passed, evidence
                raw_expectations = grading.get("expectations", [])
                for exp in raw_expectations:
                    if "text" not in exp or "passed" not in exp:
                        print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}")
                result["expectations"] = raw_expectations

                # Extract notes from user_notes_summary
                notes_summary = grading.get("user_notes_summary", {})
                notes = []
                notes.extend(notes_summary.get("uncertainties", []))
                notes.extend(notes_summary.get("needs_review", []))
                notes.extend(notes_summary.get("workarounds", []))
                result["notes"] = notes

                results[config].append(result)

    return results


def aggregate_results(results: dict) -> dict:
    """
    Aggregate run results into summary statistics.

    Returns run_summary with stats for each configuration and delta.
    """
    run_summary = {}
    configs = list(results.keys())

    for config in configs:
        runs = results.get(config, [])

        if not runs:
            run_summary[config] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            }
            continue

        pass_rates = [r["pass_rate"] for r in runs]
        times = [r["time_seconds"] for r in runs]
        tokens = [r.get("tokens", 0) for r in runs]

        run_summary[config] = {
            "pass_rate": calculate_stats(pass_rates),
            "time_seconds": calculate_stats(times),
            "tokens": calculate_stats(tokens)
        }

    # Calculate delta between the first two configs (if two exist)
    if len(configs) >= 2:
        primary = run_summary.get(configs[0], {})
        baseline = run_summary.get(configs[1], {})
    else:
        primary = run_summary.get(configs[0], {}) if configs else {}
        baseline = {}

    delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0)
    delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0)
    delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0)

    run_summary["delta"] = {
        "pass_rate": f"{delta_pass_rate:+.2f}",
        "time_seconds": f"{delta_time:+.1f}",
        "tokens": f"{delta_tokens:+.0f}"
    }

    return run_summary


def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
    """
    Generate complete benchmark.json from run results.
    """
    results = load_run_results(benchmark_dir)
    run_summary = aggregate_results(results)

    # Build runs array for benchmark.json
    runs = []
    for config in results:
        for result in results[config]:
            runs.append({
                "eval_id": result["eval_id"],
                "configuration": config,
                "run_number": result["run_number"],
                "result": {
                    "pass_rate": result["pass_rate"],
                    "passed": result["passed"],
                    "failed": result["failed"],
                    "total": result["total"],
                    "time_seconds": result["time_seconds"],
                    "tokens": result.get("tokens", 0),
                    "tool_calls": result.get("tool_calls", 0),
                    "errors": result.get("errors", 0)
                },
                "expectations": result["expectations"],
                "notes": result["notes"]
            })

    # Determine eval IDs from results
    eval_ids = sorted(set(
        r["eval_id"]
        for config in results.values()
        for r in config
    ))

    benchmark = {
        "metadata": {
            "skill_name": skill_name or "<skill-name>",
            "skill_path": skill_path or "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            "runs_per_configuration": 3
        },
        "runs": runs,
        "run_summary": run_summary,
        "notes": []  # To be filled by analyzer
    }

    return benchmark


def generate_markdown(benchmark: dict) -> str:
    """Generate human-readable benchmark.md from benchmark data."""
    metadata = benchmark["metadata"]
    run_summary = benchmark["run_summary"]

    # Determine config names (excluding "delta")
    configs = [k for k in run_summary if k != "delta"]
    config_a = configs[0] if len(configs) >= 1 else "config_a"
    config_b = configs[1] if len(configs) >= 2 else "config_b"
    label_a = config_a.replace("_", " ").title()
    label_b = config_b.replace("_", " ").title()

    lines = [
        f"# Skill Benchmark: {metadata['skill_name']}",
        "",
        f"**Model**: {metadata['executor_model']}",
        f"**Date**: {metadata['timestamp']}",
        f"**Evals**: {', '.join(map(str, metadata['evals_run']))} ({metadata['runs_per_configuration']} runs each per configuration)",
        "",
        "## Summary",
        "",
        f"| Metric | {label_a} | {label_b} | Delta |",
        "|--------|------------|---------------|-------|",
    ]

    a_summary = run_summary.get(config_a, {})
    b_summary = run_summary.get(config_b, {})
    delta = run_summary.get("delta", {})

    # Format pass rate
    a_pr = a_summary.get("pass_rate", {})
    b_pr = b_summary.get("pass_rate", {})
    lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |")

    # Format time
    a_time = a_summary.get("time_seconds", {})
    b_time = b_summary.get("time_seconds", {})
    lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |")

    # Format tokens
    a_tokens = a_summary.get("tokens", {})
    b_tokens = b_summary.get("tokens", {})
    lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |")

    # Notes section
    if benchmark.get("notes"):
        lines.extend([
            "",
            "## Notes",
            ""
        ])
        for note in benchmark["notes"]:
            lines.append(f"- {note}")

    return "\n".join(lines)


def main():
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark run results into summary statistics"
    )
    parser.add_argument(
        "benchmark_dir",
        type=Path,
        help="Path to the benchmark directory"
    )
    parser.add_argument(
        "--skill-name",
        default="",
        help="Name of the skill being benchmarked"
    )
    parser.add_argument(
        "--skill-path",
        default="",
        help="Path to the skill being benchmarked"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)"
    )

    args = parser.parse_args()

    if not args.benchmark_dir.exists():
        print(f"Directory not found: {args.benchmark_dir}")
        sys.exit(1)

    # Generate benchmark
    benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)

    # Determine output paths
    output_json = args.output or (args.benchmark_dir / "benchmark.json")
    output_md = output_json.with_suffix(".md")

    # Write benchmark.json
    with open(output_json, "w") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Generated: {output_json}")

    # Write benchmark.md
    markdown = generate_markdown(benchmark)
    with open(output_md, "w") as f:
        f.write(markdown)
    print(f"Generated: {output_md}")

    # Print summary
    run_summary = benchmark["run_summary"]
    configs = [k for k in run_summary if k != "delta"]
    delta = run_summary.get("delta", {})

    print("\nSummary:")
    for config in configs:
        pr = run_summary[config]["pass_rate"]["mean"]
        label = config.replace("_", " ").title()
        print(f" {label}: {pr*100:.1f}% pass rate")
    print(f" Delta: {delta.get('pass_rate', '—')}")


if __name__ == "__main__":
    main()
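
Example usage (not part of the file above): a minimal sketch of the per-run grading.json this script consumes, followed by a programmatic call into the aggregator. The field names mirror the keys read in load_run_results; the directory names, metric values, and the assumption that aggregate_benchmark is importable as a module are illustrative only.

# Hypothetical example: write one run's grading.json, then aggregate it.
# Assumes scripts/aggregate_benchmark.py is importable from the current directory.
import json
from pathlib import Path

from aggregate_benchmark import generate_benchmark, generate_markdown

run_dir = Path("benchmarks/example/eval-1/with_skill/run-1")  # invented layout
run_dir.mkdir(parents=True, exist_ok=True)

grading = {
    "summary": {"pass_rate": 0.75, "passed": 3, "failed": 1, "total": 4},
    "timing": {"total_duration_seconds": 42.0},
    "execution_metrics": {"total_tool_calls": 12, "output_chars": 5800, "errors_encountered": 0},
    "expectations": [
        {"text": "Output file is created", "passed": True, "evidence": "found report.md"},
    ],
    "user_notes_summary": {"uncertainties": [], "needs_review": [], "workarounds": []},
}
(run_dir / "grading.json").write_text(json.dumps(grading, indent=2))

# Aggregate the example directory and print the markdown report it would generate.
benchmark = generate_benchmark(Path("benchmarks/example"), skill_name="example-skill")
print(generate_markdown(benchmark))

With a single configuration present, the delta row simply reflects that configuration against an empty baseline, matching the fallback branch in aggregate_results.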