Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
researcher/scripts/render_router_report.py
#!/usr/bin/env python3
"""Render a published-quality Markdown report from router benchmark results.

Reads the per-run JSON files in a results directory (every ``*.json`` except
``summary.json``) plus the prompt fixture, and emits a Markdown report with:

- Per-model top-1 / top-3 accuracy with bootstrap 95% CIs
- Per-model format failure count and median duration
- Per-skill confusion matrix (when expected was X, what was predicted)
- The prompts with the lowest top-1 rate across all runs
- A reproducibility section with the command line used
- Optionally, a delta section against a baseline results directory

Usage:
    python3 researcher/scripts/render_router_report.py \
        --results researcher/benchmarks/router/results/<date>-<seed> \
        --fixture researcher/benchmarks/router/prompts.jsonl \
        --output researcher/benchmarks/router/results-published/<date>.md

The output is committed; raw per-run JSONs stay gitignored.
"""

from __future__ import annotations

import argparse
import json
import random
import statistics
import sys
import zlib
from collections import defaultdict
from pathlib import Path
from typing import Any


def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Parse a JSON Lines file into a list, ignoring blank lines.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    text = path.read_text(encoding="utf-8")
    return [json.loads(line) for line in text.splitlines() if line.strip()]


def load_run_records(results_dir: Path) -> list[dict[str, Any]]:
    """Load every per-run JSON record from *results_dir*.

    Files are read in sorted name order. ``summary.json`` is not a run record
    and is skipped. Files that cannot be parsed, or that do not contain a JSON
    object carrying both ``model_id`` and ``prompt_id``, are skipped with a
    warning on stderr: the aggregation functions index those two keys
    directly, so admitting such a record would crash the whole report.
    """
    records: list[dict[str, Any]] = []
    for path in sorted(results_dir.glob("*.json")):
        if path.name == "summary.json":
            continue
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except json.JSONDecodeError as exc:
            print(f"skipping unparseable run file {path}: {exc}", file=sys.stderr)
            continue
        if not isinstance(data, dict) or "model_id" not in data or "prompt_id" not in data:
            print(f"skipping run file without model_id/prompt_id: {path}", file=sys.stderr)
            continue
        records.append(data)
    return records


def bootstrap_ci(values: list[int], iterations: int = 2000, seed: int = 0) -> tuple[float, float, float]:
    """Return ``(mean, lower, upper)`` for a percentile-bootstrap 95% interval.

    Args:
        values: Observations to resample (0/1 correctness flags in this file).
        iterations: Number of bootstrap resamples.
        seed: Seed for the private ``random.Random`` instance, so a given
            ``(values, iterations, seed)`` always yields the same interval.

    Returns:
        ``(0.0, 0.0, 0.0)`` for empty input. With a non-positive *iterations*
        there is nothing to resample, so the point estimate is returned for
        all three values.
    """
    if not values:
        return (0.0, 0.0, 0.0)
    n = len(values)
    point = sum(values) / n
    if iterations <= 0:
        return (point, point, point)
    rng = random.Random(seed)
    samples = sorted(
        sum(values[rng.randrange(n)] for _ in range(n)) / n
        for _ in range(iterations)
    )
    lower = samples[int(iterations * 0.025)]
    upper = samples[int(iterations * 0.975)]
    return (point, lower, upper)


def _stable_seed(text: str, salt: int = 0) -> int:
    """Derive a 32-bit seed from *text* that is identical in every process.

    The built-in ``hash()`` of a string is salted per interpreter run, so it
    cannot be used for numbers that are meant to be reproducible.
    """
    return (zlib.crc32(str(text).encode("utf-8")) ^ salt) & 0xFFFFFFFF


def summarize_per_model(records: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
    """Aggregate run records into one stats dict per ``model_id``.

    Accuracy and its confidence interval are computed over runs whose
    ``status`` is ``"finished"`` only; they are ``None`` when a model has no
    finished run. ``p95_duration_ms`` is reported only when at least 20
    integer durations are available.
    """
    by_model: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for record in records:
        by_model[record["model_id"]].append(record)

    summary: dict[str, dict[str, Any]] = {}
    for model_id, model_records in by_model.items():
        finished = [r for r in model_records if r.get("status") == "finished"]
        top1 = [1 if r.get("top1_correct") else 0 for r in finished]
        top3 = [1 if r.get("top3_correct") else 0 for r in finished]
        format_failures = sum(1 for r in model_records if r.get("status") == "format_failure")
        unavailable = sum(1 for r in model_records if r.get("status") == "model_unavailable")
        durations = sorted(
            r["duration_ms"] for r in model_records if isinstance(r.get("duration_ms"), int)
        )

        # Seeds are derived from the model id with a process-independent hash
        # so that re-rendering the same results gives the same intervals.
        top1_point, top1_lower, top1_upper = bootstrap_ci(top1, seed=_stable_seed(model_id))
        top3_point, top3_lower, top3_upper = bootstrap_ci(top3, seed=_stable_seed(model_id, salt=0xDEAD))

        summary[model_id] = {
            "total_runs": len(model_records),
            "finished": len(finished),
            "format_failures": format_failures,
            "model_unavailable": unavailable,
            "top1_accuracy": round(top1_point, 4) if top1 else None,
            "top1_ci": [round(top1_lower, 4), round(top1_upper, 4)] if top1 else None,
            "top3_accuracy": round(top3_point, 4) if top3 else None,
            "top3_ci": [round(top3_lower, 4), round(top3_upper, 4)] if top3 else None,
            "median_duration_ms": int(statistics.median(durations)) if durations else None,
            "p95_duration_ms": int(durations[int(0.95 * len(durations)) - 1]) if len(durations) >= 20 else None,
        }
    return summary


def build_confusion(records: list[dict[str, Any]], prompts: dict[str, dict[str, Any]]) -> dict[str, dict[str, int]]:
    """Count ``expected skill -> predicted skill`` pairs over finished runs.

    Runs are skipped when they are not ``finished``, when the prompt has no
    ``expected_primary_skill`` in *prompts*, or when the record has no
    ``predicted_primary``.
    """
    matrix: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for r in records:
        if r.get("status") != "finished":
            continue
        expected = prompts.get(r["prompt_id"], {}).get("expected_primary_skill")
        predicted = r.get("predicted_primary")
        if expected and predicted:
            matrix[expected][predicted] += 1
    return {expected: dict(row) for expected, row in matrix.items()}


def per_prompt_breakdown(records: list[dict[str, Any]], prompts: dict[str, dict[str, Any]]) -> list[dict[str, Any]]:
    """Return one row per prompt id (sorted) with hit rates and predictions.

    Rates are computed over *all* runs of the prompt, so runs that did not
    finish count as misses here.
    """
    by_prompt: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for r in records:
        by_prompt[r["prompt_id"]].append(r)

    rows: list[dict[str, Any]] = []
    for prompt_id in sorted(by_prompt):
        prompt_records = by_prompt[prompt_id]
        total = len(prompt_records)
        top1 = sum(1 for r in prompt_records if r.get("top1_correct"))
        top3 = sum(1 for r in prompt_records if r.get("top3_correct"))
        unique_predictions = sorted(
            {r.get("predicted_primary") for r in prompt_records if r.get("predicted_primary")}
        )
        rows.append(
            {
                "prompt_id": prompt_id,
                "expected": prompts.get(prompt_id, {}).get("expected_primary_skill"),
                "runs": total,
                "top1_rate": round(top1 / total, 3) if total else 0.0,
                "top3_rate": round(top3 / total, 3) if total else 0.0,
                "unique_predicted_primary": unique_predictions,
            }
        )
    return rows


def hardest_prompts(per_prompt: list[dict[str, Any]], n: int = 10) -> list[dict[str, Any]]:
    """Return the *n* rows with the lowest ``top1_rate`` (stable for ties)."""
    return sorted(per_prompt, key=lambda row: row["top1_rate"])[:n]


def _fmt_rate(value: Any) -> str:
    """Format an accuracy as ``0.123``, or ``-`` when it is not a number."""
    return f"{value:.3f}" if isinstance(value, (int, float)) else "-"


def _fmt_ci(ci: Any) -> str:
    """Format a ``[lower, upper]`` pair, or ``-`` when it is missing."""
    return f"[{ci[0]:.3f}, {ci[1]:.3f}]" if ci else "-"


def _fmt_delta(delta: float | None) -> str:
    """Format a signed difference with an explicit ``+`` for gains."""
    if delta is None:
        return "-"
    sign = "+" if delta > 0 else ""
    return f"{sign}{delta:.3f}"


def _header_lines(meta: dict[str, Any]) -> list[str]:
    """Title, run metadata and the fixed methodology text."""
    fields = [
        f"_run timestamp: {meta.get('timestamp')}_",
        f"_repo commit: `{meta.get('repo_sha', 'unknown')}`_",
        f"_fixture sha256-16: `{meta.get('fixture_sha', 'unknown')}`_",
        f"_seed: {meta.get('seed')}_",
        f"_runs: {meta.get('total_runs')}_",
        f"_models: {', '.join(meta.get('models') or [])}_",
        f"_reps per (prompt, model): {meta.get('reps')}_",
    ]
    lines = ["# Router Benchmark Results", ""]
    # Two trailing spaces are a Markdown hard line break; without them the
    # consecutive metadata lines are rendered as one run-on paragraph.
    lines.extend(f"{field}  " for field in fields[:-1])
    lines.append(fields[-1])
    lines.extend(
        [
            "",
            "## Methodology",
            "",
            "Each prompt is presented to each model with the 15 skill activation descriptions in a "
            "deterministically-shuffled order (different shuffle per replication). The model must return "
            "JSON with a ranked list of skill names. Top-1 accuracy is whether the first ranked skill "
            "matches the human-labeled `expected_primary_skill`; top-3 is whether the expected skill appears "
            "in the first three positions.",
            "",
            "No skills are loaded into the agent (`settingSources: []`); the only routing signal is the "
            "in-prompt descriptions. Confidence intervals are 95% bootstrap with 2000 resamples.",
        ]
    )
    return lines


def _leaderboard_lines(summary: dict[str, dict[str, Any]]) -> list[str]:
    """Per-model table, best top-1 first; models with no accuracy sort as 0."""
    lines = [
        "",
        "## Per-model leaderboard",
        "",
        "| Model | Top-1 | 95% CI | Top-3 | 95% CI | Format Failures | Median ms |",
        "| --- | --- | --- | --- | --- | --- | --- |",
    ]
    ranked = sorted(summary.items(), key=lambda item: -(item[1].get("top1_accuracy") or 0))
    for model_id, stats in ranked:
        median = stats.get("median_duration_ms")
        # A model with no finished runs has None accuracy/CI: show "-"
        # instead of crashing on the float format spec.
        lines.append(
            f"| `{model_id}` | "
            f"{_fmt_rate(stats.get('top1_accuracy'))} | "
            f"{_fmt_ci(stats.get('top1_ci'))} | "
            f"{_fmt_rate(stats.get('top3_accuracy'))} | "
            f"{_fmt_ci(stats.get('top3_ci'))} | "
            f"{stats.get('format_failures')} | "
            f"{median if median is not None else '-'} |"
        )
    return lines


def _confusion_lines(confusion: dict[str, dict[str, int]]) -> list[str]:
    """Confusion matrix table; the diagonal (correct) cell is bolded."""
    sorted_predicted = sorted({p for row in confusion.values() for p in row})
    lines = [
        "",
        "## Per-skill confusion (when expected is X, predicted is Y)",
        "",
        "Rows are the ground-truth `expected_primary_skill`; columns are what models actually predicted. "
        "Only `finished` runs counted.",
        "",
        "| Expected \\ Predicted |" + "".join(f" `{p}` |" for p in sorted_predicted),
        "| --- |" + "".join(" --- |" for _ in sorted_predicted),
    ]
    for expected in sorted(confusion):
        row = confusion[expected]
        cells = []
        for predicted in sorted_predicted:
            count = row.get(predicted, 0)
            if count == 0:
                cells.append(" - |")
            elif predicted == expected:
                cells.append(f" **{count}** |")
            else:
                cells.append(f" {count} |")
        lines.append(f"| `{expected}` (n={sum(row.values())}) |" + "".join(cells))
    return lines


def _hardest_lines(per_prompt: list[dict[str, Any]]) -> list[str]:
    """Table of the ten prompts with the lowest top-1 rate."""
    lines = [
        "",
        "## Hardest prompts (lowest top-1 across all models)",
        "",
        "| Prompt | Expected | Top-1 Rate | Predicted Primaries |",
        "| --- | --- | --- | --- |",
    ]
    for row in hardest_prompts(per_prompt, n=10):
        # Cap the list so one noisy prompt cannot blow up the table width.
        predicted = ", ".join(f"`{p}`" for p in row["unique_predicted_primary"][:5])
        lines.append(
            f"| {row['prompt_id']} | `{row['expected']}` | {row['top1_rate']:.2f} | {predicted} |"
        )
    return lines


def _reproducibility_lines(meta: dict[str, Any]) -> list[str]:
    """Shell commands that re-run the benchmark with the recorded settings."""
    return [
        "",
        "## Reproducibility",
        "",
        "Reproduce these numbers exactly with:",
        "",
        "```bash",
        "cd researcher/benchmarks/sdk-runner",
        "npm install",
        "export CURSOR_API_KEY=<your-key>",
        "node --experimental-strip-types src/runRouter.ts "
        f"--models {','.join(meta.get('models') or [])} "
        f"--reps {meta.get('reps')} "
        f"--seed {meta.get('seed')} "
        "--max-budget-usd 15",
        "python3 ../../scripts/render_router_report.py \\",
        " --results ../router/results/<date>-<seed> \\",
        " --fixture ../router/prompts.jsonl \\",
        " --output ../router/results-published/<date>.md",
        "```",
        "",
        "Per-run JSON artifacts (prompt, model, replication, raw model output, parsed ranking) are "
        "preserved under the gitignored `results/` directory next to the summary that drives this report.",
    ]


def render(summary: dict[str, dict[str, Any]], confusion: dict[str, dict[str, int]], per_prompt: list[dict[str, Any]], meta: dict[str, Any]) -> str:
    """Assemble the full Markdown report and return it as one string.

    Args:
        summary: Output of :func:`summarize_per_model`.
        confusion: Output of :func:`build_confusion`.
        per_prompt: Output of :func:`per_prompt_breakdown`.
        meta: Run metadata; the keys read are ``timestamp``, ``repo_sha``,
            ``fixture_sha``, ``seed``, ``total_runs``, ``models`` and ``reps``.

    Returns:
        The report text, terminated by a single newline.
    """
    lines: list[str] = []
    lines.extend(_header_lines(meta))
    lines.extend(_leaderboard_lines(summary))
    lines.extend(_confusion_lines(confusion))
    lines.extend(_hardest_lines(per_prompt))
    lines.extend(_reproducibility_lines(meta))
    return "\n".join(lines) + "\n"


def delta_section(
    new_summary: dict[str, dict[str, Any]],
    new_confusion: dict[str, dict[str, int]],
    new_per_prompt: list[dict[str, Any]],
    baseline_summary: dict[str, dict[str, Any]],
    baseline_confusion: dict[str, dict[str, int]],
    baseline_per_prompt: list[dict[str, Any]],
    baseline_label: str,
) -> list[str]:
    """Build the "Delta vs baseline" section as a list of Markdown lines.

    Three tables are produced: per-model accuracy change, per-skill top-1
    rate change, and the change on the baseline's ten hardest prompts.
    Whenever one side has no data for a row (model, skill or prompt absent),
    that cell and the delta are rendered as ``-`` rather than as a rate of
    zero, so missing data is never reported as a regression or improvement.
    """
    lines: list[str] = [
        "## Delta vs baseline",
        "",
        f"_baseline: {baseline_label}_",
        "",
        "### Per-model accuracy change",
        "",
        "| Model | Baseline Top-1 | New Top-1 | Delta | Baseline Top-3 | New Top-3 | Delta |",
        "| --- | --- | --- | --- | --- | --- | --- |",
    ]

    def diff(new_value: Any, old_value: Any) -> float | None:
        both_numeric = isinstance(new_value, (int, float)) and isinstance(old_value, (int, float))
        return new_value - old_value if both_numeric else None

    for model in sorted(set(new_summary) | set(baseline_summary)):
        old_stats = baseline_summary.get(model, {})
        new_stats = new_summary.get(model, {})
        bt1, nt1 = old_stats.get("top1_accuracy"), new_stats.get("top1_accuracy")
        bt3, nt3 = old_stats.get("top3_accuracy"), new_stats.get("top3_accuracy")
        lines.append(
            f"| `{model}` | {_fmt_rate(bt1)} | {_fmt_rate(nt1)} | {_fmt_delta(diff(nt1, bt1))} "
            f"| {_fmt_rate(bt3)} | {_fmt_rate(nt3)} | {_fmt_delta(diff(nt3, bt3))} |"
        )

    lines.extend(
        [
            "",
            "### Per-skill top-1 rate change",
            "",
            "Counts a row as correct when the predicted primary equals the expected primary.",
            "",
            "| Skill (expected) | Baseline | New | Delta |",
            "| --- | --- | --- | --- |",
        ]
    )

    def skill_cell(row: dict[str, int], skill: str) -> tuple[str, float | None]:
        total = sum(row.values())
        if not total:
            return "-", None
        correct = row.get(skill, 0)
        rate = correct / total
        return f"{correct}/{total} = {rate:.3f}", rate

    for skill in sorted(set(baseline_confusion) | set(new_confusion)):
        b_cell, b_rate = skill_cell(baseline_confusion.get(skill, {}), skill)
        n_cell, n_rate = skill_cell(new_confusion.get(skill, {}), skill)
        delta = diff(n_rate, b_rate)
        marker = ""
        if delta is not None:
            # Only flag moves of at least five percentage points.
            if delta >= 0.05:
                marker = " <- improved"
            elif delta <= -0.05:
                marker = " <- regressed"
        lines.append(f"| `{skill}` | {b_cell} | {n_cell} | {_fmt_delta(delta)}{marker} |")

    lines.extend(
        [
            "",
            "### Previously-hardest prompts",
            "",
            "| Prompt | Expected | Baseline Top-1 Rate | New Top-1 Rate | Delta |",
            "| --- | --- | --- | --- | --- |",
        ]
    )
    new_by_id = {row["prompt_id"]: row for row in new_per_prompt}
    baseline_by_id = {row["prompt_id"]: row for row in baseline_per_prompt}
    baseline_hardest_ids = {row["prompt_id"] for row in hardest_prompts(baseline_per_prompt, n=10)}
    for prompt_id in sorted(baseline_hardest_ids):
        baseline = baseline_by_id.get(prompt_id, {})
        new = new_by_id.get(prompt_id, {})
        b_rate = baseline.get("top1_rate", 0.0)
        n_rate = new.get("top1_rate")
        n_cell = f"{n_rate:.2f}" if n_rate is not None else "-"
        expected = baseline.get("expected") or new.get("expected") or "-"
        lines.append(
            f"| {prompt_id} | `{expected}` | {b_rate:.2f} | {n_cell} | {_fmt_delta(diff(n_rate, b_rate))} |"
        )
    return lines


def _load_summary_meta(summary_path: Path) -> dict[str, Any]:
    """Read ``summary.json`` if present; fall back to ``{}`` when unusable."""
    if not summary_path.exists():
        return {}
    try:
        loaded = json.loads(summary_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        print(f"ignoring unparseable {summary_path}: {exc}", file=sys.stderr)
        return {}
    # Anything but a JSON object cannot be queried with .get() below.
    return loaded if isinstance(loaded, dict) else {}


def build_meta(records: list[dict[str, Any]], summary_meta: dict[str, Any]) -> dict[str, Any]:
    """Combine ``summary.json`` fields with values derived from *records*.

    Missing or empty string fields become ``"unknown"``. The seed defaults to
    1 only when it is absent, so a recorded seed of 0 is preserved. ``reps``
    is the highest ``rep`` index seen plus one.
    """
    seed = summary_meta.get("seed")
    return {
        "timestamp": summary_meta.get("timestamp") or "unknown",
        "repo_sha": summary_meta.get("repo_sha") or "unknown",
        "fixture_sha": summary_meta.get("fixture_sha") or "unknown",
        "seed": 1 if seed is None else seed,
        "total_runs": len(records),
        "models": sorted({r["model_id"] for r in records}),
        "reps": max((r.get("rep") or 0 for r in records), default=0) + 1,
    }


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point; returns the process exit status.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]`` (used when
            the function is called programmatically).

    Returns:
        0 on success, 1 when the results directory or fixture is missing or
        no per-run record could be loaded.
    """
    parser = argparse.ArgumentParser(description="Render router benchmark report")
    parser.add_argument("--results", type=Path, required=True, help="Directory of per-run JSON files")
    parser.add_argument("--fixture", type=Path, required=True, help="Router prompts JSONL")
    parser.add_argument("--output", type=Path, required=True, help="Destination Markdown file")
    parser.add_argument("--baseline", type=Path, help="Optional baseline results directory to compute deltas against")
    parser.add_argument("--baseline-label", type=str, help="Human label for the baseline (e.g. '2026-05-15 v2.2.0 descriptions')")
    args = parser.parse_args(argv)

    if not args.results.exists():
        print(f"results dir missing: {args.results}", file=sys.stderr)
        return 1
    if not args.fixture.is_file():
        print(f"fixture missing: {args.fixture}", file=sys.stderr)
        return 1

    records = load_run_records(args.results)
    if not records:
        print("no per-run records found", file=sys.stderr)
        return 1
    prompts = {row["prompt_id"]: row for row in load_jsonl(args.fixture)}

    meta = build_meta(records, _load_summary_meta(args.results / "summary.json"))
    summary = summarize_per_model(records)
    confusion = build_confusion(records, prompts)
    per_prompt = per_prompt_breakdown(records, prompts)

    rendered = render(summary, confusion, per_prompt, meta)

    if args.baseline:
        baseline_records = load_run_records(args.baseline) if args.baseline.exists() else []
        if baseline_records:
            delta = delta_section(
                summary,
                confusion,
                per_prompt,
                summarize_per_model(baseline_records),
                build_confusion(baseline_records, prompts),
                per_prompt_breakdown(baseline_records, prompts),
                args.baseline_label or str(args.baseline),
            )
            rendered = rendered.rstrip("\n") + "\n\n" + "\n".join(delta) + "\n"
        else:
            # The report is still useful without the comparison, so warn
            # instead of failing.
            print(f"baseline has no run records, skipping delta section: {args.baseline}", file=sys.stderr)

    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(rendered, encoding="utf-8")
    print(f"wrote {args.output}")
    print(
        json.dumps(
            {
                "models": meta["models"],
                "total_runs": meta["total_runs"],
                "per_model_top1": {k: v.get("top1_accuracy") for k, v in summary.items()},
            },
            indent=2,
        )
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())