Source from repo
Agent Skills for Context Engineering

A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
muratcankoylan · GitHub: muratcankoylan · Source repo · Original GitHub link
Files
339
Skill
n/a
Size
4.3 MB
Entrypoint
SKILL.md
Format
git-repo
Open file
researcher/scripts/render_router_report.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code · 404 lines · Free
researcher/scripts/render_router_report.py
1#!/usr/bin/env python3
2"""Render a published-quality Markdown report from router benchmark results.
3 
4Reads per-run JSON files produced by runRouter.ts and emits:
5  - Per-model top-1 / top-3 accuracy with bootstrap 95% CIs
6  - Per-model format failure rate
7  - Per-model wall time stats
8  - Per-skill confusion matrix (when expected was X, what was predicted)
9  - Per-prompt cross-model agreement and per-prompt failures
10  - Per-rep consistency check (within-prompt-model)
11 
12Usage:
13  python3 researcher/scripts/render_router_report.py \
14      --results researcher/benchmarks/router/results/<date>-<seed> \
15      --fixture researcher/benchmarks/router/prompts.jsonl \
16      --output researcher/benchmarks/router/results-published/<date>.md
17 
18The output is committed; raw per-run JSONs stay gitignored.
19"""
20 
21from __future__ import annotations
22 
23import argparse
24import json
25import random
26import statistics
27import sys
28from collections import defaultdict
29from pathlib import Path
30from typing import Any
31 
32 
def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Decode a JSON Lines file into a list of records.

    The file is read as UTF-8. Lines that are empty or contain only
    whitespace are ignored; every remaining line is passed to ``json.loads``,
    so a malformed line raises ``json.JSONDecodeError``.
    """
    text = path.read_text(encoding="utf-8")
    return [json.loads(raw) for raw in text.splitlines() if raw.strip()]
39 
40 
def load_run_records(results_dir: Path) -> list[dict[str, Any]]:
    """Load every per-run JSON record stored directly inside *results_dir*.

    Files matching ``*.json`` are visited in sorted order. ``summary.json`` is
    skipped because it is not a per-run record. A file that cannot be decoded,
    or whose top-level JSON value is not an object, is reported on stderr and
    skipped, so one bad artifact does not abort the report and does not vanish
    unnoticed either.

    Returns:
        The decoded records, one dict per usable file.
    """
    records: list[dict[str, Any]] = []
    for path in sorted(results_dir.glob("*.json")):
        if path.name == "summary.json":
            continue
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            print(f"skipping unparseable run file {path.name}: {exc}", file=sys.stderr)
            continue
        # Downstream code indexes records by key (record["model_id"], r.get(...)),
        # so anything that is not a JSON object would crash later.
        if not isinstance(data, dict):
            print(f"skipping non-object run file {path.name}", file=sys.stderr)
            continue
        records.append(data)
    return records
52 
53 
def bootstrap_ci(values: list[int], iterations: int = 2000, seed: int = 0) -> tuple[float, float, float]:
    """Return ``(mean, lower, upper)`` for a percentile-bootstrap 95% interval.

    Args:
        values: Observations to resample (0/1 correctness flags in this script).
        iterations: Number of bootstrap resamples to draw.
        seed: Seed for the private ``random.Random`` instance, so the result
            is a pure function of the arguments.

    Returns:
        The sample mean plus the resampled means found at the 2.5% and 97.5%
        positions of the sorted resample list. An empty *values* yields
        ``(0.0, 0.0, 0.0)``; a non-positive *iterations* yields a degenerate
        interval equal to the mean instead of raising ``IndexError``.
    """
    if not values:
        return (0.0, 0.0, 0.0)
    n = len(values)
    point = sum(values) / n
    if iterations <= 0:
        # No resamples were requested, so there is nothing to take percentiles of.
        return (point, point, point)
    rng = random.Random(seed)
    samples: list[float] = []
    for _ in range(iterations):
        draw = [values[rng.randrange(n)] for _ in range(n)]
        samples.append(sum(draw) / n)
    samples.sort()
    lower = samples[int(iterations * 0.025)]
    upper = samples[int(iterations * 0.975)]
    return (point, lower, upper)


def summarize_per_model(records: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
    """Aggregate run records into one statistics dict per ``model_id``.

    Accuracy figures are computed over runs whose ``status`` is ``"finished"``
    only. Duration figures use every run of the model that carries an integer
    ``duration_ms``. Accuracy and CI entries are ``None`` when a model has no
    finished runs; ``p95_duration_ms`` is ``None`` below 20 duration samples.

    Raises:
        KeyError: If a record has no ``model_id`` key.
    """
    # Function-scope import: zlib is not part of this module's import block.
    import zlib

    by_model: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for record in records:
        by_model[record["model_id"]].append(record)

    summary: dict[str, dict[str, Any]] = {}
    for model_id, model_records in by_model.items():
        finished = [r for r in model_records if r.get("status") == "finished"]
        top1 = [1 if r.get("top1_correct") else 0 for r in finished]
        top3 = [1 if r.get("top3_correct") else 0 for r in finished]
        format_failures = sum(1 for r in model_records if r.get("status") == "format_failure")
        unavailable = sum(1 for r in model_records if r.get("status") == "model_unavailable")
        durations = sorted(r["duration_ms"] for r in model_records if isinstance(r.get("duration_ms"), int))

        # The built-in hash() of a str is salted per interpreter process, which
        # made the bootstrap intervals change from one invocation to the next.
        # CRC32 of the model id gives a seed that is identical on every run.
        base_seed = zlib.crc32(str(model_id).encode("utf-8")) & 0xFFFFFFFF
        top1_point, top1_lower, top1_upper = bootstrap_ci(top1, seed=base_seed)
        top3_point, top3_lower, top3_upper = bootstrap_ci(top3, seed=(base_seed ^ 0xdead) & 0xFFFFFFFF)

        summary[model_id] = {
            "total_runs": len(model_records),
            "finished": len(finished),
            "format_failures": format_failures,
            "model_unavailable": unavailable,
            "top1_accuracy": round(top1_point, 4) if top1 else None,
            "top1_ci": [round(top1_lower, 4), round(top1_upper, 4)] if top1 else None,
            "top3_accuracy": round(top3_point, 4) if top3 else None,
            "top3_ci": [round(top3_lower, 4), round(top3_upper, 4)] if top3 else None,
            "median_duration_ms": int(statistics.median(durations)) if durations else None,
            "p95_duration_ms": int(durations[int(0.95 * len(durations)) - 1]) if len(durations) >= 20 else None,
        }
    return summary
100 
101 
def build_confusion(records: list[dict[str, Any]], prompts: dict[str, dict[str, Any]]) -> dict[str, dict[str, int]]:
    """Count predicted primary skills per expected primary skill.

    Only runs whose ``status`` is ``"finished"`` contribute. The expected skill
    is looked up in *prompts* through the run's ``prompt_id``; a run is ignored
    when either the expected skill or the ``predicted_primary`` value is
    missing or empty.

    Returns:
        ``{expected_skill: {predicted_skill: count}}`` as plain dicts.
    """
    counts: dict[str, dict[str, int]] = {}
    for run in records:
        if run.get("status") != "finished":
            continue
        label = prompts.get(run["prompt_id"], {}).get("expected_primary_skill")
        guess = run.get("predicted_primary")
        if label and guess:
            row = counts.setdefault(label, {})
            row[guess] = row.get(guess, 0) + 1
    return counts
113 
114 
def per_prompt_breakdown(records: list[dict[str, Any]], prompts: dict[str, dict[str, Any]]) -> list[dict[str, Any]]:
    """Summarise the runs of each prompt across all models and replications.

    Records are grouped by ``prompt_id`` and rows are emitted in sorted
    prompt-id order. Rates are hits divided by the number of runs for that
    prompt (every status counts towards the denominator), rounded to three
    decimals. ``expected`` comes from the fixture entry in *prompts* and is
    ``None`` when the prompt is not listed there.
    """
    grouped: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for run in records:
        grouped[run["prompt_id"]].append(run)

    def hit_rate(flag: str, runs: list[dict[str, Any]]) -> float:
        # Share of runs whose *flag* field is truthy.
        hits = len([r for r in runs if r.get(flag)])
        return round(hits / len(runs), 3) if runs else 0.0

    breakdown: list[dict[str, Any]] = []
    for prompt_id in sorted(grouped):
        runs = grouped[prompt_id]
        distinct = {r.get("predicted_primary") for r in runs if r.get("predicted_primary")}
        breakdown.append(
            {
                "prompt_id": prompt_id,
                "expected": prompts.get(prompt_id, {}).get("expected_primary_skill"),
                "runs": len(runs),
                "top1_rate": hit_rate("top1_correct", runs),
                "top3_rate": hit_rate("top3_correct", runs),
                "unique_predicted_primary": sorted(distinct),
            }
        )
    return breakdown
139 
140 
def hardest_prompts(per_prompt: list[dict[str, Any]], n: int = 10) -> list[dict[str, Any]]:
    """Return the *n* rows with the lowest ``top1_rate``, hardest first.

    The sort is stable, so rows with equal rates keep their input order.
    """
    return sorted(per_prompt, key=lambda row: row["top1_rate"])[:n]


def _fmt_rate(value: Any) -> str:
    """Format an accuracy with three decimals, or ``-`` when it is ``None``."""
    return "-" if value is None else f"{value:.3f}"


def _fmt_interval(ci: Any) -> str:
    """Format a ``[lower, upper]`` pair, or ``-`` when no interval exists."""
    return "-" if not ci else f"[{ci[0]:.3f}, {ci[1]:.3f}]"


def render(summary: dict[str, dict[str, Any]], confusion: dict[str, dict[str, int]], per_prompt: list[dict[str, Any]], meta: dict[str, Any]) -> str:
    """Render the Markdown benchmark report.

    Args:
        summary: Per-model statistics keyed by model id; reads
            ``top1_accuracy``, ``top1_ci``, ``top3_accuracy``, ``top3_ci``,
            ``format_failures`` and ``median_duration_ms``.
        confusion: ``{expected_skill: {predicted_skill: count}}``.
        per_prompt: Per-prompt rows carrying ``prompt_id``, ``expected``,
            ``top1_rate`` and ``unique_predicted_primary``.
        meta: Run metadata; reads ``timestamp``, ``repo_sha``, ``fixture_sha``,
            ``seed``, ``total_runs``, ``models`` and ``reps``.

    Returns:
        The full report as a single string ending in a newline.
    """
    lines: list[str] = []
    lines.append("# Router Benchmark Results")
    lines.append("")
    lines.append(f"_run timestamp: {meta.get('timestamp')}_")
    lines.append(f"_repo commit: `{meta.get('repo_sha', 'unknown')}`_")
    lines.append(f"_fixture sha256-16: `{meta.get('fixture_sha', 'unknown')}`_")
    lines.append(f"_seed: {meta.get('seed')}_")
    lines.append(f"_runs: {meta.get('total_runs')}_  ")
    lines.append(f"_models: {', '.join(meta.get('models', []))}_  ")
    lines.append(f"_reps per (prompt, model): {meta.get('reps')}_")
    lines.append("")
    lines.append("## Methodology")
    lines.append("")
    lines.append(
        "Each prompt is presented to each model with the 15 skill activation descriptions in a "
        "deterministically-shuffled order (different shuffle per replication). The model must return "
        "JSON with a ranked list of skill names. Top-1 accuracy is whether the first ranked skill "
        "matches the human-labeled `expected_primary_skill`; top-3 is whether the expected skill appears "
        "in the first three positions."
    )
    lines.append("")
    lines.append(
        "No skills are loaded into the agent (`settingSources: []`); the only routing signal is the "
        "in-prompt descriptions. Confidence intervals are 95% bootstrap with 2000 resamples."
    )
    lines.append("")
    lines.append("## Per-model leaderboard")
    lines.append("")
    lines.append("| Model | Top-1 | 95% CI | Top-3 | 95% CI | Format Failures | Median ms |")
    lines.append("| --- | --- | --- | --- | --- | --- | --- |")
    # Best top-1 first; models without an accuracy sort as 0 and land at the bottom.
    for model_id, stats in sorted(summary.items(), key=lambda item: -(item[1].get("top1_accuracy") or 0)):
        median = stats.get("median_duration_ms")
        # A model with no finished runs has None for accuracy and CI; those
        # cells become "-" instead of raising TypeError in the float format.
        lines.append(
            f"| `{model_id}` | "
            f"{_fmt_rate(stats.get('top1_accuracy'))} | "
            f"{_fmt_interval(stats.get('top1_ci'))} | "
            f"{_fmt_rate(stats.get('top3_accuracy'))} | "
            f"{_fmt_interval(stats.get('top3_ci'))} | "
            f"{stats.get('format_failures')} | "
            f"{median if median is not None else '-'} |"
        )

    lines.append("")
    lines.append("## Per-skill confusion (when expected is X, predicted is Y)")
    lines.append("")
    lines.append("Rows are the ground-truth `expected_primary_skill`; columns are what models actually predicted. Only `finished` runs counted.")
    lines.append("")
    # Columns are the union of every predicted skill seen in any row.
    all_predicted: set[str] = set()
    for row in confusion.values():
        all_predicted.update(row.keys())
    sorted_predicted = sorted(all_predicted)
    header = "| Expected \\ Predicted |" + "".join(f" `{p}` |" for p in sorted_predicted)
    sep = "| --- |" + "".join(" --- |" for _ in sorted_predicted)
    lines.append(header)
    lines.append(sep)
    for expected in sorted(confusion.keys()):
        row_total = sum(confusion[expected].values())
        cells = []
        for predicted in sorted_predicted:
            count = confusion[expected].get(predicted, 0)
            if count == 0:
                cells.append(" - |")
            elif predicted == expected:
                # Bold marks the diagonal, i.e. correct predictions.
                cells.append(f" **{count}** |")
            else:
                cells.append(f" {count} |")
        lines.append(f"| `{expected}` (n={row_total}) |" + "".join(cells))

    hardest = hardest_prompts(per_prompt, n=10)
    lines.append("")
    lines.append("## Hardest prompts (lowest top-1 across all models)")
    lines.append("")
    lines.append("| Prompt | Expected | Top-1 Rate | Predicted Primaries |")
    lines.append("| --- | --- | --- | --- |")
    for row in hardest:
        # At most five distinct predictions are listed to keep the table narrow.
        predicted = ", ".join(f"`{p}`" for p in row["unique_predicted_primary"][:5])
        lines.append(
            f"| {row['prompt_id']} | `{row['expected']}` | {row['top1_rate']:.2f} | {predicted} |"
        )

    lines.append("")
    lines.append("## Reproducibility")
    lines.append("")
    lines.append("Reproduce these numbers exactly with:")
    lines.append("")
    lines.append("```bash")
    lines.append("cd researcher/benchmarks/sdk-runner")
    lines.append("npm install")
    lines.append("export CURSOR_API_KEY=<your-key>")
    lines.append(
        "node --experimental-strip-types src/runRouter.ts "
        f"--models {','.join(meta.get('models', []))} "
        f"--reps {meta.get('reps')} "
        f"--seed {meta.get('seed')} "
        "--max-budget-usd 15"
    )
    lines.append("python3 ../../scripts/render_router_report.py \\")
    lines.append("    --results ../router/results/<date>-<seed> \\")
    lines.append("    --fixture ../router/prompts.jsonl \\")
    lines.append("    --output ../router/results-published/<date>.md")
    lines.append("```")
    lines.append("")
    lines.append(
        "Per-run JSON artifacts (prompt, model, replication, raw model output, parsed ranking) are "
        "preserved under the gitignored `results/` directory next to the summary that drives this report."
    )
    return "\n".join(lines) + "\n"
257 
258 
def delta_section(
    new_summary: dict[str, dict[str, Any]],
    new_confusion: dict[str, dict[str, int]],
    new_per_prompt: list[dict[str, Any]],
    baseline_summary: dict[str, dict[str, Any]],
    baseline_confusion: dict[str, dict[str, int]],
    baseline_per_prompt: list[dict[str, Any]],
    baseline_label: str,
) -> list[str]:
    """Build the Markdown lines of the "Delta vs baseline" section.

    Three tables are produced: per-model top-1/top-3 accuracy change,
    per-skill top-1 rate change derived from the confusion matrices, and the
    change on the ten prompts with the lowest baseline top-1 rate. Values
    absent on one side are shown as ``-`` in the model table and treated as
    ``0`` / ``0.0`` in the other two tables.

    Returns:
        The section as a list of lines without trailing newlines.
    """

    def is_number(value: Any) -> bool:
        return isinstance(value, (int, float))

    def rate_cell(value: Any) -> str:
        # Three-decimal rate, or a dash when the metric is unavailable.
        return f"{value:.3f}" if is_number(value) else "-"

    def signed(change: float) -> str:
        # Explicit "+" for gains; negatives already carry their sign.
        prefix = "+" if change > 0 else ""
        return f"{prefix}{change:.3f}"

    out: list[str] = [
        "## Delta vs baseline",
        "",
        f"_baseline: {baseline_label}_",
        "",
        "### Per-model accuracy change",
        "",
        "| Model | Baseline Top-1 | New Top-1 | Delta | Baseline Top-3 | New Top-3 | Delta |",
        "| --- | --- | --- | --- | --- | --- | --- |",
    ]
    for model in sorted(set(new_summary) | set(baseline_summary)):
        old_stats = baseline_summary.get(model, {})
        new_stats = new_summary.get(model, {})
        cells: list[str] = []
        for metric in ("top1_accuracy", "top3_accuracy"):
            before = old_stats.get(metric)
            after = new_stats.get(metric)
            if is_number(before) and is_number(after):
                change_cell = signed(after - before)
            else:
                change_cell = "-"
            cells.extend([rate_cell(before), rate_cell(after), change_cell])
        out.append(f"| `{model}` | " + " | ".join(cells) + " |")

    out.extend(
        [
            "",
            "### Per-skill top-1 rate change",
            "",
            "Counts a row as correct when the predicted primary equals the expected primary.",
            "",
            "| Skill (expected) | Baseline | New | Delta |",
            "| --- | --- | --- | --- |",
        ]
    )
    for skill in sorted(set(baseline_confusion) | set(new_confusion)):
        old_row = baseline_confusion.get(skill, {})
        new_row = new_confusion.get(skill, {})
        old_total = sum(old_row.values())
        new_total = sum(new_row.values())
        old_hits = old_row.get(skill, 0)
        new_hits = new_row.get(skill, 0)
        old_rate = old_hits / old_total if old_total else 0.0
        new_rate = new_hits / new_total if new_total else 0.0
        change = new_rate - old_rate
        # Only moves of at least five percentage points get a call-out.
        if change >= 0.05:
            note = " <- improved"
        elif change <= -0.05:
            note = " <- regressed"
        else:
            note = ""
        out.append(
            f"| `{skill}` | {old_hits}/{old_total} = {old_rate:.3f} | {new_hits}/{new_total} = {new_rate:.3f} | {signed(change)}{note} |"
        )

    ranked_baseline = sorted(baseline_per_prompt, key=lambda r: r["top1_rate"])
    hardest_ids = {row["prompt_id"] for row in ranked_baseline[:10]}
    out.extend(
        [
            "",
            "### Previously-hardest prompts",
            "",
            "| Prompt | Expected | Baseline Top-1 Rate | New Top-1 Rate | Delta |",
            "| --- | --- | --- | --- | --- |",
        ]
    )
    new_rows = {row["prompt_id"]: row for row in new_per_prompt}
    old_rows = {row["prompt_id"]: row for row in baseline_per_prompt}
    for prompt_id in sorted(hardest_ids):
        old_entry = old_rows.get(prompt_id, {})
        new_entry = new_rows.get(prompt_id, {})
        old_rate = old_entry.get("top1_rate", 0.0)
        new_rate = new_entry.get("top1_rate", 0.0)
        label = old_entry.get("expected") or new_entry.get("expected") or "-"
        out.append(
            f"| {prompt_id} | `{label}` | {old_rate:.2f} | {new_rate:.2f} | {signed(new_rate - old_rate)} |"
        )
    return out
333 
334 
def main() -> int:
    """Command-line entry point: load results, render the report, write it out.

    Reads per-run records from ``--results`` and the prompt fixture from
    ``--fixture``, renders the Markdown report, optionally appends a delta
    section computed against ``--baseline``, and writes the result to
    ``--output`` (parent directories are created as needed). A short JSON
    digest is printed to stdout afterwards.

    Returns:
        ``0`` on success, ``1`` when the results directory is missing or holds
        no per-run records.
    """
    parser = argparse.ArgumentParser(description="Render router benchmark report")
    parser.add_argument("--results", type=Path, required=True, help="Directory of per-run JSON files")
    parser.add_argument("--fixture", type=Path, required=True, help="Router prompts JSONL")
    parser.add_argument("--output", type=Path, required=True, help="Destination Markdown file")
    parser.add_argument("--baseline", type=Path, help="Optional baseline results directory to compute deltas against")
    parser.add_argument("--baseline-label", type=str, help="Human label for the baseline (e.g. '2026-05-15 v2.2.0 descriptions')")
    args = parser.parse_args()

    if not args.results.exists():
        print(f"results dir missing: {args.results}", file=sys.stderr)
        return 1

    records = load_run_records(args.results)
    if not records:
        print("no per-run records found", file=sys.stderr)
        return 1
    prompts_list = load_jsonl(args.fixture)
    prompts = {row["prompt_id"]: row for row in prompts_list}

    # summary.json is optional metadata; a missing, corrupt, or non-object file
    # degrades to the "unknown" placeholders below.
    summary_path = args.results / "summary.json"
    summary_meta: dict[str, Any] = {}
    if summary_path.exists():
        try:
            loaded = json.loads(summary_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            loaded = None
        if isinstance(loaded, dict):
            summary_meta = loaded

    # Test against None explicitly: `seed or 1` would rewrite a legitimate
    # seed of 0 to 1 and put the wrong value into the reproduce command.
    recorded_seed = summary_meta.get("seed")
    meta = {
        "timestamp": summary_meta.get("timestamp") or "unknown",
        "repo_sha": summary_meta.get("repo_sha") or "unknown",
        "fixture_sha": summary_meta.get("fixture_sha") or "unknown",
        "seed": 1 if recorded_seed is None else recorded_seed,
        "total_runs": len(records),
        "models": sorted({r["model_id"] for r in records}),
        # `rep` is used as a zero-based index, so the count is the largest value plus one.
        "reps": max((r.get("rep", 0) for r in records), default=0) + 1,
    }

    summary = summarize_per_model(records)
    confusion = build_confusion(records, prompts)
    per_prompt = per_prompt_breakdown(records, prompts)

    rendered = render(summary, confusion, per_prompt, meta)

    if args.baseline:
        if not args.baseline.exists():
            # Still write the main report, but say why the delta section is absent.
            print(f"baseline dir missing, skipping delta section: {args.baseline}", file=sys.stderr)
        else:
            baseline_records = load_run_records(args.baseline)
            baseline_summary = summarize_per_model(baseline_records)
            baseline_confusion = build_confusion(baseline_records, prompts)
            baseline_per_prompt = per_prompt_breakdown(baseline_records, prompts)
            delta = delta_section(
                summary,
                confusion,
                per_prompt,
                baseline_summary,
                baseline_confusion,
                baseline_per_prompt,
                args.baseline_label or str(args.baseline),
            )
            rendered = rendered.rstrip("\n") + "\n\n" + "\n".join(delta) + "\n"

    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(rendered, encoding="utf-8")
    print(f"wrote {args.output}")
    print(json.dumps({"models": meta["models"], "total_runs": meta["total_runs"], "per_model_top1": {k: v.get("top1_accuracy") for k, v in summary.items()}}, indent=2))
    return 0
400 
401 
# Run as a script: main()'s return code becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
404
Preparing the source view

Agent Skills for Context Engineering

researcher/scripts/render_router_report.py