Source from repo

Agent Skills for Context Engineering

A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.

muratcankoylan · GitHub: muratcankoylan · Source repo · Original GitHub link

Files

339

Skill

n/a

Size

4.3 MB

Entrypoint

SKILL.md

Format

git-repo

Open file

researcher/scripts/run_benchmarks.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code · 143 lines · Free

researcher/scripts/run_benchmarks.py

1#!/usr/bin/env python3
2"""Run deterministic researcher benchmark checks."""
3 
4from __future__ import annotations
5 
6import argparse
7import json
8import subprocess
9import sys
10from datetime import datetime, timezone
11from pathlib import Path
12from typing import Any
13 
14 
# Repository root: two directory levels above the folder containing this script.
ROOT = Path(__file__).resolve().parents[2]
# Directory holding the researcher scripts, benchmarks, and reports.
RESEARCHER = ROOT.joinpath("researcher")
17 
18 
def utc_now() -> str:
    """Return the current UTC time as an ISO 8601 string, truncated to whole seconds."""
    current = datetime.now(timezone.utc)
    whole_seconds = current.replace(microsecond=0)
    return whole_seconds.isoformat()
21 
22 
def run_command(name: str, cmd: list[str]) -> dict[str, Any]:
    """Run ``cmd`` with the repository root as working directory and summarize it.

    Args:
        name: Label stored under the ``"name"`` key of the result.
        cmd: Argument vector handed to ``subprocess.run``.

    Returns:
        A dict holding the label, a ``passed`` flag (true when the exit code
        is zero), the exit code, stdout decoded as JSON (``None`` when stdout
        is empty or is not valid JSON), and the captured stderr text.
    """
    proc = subprocess.run(cmd, cwd=ROOT, text=True, capture_output=True, check=False)
    try:
        # Empty output is treated the same as unparseable output: no payload.
        stdout_json: Any = json.loads(proc.stdout) if proc.stdout else None
    except json.JSONDecodeError:
        stdout_json = None
    exit_code = proc.returncode
    return {
        "name": name,
        "passed": exit_code == 0,
        "exit_code": exit_code,
        "stdout_json": stdout_json,
        "stderr": proc.stderr,
    }
38 
39 
def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load every record from a UTF-8 JSON Lines file.

    Records are delimited by newlines only. Lines that are empty or contain
    only whitespace are skipped.

    Args:
        path: Location of the JSON Lines file to read.

    Returns:
        The decoded records, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Iterate the file rather than calling ``str.splitlines()``: splitlines
    # also breaks on characters such as U+2028, U+2029 and U+0085, which JSON
    # permits unescaped inside strings, so it could cut a valid record in half.
    with path.open(encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]
46 
47 
def load_scenarios() -> list[dict[str, Any]]:
    """Collect the records of every scenario ``.jsonl`` file, in sorted path order."""
    scenario_dir = RESEARCHER / "benchmarks" / "scenarios"
    return [
        record
        for jsonl_path in sorted(scenario_dir.glob("*.jsonl"))
        for record in load_jsonl(jsonl_path)
    ]
53 
54 
def load_goldens() -> dict[str, dict[str, Any]]:
    """Merge every golden ``.json`` file into one mapping keyed by scenario id.

    Files are read in sorted path order, so when the same id appears in more
    than one file the entry from the later file replaces the earlier one.
    """
    merged: dict[str, dict[str, Any]] = {}
    golden_dir = RESEARCHER / "benchmarks" / "goldens"
    for golden_path in sorted(golden_dir.glob("*.json")):
        payload = json.loads(golden_path.read_text(encoding="utf-8"))
        merged.update(payload.items())
    return merged
62 
63 
def evaluate_scenarios(scenarios: list[dict[str, Any]]) -> dict[str, Any]:
    """Cross-check scenario records against the stored goldens.

    A failure is recorded when a scenario lacks a non-empty ``scenario_id`` or
    ``expected_gate``, when it has no (non-empty) golden entry, when its
    ``expected_gate`` differs from the golden's, or when a golden id has no
    matching scenario.

    Args:
        scenarios: Scenario records to validate.

    Returns:
        A check result with the fixed name ``"adversarial-scenario-shape"``,
        a ``passed`` flag (true when no failure was recorded), the number of
        scenarios examined, and the list of failure records.
    """
    goldens = load_goldens()
    failures: list[dict[str, Any]] = []
    scenario_ids = {scenario.get("scenario_id") for scenario in scenarios}
    for scenario in scenarios:
        scenario_id = scenario.get("scenario_id")
        expected_gate = scenario.get("expected_gate")
        if not scenario_id or not expected_gate:
            # Use ``or`` rather than a ``.get`` default so that an id which is
            # present but empty/None is still reported as "unknown".
            failures.append(
                {"scenario_id": scenario_id or "unknown", "reason": "missing required fields"}
            )
            continue
        golden = goldens.get(scenario_id)
        if not golden:
            failures.append({"scenario_id": scenario_id, "reason": "missing golden"})
            continue
        golden_gate = golden.get("expected_gate")
        if golden_gate != expected_gate:
            failures.append(
                {
                    "scenario_id": scenario_id,
                    "reason": "expected_gate differs from golden",
                    "scenario": expected_gate,
                    "golden": golden_gate,
                }
            )
    # Goldens that no scenario refers to are reported in sorted id order.
    for extra in sorted(set(goldens) - scenario_ids):
        failures.append({"scenario_id": extra, "reason": "golden has no matching scenario"})
    return {
        "name": "adversarial-scenario-shape",
        "passed": not failures,
        "scenario_count": len(scenarios),
        "failures": failures,
    }
94 
95 
def run_benchmarks(record: bool) -> dict[str, Any]:
    """Run every benchmark check and assemble the combined report.

    Two validation scripts are executed as subprocesses with the current
    interpreter, then the scenario records are checked against the goldens.

    Args:
        record: When true, append the report as one JSON line to
            ``reports/benchmark-history.jsonl`` under the researcher directory.

    Returns:
        A report containing a UTC timestamp, an overall ``ok`` flag (true when
        every check passed), summary counts, and the individual check results.
    """
    scripts = RESEARCHER / "scripts"
    checks = [
        run_command(
            "repo-validation",
            [sys.executable, str(scripts / "validate_repo.py"), "--json"],
        ),
        run_command(
            "activation-cases",
            [sys.executable, str(scripts / "check_activation_cases.py"), "--json"],
        ),
    ]
    scenarios = load_scenarios()
    checks.append(evaluate_scenarios(scenarios))
    failures = [check for check in checks if not check.get("passed")]
    result = {
        "timestamp": utc_now(),
        "ok": not failures,
        "summary": {"benchmarks": len(checks), "failures": len(failures), "scenarios": len(scenarios)},
        "checks": checks,
    }
    if record:
        history = RESEARCHER / "reports" / "benchmark-history.jsonl"
        # The reports directory may not exist yet (e.g. on a fresh checkout);
        # create it so recording cannot fail after all checks have already run.
        history.parent.mkdir(parents=True, exist_ok=True)
        with history.open("a", encoding="utf-8") as handle:
            handle.write(json.dumps(result, sort_keys=True) + "\n")
    return result
121 
122 
def main() -> int:
    """Parse the CLI flags, run the benchmarks, print the outcome, and return the exit status.

    Returns:
        ``0`` when the report's ``ok`` flag is true, otherwise ``1``.
    """
    parser = argparse.ArgumentParser(description="Run researcher benchmark harness")
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--record", action="store_true", help="append result to benchmark history")
    options = parser.parse_args()

    outcome = run_benchmarks(options.record)
    succeeded = outcome["ok"]
    exit_status = 0 if succeeded else 1

    if options.json:
        # Machine-readable mode: emit the full report.
        print(json.dumps(outcome, indent=2))
        return exit_status

    counts = outcome["summary"]
    verdict = "passed" if succeeded else "failed"
    print(
        f"Benchmarks {verdict}: "
        f"{counts['benchmarks']} checks, {counts['failures']} failures, {counts['scenarios']} scenarios"
    )
    return exit_status
139 
140 
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    raise SystemExit(main())
143

Agent Skills for Context Engineering

researcher/scripts/run_benchmarks.py

Preparing the source view

Agent Skills for Context Engineering

researcher/scripts/run_benchmarks.py