Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
researcher/scripts/run_benchmarks.py
#!/usr/bin/env python3
"""Run deterministic researcher benchmark checks."""

from __future__ import annotations

import argparse
import json
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


# parents[0] is this script's directory, parents[1] its parent, parents[2] the root.
ROOT = Path(__file__).resolve().parents[2]
RESEARCHER = ROOT / "researcher"


def utc_now() -> str:
    """Return the current UTC time as an ISO 8601 string with whole seconds."""
    return datetime.now(timezone.utc).replace(microsecond=0).isoformat()


def run_command(name: str, cmd: list[str]) -> dict[str, Any]:
    """Run *cmd* from ``ROOT`` and summarize the outcome as a check record.

    Args:
        name: Label stored under the ``"name"`` key of the returned record.
        cmd: Argument vector handed to ``subprocess.run`` (no shell involved).

    Returns:
        A dict with ``name``, ``passed`` (true only for exit code 0),
        ``exit_code``, ``stdout_json`` (stdout parsed as JSON, or ``None``
        when stdout is empty or not valid JSON) and ``stderr``.
    """
    completed = subprocess.run(cmd, cwd=ROOT, text=True, capture_output=True, check=False)
    parsed: Any = None
    if completed.stdout:
        try:
            parsed = json.loads(completed.stdout)
        except json.JSONDecodeError:
            # Non-JSON output is tolerated; the exit code alone decides pass/fail.
            parsed = None
    return {
        "name": name,
        "passed": completed.returncode == 0,
        "exit_code": completed.returncode,
        "stdout_json": parsed,
        "stderr": completed.stderr,
    }


def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Parse a UTF-8 JSON Lines file, skipping blank lines.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    return [
        json.loads(line)
        for line in path.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]


def load_scenarios() -> list[dict[str, Any]]:
    """Load every record from ``benchmarks/scenarios/*.jsonl`` in sorted file order."""
    scenarios: list[dict[str, Any]] = []
    for path in sorted((RESEARCHER / "benchmarks" / "scenarios").glob("*.jsonl")):
        scenarios.extend(load_jsonl(path))
    return scenarios


def load_goldens() -> dict[str, dict[str, Any]]:
    """Merge all ``benchmarks/goldens/*.json`` files into one mapping.

    Each file is iterated with ``.items()``, so it must contain a JSON object.
    Files are read in sorted order; when the same key occurs in several files
    the entry from the later file replaces the earlier one.
    """
    goldens: dict[str, dict[str, Any]] = {}
    for path in sorted((RESEARCHER / "benchmarks" / "goldens").glob("*.json")):
        data = json.loads(path.read_text(encoding="utf-8"))
        for scenario_id, expected in data.items():
            goldens[scenario_id] = expected
    return goldens


def evaluate_scenarios(
    scenarios: list[dict[str, Any]],
    goldens: dict[str, dict[str, Any]] | None = None,
) -> dict[str, Any]:
    """Cross-check scenario records against their golden expectations.

    A failure is recorded when a scenario lacks a truthy ``scenario_id`` or
    ``expected_gate``, when it has no (non-empty) golden entry, when its
    ``expected_gate`` differs from the golden's, or when a golden key has no
    scenario with that ID.

    Args:
        scenarios: Scenario records to validate.
        goldens: Mapping of scenario ID to golden record. When ``None`` (the
            default) the mapping is read from disk via ``load_goldens()``.

    Returns:
        A check record with ``name``, ``passed``, ``scenario_count`` and the
        list of ``failures``.
    """
    if goldens is None:
        goldens = load_goldens()
    failures: list[dict[str, Any]] = []
    scenario_ids = {scenario.get("scenario_id") for scenario in scenarios}
    for scenario in scenarios:
        if not scenario.get("scenario_id") or not scenario.get("expected_gate"):
            # `or` rather than a .get() default: a key that is present but
            # empty/None must also be reported as "unknown".
            failures.append(
                {
                    "scenario_id": scenario.get("scenario_id") or "unknown",
                    "reason": "missing required fields",
                }
            )
            continue
        scenario_id = scenario["scenario_id"]
        golden = goldens.get(scenario_id)
        if not golden:
            failures.append({"scenario_id": scenario_id, "reason": "missing golden"})
            continue
        if golden.get("expected_gate") != scenario.get("expected_gate"):
            failures.append(
                {
                    "scenario_id": scenario_id,
                    "reason": "expected_gate differs from golden",
                    "scenario": scenario.get("expected_gate"),
                    "golden": golden.get("expected_gate"),
                }
            )
    # Goldens that no scenario refers to are reported in sorted order.
    for extra in sorted(set(goldens) - scenario_ids):
        failures.append({"scenario_id": extra, "reason": "golden has no matching scenario"})
    return {
        "name": "adversarial-scenario-shape",
        "passed": not failures,
        "scenario_count": len(scenarios),
        "failures": failures,
    }


def run_benchmarks(record: bool) -> dict[str, Any]:
    """Run all benchmark checks and build the aggregate result.

    Two helper scripts under ``researcher/scripts`` are run with ``--json``
    using the current interpreter, then the scenario files are checked against
    the goldens.

    Args:
        record: When true, append the result as one JSON line to
            ``researcher/reports/benchmark-history.jsonl``.

    Returns:
        A dict with ``timestamp``, ``ok`` (true when every check passed),
        ``summary`` counts and the individual ``checks``.
    """
    checks = [
        run_command(
            "repo-validation",
            [sys.executable, str(RESEARCHER / "scripts" / "validate_repo.py"), "--json"],
        ),
        run_command(
            "activation-cases",
            [sys.executable, str(RESEARCHER / "scripts" / "check_activation_cases.py"), "--json"],
        ),
    ]
    scenarios = load_scenarios()
    checks.append(evaluate_scenarios(scenarios))
    failures = [check for check in checks if not check.get("passed")]
    result = {
        "timestamp": utc_now(),
        "ok": not failures,
        "summary": {"benchmarks": len(checks), "failures": len(failures), "scenarios": len(scenarios)},
        "checks": checks,
    }
    if record:
        history = RESEARCHER / "reports" / "benchmark-history.jsonl"
        # Append mode creates the file but not its directory; make sure the
        # directory exists so the first recorded run does not fail.
        history.parent.mkdir(parents=True, exist_ok=True)
        with history.open("a", encoding="utf-8") as handle:
            handle.write(json.dumps(result, sort_keys=True) + "\n")
    return result


def main() -> int:
    """Parse command-line flags, run the benchmarks and print the outcome.

    Returns:
        Process exit status: 0 when every check passed, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Run researcher benchmark harness")
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--record", action="store_true", help="append result to benchmark history")
    args = parser.parse_args()

    result = run_benchmarks(args.record)
    if args.json:
        print(json.dumps(result, indent=2))
    else:
        summary = result["summary"]
        print(
            f"Benchmarks {'passed' if result['ok'] else 'failed'}: "
            f"{summary['benchmarks']} checks, {summary['failures']} failures, {summary['scenarios']} scenarios"
        )
    return 0 if result["ok"] else 1


if __name__ == "__main__":
    sys.exit(main())