Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
researcher/benchmarks/sdk-runner/src/runRouter.ts
/**
 * Stage 2: Skill router benchmark.
 *
 * Hypothesis: the activation-scenario descriptions in v2.2.0 frontmatter let a
 * frontier model route prompts to the correct skill at high top-1 accuracy.
 *
 * Procedure (per the methodology in researcher/benchmarks/PLAN.md):
 *   1. Load 50+ ground-truth prompts from router/prompts.jsonl.
 *   2. For each (prompt, model, replication), build a routing prompt with the
 *      15 skill descriptions in deterministically-shuffled order.
 *   3. Call Agent.prompt() with settingSources: [] (no skills loaded; the
 *      descriptions in the prompt are the only signal).
 *   4. Parse strict JSON ranking. Score top-1 and top-3 accuracy.
 *   5. Persist per-run JSON + transcript; append a summary to history.
 *
 * Runs only execute when CURSOR_API_KEY is set AND a cost cap is provided.
 * --dry-run prints the plan and cost forecast and exits cleanly.
 */

import { join } from "node:path";
import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs";

import {
  RESEARCHER_DIR,
  REPO_ROOT,
  apiKeyFingerprint,
  appendHistoryEntry,
  assertBudget,
  buildRunPlan,
  fixtureSha,
  forecastCost,
  loadJsonl,
  loadSkillDescriptions,
  parseCliFlags,
  repoCommitSha,
  resolveConfig,
  resultFileName,
  runConcurrently,
  runHeader,
  shuffleSeeded,
  todayUtc,
  utcNow,
  writeJson,
} from "./common.ts";

/** One ground-truth row of the router fixture (a line of prompts.jsonl). */
interface RouterPrompt {
  prompt_id: string;
  prompt: string;
  expected_primary_skill: string;
  acceptable_secondary_skills?: string[];
  rejected_skills?: string[];
  reason?: string;
}

/** Persisted outcome of a single (prompt, model, rep) run. */
interface RouterRunRecord {
  prompt_id: string;
  model_id: string;
  rep: number;
  shuffle_seed: number;
  /** Number of SDK invocations made for this run (1..MAX_FORMAT_ATTEMPTS). */
  attempts?: number;
  status: "format_failure" | "model_unavailable" | "finished" | "error" | "cancelled" | "dry_run";
  duration_ms?: number;
  predicted_primary?: string;
  predicted_top3?: string[];
  top1_correct?: boolean;
  top3_correct?: boolean;
  raw_text?: string;
  notes?: string;
}

const DEFAULT_FIXTURE = join(RESEARCHER_DIR, "benchmarks", "router", "prompts.jsonl");
const ROUTING_PROMPT_TEMPLATE = join(RESEARCHER_DIR, "benchmarks", "router", "routing-prompt.md");
const RESULTS_DIR = join(RESEARCHER_DIR, "benchmarks", "router", "results");
const HISTORY_PATH = join(RESEARCHER_DIR, "reports", "router-history.jsonl");

// Per-invocation estimates used only for the cost forecast printed before a run.
const ESTIMATED_TOKENS_INPUT = 4000;
const ESTIMATED_TOKENS_OUTPUT = 400;
const ESTIMATED_USD_PER_RUN = 0.012;
// A run is retried once when the model's answer cannot be parsed.
const MAX_FORMAT_ATTEMPTS = 2;

/**
 * Entry point: prints the plan and forecast, enforces the run/budget caps,
 * then (unless --dry-run) executes every remaining plan item against the SDK,
 * writes one JSON record per run plus a summary, and appends to the history.
 *
 * @returns Process exit code: 0 on success or dry-run, 1 when a precondition
 *   (non-empty fixture, API key, installed SDK) is not met.
 * @throws Error when the worst-case invocation count exceeds --max-runs.
 *   Errors thrown by helpers (e.g. the budget assertion or a missing prompt
 *   template) also propagate to the caller.
 */
async function main(): Promise<number> {
  const flags = parseCliFlags(process.argv.slice(2));
  const config = resolveConfig(flags, DEFAULT_FIXTURE);

  console.log(runHeader("Router Benchmark (Stage 2)"));
  console.log(`fixture: ${config.fixturePath}`);
  console.log(`models: ${config.models.join(", ")}`);
  console.log(`reps per (prompt, model): ${config.reps}`);
  console.log(`seed: ${config.seed}`);
  console.log(`concurrency: ${config.concurrency}`);
  console.log(`resume: ${!config.noResume}`);
  console.log(`dry-run: ${config.dryRun}`);
  console.log(`api key: ${apiKeyFingerprint()}`);

  const prompts = loadJsonl<RouterPrompt>(config.fixturePath);
  console.log(`prompts loaded: ${prompts.length}`);
  if (!prompts.length) {
    console.error("No prompts in fixture; aborting.");
    return 1;
  }

  // Index prompts once instead of scanning the fixture for every plan item.
  // The first occurrence of an id wins, matching Array.prototype.find.
  const promptsById = new Map<string, RouterPrompt>();
  for (const p of prompts) {
    if (!promptsById.has(p.prompt_id)) promptsById.set(p.prompt_id, p);
  }

  const skills = loadSkillDescriptions();
  console.log(`skills available: ${skills.length}`);

  const plan = buildRunPlan(
    prompts.map((p) => p.prompt_id),
    config.models,
    config.reps,
    config.seed,
  );
  // Forecast the worst case: every run uses all of its format attempts.
  const forecast = forecastCost(
    plan,
    ESTIMATED_TOKENS_INPUT,
    ESTIMATED_TOKENS_OUTPUT,
    ESTIMATED_USD_PER_RUN * MAX_FORMAT_ATTEMPTS,
  );
  const worstCaseInvocations = plan.length * MAX_FORMAT_ATTEMPTS;
  console.log(`planned runs: ${forecast.totalRuns}`);
  console.log(`est. tokens per run: ${ESTIMATED_TOKENS_INPUT}in / ${ESTIMATED_TOKENS_OUTPUT}out`);
  console.log(`max attempts per run: ${MAX_FORMAT_ATTEMPTS}`);
  console.log(`max SDK invocations: ${worstCaseInvocations}`);
  console.log(`est. worst-case total cost: ${forecast.estimatedTotalUsd} USD`);

  if (worstCaseInvocations > config.maxRuns) {
    throw new Error(
      `Worst-case SDK invocations ${worstCaseInvocations} exceeds --max-runs ${config.maxRuns}. ` +
        "Increase --max-runs or lower the plan size.",
    );
  }
  assertBudget(plan, forecast, config);

  if (config.dryRun) {
    console.log("Dry-run: no SDK calls made.");
    const first = plan[0];
    if (first) {
      const sample = promptsById.get(first.promptId);
      const shuffled = shuffleSeeded(
        skills.map((s) => s.name),
        first.shuffleSeed,
      );
      console.log("sample plan item:", first);
      console.log("sample skill order:", shuffled);
      console.log("sample prompt:", sample?.prompt);
    }
    return 0;
  }

  const apiKey = process.env.CURSOR_API_KEY;
  if (!apiKey) {
    console.error("CURSOR_API_KEY is not set. Refusing to run.");
    return 1;
  }

  // The SDK is imported lazily so --dry-run works without it being installed.
  let cursorSdk: typeof import("@cursor/sdk") | null = null;
  try {
    cursorSdk = await import("@cursor/sdk");
  } catch {
    console.error(
      "@cursor/sdk is not installed. Run `npm install` inside researcher/benchmarks/sdk-runner before executing.",
    );
    return 1;
  }
  const { Agent, CursorAgentError } = cursorSdk;

  const promptTemplate = await loadPromptTemplate();
  const runDir = join(RESULTS_DIR, `${todayUtc()}-${config.seed}`);
  mkdirSync(runDir, { recursive: true });

  // Resume support: any plan item whose result file already exists in runDir
  // is skipped, whatever status that earlier record carries.
  const existingResults = config.noResume
    ? new Map<string, RouterRunRecord>()
    : loadExistingResults(runDir);
  const remaining = plan.filter(
    (item) => !existingResults.has(resultFileName(item.promptId, item.modelId, item.rep)),
  );
  if (existingResults.size) {
    console.log(`resume: ${existingResults.size} prior results found, ${remaining.length} runs remaining`);
  }

  const totalToExecute = remaining.length;
  const startedAt = Date.now();
  let completed = 0;
  const newResults: RouterRunRecord[] = [];
  const printLock = { value: Promise.resolve() };

  await runConcurrently(remaining, config.concurrency, async (item, _index) => {
    const prompt = promptsById.get(item.promptId);
    if (!prompt) return;
    const shuffledSkills = shuffleSeeded(skills, item.shuffleSeed);
    const filled = renderPrompt(promptTemplate, shuffledSkills, prompt.prompt);

    const record: RouterRunRecord = {
      prompt_id: item.promptId,
      model_id: item.modelId,
      rep: item.rep,
      shuffle_seed: item.shuffleSeed,
      attempts: 0,
      status: "error",
    };
    const started = Date.now();
    try {
      for (let attempt = 1; attempt <= MAX_FORMAT_ATTEMPTS; attempt++) {
        record.attempts = attempt;
        const result = await Agent.prompt(filled, {
          apiKey,
          model: { id: item.modelId },
          local: { cwd: REPO_ROOT, settingSources: [] },
        });
        record.status = result.status;
        record.raw_text = result.result ?? "";
        const parsed = parseRouterJson(result.result ?? "");
        if (parsed) {
          const top3 = parsed.slice(0, 3);
          record.predicted_primary = parsed[0];
          record.predicted_top3 = top3;
          record.top1_correct = parsed[0] === prompt.expected_primary_skill;
          record.top3_correct = top3.includes(prompt.expected_primary_skill);
          break;
        }
        record.status = "format_failure";
        record.notes =
          attempt < MAX_FORMAT_ATTEMPTS ? "format failure; retrying once" : "format failure after retry";
      }
    } catch (error: unknown) {
      if (CursorAgentError && error instanceof CursorAgentError) {
        record.status = "model_unavailable";
      } else {
        // Reset explicitly: a throw on the retry attempt must not leave the
        // "format_failure" status written by the previous attempt.
        record.status = "error";
      }
      record.notes = error instanceof Error ? error.message : String(error);
    }
    record.duration_ms = Date.now() - started;
    writeJson(join(runDir, resultFileName(item.promptId, item.modelId, item.rep)), record);
    newResults.push(record);

    completed++;
    const elapsedMs = Date.now() - startedAt;
    const avgMs = elapsedMs / completed;
    const remainingMs = Math.round(avgMs * (totalToExecute - completed));
    const top1 = record.top1_correct === true ? "T1" : record.top1_correct === false ? " " : "??";
    const padded = String(completed).padStart(4, " ");
    const total = String(totalToExecute).padStart(4, " ");
    const durationText = (record.duration_ms ?? 0).toString().padStart(5, " ");
    const line = `[${padded}/${total}] ${item.modelId.padEnd(20)} ${item.promptId} rep=${item.rep} ${record.status.padEnd(18)} ${durationText}ms ${top1} ETA=${formatDuration(remainingMs)}`;
    // Serialize console writes so concurrent workers do not interleave lines.
    printLock.value = printLock.value.then(() => {
      console.log(line);
    });
  });
  // Make sure every queued progress line is printed before the summary.
  await printLock.value;

  const results: RouterRunRecord[] = [...existingResults.values(), ...newResults];

  const summary = summarize(results, prompts);
  const metadata = {
    timestamp: utcNow(),
    repo_sha: repoCommitSha(),
    fixture_sha: fixtureSha(config.fixturePath),
    seed: config.seed,
    models: config.models,
    reps: config.reps,
    prompts: prompts.length,
  };
  writeJson(join(runDir, "summary.json"), { ...metadata, summary });
  appendHistoryEntry(HISTORY_PATH, { ...metadata, summary });

  console.log("summary:", summary);
  console.log(`raw results in ${runDir}`);
  return 0;
}

/**
 * Reads the routing prompt template from ROUTING_PROMPT_TEMPLATE.
 *
 * @returns The template file contents decoded as UTF-8.
 * @throws Error when the template file does not exist.
 */
async function loadPromptTemplate(): Promise<string> {
  if (!existsSync(ROUTING_PROMPT_TEMPLATE)) {
    throw new Error(`Routing prompt template missing: ${ROUTING_PROMPT_TEMPLATE}`);
  }
  return readFileSync(ROUTING_PROMPT_TEMPLATE, "utf-8");
}

/**
 * Fills the routing prompt template.
 *
 * Supported placeholders: `{{SKILL_BLOCK}}`, `{{USER_PROMPT}}` and
 * `{{SKILL_COUNT}}`. Substitution happens in a single pass with a function
 * replacer, so:
 *   - `$`-sequences (`$&`, `$1`, `$$`, ...) inside skill descriptions or the
 *     user prompt are inserted literally instead of being interpreted as
 *     replacement patterns;
 *   - placeholder-looking text inside an inserted value is never re-expanded;
 *   - every occurrence of a placeholder is replaced, not just the first.
 *
 * @param template Template text containing the placeholders.
 * @param shuffledSkills Skills in the order they should be listed.
 * @param userPrompt The prompt to be routed.
 * @returns The rendered prompt.
 */
function renderPrompt(
  template: string,
  shuffledSkills: Array<{ name: string; description: string }>,
  userPrompt: string,
): string {
  const skillBlock = shuffledSkills
    .map((skill, index) => `${index + 1}. ${skill.name}\n ${skill.description}`)
    .join("\n\n");
  const values = new Map<string, string>([
    ["SKILL_BLOCK", skillBlock],
    ["USER_PROMPT", userPrompt],
    ["SKILL_COUNT", String(shuffledSkills.length)],
  ]);
  return template.replace(
    /\{\{(SKILL_BLOCK|USER_PROMPT|SKILL_COUNT)\}\}/g,
    (placeholder: string, key: string) => values.get(key) ?? placeholder,
  );
}

/**
 * Extracts the skill ranking from a model response.
 *
 * The span from the first `{` to the last `}` in `raw` is parsed as JSON and
 * must be an object whose `ranking` property is a non-empty array of strings.
 *
 * @param raw Raw text returned by the model.
 * @returns The ranking, best match first, or `null` when the response does not
 *   contain a usable ranking (the caller treats `null` as a format failure).
 */
function parseRouterJson(raw: string): string[] | null {
  const match = raw.match(/\{[\s\S]*\}/);
  if (!match) return null;

  let parsed: unknown;
  try {
    parsed = JSON.parse(match[0]);
  } catch {
    return null;
  }
  if (typeof parsed !== "object" || parsed === null) return null;

  const ranking: unknown = (parsed as { ranking?: unknown }).ranking;
  // An empty ranking carries no prediction, so it cannot be scored.
  if (!Array.isArray(ranking) || ranking.length === 0) return null;

  const names: string[] = [];
  for (const value of ranking) {
    // Reject non-string entries instead of coercing them (String({}) would
    // yield "[object Object]" and be scored as a wrong answer).
    if (typeof value !== "string") return null;
    names.push(value);
  }
  return names;
}

/**
 * Loads the per-run result files already present in a run directory.
 *
 * @param runDir Directory holding one JSON file per completed run.
 * @returns Map from file name to the parsed record. `summary.json`, files not
 *   ending in `.json`, unreadable or unparsable files, and records missing
 *   `prompt_id`, `model_id` or `rep` are left out.
 */
function loadExistingResults(runDir: string): Map<string, RouterRunRecord> {
  const map = new Map<string, RouterRunRecord>();
  if (!existsSync(runDir)) return map;
  for (const entry of readdirSync(runDir)) {
    if (entry === "summary.json" || !entry.endsWith(".json")) continue;
    try {
      const parsed = JSON.parse(readFileSync(join(runDir, entry), "utf-8")) as RouterRunRecord;
      if (parsed && parsed.prompt_id && parsed.model_id !== undefined && parsed.rep !== undefined) {
        map.set(entry, parsed);
      }
    } catch {
      // Skip malformed leftovers; next sweep will overwrite.
    }
  }
  return map;
}

/**
 * Formats a millisecond duration for the progress line.
 *
 * @param ms Duration in milliseconds.
 * @returns `"XhMMm"` for an hour or more, `"XmSSs"` for a minute or more,
 *   `"Xs"` otherwise, or `"unknown"` for negative or non-finite input.
 */
function formatDuration(ms: number): string {
  if (!Number.isFinite(ms) || ms < 0) return "unknown";
  const totalSec = Math.round(ms / 1000);
  const h = Math.floor(totalSec / 3600);
  const m = Math.floor((totalSec % 3600) / 60);
  const s = totalSec % 60;
  if (h > 0) return `${h}h${m.toString().padStart(2, "0")}m`;
  if (m > 0) return `${m}m${s.toString().padStart(2, "0")}s`;
  return `${s}s`;
}

/**
 * Aggregates run records into per-model rates.
 *
 * Every record counts toward its model's `total`, so runs that failed or
 * could not be parsed lower the accuracy figures rather than being excluded.
 *
 * @param results All run records (resumed and new).
 * @param prompts The fixture prompts; only their count is reported.
 * @returns An object keyed by model id with `total`, `format_failure_rate`,
 *   `top1_accuracy` and `top3_accuracy` (rounded to 4 decimals), plus a
 *   `promptCount` entry.
 */
function summarize(results: RouterRunRecord[], prompts: RouterPrompt[]): Record<string, unknown> {
  const perModel: Record<string, { total: number; format_failures: number; top1: number; top3: number }> = {};
  for (const r of results) {
    const bucket = perModel[r.model_id] ?? { total: 0, format_failures: 0, top1: 0, top3: 0 };
    bucket.total += 1;
    if (r.status === "format_failure") bucket.format_failures += 1;
    if (r.top1_correct) bucket.top1 += 1;
    if (r.top3_correct) bucket.top3 += 1;
    perModel[r.model_id] = bucket;
  }
  const summary: Record<string, unknown> = {};
  for (const [model, b] of Object.entries(perModel)) {
    summary[model] = {
      total: b.total,
      format_failure_rate: Number((b.format_failures / b.total).toFixed(4)),
      top1_accuracy: Number((b.top1 / b.total).toFixed(4)),
      top3_accuracy: Number((b.top3 / b.total).toFixed(4)),
    };
  }
  summary["promptCount"] = prompts.length;
  return summary;
}

// Reference the import so it is not reported as unused.
void writeFileSync;

main()
  .then((code) => process.exit(code))
  .catch((error) => {
    console.error(error);
    process.exit(2);
  });