Source from repo
Agent Skills for Context Engineering

A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
muratcankoylan · GitHub: muratcankoylan · Source repo · Original GitHub link
Files
339
Skill
n/a
Size
4.3 MB
Entrypoint
SKILL.md
Format
git-repo
Open file
researcher/benchmarks/sdk-runner/src/runRouter.ts

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code · 355 lines · Free
researcher/benchmarks/sdk-runner/src/runRouter.ts
1/**
2 * Stage 2: Skill router benchmark.
3 *
4 * Hypothesis: the activation-scenario descriptions in v2.2.0 frontmatter let a
5 * frontier model route prompts to the correct skill at high top-1 accuracy.
6 *
7 * Procedure (per the methodology in researcher/benchmarks/PLAN.md):
8 *   1. Load 50+ ground-truth prompts from router/prompts.jsonl.
9 *   2. For each (prompt, model, replication), build a routing prompt with the
10 *      15 skill descriptions in deterministically-shuffled order.
11 *   3. Call Agent.prompt() with settingSources: [] (no skills loaded; the
12 *      descriptions in the prompt are the only signal).
13 *   4. Parse strict JSON ranking. Score top-1 and top-3 accuracy.
14 *   5. Persist per-run JSON + transcript; append a summary to history.
15 *
16 * Runs only execute when CURSOR_API_KEY is set AND a cost cap is provided.
17 * --dry-run prints the plan and cost forecast and exits cleanly.
18 */
19 
20import { join } from "node:path";
21import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
22 
23import {
24  RESEARCHER_DIR,
25  REPO_ROOT,
26  apiKeyFingerprint,
27  appendHistoryEntry,
28  assertBudget,
29  buildRunPlan,
30  fixtureSha,
31  forecastCost,
32  loadJsonl,
33  loadSkillDescriptions,
34  parseCliFlags,
35  repoCommitSha,
36  resolveConfig,
37  resultFileName,
38  runConcurrently,
39  runHeader,
40  shuffleSeeded,
41  todayUtc,
42  utcNow,
43  writeJson,
44} from "./common.ts";
45 
/**
 * One ground-truth row of the router fixture, as loaded from the prompts
 * JSONL file.
 */
interface RouterPrompt {
  /** Identifier used to build the run plan and to find the prompt again for each run. */
  prompt_id: string;
  /** User prompt text substituted into the routing template. */
  prompt: string;
  /** Skill name compared against the model's ranking for top-1 / top-3 scoring. */
  expected_primary_skill: string;
  /** Optional fixture annotation; not read anywhere in this file. */
  acceptable_secondary_skills?: string[];
  /** Optional fixture annotation; not read anywhere in this file. */
  rejected_skills?: string[];
  /** Optional fixture annotation; not read anywhere in this file. */
  reason?: string;
}
54 
/**
 * Outcome labels a run record can carry. In this file "error" is the initial
 * value, "format_failure" is set when no ranking could be parsed,
 * "model_unavailable" is set when the SDK error class is caught, and the
 * remaining values are copied from the SDK result (or unused here).
 */
type RouterRunStatus =
  | "format_failure"
  | "model_unavailable"
  | "finished"
  | "error"
  | "cancelled"
  | "dry_run";

/**
 * Per-run result written as one JSON file per (prompt, model, replication)
 * and later aggregated into the summary.
 */
interface RouterRunRecord {
  /** Prompt the run was executed for. */
  prompt_id: string;
  /** Model the routing prompt was sent to. */
  model_id: string;
  /** Replication index from the run plan. */
  rep: number;
  /** Seed used to shuffle the skill descriptions for this run. */
  shuffle_seed: number;
  /** Number of the last attempt that was started for this run. */
  attempts?: number;
  /** Final outcome label for the run. */
  status: RouterRunStatus;
  /** Elapsed milliseconds across all attempts of the run. */
  duration_ms?: number;
  /** First entry of the parsed ranking. */
  predicted_primary?: string;
  /** First three entries of the parsed ranking. */
  predicted_top3?: string[];
  /** Whether the first ranked skill equals the expected primary skill. */
  top1_correct?: boolean;
  /** Whether the expected primary skill appears in the first three ranked skills. */
  top3_correct?: boolean;
  /** Text returned by the most recent attempt. */
  raw_text?: string;
  /** Free-form note: retry information or the message of a caught error. */
  notes?: string;
}
70 
/** Directory holding the router fixture, the prompt template, and the results. */
const ROUTER_BENCHMARK_DIR = join(RESEARCHER_DIR, "benchmarks", "router");

/** Fixture used when the CLI does not name another one. */
const DEFAULT_FIXTURE = join(ROUTER_BENCHMARK_DIR, "prompts.jsonl");
/** Markdown template containing the {{...}} placeholders filled per run. */
const ROUTING_PROMPT_TEMPLATE = join(ROUTER_BENCHMARK_DIR, "routing-prompt.md");
/** Parent directory of the per-sweep result folders. */
const RESULTS_DIR = join(ROUTER_BENCHMARK_DIR, "results");
/** JSONL file that receives one summary entry per completed sweep. */
const HISTORY_PATH = join(RESEARCHER_DIR, "reports", "router-history.jsonl");

// Forecast inputs: per-run token estimates and the per-run cost estimate that
// is multiplied by the attempt limit for the worst-case figure.
const ESTIMATED_TOKENS_INPUT = 4000;
const ESTIMATED_TOKENS_OUTPUT = 400;
const ESTIMATED_USD_PER_RUN = 0.012;
/** Upper bound on SDK calls made for a single planned run. */
const MAX_FORMAT_ATTEMPTS = 2;
80 
/**
 * Runs the Stage 2 router benchmark end to end.
 *
 * Steps: resolve CLI configuration, load the prompt fixture and the skill
 * descriptions, build the run plan and cost forecast, enforce the run and
 * budget limits, then (unless this is a dry run) send every planned routing
 * prompt to the SDK, write one result file per run, and write/append the
 * aggregated summary.
 *
 * @returns Process exit code: 0 after a dry run or a completed sweep, 1 when
 *   the fixture is empty, CURSOR_API_KEY is unset, or the SDK cannot be
 *   imported.
 * @throws Error when the worst-case number of SDK invocations exceeds
 *   `config.maxRuns`. Helpers such as `assertBudget` may also throw.
 */
async function main(): Promise<number> {
  const flags = parseCliFlags(process.argv.slice(2));
  const config = resolveConfig(flags, DEFAULT_FIXTURE);

  console.log(runHeader("Router Benchmark (Stage 2)"));
  console.log(`fixture: ${config.fixturePath}`);
  console.log(`models: ${config.models.join(", ")}`);
  console.log(`reps per (prompt, model): ${config.reps}`);
  console.log(`seed: ${config.seed}`);
  console.log(`concurrency: ${config.concurrency}`);
  console.log(`resume: ${!config.noResume}`);
  console.log(`dry-run: ${config.dryRun}`);
  console.log(`api key: ${apiKeyFingerprint()}`);

  const prompts = loadJsonl<RouterPrompt>(config.fixturePath);
  console.log(`prompts loaded: ${prompts.length}`);
  if (!prompts.length) {
    console.error("No prompts in fixture; aborting.");
    return 1;
  }

  // Index prompts once so each run is a constant-time lookup. The first entry
  // wins on duplicate ids, which matches a front-to-back linear search.
  const promptsById = new Map<string, RouterPrompt>();
  for (const candidate of prompts) {
    if (!promptsById.has(candidate.prompt_id)) {
      promptsById.set(candidate.prompt_id, candidate);
    }
  }

  const skills = loadSkillDescriptions();
  console.log(`skills available: ${skills.length}`);

  const plan = buildRunPlan(
    prompts.map((p) => p.prompt_id),
    config.models,
    config.reps,
    config.seed,
  );
  // The per-run cost is scaled by the attempt limit so the forecast is a
  // worst case (every run needing every attempt).
  const forecast = forecastCost(
    plan,
    ESTIMATED_TOKENS_INPUT,
    ESTIMATED_TOKENS_OUTPUT,
    ESTIMATED_USD_PER_RUN * MAX_FORMAT_ATTEMPTS,
  );
  const worstCaseInvocations = plan.length * MAX_FORMAT_ATTEMPTS;
  console.log(`planned runs: ${forecast.totalRuns}`);
  console.log(`est. tokens per run: ${ESTIMATED_TOKENS_INPUT}in / ${ESTIMATED_TOKENS_OUTPUT}out`);
  console.log(`max attempts per run: ${MAX_FORMAT_ATTEMPTS}`);
  console.log(`max SDK invocations: ${worstCaseInvocations}`);
  console.log(`est. worst-case total cost: ${forecast.estimatedTotalUsd} USD`);

  if (worstCaseInvocations > config.maxRuns) {
    throw new Error(
      `Worst-case SDK invocations ${worstCaseInvocations} exceeds --max-runs ${config.maxRuns}. ` +
        "Increase --max-runs or lower the plan size.",
    );
  }
  assertBudget(plan, forecast, config);

  if (config.dryRun) {
    console.log("Dry-run: no SDK calls made.");
    const firstItem = plan[0];
    if (firstItem) {
      const sample = promptsById.get(firstItem.promptId);
      const shuffled = shuffleSeeded(skills.map((s) => s.name), firstItem.shuffleSeed);
      console.log("sample plan item:", firstItem);
      console.log("sample skill order:", shuffled);
      console.log("sample prompt:", sample?.prompt);
    }
    return 0;
  }

  if (!process.env.CURSOR_API_KEY) {
    console.error("CURSOR_API_KEY is not set. Refusing to run.");
    return 1;
  }

  // The SDK is imported lazily so that dry runs work without it installed.
  let cursorSdk: typeof import("@cursor/sdk") | null = null;
  try {
    cursorSdk = await import("@cursor/sdk");
  } catch {
    console.error(
      "@cursor/sdk is not installed. Run `npm install` inside researcher/benchmarks/sdk-runner before executing.",
    );
    return 1;
  }
  const { Agent, CursorAgentError } = cursorSdk;

  const promptTemplate = await loadPromptTemplate();
  const runDir = join(RESULTS_DIR, `${todayUtc()}-${config.seed}`);
  mkdirSync(runDir, { recursive: true });

  // Resume: any plan item whose result file already exists in runDir is skipped.
  const existingResults = config.noResume ? new Map<string, RouterRunRecord>() : loadExistingResults(runDir);
  const remaining = plan.filter((item) => !existingResults.has(resultFileName(item.promptId, item.modelId, item.rep)));
  if (existingResults.size) {
    console.log(`resume: ${existingResults.size} prior results found, ${remaining.length} runs remaining`);
  }

  const totalToExecute = remaining.length;
  const startedAt = Date.now();
  let completed = 0;
  const newResults: RouterRunRecord[] = [];
  const printLock = { value: Promise.resolve() };

  await runConcurrently(remaining, config.concurrency, async (item, _index) => {
    const prompt = promptsById.get(item.promptId);
    if (!prompt) return;
    const shuffledSkills = shuffleSeeded(skills, item.shuffleSeed);
    const filled = renderPrompt(promptTemplate, shuffledSkills, prompt.prompt);

    const record: RouterRunRecord = {
      prompt_id: item.promptId,
      model_id: item.modelId,
      rep: item.rep,
      shuffle_seed: item.shuffleSeed,
      attempts: 0,
      status: "error",
    };
    const started = Date.now();
    try {
      for (let attempt = 1; attempt <= MAX_FORMAT_ATTEMPTS; attempt++) {
        record.attempts = attempt;
        const result = await Agent.prompt(filled, {
          apiKey: process.env.CURSOR_API_KEY!,
          model: { id: item.modelId },
          local: { cwd: REPO_ROOT, settingSources: [] },
        });
        record.status = result.status;
        record.raw_text = result.result ?? "";
        const parsed = parseRouterJson(result.result ?? "");
        if (parsed) {
          record.predicted_primary = parsed[0];
          record.predicted_top3 = parsed.slice(0, 3);
          record.top1_correct = parsed[0] === prompt.expected_primary_skill;
          record.top3_correct = parsed.slice(0, 3).includes(prompt.expected_primary_skill);
          break;
        }
        // NOTE(review): a non-"finished" SDK status is overwritten here as
        // well whenever no ranking can be parsed — confirm that is intended.
        record.status = "format_failure";
        record.notes = attempt < MAX_FORMAT_ATTEMPTS ? "format failure; retrying once" : "format failure after retry";
      }
    } catch (error) {
      if (CursorAgentError && error instanceof CursorAgentError) {
        record.status = "model_unavailable";
        record.notes = error.message;
      } else {
        // Reset explicitly: an earlier attempt may have left "format_failure"
        // behind, which would mislabel an exception thrown on the retry.
        record.status = "error";
        record.notes = error instanceof Error ? error.message : String(error);
      }
    }
    record.duration_ms = Date.now() - started;
    writeJson(join(runDir, resultFileName(item.promptId, item.modelId, item.rep)), record);
    newResults.push(record);

    completed++;
    const elapsedMs = Date.now() - startedAt;
    const avgMs = elapsedMs / completed;
    const remainingMs = Math.round(avgMs * (totalToExecute - completed));
    // "T1" = top-1 hit, blank = miss, "??" = no prediction was scored.
    const top1 = record.top1_correct === true ? "T1" : record.top1_correct === false ? "  " : "??";
    const padded = String(completed).padStart(4, " ");
    const total = String(totalToExecute).padStart(4, " ");
    const line = `[${padded}/${total}] ${item.modelId.padEnd(20)} ${item.promptId} rep=${item.rep} ${record.status.padEnd(18)} ${(record.duration_ms ?? 0)
      .toString()
      .padStart(5, " ")}ms ${top1} ETA=${formatDuration(remainingMs)}`;
    // Serialize console writes so concurrent workers do not interleave lines.
    printLock.value = printLock.value.then(() => {
      console.log(line);
    });
  });

  // The summary covers resumed records as well as the ones produced just now.
  const results: RouterRunRecord[] = [...existingResults.values(), ...newResults];

  const summary = summarize(results, prompts);
  const metadata = {
    timestamp: utcNow(),
    repo_sha: repoCommitSha(),
    fixture_sha: fixtureSha(config.fixturePath),
    seed: config.seed,
    models: config.models,
    reps: config.reps,
    prompts: prompts.length,
  };
  writeJson(join(runDir, "summary.json"), { ...metadata, summary });
  appendHistoryEntry(HISTORY_PATH, { ...metadata, summary });

  console.log("summary:", summary);
  console.log(`raw results in ${runDir}`);
  return 0;
}
259 
/**
 * Reads the routing prompt template from ROUTING_PROMPT_TEMPLATE.
 *
 * Uses the module's static `node:fs` imports rather than importing the
 * module again at call time. The function stays async so existing
 * `await loadPromptTemplate()` callers are unaffected.
 *
 * @returns The template file contents decoded as UTF-8.
 * @throws Error (as a rejected promise) when the template file does not exist.
 */
async function loadPromptTemplate(): Promise<string> {
  if (!existsSync(ROUTING_PROMPT_TEMPLATE)) {
    throw new Error(`Routing prompt template missing: ${ROUTING_PROMPT_TEMPLATE}`);
  }
  return readFileSync(ROUTING_PROMPT_TEMPLATE, "utf-8");
}
267 
/**
 * Fills the routing template with the skill list, the user prompt, and the
 * skill count.
 *
 * Substitution happens in a single pass with a function replacer, so:
 *  - `$`-sequences such as `$&` or `$$` inside skill descriptions or the user
 *    prompt are inserted verbatim instead of being expanded as replacement
 *    patterns;
 *  - placeholder-looking text inside an inserted value is never substituted
 *    again;
 *  - every occurrence of a placeholder in the template is filled.
 *
 * @param template Template text containing `{{SKILL_BLOCK}}`,
 *   `{{USER_PROMPT}}` and/or `{{SKILL_COUNT}}`.
 * @param shuffledSkills Skills in the order they should be presented; each is
 *   rendered as a 1-based numbered entry with its description on the next line.
 * @param userPrompt Prompt text to route.
 * @returns The template with all known placeholders replaced.
 */
function renderPrompt(
  template: string,
  shuffledSkills: Array<{ name: string; description: string }>,
  userPrompt: string,
): string {
  const skillBlock = shuffledSkills
    .map((skill, index) => `${index + 1}. ${skill.name}\n   ${skill.description}`)
    .join("\n\n");
  const substitutions = new Map<string, string>([
    ["SKILL_BLOCK", skillBlock],
    ["USER_PROMPT", userPrompt],
    ["SKILL_COUNT", String(shuffledSkills.length)],
  ]);
  return template.replace(
    /\{\{(SKILL_BLOCK|USER_PROMPT|SKILL_COUNT)\}\}/g,
    (placeholder: string, key: string) => substitutions.get(key) ?? placeholder,
  );
}
281 
/**
 * Extracts the skill ranking from a model reply.
 *
 * The span from the first `{` to the last `}` in the reply is parsed as JSON
 * and must be an object whose `ranking` property is a non-empty array of
 * strings. Anything else is reported as `null` so the caller can treat it as
 * a format failure; in particular an empty ranking or a ranking containing
 * non-string entries is rejected rather than coerced.
 *
 * @param raw Raw text returned by the model.
 * @returns The ranked skill names in order, or `null` when no valid ranking
 *   could be extracted.
 */
function parseRouterJson(raw: string): string[] | null {
  const match = raw.match(/\{[\s\S]*\}/);
  if (!match) return null;

  let parsed: unknown;
  try {
    parsed = JSON.parse(match[0]);
  } catch {
    return null;
  }
  if (typeof parsed !== "object" || parsed === null) return null;

  const ranking: unknown = (parsed as { ranking?: unknown }).ranking;
  if (!Array.isArray(ranking) || ranking.length === 0) return null;

  const names: string[] = [];
  for (const entry of ranking) {
    // A non-string entry means the reply did not follow the requested format.
    if (typeof entry !== "string") return null;
    names.push(entry);
  }
  return names;
}
295 
/**
 * Collects the per-run result files already present in a run directory.
 *
 * Only `*.json` files other than `summary.json` are considered. A file is
 * kept when it parses as JSON and carries a truthy `prompt_id` plus defined
 * `model_id` and `rep`; unreadable or malformed files are skipped silently.
 *
 * @param runDir Directory to scan; a missing directory yields an empty map.
 * @returns Map from file name (as listed in the directory) to the parsed record.
 */
function loadExistingResults(runDir: string): Map<string, RouterRunRecord> {
  const resumed = new Map<string, RouterRunRecord>();
  if (!existsSync(runDir)) {
    return resumed;
  }

  const candidates = readdirSync(runDir).filter(
    (fileName) => fileName.endsWith(".json") && fileName !== "summary.json",
  );
  for (const fileName of candidates) {
    let record: RouterRunRecord;
    try {
      record = JSON.parse(readFileSync(join(runDir, fileName), "utf-8")) as RouterRunRecord;
    } catch {
      // Skip malformed leftovers; next sweep will overwrite.
      continue;
    }
    if (!record || !record.prompt_id || record.model_id === undefined || record.rep === undefined) {
      continue;
    }
    resumed.set(fileName, record);
  }
  return resumed;
}
312 
/**
 * Renders a millisecond duration as a short human-readable string.
 *
 * The value is rounded to whole seconds and shown with its two largest
 * units: `XhYYm` from one hour up, `XmYYs` from one minute up, otherwise `Xs`.
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration, or "unknown" for negative or non-finite input.
 */
function formatDuration(ms: number): string {
  const unusable = ms < 0 || !Number.isFinite(ms);
  if (unusable) {
    return "unknown";
  }

  const twoDigits = (value: number): string => value.toString().padStart(2, "0");
  const wholeSeconds = Math.round(ms / 1000);
  const hours = Math.floor(wholeSeconds / 3600);
  const minutes = Math.floor((wholeSeconds % 3600) / 60);
  const seconds = wholeSeconds % 60;

  if (hours > 0) {
    return `${hours}h${twoDigits(minutes)}m`;
  }
  return minutes > 0 ? `${minutes}m${twoDigits(seconds)}s` : `${seconds}s`;
}
323 
/**
 * Aggregates run records into per-model rates.
 *
 * For every model id the result holds the number of records, the share with
 * status "format_failure", and the shares with a top-1 / top-3 hit, each
 * rounded to four decimals. The number of fixture prompts is stored under
 * the `promptCount` key.
 *
 * @param results Run records to aggregate.
 * @param prompts Fixture prompts; only their count is used.
 * @returns Object keyed by model id, plus `promptCount`.
 */
function summarize(results: RouterRunRecord[], prompts: RouterPrompt[]): Record<string, unknown> {
  type Tally = { total: number; format_failures: number; top1: number; top3: number };
  const tallies: Record<string, Tally> = {};

  for (const run of results) {
    const tally = tallies[run.model_id] ?? { total: 0, format_failures: 0, top1: 0, top3: 0 };
    tallies[run.model_id] = tally;
    tally.total += 1;
    tally.format_failures += run.status === "format_failure" ? 1 : 0;
    tally.top1 += run.top1_correct ? 1 : 0;
    tally.top3 += run.top3_correct ? 1 : 0;
  }

  // Share of `count` in `total`, rounded to four decimal places.
  const share = (count: number, total: number): number => Number((count / total).toFixed(4));

  const report: Record<string, unknown> = {};
  for (const [modelId, tally] of Object.entries(tallies)) {
    const { total, format_failures, top1, top3 } = tally;
    report[modelId] = {
      total,
      format_failure_rate: share(format_failures, total),
      top1_accuracy: share(top1, total),
      top3_accuracy: share(top3, total),
    };
  }
  report["promptCount"] = prompts.length;
  return report;
}
346 
// Reference the import so it is not reported as unused.
void writeFileSync;

// Script entry point: exit with the code main() resolves to, or log the
// failure and exit with 2 when it rejects.
void (async () => {
  try {
    const code = await main();
    process.exit(code);
  } catch (error) {
    console.error(error);
    process.exit(2);
  }
})();
355
Preparing the source view

Agent Skills for Context Engineering

researcher/benchmarks/sdk-runner/src/runRouter.ts