Source from repo

Agent Skills for Context Engineering

A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.

muratcankoylan · GitHub: muratcankoylan · Source repo · Original GitHub link

Files

339

Skill

n/a

Size

4.3 MB

Entrypoint

SKILL.md

Format

git-repo

Open file

researcher/benchmarks/sdk-runner/src/runEffectiveness.ts

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code · 129 lines · Free

researcher/benchmarks/sdk-runner/src/runEffectiveness.ts

/**
 * Stage 3: Skill effectiveness benchmark.
 *
 * Hypothesis: loading a relevant skill improves outcome quality or token
 * efficiency on tasks the skill claims to address. Irrelevant skills should
 * have no effect (negative control).
 *
 * Scaffold only in v2.2.x. Full implementation lands in v2.4.0 once the task
 * set reaches 20 and the budget for the full sweep is signed off.
 *
 * The runner already supports --dry-run to print the plan and cost forecast
 * against the currently-built task set.
 */
14 
15import { join } from "node:path";
16import { existsSync, readdirSync, statSync, readFileSync, mkdirSync } from "node:fs";
17 
18import {
19  RESEARCHER_DIR,
20  apiKeyFingerprint,
21  assertBudget,
22  buildRunPlan,
23  forecastCost,
24  parseCliFlags,
25  resolveConfig,
26  runHeader,
27  todayUtc,
28} from "./common.ts";
29 
/**
 * Shape of a task's `metadata.json`, as loaded by `discoverTasks`.
 */
interface EffectivenessTaskMetadata {
  /** Task identifier; joined with a condition name to form plan ids. */
  id: string;
  slug: string;
  /** Skill the task is meant to exercise; shown in the dry-run listing. */
  target_skill: string;
  /** NOTE(review): presumably the skill loaded for the negative control — confirm. */
  irrelevant_skill: string;
  category: string;
  /** Difficulty label; shown in the dry-run listing. */
  difficulty: "easy" | "medium" | "hard";
  /** Optional free-form notes. */
  notes?: string;
}
39 
/** Root of the effectiveness benchmark tree under the researcher directory. */
const EFFECTIVENESS_DIR = join(RESEARCHER_DIR, "benchmarks", "effectiveness");

/** Directory scanned for task folders (each holding a metadata.json). */
const TASKS_DIR = join(EFFECTIVENESS_DIR, "tasks");

/** Directory under which per-run result folders are created. */
const RESULTS_DIR = join(EFFECTIVENESS_DIR, "results");

/** Condition names; every task is planned once per condition. */
const CONDITIONS = [
  "control",
  "target",
  "negative",
  "full",
  "target_plus_one",
  "target_plus_unrelated",
] as const;

// Per-run estimates handed to forecastCost for the cost forecast.
const ESTIMATED_TOKENS_INPUT = 20_000;
const ESTIMATED_TOKENS_OUTPUT = 4_000;
const ESTIMATED_USD_PER_RUN = 0.18;
47 
/**
 * Entry point for the Stage 3 effectiveness benchmark scaffold.
 *
 * Prints the resolved configuration, discovers tasks under TASKS_DIR, expands
 * them into one plan id per (task, condition), prints the cost forecast and
 * hands plan, forecast and config to assertBudget. When config.dryRun is set
 * it lists the first three tasks and stops; otherwise it reports that the
 * executor is not wired yet and creates the dated results directory.
 *
 * @returns Exit code: 1 when the tasks directory is missing or no task was
 *   discovered, 0 otherwise. Errors thrown by helpers propagate to the caller.
 */
async function main(): Promise<number> {
  const flags = parseCliFlags(process.argv.slice(2));
  // TASKS_DIR is the single definition of the tasks location; passing it here
  // keeps the default fixture from drifting away from the directory scanned below.
  const config = resolveConfig(flags, TASKS_DIR);

  console.log(runHeader("Effectiveness Benchmark (Stage 3)"));
  console.log(`tasks dir: ${TASKS_DIR}`);
  console.log(`models: ${config.models.join(", ")}`);
  console.log(`reps per (task, condition, model): ${config.reps}`);
  console.log(`seed: ${config.seed}`);
  console.log(`dry-run: ${config.dryRun}`);
  console.log(`api key: ${apiKeyFingerprint()}`);

  if (!existsSync(TASKS_DIR)) {
    console.error(`Tasks directory missing: ${TASKS_DIR}`);
    return 1;
  }

  const tasks = discoverTasks();
  console.log(`tasks discovered: ${tasks.length}`);

  if (tasks.length === 0) {
    console.error("No tasks present yet. Add at least one task under researcher/benchmarks/effectiveness/tasks/.");
    return 1;
  }

  // Task-major, condition-minor order: every condition of a task is listed
  // before moving on to the next task.
  const planIds = tasks.flatMap((task) => CONDITIONS.map((condition) => `${task.id}|${condition}`));
  const plan = buildRunPlan(planIds, config.models, config.reps, config.seed);
  const forecast = forecastCost(plan, ESTIMATED_TOKENS_INPUT, ESTIMATED_TOKENS_OUTPUT, ESTIMATED_USD_PER_RUN);
  console.log(`planned runs: ${forecast.totalRuns}`);
  console.log(`conditions per task: ${CONDITIONS.length}`);
  console.log(`est. tokens per run: ${ESTIMATED_TOKENS_INPUT}in / ${ESTIMATED_TOKENS_OUTPUT}out`);
  console.log(`est. total cost: ${forecast.estimatedTotalUsd} USD`);

  // Runs before the dry-run branch, so a dry run is budget-checked as well.
  // NOTE(review): presumably throws when the forecast exceeds the budget — confirm in common.ts.
  assertBudget(plan, forecast, config);

  if (config.dryRun) {
    console.log("Dry-run: no SDK calls made.");
    console.log("First three tasks:");
    for (const task of tasks.slice(0, 3)) {
      console.log(`  - ${task.id} target=${task.target_skill} difficulty=${task.difficulty}`);
    }
    return 0;
  }

  console.error(
    "Stage 3 executor not yet wired in v2.2.x scaffold. See researcher/benchmarks/PLAN.md for the v2.4.0 implementation contract. " +
      "Use --dry-run to validate task and config shape today.",
  );
  // NOTE(review): the scaffold still creates the results directory and exits 0
  // although nothing was executed; kept as-is so existing callers see the same
  // exit code — revisit when the executor lands.
  mkdirSync(join(RESULTS_DIR, `${todayUtc()}-${config.seed}`), { recursive: true });
  return 0;
}
105 
/** Metadata fields that must be present as strings in every metadata.json. */
const REQUIRED_STRING_FIELDS = ["id", "slug", "target_skill", "irrelevant_skill", "category"] as const;

/** Difficulty labels permitted by EffectivenessTaskMetadata. */
const ALLOWED_DIFFICULTIES: readonly string[] = ["easy", "medium", "hard"];

/**
 * Checks a parsed metadata.json value against the EffectivenessTaskMetadata shape.
 *
 * @param value Result of JSON.parse on a metadata.json file.
 * @returns Human-readable problems; an empty array means the value is usable.
 */
function findMetadataProblems(value: unknown): string[] {
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
    return ["top-level value must be a JSON object"];
  }
  const record = value as Record<string, unknown>;
  const problems: string[] = [];
  for (const field of REQUIRED_STRING_FIELDS) {
    if (typeof record[field] !== "string") {
      problems.push(`"${field}" must be a string`);
    }
  }
  // An empty id would produce plan ids that cannot be told apart by task.
  if (record.id === "") {
    problems.push('"id" must not be empty');
  }
  if (typeof record.difficulty !== "string" || !ALLOWED_DIFFICULTIES.includes(record.difficulty)) {
    problems.push(`"difficulty" must be one of ${ALLOWED_DIFFICULTIES.join(", ")}`);
  }
  if (record.notes !== undefined && typeof record.notes !== "string") {
    problems.push('"notes" must be a string when present');
  }
  return problems;
}

/**
 * Loads task metadata from every sub-directory of TASKS_DIR that contains a
 * metadata.json, visiting entries in sorted name order.
 *
 * Non-directories and directories without metadata.json are skipped silently.
 * Files that fail to parse, or whose content does not match
 * EffectivenessTaskMetadata, are skipped with a warning so a malformed task
 * cannot leak `undefined` fields into the run plan.
 *
 * @returns The valid task metadata objects, in directory-name order.
 */
function discoverTasks(): EffectivenessTaskMetadata[] {
  const tasks: EffectivenessTaskMetadata[] = [];
  for (const entry of readdirSync(TASKS_DIR).sort()) {
    const dir = join(TASKS_DIR, entry);
    if (!statSync(dir).isDirectory()) continue;
    const metaPath = join(dir, "metadata.json");
    if (!existsSync(metaPath)) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(readFileSync(metaPath, "utf-8"));
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping ${entry}: invalid metadata.json (${reason})`);
      continue;
    }

    const problems = findMetadataProblems(parsed);
    if (problems.length > 0) {
      console.warn(`Skipping ${entry}: invalid metadata.json (${problems.join("; ")})`);
      continue;
    }
    // Safe: findMetadataProblems verified every field the interface declares.
    tasks.push(parsed as EffectivenessTaskMetadata);
  }
  return tasks;
}
122 
/** Ends the process with the exit code resolved by main(). */
const exitWithCode = (code: number): never => process.exit(code);

/** Logs an unexpected failure and ends the process with exit code 2. */
const reportFatalAndExit = (error: unknown): never => {
  console.error(error);
  return process.exit(2);
};

// Run the benchmark; a rejected promise is reported and mapped to exit code 2.
main().then(exitWithCode).catch(reportFatalAndExit);
129

Preparing the source view

Agent Skills for Context Engineering

researcher/benchmarks/sdk-runner/src/runEffectiveness.ts