Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
researcher/benchmarks/sdk-runner/src/runEffectiveness.ts
/**
 * Stage 3: Skill effectiveness benchmark.
 *
 * Hypothesis: loading a relevant skill improves outcome quality or token
 * efficiency on tasks the skill claims to address. Irrelevant skills should
 * have no effect (negative control).
 *
 * Scaffold only in v2.2.x. Full implementation lands in v2.4.0 once the task
 * set reaches 20 and the budget for the full sweep is signed off.
 *
 * The runner already supports --dry-run to print the plan and cost forecast
 * against the currently-built task set.
 */

import { join } from "node:path";
import { existsSync, readdirSync, statSync, readFileSync, mkdirSync } from "node:fs";

import {
  RESEARCHER_DIR,
  apiKeyFingerprint,
  assertBudget,
  buildRunPlan,
  forecastCost,
  parseCliFlags,
  resolveConfig,
  runHeader,
  todayUtc,
} from "./common.ts";

/** Shape of the `metadata.json` file expected inside each task directory. */
interface EffectivenessTaskMetadata {
  id: string;
  slug: string;
  target_skill: string;
  irrelevant_skill: string;
  category: string;
  difficulty: "easy" | "medium" | "hard";
  notes?: string;
}

const TASKS_DIR = join(RESEARCHER_DIR, "benchmarks", "effectiveness", "tasks");
const RESULTS_DIR = join(RESEARCHER_DIR, "benchmarks", "effectiveness", "results");
const CONDITIONS = ["control", "target", "negative", "full", "target_plus_one", "target_plus_unrelated"] as const;

// Per-run estimates handed to forecastCost() for the cost forecast.
const ESTIMATED_TOKENS_INPUT = 20_000;
const ESTIMATED_TOKENS_OUTPUT = 4_000;
const ESTIMATED_USD_PER_RUN = 0.18;

// Fields of EffectivenessTaskMetadata that must be present as strings.
const REQUIRED_STRING_FIELDS = ["id", "slug", "target_skill", "irrelevant_skill", "category"] as const;
const DIFFICULTIES: readonly string[] = ["easy", "medium", "hard"];

/**
 * Prints the run configuration, discovers tasks, builds the
 * (task x condition) run plan and prints its cost forecast.
 *
 * @returns The process exit code: 1 when the tasks directory is missing or
 *   contains no usable task, 0 otherwise.
 */
async function main(): Promise<number> {
  const flags = parseCliFlags(process.argv.slice(2));
  // The default fixture is the tasks directory itself; reuse the constant so
  // the two paths cannot drift apart.
  const config = resolveConfig(flags, TASKS_DIR);

  console.log(runHeader("Effectiveness Benchmark (Stage 3)"));
  console.log(`tasks dir: ${TASKS_DIR}`);
  console.log(`models: ${config.models.join(", ")}`);
  console.log(`reps per (task, condition, model): ${config.reps}`);
  console.log(`seed: ${config.seed}`);
  console.log(`dry-run: ${config.dryRun}`);
  console.log(`api key: ${apiKeyFingerprint()}`);

  if (!existsSync(TASKS_DIR)) {
    console.error(`Tasks directory missing: ${TASKS_DIR}`);
    return 1;
  }

  const tasks = discoverTasks();
  console.log(`tasks discovered: ${tasks.length}`);

  if (tasks.length === 0) {
    console.error("No tasks present yet. Add at least one task under researcher/benchmarks/effectiveness/tasks/.");
    return 1;
  }

  // One plan entry per (task, condition) pair, encoded as "<task id>|<condition>".
  const planIds = tasks.flatMap((task) => CONDITIONS.map((condition) => `${task.id}|${condition}`));
  const plan = buildRunPlan(planIds, config.models, config.reps, config.seed);
  const forecast = forecastCost(plan, ESTIMATED_TOKENS_INPUT, ESTIMATED_TOKENS_OUTPUT, ESTIMATED_USD_PER_RUN);
  console.log(`planned runs: ${forecast.totalRuns}`);
  console.log(`conditions per task: ${CONDITIONS.length}`);
  console.log(`est. tokens per run: ${ESTIMATED_TOKENS_INPUT}in / ${ESTIMATED_TOKENS_OUTPUT}out`);
  console.log(`est. total cost: ${forecast.estimatedTotalUsd} USD`);

  // NOTE(review): presumably throws when the forecast exceeds the configured
  // budget; confirm against common.ts.
  assertBudget(plan, forecast, config);

  if (config.dryRun) {
    console.log("Dry-run: no SDK calls made.");
    console.log("First three tasks:");
    for (const task of tasks.slice(0, 3)) {
      console.log(` - ${task.id} target=${task.target_skill} difficulty=${task.difficulty}`);
    }
    return 0;
  }

  console.error(
    "Stage 3 executor not yet wired in v2.2.x scaffold. See researcher/benchmarks/PLAN.md for the v2.4.0 implementation contract. " +
      "Use --dry-run to validate task and config shape today.",
  );
  mkdirSync(join(RESULTS_DIR, `${todayUtc()}-${config.seed}`), { recursive: true });
  // NOTE(review): this path reports an error on stderr yet exits 0 after
  // creating an empty results directory. Kept as-is because callers may depend
  // on the exit code; confirm whether a non-zero code is wanted here.
  return 0;
}

/**
 * Lists what is wrong with a parsed `metadata.json` value.
 *
 * @param value - The result of `JSON.parse` on the metadata file.
 * @returns Human-readable problems; empty when `value` matches
 *   `EffectivenessTaskMetadata`.
 */
function metadataProblems(value: unknown): string[] {
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
    return ["expected a JSON object"];
  }
  const record = value as Record<string, unknown>;
  const problems: string[] = [];
  for (const field of REQUIRED_STRING_FIELDS) {
    if (typeof record[field] !== "string") {
      problems.push(`${field} must be a string`);
    }
  }
  // The id becomes part of every plan entry, so an empty one is unusable.
  if (record.id === "") {
    problems.push("id must not be empty");
  }
  const difficulty = record.difficulty;
  if (typeof difficulty !== "string" || !DIFFICULTIES.includes(difficulty)) {
    problems.push(`difficulty must be one of ${DIFFICULTIES.join(", ")}`);
  }
  if (record.notes !== undefined && typeof record.notes !== "string") {
    problems.push("notes must be a string when present");
  }
  return problems;
}

/**
 * Scans TASKS_DIR (in sorted entry order) for sub-directories that contain a
 * `metadata.json` file and returns the metadata of every valid task.
 *
 * Entries that are not directories, have no `metadata.json`, fail to parse, or
 * do not match `EffectivenessTaskMetadata` are skipped; the last two cases log
 * a warning.
 *
 * @returns The validated task metadata, ordered by directory name.
 */
function discoverTasks(): EffectivenessTaskMetadata[] {
  const tasks: EffectivenessTaskMetadata[] = [];
  for (const entry of readdirSync(TASKS_DIR).sort()) {
    const dir = join(TASKS_DIR, entry);
    // throwIfNoEntry: false keeps a dangling symlink or an entry removed
    // after readdirSync from aborting the whole scan.
    if (!statSync(dir, { throwIfNoEntry: false })?.isDirectory()) continue;
    const metaPath = join(dir, "metadata.json");
    if (!existsSync(metaPath)) continue;
    try {
      const parsed: unknown = JSON.parse(readFileSync(metaPath, "utf-8"));
      const problems = metadataProblems(parsed);
      if (problems.length > 0) {
        console.warn(`Skipping ${entry}: invalid metadata.json (${problems.join("; ")})`);
        continue;
      }
      // Safe: metadataProblems() has just checked every declared field.
      tasks.push(parsed as EffectivenessTaskMetadata);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping ${entry}: invalid metadata.json (${reason})`);
    }
  }
  return tasks;
}

// Run the benchmark and map the outcome to a process exit code; an unexpected
// rejection exits with code 2.
main()
  .then((code) => process.exit(code))
  .catch((error) => {
    console.error(error);
    process.exit(2);
  });