Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
researcher/benchmarks/sdk-runner/src/common.ts
/**
 * Shared utilities for the researcher SDK benchmark runner.
 *
 * Pure helpers only. No SDK calls live here so the helpers can be
 * unit-tested or invoked from --dry-run without an API key.
 */

import { createHash } from "node:crypto";
import { execSync } from "node:child_process";
import { mkdirSync, readFileSync, writeFileSync, existsSync, appendFileSync } from "node:fs";
import { dirname, join, resolve } from "node:path";
import { fileURLToPath } from "node:url";

/** Directory that contains this module. */
export const SDK_RUNNER_DIR = dirname(fileURLToPath(import.meta.url));
/** Parent of {@link SDK_RUNNER_DIR}. */
export const RUNNER_ROOT = resolve(SDK_RUNNER_DIR, "..");
/** Two levels above {@link RUNNER_ROOT}; the corpus index is looked up under this directory. */
export const RESEARCHER_DIR = resolve(RUNNER_ROOT, "..", "..");
/** Parent of {@link RESEARCHER_DIR}; used as the git working directory and as the base for skill paths. */
export const REPO_ROOT = resolve(RESEARCHER_DIR, "..");

export type SkillId = string;

/** Fully-defaulted configuration produced by {@link resolveConfig}. */
export interface ResolvedConfig {
  models: string[];
  reps: number;
  maxRuns: number;
  maxBudgetUsd: number;
  seed: number;
  fixturePath: string;
  dryRun: boolean;
  unsafeNoCostCap: boolean;
  concurrency: number;
  noResume: boolean;
}

/** One (prompt, model, repetition) cell of the benchmark plan. */
export interface RunPlanItem {
  promptId: string;
  modelId: string;
  rep: number;
  shuffleSeed: number;
}

/** Raw command-line flags; optional fields are absent when the flag was not passed. */
export interface CliFlags {
  models?: string[];
  reps?: number;
  maxRuns?: number;
  maxBudgetUsd?: number;
  seed?: number;
  fixture?: string;
  dryRun: boolean;
  unsafeNoCostCap: boolean;
  concurrency?: number;
  noResume: boolean;
}

const DEFAULT_MODELS = ["composer-2"];

/**
 * Parses the runner's command-line arguments.
 *
 * @param argv Argument tokens (without the node/script prefix).
 * @returns The flags that were present; boolean flags default to `false`.
 * @throws Error on an unknown `--flag`, on a value-taking flag whose value is
 *   missing (end of input or followed directly by another `--flag`), and on a
 *   numeric flag whose value is not a finite number.
 */
export function parseCliFlags(argv: string[]): CliFlags {
  const flags: CliFlags = { dryRun: false, unsafeNoCostCap: false, noResume: false };

  // Reads the token at `index` as the value of `flag`, refusing to swallow
  // the next flag or to run off the end of the argument list.
  const takeValue = (flag: string, index: number): string => {
    const value = argv[index];
    if (value === undefined || value.startsWith("--")) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  // Number("") is 0 and Number("abc") is NaN; both would silently corrupt the
  // configuration (NaN caps never trip a `>` comparison), so reject them here.
  const takeNumber = (flag: string, index: number): number => {
    const raw = takeValue(flag, index).trim();
    const parsed = Number(raw);
    if (raw === "" || !Number.isFinite(parsed)) {
      throw new Error(`Invalid number for ${flag}: ${JSON.stringify(raw)}`);
    }
    return parsed;
  };

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    switch (arg) {
      case "--dry-run":
        flags.dryRun = true;
        break;
      case "--unsafe-no-cost-cap":
        flags.unsafeNoCostCap = true;
        break;
      case "--no-resume":
        flags.noResume = true;
        break;
      case "--models":
        flags.models = takeValue(arg, ++i)
          .split(",")
          .map((value) => value.trim())
          .filter(Boolean);
        break;
      case "--reps":
        flags.reps = takeNumber(arg, ++i);
        break;
      case "--max-runs":
        flags.maxRuns = takeNumber(arg, ++i);
        break;
      case "--max-budget-usd":
        flags.maxBudgetUsd = takeNumber(arg, ++i);
        break;
      case "--seed":
        flags.seed = takeNumber(arg, ++i);
        break;
      case "--fixture":
        flags.fixture = takeValue(arg, ++i);
        break;
      case "--concurrency":
        flags.concurrency = takeNumber(arg, ++i);
        break;
      default:
        // Tokens that do not look like flags are ignored.
        if (arg?.startsWith("--")) {
          throw new Error(`Unknown flag: ${arg}`);
        }
    }
  }
  return flags;
}

/**
 * Applies defaults to parsed flags and enforces that a cost cap is present.
 *
 * @param flags Flags as returned by {@link parseCliFlags} (or built by hand).
 * @param defaultFixturePath Fixture path used when `--fixture` was not given.
 * @returns A configuration with every field populated. Absent caps become
 *   `Number.MAX_SAFE_INTEGER`; `reps` defaults to 3, `seed` and `concurrency` to 1.
 * @throws Error when a supplied cap is `NaN`, or when none of `maxRuns`,
 *   `maxBudgetUsd`, `unsafeNoCostCap` or `dryRun` is set.
 */
export function resolveConfig(
  flags: CliFlags,
  defaultFixturePath: string,
): ResolvedConfig {
  // A NaN cap compares false against everything, so it would silently disable
  // the limit it was meant to impose. Fail loudly instead.
  const caps: Array<[string, number | undefined]> = [
    ["--max-runs", flags.maxRuns],
    ["--max-budget-usd", flags.maxBudgetUsd],
  ];
  for (const [name, value] of caps) {
    if (value !== undefined && Number.isNaN(value)) {
      throw new Error(`${name} must be a number`);
    }
  }

  if (!flags.unsafeNoCostCap && !flags.maxRuns && !flags.maxBudgetUsd && !flags.dryRun) {
    throw new Error(
      "Refusing to run without a cost cap. Pass --max-runs or --max-budget-usd or --unsafe-no-cost-cap. " +
        "Use --dry-run to see the plan without any agent calls.",
    );
  }
  return {
    // Copy the default so callers mutating `config.models` cannot alter it.
    models: flags.models?.length ? flags.models : [...DEFAULT_MODELS],
    reps: flags.reps && flags.reps > 0 ? flags.reps : 3,
    maxRuns: flags.maxRuns ?? Number.MAX_SAFE_INTEGER,
    maxBudgetUsd: flags.maxBudgetUsd ?? Number.MAX_SAFE_INTEGER,
    seed: flags.seed ?? 1,
    fixturePath: flags.fixture ?? defaultFixturePath,
    dryRun: flags.dryRun,
    unsafeNoCostCap: flags.unsafeNoCostCap,
    concurrency: flags.concurrency && flags.concurrency > 0 ? flags.concurrency : 1,
    noResume: flags.noResume,
  };
}

/**
 * Bounded-concurrency executor. Runs `worker(item, index)` for every input,
 * keeping at most `limit` workers active at any time. Preserves output order.
 * Failures inside a worker bubble up; callers are expected to wrap workers in
 * their own try/catch when partial failure is acceptable.
 *
 * @param items Inputs to process.
 * @param limit Maximum number of in-flight workers. Values below 1 and `NaN`
 *   are treated as 1; fractional values are rounded down.
 * @param worker Async function invoked once per item.
 * @returns Results in the same order as `items`.
 */
export async function runConcurrently<T, R>(
  items: T[],
  limit: number,
  worker: (item: T, index: number) => Promise<R>,
): Promise<R[]> {
  const results: R[] = new Array<R>(items.length);
  // Math.max(1, NaN) is NaN, which would spawn zero lanes and return an array
  // of holes without ever calling the worker.
  const requested = Number.isNaN(limit) ? 1 : Math.floor(limit);
  const concurrency = Math.max(1, Math.min(requested, items.length));
  let next = 0;
  // Each lane repeatedly claims the next unprocessed index until none remain.
  async function run(): Promise<void> {
    while (true) {
      const index = next++;
      if (index >= items.length) return;
      results[index] = await worker(items[index] as T, index);
    }
  }
  const workers = Array.from({ length: concurrency }, () => run());
  await Promise.all(workers);
  return results;
}

/**
 * Pure-function key derivation for a per-run result file. Used both by the
 * runner (when writing) and the resume scan (when checking). Keeping this in
 * one place prevents the two from drifting.
 *
 * NOTE(review): the ids are interpolated verbatim, so an id containing a path
 * separator would yield a nested path - confirm ids are filename-safe.
 */
export function resultFileName(promptId: string, modelId: string, rep: number): string {
  return `${promptId}-${modelId}-${rep}.json`;
}

/**
 * Reads a JSON Lines file, skipping blank lines.
 *
 * @param path File to read.
 * @returns One parsed record per non-blank line. Records are cast to `T`
 *   without validation.
 * @throws Error when the file does not exist or when a line is not valid JSON
 *   (the message carries the 1-based line number).
 */
export function loadJsonl<T>(path: string): T[] {
  if (!existsSync(path)) {
    throw new Error(`Fixture missing: ${path}`);
  }
  const lines = readFileSync(path, "utf-8").split("\n");
  const records: T[] = [];
  for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
    const trimmed = lines[lineIndex]?.trim();
    if (!trimmed) continue;
    try {
      records.push(JSON.parse(trimmed) as T);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Invalid JSONL at ${path}:${lineIndex + 1}: ${reason}`);
    }
  }
  return records;
}

/** Appends `entry` as one JSON line to `historyPath`, creating parent directories as needed. */
export function appendHistoryEntry(historyPath: string, entry: Record<string, unknown>): void {
  mkdirSync(dirname(historyPath), { recursive: true });
  appendFileSync(historyPath, JSON.stringify(entry) + "\n");
}

/** Writes `data` as 2-space-indented JSON with a trailing newline, creating parent directories as needed. */
export function writeJson(path: string, data: unknown): void {
  mkdirSync(dirname(path), { recursive: true });
  writeFileSync(path, JSON.stringify(data, null, 2) + "\n");
}

/** Current time as an ISO-8601 UTC string with the milliseconds removed. */
export function utcNow(): string {
  return new Date().toISOString().replace(/\.\d{3}Z$/, "Z");
}

/** Current UTC date as `YYYY-MM-DD`. */
export function todayUtc(): string {
  return new Date().toISOString().slice(0, 10);
}

/**
 * Returns the output of `git rev-parse HEAD` run in {@link REPO_ROOT}, or
 * `null` when the command fails for any reason.
 */
export function repoCommitSha(): string | null {
  try {
    return execSync("git rev-parse HEAD", {
      cwd: REPO_ROOT,
      encoding: "utf-8",
      // Failure is an expected outcome here, so keep git's error text off the console.
      stdio: ["ignore", "pipe", "ignore"],
    }).trim();
  } catch {
    return null;
  }
}

/** First 16 hex characters of the SHA-256 of the file at `path` (read as UTF-8 text). */
export function fixtureSha(path: string): string {
  const content = readFileSync(path, "utf-8");
  return createHash("sha256").update(content).digest("hex").slice(0, 16);
}

/**
 * Redacted identifier for the `CURSOR_API_KEY` environment variable: `"unset"`
 * when it is missing or shorter than 8 characters, otherwise `***` followed by
 * its last four characters.
 */
export function apiKeyFingerprint(): string {
  const key = process.env.CURSOR_API_KEY;
  if (!key || key.length < 8) {
    return "unset";
  }
  return `***${key.slice(-4)}`;
}

/**
 * Deterministic Fisher-Yates shuffle using a seeded mulberry32 PRNG so the
 * skill ordering inside the routing prompt is reproducible for a given seed.
 *
 * @param input Items to shuffle; the array is not modified.
 * @param seed Seed, truncated to an unsigned 32-bit integer.
 * @returns A shuffled copy of `input`.
 */
export function shuffleSeeded<T>(input: T[], seed: number): T[] {
  const out = input.slice();
  let state = seed >>> 0;
  const next = () => {
    // Wrap to 32 bits on every step. The bitwise operations below only ever
    // see the low 32 bits, so the stream is unchanged, but the state can no
    // longer drift into floating-point territory on very long sequences.
    state = (state + 0x6d2b79f5) | 0;
    let t = state;
    t = Math.imul(t ^ (t >>> 15), t | 1);
    t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
  for (let i = out.length - 1; i > 0; i--) {
    const j = Math.floor(next() * (i + 1));
    [out[i], out[j]] = [out[j]!, out[i]!];
  }
  return out;
}

/**
 * Expands prompts x models x repetitions into a flat plan, prompt-major.
 *
 * Each item's `shuffleSeed` is derived by hashing its coordinates together
 * with `baseSeed`, so it depends only on those four values.
 */
export function buildRunPlan(
  promptIds: string[],
  models: string[],
  reps: number,
  baseSeed: number,
): RunPlanItem[] {
  const plan: RunPlanItem[] = [];
  for (const promptId of promptIds) {
    for (const modelId of models) {
      for (let rep = 0; rep < reps; rep++) {
        plan.push({
          promptId,
          modelId,
          rep,
          shuffleSeed: hash32(`${promptId}|${modelId}|${rep}|${baseSeed}`),
        });
      }
    }
  }
  return plan;
}

/** First four bytes of the SHA-256 of `text`, read as a big-endian unsigned 32-bit integer. */
export function hash32(text: string): number {
  const digest = createHash("sha256").update(text).digest();
  return digest.readUInt32BE(0);
}

/**
 * Loads the name and frontmatter description of every skill listed in
 * `corpus/index.json` under {@link RESEARCHER_DIR}. Skill paths in the index
 * are resolved against {@link REPO_ROOT}.
 *
 * @throws Error when the index is missing or has no `skills` array, when a
 *   listed skill file is missing, or when a skill has no description.
 */
export function loadSkillDescriptions(): Array<{ name: string; description: string }> {
  const corpusPath = join(RESEARCHER_DIR, "corpus", "index.json");
  if (!existsSync(corpusPath)) {
    throw new Error(`Corpus index missing at ${corpusPath}`);
  }
  const corpus = JSON.parse(readFileSync(corpusPath, "utf-8")) as {
    skills?: Array<{ name: string; path: string }>;
  };
  if (!Array.isArray(corpus.skills)) {
    throw new Error(`Corpus index at ${corpusPath} has no "skills" array`);
  }
  const out: Array<{ name: string; description: string }> = [];
  for (const skill of corpus.skills) {
    const skillPath = join(REPO_ROOT, skill.path);
    if (!existsSync(skillPath)) {
      throw new Error(`Skill missing: ${skillPath}`);
    }
    const text = readFileSync(skillPath, "utf-8");
    const description = extractDescription(text);
    if (!description) {
      throw new Error(`Skill ${skill.name} has no frontmatter description`);
    }
    out.push({ name: skill.name, description });
  }
  return out;
}

/** YAML block-scalar header: `>` or `|` with optional chomping / indentation indicators. */
const BLOCK_SCALAR_HEADER = /^[>|][0-9+-]{0,2}$/;

/** An unindented `key:` line, including hyphenated keys such as `allowed-tools:`. */
const TOP_LEVEL_KEY = /^[A-Za-z_][\w-]*:/;

/** Removes the surrounding quotes from an inline YAML scalar. */
function stripYamlQuotes(value: string): string {
  if (value.length >= 2 && value.startsWith("'") && value.endsWith("'")) {
    // Inside single quotes YAML writes a literal quote as two quotes.
    return value.slice(1, -1).replace(/''/g, "'");
  }
  return value.replace(/^"|"$/g, "");
}

/**
 * Extracts the `description` field from a document's leading `---` frontmatter.
 *
 * This is a small line-based reader, not a YAML parser. It understands an
 * inline value (optionally quoted) and a block scalar (`>`, `|` and their
 * chomping variants, or a bare `description:`), whose lines are trimmed and
 * joined with single spaces.
 *
 * @param text Full file contents.
 * @returns The description, or `null` when there is no frontmatter, no
 *   closing `---`, or no non-empty description.
 */
export function extractDescription(text: string): string | null {
  if (!text.startsWith("---")) return null;
  const end = text.indexOf("\n---", 4);
  if (end === -1) return null;
  const body = text.slice(4, end);
  let inDescription = false;
  const lines: string[] = [];
  for (const raw of body.split("\n")) {
    if (raw.startsWith("description:")) {
      inDescription = true;
      const value = raw.slice("description:".length).trim();
      if (value && !BLOCK_SCALAR_HEADER.test(value)) {
        lines.push(stripYamlQuotes(value));
        inDescription = false;
      }
      continue;
    }
    if (inDescription) {
      // The next top-level key terminates the block scalar.
      if (TOP_LEVEL_KEY.test(raw)) {
        inDescription = false;
        continue;
      }
      const trimmed = raw.trim();
      if (trimmed) lines.push(trimmed);
    }
  }
  return lines.join(" ").trim() || null;
}

/** Cost estimate for a whole plan, as produced by {@link forecastCost}. */
export interface CostForecast {
  totalRuns: number;
  estimatedTokensPerRunInput: number;
  estimatedTokensPerRunOutput: number;
  estimatedUsdPerRun: number;
  estimatedTotalUsd: number;
}

/**
 * Builds a forecast from per-run estimates. The total is
 * `estimatedUsdPerRun * plan.length`, rounded to four decimal places; the
 * token estimates are passed through unchanged.
 */
export function forecastCost(
  plan: RunPlanItem[],
  estimatedTokensPerRunInput: number,
  estimatedTokensPerRunOutput: number,
  estimatedUsdPerRun: number,
): CostForecast {
  return {
    totalRuns: plan.length,
    estimatedTokensPerRunInput,
    estimatedTokensPerRunOutput,
    estimatedUsdPerRun,
    estimatedTotalUsd: Number((estimatedUsdPerRun * plan.length).toFixed(4)),
  };
}

/**
 * Throws when the plan has more items than `config.maxRuns` or when the
 * forecast total exceeds `config.maxBudgetUsd`; otherwise does nothing.
 */
export function assertBudget(plan: RunPlanItem[], forecast: CostForecast, config: ResolvedConfig): void {
  if (plan.length > config.maxRuns) {
    throw new Error(`Plan size ${plan.length} exceeds --max-runs ${config.maxRuns}`);
  }
  if (forecast.estimatedTotalUsd > config.maxBudgetUsd) {
    throw new Error(
      `Forecast ${forecast.estimatedTotalUsd} USD exceeds --max-budget-usd ${config.maxBudgetUsd}`,
    );
  }
}

/** Returns `label` framed above and below by a 72-character line of `=`. */
export function runHeader(label: string): string {
  const sep = "=".repeat(72);
  return `${sep}\n${label}\n${sep}`;
}