Create, test, and iteratively improve Claude skills with eval benchmarks and description optimization
scripts/run_eval.py
#!/usr/bin/env python3
"""Run trigger evaluation for a skill description.

Tests whether a skill's description causes Claude to trigger (read the skill)
for a set of queries. Outputs results as JSON.
"""

import argparse
import json
import os
import select
import subprocess
import sys
import time
import uuid
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

from scripts.utils import parse_skill_md


def find_project_root() -> Path:
    """Find the project root by walking up from cwd looking for .claude/.

    Mimics how Claude Code discovers its project root, so the command file
    we create ends up where claude -p will look for it.
    """
    current = Path.cwd()
    for parent in [current, *current.parents]:
        if (parent / ".claude").is_dir():
            return parent
    return current


def run_single_query(
    query: str,
    skill_name: str,
    skill_description: str,
    timeout: int,
    project_root: str,
    model: str | None = None,
) -> bool:
    """Run a single query and return whether the skill was triggered.

    Creates a command file in .claude/commands/ so it appears in Claude's
    available_skills list, then runs `claude -p` with the raw query.
    Uses --include-partial-messages to detect triggering early from
    stream events (content_block_start) rather than waiting for the
    full assistant message, which only arrives after tool execution.
    """
    unique_id = uuid.uuid4().hex[:8]
    clean_name = f"{skill_name}-skill-{unique_id}"
    project_commands_dir = Path(project_root) / ".claude" / "commands"
    command_file = project_commands_dir / f"{clean_name}.md"

    try:
        project_commands_dir.mkdir(parents=True, exist_ok=True)
        # Use YAML block scalar to avoid breaking on quotes in description
        indented_desc = "\n ".join(skill_description.split("\n"))
        command_content = (
            f"---\n"
            f"description: |\n"
            f" {indented_desc}\n"
            f"---\n\n"
            f"# {skill_name}\n\n"
            f"This skill handles: {skill_description}\n"
        )
        command_file.write_text(command_content)

        cmd = [
            "claude",
            "-p", query,
            "--output-format", "stream-json",
            "--verbose",
            "--include-partial-messages",
        ]
        if model:
            cmd.extend(["--model", model])

        # Remove CLAUDECODE env var to allow nesting claude -p inside a
        # Claude Code session. The guard is for interactive terminal conflicts;
        # programmatic subprocess usage is safe.
        env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}

        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            cwd=project_root,
            env=env,
        )

        triggered = False
        start_time = time.time()
        buffer = ""
        # Track state for stream event detection
        pending_tool_name = None
        accumulated_json = ""

        try:
            while time.time() - start_time < timeout:
                if process.poll() is not None:
                    remaining = process.stdout.read()
                    if remaining:
                        buffer += remaining.decode("utf-8", errors="replace")
                    break

                ready, _, _ = select.select([process.stdout], [], [], 1.0)
                if not ready:
                    continue

                chunk = os.read(process.stdout.fileno(), 8192)
                if not chunk:
                    break
                buffer += chunk.decode("utf-8", errors="replace")

                while "\n" in buffer:
                    line, buffer = buffer.split("\n", 1)
                    line = line.strip()
                    if not line:
                        continue

                    try:
                        event = json.loads(line)
                    except json.JSONDecodeError:
                        continue

                    # Early detection via stream events
                    if event.get("type") == "stream_event":
                        se = event.get("event", {})
                        se_type = se.get("type", "")

                        if se_type == "content_block_start":
                            cb = se.get("content_block", {})
                            if cb.get("type") == "tool_use":
                                tool_name = cb.get("name", "")
                                if tool_name in ("Skill", "Read"):
                                    pending_tool_name = tool_name
                                    accumulated_json = ""
                                else:
                                    return False

                        elif se_type == "content_block_delta" and pending_tool_name:
                            delta = se.get("delta", {})
                            if delta.get("type") == "input_json_delta":
                                accumulated_json += delta.get("partial_json", "")
                                if clean_name in accumulated_json:
                                    return True

                        elif se_type in ("content_block_stop", "message_stop"):
                            if pending_tool_name:
                                return clean_name in accumulated_json
                            if se_type == "message_stop":
                                return False

                    # Fallback: full assistant message
                    elif event.get("type") == "assistant":
                        message = event.get("message", {})
                        for content_item in message.get("content", []):
                            if content_item.get("type") != "tool_use":
                                continue
                            tool_name = content_item.get("name", "")
                            tool_input = content_item.get("input", {})
                            if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
                                triggered = True
                            elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
                                triggered = True
                        return triggered

                    elif event.get("type") == "result":
                        return triggered
        finally:
            # Clean up process on any exit path (return, exception, timeout)
            if process.poll() is None:
                process.kill()
                process.wait()

        return triggered
    finally:
        if command_file.exists():
            command_file.unlink()


def run_eval(
    eval_set: list[dict],
    skill_name: str,
    description: str,
    num_workers: int,
    timeout: int,
    project_root: Path,
    runs_per_query: int = 1,
    trigger_threshold: float = 0.5,
    model: str | None = None,
) -> dict:
    """Run the full eval set and return results."""
    results = []

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        future_to_info = {}
        for item in eval_set:
            for run_idx in range(runs_per_query):
                future = executor.submit(
                    run_single_query,
                    item["query"],
                    skill_name,
                    description,
                    timeout,
                    str(project_root),
                    model,
                )
                future_to_info[future] = (item, run_idx)

        query_triggers: dict[str, list[bool]] = {}
        query_items: dict[str, dict] = {}
        for future in as_completed(future_to_info):
            item, _ = future_to_info[future]
            query = item["query"]
            query_items[query] = item
            if query not in query_triggers:
                query_triggers[query] = []
            try:
                query_triggers[query].append(future.result())
            except Exception as e:
                print(f"Warning: query failed: {e}", file=sys.stderr)
                query_triggers[query].append(False)

    for query, triggers in query_triggers.items():
        item = query_items[query]
        trigger_rate = sum(triggers) / len(triggers)
        should_trigger = item["should_trigger"]
        if should_trigger:
            did_pass = trigger_rate >= trigger_threshold
        else:
            did_pass = trigger_rate < trigger_threshold
        results.append({
            "query": query,
            "should_trigger": should_trigger,
            "trigger_rate": trigger_rate,
            "triggers": sum(triggers),
            "runs": len(triggers),
            "pass": did_pass,
        })

    passed = sum(1 for r in results if r["pass"])
    total = len(results)

    return {
        "skill_name": skill_name,
        "description": description,
        "results": results,
        "summary": {
            "total": total,
            "passed": passed,
            "failed": total - passed,
        },
    }


def main():
    parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override description to test")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    args = parser.parse_args()

    eval_set = json.loads(Path(args.eval_set).read_text())
    skill_path = Path(args.skill_path)

    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    name, original_description, content = parse_skill_md(skill_path)
    description = args.description or original_description
    project_root = find_project_root()

    if args.verbose:
        print(f"Evaluating: {description}", file=sys.stderr)

    output = run_eval(
        eval_set=eval_set,
        skill_name=name,
        description=description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        project_root=project_root,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        model=args.model,
    )

    if args.verbose:
        summary = output["summary"]
        print(f"Results: {summary['passed']}/{summary['total']} passed", file=sys.stderr)
        for r in output["results"]:
            status = "PASS" if r["pass"] else "FAIL"
            rate_str = f"{r['triggers']}/{r['runs']}"
            print(f"  [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}", file=sys.stderr)

    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()
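For reference, the script reads the eval set as a JSON list of objects with "query" and "should_trigger" keys (the fields run_eval accesses), prints progress to stderr when --verbose is set, and writes the full result JSON to stdout. Below is a minimal sketch of preparing such a file and invoking the script. The two example queries and the skills/my-skill path are hypothetical, the flags mirror the argparse options defined in main(), and running via `python -m scripts.run_eval` from the project root is an assumption made so the `scripts.utils` import resolves.

# Minimal usage sketch (assumptions: run from the project root; the skill
# directory and example queries below are hypothetical).
import json
import subprocess
import sys
from pathlib import Path

eval_set = [
    {"query": "Help me draft and benchmark a new Claude skill", "should_trigger": True},
    {"query": "What's a good pasta recipe for dinner?", "should_trigger": False},
]
Path("eval_set.json").write_text(json.dumps(eval_set, indent=2))

# Three runs per query against the default 30s timeout and 0.5 trigger threshold.
result = subprocess.run(
    [
        sys.executable, "-m", "scripts.run_eval",
        "--eval-set", "eval_set.json",
        "--skill-path", "skills/my-skill",  # hypothetical skill dir containing SKILL.md
        "--runs-per-query", "3",
        "--verbose",
    ],
    capture_output=True,
    text=True,
    check=True,
)

# stdout is pure JSON (verbose output goes to stderr), so it parses directly.
report = json.loads(result.stdout)
print(report["summary"])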