Create, test, and iteratively improve Claude skills with eval benchmarks and description optimization
scripts/run_eval.py
#!/usr/bin/env python3
"""Run trigger evaluation for a skill description.

Tests whether a skill's description causes Claude to trigger (read the skill)
for a set of queries. Outputs results as JSON.
"""

import argparse
import json
import os
import select
import subprocess
import sys
import time
import uuid
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

from scripts.utils import parse_skill_md


def find_project_root() -> Path:
    """Find the project root by walking up from cwd looking for .claude/.

    Mimics how Claude Code discovers its project root, so the command file
    we create ends up where claude -p will look for it.
    """
    current = Path.cwd()
    for parent in [current, *current.parents]:
        if (parent / ".claude").is_dir():
            return parent
    return current


def run_single_query(
    query: str,
    skill_name: str,
    skill_description: str,
    timeout: int,
    project_root: str,
    model: str | None = None,
) -> bool:
    """Run a single query and return whether the skill was triggered.

    Creates a command file in .claude/commands/ so it appears in Claude's
    available_skills list, then runs `claude -p` with the raw query.
    Uses --include-partial-messages to detect triggering early from
    stream events (content_block_start) rather than waiting for the
    full assistant message, which only arrives after tool execution.
    """
    unique_id = uuid.uuid4().hex[:8]
    clean_name = f"{skill_name}-skill-{unique_id}"
    project_commands_dir = Path(project_root) / ".claude" / "commands"
    command_file = project_commands_dir / f"{clean_name}.md"

    try:
        project_commands_dir.mkdir(parents=True, exist_ok=True)
        # Use YAML block scalar to avoid breaking on quotes in description
        indented_desc = "\n ".join(skill_description.split("\n"))
        command_content = (
            f"---\n"
            f"description: |\n"
            f" {indented_desc}\n"
            f"---\n\n"
            f"# {skill_name}\n\n"
            f"This skill handles: {skill_description}\n"
        )
        command_file.write_text(command_content)

        cmd = [
            "claude",
            "-p", query,
            "--output-format", "stream-json",
            "--verbose",
            "--include-partial-messages",
        ]
        if model:
            cmd.extend(["--model", model])

        # Remove CLAUDECODE env var to allow nesting claude -p inside a
        # Claude Code session. The guard is for interactive terminal conflicts;
        # programmatic subprocess usage is safe.
        env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}

        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            cwd=project_root,
            env=env,
        )

        triggered = False
        start_time = time.time()
        buffer = ""
        # Track state for stream event detection
        pending_tool_name = None
        accumulated_json = ""

        try:
            while time.time() - start_time < timeout:
                if process.poll() is not None:
                    remaining = process.stdout.read()
                    if remaining:
                        buffer += remaining.decode("utf-8", errors="replace")
                    break

                ready, _, _ = select.select([process.stdout], [], [], 1.0)
                if not ready:
                    continue

                chunk = os.read(process.stdout.fileno(), 8192)
                if not chunk:
                    break
                buffer += chunk.decode("utf-8", errors="replace")

                while "\n" in buffer:
                    line, buffer = buffer.split("\n", 1)
                    line = line.strip()
                    if not line:
                        continue

                    try:
                        event = json.loads(line)
                    except json.JSONDecodeError:
                        continue

                    # Early detection via stream events
                    if event.get("type") == "stream_event":
                        se = event.get("event", {})
                        se_type = se.get("type", "")

                        if se_type == "content_block_start":
                            cb = se.get("content_block", {})
                            if cb.get("type") == "tool_use":
                                tool_name = cb.get("name", "")
                                if tool_name in ("Skill", "Read"):
                                    pending_tool_name = tool_name
                                    accumulated_json = ""
                                else:
                                    return False

                        elif se_type == "content_block_delta" and pending_tool_name:
                            delta = se.get("delta", {})
                            if delta.get("type") == "input_json_delta":
                                accumulated_json += delta.get("partial_json", "")
                                if clean_name in accumulated_json:
                                    return True

                        elif se_type in ("content_block_stop", "message_stop"):
                            if pending_tool_name:
                                return clean_name in accumulated_json
                            if se_type == "message_stop":
                                return False

                    # Fallback: full assistant message
                    elif event.get("type") == "assistant":
                        message = event.get("message", {})
                        for content_item in message.get("content", []):
                            if content_item.get("type") != "tool_use":
                                continue
                            tool_name = content_item.get("name", "")
                            tool_input = content_item.get("input", {})
                            if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
                                triggered = True
                            elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
                                triggered = True
                        return triggered

                    elif event.get("type") == "result":
                        return triggered
        finally:
            # Clean up process on any exit path (return, exception, timeout)
            if process.poll() is None:
                process.kill()
                process.wait()

        return triggered
    finally:
        if command_file.exists():
            command_file.unlink()


def run_eval(
    eval_set: list[dict],
    skill_name: str,
    description: str,
    num_workers: int,
    timeout: int,
    project_root: Path,
    runs_per_query: int = 1,
    trigger_threshold: float = 0.5,
    model: str | None = None,
) -> dict:
    """Run the full eval set and return results."""
    results = []

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        future_to_info = {}
        for item in eval_set:
            for run_idx in range(runs_per_query):
                future = executor.submit(
                    run_single_query,
                    item["query"],
                    skill_name,
                    description,
                    timeout,
                    str(project_root),
                    model,
                )
                future_to_info[future] = (item, run_idx)

        query_triggers: dict[str, list[bool]] = {}
        query_items: dict[str, dict] = {}
        for future in as_completed(future_to_info):
            item, _ = future_to_info[future]
            query = item["query"]
            query_items[query] = item
            if query not in query_triggers:
                query_triggers[query] = []
            try:
                query_triggers[query].append(future.result())
            except Exception as e:
                print(f"Warning: query failed: {e}", file=sys.stderr)
                query_triggers[query].append(False)

    for query, triggers in query_triggers.items():
        item = query_items[query]
        trigger_rate = sum(triggers) / len(triggers)
        should_trigger = item["should_trigger"]
        if should_trigger:
            did_pass = trigger_rate >= trigger_threshold
        else:
            did_pass = trigger_rate < trigger_threshold
        results.append({
            "query": query,
            "should_trigger": should_trigger,
            "trigger_rate": trigger_rate,
            "triggers": sum(triggers),
            "runs": len(triggers),
            "pass": did_pass,
        })

    passed = sum(1 for r in results if r["pass"])
    total = len(results)

    return {
        "skill_name": skill_name,
        "description": description,
        "results": results,
        "summary": {
            "total": total,
            "passed": passed,
            "failed": total - passed,
        },
    }


def main():
    parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override description to test")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    args = parser.parse_args()

    eval_set = json.loads(Path(args.eval_set).read_text())
    skill_path = Path(args.skill_path)

    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    name, original_description, content = parse_skill_md(skill_path)
    description = args.description or original_description
    project_root = find_project_root()

    if args.verbose:
        print(f"Evaluating: {description}", file=sys.stderr)

    output = run_eval(
        eval_set=eval_set,
        skill_name=name,
        description=description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        project_root=project_root,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        model=args.model,
    )

    if args.verbose:
        summary = output["summary"]
        print(f"Results: {summary['passed']}/{summary['total']} passed", file=sys.stderr)
        for r in output["results"]:
            status = "PASS" if r["pass"] else "FAIL"
            rate_str = f"{r['triggers']}/{r['runs']}"
            print(f"  [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}", file=sys.stderr)

    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()
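For reference, the script reads the eval set as a JSON list of objects with "query" and "should_trigger" keys (the fields run_eval accesses), prints progress to stderr when --verbose is set, and writes the full result JSON to stdout. Below is a minimal sketch of preparing such a file and invoking the script. The two example queries and the skills/my-skill path are hypothetical, the flags mirror the argparse options defined in main(), and running via `python -m scripts.run_eval` from the project root is an assumption made so the `scripts.utils` import resolves.

# Minimal usage sketch (assumptions: run from the project root; the skill
# directory and example queries below are hypothetical).
import json
import subprocess
import sys
from pathlib import Path

eval_set = [
    {"query": "Help me draft and benchmark a new Claude skill", "should_trigger": True},
    {"query": "What's a good pasta recipe for dinner?", "should_trigger": False},
]
Path("eval_set.json").write_text(json.dumps(eval_set, indent=2))

# Three runs per query against the default 30s timeout and 0.5 trigger threshold.
result = subprocess.run(
    [
        sys.executable, "-m", "scripts.run_eval",
        "--eval-set", "eval_set.json",
        "--skill-path", "skills/my-skill",  # hypothetical skill dir containing SKILL.md
        "--runs-per-query", "3",
        "--verbose",
    ],
    capture_output=True,
    text=True,
    check=True,
)

# stdout is pure JSON (verbose output goes to stderr), so it parses directly.
report = json.loads(result.stdout)
print(report["summary"])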