Source from repo
Create, test, and iteratively improve Claude skills with eval benchmarks and description optimization
Syntax-highlighted preview of this file as included in the skill package.
scripts/improve_description.py
#!/usr/bin/env python3
"""Improve a skill description based on eval results.

Takes eval results (from run_eval.py) and generates an improved description
by calling `claude -p` as a subprocess (same auth pattern as run_eval.py —
uses the session's Claude Code auth, no separate ANTHROPIC_API_KEY needed).
"""

import argparse
import json
import os
import re
import subprocess
import sys
from pathlib import Path

from scripts.utils import parse_skill_md


def _call_claude(prompt: str, model: str | None, timeout: int = 300) -> str:
    """Run `claude -p` with the prompt on stdin and return the text response.

    Prompt goes over stdin (not argv) because it embeds the full SKILL.md
    body and can easily exceed comfortable argv length.
    """
    cmd = ["claude", "-p", "--output-format", "text"]
    if model:
        cmd.extend(["--model", model])

    # Remove CLAUDECODE env var to allow nesting claude -p inside a
    # Claude Code session. The guard is for interactive terminal conflicts;
    # programmatic subprocess usage is safe. Same pattern as run_eval.py.
    env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}

    result = subprocess.run(
        cmd,
        input=prompt,
        capture_output=True,
        text=True,
        env=env,
        timeout=timeout,
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"claude -p exited {result.returncode}\nstderr: {result.stderr}"
        )
    return result.stdout


def improve_description(
    skill_name: str,
    skill_content: str,
    current_description: str,
    eval_results: dict,
    history: list[dict],
    model: str,
    test_results: dict | None = None,
    log_dir: Path | None = None,
    iteration: int | None = None,
) -> str:
    """Call Claude to improve the description based on eval results."""
    failed_triggers = [
        r for r in eval_results["results"]
        if r["should_trigger"] and not r["pass"]
    ]
    false_triggers = [
        r for r in eval_results["results"]
        if not r["should_trigger"] and not r["pass"]
    ]

    # Build scores summary
    train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}"
    if test_results:
        test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}"
        scores_summary = f"Train: {train_score}, Test: {test_score}"
    else:
        scores_summary = f"Train: {train_score}"

    prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples.

The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones.

Here's the current description:
<current_description>
"{current_description}"
</current_description>

Current scores ({scores_summary}):
<scores_summary>
"""
    if failed_triggers:
        prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n"
        for r in failed_triggers:
            prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n'
        prompt += "\n"

    if false_triggers:
        prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n"
        for r in false_triggers:
            prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n'
        prompt += "\n"

    if history:
        prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n"
        for h in history:
            train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}"
            test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None
            score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "")
            prompt += f'<attempt {score_str}>\n'
            prompt += f'Description: "{h["description"]}"\n'
            if "results" in h:
                prompt += "Train results:\n"
                for r in h["results"]:
                    status = "PASS" if r["pass"] else "FAIL"
                    prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n'
            if h.get("note"):
                prompt += f'Note: {h["note"]}\n'
            prompt += "</attempt>\n\n"

    prompt += f"""</scores_summary>

Skill content (for context on what the skill does):
<skill_content>
{skill_content}
</skill_content>

Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold:

1. Avoid overfitting
2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description.

Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. There is a hard limit of 1024 characters — descriptions over that will be truncated, so stay comfortably under it.

Here are some tips that we've found to work well in writing these descriptions:
- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does"
- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works.
- The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable.
- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings.

I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end.

Please respond with only the new description text in <new_description> tags, nothing else."""

    text = _call_claude(prompt, model)

    match = re.search(r"<new_description>(.*?)</new_description>", text, re.DOTALL)
    description = match.group(1).strip().strip('"') if match else text.strip().strip('"')

    transcript: dict = {
        "iteration": iteration,
        "prompt": prompt,
        "response": text,
        "parsed_description": description,
        "char_count": len(description),
        "over_limit": len(description) > 1024,
    }

    # Safety net: the prompt already states the 1024-char hard limit, but if
    # the model blew past it anyway, make one fresh single-turn call that
    # quotes the too-long version and asks for a shorter rewrite. (The old
    # SDK path did this as a true multi-turn; `claude -p` is one-shot, so we
    # inline the prior output into the new prompt instead.)
    if len(description) > 1024:
        shorten_prompt = (
            f"{prompt}\n\n"
            f"---\n\n"
            f"A previous attempt produced this description, which at "
            f"{len(description)} characters is over the 1024-character hard limit:\n\n"
            f'"{description}"\n\n'
            f"Rewrite it to be under 1024 characters while keeping the most "
            f"important trigger words and intent coverage. Respond with only "
            f"the new description in <new_description> tags."
        )
        shorten_text = _call_claude(shorten_prompt, model)
        match = re.search(r"<new_description>(.*?)</new_description>", shorten_text, re.DOTALL)
        shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"')

        transcript["rewrite_prompt"] = shorten_prompt
        transcript["rewrite_response"] = shorten_text
        transcript["rewrite_description"] = shortened
        transcript["rewrite_char_count"] = len(shortened)
        description = shortened

    transcript["final_description"] = description

    if log_dir:
        log_dir.mkdir(parents=True, exist_ok=True)
        log_file = log_dir / f"improve_iter_{iteration or 'unknown'}.json"
        log_file.write_text(json.dumps(transcript, indent=2))

    return description


def main():
    parser = argparse.ArgumentParser(description="Improve a skill description based on eval results")
    parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)")
    parser.add_argument("--model", required=True, help="Model for improvement")
    parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr")
    args = parser.parse_args()

    skill_path = Path(args.skill_path)
    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    eval_results = json.loads(Path(args.eval_results).read_text())
    history = []
    if args.history:
        history = json.loads(Path(args.history).read_text())

    name, _, content = parse_skill_md(skill_path)
    current_description = eval_results["description"]

    if args.verbose:
        print(f"Current: {current_description}", file=sys.stderr)
        print(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}", file=sys.stderr)

    new_description = improve_description(
        skill_name=name,
        skill_content=content,
        current_description=current_description,
        eval_results=eval_results,
        history=history,
        model=args.model,
    )

    if args.verbose:
        print(f"Improved: {new_description}", file=sys.stderr)

    # Output as JSON with both the new description and updated history
    output = {
        "description": new_description,
        "history": history + [{
            "description": current_description,
            "passed": eval_results["summary"]["passed"],
            "failed": eval_results["summary"]["failed"],
            "total": eval_results["summary"]["total"],
            "results": eval_results["results"],
        }],
    }
    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()
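
For orientation, here is a minimal sketch of driving this script end to end. It is not part of the repo: the skill directory, model name, and file paths are placeholders, and the eval-results shape is inferred from the fields the script reads (a top-level description, a summary with passed/failed/total, and per-query results entries).

#!/usr/bin/env python3
"""Illustrative driver for scripts/improve_description.py (not part of the repo).

Writes an eval-results file in the minimal shape the script reads, then invokes
the script the way a surrounding optimization loop might. The skill path and
model name below are hypothetical.
"""

import json
import subprocess
import sys
from pathlib import Path

# Minimal eval-results payload mirroring the keys improve_description.py uses:
# "description", a "summary" with passed/failed/total, and "results" entries
# carrying query / should_trigger / pass / triggers / runs.
eval_results = {
    "description": "Use this skill to turn CSV files into charts.",
    "summary": {"passed": 3, "failed": 2, "total": 5},
    "results": [
        {"query": "plot this CSV as a bar chart", "should_trigger": True,
         "pass": False, "triggers": 1, "runs": 5},
        {"query": "what's the weather like today", "should_trigger": False,
         "pass": True, "triggers": 0, "runs": 5},
    ],
}
Path("eval_results.json").write_text(json.dumps(eval_results, indent=2))

# Run as a module from the repo root so `from scripts.utils import parse_skill_md`
# resolves against the top-level `scripts` package.
proc = subprocess.run(
    [
        sys.executable, "-m", "scripts.improve_description",
        "--eval-results", "eval_results.json",
        "--skill-path", "skills/my-skill",  # hypothetical skill dir containing SKILL.md
        "--model", "claude-sonnet-4-5",     # placeholder model name
        "--verbose",
    ],
    capture_output=True,
    text=True,
    check=True,
)

# stdout is JSON: the new description plus a history list extended with this attempt.
output = json.loads(proc.stdout)
print(output["description"])

Feeding the returned history back in via --history on the next run is what lets the prompt's PREVIOUS ATTEMPTS section accumulate across iterations.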