Source from repo
Create, test, and iteratively improve Claude skills with eval benchmarks and description optimization
Syntax-highlighted preview of this file as included in the skill package.
scripts/improve_description.py
#!/usr/bin/env python3
"""Improve a skill description based on eval results.

Takes eval results (from run_eval.py) and generates an improved description
by calling `claude -p` as a subprocess (same auth pattern as run_eval.py —
uses the session's Claude Code auth, no separate ANTHROPIC_API_KEY needed).
"""

import argparse
import json
import os
import re
import subprocess
import sys
from pathlib import Path

from scripts.utils import parse_skill_md


def _call_claude(prompt: str, model: str | None, timeout: int = 300) -> str:
    """Run `claude -p` with the prompt on stdin and return the text response.

    Prompt goes over stdin (not argv) because it embeds the full SKILL.md
    body and can easily exceed comfortable argv length.
    """
    cmd = ["claude", "-p", "--output-format", "text"]
    if model:
        cmd.extend(["--model", model])

    # Remove CLAUDECODE env var to allow nesting claude -p inside a
    # Claude Code session. The guard is for interactive terminal conflicts;
    # programmatic subprocess usage is safe. Same pattern as run_eval.py.
    env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}

    result = subprocess.run(
        cmd,
        input=prompt,
        capture_output=True,
        text=True,
        env=env,
        timeout=timeout,
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"claude -p exited {result.returncode}\nstderr: {result.stderr}"
        )
    return result.stdout


def improve_description(
    skill_name: str,
    skill_content: str,
    current_description: str,
    eval_results: dict,
    history: list[dict],
    model: str,
    test_results: dict | None = None,
    log_dir: Path | None = None,
    iteration: int | None = None,
) -> str:
    """Call Claude to improve the description based on eval results."""
    failed_triggers = [
        r for r in eval_results["results"]
        if r["should_trigger"] and not r["pass"]
    ]
    false_triggers = [
        r for r in eval_results["results"]
        if not r["should_trigger"] and not r["pass"]
    ]

    # Build scores summary
    train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}"
    if test_results:
        test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}"
        scores_summary = f"Train: {train_score}, Test: {test_score}"
    else:
        scores_summary = f"Train: {train_score}"

    prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples.

The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones.

Here's the current description:
<current_description>
"{current_description}"
</current_description>

Current scores ({scores_summary}):
<scores_summary>
"""
    if failed_triggers:
        prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n"
        for r in failed_triggers:
            prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n'
        prompt += "\n"

    if false_triggers:
        prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n"
        for r in false_triggers:
            prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n'
        prompt += "\n"

    if history:
        prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n"
        for h in history:
            train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}"
            test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None
            score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "")
            prompt += f'<attempt {score_str}>\n'
            prompt += f'Description: "{h["description"]}"\n'
            if "results" in h:
                prompt += "Train results:\n"
                for r in h["results"]:
                    status = "PASS" if r["pass"] else "FAIL"
                    prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n'
            if h.get("note"):
                prompt += f'Note: {h["note"]}\n'
            prompt += "</attempt>\n\n"

    prompt += f"""</scores_summary>

Skill content (for context on what the skill does):
<skill_content>
{skill_content}
</skill_content>

Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold:

1. Avoid overfitting
2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description.

Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. There is a hard limit of 1024 characters — descriptions over that will be truncated, so stay comfortably under it.

Here are some tips that we've found to work well in writing these descriptions:
- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does"
- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works.
- The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable.
- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings.

I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end.

Please respond with only the new description text in <new_description> tags, nothing else."""

    text = _call_claude(prompt, model)

    match = re.search(r"<new_description>(.*?)</new_description>", text, re.DOTALL)
    description = match.group(1).strip().strip('"') if match else text.strip().strip('"')

    transcript: dict = {
        "iteration": iteration,
        "prompt": prompt,
        "response": text,
        "parsed_description": description,
        "char_count": len(description),
        "over_limit": len(description) > 1024,
    }

    # Safety net: the prompt already states the 1024-char hard limit, but if
    # the model blew past it anyway, make one fresh single-turn call that
    # quotes the too-long version and asks for a shorter rewrite. (The old
    # SDK path did this as a true multi-turn; `claude -p` is one-shot, so we
    # inline the prior output into the new prompt instead.)
    if len(description) > 1024:
        shorten_prompt = (
            f"{prompt}\n\n"
            f"---\n\n"
            f"A previous attempt produced this description, which at "
            f"{len(description)} characters is over the 1024-character hard limit:\n\n"
            f'"{description}"\n\n'
            f"Rewrite it to be under 1024 characters while keeping the most "
            f"important trigger words and intent coverage. Respond with only "
            f"the new description in <new_description> tags."
        )
        shorten_text = _call_claude(shorten_prompt, model)
        match = re.search(r"<new_description>(.*?)</new_description>", shorten_text, re.DOTALL)
        shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"')

        transcript["rewrite_prompt"] = shorten_prompt
        transcript["rewrite_response"] = shorten_text
        transcript["rewrite_description"] = shortened
        transcript["rewrite_char_count"] = len(shortened)
        description = shortened

    transcript["final_description"] = description

    if log_dir:
        log_dir.mkdir(parents=True, exist_ok=True)
        log_file = log_dir / f"improve_iter_{iteration or 'unknown'}.json"
        log_file.write_text(json.dumps(transcript, indent=2))

    return description


def main():
    parser = argparse.ArgumentParser(description="Improve a skill description based on eval results")
    parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)")
    parser.add_argument("--model", required=True, help="Model for improvement")
    parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr")
    args = parser.parse_args()

    skill_path = Path(args.skill_path)
    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    eval_results = json.loads(Path(args.eval_results).read_text())
    history = []
    if args.history:
        history = json.loads(Path(args.history).read_text())

    name, _, content = parse_skill_md(skill_path)
    current_description = eval_results["description"]

    if args.verbose:
        print(f"Current: {current_description}", file=sys.stderr)
        print(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}", file=sys.stderr)

    new_description = improve_description(
        skill_name=name,
        skill_content=content,
        current_description=current_description,
        eval_results=eval_results,
        history=history,
        model=args.model,
    )

    if args.verbose:
        print(f"Improved: {new_description}", file=sys.stderr)

    # Output as JSON with both the new description and updated history
    output = {
        "description": new_description,
        "history": history + [{
            "description": current_description,
            "passed": eval_results["summary"]["passed"],
            "failed": eval_results["summary"]["failed"],
            "total": eval_results["summary"]["total"],
            "results": eval_results["results"],
        }],
    }
    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()
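
For orientation, here is a minimal sketch of driving this script end to end. It is not part of the repo: the skill directory, model name, and file paths are placeholders, and the eval-results shape is inferred from the fields the script reads (a top-level description, a summary with passed/failed/total, and per-query results entries).

#!/usr/bin/env python3
"""Illustrative driver for scripts/improve_description.py (not part of the repo).

Writes an eval-results file in the minimal shape the script reads, then invokes
the script the way a surrounding optimization loop might. The skill path and
model name below are hypothetical.
"""

import json
import subprocess
import sys
from pathlib import Path

# Minimal eval-results payload mirroring the keys improve_description.py uses:
# "description", a "summary" with passed/failed/total, and "results" entries
# carrying query / should_trigger / pass / triggers / runs.
eval_results = {
    "description": "Use this skill to turn CSV files into charts.",
    "summary": {"passed": 3, "failed": 2, "total": 5},
    "results": [
        {"query": "plot this CSV as a bar chart", "should_trigger": True,
         "pass": False, "triggers": 1, "runs": 5},
        {"query": "what's the weather like today", "should_trigger": False,
         "pass": True, "triggers": 0, "runs": 5},
    ],
}
Path("eval_results.json").write_text(json.dumps(eval_results, indent=2))

# Run as a module from the repo root so `from scripts.utils import parse_skill_md`
# resolves against the top-level `scripts` package.
proc = subprocess.run(
    [
        sys.executable, "-m", "scripts.improve_description",
        "--eval-results", "eval_results.json",
        "--skill-path", "skills/my-skill",  # hypothetical skill dir containing SKILL.md
        "--model", "claude-sonnet-4-5",     # placeholder model name
        "--verbose",
    ],
    capture_output=True,
    text=True,
    check=True,
)

# stdout is JSON: the new description plus a history list extended with this attempt.
output = json.loads(proc.stdout)
print(output["description"])

Feeding the returned history back in via --history on the next run is what lets the prompt's PREVIOUS ATTEMPTS section accumulate across iterations.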