Source from repo
Create, test, and iteratively improve Claude skills with eval benchmarks and description optimization
eval-viewer/generate_review.py
#!/usr/bin/env python3
"""Generate and serve a review page for eval results.

Reads the workspace directory, discovers runs (directories with outputs/),
embeds all output data into a self-contained HTML page, and serves it via
a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace.

Usage:
    python generate_review.py <workspace-path> [--port PORT] [--skill-name NAME]
    python generate_review.py <workspace-path> --previous-workspace /path/to/previous/workspace

No dependencies beyond the Python stdlib are required.
"""

import argparse
import base64
import json
import mimetypes
import os
import re
import signal
import subprocess
import sys
import time
import webbrowser
from functools import partial
from http.server import HTTPServer, BaseHTTPRequestHandler
from pathlib import Path

# Files to exclude from output listings
METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"}

# Extensions we render as inline text
TEXT_EXTENSIONS = {
    ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx",
    ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs",
    ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml",
}

# Extensions we render as inline images
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"}

# MIME type overrides for common types
MIME_OVERRIDES = {
    ".svg": "image/svg+xml",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
}


def get_mime_type(path: Path) -> str:
    ext = path.suffix.lower()
    if ext in MIME_OVERRIDES:
        return MIME_OVERRIDES[ext]
    mime, _ = mimetypes.guess_type(str(path))
    return mime or "application/octet-stream"


def find_runs(workspace: Path) -> list[dict]:
    """Recursively find directories that contain an outputs/ subdirectory."""
    runs: list[dict] = []
    _find_runs_recursive(workspace, workspace, runs)
    runs.sort(key=lambda r: (r.get("eval_id", float("inf")), r["id"]))
    return runs


def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None:
    if not current.is_dir():
        return

    outputs_dir = current / "outputs"
    if outputs_dir.is_dir():
        run = build_run(root, current)
        if run:
            runs.append(run)
        return

    skip = {"node_modules", ".git", "__pycache__", "skill", "inputs"}
    for child in sorted(current.iterdir()):
        if child.is_dir() and child.name not in skip:
            _find_runs_recursive(root, child, runs)


def build_run(root: Path, run_dir: Path) -> dict | None:
    """Build a run dict with prompt, outputs, and grading data."""
    prompt = ""
    eval_id = None

    # Try eval_metadata.json
    for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]:
        if candidate.exists():
            try:
                metadata = json.loads(candidate.read_text())
                prompt = metadata.get("prompt", "")
                eval_id = metadata.get("eval_id")
            except (json.JSONDecodeError, OSError):
                pass
        if prompt:
            break

    # Fall back to transcript.md
    if not prompt:
        for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]:
            if candidate.exists():
                try:
                    text = candidate.read_text()
                    match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text)
                    if match:
                        prompt = match.group(1).strip()
                except OSError:
                    pass
            if prompt:
                break

    if not prompt:
        prompt = "(No prompt found)"

    run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-")

    # Collect output files
    outputs_dir = run_dir / "outputs"
    output_files: list[dict] = []
    if outputs_dir.is_dir():
        for f in sorted(outputs_dir.iterdir()):
            if f.is_file() and f.name not in METADATA_FILES:
                output_files.append(embed_file(f))

    # Load grading if present
    grading = None
    for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]:
        if candidate.exists():
            try:
                grading = json.loads(candidate.read_text())
            except (json.JSONDecodeError, OSError):
                pass
        if grading:
            break

    return {
        "id": run_id,
        "prompt": prompt,
        "eval_id": eval_id,
        "outputs": output_files,
        "grading": grading,
    }


def embed_file(path: Path) -> dict:
    """Read a file and return an embedded representation."""
    ext = path.suffix.lower()
    mime = get_mime_type(path)

    if ext in TEXT_EXTENSIONS:
        try:
            content = path.read_text(errors="replace")
        except OSError:
            content = "(Error reading file)"
        return {
            "name": path.name,
            "type": "text",
            "content": content,
        }
    elif ext in IMAGE_EXTENSIONS:
        try:
            raw = path.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
        except OSError:
            return {"name": path.name, "type": "error", "content": "(Error reading file)"}
        return {
            "name": path.name,
            "type": "image",
            "mime": mime,
            "data_uri": f"data:{mime};base64,{b64}",
        }
    elif ext == ".pdf":
        try:
            raw = path.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
        except OSError:
            return {"name": path.name, "type": "error", "content": "(Error reading file)"}
        return {
            "name": path.name,
            "type": "pdf",
            "data_uri": f"data:{mime};base64,{b64}",
        }
    elif ext == ".xlsx":
        try:
            raw = path.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
        except OSError:
            return {"name": path.name, "type": "error", "content": "(Error reading file)"}
        return {
            "name": path.name,
            "type": "xlsx",
            "data_b64": b64,
        }
    else:
        # Binary / unknown — base64 download link
        try:
            raw = path.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
        except OSError:
            return {"name": path.name, "type": "error", "content": "(Error reading file)"}
        return {
            "name": path.name,
            "type": "binary",
            "mime": mime,
            "data_uri": f"data:{mime};base64,{b64}",
        }


def load_previous_iteration(workspace: Path) -> dict[str, dict]:
    """Load previous iteration's feedback and outputs.

    Returns a map of run_id -> {"feedback": str, "outputs": list[dict]}.
    """
    result: dict[str, dict] = {}

    # Load feedback
    feedback_map: dict[str, str] = {}
    feedback_path = workspace / "feedback.json"
    if feedback_path.exists():
        try:
            data = json.loads(feedback_path.read_text())
            feedback_map = {
                r["run_id"]: r["feedback"]
                for r in data.get("reviews", [])
                if r.get("feedback", "").strip()
            }
        except (json.JSONDecodeError, OSError, KeyError):
            pass

    # Load runs (to get outputs)
    prev_runs = find_runs(workspace)
    for run in prev_runs:
        result[run["id"]] = {
            "feedback": feedback_map.get(run["id"], ""),
            "outputs": run.get("outputs", []),
        }

    # Also add feedback for run_ids that had feedback but no matching run
    for run_id, fb in feedback_map.items():
        if run_id not in result:
            result[run_id] = {"feedback": fb, "outputs": []}

    return result


def generate_html(
    runs: list[dict],
    skill_name: str,
    previous: dict[str, dict] | None = None,
    benchmark: dict | None = None,
) -> str:
    """Generate the complete standalone HTML page with embedded data."""
    template_path = Path(__file__).parent / "viewer.html"
    template = template_path.read_text()

    # Build previous_feedback and previous_outputs maps for the template
    previous_feedback: dict[str, str] = {}
    previous_outputs: dict[str, list[dict]] = {}
    if previous:
        for run_id, data in previous.items():
            if data.get("feedback"):
                previous_feedback[run_id] = data["feedback"]
            if data.get("outputs"):
                previous_outputs[run_id] = data["outputs"]

    embedded = {
        "skill_name": skill_name,
        "runs": runs,
        "previous_feedback": previous_feedback,
        "previous_outputs": previous_outputs,
    }
    if benchmark:
        embedded["benchmark"] = benchmark

    data_json = json.dumps(embedded)

    return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};")


# ---------------------------------------------------------------------------
# HTTP server (stdlib only, zero dependencies)
# ---------------------------------------------------------------------------

def _kill_port(port: int) -> None:
    """Kill any process listening on the given port."""
    try:
        result = subprocess.run(
            ["lsof", "-ti", f":{port}"],
            capture_output=True, text=True, timeout=5,
        )
        for pid_str in result.stdout.strip().split("\n"):
            if pid_str.strip():
                try:
                    os.kill(int(pid_str.strip()), signal.SIGTERM)
                except (ProcessLookupError, ValueError):
                    pass
        if result.stdout.strip():
            time.sleep(0.5)
    except subprocess.TimeoutExpired:
        pass
    except FileNotFoundError:
        print("Note: lsof not found, cannot check if port is in use", file=sys.stderr)


class ReviewHandler(BaseHTTPRequestHandler):
    """Serves the review HTML and handles feedback saves.

    Regenerates the HTML on each page load so that refreshing the browser
    picks up new eval outputs without restarting the server.
    """

    def __init__(
        self,
        workspace: Path,
        skill_name: str,
        feedback_path: Path,
        previous: dict[str, dict],
        benchmark_path: Path | None,
        *args,
        **kwargs,
    ):
        self.workspace = workspace
        self.skill_name = skill_name
        self.feedback_path = feedback_path
        self.previous = previous
        self.benchmark_path = benchmark_path
        super().__init__(*args, **kwargs)

    def do_GET(self) -> None:
        if self.path == "/" or self.path == "/index.html":
            # Regenerate HTML on each request (re-scans workspace for new outputs)
            runs = find_runs(self.workspace)
            benchmark = None
            if self.benchmark_path and self.benchmark_path.exists():
                try:
                    benchmark = json.loads(self.benchmark_path.read_text())
                except (json.JSONDecodeError, OSError):
                    pass
            html = generate_html(runs, self.skill_name, self.previous, benchmark)
            content = html.encode("utf-8")
            self.send_response(200)
            self.send_header("Content-Type", "text/html; charset=utf-8")
            self.send_header("Content-Length", str(len(content)))
            self.end_headers()
            self.wfile.write(content)
        elif self.path == "/api/feedback":
            data = b"{}"
            if self.feedback_path.exists():
                data = self.feedback_path.read_bytes()
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(data)))
            self.end_headers()
            self.wfile.write(data)
        else:
            self.send_error(404)

    def do_POST(self) -> None:
        if self.path == "/api/feedback":
            length = int(self.headers.get("Content-Length", 0))
            body = self.rfile.read(length)
            try:
                data = json.loads(body)
                if not isinstance(data, dict) or "reviews" not in data:
                    raise ValueError("Expected JSON object with 'reviews' key")
                self.feedback_path.write_text(json.dumps(data, indent=2) + "\n")
                resp = b'{"ok":true}'
                self.send_response(200)
            except (json.JSONDecodeError, OSError, ValueError) as e:
                resp = json.dumps({"error": str(e)}).encode()
                self.send_response(500)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(resp)))
            self.end_headers()
            self.wfile.write(resp)
        else:
            self.send_error(404)

    def log_message(self, format: str, *args: object) -> None:
        # Suppress request logging to keep terminal clean
        pass


def main() -> None:
    parser = argparse.ArgumentParser(description="Generate and serve eval review")
    parser.add_argument("workspace", type=Path, help="Path to workspace directory")
    parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)")
    parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header")
    parser.add_argument(
        "--previous-workspace", type=Path, default=None,
        help="Path to previous iteration's workspace (shows old outputs and feedback as context)",
    )
    parser.add_argument(
        "--benchmark", type=Path, default=None,
        help="Path to benchmark.json to show in the Benchmark tab",
    )
    parser.add_argument(
        "--static", "-s", type=Path, default=None,
        help="Write standalone HTML to this path instead of starting a server",
    )
    args = parser.parse_args()

    workspace = args.workspace.resolve()
    if not workspace.is_dir():
        print(f"Error: {workspace} is not a directory", file=sys.stderr)
        sys.exit(1)

    runs = find_runs(workspace)
    if not runs:
        print(f"No runs found in {workspace}", file=sys.stderr)
        sys.exit(1)

    skill_name = args.skill_name or workspace.name.replace("-workspace", "")
    feedback_path = workspace / "feedback.json"

    previous: dict[str, dict] = {}
    if args.previous_workspace:
        previous = load_previous_iteration(args.previous_workspace.resolve())

    benchmark_path = args.benchmark.resolve() if args.benchmark else None
    benchmark = None
    if benchmark_path and benchmark_path.exists():
        try:
            benchmark = json.loads(benchmark_path.read_text())
        except (json.JSONDecodeError, OSError):
            pass

    if args.static:
        html = generate_html(runs, skill_name, previous, benchmark)
        args.static.parent.mkdir(parents=True, exist_ok=True)
        args.static.write_text(html)
        print(f"\n Static viewer written to: {args.static}\n")
        sys.exit(0)

    # Kill any existing process on the target port
    port = args.port
    _kill_port(port)
    handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path)
    try:
        server = HTTPServer(("127.0.0.1", port), handler)
    except OSError:
        # Port still in use after kill attempt — find a free one
        server = HTTPServer(("127.0.0.1", 0), handler)
        port = server.server_address[1]

    url = f"http://localhost:{port}"
    print(f"\n Eval Viewer")
    print(f" ─────────────────────────────────")
    print(f" URL: {url}")
    print(f" Workspace: {workspace}")
    print(f" Feedback: {feedback_path}")
    if previous:
        print(f" Previous: {args.previous_workspace} ({len(previous)} runs)")
    if benchmark_path:
        print(f" Benchmark: {benchmark_path}")
    print(f"\n Press Ctrl+C to stop.\n")

    webbrowser.open(url)

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nStopped.")
        server.server_close()


if __name__ == "__main__":
    main()