Source from repo
Create, test, and iteratively improve Claude skills with eval benchmarks and description optimization
eval-viewer/generate_review.py
#!/usr/bin/env python3
"""Generate and serve a review page for eval results.

Reads the workspace directory, discovers runs (directories with outputs/),
embeds all output data into a self-contained HTML page, and serves it via
a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace.

Usage:
    python generate_review.py <workspace-path> [--port PORT] [--skill-name NAME]
    python generate_review.py <workspace-path> --previous-workspace /path/to/previous/workspace

No dependencies beyond the Python stdlib are required.
"""

import argparse
import base64
import json
import mimetypes
import os
import re
import signal
import subprocess
import sys
import time
import webbrowser
from functools import partial
from http.server import HTTPServer, BaseHTTPRequestHandler
from pathlib import Path

# Files to exclude from output listings
METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"}

# Extensions we render as inline text
TEXT_EXTENSIONS = {
    ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx",
    ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs",
    ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml",
}

# Extensions we render as inline images
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"}

# MIME type overrides for common types
MIME_OVERRIDES = {
    ".svg": "image/svg+xml",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
}


def get_mime_type(path: Path) -> str:
    ext = path.suffix.lower()
    if ext in MIME_OVERRIDES:
        return MIME_OVERRIDES[ext]
    mime, _ = mimetypes.guess_type(str(path))
    return mime or "application/octet-stream"


def find_runs(workspace: Path) -> list[dict]:
    """Recursively find directories that contain an outputs/ subdirectory."""
    runs: list[dict] = []
    _find_runs_recursive(workspace, workspace, runs)
    runs.sort(key=lambda r: (r.get("eval_id", float("inf")), r["id"]))
    return runs


def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None:
    if not current.is_dir():
        return

    outputs_dir = current / "outputs"
    if outputs_dir.is_dir():
        run = build_run(root, current)
        if run:
            runs.append(run)
        return

    skip = {"node_modules", ".git", "__pycache__", "skill", "inputs"}
    for child in sorted(current.iterdir()):
        if child.is_dir() and child.name not in skip:
            _find_runs_recursive(root, child, runs)


def build_run(root: Path, run_dir: Path) -> dict | None:
    """Build a run dict with prompt, outputs, and grading data."""
    prompt = ""
    eval_id = None

    # Try eval_metadata.json
    for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]:
        if candidate.exists():
            try:
                metadata = json.loads(candidate.read_text())
                prompt = metadata.get("prompt", "")
                eval_id = metadata.get("eval_id")
            except (json.JSONDecodeError, OSError):
                pass
        if prompt:
            break

    # Fall back to transcript.md
    if not prompt:
        for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]:
            if candidate.exists():
                try:
                    text = candidate.read_text()
                    match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text)
                    if match:
                        prompt = match.group(1).strip()
                except OSError:
                    pass
            if prompt:
                break

    if not prompt:
        prompt = "(No prompt found)"

    run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-")

    # Collect output files
    outputs_dir = run_dir / "outputs"
    output_files: list[dict] = []
    if outputs_dir.is_dir():
        for f in sorted(outputs_dir.iterdir()):
            if f.is_file() and f.name not in METADATA_FILES:
                output_files.append(embed_file(f))

    # Load grading if present
    grading = None
    for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]:
        if candidate.exists():
            try:
                grading = json.loads(candidate.read_text())
            except (json.JSONDecodeError, OSError):
                pass
        if grading:
            break

    return {
        "id": run_id,
        "prompt": prompt,
        "eval_id": eval_id,
        "outputs": output_files,
        "grading": grading,
    }


def embed_file(path: Path) -> dict:
    """Read a file and return an embedded representation."""
    ext = path.suffix.lower()
    mime = get_mime_type(path)

    if ext in TEXT_EXTENSIONS:
        try:
            content = path.read_text(errors="replace")
        except OSError:
            content = "(Error reading file)"
        return {
            "name": path.name,
            "type": "text",
            "content": content,
        }
    elif ext in IMAGE_EXTENSIONS:
        try:
            raw = path.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
        except OSError:
            return {"name": path.name, "type": "error", "content": "(Error reading file)"}
        return {
            "name": path.name,
            "type": "image",
            "mime": mime,
            "data_uri": f"data:{mime};base64,{b64}",
        }
    elif ext == ".pdf":
        try:
            raw = path.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
        except OSError:
            return {"name": path.name, "type": "error", "content": "(Error reading file)"}
        return {
            "name": path.name,
            "type": "pdf",
            "data_uri": f"data:{mime};base64,{b64}",
        }
    elif ext == ".xlsx":
        try:
            raw = path.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
        except OSError:
            return {"name": path.name, "type": "error", "content": "(Error reading file)"}
        return {
            "name": path.name,
            "type": "xlsx",
            "data_b64": b64,
        }
    else:
        # Binary / unknown — base64 download link
        try:
            raw = path.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
        except OSError:
            return {"name": path.name, "type": "error", "content": "(Error reading file)"}
        return {
            "name": path.name,
            "type": "binary",
            "mime": mime,
            "data_uri": f"data:{mime};base64,{b64}",
        }


def load_previous_iteration(workspace: Path) -> dict[str, dict]:
    """Load previous iteration's feedback and outputs.

    Returns a map of run_id -> {"feedback": str, "outputs": list[dict]}.
    """
    result: dict[str, dict] = {}

    # Load feedback
    feedback_map: dict[str, str] = {}
    feedback_path = workspace / "feedback.json"
    if feedback_path.exists():
        try:
            data = json.loads(feedback_path.read_text())
            feedback_map = {
                r["run_id"]: r["feedback"]
                for r in data.get("reviews", [])
                if r.get("feedback", "").strip()
            }
        except (json.JSONDecodeError, OSError, KeyError):
            pass

    # Load runs (to get outputs)
    prev_runs = find_runs(workspace)
    for run in prev_runs:
        result[run["id"]] = {
            "feedback": feedback_map.get(run["id"], ""),
            "outputs": run.get("outputs", []),
        }

    # Also add feedback for run_ids that had feedback but no matching run
    for run_id, fb in feedback_map.items():
        if run_id not in result:
            result[run_id] = {"feedback": fb, "outputs": []}

    return result


def generate_html(
    runs: list[dict],
    skill_name: str,
    previous: dict[str, dict] | None = None,
    benchmark: dict | None = None,
) -> str:
    """Generate the complete standalone HTML page with embedded data."""
    template_path = Path(__file__).parent / "viewer.html"
    template = template_path.read_text()

    # Build previous_feedback and previous_outputs maps for the template
    previous_feedback: dict[str, str] = {}
    previous_outputs: dict[str, list[dict]] = {}
    if previous:
        for run_id, data in previous.items():
            if data.get("feedback"):
                previous_feedback[run_id] = data["feedback"]
            if data.get("outputs"):
                previous_outputs[run_id] = data["outputs"]

    embedded = {
        "skill_name": skill_name,
        "runs": runs,
        "previous_feedback": previous_feedback,
        "previous_outputs": previous_outputs,
    }
    if benchmark:
        embedded["benchmark"] = benchmark

    data_json = json.dumps(embedded)

    return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};")


# ---------------------------------------------------------------------------
# HTTP server (stdlib only, zero dependencies)
# ---------------------------------------------------------------------------

def _kill_port(port: int) -> None:
    """Kill any process listening on the given port."""
    try:
        result = subprocess.run(
            ["lsof", "-ti", f":{port}"],
            capture_output=True, text=True, timeout=5,
        )
        for pid_str in result.stdout.strip().split("\n"):
            if pid_str.strip():
                try:
                    os.kill(int(pid_str.strip()), signal.SIGTERM)
                except (ProcessLookupError, ValueError):
                    pass
        if result.stdout.strip():
            time.sleep(0.5)
    except subprocess.TimeoutExpired:
        pass
    except FileNotFoundError:
        print("Note: lsof not found, cannot check if port is in use", file=sys.stderr)


class ReviewHandler(BaseHTTPRequestHandler):
    """Serves the review HTML and handles feedback saves.

    Regenerates the HTML on each page load so that refreshing the browser
    picks up new eval outputs without restarting the server.
    """

    def __init__(
        self,
        workspace: Path,
        skill_name: str,
        feedback_path: Path,
        previous: dict[str, dict],
        benchmark_path: Path | None,
        *args,
        **kwargs,
    ):
        self.workspace = workspace
        self.skill_name = skill_name
        self.feedback_path = feedback_path
        self.previous = previous
        self.benchmark_path = benchmark_path
        super().__init__(*args, **kwargs)

    def do_GET(self) -> None:
        if self.path == "/" or self.path == "/index.html":
            # Regenerate HTML on each request (re-scans workspace for new outputs)
            runs = find_runs(self.workspace)
            benchmark = None
            if self.benchmark_path and self.benchmark_path.exists():
                try:
                    benchmark = json.loads(self.benchmark_path.read_text())
                except (json.JSONDecodeError, OSError):
                    pass
            html = generate_html(runs, self.skill_name, self.previous, benchmark)
            content = html.encode("utf-8")
            self.send_response(200)
            self.send_header("Content-Type", "text/html; charset=utf-8")
            self.send_header("Content-Length", str(len(content)))
            self.end_headers()
            self.wfile.write(content)
        elif self.path == "/api/feedback":
            data = b"{}"
            if self.feedback_path.exists():
                data = self.feedback_path.read_bytes()
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(data)))
            self.end_headers()
            self.wfile.write(data)
        else:
            self.send_error(404)

    def do_POST(self) -> None:
        if self.path == "/api/feedback":
            length = int(self.headers.get("Content-Length", 0))
            body = self.rfile.read(length)
            try:
                data = json.loads(body)
                if not isinstance(data, dict) or "reviews" not in data:
                    raise ValueError("Expected JSON object with 'reviews' key")
                self.feedback_path.write_text(json.dumps(data, indent=2) + "\n")
                resp = b'{"ok":true}'
                self.send_response(200)
            except (json.JSONDecodeError, OSError, ValueError) as e:
                resp = json.dumps({"error": str(e)}).encode()
                self.send_response(500)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(resp)))
            self.end_headers()
            self.wfile.write(resp)
        else:
            self.send_error(404)

    def log_message(self, format: str, *args: object) -> None:
        # Suppress request logging to keep terminal clean
        pass


def main() -> None:
    parser = argparse.ArgumentParser(description="Generate and serve eval review")
    parser.add_argument("workspace", type=Path, help="Path to workspace directory")
    parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)")
    parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header")
    parser.add_argument(
        "--previous-workspace", type=Path, default=None,
        help="Path to previous iteration's workspace (shows old outputs and feedback as context)",
    )
    parser.add_argument(
        "--benchmark", type=Path, default=None,
        help="Path to benchmark.json to show in the Benchmark tab",
    )
    parser.add_argument(
        "--static", "-s", type=Path, default=None,
        help="Write standalone HTML to this path instead of starting a server",
    )
    args = parser.parse_args()

    workspace = args.workspace.resolve()
    if not workspace.is_dir():
        print(f"Error: {workspace} is not a directory", file=sys.stderr)
        sys.exit(1)

    runs = find_runs(workspace)
    if not runs:
        print(f"No runs found in {workspace}", file=sys.stderr)
        sys.exit(1)

    skill_name = args.skill_name or workspace.name.replace("-workspace", "")
    feedback_path = workspace / "feedback.json"

    previous: dict[str, dict] = {}
    if args.previous_workspace:
        previous = load_previous_iteration(args.previous_workspace.resolve())

    benchmark_path = args.benchmark.resolve() if args.benchmark else None
    benchmark = None
    if benchmark_path and benchmark_path.exists():
        try:
            benchmark = json.loads(benchmark_path.read_text())
        except (json.JSONDecodeError, OSError):
            pass

    if args.static:
        html = generate_html(runs, skill_name, previous, benchmark)
        args.static.parent.mkdir(parents=True, exist_ok=True)
        args.static.write_text(html)
        print(f"\n Static viewer written to: {args.static}\n")
        sys.exit(0)

    # Kill any existing process on the target port
    port = args.port
    _kill_port(port)
    handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path)
    try:
        server = HTTPServer(("127.0.0.1", port), handler)
    except OSError:
        # Port still in use after kill attempt — find a free one
        server = HTTPServer(("127.0.0.1", 0), handler)
        port = server.server_address[1]

    url = f"http://localhost:{port}"
    print(f"\n Eval Viewer")
    print(f" ─────────────────────────────────")
    print(f" URL: {url}")
    print(f" Workspace: {workspace}")
    print(f" Feedback: {feedback_path}")
    if previous:
        print(f" Previous: {args.previous_workspace} ({len(previous)} runs)")
    if benchmark_path:
        print(f" Benchmark: {benchmark_path}")
    print(f"\n Press Ctrl+C to stop.\n")

    webbrowser.open(url)

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nStopped.")
        server.server_close()


if __name__ == "__main__":
    main()