Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Create, test, and iteratively improve Claude skills with eval benchmarks and description optimization
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
eval-viewer/viewer.html
1<!DOCTYPE html>2<html lang="en">3<head>4<meta charset="UTF-8">5<meta name="viewport" content="width=device-width, initial-scale=1.0">6<title>Eval Review</title>7<link rel="preconnect" href="https://fonts.googleapis.com">8<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>9<link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">10<script src="https://cdn.sheetjs.com/xlsx-0.20.3/package/dist/xlsx.full.min.js" integrity="sha384-EnyY0/GSHQGSxSgMwaIPzSESbqoOLSexfnSMN2AP+39Ckmn92stwABZynq1JyzdT" crossorigin="anonymous"></script>11<style>12:root {13--bg: #faf9f5;14--surface: #ffffff;15--border: #e8e6dc;16--text: #141413;17--text-muted: #b0aea5;18--accent: #d97757;19--accent-hover: #c4613f;20--green: #788c5d;21--green-bg: #eef2e8;22--red: #c44;23--red-bg: #fceaea;24--header-bg: #141413;25--header-text: #faf9f5;26--radius: 6px;27}2829* { box-sizing: border-box; margin: 0; padding: 0; }3031body {32font-family: 'Lora', Georgia, serif;33background: var(--bg);34color: var(--text);35height: 100vh;36display: flex;37flex-direction: column;38}3940/* ---- Header ---- */41.header {42background: var(--header-bg);43color: var(--header-text);44padding: 1rem 2rem;45display: flex;46justify-content: space-between;47align-items: center;48flex-shrink: 0;49}50.header h1 {51font-family: 'Poppins', sans-serif;52font-size: 1.25rem;53font-weight: 600;54}55.header .instructions {56font-size: 0.8rem;57opacity: 0.7;58margin-top: 0.25rem;59}60.header .progress {61font-size: 0.875rem;62opacity: 0.8;63text-align: right;64}6566/* ---- Main content ---- */67.main {68flex: 1;69overflow-y: auto;70padding: 1.5rem 2rem;71display: flex;72flex-direction: column;73gap: 1.25rem;74}7576/* ---- Sections ---- */77.section {78background: var(--surface);79border: 1px solid var(--border);80border-radius: var(--radius);81flex-shrink: 0;82}83.section-header {84font-family: 'Poppins', sans-serif;85padding: 0.75rem 
1rem;86font-size: 0.75rem;87font-weight: 500;88text-transform: uppercase;89letter-spacing: 0.05em;90color: var(--text-muted);91border-bottom: 1px solid var(--border);92background: var(--bg);93}94.section-body {95padding: 1rem;96}9798/* ---- Config badge ---- */99.config-badge {100display: inline-block;101padding: 0.2rem 0.625rem;102border-radius: 9999px;103font-family: 'Poppins', sans-serif;104font-size: 0.6875rem;105font-weight: 600;106text-transform: uppercase;107letter-spacing: 0.03em;108margin-left: 0.75rem;109vertical-align: middle;110}111.config-badge.config-primary {112background: rgba(33, 150, 243, 0.12);113color: #1976d2;114}115.config-badge.config-baseline {116background: rgba(255, 193, 7, 0.15);117color: #f57f17;118}119120/* ---- Prompt ---- */121.prompt-text {122white-space: pre-wrap;123font-size: 0.9375rem;124line-height: 1.6;125}126127/* ---- Outputs ---- */128.output-file {129border: 1px solid var(--border);130border-radius: var(--radius);131overflow: hidden;132}133.output-file + .output-file {134margin-top: 1rem;135}136.output-file-header {137padding: 0.5rem 0.75rem;138font-size: 0.8rem;139font-weight: 600;140color: var(--text-muted);141background: var(--bg);142border-bottom: 1px solid var(--border);143font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace;144display: flex;145justify-content: space-between;146align-items: center;147}148.output-file-header .dl-btn {149font-size: 0.7rem;150color: var(--accent);151text-decoration: none;152cursor: pointer;153font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;154font-weight: 500;155opacity: 0.8;156}157.output-file-header .dl-btn:hover {158opacity: 1;159text-decoration: underline;160}161.output-file-content {162padding: 0.75rem;163overflow-x: auto;164}165.output-file-content pre {166font-size: 0.8125rem;167line-height: 1.5;168white-space: pre-wrap;169word-break: break-word;170font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, 
monospace;171}172.output-file-content img {173max-width: 100%;174height: auto;175border-radius: 4px;176}177.output-file-content iframe {178width: 100%;179height: 600px;180border: none;181}182.output-file-content table {183border-collapse: collapse;184font-size: 0.8125rem;185width: 100%;186}187.output-file-content table td,188.output-file-content table th {189border: 1px solid var(--border);190padding: 0.375rem 0.5rem;191text-align: left;192}193.output-file-content table th {194background: var(--bg);195font-weight: 600;196}197.output-file-content .download-link {198display: inline-flex;199align-items: center;200gap: 0.5rem;201padding: 0.5rem 1rem;202background: var(--bg);203border: 1px solid var(--border);204border-radius: 4px;205color: var(--accent);206text-decoration: none;207font-size: 0.875rem;208cursor: pointer;209}210.output-file-content .download-link:hover {211background: var(--border);212}213.empty-state {214color: var(--text-muted);215font-style: italic;216padding: 2rem;217text-align: center;218}219220/* ---- Feedback ---- */221.prev-feedback {222background: var(--bg);223border: 1px solid var(--border);224border-radius: 4px;225padding: 0.625rem 0.75rem;226margin-top: 0.75rem;227font-size: 0.8125rem;228color: var(--text-muted);229line-height: 1.5;230}231.prev-feedback-label {232font-size: 0.7rem;233font-weight: 600;234text-transform: uppercase;235letter-spacing: 0.04em;236margin-bottom: 0.25rem;237color: var(--text-muted);238}239.feedback-textarea {240width: 100%;241min-height: 100px;242padding: 0.75rem;243border: 1px solid var(--border);244border-radius: 4px;245font-family: inherit;246font-size: 0.9375rem;247line-height: 1.5;248resize: vertical;249color: var(--text);250}251.feedback-textarea:focus {252outline: none;253border-color: var(--accent);254box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1);255}256.feedback-status {257font-size: 0.75rem;258color: var(--text-muted);259margin-top: 0.5rem;260min-height: 1.1em;261}262263/* ---- Grades (collapsible) ---- 
*/264.grades-toggle {265display: flex;266align-items: center;267cursor: pointer;268user-select: none;269}270.grades-toggle:hover {271color: var(--accent);272}273.grades-toggle .arrow {274margin-right: 0.5rem;275transition: transform 0.15s;276font-size: 0.75rem;277}278.grades-toggle .arrow.open {279transform: rotate(90deg);280}281.grades-content {282display: none;283margin-top: 0.75rem;284}285.grades-content.open {286display: block;287}288.grades-summary {289font-size: 0.875rem;290margin-bottom: 0.75rem;291display: flex;292align-items: center;293gap: 0.5rem;294}295.grade-badge {296display: inline-block;297padding: 0.125rem 0.5rem;298border-radius: 9999px;299font-size: 0.75rem;300font-weight: 600;301}302.grade-pass { background: var(--green-bg); color: var(--green); }303.grade-fail { background: var(--red-bg); color: var(--red); }304.assertion-list {305list-style: none;306}307.assertion-item {308padding: 0.625rem 0;309border-bottom: 1px solid var(--border);310font-size: 0.8125rem;311}312.assertion-item:last-child { border-bottom: none; }313.assertion-status {314font-weight: 600;315margin-right: 0.5rem;316}317.assertion-status.pass { color: var(--green); }318.assertion-status.fail { color: var(--red); }319.assertion-evidence {320color: var(--text-muted);321font-size: 0.75rem;322margin-top: 0.25rem;323padding-left: 1.5rem;324}325326/* ---- View tabs ---- */327.view-tabs {328display: flex;329gap: 0;330padding: 0 2rem;331background: var(--bg);332border-bottom: 1px solid var(--border);333flex-shrink: 0;334}335.view-tab {336font-family: 'Poppins', sans-serif;337padding: 0.625rem 1.25rem;338font-size: 0.8125rem;339font-weight: 500;340cursor: pointer;341border: none;342background: none;343color: var(--text-muted);344border-bottom: 2px solid transparent;345transition: all 0.15s;346}347.view-tab:hover { color: var(--text); }348.view-tab.active {349color: var(--accent);350border-bottom-color: var(--accent);351}352.view-panel { display: none; }353.view-panel.active { display: 
flex; flex-direction: column; flex: 1; overflow: hidden; }354355/* ---- Benchmark view ---- */356.benchmark-view {357padding: 1.5rem 2rem;358overflow-y: auto;359flex: 1;360}361.benchmark-table {362border-collapse: collapse;363background: var(--surface);364border: 1px solid var(--border);365border-radius: var(--radius);366font-size: 0.8125rem;367width: 100%;368margin-bottom: 1.5rem;369}370.benchmark-table th, .benchmark-table td {371padding: 0.625rem 0.75rem;372text-align: left;373border: 1px solid var(--border);374}375.benchmark-table th {376font-family: 'Poppins', sans-serif;377background: var(--header-bg);378color: var(--header-text);379font-weight: 500;380font-size: 0.75rem;381text-transform: uppercase;382letter-spacing: 0.04em;383}384.benchmark-table tr:hover { background: var(--bg); }385.benchmark-table tr.benchmark-row-with { background: rgba(33, 150, 243, 0.06); }386.benchmark-table tr.benchmark-row-without { background: rgba(255, 193, 7, 0.06); }387.benchmark-table tr.benchmark-row-with:hover { background: rgba(33, 150, 243, 0.12); }388.benchmark-table tr.benchmark-row-without:hover { background: rgba(255, 193, 7, 0.12); }389.benchmark-table tr.benchmark-row-avg { font-weight: 600; border-top: 2px solid var(--border); }390.benchmark-table tr.benchmark-row-avg.benchmark-row-with { background: rgba(33, 150, 243, 0.12); }391.benchmark-table tr.benchmark-row-avg.benchmark-row-without { background: rgba(255, 193, 7, 0.12); }392.benchmark-delta-positive { color: var(--green); font-weight: 600; }393.benchmark-delta-negative { color: var(--red); font-weight: 600; }394.benchmark-notes {395background: var(--surface);396border: 1px solid var(--border);397border-radius: var(--radius);398padding: 1rem;399}400.benchmark-notes h3 {401font-family: 'Poppins', sans-serif;402font-size: 0.875rem;403margin-bottom: 0.75rem;404}405.benchmark-notes ul {406list-style: disc;407padding-left: 1.25rem;408}409.benchmark-notes li {410font-size: 0.8125rem;411line-height: 
1.6;412margin-bottom: 0.375rem;413}414.benchmark-empty {415color: var(--text-muted);416font-style: italic;417text-align: center;418padding: 3rem;419}420421/* ---- Navigation ---- */422.nav {423display: flex;424justify-content: space-between;425align-items: center;426padding: 1rem 2rem;427border-top: 1px solid var(--border);428background: var(--surface);429flex-shrink: 0;430}431.nav-btn {432font-family: 'Poppins', sans-serif;433padding: 0.5rem 1.25rem;434border: 1px solid var(--border);435border-radius: var(--radius);436background: var(--surface);437cursor: pointer;438font-size: 0.875rem;439font-weight: 500;440color: var(--text);441transition: all 0.15s;442}443.nav-btn:hover:not(:disabled) {444background: var(--bg);445border-color: var(--text-muted);446}447.nav-btn:disabled {448opacity: 0.4;449cursor: not-allowed;450}451.done-btn {452font-family: 'Poppins', sans-serif;453padding: 0.5rem 1.5rem;454border: 1px solid var(--border);455border-radius: var(--radius);456background: var(--surface);457color: var(--text);458cursor: pointer;459font-size: 0.875rem;460font-weight: 500;461transition: all 0.15s;462}463.done-btn:hover {464background: var(--bg);465border-color: var(--text-muted);466}467.done-btn.ready {468border: none;469background: var(--accent);470color: white;471font-weight: 600;472}473.done-btn.ready:hover {474background: var(--accent-hover);475}476/* ---- Done overlay ---- */477.done-overlay {478display: none;479position: fixed;480inset: 0;481background: rgba(0, 0, 0, 0.5);482z-index: 100;483justify-content: center;484align-items: center;485}486.done-overlay.visible {487display: flex;488}489.done-card {490background: var(--surface);491border-radius: 12px;492padding: 2rem 3rem;493text-align: center;494box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);495max-width: 500px;496}497.done-card h2 {498font-size: 1.5rem;499margin-bottom: 0.5rem;500}501.done-card p {502color: var(--text-muted);503margin-bottom: 1.5rem;504line-height: 1.5;505}506.done-card .btn-row {507display: 
flex;
  gap: 0.5rem;
  justify-content: center;
}
.done-card button {
  padding: 0.5rem 1.25rem;
  border: 1px solid var(--border);
  border-radius: var(--radius);
  background: var(--surface);
  cursor: pointer;
  font-size: 0.875rem;
}
.done-card button:hover {
  background: var(--bg);
}
/* ---- Toast ---- */
.toast {
  position: fixed;
  bottom: 5rem;
  left: 50%;
  transform: translateX(-50%);
  background: var(--header-bg);
  color: var(--header-text);
  padding: 0.625rem 1.25rem;
  border-radius: var(--radius);
  font-size: 0.875rem;
  opacity: 0;
  transition: opacity 0.3s;
  pointer-events: none;
  z-index: 200;
}
.toast.visible {
  opacity: 1;
}
</style>
</head>
<body>
<div id="app" style="height:100vh; display:flex; flex-direction:column;">
  <div class="header">
    <div>
      <h1>Eval Review: <span id="skill-name"></span></h1>
      <div class="instructions">Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.</div>
    </div>
    <div class="progress" id="progress"></div>
  </div>

  <!-- View tabs (only shown when benchmark data exists) -->
  <div class="view-tabs" id="view-tabs" style="display:none;">
    <button type="button" class="view-tab active" onclick="switchView('outputs')">Outputs</button>
    <button type="button" class="view-tab" onclick="switchView('benchmark')">Benchmark</button>
  </div>

  <!-- Outputs panel (qualitative review) -->
  <div class="view-panel active" id="panel-outputs">
    <div class="main">
      <!-- Prompt -->
      <div class="section">
        <div class="section-header">Prompt <span class="config-badge" id="config-badge" style="display:none;"></span></div>
        <div class="section-body">
          <div class="prompt-text" id="prompt-text"></div>
        </div>
      </div>

      <!-- Outputs -->
      <div class="section">
        <div class="section-header">Output</div>
        <div class="section-body" id="outputs-body">
          <div class="empty-state">No output files found</div>
        </div>
      </div>

      <!-- Previous Output (collapsible).
           The toggle is a styled <div>, so it gets role="button", a tab stop
           and an Enter/Space handler to make it operable from the keyboard. -->
      <div class="section" id="prev-outputs-section" style="display:none;">
        <div class="section-header">
          <div class="grades-toggle" role="button" tabindex="0"
               onclick="togglePrevOutputs()"
               onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); togglePrevOutputs(); }">
            <span class="arrow" id="prev-outputs-arrow" aria-hidden="true">▶</span>
            Previous Output
          </div>
        </div>
        <div class="grades-content" id="prev-outputs-content"></div>
      </div>

      <!-- Grades (collapsible); same keyboard treatment as the toggle above -->
      <div class="section" id="grades-section" style="display:none;">
        <div class="section-header">
          <div class="grades-toggle" role="button" tabindex="0"
               onclick="toggleGrades()"
               onkeydown="if (event.key === 'Enter' || event.key === ' ') { event.preventDefault(); toggleGrades(); }">
            <span class="arrow" id="grades-arrow" aria-hidden="true">▶</span>
            Formal Grades
          </div>
        </div>
        <div class="grades-content" id="grades-content"></div>
      </div>

      <!-- Feedback.
           The textarea takes its accessible name from the section heading
           (a placeholder is not a label); the status line is a polite live
           region so text written into it is announced. -->
      <div class="section">
        <div class="section-header" id="feedback-heading">Your Feedback</div>
        <div class="section-body">
          <textarea
            class="feedback-textarea"
            id="feedback"
            placeholder="What do you think of this output? Any issues, suggestions, or things that look great?"
            aria-labelledby="feedback-heading"
          ></textarea>
          <div class="feedback-status" id="feedback-status" role="status"></div>
          <div class="prev-feedback" id="prev-feedback" style="display:none;">
            <div class="prev-feedback-label">Previous feedback</div>
            <div id="prev-feedback-text"></div>
          </div>
        </div>
      </div>
    </div>

    <div class="nav" id="outputs-nav">
      <button type="button" class="nav-btn" id="prev-btn" onclick="navigate(-1)">← Previous</button>
      <button type="button" class="done-btn" id="done-btn" onclick="showDoneDialog()">Submit All Reviews</button>
      <button type="button" class="nav-btn" id="next-btn" onclick="navigate(1)">Next →</button>
    </div>
  </div><!-- end panel-outputs -->

  <!-- Benchmark panel (quantitative stats) -->
  <div class="view-panel" id="panel-benchmark">
    <div class="benchmark-view" id="benchmark-content">
      <div class="benchmark-empty">No benchmark data available.
Run a benchmark to see quantitative results here.</div>
    </div>
  </div>
</div>

<!-- Done overlay -->
<div class="done-overlay" id="done-overlay">
  <div class="done-card">
    <h2>Review Complete</h2>
    <p>Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.</p>
    <div class="btn-row">
      <button type="button" onclick="closeDoneDialog()">OK</button>
    </div>
  </div>
</div>

<!-- Toast -->
<div class="toast" id="toast"></div>

<script>
// ---- Embedded data (injected by generate_review.py) ----
/*__EMBEDDED_DATA__*/

// ---- State ----
let feedbackMap = {}; // run_id -> feedback text
let currentIndex = 0; // index into EMBEDDED_DATA.runs of the run on screen
let visitedRuns = new Set(); // indices already passed to showRun()

// ---- Init ----
/**
 * Bootstraps the viewer: optionally restores saved feedback from the server,
 * fills in the skill name, shows the first run and wires textarea auto-save.
 */
async function init() {
  // Load saved feedback from server — but only if this isn't a fresh
  // iteration (indicated by previous_feedback being present). When
  // previous feedback exists, the feedback.json on disk is stale from
  // the prior iteration and should not pre-fill the textareas.
  const hasPrevious = Object.keys(EMBEDDED_DATA.previous_feedback || {}).length > 0
    || Object.keys(EMBEDDED_DATA.previous_outputs || {}).length > 0;
  if (!hasPrevious) {
    try {
      const resp = await fetch("/api/feedback");
      // fetch() also resolves for HTTP error statuses, so only parse a
      // successful reply.
      if (resp.ok) {
        const data = await resp.json();
        if (data && Array.isArray(data.reviews)) {
          for (const r of data.reviews) {
            // Keep strings only: the save path calls .trim() on every entry.
            if (r && typeof r.feedback === "string") feedbackMap[r.run_id] = r.feedback;
          }
        }
      }
    } catch { /* first run, no feedback yet */ }
  }

  document.getElementById("skill-name").textContent = EMBEDDED_DATA.skill_name;
  showRun(0);

  // Wire up feedback auto-save: debounce typing by 800 ms and clear the
  // status line while edits are pending.
  const textarea = document.getElementById("feedback");
  let saveTimeout = null;
  textarea.addEventListener("input", () => {
    clearTimeout(saveTimeout);
    document.getElementById("feedback-status").textContent = "";
    saveTimeout = setTimeout(() => saveCurrentFeedback(), 800);
  });
}

// ---- Navigation ----
/**
 * Moves `delta` runs forward (positive) or backward (negative), saving the
 * current feedback first. Does nothing when the target index is out of range.
 */
function navigate(delta) {
  const newIndex = currentIndex + delta;
  if (newIndex >= 0 && newIndex < EMBEDDED_DATA.runs.length) {
    saveCurrentFeedback();
    showRun(newIndex);
  }
}

/** Disables the Previous/Next buttons at the first/last run. */
function updateNavButtons() {
  document.getElementById("prev-btn").disabled = currentIndex === 0;
  document.getElementById("next-btn").disabled =
    currentIndex === EMBEDDED_DATA.runs.length - 1;
}

// ---- Show a run ----
/**
 * Renders run number `index`: progress text, prompt, config badge, outputs,
 * previous outputs, grades, previous feedback and the feedback textarea.
 */
function showRun(index) {
  currentIndex = index;
  const run = EMBEDDED_DATA.runs[index];

  // Progress
  document.getElementById("progress").textContent =
    `${index + 1} of ${EMBEDDED_DATA.runs.length}`;

  // Prompt
  document.getElementById("prompt-text").textContent = run.prompt;

  // Config badge: derived from a known configuration name inside the run id
  const badge = document.getElementById("config-badge");
  const configMatch = run.id.match(/(with_skill|without_skill|new_skill|old_skill)/);
  if (configMatch) {
    const config = configMatch[1];
    const isBaseline = config === "without_skill" || config === "old_skill";
    badge.textContent = config.replace(/_/g, " ");
    badge.className = "config-badge " + (isBaseline ? "config-baseline" : "config-primary");
    badge.style.display = "inline-block";
  } else {
    badge.style.display = "none";
  }

  // Outputs
  renderOutputs(run);

  // Previous outputs
  renderPrevOutputs(run);

  // Grades
  renderGrades(run);

  // Previous feedback
  const prevFb = (EMBEDDED_DATA.previous_feedback || {})[run.id];
  const prevEl = document.getElementById("prev-feedback");
  if (prevFb) {
    document.getElementById("prev-feedback-text").textContent = prevFb;
    prevEl.style.display = "block";
  } else {
    prevEl.style.display = "none";
  }

  // Feedback
  document.getElementById("feedback").value = feedbackMap[run.id] || "";
  document.getElementById("feedback-status").textContent = "";

  updateNavButtons();

  // Track visited runs and promote done button when all visited
  visitedRuns.add(index);
  const doneBtn = document.getElementById("done-btn");
  if (visitedRuns.size >= EMBEDDED_DATA.runs.length) {
    doneBtn.classList.add("ready");
  }

  // Scroll main content to top
  document.querySelector(".main").scrollTop = 0;
}

// ---- Render outputs ----
/**
 * Builds the DOM for a single output file: a header (file name + download
 * link) followed by a viewer chosen by `file.type` ("text", "image", "pdf",
 * "xlsx", "binary" or "error"). Unknown types get the header only.
 * Shared by renderOutputs() and renderPrevOutputs() so both lists support
 * the same set of file types.
 */
function buildOutputFileElement(file) {
  const fileDiv = document.createElement("div");
  fileDiv.className = "output-file";

  // Always show file header with download link
  const header = document.createElement("div");
  header.className = "output-file-header";
  const nameSpan = document.createElement("span");
  nameSpan.textContent = file.name;
  header.appendChild(nameSpan);
  const dlBtn = document.createElement("a");
  dlBtn.className = "dl-btn";
  dlBtn.textContent = "Download";
  dlBtn.download = file.name;
  dlBtn.href = getDownloadUri(file);
  header.appendChild(dlBtn);
  fileDiv.appendChild(header);

  const content = document.createElement("div");
  content.className = "output-file-content";

  switch (file.type) {
    case "text": {
      const pre = document.createElement("pre");
      pre.textContent = file.content;
      content.appendChild(pre);
      break;
    }
    case "image": {
      const img = document.createElement("img");
      img.src = file.data_uri;
      img.alt = file.name;
      content.appendChild(img);
      break;
    }
    case "pdf": {
      const iframe = document.createElement("iframe");
      iframe.src = file.data_uri;
      iframe.title = file.name; // iframes need a title to be identifiable
      content.appendChild(iframe);
      break;
    }
    case "xlsx":
      renderXlsx(content, file.data_b64);
      break;
    case "binary": {
      const a = document.createElement("a");
      a.className = "download-link";
      a.href = file.data_uri;
      a.download = file.name;
      a.textContent = "Download " + file.name;
      content.appendChild(a);
      break;
    }
    case "error": {
      const pre = document.createElement("pre");
      pre.textContent = file.content;
      pre.style.color = "var(--red)";
      content.appendChild(pre);
      break;
    }
  }

  fileDiv.appendChild(content);
  return fileDiv;
}

/** Fills #outputs-body with the run's output files, or an empty-state note. */
function renderOutputs(run) {
  const container = document.getElementById("outputs-body");
  container.innerHTML = "";

  const outputs = run.outputs || [];
  if (outputs.length === 0) {
    container.innerHTML = '<div class="empty-state">No output files</div>';
    return;
  }

  for (const file of outputs) {
    container.appendChild(buildOutputFileElement(file));
  }
}

// ---- XLSX rendering via SheetJS ----
/**
 * Decodes a base64 workbook and appends one HTML table per sheet to
 * `container` (with a "Sheet: name" label when there are several sheets).
 * Any failure replaces the container's content with an error message.
 */
function renderXlsx(container, b64Data) {
  try {
    const raw = Uint8Array.from(atob(b64Data), c => c.charCodeAt(0));
    const wb = XLSX.read(raw, { type: "array" });
    const multiSheet = wb.SheetNames.length > 1;

    for (const sheetName of wb.SheetNames) {
      const ws = wb.Sheets[sheetName];

      if (multiSheet) {
        const sheetLabel = document.createElement("div");
        sheetLabel.style.cssText =
          "font-weight:600; font-size:0.8rem; color:#b0aea5; margin-top:0.5rem; margin-bottom:0.25rem;";
        sheetLabel.textContent = "Sheet: " + sheetName;
        container.appendChild(sheetLabel);
      }

      // NOTE(review): the markup returned by sheet_to_html is inserted as
      // HTML; presumably SheetJS escapes cell text — confirm before
      // rendering workbooks from untrusted sources.
      const htmlStr = XLSX.utils.sheet_to_html(ws, { editable: false });
      const wrapper = document.createElement("div");
      wrapper.innerHTML = htmlStr;
      container.appendChild(wrapper);
    }
  } catch (err) {
    container.textContent = "Error rendering spreadsheet: " + err.message;
  }
}

// ---- Grades ----
/**
 * Renders the collapsed "Formal Grades" section from `run.grading`
 * (summary badge + per-expectation list); hides the section when the run
 * has no grading.
 */
function renderGrades(run) {
  const section = document.getElementById("grades-section");
  const content = document.getElementById("grades-content");

  if (!run.grading) {
    section.style.display = "none";
    return;
  }

  const grading = run.grading;
  section.style.display = "block";
  // Reset to collapsed
  content.classList.remove("open");
  document.getElementById("grades-arrow").classList.remove("open");

  const summary = grading.summary || {};
  const expectations = grading.expectations || [];

  // Summary line. A missing pass rate is shown as a neutral "?" badge:
  // comparing undefined/null with >= is always false, which would otherwise
  // fall through to the red "fail" styling.
  const rate = summary.pass_rate;
  const hasRate = rate != null;
  const passRate = hasRate ? Math.round(rate * 100) + "%" : "?";
  let badgeClass = "";
  if (hasRate) {
    badgeClass = rate >= 0.8 ? "grade-pass" : rate >= 0.5 ? "" : "grade-fail";
  }
  // Counts come from embedded data, so escape them like the other text.
  const count = (value) => escapeHtml(String(value || 0));

  let html = '<div style="padding: 1rem;">';
  html += '<div class="grades-summary">';
  html += '<span class="grade-badge ' + badgeClass + '">' + passRate + '</span>';
  html += '<span>' + count(summary.passed) + ' passed, ' + count(summary.failed) + ' failed of ' + count(summary.total) + '</span>';
  html += '</div>';

  // Assertions list
  html += '<ul class="assertion-list">';
  for (const exp of expectations) {
    const statusClass = exp.passed ? "pass" : "fail";
    const statusIcon = exp.passed ? "\u2713" : "\u2717";
    html += '<li class="assertion-item">';
    html += '<span class="assertion-status ' + statusClass + '">' + statusIcon + '</span>';
    html += '<span>' + escapeHtml(exp.text) + '</span>';
    if (exp.evidence) {
      html += '<div class="assertion-evidence">' + escapeHtml(exp.evidence) + '</div>';
    }
    html += '</li>';
  }
  html += '</ul>';

  html += '</div>';
  content.innerHTML = html;
}

/** Expands or collapses the grades section. */
function toggleGrades() {
  const content = document.getElementById("grades-content");
  const arrow = document.getElementById("grades-arrow");
  content.classList.toggle("open");
  arrow.classList.toggle("open");
}

// ---- Previous outputs (collapsible) ----
/**
 * Renders the collapsed "Previous Output" section from
 * EMBEDDED_DATA.previous_outputs[run.id]; hides it when there are none.
 */
function renderPrevOutputs(run) {
  const section = document.getElementById("prev-outputs-section");
  const content = document.getElementById("prev-outputs-content");
  const prevOutputs = (EMBEDDED_DATA.previous_outputs || {})[run.id];

  if (!prevOutputs || prevOutputs.length === 0) {
    section.style.display = "none";
    return;
  }

  section.style.display = "block";
  // Reset to collapsed
  content.classList.remove("open");
  document.getElementById("prev-outputs-arrow").classList.remove("open");

  // Render the files into the content area
  content.innerHTML = "";
  const wrapper = document.createElement("div");
  wrapper.style.padding = "1rem";
  for (const file of prevOutputs) {
    wrapper.appendChild(buildOutputFileElement(file));
  }
  content.appendChild(wrapper);
}

/** Expands or collapses the previous-output section. */
function togglePrevOutputs() {
  const content = document.getElementById("prev-outputs-content");
  const arrow = document.getElementById("prev-outputs-arrow");
  content.classList.toggle("open");
  arrow.classList.toggle("open");
}

// ---- Feedback (saved to server -> feedback.json) ----
/**
 * Copies the textarea into feedbackMap for the current run; blank or
 * whitespace-only text removes the entry instead.
 */
function stashCurrentFeedback() {
  const run = EMBEDDED_DATA.runs[currentIndex];
  const text = document.getElementById("feedback").value;

  if (text.trim() === "") {
    delete feedbackMap[run.id];
  } else {
    feedbackMap[run.id] = text;
  }
}

/**
 * Auto-save: stores the current textarea and POSTs every non-empty review
 * with status "in_progress", then reports the outcome in #feedback-status.
 */
function saveCurrentFeedback() {
  stashCurrentFeedback();

  // Build reviews array from map
  const reviews = [];
  for (const [run_id, feedback] of Object.entries(feedbackMap)) {
    if (feedback.trim()) {
      reviews.push({ run_id, feedback, timestamp: new Date().toISOString() });
    }
  }

  const status = document.getElementById("feedback-status");
  fetch("/api/feedback", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ reviews, status: "in_progress" }),
  }).then((resp) => {
    // fetch() only rejects on network failure; an HTTP error status (e.g. a
    // static file server answering 404/405) must not be reported as saved.
    if (!resp.ok) throw new Error("HTTP " + resp.status);
    status.textContent = "Saved";
  }).catch(() => {
    // Static mode or server unavailable — no-op on auto-save,
    // feedback will be downloaded on final submit
    status.textContent = "Will download on submit";
  });
}

// ---- Done ----
/** Offers `payload` (a JSON string) to the user as a feedback.json download. */
function downloadFeedbackFile(payload) {
  const blob = new Blob([payload], { type: "application/json" });
  const url = URL.createObjectURL(blob);
  const a = document.createElement("a");
  a.href = url;
  a.download = "feedback.json";
  document.body.appendChild(a);
  a.click();
  a.remove();
  // Revoke later rather than right after click(): the browser may not have
  // started reading the blob yet.
  setTimeout(() => URL.revokeObjectURL(url), 1000);
}

/**
 * Final submit: POSTs one review per run with status "complete" and shows
 * the done overlay. If the server cannot be reached or answers with an
 * error status, the same payload is downloaded as feedback.json instead.
 */
function showDoneDialog() {
  // Save current textarea to feedbackMap (but don't POST yet)
  stashCurrentFeedback();

  // POST once with status: complete — include ALL runs so the model
  // can distinguish "no feedback" (looks good) from "not reviewed"
  const ts = new Date().toISOString();
  const reviews = EMBEDDED_DATA.runs.map((r) => ({
    run_id: r.id,
    feedback: feedbackMap[r.id] || "",
    timestamp: ts,
  }));
  const payload = JSON.stringify({ reviews, status: "complete" }, null, 2);

  fetch("/api/feedback", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
  }).then((resp) => {
    if (!resp.ok) throw new Error("HTTP " + resp.status);
  }).catch(() => {
    // Server not available (static mode) or rejected the request —
    // download as file
    downloadFeedbackFile(payload);
  }).then(() => {
    document.getElementById("done-overlay").classList.add("visible");
  });
}

/** Hides the done overlay. */
function closeDoneDialog() {
  // Reset status back to in_progress
  saveCurrentFeedback();
  document.getElementById("done-overlay").classList.remove("visible");
}

// ---- Toast ----
/** Shows `message` in the toast for two seconds. */
function showToast(message) {
  const toast = document.getElementById("toast");
  toast.textContent = message;
  toast.classList.add("visible");
  setTimeout(() => toast.classList.remove("visible"), 2000);
}

// ---- Keyboard nav ----
document.addEventListener("keydown", (e) => {
  // Leave modified arrows alone (e.g. Alt+Left/Right is browser back/forward).
  if (e.altKey || e.ctrlKey || e.metaKey || e.shiftKey) return;

  // Don't capture when typing in a text control
  const target = e.target;
  const tag = target && target.tagName;
  if (tag === "TEXTAREA" || tag === "INPUT" || tag === "SELECT") return;
  if (target && target.isContentEditable) return;

  // Only page through runs while the outputs panel is what the user sees.
  if (document.getElementById("done-overlay").classList.contains("visible")) return;
  if (!document.getElementById("panel-outputs").classList.contains("active")) return;

  if (e.key === "ArrowLeft" || e.key === "ArrowUp") {
    e.preventDefault();
    navigate(-1);
  } else if (e.key === "ArrowRight" || e.key === "ArrowDown") {
    e.preventDefault();
    navigate(1);
  }
});

// ---- Util ----
/**
 * Returns an href for downloading `file`: its data URI if present, else a
 * base64 octet-stream URI, else (for text files) a percent-encoded
 * text/plain URI, else "#".
 */
function getDownloadUri(file) {
  if (file.data_uri) return file.data_uri;
  if (file.data_b64) return "data:application/octet-stream;base64," + file.data_b64;
  if (file.type === "text") return "data:text/plain;charset=utf-8," + encodeURIComponent(file.content);
  return "#";
}

/** Escapes `text` for use as HTML element content via a detached <div>. */
function escapeHtml(text) {
  const div = document.createElement("div");
  div.textContent = text;
  return div.innerHTML;
}

// ---- View switching ----
/**
 * Activates the tab and panel for `view` ("outputs" or "benchmark").
 * Tabs are matched by their inline onclick attribute; a name with no
 * matching panel leaves every panel inactive rather than throwing.
 */
function switchView(view) {
  const handler = `switchView('${view}')`;
  document.querySelectorAll(".view-tab").forEach((tab) => {
    tab.classList.toggle("active", tab.getAttribute("onclick") === handler);
  });
  document.querySelectorAll(".view-panel").forEach((panel) => {
    panel.classList.toggle("active", panel.id === "panel-" + view);
  });
}

// ---- Benchmark rendering ----
function renderBenchmark() {
  const data = EMBEDDED_DATA.benchmark;
  if (!data) return;

  // Show the tabs
  document.getElementById("view-tabs").style.display = "flex";

  const container = document.getElementById("benchmark-content");
  const summary = data.run_summary || {};
  const metadata = data.metadata || {};
  const notes = data.notes || [];

  let html = "";

  // Header
  html += "<h2 style='font-family: Poppins, sans-serif; margin-bottom: 0.5rem;'>Benchmark Results</h2>";
  html += "<p style='color: var(--text-muted); font-size: 0.875rem; margin-bottom: 1.25rem;'>";
  if (metadata.skill_name) html += "<strong>" + escapeHtml(metadata.skill_name) + "</strong> — ";
  if (metadata.timestamp) html += metadata.timestamp + " — ";
  if
(metadata.evals_run) html += "Evals: " + metadata.evals_run.join(", ") + " — ";1134html += (metadata.runs_per_configuration || "?") + " runs per configuration";1135html += "</p>";11361137// Summary table1138html += '<table class="benchmark-table">';11391140function fmtStat(stat, pct) {1141if (!stat) return "—";1142const suffix = pct ? "%" : "";1143const m = pct ? (stat.mean * 100).toFixed(0) : stat.mean.toFixed(1);1144const s = pct ? (stat.stddev * 100).toFixed(0) : stat.stddev.toFixed(1);1145return m + suffix + " ± " + s + suffix;1146}11471148function deltaClass(val) {1149if (!val) return "";1150const n = parseFloat(val);1151if (n > 0) return "benchmark-delta-positive";1152if (n < 0) return "benchmark-delta-negative";1153return "";1154}11551156// Discover config names dynamically (everything except "delta")1157const configs = Object.keys(summary).filter(k => k !== "delta");1158const configA = configs[0] || "config_a";1159const configB = configs[1] || "config_b";1160const labelA = configA.replace(/_/g, " ").replace(/\b\w/g, c => c.toUpperCase());1161const labelB = configB.replace(/_/g, " ").replace(/\b\w/g, c => c.toUpperCase());1162const a = summary[configA] || {};1163const b = summary[configB] || {};1164const delta = summary.delta || {};11651166html += "<thead><tr><th>Metric</th><th>" + escapeHtml(labelA) + "</th><th>" + escapeHtml(labelB) + "</th><th>Delta</th></tr></thead>";1167html += "<tbody>";11681169html += "<tr><td><strong>Pass Rate</strong></td>";1170html += "<td>" + fmtStat(a.pass_rate, true) + "</td>";1171html += "<td>" + fmtStat(b.pass_rate, true) + "</td>";1172html += '<td class="' + deltaClass(delta.pass_rate) + '">' + (delta.pass_rate || "—") + "</td></tr>";11731174// Time (only show row if data exists)1175if (a.time_seconds || b.time_seconds) {1176html += "<tr><td><strong>Time (s)</strong></td>";1177html += "<td>" + fmtStat(a.time_seconds, false) + "</td>";1178html += "<td>" + fmtStat(b.time_seconds, false) + "</td>";1179html += '<td class="' + 
deltaClass(delta.time_seconds) + '">' + (delta.time_seconds ? delta.time_seconds + "s" : "—") + "</td></tr>";1180}11811182// Tokens (only show row if data exists)1183if (a.tokens || b.tokens) {1184html += "<tr><td><strong>Tokens</strong></td>";1185html += "<td>" + fmtStat(a.tokens, false) + "</td>";1186html += "<td>" + fmtStat(b.tokens, false) + "</td>";1187html += '<td class="' + deltaClass(delta.tokens) + '">' + (delta.tokens || "—") + "</td></tr>";1188}11891190html += "</tbody></table>";11911192// Per-eval breakdown (if runs data available)1193const runs = data.runs || [];1194if (runs.length > 0) {1195const evalIds = [...new Set(runs.map(r => r.eval_id))].sort((a, b) => a - b);11961197html += "<h3 style='font-family: Poppins, sans-serif; margin-bottom: 0.75rem;'>Per-Eval Breakdown</h3>";11981199const hasTime = runs.some(r => r.result && r.result.time_seconds != null);1200const hasErrors = runs.some(r => r.result && r.result.errors > 0);12011202for (const evalId of evalIds) {1203const evalRuns = runs.filter(r => r.eval_id === evalId);1204const evalName = evalRuns[0] && evalRuns[0].eval_name ? evalRuns[0].eval_name : "Eval " + evalId;12051206html += "<h4 style='font-family: Poppins, sans-serif; margin: 1rem 0 0.5rem; color: var(--text);'>" + escapeHtml(evalName) + "</h4>";1207html += '<table class="benchmark-table">';1208html += "<thead><tr><th>Config</th><th>Run</th><th>Pass Rate</th>";1209if (hasTime) html += "<th>Time (s)</th>";1210if (hasErrors) html += "<th>Crashes During Execution</th>";1211html += "</tr></thead>";1212html += "<tbody>";12131214// Group by config and render with average rows1215const configGroups = [...new Set(evalRuns.map(r => r.configuration))];1216for (let ci = 0; ci < configGroups.length; ci++) {1217const config = configGroups[ci];1218const configRuns = evalRuns.filter(r => r.configuration === config);1219if (configRuns.length === 0) continue;12201221const rowClass = ci === 0 ? 
"benchmark-row-with" : "benchmark-row-without";1222const configLabel = config.replace(/_/g, " ").replace(/\b\w/g, c => c.toUpperCase());12231224for (const run of configRuns) {1225const r = run.result || {};1226const prClass = r.pass_rate >= 0.8 ? "benchmark-delta-positive" : r.pass_rate < 0.5 ? "benchmark-delta-negative" : "";1227html += '<tr class="' + rowClass + '">';1228html += "<td>" + configLabel + "</td>";1229html += "<td>" + run.run_number + "</td>";1230html += '<td class="' + prClass + '">' + ((r.pass_rate || 0) * 100).toFixed(0) + "% (" + (r.passed || 0) + "/" + (r.total || 0) + ")</td>";1231if (hasTime) html += "<td>" + (r.time_seconds != null ? r.time_seconds.toFixed(1) : "—") + "</td>";1232if (hasErrors) html += "<td>" + (r.errors || 0) + "</td>";1233html += "</tr>";1234}12351236// Average row1237const rates = configRuns.map(r => (r.result || {}).pass_rate || 0);1238const avgRate = rates.reduce((a, b) => a + b, 0) / rates.length;1239const avgPrClass = avgRate >= 0.8 ? "benchmark-delta-positive" : avgRate < 0.5 ? "benchmark-delta-negative" : "";1240html += '<tr class="benchmark-row-avg ' + rowClass + '">';1241html += "<td>" + configLabel + "</td>";1242html += "<td>Avg</td>";1243html += '<td class="' + avgPrClass + '">' + (avgRate * 100).toFixed(0) + "%</td>";1244if (hasTime) {1245const times = configRuns.map(r => (r.result || {}).time_seconds).filter(t => t != null);1246html += "<td>" + (times.length ? 
(times.reduce((a, b) => a + b, 0) / times.length).toFixed(1) : "—") + "</td>";1247}1248if (hasErrors) html += "<td></td>";1249html += "</tr>";1250}1251html += "</tbody></table>";12521253// Per-assertion detail for this eval1254const runsWithExpectations = {};1255for (const config of configGroups) {1256runsWithExpectations[config] = evalRuns.filter(r => r.configuration === config && r.expectations && r.expectations.length > 0);1257}1258const hasAnyExpectations = Object.values(runsWithExpectations).some(runs => runs.length > 0);1259if (hasAnyExpectations) {1260// Collect all unique assertion texts across all configs1261const allAssertions = [];1262const seen = new Set();1263for (const config of configGroups) {1264for (const run of runsWithExpectations[config]) {1265for (const exp of (run.expectations || [])) {1266if (!seen.has(exp.text)) {1267seen.add(exp.text);1268allAssertions.push(exp.text);1269}1270}1271}1272}12731274html += '<table class="benchmark-table" style="margin-top: 0.5rem;">';1275html += "<thead><tr><th>Assertion</th>";1276for (const config of configGroups) {1277const label = config.replace(/_/g, " ").replace(/\b\w/g, c => c.toUpperCase());1278html += "<th>" + escapeHtml(label) + "</th>";1279}1280html += "</tr></thead><tbody>";12811282for (const assertionText of allAssertions) {1283html += "<tr><td>" + escapeHtml(assertionText) + "</td>";12841285for (const config of configGroups) {1286html += "<td>";1287for (const run of runsWithExpectations[config]) {1288const exp = (run.expectations || []).find(e => e.text === assertionText);1289if (exp) {1290const cls = exp.passed ? "benchmark-delta-positive" : "benchmark-delta-negative";1291const icon = exp.passed ? 
"\u2713" : "\u2717";1292html += '<span class="' + cls + '" title="Run ' + run.run_number + ': ' + escapeHtml(exp.evidence || "") + '">' + icon + "</span> ";1293} else {1294html += "— ";1295}1296}1297html += "</td>";1298}1299html += "</tr>";1300}1301html += "</tbody></table>";1302}1303}1304}13051306// Notes1307if (notes.length > 0) {1308html += '<div class="benchmark-notes">';1309html += "<h3>Analysis Notes</h3>";1310html += "<ul>";1311for (const note of notes) {1312html += "<li>" + escapeHtml(note) + "</li>";1313}1314html += "</ul></div>";1315}13161317container.innerHTML = html;1318}13191320// ---- Start ----1321init();1322renderBenchmark();1323</script>1324</body>1325</html>1326