From aee21862460ac9b957dca855d9dfcf2693316069 Mon Sep 17 00:00:00 2001 From: devbox2-codex Date: Fri, 10 Apr 2026 22:52:14 +0000 Subject: [PATCH 1/6] feat(studio): per-run comparison with retroactive labelling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a per-run mode to the Studio Compare tab so users can select 2+ individual runs and see them side-by-side, independent of the existing (experiment, target) aggregation. Runs can be retroactively labelled via a sidecar label.json written next to index.jsonl; the label replaces the timestamp in compare column headers. Backend: - `apps/cli/src/commands/results/run-label.ts` — sidecar read/write/delete helpers (label.json next to manifest, 120-char cap, JSON schema). - `serve.ts` — /api/compare now returns a `runs[]` array with per-run entries (one per workspace), and enriches /api/runs with any label. - New endpoints: `PUT/DELETE /api/runs/:filename/label` and the benchmark-scoped variants. Remote runs are read-only. Frontend: - `CompareTab.tsx` completely reworked with an "Editorial Data Terminal" aesthetic — Fraunces serif display, JetBrains Mono tabular numerals, warm off-black canvas, antique gold accents. Scoped via inline styles under `[data-compare-root]` so it does not bleed into other surfaces. - Two modes: Aggregated (default, existing matrix re-skinned) and Per run (checkbox-selectable runs table + sticky Compare N bar + inline label editor). Compare view renders one column per selected run with label-or-timestamp headers and reuses the existing test breakdown. - API hooks `saveRunLabelApi` / `deleteRunLabelApi` invalidate compare and runs caches on mutation. 
Closes #1037 --- apps/cli/src/commands/results/run-label.ts | 79 + apps/cli/src/commands/results/serve.ts | 144 +- apps/studio/src/components/CompareTab.tsx | 2170 ++++++++++++++++++-- apps/studio/src/lib/api.ts | 40 + apps/studio/src/lib/types.ts | 31 + docs/plans/1037-per-run-compare.md | 103 + 6 files changed, 2430 insertions(+), 137 deletions(-) create mode 100644 apps/cli/src/commands/results/run-label.ts create mode 100644 docs/plans/1037-per-run-compare.md diff --git a/apps/cli/src/commands/results/run-label.ts b/apps/cli/src/commands/results/run-label.ts new file mode 100644 index 000000000..27b9116fa --- /dev/null +++ b/apps/cli/src/commands/results/run-label.ts @@ -0,0 +1,79 @@ +/** + * Per-run label sidecar file helpers. + * + * Labels are stored as a `label.json` sidecar next to the run's `index.jsonl` + * manifest. The sidecar is optional, mutable, and non-breaking — absence means + * the run has no label. + * + * Wire format (stored on disk): + * ```json + * { "label": "baseline", "updated_at": "2026-04-10T00:00:00.000Z" } + * ``` + * + * Used by the Studio compare API so users can retroactively tag runs without + * changing the eval YAML or the run manifest itself. + * + * To extend with more metadata (e.g. tags, notes): add fields to + * `RunLabelFile` and update `readRunLabel`/`writeRunLabel` accordingly. Keep + * the schema additive so older files still parse. + */ + +import { existsSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs'; +import path from 'node:path'; + +export const RUN_LABEL_FILENAME = 'label.json'; + +export interface RunLabelFile { + /** Human-readable label replacing the run timestamp in compare views. */ + label: string; + /** ISO-8601 timestamp of last update. */ + updated_at: string; +} + +/** Resolve the label sidecar path given a run manifest (index.jsonl) path. 
*/
+export function runLabelPath(manifestPath: string): string {
+  return path.join(path.dirname(manifestPath), RUN_LABEL_FILENAME);
+}
+
+/** Read the label for a run. Returns `undefined` if missing, unreadable, or blank. */
+export function readRunLabel(manifestPath: string): RunLabelFile | undefined {
+  const fp = runLabelPath(manifestPath);
+  if (!existsSync(fp)) return undefined;
+  try {
+    const parsed = JSON.parse(readFileSync(fp, 'utf8')) as unknown;
+    if (!parsed || typeof parsed !== 'object') return undefined;
+    const record = parsed as Record<string, unknown>; // fields are narrowed one by one below
+    if (typeof record.label !== 'string' || record.label.trim() === '') return undefined;
+    return {
+      label: record.label.trim(), // same normalisation writeRunLabel applies before storing
+      updated_at: typeof record.updated_at === 'string' ? record.updated_at : '',
+    };
+  } catch {
+    return undefined;
+  }
+}
+
+/** Write a label for a run. Overwrites any existing label. Throws if blank or over 120 chars. */
+export function writeRunLabel(manifestPath: string, label: string): RunLabelFile {
+  const trimmed = label.trim();
+  if (trimmed === '') {
+    throw new Error('Label cannot be empty');
+  }
+  if (trimmed.length > 120) {
+    throw new Error('Label must be at most 120 characters');
+  }
+  const entry: RunLabelFile = {
+    label: trimmed,
+    updated_at: new Date().toISOString(),
+  };
+  writeFileSync(runLabelPath(manifestPath), `${JSON.stringify(entry, null, 2)}\n`, 'utf8');
+  return entry;
+}
+
+/** Remove a run's label sidecar. No-op if the file does not exist.
*/ +export function deleteRunLabel(manifestPath: string): void { + const fp = runLabelPath(manifestPath); + if (existsSync(fp)) { + unlinkSync(fp); + } +} diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 9b4c67161..bcfa32e63 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -59,6 +59,7 @@ import { listMergedResultFiles, syncRemoteResults, } from './remote.js'; +import { deleteRunLabel, readRunLabel, writeRunLabel } from './run-label.js'; import { type StudioConfig, loadStudioConfig, saveStudioConfig } from './studio-config.js'; // ── Source resolution ──────────────────────────────────────────────────── @@ -273,6 +274,7 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) { } catch { // ignore enrichment errors } + const labelEntry = readRunLabel(m.path); return { filename: m.filename, display_name: m.displayName, @@ -285,6 +287,7 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) { source: m.source, ...(target && { target }), ...(experiment && { experiment }), + ...(labelEntry && { label: labelEntry.label }), }; }), }); @@ -551,7 +554,7 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { const { runs: metas } = await listMergedResultFiles(searchDir); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); - // Collect per-test-case results keyed by experiment × target + // Collect per-test-case results keyed by experiment × target (aggregated view) const cellMap = new Map< string, { @@ -569,17 +572,54 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { } >(); + // Per-run entries (per-run view). Each run workspace contributes exactly + // one entry, independent of the aggregated matrix. 
+ const runEntries: Array<{ + run_id: string; + started_at: string; + experiment: string; + target: string; + label?: string; + source: 'local' | 'remote'; + eval_count: number; + passed_count: number; + pass_rate: number; + avg_score: number; + tests: Array<{ + test_id: string; + score: number; + passed: boolean; + execution_status?: string; + }>; + }> = []; + const experimentsSet = new Set(); const targetsSet = new Set(); + const MAX_TESTS_PER_CELL = 100; for (const m of metas) { try { const records = loadLightweightResults(m.path); + const runTestMap = new Map< + string, + { test_id: string; score: number; passed: boolean; execution_status?: string } + >(); + let runEvalCount = 0; + let runPassedCount = 0; + let runScoreSum = 0; + let runExperiment = 'default'; + let runTarget = 'default'; + let runStartedAt = m.timestamp; + for (const r of records) { const experiment = r.experiment ?? 'default'; const target = r.target ?? 'default'; experimentsSet.add(experiment); targetsSet.add(target); + runExperiment = experiment; + runTarget = target; + if (r.timestamp && r.timestamp < runStartedAt) runStartedAt = r.timestamp; + const key = JSON.stringify([experiment, target]); const entry = cellMap.get(key) ?? { experiment, @@ -600,14 +640,41 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { execution_status: r.executionStatus, }); cellMap.set(key, entry); + + // Per-run accumulation. Dedupe tests within the run by last-wins. 
+ runTestMap.set(r.testId, { + test_id: r.testId, + score: r.score, + passed, + execution_status: r.executionStatus, + }); + runEvalCount++; + if (passed) runPassedCount++; + runScoreSum += r.score; } + + if (runEvalCount === 0) continue; + + const runTests = [...runTestMap.values()].slice(-MAX_TESTS_PER_CELL); + const labelEntry = readRunLabel(m.path); + runEntries.push({ + run_id: m.filename, + started_at: runStartedAt, + experiment: runExperiment, + target: runTarget, + ...(labelEntry && { label: labelEntry.label }), + source: m.source, + eval_count: runEvalCount, + passed_count: runPassedCount, + pass_rate: runPassedCount / runEvalCount, + avg_score: runScoreSum / runEvalCount, + tests: runTests, + }); } catch { // skip runs that fail to load } } - const MAX_TESTS_PER_CELL = 100; - const cells = [...cellMap.values()].map((entry) => { // Deduplicate tests: keep only the latest entry per test_id (last wins by insertion order) const dedupMap = new Map(); @@ -630,10 +697,14 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { }; }); + // Per-run entries sorted by timestamp descending (newest first). + runEntries.sort((a, b) => b.started_at.localeCompare(a.started_at)); + return c.json({ experiments: [...experimentsSet].sort(), targets: [...targetsSet].sort(), cells, + runs: runEntries, }); } @@ -702,6 +773,49 @@ function handleFeedbackRead(c: C, { searchDir }: DataContext) { return c.json(readFeedback(existsSync(resultsDir) ? resultsDir : searchDir)); } +async function handleRunLabelPut(c: C, { searchDir }: DataContext) { + const filename = c.req.param('filename') ?? 
''; + const meta = await findRunById(searchDir, filename); + if (!meta) return c.json({ error: 'Run not found' }, 404); + if (meta.source === 'remote') { + return c.json({ error: 'Labels can only be set on local runs' }, 400); + } + let body: unknown; + try { + body = await c.req.json(); + } catch { + return c.json({ error: 'Invalid JSON' }, 400); + } + if (!body || typeof body !== 'object') { + return c.json({ error: 'Invalid payload' }, 400); + } + const label = (body as Record).label; + if (typeof label !== 'string') { + return c.json({ error: 'Missing label string' }, 400); + } + try { + const entry = writeRunLabel(meta.path, label); + return c.json({ label: entry.label, updated_at: entry.updated_at }); + } catch (err) { + return c.json({ error: (err as Error).message }, 400); + } +} + +async function handleRunLabelDelete(c: C, { searchDir }: DataContext) { + const filename = c.req.param('filename') ?? ''; + const meta = await findRunById(searchDir, filename); + if (!meta) return c.json({ error: 'Run not found' }, 404); + if (meta.source === 'remote') { + return c.json({ error: 'Labels can only be removed on local runs' }, 400); + } + try { + deleteRunLabel(meta.path); + return c.json({ ok: true }); + } catch (err) { + return c.json({ error: (err as Error).message }, 500); + } +} + // ── Hono app factory ───────────────────────────────────────────────────── /** @@ -934,6 +1048,18 @@ export function createApp( app.get('/api/remote/status', async (c) => c.json(await getRemoteResultsStatus(searchDir))); app.post('/api/remote/sync', async (c) => c.json(await syncRemoteResults(searchDir))); app.get('/api/runs', (c) => handleRuns(c, defaultCtx)); + app.put('/api/runs/:filename/label', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + return handleRunLabelPut(c, defaultCtx); + }); + app.delete('/api/runs/:filename/label', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 
403); + } + return handleRunLabelDelete(c, defaultCtx); + }); app.get('/api/runs/:filename', (c) => handleRunDetail(c, defaultCtx)); app.get('/api/runs/:filename/suites', (c) => handleRunSuites(c, defaultCtx)); app.get('/api/runs/:filename/categories', (c) => handleRunCategories(c, defaultCtx)); @@ -1046,6 +1172,18 @@ export function createApp( withBenchmark(c, async (ctx, dataCtx) => ctx.json(await syncRemoteResults(dataCtx.searchDir))), ); app.get('/api/benchmarks/:benchmarkId/runs', (c) => withBenchmark(c, handleRuns)); + app.put('/api/benchmarks/:benchmarkId/runs/:filename/label', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + return withBenchmark(c, handleRunLabelPut); + }); + app.delete('/api/benchmarks/:benchmarkId/runs/:filename/label', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + return withBenchmark(c, handleRunLabelDelete); + }); app.get('/api/benchmarks/:benchmarkId/runs/:filename', (c) => withBenchmark(c, handleRunDetail)); app.get('/api/benchmarks/:benchmarkId/runs/:filename/suites', (c) => withBenchmark(c, handleRunSuites), diff --git a/apps/studio/src/components/CompareTab.tsx b/apps/studio/src/components/CompareTab.tsx index a633d70ea..a2ebbd9d6 100644 --- a/apps/studio/src/components/CompareTab.tsx +++ b/apps/studio/src/components/CompareTab.tsx @@ -1,109 +1,190 @@ /** - * Cross-model comparison matrix component. + * Cross-model comparison view — "Editorial Data Terminal" aesthetic. * - * Displays a grid of experiment (columns) x target (rows) cells, - * each showing pass rate, average score, and test counts. Color-coded - * by performance: green (>80%), yellow (50-80%), red (<50%). - * Cells are expandable to show per-test-case breakdown. + * Two modes: + * 1. Aggregated (default) — `(experiment, target)` matrix, one cell per pair. + * 2. 
Per run — individual runs are first-class; users select + * 2+ runs to render a side-by-side comparison, + * and may attach a retroactive label to any run. * - * Used in both unscoped and project-scoped views. + * The aesthetic is intentional: warm off-black background, antique gold rule + * marks, serif display typography (Fraunces) paired with data-monospace + * (JetBrains Mono). Styling is scoped to `[data-compare-root]` via an inline + * ; +} + +const STYLES = ` +[data-compare-root] { + --cmp-bg: #0b0907; + --cmp-bg-elev: #13110c; + --cmp-bg-elev-2: #1a1712; + --cmp-ink: #f6efe0; + --cmp-ink-dim: #a89f89; + --cmp-ink-faint: #6b6350; + --cmp-rule: #2a2520; + --cmp-rule-strong: #3e3830; + --cmp-accent: #d4a84a; + --cmp-accent-ink: #f5d47a; + --cmp-ok: #a3e4b5; + --cmp-ok-bg: rgba(132, 220, 148, 0.08); + --cmp-ok-ring: rgba(132, 220, 148, 0.35); + --cmp-warn: #f0c674; + --cmp-warn-bg: rgba(240, 198, 116, 0.08); + --cmp-warn-ring: rgba(240, 198, 116, 0.35); + --cmp-bad: #f5a6a6; + --cmp-bad-bg: rgba(245, 166, 166, 0.07); + --cmp-bad-ring: rgba(245, 166, 166, 0.32); + + --cmp-font-display: "Fraunces", "Times New Roman", Georgia, serif; + --cmp-font-data: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, monospace; + --cmp-font-body: "Instrument Sans", ui-sans-serif, system-ui, sans-serif; + + position: relative; + padding: 2.25rem 1.5rem 4rem; + margin: -1rem -1rem 0; + background: + radial-gradient(1200px 600px at 85% -10%, rgba(212, 168, 74, 0.07), transparent 60%), + radial-gradient(900px 500px at -10% 120%, rgba(120, 96, 40, 0.05), transparent 70%), + var(--cmp-bg); + color: var(--cmp-ink); + font-family: var(--cmp-font-body); + border-top: 1px solid var(--cmp-rule); + border-bottom: 1px solid var(--cmp-rule); + overflow: hidden; +} + +[data-compare-root]::before { + content: ""; + position: absolute; + inset: 0; + pointer-events: none; + background-image: radial-gradient(rgba(246, 239, 224, 0.025) 1px, transparent 1px); + background-size: 3px 3px; + 
opacity: 0.4; + mix-blend-mode: screen; +} + +[data-compare-root] .compare-body { + position: relative; + z-index: 1; +} + +/* ── Masthead ───────────────────────────────────────────── */ + +[data-compare-root] .compare-masthead { + position: relative; + display: grid; + grid-template-columns: 1fr auto; + gap: 2rem; + align-items: end; + padding-bottom: 1.75rem; + margin-bottom: 1.75rem; + border-bottom: 1px solid var(--cmp-rule); + z-index: 1; +} + +[data-compare-root] .compare-masthead::after { + content: ""; + position: absolute; + left: 0; + right: 0; + bottom: -4px; + height: 1px; + background: var(--cmp-rule); +} + +[data-compare-root] .compare-eyebrow { + display: inline-flex; + align-items: center; + gap: 0.6rem; + font-family: var(--cmp-font-data); + font-size: 0.72rem; + font-weight: 500; + letter-spacing: 0.18em; + text-transform: uppercase; + color: var(--cmp-ink-dim); + margin-bottom: 0.75rem; +} + +[data-compare-root] .compare-rule-mark { + display: inline-block; + width: 28px; + height: 1px; + background: var(--cmp-accent); + box-shadow: 0 0 8px rgba(212, 168, 74, 0.5); +} + +[data-compare-root] .compare-title { + font-family: var(--cmp-font-display); + font-weight: 600; + font-size: clamp(2.4rem, 4.5vw, 3.6rem); + line-height: 0.95; + letter-spacing: -0.02em; + color: var(--cmp-ink); + margin: 0; + font-variation-settings: "opsz" 144; +} + +[data-compare-root] .compare-title-word { + display: inline-block; +} + +[data-compare-root] .compare-title-ornament { + display: inline-block; + margin: 0 0.4rem; + color: var(--cmp-accent); + font-style: normal; + transform: translateY(-0.1em); +} + +[data-compare-root] .compare-title-italic { + font-style: italic; + color: var(--cmp-ink-dim); + font-weight: 400; +} + +[data-compare-root] .compare-kicker { + max-width: 54ch; + margin: 0.85rem 0 0; + color: var(--cmp-ink-dim); + font-size: 0.92rem; + line-height: 1.55; +} + +/* ── Mode toggle ────────────────────────────────────────── */ + +[data-compare-root] 
.compare-masthead-right { + display: flex; + align-items: end; + justify-content: flex-end; +} + +[data-compare-root] .compare-mode-toggle { + position: relative; + display: inline-flex; + gap: 0; + padding: 0.35rem; + border: 1px solid var(--cmp-rule-strong); + border-radius: 2px; + background: var(--cmp-bg-elev); +} + +[data-compare-root] .compare-mode-btn { + position: relative; + display: inline-flex; + align-items: center; + gap: 0.55rem; + padding: 0.55rem 1rem 0.5rem; + background: transparent; + border: 0; + color: var(--cmp-ink-dim); + font-family: var(--cmp-font-body); + font-size: 0.78rem; + letter-spacing: 0.14em; + text-transform: uppercase; + cursor: pointer; + transition: color 180ms ease; + z-index: 1; +} + +[data-compare-root] .compare-mode-btn:hover:not(:disabled) { + color: var(--cmp-ink); +} + +[data-compare-root] .compare-mode-btn:disabled { + opacity: 0.4; + cursor: not-allowed; +} + +[data-compare-root] .compare-mode-btn.is-active { + color: var(--cmp-ink); +} + +[data-compare-root] .compare-mode-num { + font-family: var(--cmp-font-data); + font-size: 0.68rem; + color: var(--cmp-accent); + opacity: 0.7; +} + +[data-compare-root] .compare-mode-btn.is-active .compare-mode-num { + opacity: 1; +} + +[data-compare-root] .compare-mode-label { + font-weight: 500; +} + +[data-compare-root] .compare-mode-indicator { + position: absolute; + bottom: 0; + left: 0.35rem; + height: 2px; + width: calc(50% - 0.35rem); + background: var(--cmp-accent); + box-shadow: 0 0 14px rgba(212, 168, 74, 0.6); + transition: transform 320ms cubic-bezier(0.6, 0, 0.1, 1); +} + +[data-compare-root] .compare-mode-toggle[data-mode="per-run"] .compare-mode-indicator { + transform: translateX(100%); +} + +/* ── Section + legend ───────────────────────────────────── */ + +[data-compare-root] .compare-section { + position: relative; +} + +[data-compare-root] .compare-enter { + animation: compareEnter 520ms cubic-bezier(0.2, 0.8, 0.2, 1) both; +} + +@keyframes compareEnter { + from 
{ opacity: 0; transform: translateY(8px); } + to { opacity: 1; transform: translateY(0); } +} + +[data-compare-root] .compare-legend { + display: flex; + align-items: center; + gap: 1.4rem; + margin-bottom: 1rem; + padding: 0.55rem 0.85rem; + border-left: 2px solid var(--cmp-accent); + background: var(--cmp-bg-elev); + font-size: 0.75rem; + color: var(--cmp-ink-dim); +} + +[data-compare-root] .compare-legend-item { + display: inline-flex; + align-items: center; + gap: 0.45rem; + font-family: var(--cmp-font-data); + letter-spacing: 0.08em; +} + +[data-compare-root] .compare-legend-swatch { + width: 10px; + height: 10px; + border-radius: 1px; + border: 1px solid currentColor; + background: currentColor; + opacity: 0.85; +} + +[data-compare-root] .compare-legend-item.tone-ok { color: var(--cmp-ok); } +[data-compare-root] .compare-legend-item.tone-warn { color: var(--cmp-warn); } +[data-compare-root] .compare-legend-item.tone-bad { color: var(--cmp-bad); } +[data-compare-root] .compare-legend-item.tone-none { + color: var(--cmp-ink-faint); +} +[data-compare-root] .compare-legend-item.tone-none .compare-legend-swatch { + background: transparent; + border: 1px dashed currentColor; +} + +/* ── Runs meta strip ────────────────────────────────────── */ + +[data-compare-root] .compare-runs-meta { + display: flex; + align-items: center; + gap: 1rem; + margin-bottom: 1rem; + color: var(--cmp-ink-dim); + font-size: 0.8rem; +} + +[data-compare-root] .compare-smallcaps { + font-family: var(--cmp-font-data); + font-size: 0.72rem; + font-weight: 500; + letter-spacing: 0.18em; + text-transform: uppercase; + color: var(--cmp-ink-dim); +} + +[data-compare-root] .compare-hint { + font-size: 0.82rem; + color: var(--cmp-ink-dim); + font-style: italic; +} + +[data-compare-root] .compare-hint-faint { + font-family: var(--cmp-font-body); + font-size: 0.78rem; + color: var(--cmp-ink-faint); + font-style: italic; +} + +[data-compare-root] .compare-dot-hair { + display: inline-block; + width: 
4px; + height: 4px; + border-radius: 50%; + background: var(--cmp-ink-faint); +} + +/* ── Tables ─────────────────────────────────────────────── */ + +[data-compare-root] .compare-table-wrap { + position: relative; + border: 1px solid var(--cmp-rule); + background: + linear-gradient(var(--cmp-bg-elev), var(--cmp-bg-elev)); + box-shadow: 0 1px 0 rgba(246, 239, 224, 0.04) inset; + overflow-x: auto; +} + +[data-compare-root] .compare-table { + width: 100%; + border-collapse: collapse; + font-family: var(--cmp-font-body); + font-size: 0.88rem; +} + +[data-compare-root] .compare-table thead tr { + border-bottom: 1px solid var(--cmp-rule); + background: rgba(246, 239, 224, 0.02); +} + +[data-compare-root] .compare-table thead th { + padding: 0.95rem 0.9rem 0.8rem; + text-align: left; + font-weight: 500; + color: var(--cmp-ink-dim); + vertical-align: bottom; +} + +[data-compare-root] .compare-col-head { + text-align: center; +} + +[data-compare-root] .compare-col-head-text { + font-family: var(--cmp-font-display); + font-size: 1rem; + font-style: italic; + font-weight: 500; + color: var(--cmp-ink); + letter-spacing: 0.01em; + display: inline-block; + padding-bottom: 0.35rem; + border-bottom: 1px dotted var(--cmp-accent); +} + +[data-compare-root] .compare-table tbody tr { + border-bottom: 1px solid rgba(42, 37, 32, 0.6); + transition: background 180ms ease, transform 220ms ease; +} + +[data-compare-root] .compare-table tbody tr:last-child { + border-bottom: 0; +} + +[data-compare-root] .compare-row { + animation: rowEnter 440ms cubic-bezier(0.2, 0.8, 0.2, 1) both; +} + +@keyframes rowEnter { + from { opacity: 0; transform: translateX(-6px); } + to { opacity: 1; transform: translateX(0); } +} + +[data-compare-root] .compare-table tbody tr:hover { + background: rgba(246, 239, 224, 0.035); +} + +[data-compare-root] .compare-col-gutter { + width: 18px; + padding: 0; + position: relative; +} + +[data-compare-root] .compare-row-marker { + display: block; + width: 2px; + height: 
0; + background: var(--cmp-accent); + transform: translateX(6px); + transition: height 240ms cubic-bezier(0.2, 0.8, 0.2, 1); + position: absolute; + top: 50%; + left: 0; + box-shadow: 0 0 10px rgba(212, 168, 74, 0.5); +} + +[data-compare-root] .compare-table tbody tr:hover .compare-row-marker { + height: 60%; + transform: translate(6px, -50%); +} + +[data-compare-root] .compare-run-row.is-selected .compare-row-marker { + height: 70%; + transform: translate(6px, -50%); + background: var(--cmp-accent-ink); +} + +[data-compare-root] .compare-col-label { + padding: 1rem 1rem 1rem 0.25rem; + white-space: nowrap; +} + +[data-compare-root] .compare-target-name { + font-family: var(--cmp-font-display); + font-weight: 500; + font-size: 1.02rem; + color: var(--cmp-ink); + letter-spacing: 0.005em; +} + +[data-compare-root] .compare-col-cell { + padding: 0.55rem 0.55rem; + vertical-align: top; + min-width: 140px; +} + +/* ── Aggregated cells ───────────────────────────────────── */ + +[data-compare-root] .compare-cell-inner { + display: flex; + flex-direction: column; + gap: 0.35rem; +} + +[data-compare-root] .compare-cell-btn { + width: 100%; + padding: 0.85rem 0.9rem 0.75rem; + background: var(--cmp-bg-elev-2); + border: 1px solid var(--cmp-rule); + border-left-width: 3px; + color: var(--cmp-ink); + cursor: pointer; + text-align: center; + font-family: var(--cmp-font-data); + transition: background 180ms ease, border-color 180ms ease, transform 220ms ease; + border-radius: 1px; +} + +[data-compare-root] .compare-cell-btn:hover { + background: rgba(246, 239, 224, 0.045); + transform: translateY(-1px); +} + +[data-compare-root] .compare-cell-btn.tone-ok { border-left-color: var(--cmp-ok); } +[data-compare-root] .compare-cell-btn.tone-warn { border-left-color: var(--cmp-warn); } +[data-compare-root] .compare-cell-btn.tone-bad { border-left-color: var(--cmp-bad); } + +[data-compare-root] .compare-cell-num-row { + display: inline-flex; + align-items: baseline; + gap: 0.15rem; + 
color: var(--cmp-ink); +} + +[data-compare-root] .compare-cell-num { + font-family: var(--cmp-font-display); + font-size: 1.75rem; + font-weight: 600; + font-variant-numeric: tabular-nums; + line-height: 1; +} + +[data-compare-root] .compare-cell-btn.tone-ok .compare-cell-num { color: var(--cmp-ok); } +[data-compare-root] .compare-cell-btn.tone-warn .compare-cell-num { color: var(--cmp-warn); } +[data-compare-root] .compare-cell-btn.tone-bad .compare-cell-num { color: var(--cmp-bad); } + +[data-compare-root] .compare-cell-num-mark { + font-family: var(--cmp-font-data); + font-size: 0.72rem; + color: var(--cmp-ink-dim); + letter-spacing: 0.05em; +} + +[data-compare-root] .compare-cell-meta { + display: flex; + justify-content: center; + align-items: center; + gap: 0.5rem; + margin-top: 0.3rem; + font-size: 0.7rem; + color: var(--cmp-ink-dim); + letter-spacing: 0.06em; +} + +[data-compare-root] .compare-dot { + width: 3px; + height: 3px; + border-radius: 50%; + background: var(--cmp-ink-faint); +} + +[data-compare-root] .compare-cell-empty { + padding: 1.45rem 0.5rem; + text-align: center; + border: 1px dashed var(--cmp-rule-strong); + color: var(--cmp-ink-faint); + font-family: var(--cmp-font-data); +} + +[data-compare-root] .compare-breakdown { + margin-top: 0.2rem; + padding: 0.55rem 0.65rem; + background: var(--cmp-bg); + border: 1px solid var(--cmp-rule); + max-height: 200px; + overflow-y: auto; +} + +[data-compare-root] .compare-breakdown-head { + font-family: var(--cmp-font-data); + font-size: 0.65rem; + letter-spacing: 0.16em; + text-transform: uppercase; + color: var(--cmp-ink-faint); + padding-bottom: 0.4rem; + margin-bottom: 0.3rem; + border-bottom: 1px solid var(--cmp-rule); +} + +[data-compare-root] .compare-breakdown-list { + list-style: none; + margin: 0; + padding: 0; + display: flex; + flex-direction: column; + gap: 0.1rem; +} + +[data-compare-root] .compare-breakdown-row { + display: flex; + align-items: center; + gap: 0.55rem; + padding: 0.18rem 
0.2rem; + font-family: var(--cmp-font-data); + font-size: 0.68rem; + color: var(--cmp-ink-dim); +} + +[data-compare-root] .compare-breakdown-row.ok .compare-breakdown-glyph { color: var(--cmp-ok); } +[data-compare-root] .compare-breakdown-row.bad .compare-breakdown-glyph { color: var(--cmp-bad); } + +[data-compare-root] .compare-breakdown-glyph { + font-size: 0.75rem; + line-height: 1; +} + +[data-compare-root] .compare-breakdown-id { + flex: 1; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +[data-compare-root] .compare-breakdown-score { + font-variant-numeric: tabular-nums; + color: var(--cmp-ink-faint); +} + +/* ── Per-run table ──────────────────────────────────────── */ + +[data-compare-root] .compare-runs-table .compare-col-check { + width: 38px; + padding-left: 0.35rem; +} + +[data-compare-root] .compare-runs-table .compare-col-timestamp { + width: 220px; + padding: 0.85rem 0.9rem; +} + +[data-compare-root] .compare-timestamp-mono { + display: block; + font-family: var(--cmp-font-data); + font-size: 0.86rem; + font-weight: 500; + color: var(--cmp-ink); + font-variant-numeric: tabular-nums; +} + +[data-compare-root] .compare-runid-mono { + display: block; + margin-top: 0.22rem; + font-family: var(--cmp-font-data); + font-size: 0.68rem; + color: var(--cmp-ink-faint); + letter-spacing: 0.04em; +} + +[data-compare-root] .compare-col-label-big { + min-width: 180px; + padding: 0.85rem 0.9rem; +} + +[data-compare-root] .compare-label-cell-btn { + display: inline-block; + padding: 0.3rem 0.55rem; + background: transparent; + border: 1px dashed var(--cmp-rule-strong); + border-radius: 1px; + color: var(--cmp-ink); + cursor: pointer; + font-family: var(--cmp-font-display); + font-style: italic; + font-size: 0.98rem; + text-align: left; + transition: all 180ms ease; +} + +[data-compare-root] .compare-label-cell-btn:hover { + border-color: var(--cmp-accent); + background: rgba(212, 168, 74, 0.08); +} + +[data-compare-root] 
.compare-label-cell-btn.has-label { + border: 1px solid rgba(212, 168, 74, 0.4); + background: rgba(212, 168, 74, 0.07); + color: var(--cmp-accent-ink); +} + +[data-compare-root] .compare-label-text { + display: inline-block; + max-width: 240px; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +[data-compare-root] .compare-label-placeholder { + color: var(--cmp-ink-faint); + font-family: var(--cmp-font-data); + font-style: normal; + font-size: 0.76rem; + letter-spacing: 0.12em; + text-transform: uppercase; +} + +[data-compare-root] .compare-label-placeholder-ro { + color: var(--cmp-ink-faint); + font-family: var(--cmp-font-data); +} + +[data-compare-root] .compare-col-field { + padding: 0.85rem 0.9rem; + white-space: nowrap; +} + +[data-compare-root] .compare-field-mono { + font-family: var(--cmp-font-data); + font-size: 0.82rem; + color: var(--cmp-ink); +} + +[data-compare-root] .compare-col-num { + padding: 0.85rem 0.9rem; + text-align: right; + white-space: nowrap; +} + +[data-compare-root] .compare-num-tabular { + font-family: var(--cmp-font-data); + font-variant-numeric: tabular-nums; + font-size: 1.02rem; + font-weight: 500; + color: var(--cmp-ink); +} + +[data-compare-root] .compare-num-tone-ok { color: var(--cmp-ok); } +[data-compare-root] .compare-num-tone-warn { color: var(--cmp-warn); } +[data-compare-root] .compare-num-tone-bad { color: var(--cmp-bad); } + +[data-compare-root] .compare-num-unit { + font-family: var(--cmp-font-data); + font-size: 0.7rem; + color: var(--cmp-ink-faint); + margin-left: 0.12rem; +} + +/* Checkbox */ + +[data-compare-root] .compare-checkbox { + position: relative; + display: inline-block; + width: 18px; + height: 18px; + cursor: pointer; +} + +[data-compare-root] .compare-checkbox input { + position: absolute; + inset: 0; + opacity: 0; + cursor: pointer; +} + +[data-compare-root] .compare-checkbox-box { + position: absolute; + inset: 0; + border: 1px solid var(--cmp-rule-strong); + background: 
var(--cmp-bg); + transition: all 180ms ease; +} + +[data-compare-root] .compare-checkbox input:checked ~ .compare-checkbox-box { + background: var(--cmp-accent); + border-color: var(--cmp-accent); + box-shadow: 0 0 12px rgba(212, 168, 74, 0.6); +} + +[data-compare-root] .compare-checkbox input:checked ~ .compare-checkbox-box::after { + content: ""; + position: absolute; + left: 4px; + top: 0px; + width: 5px; + height: 10px; + border-right: 2px solid var(--cmp-bg); + border-bottom: 2px solid var(--cmp-bg); + transform: rotate(45deg); +} + +[data-compare-root] .compare-run-row { + cursor: pointer; +} + +[data-compare-root] .compare-run-row.is-selected { + background: rgba(212, 168, 74, 0.05) !important; +} + +[data-compare-root] .compare-run-row.is-selected td { + color: var(--cmp-ink); +} + +/* Label editor row */ + +[data-compare-root] .compare-label-editor-row { + background: var(--cmp-bg) !important; +} + +[data-compare-root] .compare-label-editor-row td { + padding: 0; +} + +[data-compare-root] .compare-label-editor { + margin: 0 0 0 18px; + padding: 0.85rem 1rem 1rem; + border-left: 2px solid var(--cmp-accent); + background: var(--cmp-bg-elev-2); + animation: labelEditorIn 280ms cubic-bezier(0.2, 0.8, 0.2, 1); +} + +@keyframes labelEditorIn { + from { opacity: 0; transform: translateY(-4px); } + to { opacity: 1; transform: translateY(0); } +} + +[data-compare-root] .compare-label-editor-head { + display: flex; + align-items: baseline; + gap: 0.85rem; + margin-bottom: 0.55rem; +} + +[data-compare-root] .compare-label-editor-body { + display: flex; + gap: 0.65rem; + align-items: stretch; +} + +[data-compare-root] .compare-input { + flex: 1; + min-width: 0; + padding: 0.55rem 0.75rem; + background: var(--cmp-bg); + border: 1px solid var(--cmp-rule-strong); + color: var(--cmp-ink); + font-family: var(--cmp-font-display); + font-style: italic; + font-size: 1rem; + outline: none; + transition: border-color 180ms ease, box-shadow 180ms ease; + border-radius: 1px; +} + 
+[data-compare-root] .compare-input:focus { + border-color: var(--cmp-accent); + box-shadow: 0 0 0 3px rgba(212, 168, 74, 0.15); +} + +[data-compare-root] .compare-label-editor-actions { + display: inline-flex; + gap: 0.45rem; +} + +[data-compare-root] .compare-label-editor-err { + margin-top: 0.55rem; + font-family: var(--cmp-font-data); + font-size: 0.75rem; + color: var(--cmp-bad); +} + +/* Buttons */ + +[data-compare-root] .compare-btn-primary, +[data-compare-root] .compare-btn-ghost, +[data-compare-root] .compare-btn-destructive, +[data-compare-root] .compare-btn-link { + display: inline-flex; + align-items: center; + gap: 0.5rem; + padding: 0.55rem 1.05rem; + font-family: var(--cmp-font-body); + font-size: 0.8rem; + letter-spacing: 0.1em; + text-transform: uppercase; + cursor: pointer; + border-radius: 1px; + transition: all 200ms ease; + white-space: nowrap; +} + +[data-compare-root] .compare-btn-primary { + background: var(--cmp-accent); + color: var(--cmp-bg); + border: 1px solid var(--cmp-accent); + font-weight: 600; +} + +[data-compare-root] .compare-btn-primary:hover:not(:disabled) { + background: var(--cmp-accent-ink); + border-color: var(--cmp-accent-ink); + transform: translateY(-1px); + box-shadow: 0 6px 16px rgba(212, 168, 74, 0.25); +} + +[data-compare-root] .compare-btn-primary:disabled { + opacity: 0.4; + cursor: not-allowed; +} + +[data-compare-root] .compare-btn-arrow { + font-family: var(--cmp-font-data); + font-size: 0.95rem; + line-height: 1; +} + +[data-compare-root] .compare-btn-ghost { + background: transparent; + color: var(--cmp-ink-dim); + border: 1px solid var(--cmp-rule-strong); +} + +[data-compare-root] .compare-btn-ghost:hover:not(:disabled) { + color: var(--cmp-ink); + border-color: var(--cmp-ink-dim); +} + +[data-compare-root] .compare-btn-destructive { + background: transparent; + color: var(--cmp-bad); + border: 1px solid rgba(245, 166, 166, 0.35); +} + +[data-compare-root] .compare-btn-destructive:hover:not(:disabled) { + 
background: rgba(245, 166, 166, 0.1); + border-color: var(--cmp-bad); +} + +[data-compare-root] .compare-btn-link { + background: transparent; + color: var(--cmp-accent-ink); + border: 0; + padding: 0.4rem 0; + text-transform: none; + letter-spacing: 0.01em; + font-family: var(--cmp-font-display); + font-size: 1rem; + font-style: italic; +} + +[data-compare-root] .compare-btn-link:hover { + color: var(--cmp-accent); +} + +/* Sticky action bar */ + +[data-compare-root] .compare-stickybar { + position: sticky; + bottom: 16px; + margin-top: 1.25rem; + z-index: 5; + animation: stickyIn 280ms cubic-bezier(0.2, 0.8, 0.2, 1); +} + +@keyframes stickyIn { + from { opacity: 0; transform: translateY(12px); } + to { opacity: 1; transform: translateY(0); } +} + +[data-compare-root] .compare-stickybar-inner { + display: flex; + align-items: center; + justify-content: space-between; + gap: 1rem; + padding: 0.8rem 1.1rem; + background: + linear-gradient(var(--cmp-bg-elev-2), var(--cmp-bg-elev-2)); + border: 1px solid rgba(212, 168, 74, 0.4); + box-shadow: + 0 20px 40px -12px rgba(0, 0, 0, 0.8), + 0 0 0 1px rgba(212, 168, 74, 0.12), + inset 0 1px 0 rgba(246, 239, 224, 0.03); +} + +[data-compare-root] .compare-stickybar-count { + display: inline-flex; + align-items: baseline; + gap: 0.55rem; +} + +[data-compare-root] .compare-stickybar-num { + font-family: var(--cmp-font-display); + font-weight: 600; + font-size: 2rem; + color: var(--cmp-accent); + line-height: 1; +} + +[data-compare-root] .compare-stickybar-label { + font-family: var(--cmp-font-data); + font-size: 0.75rem; + text-transform: uppercase; + letter-spacing: 0.14em; + color: var(--cmp-ink-dim); +} + +[data-compare-root] .compare-stickybar-actions { + display: inline-flex; + gap: 0.55rem; +} + +/* ── Per-run compare view ───────────────────────────────── */ + +[data-compare-root] .compare-backstrip { + display: flex; + align-items: center; + justify-content: space-between; + margin-bottom: 1rem; + padding-bottom: 0.65rem; 
+ border-bottom: 1px solid var(--cmp-rule); +} + +[data-compare-root] .compare-runs-compare-table .compare-col-testid { + position: sticky; + left: 0; + background: var(--cmp-bg-elev); + padding: 0.85rem 0.9rem; + border-right: 1px solid var(--cmp-rule); + min-width: 280px; + max-width: 360px; + z-index: 2; +} + +[data-compare-root] .compare-testid-mono { + font-family: var(--cmp-font-data); + font-size: 0.82rem; + color: var(--cmp-ink); + display: inline-block; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + max-width: 340px; +} + +[data-compare-root] .compare-col-run-head { + min-width: 180px; + padding: 1rem 0.9rem; + text-align: left; + border-left: 1px dotted var(--cmp-rule); +} + +[data-compare-root] .compare-run-head { + display: flex; + flex-direction: column; + gap: 0.35rem; +} + +[data-compare-root] .compare-run-head-top { + font-family: var(--cmp-font-display); + font-style: italic; + font-size: 1.1rem; + color: var(--cmp-ink); + line-height: 1.2; +} + +[data-compare-root] .compare-run-head-label { + color: var(--cmp-accent-ink); +} + +[data-compare-root] .compare-run-head-timestamp { + font-family: var(--cmp-font-data); + font-style: normal; + font-size: 0.9rem; + color: var(--cmp-ink); + font-variant-numeric: tabular-nums; +} + +[data-compare-root] .compare-run-head-meta { + display: inline-flex; + align-items: center; + gap: 0.5rem; + font-family: var(--cmp-font-data); + font-size: 0.72rem; + color: var(--cmp-ink-dim); +} + +[data-compare-root] .compare-run-head-subid { + font-family: var(--cmp-font-data); + font-size: 0.65rem; + color: var(--cmp-ink-faint); + letter-spacing: 0.04em; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + max-width: 180px; +} + +[data-compare-root] .compare-summary-row { + background: rgba(246, 239, 224, 0.015) !important; + border-bottom: 2px solid var(--cmp-rule-strong) !important; +} + +[data-compare-root] .compare-summary-row th { + padding: 0.55rem 0.9rem 0.7rem; + 
vertical-align: middle; +} + +[data-compare-root] .compare-summary-pill { + display: inline-flex; + align-items: baseline; + gap: 0.45rem; + padding: 0.3rem 0.6rem; + border: 1px solid var(--cmp-rule-strong); + border-left-width: 3px; + background: var(--cmp-bg); + font-family: var(--cmp-font-data); +} + +[data-compare-root] .compare-summary-pill.tone-ok { border-left-color: var(--cmp-ok); } +[data-compare-root] .compare-summary-pill.tone-warn { border-left-color: var(--cmp-warn); } +[data-compare-root] .compare-summary-pill.tone-bad { border-left-color: var(--cmp-bad); } + +[data-compare-root] .compare-summary-num { + font-weight: 600; + font-size: 0.95rem; + font-variant-numeric: tabular-nums; + color: var(--cmp-ink); +} + +[data-compare-root] .compare-summary-frac { + font-size: 0.72rem; + color: var(--cmp-ink-faint); +} + +[data-compare-root] .compare-col-run-cell { + padding: 0.5rem 0.9rem; + border-left: 1px dotted rgba(42, 37, 32, 0.5); +} + +[data-compare-root] .compare-runcell { + display: inline-flex; + align-items: center; + gap: 0.45rem; + padding: 0.28rem 0.55rem; + border-radius: 1px; + font-family: var(--cmp-font-data); +} + +[data-compare-root] .compare-runcell.ok { + background: var(--cmp-ok-bg); + color: var(--cmp-ok); +} + +[data-compare-root] .compare-runcell.bad { + background: var(--cmp-bad-bg); + color: var(--cmp-bad); +} + +[data-compare-root] .compare-runcell-glyph { + font-size: 0.68rem; + line-height: 1; +} + +[data-compare-root] .compare-runcell-score { + font-size: 0.75rem; + font-variant-numeric: tabular-nums; +} + +[data-compare-root] .compare-runcell-missing { + color: var(--cmp-ink-faint); + font-family: var(--cmp-font-data); +} + +/* ── Notices / errors / skeleton ────────────────────────── */ + +[data-compare-root] .compare-notice { + position: relative; + padding: 2rem 1.5rem 2rem 2.5rem; + border: 1px solid var(--cmp-rule); + background: var(--cmp-bg-elev); +} + +[data-compare-root] .compare-notice-rule { + position: absolute; + 
left: 1rem; + top: 2rem; + width: 30px; + height: 1px; + background: var(--cmp-accent); +} + +[data-compare-root] .compare-notice-head { + font-family: var(--cmp-font-display); + font-style: italic; + font-size: 1.5rem; + font-weight: 500; + margin: 0 0 0.4rem; + color: var(--cmp-ink); +} + +[data-compare-root] .compare-notice-body { + margin: 0; + color: var(--cmp-ink-dim); + max-width: 60ch; + line-height: 1.55; +} + +[data-compare-root] .compare-error { + padding: 1.3rem 1.5rem; + border: 1px solid rgba(245, 166, 166, 0.35); + background: rgba(245, 166, 166, 0.05); + color: var(--cmp-bad); +} + +[data-compare-root] .compare-error-eyebrow { + font-family: var(--cmp-font-data); + font-size: 0.7rem; + letter-spacing: 0.16em; + text-transform: uppercase; + color: var(--cmp-bad); + margin-bottom: 0.4rem; + opacity: 0.85; +} + +[data-compare-root] .compare-error-body { + font-family: var(--cmp-font-display); + font-style: italic; + font-size: 1.05rem; + color: var(--cmp-ink); +} + +[data-compare-root] .compare-skeleton { + display: flex; + flex-direction: column; + gap: 0.85rem; + padding: 1.5rem; + border: 1px solid var(--cmp-rule); + background: var(--cmp-bg-elev); +} + +[data-compare-root] .compare-skel-row { + display: flex; + gap: 0.85rem; + animation: skelPulse 1400ms ease-in-out infinite; +} + +@keyframes skelPulse { + 0%, 100% { opacity: 0.35; } + 50% { opacity: 0.75; } +} + +[data-compare-root] .compare-skel-bar { + height: 22px; + flex: 1; + background: linear-gradient( + 90deg, + var(--cmp-rule) 0%, + var(--cmp-rule-strong) 50%, + var(--cmp-rule) 100% + ); +} + +[data-compare-root] .compare-skel-bar-sm { + flex: 0 0 120px; +} + +/* Responsive tweaks */ + +@media (max-width: 820px) { + [data-compare-root] .compare-masthead { + grid-template-columns: 1fr; + align-items: start; + } + [data-compare-root] .compare-masthead-right { + justify-content: flex-start; + } + [data-compare-root] .compare-title { + font-size: 2.2rem; + } +} +`; diff --git 
a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index 75721d398..6e4b4e3dd 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -27,6 +27,7 @@ import type { RemoteStatusResponse, RunDetailResponse, RunEvalRequest, + RunLabelResponse, RunListResponse, StudioConfigResponse, SuitesResponse, @@ -437,6 +438,45 @@ export async function syncRemoteResultsApi(benchmarkId?: string): Promise; } +// ── Run label mutations ────────────────────────────────────────────────── + +/** + * Save (create or update) a label for a run. Labels are stored as a sidecar + * `label.json` file next to the run's manifest and replace the formatted + * timestamp in compare view column headers. + */ +export async function saveRunLabelApi( + runId: string, + label: string, + benchmarkId?: string, +): Promise { + const url = benchmarkId + ? `${benchmarkApiBase(benchmarkId)}/runs/${encodeURIComponent(runId)}/label` + : `/api/runs/${encodeURIComponent(runId)}/label`; + const res = await fetch(url, { + method: 'PUT', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ label }), + }); + if (!res.ok) { + const err = await res.json().catch(() => ({ error: res.statusText })); + throw new Error((err as { error?: string }).error ?? `Failed to save label: ${res.status}`); + } + return res.json() as Promise; +} + +/** Remove the label sidecar for a run. */ +export async function deleteRunLabelApi(runId: string, benchmarkId?: string): Promise { + const url = benchmarkId + ? `${benchmarkApiBase(benchmarkId)}/runs/${encodeURIComponent(runId)}/label` + : `/api/runs/${encodeURIComponent(runId)}/label`; + const res = await fetch(url, { method: 'DELETE' }); + if (!res.ok) { + const err = await res.json().catch(() => ({ error: res.statusText })); + throw new Error((err as { error?: string }).error ?? 
`Failed to delete label: ${res.status}`); + } +} + export async function saveStudioConfig( config: Partial, ): Promise { diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index a395a8072..2114f9e68 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -19,6 +19,8 @@ export interface RunMeta { source: 'local' | 'remote'; project_id?: string; project_name?: string; + /** Optional user-assigned label from the run's sidecar label.json. */ + label?: string; } export interface RunListResponse { @@ -148,10 +150,39 @@ export interface CompareCell { tests: CompareTestResult[]; } +/** + * A single evaluation run surfaced in the per-run compare view. + * + * Each run workspace contributes exactly one entry, independent of the + * aggregated `(experiment, target)` cells. Users select multiple runs to + * compare them side-by-side, regardless of whether the runs share an + * experiment or target. + */ +export interface CompareRunEntry { + run_id: string; + started_at: string; + experiment: string; + target: string; + label?: string; + source: 'local' | 'remote'; + eval_count: number; + passed_count: number; + pass_rate: number; + avg_score: number; + tests: CompareTestResult[]; +} + export interface CompareResponse { experiments: string[]; targets: string[]; cells: CompareCell[]; + /** Per-run entries, sorted newest first. */ + runs?: CompareRunEntry[]; +} + +export interface RunLabelResponse { + label: string; + updated_at: string; } export interface TargetSummary { diff --git a/docs/plans/1037-per-run-compare.md b/docs/plans/1037-per-run-compare.md new file mode 100644 index 000000000..cf6c9d18c --- /dev/null +++ b/docs/plans/1037-per-run-compare.md @@ -0,0 +1,103 @@ +# Per-run comparison with retroactive labelling — Issue #1037 + +## Goal +Let Studio users compare individual runs (by timestamp / run id) side-by-side, +independent of the current `(experiment, target)` aggregation. 
Optional labels +replace timestamps in compare headers. + +## Data model + +### Sidecar label file +- Path: `label.json` in the run directory, next to `index.jsonl` +- Content: `{ "label": string, "updated_at": string }` +- Mutable, non-breaking, trivially reversible. Absent file = no label. + +### Wire format extension (non-breaking) +Extend `CompareResponse`: +```ts +interface CompareRunEntry { + run_id: string; // existing run id (experiment::timestamp or timestamp) + started_at: string; // first record timestamp (fallback: manifest meta) + experiment: string; + target: string; + label?: string; + eval_count: number; + passed_count: number; + pass_rate: number; + avg_score: number; + tests: CompareTestResult[]; +} + +interface CompareResponse { + experiments: string[]; + targets: string[]; + cells: CompareCell[]; // unchanged + runs: CompareRunEntry[]; // NEW +} +``` + +Extend `RunMeta` with optional `label?: string`. + +## Backend changes — apps/cli/src/commands/results/serve.ts + +1. `handleCompare` — after building cells, also build per-run entries. + Each run file → compute eval_count, passed_count, pass_rate, avg_score, + tests (cap 100). Read sidecar label. +2. `handleRuns` — already enriches RunMeta; add label sidecar lookup. +3. New `handleRunLabel` (PUT/POST) — writes `label.json`. Unscoped and + benchmark-scoped variants. +4. New `handleRunLabelDelete` (DELETE) — removes `label.json`. + +## Frontend changes + +### Types — apps/studio/src/lib/types.ts +Extend `CompareResponse`, add `CompareRunEntry`, add `label?` to `RunMeta`. 
+ +### API hooks — apps/studio/src/lib/api.ts +- `saveRunLabelApi(runId, label, benchmarkId?)` — PUT mutation +- `deleteRunLabelApi(runId, benchmarkId?)` — DELETE mutation +- Invalidate `['compare']`, `['runs']`, `['benchmarks', id, 'compare']` + +### CompareTab redesign — apps/studio/src/components/CompareTab.tsx + +**Aesthetic direction: Editorial data-terminal** +- Display font: Fraunces (variable serif with optical sizing) — for headings +- Data font: JetBrains Mono Variable — tabular numbers, run ids, deltas +- Body: avoid Inter. Use system-ui sparingly for secondary text, or DM Sans +- Palette: off-black (#0a0a0b) base, warm ivory (#f4ecd8) text, signal + accents (emerald #10b981, amber #f59e0b, rose #f43f5e). Hairline dividers + in warm gray (#2a2622). +- Layout: Asymmetric header (big serif title + mode toggle right-aligned). + Sharp hairline rules. Tabular number columns. Generous vertical rhythm. +- Motion: Staggered fade-in on mount (CSS `@keyframes` with animation-delay). + Hover adds a subtle shadow and translate on selectable rows. Mode toggle + slides an underline indicator. + +**Modes:** +1. **Aggregated** (default) — existing matrix, re-skinned with the new + aesthetic. Unchanged logic. +2. **Per run** — runs table sorted by timestamp desc with: + - Selectable checkbox (multi-select) + - Columns: `timestamp | label | experiment | target | tests | pass | avg` + - Inline "Edit label" button → popover/inline input + - Sticky footer: "Compare N selected" button (enabled when N ≥ 2) + - Opening compare view renders a side-by-side table: one column per run, + using label or formatted timestamp. Reuses `CompareMatrixCell` rendering + logic for per-test breakdown. + +## Validation plan + +1. Unit-ish: typecheck, lint, build. +2. Backend e2e: `bun apps/cli/src/cli.ts results serve --port 9100` on a + benchmark with ≥2 runs of the same (experiment, target). Hit + `/api/compare` and verify `runs[]` present. 
Hit `PUT /api/runs/:id/label` + and verify sidecar file is written. +3. Frontend visual: agent-browser with `--cdp 9222` on http://localhost:5173 + (or wherever studio dev runs). Screenshot aggregated mode, per-run mode, + label edit, compare view. Iterate on design until polished. + +## Out of scope +- Eval YAML schema changes +- CLI flags +- Multi-label / tag taxonomy +- Cross-project run compare From c993a2080b793b4f588f56405c7bb54b23d42fed Mon Sep 17 00:00:00 2001 From: devbox2-codex Date: Fri, 10 Apr 2026 23:04:53 +0000 Subject: [PATCH 2/6] fix(studio): address review findings from PR #1040 - CompareTab AggregatedView: hoist useMemo above the early return so adding a second experiment/target after the initial render does not violate the Rules of Hooks. - Pass `benchmarkId` and `readOnly` through to CompareTab from both routes (single-project and benchmark-scoped). Previously label mutations in the benchmark view routed to the unscoped endpoint and either 404'd or wrote the sidecar into the wrong run directory. - LabelEditor: short-circuit Save/Clear onClick handlers on `busy` to avoid a save-then-clear race where both mutations could be in flight simultaneously. - writeRunLabel: reject control characters in labels so they cannot break compare column headers or confuse test assertions. 
--- apps/cli/src/commands/results/run-label.ts | 8 +++++ apps/studio/src/components/CompareTab.tsx | 29 ++++++++++++------- apps/studio/src/routes/index.tsx | 14 +++++++-- .../src/routes/projects/$benchmarkId.tsx | 23 +++++++++++++-- 4 files changed, 58 insertions(+), 16 deletions(-) diff --git a/apps/cli/src/commands/results/run-label.ts b/apps/cli/src/commands/results/run-label.ts index 27b9116fa..71b7390ce 100644 --- a/apps/cli/src/commands/results/run-label.ts +++ b/apps/cli/src/commands/results/run-label.ts @@ -62,6 +62,14 @@ export function writeRunLabel(manifestPath: string, label: string): RunLabelFile if (trimmed.length > 120) { throw new Error('Label must be at most 120 characters'); } + // Reject control characters (newlines, tabs, DEL, etc.) — they break + // column headers in compare views and confuse test assertions. + for (let i = 0; i < trimmed.length; i++) { + const code = trimmed.charCodeAt(i); + if (code < 0x20 || code === 0x7f) { + throw new Error('Label must not contain control characters'); + } + } const entry: RunLabelFile = { label: trimmed, updated_at: new Date().toISOString(), diff --git a/apps/studio/src/components/CompareTab.tsx b/apps/studio/src/components/CompareTab.tsx index a2ebbd9d6..20709571c 100644 --- a/apps/studio/src/components/CompareTab.tsx +++ b/apps/studio/src/components/CompareTab.tsx @@ -145,6 +145,17 @@ function Masthead({ function AggregatedView({ data }: { data: CompareResponse }) { const { experiments, targets, cells } = data; + // Hooks must run on every render regardless of the early-return below, + // so this memo is declared before any conditional return. When you add a + // new hook-using sub-path here, keep it above the guard. 
+ const cellMap = useMemo(() => { + const map = new Map(); + for (const cell of cells) { + map.set(`${cell.experiment}::${cell.target}`, cell); + } + return map; + }, [cells]); + if (experiments.length <= 1 && targets.length <= 1) { return ( { - const map = new Map(); - for (const cell of cells) { - map.set(`${cell.experiment}::${cell.target}`, cell); - } - return map; - }, [cells]); - return (
@@ -615,7 +618,10 @@ function LabelEditor({ diff --git a/apps/studio/src/routes/index.tsx b/apps/studio/src/routes/index.tsx index 4dd892ee9..136c1123a 100644 --- a/apps/studio/src/routes/index.tsx +++ b/apps/studio/src/routes/index.tsx @@ -274,7 +274,7 @@ function SingleProjectHome() { /> )} {activeTab === 'experiments' && } - {activeTab === 'compare' && } + {activeTab === 'compare' && } {activeTab === 'targets' && } {!isReadOnly && setShowRunEval(false)} />} @@ -282,9 +282,17 @@ function SingleProjectHome() { ); } -function CompareTabContent() { +function CompareTabContent({ readOnly }: { readOnly: boolean }) { const { data, isLoading, isError, error } = useCompare(); - return ; + return ( + + ); } function RunsTabContent({ diff --git a/apps/studio/src/routes/projects/$benchmarkId.tsx b/apps/studio/src/routes/projects/$benchmarkId.tsx index 1d0660cb4..b8834dbb2 100644 --- a/apps/studio/src/routes/projects/$benchmarkId.tsx +++ b/apps/studio/src/routes/projects/$benchmarkId.tsx @@ -91,7 +91,9 @@ function ProjectHomePage() { {activeTab === 'runs' && } {activeTab === 'experiments' && } - {activeTab === 'compare' && } + {activeTab === 'compare' && ( + + )} {activeTab === 'targets' && } {!isReadOnly && ( @@ -209,9 +211,24 @@ function ProjectExperimentsTab({ benchmarkId }: { benchmarkId: string }) { ); } -function ProjectCompareTab({ benchmarkId }: { benchmarkId: string }) { +function ProjectCompareTab({ + benchmarkId, + readOnly, +}: { + benchmarkId: string; + readOnly: boolean; +}) { const { data, isLoading, isError, error } = useQuery(benchmarkCompareOptions(benchmarkId)); - return ; + return ( + + ); } function ProjectTargetsTab({ benchmarkId }: { benchmarkId: string }) { From 0dbdbad9b575d076fe75b48c3c6acad951731da3 Mon Sep 17 00:00:00 2001 From: devbox2-codex Date: Sat, 11 Apr 2026 03:58:16 +0000 Subject: [PATCH 3/6] docs(studio): document per-run compare + add signature screenshots - Replace the single compare screenshot with three fresh shots at 1680x1000: the 
side-by-side per-run view (hero), the aggregated matrix, and the per-run list with labels. - Expand the Studio `## Compare` section to describe both modes, when to use per-run mode, how the sticky Compare N flow works, and how retroactive labels persist as sidecar `label.json` files. - While in CompareTab.tsx: honor `prefers-reduced-motion` (disables entrance animations, row stagger, hover translations), and restore focus to the row's label trigger button when the inline label editor closes so keyboard users don't lose their place. --- apps/studio/src/components/CompareTab.tsx | 29 +++++++++++++ .../screenshots/studio-compare-aggregated.png | Bin 0 -> 187542 bytes .../screenshots/studio-compare-per-run.png | Bin 0 -> 210592 bytes .../studio-compare-side-by-side.png | Bin 0 -> 193573 bytes .../src/assets/screenshots/studio-compare.png | Bin 25510 -> 0 bytes .../src/content/docs/docs/tools/studio.mdx | 38 +++++++++++++++--- 6 files changed, 61 insertions(+), 6 deletions(-) create mode 100644 apps/web/src/assets/screenshots/studio-compare-aggregated.png create mode 100644 apps/web/src/assets/screenshots/studio-compare-per-run.png create mode 100644 apps/web/src/assets/screenshots/studio-compare-side-by-side.png delete mode 100644 apps/web/src/assets/screenshots/studio-compare.png diff --git a/apps/studio/src/components/CompareTab.tsx b/apps/studio/src/components/CompareTab.tsx index 20709571c..5257645eb 100644 --- a/apps/studio/src/components/CompareTab.tsx +++ b/apps/studio/src/components/CompareTab.tsx @@ -436,6 +436,17 @@ function PerRunRow({ const avgPct = Math.round(run.avg_score * 100); const tone = rateTone(run.pass_rate); const canEdit = !readOnly && run.source !== 'remote'; + const labelBtnRef = useRef(null); + + // Restore focus to the label trigger button once the inline editor closes, + // so keyboard users don't lose their place in the table. 
+ const wasEditing = useRef(editing); + useEffect(() => { + if (wasEditing.current && !editing) { + labelBtnRef.current?.focus(); + } + wasEditing.current = editing; + }, [editing]); return ( <> @@ -479,6 +490,7 @@ function PerRunRow({ {canEdit ? ( - ) : run.label ? ( - - {run.label} - + ) : tags.length > 0 ? ( +
+ {tags.map((t) => ( + + {t} + + ))} +
) : ( )} @@ -517,9 +537,9 @@ function PerRunRow({ {editing && ( - @@ -530,29 +550,41 @@ function PerRunRow({ ); } -function LabelEditor({ +/** + * Inline chip-based tag editor. + * + * Local state: a `string[]` staged edit of the run's tags. Chips show the + * current staged tags; an input at the end accepts new tags (commit with + * Enter or comma, delete the last chip with Backspace on an empty input). + * Save persists the whole array; Cancel / Escape discards. + * + * The backend's `writeRunTags` handles deduplication, length limits, and + * control-character rejection, so we only lightly normalize in the UI + * (trim + skip duplicates already in the staged array). + */ +function TagsEditor({ runId, - currentLabel, + currentTags, benchmarkId, onClose, }: { runId: string; - currentLabel?: string; + currentTags: string[]; benchmarkId?: string; onClose: () => void; }) { - const [value, setValue] = useState(currentLabel ?? ''); + const [tags, setTags] = useState(currentTags); + const [input, setInput] = useState(''); const [err, setErr] = useState(null); const qc = useQueryClient(); const inputRef = useRef(null); useEffect(() => { inputRef.current?.focus(); - inputRef.current?.select(); }, []); const saveMut = useMutation({ - mutationFn: () => saveRunLabelApi(runId, value, benchmarkId), + mutationFn: () => saveRunTagsApi(runId, tags, benchmarkId), onSuccess: () => { qc.invalidateQueries({ queryKey: ['compare'] }); qc.invalidateQueries({ queryKey: ['runs'] }); @@ -566,7 +598,7 @@ function LabelEditor({ }); const clearMut = useMutation({ - mutationFn: () => deleteRunLabelApi(runId, benchmarkId), + mutationFn: () => deleteRunTagsApi(runId, benchmarkId), onSuccess: () => { qc.invalidateQueries({ queryKey: ['compare'] }); qc.invalidateQueries({ queryKey: ['runs'] }); @@ -580,35 +612,78 @@ function LabelEditor({ }); const busy = saveMut.isPending || clearMut.isPending; + const hasChanges = + tags.length !== currentTags.length || tags.some((t, i) => t !== currentTags[i]); 
+ + const commitInput = () => { + const trimmed = input.trim(); + if (trimmed === '') return; + if (tags.includes(trimmed)) { + setInput(''); + return; + } + setTags([...tags, trimmed]); + setInput(''); + setErr(null); + }; + + const removeTag = (tag: string) => { + setTags(tags.filter((t) => t !== tag)); + }; return (
- - Rename run - + Tag run - Replaces the timestamp in compare column headers. + Multi-valued. Enter or comma adds; Backspace removes the last chip.
-
+
+ {tags.map((t) => ( + + {t} + + + ))} { setErr(null); - setValue(e.target.value); + setInput(e.target.value); }} - maxLength={120} + maxLength={60} disabled={busy} onKeyDown={(e) => { - if (e.key === 'Enter' && value.trim() && !busy) saveMut.mutate(); - if (e.key === 'Escape') onClose(); + if (e.key === 'Enter' || e.key === ',') { + e.preventDefault(); + commitInput(); + } else if (e.key === 'Backspace' && input === '' && tags.length > 0) { + e.preventDefault(); + setTags(tags.slice(0, -1)); + } else if (e.key === 'Escape') { + onClose(); + } }} + onBlur={commitInput} /> +
+
- {currentLabel && ( + {currentTags.length > 0 && ( )} +
+``` + +Used in `CompareTab` for the Aggregated / Per-run switch. Do not use a +2px underline indicator here — that pattern is reserved for the **main +page tabs** (see Navigation below). + +### Main page tabs + +The top-level tab strip (`Recent Runs`, `Experiments`, `Compare`, +`Targets`) uses a 2px underline indicator: + +```tsx +