diff --git a/apps/cli/src/commands/results/run-tags.ts b/apps/cli/src/commands/results/run-tags.ts new file mode 100644 index 000000000..17f76a028 --- /dev/null +++ b/apps/cli/src/commands/results/run-tags.ts @@ -0,0 +1,132 @@ +/** + * Per-run tag sidecar file helpers. + * + * Tags are stored as a `tags.json` sidecar next to the run's `index.jsonl` + * manifest. The sidecar is optional, mutable, and non-breaking — absence + * means the run has no user-assigned tags. + * + * Wire format (stored on disk): + * ```json + * { "tags": ["baseline", "v2-prompt"], "updated_at": "2026-04-10T00:00:00.000Z" } + * ``` + * + * Used by the Studio compare API so users can retroactively tag runs + * without changing the eval YAML or the run manifest itself. This mirrors + * the Langfuse / W&B / GitHub `tags` pattern — a mutable multi-valued + * list of free-form labels that lives alongside the immutable run_id. + * + * Validation rules: + * - Each tag is 1–60 characters after trimming + * - No control characters (\n, \t, DEL, etc.) + * - Tags are deduplicated case-sensitively + * - A run can have at most 20 tags + * - Writing an empty array removes the sidecar file + * + * To extend (e.g. add colored labels or descriptions): add optional fields + * to `RunTagsFile` and keep the schema additive so older files still parse. + */ + +import { existsSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs'; +import path from 'node:path'; + +export const RUN_TAGS_FILENAME = 'tags.json'; + +/** Maximum number of tags per run. */ +export const MAX_TAGS_PER_RUN = 20; + +/** Maximum length of a single tag after trimming. */ +export const MAX_TAG_LENGTH = 60; + +export interface RunTagsFile { + /** Ordered, deduplicated list of user-assigned tags. */ + tags: string[]; + /** ISO-8601 timestamp of last update. */ + updated_at: string; +} + +/** Resolve the tags sidecar path given a run manifest (index.jsonl) path. 
*/ +export function runTagsPath(manifestPath: string): string { + return path.join(path.dirname(manifestPath), RUN_TAGS_FILENAME); +} + +/** Read the tags for a run. Returns `undefined` if missing or unreadable. */ +export function readRunTags(manifestPath: string): RunTagsFile | undefined { + const fp = runTagsPath(manifestPath); + if (!existsSync(fp)) return undefined; + try { + const parsed = JSON.parse(readFileSync(fp, 'utf8')) as unknown; + if (!parsed || typeof parsed !== 'object') return undefined; + const record = parsed as Record; + if (!Array.isArray(record.tags)) return undefined; + const tags = record.tags.filter( + (t): t is string => typeof t === 'string' && t.trim().length > 0, + ); + if (tags.length === 0) return undefined; + return { + tags, + updated_at: typeof record.updated_at === 'string' ? record.updated_at : '', + }; + } catch { + return undefined; + } +} + +/** + * Write tags for a run. Replaces any existing tags. Pass an empty array + * to remove the sidecar entirely. + */ +export function writeRunTags(manifestPath: string, tags: readonly string[]): RunTagsFile | null { + const cleaned = normalizeTags(tags); + if (cleaned.length === 0) { + deleteRunTags(manifestPath); + return null; + } + const entry: RunTagsFile = { + tags: cleaned, + updated_at: new Date().toISOString(), + }; + writeFileSync(runTagsPath(manifestPath), `${JSON.stringify(entry, null, 2)}\n`, 'utf8'); + return entry; +} + +/** Remove a run's tags sidecar. No-op if the file does not exist. */ +export function deleteRunTags(manifestPath: string): void { + const fp = runTagsPath(manifestPath); + if (existsSync(fp)) { + unlinkSync(fp); + } +} + +/** + * Trim, validate, and deduplicate an incoming tag array. Throws on any + * invalid entry so the caller can surface a user-friendly error. 
+ */ +function normalizeTags(tags: readonly string[]): string[] { + const seen = new Set(); + const out: string[] = []; + for (const raw of tags) { + if (typeof raw !== 'string') { + throw new Error('Tags must be strings'); + } + const trimmed = raw.trim(); + if (trimmed === '') continue; + if (trimmed.length > MAX_TAG_LENGTH) { + throw new Error(`Tag "${trimmed.slice(0, 20)}…" exceeds ${MAX_TAG_LENGTH} characters`); + } + // Reject control characters (newlines, tabs, DEL, etc.) — they break + // column headers in compare views and confuse test assertions. + for (let i = 0; i < trimmed.length; i++) { + const code = trimmed.charCodeAt(i); + if (code < 0x20 || code === 0x7f) { + throw new Error('Tag must not contain control characters'); + } + } + if (seen.has(trimmed)) continue; + seen.add(trimmed); + out.push(trimmed); + } + if (out.length > MAX_TAGS_PER_RUN) { + throw new Error(`Too many tags (max ${MAX_TAGS_PER_RUN})`); + } + return out; +} diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 9b4c67161..15ec4f4aa 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -59,6 +59,7 @@ import { listMergedResultFiles, syncRemoteResults, } from './remote.js'; +import { deleteRunTags, readRunTags, writeRunTags } from './run-tags.js'; import { type StudioConfig, loadStudioConfig, saveStudioConfig } from './studio-config.js'; // ── Source resolution ──────────────────────────────────────────────────── @@ -273,6 +274,7 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) { } catch { // ignore enrichment errors } + const tagsEntry = readRunTags(m.path); return { filename: m.filename, display_name: m.displayName, @@ -285,6 +287,7 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) { source: m.source, ...(target && { target }), ...(experiment && { experiment }), + ...(tagsEntry && { tags: tagsEntry.tags }), }; }), }); @@ -551,7 +554,7 @@ async 
function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { const { runs: metas } = await listMergedResultFiles(searchDir); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); - // Collect per-test-case results keyed by experiment × target + // Collect per-test-case results keyed by experiment × target (aggregated view) const cellMap = new Map< string, { @@ -569,17 +572,54 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { } >(); + // Per-run entries (per-run view). Each run workspace contributes exactly + // one entry, independent of the aggregated matrix. + const runEntries: Array<{ + run_id: string; + started_at: string; + experiment: string; + target: string; + tags?: string[]; + source: 'local' | 'remote'; + eval_count: number; + passed_count: number; + pass_rate: number; + avg_score: number; + tests: Array<{ + test_id: string; + score: number; + passed: boolean; + execution_status?: string; + }>; + }> = []; + const experimentsSet = new Set(); const targetsSet = new Set(); + const MAX_TESTS_PER_CELL = 100; for (const m of metas) { try { const records = loadLightweightResults(m.path); + const runTestMap = new Map< + string, + { test_id: string; score: number; passed: boolean; execution_status?: string } + >(); + let runEvalCount = 0; + let runPassedCount = 0; + let runScoreSum = 0; + let runExperiment = 'default'; + let runTarget = 'default'; + let runStartedAt = m.timestamp; + for (const r of records) { const experiment = r.experiment ?? 'default'; const target = r.target ?? 'default'; experimentsSet.add(experiment); targetsSet.add(target); + runExperiment = experiment; + runTarget = target; + if (r.timestamp && r.timestamp < runStartedAt) runStartedAt = r.timestamp; + const key = JSON.stringify([experiment, target]); const entry = cellMap.get(key) ?? 
{ experiment, @@ -600,14 +640,41 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { execution_status: r.executionStatus, }); cellMap.set(key, entry); + + // Per-run accumulation. Dedupe tests within the run by last-wins. + runTestMap.set(r.testId, { + test_id: r.testId, + score: r.score, + passed, + execution_status: r.executionStatus, + }); + runEvalCount++; + if (passed) runPassedCount++; + runScoreSum += r.score; } + + if (runEvalCount === 0) continue; + + const runTests = [...runTestMap.values()].slice(-MAX_TESTS_PER_CELL); + const tagsEntry = readRunTags(m.path); + runEntries.push({ + run_id: m.filename, + started_at: runStartedAt, + experiment: runExperiment, + target: runTarget, + ...(tagsEntry && { tags: tagsEntry.tags }), + source: m.source, + eval_count: runEvalCount, + passed_count: runPassedCount, + pass_rate: runPassedCount / runEvalCount, + avg_score: runScoreSum / runEvalCount, + tests: runTests, + }); } catch { // skip runs that fail to load } } - const MAX_TESTS_PER_CELL = 100; - const cells = [...cellMap.values()].map((entry) => { // Deduplicate tests: keep only the latest entry per test_id (last wins by insertion order) const dedupMap = new Map(); @@ -630,10 +697,14 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { }; }); + // Per-run entries sorted by timestamp descending (newest first). + runEntries.sort((a, b) => b.started_at.localeCompare(a.started_at)); + return c.json({ experiments: [...experimentsSet].sort(), targets: [...targetsSet].sort(), cells, + runs: runEntries, }); } @@ -702,6 +773,52 @@ function handleFeedbackRead(c: C, { searchDir }: DataContext) { return c.json(readFeedback(existsSync(resultsDir) ? resultsDir : searchDir)); } +async function handleRunTagsPut(c: C, { searchDir }: DataContext) { + const filename = c.req.param('filename') ?? 
''; + const meta = await findRunById(searchDir, filename); + if (!meta) return c.json({ error: 'Run not found' }, 404); + if (meta.source === 'remote') { + return c.json({ error: 'Tags can only be set on local runs' }, 400); + } + let body: unknown; + try { + body = await c.req.json(); + } catch { + return c.json({ error: 'Invalid JSON' }, 400); + } + if (!body || typeof body !== 'object') { + return c.json({ error: 'Invalid payload' }, 400); + } + const tags = (body as Record).tags; + if (!Array.isArray(tags)) { + return c.json({ error: 'Missing tags array' }, 400); + } + try { + const entry = writeRunTags(meta.path, tags as string[]); + return c.json({ + tags: entry?.tags ?? [], + updated_at: entry?.updated_at ?? new Date().toISOString(), + }); + } catch (err) { + return c.json({ error: (err as Error).message }, 400); + } +} + +async function handleRunTagsDelete(c: C, { searchDir }: DataContext) { + const filename = c.req.param('filename') ?? ''; + const meta = await findRunById(searchDir, filename); + if (!meta) return c.json({ error: 'Run not found' }, 404); + if (meta.source === 'remote') { + return c.json({ error: 'Tags can only be removed on local runs' }, 400); + } + try { + deleteRunTags(meta.path); + return c.json({ ok: true }); + } catch (err) { + return c.json({ error: (err as Error).message }, 500); + } +} + // ── Hono app factory ───────────────────────────────────────────────────── /** @@ -934,6 +1051,18 @@ export function createApp( app.get('/api/remote/status', async (c) => c.json(await getRemoteResultsStatus(searchDir))); app.post('/api/remote/sync', async (c) => c.json(await syncRemoteResults(searchDir))); app.get('/api/runs', (c) => handleRuns(c, defaultCtx)); + app.put('/api/runs/:filename/tags', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + return handleRunTagsPut(c, defaultCtx); + }); + app.delete('/api/runs/:filename/tags', (c) => { + if (readOnly) { + return c.json({ error: 'Studio 
is running in read-only mode' }, 403); + } + return handleRunTagsDelete(c, defaultCtx); + }); app.get('/api/runs/:filename', (c) => handleRunDetail(c, defaultCtx)); app.get('/api/runs/:filename/suites', (c) => handleRunSuites(c, defaultCtx)); app.get('/api/runs/:filename/categories', (c) => handleRunCategories(c, defaultCtx)); @@ -1046,6 +1175,18 @@ export function createApp( withBenchmark(c, async (ctx, dataCtx) => ctx.json(await syncRemoteResults(dataCtx.searchDir))), ); app.get('/api/benchmarks/:benchmarkId/runs', (c) => withBenchmark(c, handleRuns)); + app.put('/api/benchmarks/:benchmarkId/runs/:filename/tags', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + return withBenchmark(c, handleRunTagsPut); + }); + app.delete('/api/benchmarks/:benchmarkId/runs/:filename/tags', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + return withBenchmark(c, handleRunTagsDelete); + }); app.get('/api/benchmarks/:benchmarkId/runs/:filename', (c) => withBenchmark(c, handleRunDetail)); app.get('/api/benchmarks/:benchmarkId/runs/:filename/suites', (c) => withBenchmark(c, handleRunSuites), diff --git a/apps/studio/DESIGN.md b/apps/studio/DESIGN.md new file mode 100644 index 000000000..03d22f991 --- /dev/null +++ b/apps/studio/DESIGN.md @@ -0,0 +1,413 @@ +# AgentV Studio Design System + +> Studio is a dark, utility-driven dashboard for reviewing AI agent evaluation +> results. It favors dense tabular data, muted neutrals, and a single cyan +> accent over ornamental styling. Think "terminal inspector", not "marketing +> page". When in doubt, copy the pattern from `ExperimentsTab`, `TargetsTab`, +> `RunList`, or `PassRatePill` — they are canonical examples of the style. + +## 1. Visual Theme & Atmosphere + +AgentV Studio is a local evaluation dashboard for AI agent developers. 
The +design language is dense, dark, and data-first — this is a tool engineers +keep open in a second monitor while they iterate on prompts, not a page +they share on social. The canvas is near-black (`bg-gray-950`), elevated +surfaces sit one step up (`bg-gray-900`), and every interactive accent +pulls the eye toward the same cyan signal color (`cyan-400`). + +Typography stays out of the way on purpose. A single system sans-serif +stack (`ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont`) +handles every piece of text. There is no brand display font, no serif, +no variable font. Numeric columns use `tabular-nums` so pass rates, +scores, and timestamps line up cleanly, and most table text sits at +`text-sm` (14px) with `font-medium` (500) reserved for row headers and +links. + +Motion is almost absent. Rows fade in via Tailwind's built-in +`transition-colors`, the main tabs slide a 2px cyan underline indicator, +and that's it. There are no staggered entrance animations, no serif +display headings, no elevated box-shadows. Honor `prefers-reduced-motion` +if you add any animation. + +**Key characteristics:** +- Dark canvas (`bg-gray-950`), elevated surfaces at `bg-gray-900/50` or `bg-gray-900` +- Single system sans-serif stack — no webfonts, no Google Fonts +- Cyan-400 is the ONE accent for interactive elements and links +- Emerald/yellow/red tones for pass/warn/fail, used sparingly and only for data +- Blue gradient reserved for `PassRatePill` (the one exception to cyan monopoly) +- Rounded corners: consistently `rounded-lg` (8px) for containers, `rounded-md` (6px) for inputs/buttons, `rounded-full` for pills +- Hairline borders (`border-gray-800`), never shadows + +## 2. 
Color Palette & Roles + +### Surfaces + +| Token | Hex (Tailwind) | Role | +|---|---|---| +| `bg-gray-950` | `#030712` | App canvas / body background | +| `bg-gray-900` | `#111827` | Elevated container background | +| `bg-gray-900/50` | `#111827` @ 50% | Table header row, subtle fills | +| `bg-gray-900/30` | `#111827` @ 30% | Row hover state | +| `bg-gray-800` | `#1f2937` | Secondary button fill, progress track, skeleton bars | +| `bg-gray-800/50` | `#1f2937` @ 50% | Divider, disabled fill | + +### Borders + +| Token | Role | +|---|---| +| `border-gray-800` | Default container borders (every `rounded-lg` card + table wrap) | +| `border-gray-800/50` | `divide-y` row separators inside tables | +| `border-gray-700` | Form input borders | +| `border-cyan-900/60` | Label/tag chip borders — the only cyan-tinted border | +| `border-red-900/60` | Error / destructive action borders | + +### Text + +| Token | Role | +|---|---| +| `text-gray-100` | Default body text on `bg-gray-950` | +| `text-white` | Section headings (`h2 text-xl font-semibold`) | +| `text-gray-200` | Row primary values (target name, timestamp) | +| `text-gray-300` | Secondary values inside cells | +| `text-gray-400` | Table header labels, section subtitles, muted links | +| `text-gray-500` | Metadata, timestamps, "N runs" counts | +| `text-gray-600` | Placeholders, empty-state em-dashes | + +### Accent (single source of truth: cyan) + +| Token | Role | +|---|---| +| `text-cyan-400` | Active tab, links, primary-action emphasis | +| `text-cyan-300` | Link/tag hover, label chip text | +| `text-cyan-500` | Accent on focused checkbox/select | +| `bg-cyan-500` | Primary action button fill (e.g. "Compare N", "Save") | +| `bg-cyan-400` | Primary button hover | +| `bg-cyan-950/30` | Tag chip fill, selected-row tint | +| `bg-cyan-950/20` | Selected-row tint on per-run list | +| `ring-cyan-500` | Focus ring on inputs and buttons | + +**Rule:** do not introduce a second accent. 
Green, amber, and red are +reserved for **data tones** (pass/warn/fail), not for interactive UI. + +### Data tones + +| Token | Role | +|---|---| +| `text-emerald-400` / `bg-emerald-400` | Pass (≥80%), success dots, "passed" count numerator | +| `text-yellow-400` / `bg-yellow-400` | Warn (50–80%) | +| `text-amber-400` | Run source badge for `remote` runs | +| `text-red-400` / `bg-red-400` | Fail (<50%), error text, destructive button | +| `bg-red-950/30` + `border-red-900/60` | Error banners and destructive button hover | + +### The blue-gradient exception + +`PassRatePill` is the one place blue (not cyan) appears — a fixed +`bg-gradient-to-r from-blue-400 to-blue-600` fill on a `bg-gray-800` +rounded-full track. This is the recognizable "Studio pill" that ties +the Runs, Experiments, Targets, and Compare tabs together. Reuse it +verbatim (``) — do not recreate it with +cyan or with a different gradient. + +## 3. Typography Rules + +### Font stack + +One stack, applied globally in `src/styles/globals.css`: + +```css +font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, + "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; +``` + +No webfonts. No display font. No second family for code or data — use +`tabular-nums` for numeric alignment, not a monospace font. + +### Hierarchy + +| Role | Class | Notes | +|---|---|---| +| Page title | `text-2xl font-semibold text-white` | Top-level section (e.g. "Evaluation Runs") | +| Section title | `text-xl font-semibold text-white` | Tab headings (e.g. 
"Compare runs") | +| Sub-section | `text-lg font-medium text-gray-300` | Inside-card headers | +| Table header | `font-medium text-gray-400 px-4 py-3` | Column labels; NOT uppercase by default | +| Table header (micro) | `text-xs uppercase tracking-wider text-gray-500` | Only for eyebrows / sub-headers | +| Row primary | `font-medium text-gray-200` | Main row identifier (target, timestamp) | +| Row numeric | `tabular-nums text-gray-400` | Every number in every cell | +| Body text | `text-sm text-gray-300` | Default inside cards | +| Body muted | `text-sm text-gray-400` | Subtitles, explanatory text | +| Caption | `text-xs text-gray-500` | Metadata, run ids, hint text | +| Link | `text-cyan-400 hover:text-cyan-300 hover:underline` | Internal navigation | + +### Principles + +- **`text-sm` is the default.** Most table text, most body text, most + buttons. `text-base` (16px) is reserved for empty-state headlines. +- **`font-medium` (500), not bold.** 600 for section titles, never 700. +- **`tabular-nums` on every number.** Pass rates, scores, test counts, + timestamps, avg values. The columns must line up. +- **No uppercase headers by default.** Only use `uppercase tracking-wider` + for tiny eyebrow labels (`text-xs uppercase tracking-wider text-gray-500`). +- **No custom line-heights.** Tailwind's defaults work. + +## 4. Component Stylings + +### Containers + +```tsx +
+ {/* … */} +
+``` + +Every meaningful grouping goes in a `rounded-lg` bordered container. +No drop shadows, no inner glows. The border itself IS the elevation. + +### Tables + +Canonical pattern (from `ExperimentsTab.tsx` — copy this verbatim): + +```tsx +
+ + + + + + + + + + + + + +
ColumnNumber
+
+``` + +- **Padding:** `px-4 py-3` for every cell (both header and body). +- **Rows:** `divide-y divide-gray-800/50` + `hover:bg-gray-900/30`. +- **Right-align numbers:** `text-right tabular-nums`. + +### Buttons + +| Variant | Classes | Use | +|---|---|---| +| Primary | `rounded-md bg-cyan-500 px-3 py-1.5 text-sm font-medium text-gray-950 transition-colors hover:bg-cyan-400 disabled:cursor-not-allowed disabled:bg-gray-700 disabled:text-gray-500` | "Save", "Compare N", main submit actions | +| Ghost | `rounded-md px-3 py-1.5 text-sm text-gray-400 transition-colors hover:text-gray-200` | Cancel, inline Clear, low-stakes secondary | +| Destructive | `rounded-md border border-red-900/60 px-3 py-1.5 text-sm text-red-400 transition-colors hover:border-red-800 hover:bg-red-950/30 hover:text-red-300` | Clear all, delete, destructive | +| Emerald (rare) | `rounded-md bg-emerald-600 px-3 py-2 text-sm font-medium text-white hover:bg-emerald-500` | Reserved for "Run Eval" only — NOT a general accent | + +Primary buttons use `text-gray-950` (not white) on cyan-500 because +cyan-500 is a bright background and dark foreground contrasts better. + +### Inputs + +```tsx + +``` + +- `bg-gray-950` (inset, darker than surrounding `bg-gray-900`) +- Focus uses `ring-1 ring-cyan-500` + matching `border-cyan-500` — always both together. +- Disabled is `opacity-50`, not a different fill. + +### Pill chips (tags, labels, status badges) + +```tsx + + {tag} + +``` + +- `rounded-md` (6px), not `rounded-full` +- Always three tokens together: `border-cyan-900/60 bg-cyan-950/30 text-cyan-300` +- `text-xs` (12px), `font-medium` + +For status badges (local/remote source pill, pass rate chip), swap the +cyan trio for emerald (`emerald-900/60` + `emerald-950/30` + `emerald-300`) +or amber (`amber-900/60` + `amber-950/30` + `amber-300`). 
+ +### `PassRatePill` (always reuse the component) + +```tsx +import { PassRatePill } from './PassRatePill'; + +``` + +Never recreate this inline — import the shared component. Width is +fixed at `w-20`, height at `h-5`, fill is the only Studio element that +uses the blue gradient. + +### Mode toggle / segmented control + +```tsx +
+ +
+``` + +Used in `CompareTab` for the Aggregated / Per-run switch. Do not use a +2px underline indicator here — that pattern is reserved for the **main +page tabs** (see Navigation below). + +### Main page tabs + +The top-level tab strip (`Recent Runs`, `Experiments`, `Compare`, +`Targets`) uses a 2px underline indicator: + +```tsx + + ); +} + +// ── Aggregated (matrix) view ──────────────────────────────────────────── + +function AggregatedView({ data }: { data: CompareResponse }) { const { experiments, targets, cells } = data; - // If there is only one experiment and one target, the matrix is trivial + // Hooks must run on every render regardless of the early-return below, + // so this memo is declared before any conditional return. When you add a + // new hook-using sub-path here, keep it above the guard. + const cellMap = useMemo(() => { + const map = new Map(); + for (const cell of cells) { + map.set(`${cell.experiment}::${cell.target}`, cell); + } + return map; + }, [cells]); + if (experiments.length <= 1 && targets.length <= 1) { return ( -
-

Not enough variation to compare

-

- The comparison matrix requires at least 2 experiments or 2 targets. Currently there{' '} - {experiments.length === 1 ? 'is 1 experiment' : `are ${experiments.length} experiments`}{' '} - and {targets.length === 1 ? '1 target' : `${targets.length} targets`}. -

-
+ ); } - // Build a lookup map for cells - const cellMap = new Map(); - for (const cell of cells) { - cellMap.set(JSON.stringify([cell.experiment, cell.target]), cell); - } - return ( -
-
- - - 80%+ - - - - 50–80% - - - - <50% - - - - No data - -
- +
+
- + {experiments.map((exp) => ( - ))} @@ -102,7 +214,7 @@ export function CompareTab({ data, isLoading, isError, error }: CompareTabProps) {targets.map((target) => ( - {experiments.map((exp) => { - const cell = cellMap.get(JSON.stringify([exp, target])); + const cell = cellMap.get(`${exp}::${target}`); return ( - ); })} @@ -146,65 +252,663 @@ function CompareRow({ ); } -function passRateRingClass(rate: number): string { - if (rate >= 0.8) return 'ring-emerald-500/60'; - if (rate >= 0.5) return 'ring-amber-500/60'; - return 'ring-red-500/60'; -} - -function passRateTextClass(rate: number): string { - if (rate >= 0.8) return 'text-emerald-400'; - if (rate >= 0.5) return 'text-amber-400'; - return 'text-red-400'; -} - -function CompareMatrixCell({ cell }: { cell: CompareCell }) { +function MatrixCell({ cell }: { cell: CompareCell }) { const [expanded, setExpanded] = useState(false); - const pct = Math.round(cell.pass_rate * 100); const avgPct = Math.round(cell.avg_score * 100); - return ( -
+
- - {expanded && } + {expanded && }
); } -function TestCaseBreakdown({ tests }: { tests: CompareTestResult[] }) { +function EmptyCell() { + return
; +} + +function TestBreakdown({ tests }: { tests: CompareTestResult[] }) { return ( -
-
Test Cases
-
+
+
+ Test cases +
+
    {tests.map((t) => ( -
    - - {t.passed ? '\u2713' : '\u2717'} - +
  • + {t.test_id} - {Math.round(t.score * 100)}% + + {Math.round(t.score * 100)}% + +
  • + ))} +
+
+ ); +} + +// ── Per-run view ──────────────────────────────────────────────────────── + +function PerRunView({ + data, + benchmarkId, + readOnly, +}: { + data: CompareResponse; + benchmarkId?: string; + readOnly: boolean; +}) { + const runs = data.runs ?? []; + const [selected, setSelected] = useState>(new Set()); + const [showingCompare, setShowingCompare] = useState(false); + const [editingRunId, setEditingRunId] = useState(null); + + const toggleSelect = (runId: string) => { + setSelected((prev) => { + const next = new Set(prev); + if (next.has(runId)) next.delete(runId); + else next.add(runId); + return next; + }); + }; + + const clearSelection = () => setSelected(new Set()); + + const selectedRuns = useMemo(() => runs.filter((r) => selected.has(r.run_id)), [runs, selected]); + + if (runs.length === 0) { + return ; + } + + if (showingCompare && selectedRuns.length >= 2) { + return setShowingCompare(false)} />; + } + + return ( +
+
+ + {runs.length} run + {runs.length === 1 ? '' : 's'} — select two or more to compare side-by-side + +
+ +
+
Target + Target ↓ / Experiment → + + {exp}
{target} - {cell ? ( - - ) : ( -
- -- -
- )} +
+ {cell ? : }
+ + + + + + + + + + + + + {runs.map((run) => ( + toggleSelect(run.run_id)} + editing={editingRunId === run.run_id} + onStartEdit={() => setEditingRunId(run.run_id)} + onEndEdit={() => setEditingRunId(null)} + benchmarkId={benchmarkId} + readOnly={readOnly} + /> + ))} + +
+ TimestampTagsExperimentTargetTestsPass rateAvg
+
+ + {selected.size > 0 && ( +
+
+ {selected.size}{' '} + selected
+
+ + +
+
+ )} +
+ ); +} + +function PerRunRow({ + run, + checked, + onToggle, + editing, + onStartEdit, + onEndEdit, + benchmarkId, + readOnly, +}: { + run: CompareRunEntry; + checked: boolean; + onToggle: () => void; + editing: boolean; + onStartEdit: () => void; + onEndEdit: () => void; + benchmarkId?: string; + readOnly: boolean; +}) { + const avgPct = Math.round(run.avg_score * 100); + const canEdit = !readOnly && run.source !== 'remote'; + const tagsBtnRef = useRef(null); + const tags = run.tags ?? []; + const runLabel = tags[0] ?? run.run_id; + + // Restore focus to the tags trigger button once the inline editor closes, + // so keyboard users don't lose their place in the table. + const wasEditing = useRef(editing); + useEffect(() => { + if (wasEditing.current && !editing) { + tagsBtnRef.current?.focus(); + } + wasEditing.current = editing; + }, [editing]); + + return ( + <> + + + + + +
+ {formatTimestamp(run.started_at)} +
+
+ {shortenRunId(run.run_id)} +
+ + + {canEdit ? ( + + ) : tags.length > 0 ? ( +
+ {tags.map((t) => ( + + {t} + + ))} +
+ ) : ( + + )} + + {run.experiment} + {run.target} + + {run.eval_count} + + + + + {avgPct}% + + {editing && ( + + + + + + )} + + ); +} + +/** + * Inline chip-based tag editor. + * + * Local state: a `string[]` staged edit of the run's tags. Chips show the + * current staged tags; an input at the end accepts new tags (commit with + * Enter or comma, delete the last chip with Backspace on an empty input). + * Save persists the whole array; Cancel / Escape discards. + * + * The backend's `writeRunTags` handles deduplication, length limits, and + * control-character rejection, so we only lightly normalize in the UI + * (trim + skip duplicates already in the staged array). + */ +function TagsEditor({ + runId, + currentTags, + benchmarkId, + onClose, +}: { + runId: string; + currentTags: string[]; + benchmarkId?: string; + onClose: () => void; +}) { + const [tags, setTags] = useState(currentTags); + const [input, setInput] = useState(''); + const [err, setErr] = useState(null); + const qc = useQueryClient(); + const inputRef = useRef(null); + + useEffect(() => { + inputRef.current?.focus(); + }, []); + + const saveMut = useMutation({ + mutationFn: () => saveRunTagsApi(runId, tags, benchmarkId), + onSuccess: () => { + qc.invalidateQueries({ queryKey: ['compare'] }); + qc.invalidateQueries({ queryKey: ['runs'] }); + if (benchmarkId) { + qc.invalidateQueries({ queryKey: ['benchmarks', benchmarkId, 'compare'] }); + qc.invalidateQueries({ queryKey: ['benchmarks', benchmarkId, 'runs'] }); + } + onClose(); + }, + onError: (e: Error) => setErr(e.message), + }); + + const clearMut = useMutation({ + mutationFn: () => deleteRunTagsApi(runId, benchmarkId), + onSuccess: () => { + qc.invalidateQueries({ queryKey: ['compare'] }); + qc.invalidateQueries({ queryKey: ['runs'] }); + if (benchmarkId) { + qc.invalidateQueries({ queryKey: ['benchmarks', benchmarkId, 'compare'] }); + qc.invalidateQueries({ queryKey: ['benchmarks', benchmarkId, 'runs'] }); + } + onClose(); + }, + onError: (e: 
Error) => setErr(e.message), + }); + + const busy = saveMut.isPending || clearMut.isPending; + const hasChanges = + tags.length !== currentTags.length || tags.some((t, i) => t !== currentTags[i]); + + const commitInput = () => { + const trimmed = input.trim(); + if (trimmed === '') return; + if (tags.includes(trimmed)) { + setInput(''); + return; + } + setTags([...tags, trimmed]); + setInput(''); + setErr(null); + }; + + const removeTag = (tag: string) => { + setTags(tags.filter((t) => t !== tag)); + }; + + return ( +
+
+ Tag run + + Multi-valued. Enter or comma adds; Backspace removes the last chip. + +
+
+ {tags.map((t) => ( + + {t} + + ))} + { + setErr(null); + setInput(e.target.value); + }} + maxLength={60} + disabled={busy} + onKeyDown={(e) => { + if (e.key === 'Enter' || e.key === ',') { + e.preventDefault(); + commitInput(); + } else if (e.key === 'Backspace' && input === '' && tags.length > 0) { + e.preventDefault(); + setTags(tags.slice(0, -1)); + } else if (e.key === 'Escape') { + onClose(); + } + }} + onBlur={commitInput} + /> +
+
+ + {currentTags.length > 0 && ( + + )} +
+ {err && ( +
+ {err} +
+ )} +
+ ); +} + +function PerRunCompareView({ + runs, + onBack, +}: { + runs: CompareRunEntry[]; + onBack: () => void; +}) { + // Collect all test ids across selected runs (stable order: first run's order, then any extras) + const testIds = useMemo(() => { + const seen = new Set(); + const order: string[] = []; + for (const run of runs) { + for (const t of run.tests) { + if (!seen.has(t.test_id)) { + seen.add(t.test_id); + order.push(t.test_id); + } + } + } + return order; + }, [runs]); + + const testLookup = useMemo(() => { + return runs.map((run) => { + const m = new Map(); + for (const t of run.tests) m.set(t.test_id, t); + return m; + }); + }, [runs]); + + return ( +
+
+ + + {runs.length} runs · {testIds.length} tests + +
+
+ + + + + {runs.map((run) => ( + + ))} + + + + {runs.map((run) => ( + + ))} + + + + {testIds.map((tid) => ( + + + {testLookup.map((lookup, idx) => { + const t = lookup.get(tid); + const runId = runs[idx].run_id; + if (!t) { + return ( + + ); + } + return ( + + ); + })} + + ))} + +
+ Test case + + +
+ Pass rate + + +
+ {tid} + + — + +
+ + + {Math.round(t.score * 100)}% + +
+
+
+
+ ); +} + +function RunColumnHeader({ run }: { run: CompareRunEntry }) { + const tags = run.tags ?? []; + return ( +
+
+ {formatTimestamp(run.started_at)} +
+ {tags.length > 0 && ( +
+ {tags.map((t) => ( + + {t} + + ))} +
+ )} +
+ {run.experiment} · {run.target} +
+
+ ); +} + +// ── Shared bits ───────────────────────────────────────────────────────── + +function Legend() { + return ( +
+ + + + +
+ ); +} + +function LegendSwatch({ className, label }: { className: string; label: string }) { + return ( + + + {label} + + ); +} + +function ErrorPanel({ message }: { message: string }) { + return ( +
+ {message} +
+ ); +} + +function EmptyState() { + return ( + + ); +} + +function Notice({ headline, body }: { headline: string; body: string }) { + return ( +
+

{headline}

+

{body}

); } @@ -216,15 +920,44 @@ function LoadingSkeleton() {
- {['sk-1', 'sk-2', 'sk-3'].map((id) => ( -
+ {['sk-1', 'sk-2', 'sk-3', 'sk-4', 'sk-5'].map((id) => ( +
+
+
+
+
-
-
-
))}
); } + +// ── Helpers ───────────────────────────────────────────────────────────── + +/** Format an ISO timestamp for row / column display. */ +function formatTimestamp(iso: string): string { + try { + const d = new Date(iso); + if (Number.isNaN(d.getTime())) return iso; + const y = d.getFullYear(); + const mo = String(d.getMonth() + 1).padStart(2, '0'); + const da = String(d.getDate()).padStart(2, '0'); + const h = String(d.getHours()).padStart(2, '0'); + const mi = String(d.getMinutes()).padStart(2, '0'); + return `${y}-${mo}-${da} ${h}:${mi}`; + } catch { + return iso; + } +} + +/** Abbreviate the run id for display (keeps the last segment). */ +function shortenRunId(id: string): string { + const parts = id.split('::'); + if (parts.length >= 2) { + const tail = parts[parts.length - 1]; + return tail.length > 22 ? `${tail.slice(0, 10)}…${tail.slice(-8)}` : tail; + } + return id.length > 22 ? `${id.slice(0, 10)}…${id.slice(-8)}` : id; +} diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index 75721d398..c770bb98a 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -28,6 +28,7 @@ import type { RunDetailResponse, RunEvalRequest, RunListResponse, + RunTagsResponse, StudioConfigResponse, SuitesResponse, TargetsResponse, @@ -437,6 +438,45 @@ export async function syncRemoteResultsApi(benchmarkId?: string): Promise; } +// ── Run tag mutations ──────────────────────────────────────────────────── + +/** + * Replace the tags on a run. Tags are stored as a sidecar `tags.json` file + * next to the run's manifest and surface as chips in the compare views. + * Pass an empty array to clear all tags (server deletes the sidecar). + */ +export async function saveRunTagsApi( + runId: string, + tags: string[], + benchmarkId?: string, +): Promise { + const url = benchmarkId + ? 
`${benchmarkApiBase(benchmarkId)}/runs/${encodeURIComponent(runId)}/tags` + : `/api/runs/${encodeURIComponent(runId)}/tags`; + const res = await fetch(url, { + method: 'PUT', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ tags }), + }); + if (!res.ok) { + const err = await res.json().catch(() => ({ error: res.statusText })); + throw new Error((err as { error?: string }).error ?? `Failed to save tags: ${res.status}`); + } + return res.json() as Promise; +} + +/** Remove the tags sidecar for a run. */ +export async function deleteRunTagsApi(runId: string, benchmarkId?: string): Promise { + const url = benchmarkId + ? `${benchmarkApiBase(benchmarkId)}/runs/${encodeURIComponent(runId)}/tags` + : `/api/runs/${encodeURIComponent(runId)}/tags`; + const res = await fetch(url, { method: 'DELETE' }); + if (!res.ok) { + const err = await res.json().catch(() => ({ error: res.statusText })); + throw new Error((err as { error?: string }).error ?? `Failed to delete tags: ${res.status}`); + } +} + export async function saveStudioConfig( config: Partial, ): Promise { diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index a395a8072..1f2d81ec6 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -19,6 +19,8 @@ export interface RunMeta { source: 'local' | 'remote'; project_id?: string; project_name?: string; + /** Optional user-assigned tags from the run's sidecar tags.json. */ + tags?: string[]; } export interface RunListResponse { @@ -148,10 +150,39 @@ export interface CompareCell { tests: CompareTestResult[]; } +/** + * A single evaluation run surfaced in the per-run compare view. + * + * Each run workspace contributes exactly one entry, independent of the + * aggregated `(experiment, target)` cells. Users select multiple runs to + * compare them side-by-side, regardless of whether the runs share an + * experiment or target. 
+ */ +export interface CompareRunEntry { + run_id: string; + started_at: string; + experiment: string; + target: string; + tags?: string[]; + source: 'local' | 'remote'; + eval_count: number; + passed_count: number; + pass_rate: number; + avg_score: number; + tests: CompareTestResult[]; +} + export interface CompareResponse { experiments: string[]; targets: string[]; cells: CompareCell[]; + /** Per-run entries, sorted newest first. */ + runs?: CompareRunEntry[]; +} + +export interface RunTagsResponse { + tags: string[]; + updated_at: string; } export interface TargetSummary { diff --git a/apps/studio/src/routes/index.tsx b/apps/studio/src/routes/index.tsx index 4dd892ee9..136c1123a 100644 --- a/apps/studio/src/routes/index.tsx +++ b/apps/studio/src/routes/index.tsx @@ -274,7 +274,7 @@ function SingleProjectHome() { /> )} {activeTab === 'experiments' && } - {activeTab === 'compare' && } + {activeTab === 'compare' && } {activeTab === 'targets' && } {!isReadOnly && setShowRunEval(false)} />} @@ -282,9 +282,17 @@ function SingleProjectHome() { ); } -function CompareTabContent() { +function CompareTabContent({ readOnly }: { readOnly: boolean }) { const { data, isLoading, isError, error } = useCompare(); - return ; + return ( + + ); } function RunsTabContent({ diff --git a/apps/studio/src/routes/projects/$benchmarkId.tsx b/apps/studio/src/routes/projects/$benchmarkId.tsx index 1d0660cb4..b8834dbb2 100644 --- a/apps/studio/src/routes/projects/$benchmarkId.tsx +++ b/apps/studio/src/routes/projects/$benchmarkId.tsx @@ -91,7 +91,9 @@ function ProjectHomePage() { {activeTab === 'runs' && } {activeTab === 'experiments' && } - {activeTab === 'compare' && } + {activeTab === 'compare' && ( + + )} {activeTab === 'targets' && } {!isReadOnly && ( @@ -209,9 +211,24 @@ function ProjectExperimentsTab({ benchmarkId }: { benchmarkId: string }) { ); } -function ProjectCompareTab({ benchmarkId }: { benchmarkId: string }) { +function ProjectCompareTab({ + benchmarkId, + readOnly, +}: { + 
benchmarkId: string; + readOnly: boolean; +}) { const { data, isLoading, isError, error } = useQuery(benchmarkCompareOptions(benchmarkId)); - return ; + return ( + + ); } function ProjectTargetsTab({ benchmarkId }: { benchmarkId: string }) { diff --git a/apps/web/src/assets/screenshots/studio-compare-aggregated.png b/apps/web/src/assets/screenshots/studio-compare-aggregated.png new file mode 100644 index 000000000..a1814ac36 Binary files /dev/null and b/apps/web/src/assets/screenshots/studio-compare-aggregated.png differ diff --git a/apps/web/src/assets/screenshots/studio-compare-per-run.png b/apps/web/src/assets/screenshots/studio-compare-per-run.png new file mode 100644 index 000000000..1b3a452ce Binary files /dev/null and b/apps/web/src/assets/screenshots/studio-compare-per-run.png differ diff --git a/apps/web/src/assets/screenshots/studio-compare-side-by-side.png b/apps/web/src/assets/screenshots/studio-compare-side-by-side.png new file mode 100644 index 000000000..584514af4 Binary files /dev/null and b/apps/web/src/assets/screenshots/studio-compare-side-by-side.png differ diff --git a/apps/web/src/assets/screenshots/studio-compare.png b/apps/web/src/assets/screenshots/studio-compare.png deleted file mode 100644 index 78c8690de..000000000 Binary files a/apps/web/src/assets/screenshots/studio-compare.png and /dev/null differ diff --git a/apps/web/src/content/docs/docs/tools/studio.mdx b/apps/web/src/content/docs/docs/tools/studio.mdx index 57dd61ff1..ef31b0ab3 100644 --- a/apps/web/src/content/docs/docs/tools/studio.mdx +++ b/apps/web/src/content/docs/docs/tools/studio.mdx @@ -11,7 +11,9 @@ import studioRunDetail from '../../../../assets/screenshots/studio-run-detail.pn import studioExperiments from '../../../../assets/screenshots/studio-experiments.png'; import studioProjects from '../../../../assets/screenshots/studio-projects.png'; import studioProjectsMulti from '../../../../assets/screenshots/studio-projects-multi.png'; -import studioCompare from 
'../../../../assets/screenshots/studio-compare.png'; +import studioCompareAggregated from '../../../../assets/screenshots/studio-compare-aggregated.png'; +import studioComparePerRun from '../../../../assets/screenshots/studio-compare-per-run.png'; +import studioCompareSideBySide from '../../../../assets/screenshots/studio-compare-side-by-side.png'; import studioRunsBench from '../../../../assets/screenshots/studio-runs-bench.png'; The `studio` command launches a web-based dashboard for browsing evaluation runs, inspecting individual test results, and reviewing scores. It shows both local runs and runs synced from a remote results repository. @@ -53,7 +55,7 @@ agentv studio .agentv/results/runs/2026-03-30T11-45-56-989Z - **Targets** — group runs by target (model/agent) - **Run Detail** — drill into a run to see per-test results, scores, and evaluator output - **Human Review** — add feedback annotations to individual test results -- **Comparison Matrix** — experiment × target matrix showing pass rates across dimensions +- **Compare** — two modes: an aggregated experiment × target matrix, and a per-run view for selecting individual runs to compare side-by-side with optional retroactive tags - **Remote Results** — sync and browse runs pushed from other machines or CI (see [Remote Results](#remote-results)) ## Run Detail @@ -68,11 +70,17 @@ The Experiments tab groups runs by experiment name so you can compare the impact AgentV Studio experiments tab comparing with_skills (100%) vs without_skills (60%) pass rates -## Comparison Matrix +## Compare -The **Compare** tab shows a cross-model, cross-experiment performance matrix. Cells are color-coded by pass rate: green (80%+), yellow (50–80%), red (below 50%). The best performer per row has an emerald ring; the worst has a red ring. Click any cell to expand per-test-case results. 
+The **Compare** tab has two modes: **Aggregated** for the classic experiment × target matrix, and **Per run** for selecting individual runs and pitting them side-by-side. Toggle between them from the mode switch on the right of the masthead. -AgentV Studio comparison matrix showing experiment vs target pass rates with color coding +AgentV Studio side-by-side comparison of two runs tagged improved-prompt and baseline, with per-test pass rates + +### Aggregated matrix + +The default view shows a cross-experiment, cross-target performance matrix. Numbers are colour-coded by pass rate — green (80%+), amber (50–80%), red (below 50%) — and each cell shows `passed/total` and the mean score. Click any cell to expand the per-test-case breakdown. + +AgentV Studio aggregated compare matrix showing experiment × target pass rates Run the same eval against multiple providers or experiment variants, then open the Compare tab: @@ -84,7 +92,25 @@ agentv eval my.EVAL.yaml --target gemini --experiment with-caching agentv studio # Compare tab shows 2x2 matrix ``` -The matrix is available per-project under the **Compare** tab. +### Per-run comparison + +Running the same `(experiment, target)` twice no longer collapses into a single cell. Switch to **Per run** mode to see every run as its own row, select two or more, and compare them head-to-head. + +AgentV Studio per-run compare mode listing individual runs with timestamps, tags, experiment, target, and pass rate + +Use per-run mode when you want to: + +- Compare back-to-back runs of the same agent + eval after a prompt or parameter tweak +- Pit a fresh run against a tagged baseline without touching the eval YAML +- Debug flakiness by inspecting two identical-configuration runs side-by-side + +Select 2+ rows with the checkboxes and click the sticky **Compare N** action to open the side-by-side view. Column headers show the run's timestamp, with any assigned tags as chips below it. 
The per-test breakdown reuses the same scoring and color tones as the aggregated matrix.
+
+### Retroactive tags
+
+Click any row's **Tags** cell to tag a run after the fact. Each run can carry multiple free-form tags (max 20, up to 60 characters each); tags are stored in a `tags.json` sidecar next to `index.jsonl` in the run workspace, so they're mutable, non-destructive, and won't touch your eval YAML or run manifest. The chip editor supports Enter/comma to commit a new tag, Backspace to remove the last chip, and **Clear all** to remove every tag (deletes the sidecar). Remote runs are read-only.
+
+Use tags to annotate ad-hoc variants, experiment cross-cuts, or status flags you didn't plan for up front — `baseline`, `v2-prompt`, `slow`, `after-retry-fix`, `regression`, etc. Unlike `experiment` — which groups runs and is baked into the JSONL at eval-run time — tags are mutable, multi-valued, and never touch the original run data.

## Benchmarks Dashboard
diff --git a/docs/plans/1037-per-run-compare.md b/docs/plans/1037-per-run-compare.md
new file mode 100644
index 000000000..cf6c9d18c
--- /dev/null
+++ b/docs/plans/1037-per-run-compare.md
@@ -0,0 +1,103 @@
+# Per-run comparison with retroactive labelling — Issue #1037
+
+## Goal
+Let Studio users compare individual runs (by timestamp / run id) side-by-side,
+independent of the current `(experiment, target)` aggregation. Optional labels
+replace timestamps in compare headers.
+
+## Data model
+
+### Sidecar label file
+- Path: `<run_dir>/label.json` next to `index.jsonl`
+- Content: `{ "label": string, "updated_at": string }`
+- Mutable, non-breaking, trivially reversible. Absent file = no label.
+ +### Wire format extension (non-breaking) +Extend `CompareResponse`: +```ts +interface CompareRunEntry { + run_id: string; // existing run id (experiment::timestamp or timestamp) + started_at: string; // first record timestamp (fallback: manifest meta) + experiment: string; + target: string; + label?: string; + eval_count: number; + passed_count: number; + pass_rate: number; + avg_score: number; + tests: CompareTestResult[]; +} + +interface CompareResponse { + experiments: string[]; + targets: string[]; + cells: CompareCell[]; // unchanged + runs: CompareRunEntry[]; // NEW +} +``` + +Extend `RunMeta` with optional `label?: string`. + +## Backend changes — apps/cli/src/commands/results/serve.ts + +1. `handleCompare` — after building cells, also build per-run entries. + Each run file → compute eval_count, passed_count, pass_rate, avg_score, + tests (cap 100). Read sidecar label. +2. `handleRuns` — already enriches RunMeta; add label sidecar lookup. +3. New `handleRunLabel` (PUT/POST) — writes `label.json`. Unscoped and + benchmark-scoped variants. +4. New `handleRunLabelDelete` (DELETE) — removes `label.json`. + +## Frontend changes + +### Types — apps/studio/src/lib/types.ts +Extend `CompareResponse`, add `CompareRunEntry`, add `label?` to `RunMeta`. + +### API hooks — apps/studio/src/lib/api.ts +- `saveRunLabel(runId, label, benchmarkId?)` — PUT mutation +- `deleteRunLabel(runId, benchmarkId?)` — DELETE mutation +- Invalidate `['compare']`, `['runs']`, `['benchmarks', id, 'compare']` + +### CompareTab redesign — apps/studio/src/components/CompareTab.tsx + +**Aesthetic direction: Editorial data-terminal** +- Display font: Fraunces (variable serif with optical sizing) — for headings +- Data font: JetBrains Mono Variable — tabular numbers, run ids, deltas +- Body: Inter-free. Use system-ui sparingly for secondary text, or DM Sans +- Palette: off-black (#0a0a0b) base, warm ivory (#f4ecd8) text, signal + accents (emerald #10b981, amber #f59e0b, rose #f43f5e). 
Hairline dividers + in warm gray (#2a2622). +- Layout: Asymmetric header (big serif title + mode toggle right-aligned). + Sharp hairline rules. Tabular number columns. Generous vertical rhythm. +- Motion: Staggered fade-in on mount (CSS `@keyframes` with animation-delay). + Hover brings subtle shadow+translate on selectable rows. Mode toggle + slides underline indicator. + +**Modes:** +1. **Aggregated** (default) — existing matrix, re-skinned with the new + aesthetic. Unchanged logic. +2. **Per run** — runs table sorted by timestamp desc with: + - Selectable checkbox (multi-select) + - Columns: `timestamp | label | experiment | target | tests | pass | avg` + - Inline "Edit label" button → popover/inline input + - Sticky footer: "Compare N selected" button (enabled when N ≥ 2) + - Opening compare view renders a side-by-side table: one column per run, + using label or formatted timestamp. Reuses `CompareMatrixCell` rendering + logic for per-test breakdown. + +## Validation plan + +1. Unit-ish: typecheck, lint, build. +2. Backend e2e: `bun apps/cli/src/cli.ts results serve --port 9100` on a + benchmark with ≥2 runs of the same (experiment, target). Hit + `/api/compare` and verify `runs[]` present. Hit `PUT /api/runs/:id/label` + and verify sidecar file is written. +3. Frontend visual: agent-browser with `--cdp 9222` on http://localhost:5173 + (or wherever studio dev runs). Screenshot aggregated mode, per-run mode, + label edit, compare view. Iterate on design until polished. + +## Out of scope +- Eval YAML schema changes +- CLI flags +- Multi-label / tag taxonomy +- Cross-project run compare