diff --git a/apps/cli/src/commands/results/run-tags.ts b/apps/cli/src/commands/results/run-tags.ts new file mode 100644 index 000000000..17f76a028 --- /dev/null +++ b/apps/cli/src/commands/results/run-tags.ts @@ -0,0 +1,132 @@ +/** + * Per-run tag sidecar file helpers. + * + * Tags are stored as a `tags.json` sidecar next to the run's `index.jsonl` + * manifest. The sidecar is optional, mutable, and non-breaking — absence + * means the run has no user-assigned tags. + * + * Wire format (stored on disk): + * ```json + * { "tags": ["baseline", "v2-prompt"], "updated_at": "2026-04-10T00:00:00.000Z" } + * ``` + * + * Used by the Studio compare API so users can retroactively tag runs + * without changing the eval YAML or the run manifest itself. This mirrors + * the Langfuse / W&B / GitHub `tags` pattern — a mutable multi-valued + * list of free-form labels that lives alongside the immutable run_id. + * + * Validation rules: + * - Each tag is 1–60 characters after trimming + * - No control characters (\n, \t, DEL, etc.) + * - Tags are deduplicated case-sensitively + * - A run can have at most 20 tags + * - Writing an empty array removes the sidecar file + * + * To extend (e.g. add colored labels or descriptions): add optional fields + * to `RunTagsFile` and keep the schema additive so older files still parse. + */ + +import { existsSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs'; +import path from 'node:path'; + +export const RUN_TAGS_FILENAME = 'tags.json'; + +/** Maximum number of tags per run. */ +export const MAX_TAGS_PER_RUN = 20; + +/** Maximum length of a single tag after trimming. */ +export const MAX_TAG_LENGTH = 60; + +export interface RunTagsFile { + /** Ordered, deduplicated list of user-assigned tags. */ + tags: string[]; + /** ISO-8601 timestamp of last update. */ + updated_at: string; +} + +/** Resolve the tags sidecar path given a run manifest (index.jsonl) path. 
*/ +export function runTagsPath(manifestPath: string): string { + return path.join(path.dirname(manifestPath), RUN_TAGS_FILENAME); +} + +/** Read the tags for a run. Returns `undefined` if missing or unreadable. */ +export function readRunTags(manifestPath: string): RunTagsFile | undefined { + const fp = runTagsPath(manifestPath); + if (!existsSync(fp)) return undefined; + try { + const parsed = JSON.parse(readFileSync(fp, 'utf8')) as unknown; + if (!parsed || typeof parsed !== 'object') return undefined; + const record = parsed as Record; + if (!Array.isArray(record.tags)) return undefined; + const tags = record.tags.filter( + (t): t is string => typeof t === 'string' && t.trim().length > 0, + ); + if (tags.length === 0) return undefined; + return { + tags, + updated_at: typeof record.updated_at === 'string' ? record.updated_at : '', + }; + } catch { + return undefined; + } +} + +/** + * Write tags for a run. Replaces any existing tags. Pass an empty array + * to remove the sidecar entirely. + */ +export function writeRunTags(manifestPath: string, tags: readonly string[]): RunTagsFile | null { + const cleaned = normalizeTags(tags); + if (cleaned.length === 0) { + deleteRunTags(manifestPath); + return null; + } + const entry: RunTagsFile = { + tags: cleaned, + updated_at: new Date().toISOString(), + }; + writeFileSync(runTagsPath(manifestPath), `${JSON.stringify(entry, null, 2)}\n`, 'utf8'); + return entry; +} + +/** Remove a run's tags sidecar. No-op if the file does not exist. */ +export function deleteRunTags(manifestPath: string): void { + const fp = runTagsPath(manifestPath); + if (existsSync(fp)) { + unlinkSync(fp); + } +} + +/** + * Trim, validate, and deduplicate an incoming tag array. Throws on any + * invalid entry so the caller can surface a user-friendly error. 
+ */ +function normalizeTags(tags: readonly string[]): string[] { + const seen = new Set(); + const out: string[] = []; + for (const raw of tags) { + if (typeof raw !== 'string') { + throw new Error('Tags must be strings'); + } + const trimmed = raw.trim(); + if (trimmed === '') continue; + if (trimmed.length > MAX_TAG_LENGTH) { + throw new Error(`Tag "${trimmed.slice(0, 20)}…" exceeds ${MAX_TAG_LENGTH} characters`); + } + // Reject control characters (newlines, tabs, DEL, etc.) — they break + // column headers in compare views and confuse test assertions. + for (let i = 0; i < trimmed.length; i++) { + const code = trimmed.charCodeAt(i); + if (code < 0x20 || code === 0x7f) { + throw new Error('Tag must not contain control characters'); + } + } + if (seen.has(trimmed)) continue; + seen.add(trimmed); + out.push(trimmed); + } + if (out.length > MAX_TAGS_PER_RUN) { + throw new Error(`Too many tags (max ${MAX_TAGS_PER_RUN})`); + } + return out; +} diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 9b4c67161..15ec4f4aa 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -59,6 +59,7 @@ import { listMergedResultFiles, syncRemoteResults, } from './remote.js'; +import { deleteRunTags, readRunTags, writeRunTags } from './run-tags.js'; import { type StudioConfig, loadStudioConfig, saveStudioConfig } from './studio-config.js'; // ── Source resolution ──────────────────────────────────────────────────── @@ -273,6 +274,7 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) { } catch { // ignore enrichment errors } + const tagsEntry = readRunTags(m.path); return { filename: m.filename, display_name: m.displayName, @@ -285,6 +287,7 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) { source: m.source, ...(target && { target }), ...(experiment && { experiment }), + ...(tagsEntry && { tags: tagsEntry.tags }), }; }), }); @@ -551,7 +554,7 @@ async 
function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { const { runs: metas } = await listMergedResultFiles(searchDir); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); - // Collect per-test-case results keyed by experiment × target + // Collect per-test-case results keyed by experiment × target (aggregated view) const cellMap = new Map< string, { @@ -569,17 +572,54 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { } >(); + // Per-run entries (per-run view). Each run workspace contributes exactly + // one entry, independent of the aggregated matrix. + const runEntries: Array<{ + run_id: string; + started_at: string; + experiment: string; + target: string; + tags?: string[]; + source: 'local' | 'remote'; + eval_count: number; + passed_count: number; + pass_rate: number; + avg_score: number; + tests: Array<{ + test_id: string; + score: number; + passed: boolean; + execution_status?: string; + }>; + }> = []; + const experimentsSet = new Set(); const targetsSet = new Set(); + const MAX_TESTS_PER_CELL = 100; for (const m of metas) { try { const records = loadLightweightResults(m.path); + const runTestMap = new Map< + string, + { test_id: string; score: number; passed: boolean; execution_status?: string } + >(); + let runEvalCount = 0; + let runPassedCount = 0; + let runScoreSum = 0; + let runExperiment = 'default'; + let runTarget = 'default'; + let runStartedAt = m.timestamp; + for (const r of records) { const experiment = r.experiment ?? 'default'; const target = r.target ?? 'default'; experimentsSet.add(experiment); targetsSet.add(target); + runExperiment = experiment; + runTarget = target; + if (r.timestamp && r.timestamp < runStartedAt) runStartedAt = r.timestamp; + const key = JSON.stringify([experiment, target]); const entry = cellMap.get(key) ?? 
{ experiment, @@ -600,14 +640,41 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { execution_status: r.executionStatus, }); cellMap.set(key, entry); + + // Per-run accumulation. Dedupe tests within the run by last-wins. + runTestMap.set(r.testId, { + test_id: r.testId, + score: r.score, + passed, + execution_status: r.executionStatus, + }); + runEvalCount++; + if (passed) runPassedCount++; + runScoreSum += r.score; } + + if (runEvalCount === 0) continue; + + const runTests = [...runTestMap.values()].slice(-MAX_TESTS_PER_CELL); + const tagsEntry = readRunTags(m.path); + runEntries.push({ + run_id: m.filename, + started_at: runStartedAt, + experiment: runExperiment, + target: runTarget, + ...(tagsEntry && { tags: tagsEntry.tags }), + source: m.source, + eval_count: runEvalCount, + passed_count: runPassedCount, + pass_rate: runPassedCount / runEvalCount, + avg_score: runScoreSum / runEvalCount, + tests: runTests, + }); } catch { // skip runs that fail to load } } - const MAX_TESTS_PER_CELL = 100; - const cells = [...cellMap.values()].map((entry) => { // Deduplicate tests: keep only the latest entry per test_id (last wins by insertion order) const dedupMap = new Map(); @@ -630,10 +697,14 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { }; }); + // Per-run entries sorted by timestamp descending (newest first). + runEntries.sort((a, b) => b.started_at.localeCompare(a.started_at)); + return c.json({ experiments: [...experimentsSet].sort(), targets: [...targetsSet].sort(), cells, + runs: runEntries, }); } @@ -702,6 +773,52 @@ function handleFeedbackRead(c: C, { searchDir }: DataContext) { return c.json(readFeedback(existsSync(resultsDir) ? resultsDir : searchDir)); } +async function handleRunTagsPut(c: C, { searchDir }: DataContext) { + const filename = c.req.param('filename') ?? 
''; + const meta = await findRunById(searchDir, filename); + if (!meta) return c.json({ error: 'Run not found' }, 404); + if (meta.source === 'remote') { + return c.json({ error: 'Tags can only be set on local runs' }, 400); + } + let body: unknown; + try { + body = await c.req.json(); + } catch { + return c.json({ error: 'Invalid JSON' }, 400); + } + if (!body || typeof body !== 'object') { + return c.json({ error: 'Invalid payload' }, 400); + } + const tags = (body as Record).tags; + if (!Array.isArray(tags)) { + return c.json({ error: 'Missing tags array' }, 400); + } + try { + const entry = writeRunTags(meta.path, tags as string[]); + return c.json({ + tags: entry?.tags ?? [], + updated_at: entry?.updated_at ?? new Date().toISOString(), + }); + } catch (err) { + return c.json({ error: (err as Error).message }, 400); + } +} + +async function handleRunTagsDelete(c: C, { searchDir }: DataContext) { + const filename = c.req.param('filename') ?? ''; + const meta = await findRunById(searchDir, filename); + if (!meta) return c.json({ error: 'Run not found' }, 404); + if (meta.source === 'remote') { + return c.json({ error: 'Tags can only be removed on local runs' }, 400); + } + try { + deleteRunTags(meta.path); + return c.json({ ok: true }); + } catch (err) { + return c.json({ error: (err as Error).message }, 500); + } +} + // ── Hono app factory ───────────────────────────────────────────────────── /** @@ -934,6 +1051,18 @@ export function createApp( app.get('/api/remote/status', async (c) => c.json(await getRemoteResultsStatus(searchDir))); app.post('/api/remote/sync', async (c) => c.json(await syncRemoteResults(searchDir))); app.get('/api/runs', (c) => handleRuns(c, defaultCtx)); + app.put('/api/runs/:filename/tags', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + return handleRunTagsPut(c, defaultCtx); + }); + app.delete('/api/runs/:filename/tags', (c) => { + if (readOnly) { + return c.json({ error: 'Studio 
is running in read-only mode' }, 403); + } + return handleRunTagsDelete(c, defaultCtx); + }); app.get('/api/runs/:filename', (c) => handleRunDetail(c, defaultCtx)); app.get('/api/runs/:filename/suites', (c) => handleRunSuites(c, defaultCtx)); app.get('/api/runs/:filename/categories', (c) => handleRunCategories(c, defaultCtx)); @@ -1046,6 +1175,18 @@ export function createApp( withBenchmark(c, async (ctx, dataCtx) => ctx.json(await syncRemoteResults(dataCtx.searchDir))), ); app.get('/api/benchmarks/:benchmarkId/runs', (c) => withBenchmark(c, handleRuns)); + app.put('/api/benchmarks/:benchmarkId/runs/:filename/tags', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + return withBenchmark(c, handleRunTagsPut); + }); + app.delete('/api/benchmarks/:benchmarkId/runs/:filename/tags', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + return withBenchmark(c, handleRunTagsDelete); + }); app.get('/api/benchmarks/:benchmarkId/runs/:filename', (c) => withBenchmark(c, handleRunDetail)); app.get('/api/benchmarks/:benchmarkId/runs/:filename/suites', (c) => withBenchmark(c, handleRunSuites), diff --git a/apps/studio/DESIGN.md b/apps/studio/DESIGN.md new file mode 100644 index 000000000..03d22f991 --- /dev/null +++ b/apps/studio/DESIGN.md @@ -0,0 +1,413 @@ +# AgentV Studio Design System + +> Studio is a dark, utility-driven dashboard for reviewing AI agent evaluation +> results. It favors dense tabular data, muted neutrals, and a single cyan +> accent over ornamental styling. Think "terminal inspector", not "marketing +> page". When in doubt, copy the pattern from `ExperimentsTab`, `TargetsTab`, +> `RunList`, or `PassRatePill` — they are canonical examples of the style. + +## 1. Visual Theme & Atmosphere + +AgentV Studio is a local evaluation dashboard for AI agent developers. 
The +design language is dense, dark, and data-first — this is a tool engineers +keep open in a second monitor while they iterate on prompts, not a page +they share on social. The canvas is near-black (`bg-gray-950`), elevated +surfaces sit one step up (`bg-gray-900`), and every interactive accent +pulls the eye toward the same cyan signal color (`cyan-400`). + +Typography stays out of the way on purpose. A single system sans-serif +stack (`ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont`) +handles every piece of text. There is no brand display font, no serif, +no variable font. Numeric columns use `tabular-nums` so pass rates, +scores, and timestamps line up cleanly, and most table text sits at +`text-sm` (14px) with `font-medium` (500) reserved for row headers and +links. + +Motion is almost absent. Rows fade in via Tailwind's built-in +`transition-colors`, the main tabs slide a 2px cyan underline indicator, +and that's it. There are no staggered entrance animations, no serif +display headings, no elevated box-shadows. Honor `prefers-reduced-motion` +if you add any animation. + +**Key characteristics:** +- Dark canvas (`bg-gray-950`), elevated surfaces at `bg-gray-900/50` or `bg-gray-900` +- Single system sans-serif stack — no webfonts, no Google Fonts +- Cyan-400 is the ONE accent for interactive elements and links +- Emerald/yellow/red tones for pass/warn/fail, used sparingly and only for data +- Blue gradient reserved for `PassRatePill` (the one exception to cyan monopoly) +- Rounded corners: consistently `rounded-lg` (8px) for containers, `rounded-md` (6px) for inputs/buttons, `rounded-full` for pills +- Hairline borders (`border-gray-800`), never shadows + +## 2. 
Color Palette & Roles + +### Surfaces + +| Token | Hex (Tailwind) | Role | +|---|---|---| +| `bg-gray-950` | `#030712` | App canvas / body background | +| `bg-gray-900` | `#111827` | Elevated container background | +| `bg-gray-900/50` | `#111827` @ 50% | Table header row, subtle fills | +| `bg-gray-900/30` | `#111827` @ 30% | Row hover state | +| `bg-gray-800` | `#1f2937` | Secondary button fill, progress track, skeleton bars | +| `bg-gray-800/50` | `#1f2937` @ 50% | Divider, disabled fill | + +### Borders + +| Token | Role | +|---|---| +| `border-gray-800` | Default container borders (every `rounded-lg` card + table wrap) | +| `border-gray-800/50` | `divide-y` row separators inside tables | +| `border-gray-700` | Form input borders | +| `border-cyan-900/60` | Label/tag chip borders — the only cyan-tinted border | +| `border-red-900/60` | Error / destructive action borders | + +### Text + +| Token | Role | +|---|---| +| `text-gray-100` | Default body text on `bg-gray-950` | +| `text-white` | Section headings (`h2 text-xl font-semibold`) | +| `text-gray-200` | Row primary values (target name, timestamp) | +| `text-gray-300` | Secondary values inside cells | +| `text-gray-400` | Table header labels, section subtitles, muted links | +| `text-gray-500` | Metadata, timestamps, "N runs" counts | +| `text-gray-600` | Placeholders, empty-state em-dashes | + +### Accent (single source of truth: cyan) + +| Token | Role | +|---|---| +| `text-cyan-400` | Active tab, links, primary-action emphasis | +| `text-cyan-300` | Link/tag hover, label chip text | +| `text-cyan-500` | Accent on focused checkbox/select | +| `bg-cyan-500` | Primary action button fill (e.g. "Compare N", "Save") | +| `bg-cyan-400` | Primary button hover | +| `bg-cyan-950/30` | Tag chip fill, selected-row tint | +| `bg-cyan-950/20` | Selected-row tint on per-run list | +| `ring-cyan-500` | Focus ring on inputs and buttons | + +**Rule:** do not introduce a second accent. 
Green, amber, and red are +reserved for **data tones** (pass/warn/fail), not for interactive UI. + +### Data tones + +| Token | Role | +|---|---| +| `text-emerald-400` / `bg-emerald-400` | Pass (≥80%), success dots, "passed" count numerator | +| `text-yellow-400` / `bg-yellow-400` | Warn (50–80%) | +| `text-amber-400` | Run source badge for `remote` runs | +| `text-red-400` / `bg-red-400` | Fail (<50%), error text, destructive button | +| `bg-red-950/30` + `border-red-900/60` | Error banners and destructive button hover | + +### The blue-gradient exception + +`PassRatePill` is the one place blue (not cyan) appears — a fixed +`bg-gradient-to-r from-blue-400 to-blue-600` fill on a `bg-gray-800` +rounded-full track. This is the recognizable "Studio pill" that ties +the Runs, Experiments, Targets, and Compare tabs together. Reuse it +verbatim (``) — do not recreate it with +cyan or with a different gradient. + +## 3. Typography Rules + +### Font stack + +One stack, applied globally in `src/styles/globals.css`: + +```css +font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, + "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; +``` + +No webfonts. No display font. No second family for code or data — use +`tabular-nums` for numeric alignment, not a monospace font. + +### Hierarchy + +| Role | Class | Notes | +|---|---|---| +| Page title | `text-2xl font-semibold text-white` | Top-level section (e.g. "Evaluation Runs") | +| Section title | `text-xl font-semibold text-white` | Tab headings (e.g. 
"Compare runs") | +| Sub-section | `text-lg font-medium text-gray-300` | Inside-card headers | +| Table header | `font-medium text-gray-400 px-4 py-3` | Column labels; NOT uppercase by default | +| Table header (micro) | `text-xs uppercase tracking-wider text-gray-500` | Only for eyebrows / sub-headers | +| Row primary | `font-medium text-gray-200` | Main row identifier (target, timestamp) | +| Row numeric | `tabular-nums text-gray-400` | Every number in every cell | +| Body text | `text-sm text-gray-300` | Default inside cards | +| Body muted | `text-sm text-gray-400` | Subtitles, explanatory text | +| Caption | `text-xs text-gray-500` | Metadata, run ids, hint text | +| Link | `text-cyan-400 hover:text-cyan-300 hover:underline` | Internal navigation | + +### Principles + +- **`text-sm` is the default.** Most table text, most body text, most + buttons. `text-base` (16px) is reserved for empty-state headlines. +- **`font-medium` (500), not bold.** 600 for section titles, never 700. +- **`tabular-nums` on every number.** Pass rates, scores, test counts, + timestamps, avg values. The columns must line up. +- **No uppercase headers by default.** Only use `uppercase tracking-wider` + for tiny eyebrow labels (`text-xs uppercase tracking-wider text-gray-500`). +- **No custom line-heights.** Tailwind's defaults work. + +## 4. Component Stylings + +### Containers + +```tsx +
+ {/* … */} +
+``` + +Every meaningful grouping goes in a `rounded-lg` bordered container. +No drop shadows, no inner glows. The border itself IS the elevation. + +### Tables + +Canonical pattern (from `ExperimentsTab.tsx` — copy this verbatim): + +```tsx +
+ + + + + + + + + + + + + +
ColumnNumber
+
+``` + +- **Padding:** `px-4 py-3` for every cell (both header and body). +- **Rows:** `divide-y divide-gray-800/50` + `hover:bg-gray-900/30`. +- **Right-align numbers:** `text-right tabular-nums`. + +### Buttons + +| Variant | Classes | Use | +|---|---|---| +| Primary | `rounded-md bg-cyan-500 px-3 py-1.5 text-sm font-medium text-gray-950 transition-colors hover:bg-cyan-400 disabled:cursor-not-allowed disabled:bg-gray-700 disabled:text-gray-500` | "Save", "Compare N", main submit actions | +| Ghost | `rounded-md px-3 py-1.5 text-sm text-gray-400 transition-colors hover:text-gray-200` | Cancel, inline Clear, low-stakes secondary | +| Destructive | `rounded-md border border-red-900/60 px-3 py-1.5 text-sm text-red-400 transition-colors hover:border-red-800 hover:bg-red-950/30 hover:text-red-300` | Clear all, delete, destructive | +| Emerald (rare) | `rounded-md bg-emerald-600 px-3 py-2 text-sm font-medium text-white hover:bg-emerald-500` | Reserved for "Run Eval" only — NOT a general accent | + +Primary buttons use `text-gray-950` (not white) on cyan-500 because +cyan-500 is a bright background and dark foreground contrasts better. + +### Inputs + +```tsx + +``` + +- `bg-gray-950` (inset, darker than surrounding `bg-gray-900`) +- Focus uses `ring-1 ring-cyan-500` + matching `border-cyan-500` — always both together. +- Disabled is `opacity-50`, not a different fill. + +### Pill chips (tags, labels, status badges) + +```tsx + + {tag} + +``` + +- `rounded-md` (6px), not `rounded-full` +- Always three tokens together: `border-cyan-900/60 bg-cyan-950/30 text-cyan-300` +- `text-xs` (12px), `font-medium` + +For status badges (local/remote source pill, pass rate chip), swap the +cyan trio for emerald (`emerald-900/60` + `emerald-950/30` + `emerald-300`) +or amber (`amber-900/60` + `amber-950/30` + `amber-300`). 
+ +### `PassRatePill` (always reuse the component) + +```tsx +import { PassRatePill } from './PassRatePill'; + +``` + +Never recreate this inline — import the shared component. Width is +fixed at `w-20`, height at `h-5`, fill is the only Studio element that +uses the blue gradient. + +### Mode toggle / segmented control + +```tsx +
+ +
+``` + +Used in `CompareTab` for the Aggregated / Per-run switch. Do not use a +2px underline indicator here — that pattern is reserved for the **main +page tabs** (see Navigation below). + +### Main page tabs + +The top-level tab strip (`Recent Runs`, `Experiments`, `Compare`, +`Targets`) uses a 2px underline indicator: + +```tsx + + ); +} + +// ── Aggregated (matrix) view ──────────────────────────────────────────── + +function AggregatedView({ data }: { data: CompareResponse }) { const { experiments, targets, cells } = data; - // If there is only one experiment and one target, the matrix is trivial + // Hooks must run on every render regardless of the early-return below, + // so this memo is declared before any conditional return. When you add a + // new hook-using sub-path here, keep it above the guard. + const cellMap = useMemo(() => { + const map = new Map(); + for (const cell of cells) { + map.set(`${cell.experiment}::${cell.target}`, cell); + } + return map; + }, [cells]); + if (experiments.length <= 1 && targets.length <= 1) { return ( -
-

Not enough variation to compare

-

- The comparison matrix requires at least 2 experiments or 2 targets. Currently there{' '} - {experiments.length === 1 ? 'is 1 experiment' : `are ${experiments.length} experiments`}{' '} - and {targets.length === 1 ? '1 target' : `${targets.length} targets`}. -

-
+ ); } - // Build a lookup map for cells - const cellMap = new Map(); - for (const cell of cells) { - cellMap.set(JSON.stringify([cell.experiment, cell.target]), cell); - } - return ( -
-
- - - 80%+ - - - - 50–80% - - - - <50% - - - - No data - -
- +
+
- + {experiments.map((exp) => ( - ))} @@ -102,7 +214,7 @@ export function CompareTab({ data, isLoading, isError, error }: CompareTabProps) {targets.map((target) => ( - {experiments.map((exp) => { - const cell = cellMap.get(JSON.stringify([exp, target])); + const cell = cellMap.get(`${exp}::${target}`); return ( - ); })} @@ -146,65 +252,663 @@ function CompareRow({ ); } -function passRateRingClass(rate: number): string { - if (rate >= 0.8) return 'ring-emerald-500/60'; - if (rate >= 0.5) return 'ring-amber-500/60'; - return 'ring-red-500/60'; -} - -function passRateTextClass(rate: number): string { - if (rate >= 0.8) return 'text-emerald-400'; - if (rate >= 0.5) return 'text-amber-400'; - return 'text-red-400'; -} - -function CompareMatrixCell({ cell }: { cell: CompareCell }) { +function MatrixCell({ cell }: { cell: CompareCell }) { const [expanded, setExpanded] = useState(false); - const pct = Math.round(cell.pass_rate * 100); const avgPct = Math.round(cell.avg_score * 100); - return ( -
+
- - {expanded && } + {expanded && }
); } -function TestCaseBreakdown({ tests }: { tests: CompareTestResult[] }) { +function EmptyCell() { + return
; +} + +function TestBreakdown({ tests }: { tests: CompareTestResult[] }) { return ( -
-
Test Cases
-
+
+
+ Test cases +
+
    {tests.map((t) => ( -
    - - {t.passed ? '\u2713' : '\u2717'} - +
  • + {t.test_id} - {Math.round(t.score * 100)}% + + {Math.round(t.score * 100)}% + +
  • + ))} +
+
+ ); +} + +// ── Per-run view ──────────────────────────────────────────────────────── + +function PerRunView({ + data, + benchmarkId, + readOnly, +}: { + data: CompareResponse; + benchmarkId?: string; + readOnly: boolean; +}) { + const runs = data.runs ?? []; + const [selected, setSelected] = useState>(new Set()); + const [showingCompare, setShowingCompare] = useState(false); + const [editingRunId, setEditingRunId] = useState(null); + + const toggleSelect = (runId: string) => { + setSelected((prev) => { + const next = new Set(prev); + if (next.has(runId)) next.delete(runId); + else next.add(runId); + return next; + }); + }; + + const clearSelection = () => setSelected(new Set()); + + const selectedRuns = useMemo(() => runs.filter((r) => selected.has(r.run_id)), [runs, selected]); + + if (runs.length === 0) { + return ; + } + + if (showingCompare && selectedRuns.length >= 2) { + return setShowingCompare(false)} />; + } + + return ( +
+
+ + {runs.length} run + {runs.length === 1 ? '' : 's'} — select two or more to compare side-by-side + +
+ +
+
Target + Target ↓ / Experiment → + + {exp}
{target} - {cell ? ( - - ) : ( -
- -- -
- )} +
+ {cell ? : }
+ + + + + + + + + + + + + {runs.map((run) => ( + toggleSelect(run.run_id)} + editing={editingRunId === run.run_id} + onStartEdit={() => setEditingRunId(run.run_id)} + onEndEdit={() => setEditingRunId(null)} + benchmarkId={benchmarkId} + readOnly={readOnly} + /> + ))} + +
+ TimestampTagsExperimentTargetTestsPass rateAvg
+
+ + {selected.size > 0 && ( +
+
+ {selected.size}{' '} + selected
+
+ + +
+
+ )} +
+ ); +} + +function PerRunRow({ + run, + checked, + onToggle, + editing, + onStartEdit, + onEndEdit, + benchmarkId, + readOnly, +}: { + run: CompareRunEntry; + checked: boolean; + onToggle: () => void; + editing: boolean; + onStartEdit: () => void; + onEndEdit: () => void; + benchmarkId?: string; + readOnly: boolean; +}) { + const avgPct = Math.round(run.avg_score * 100); + const canEdit = !readOnly && run.source !== 'remote'; + const tagsBtnRef = useRef(null); + const tags = run.tags ?? []; + const runLabel = tags[0] ?? run.run_id; + + // Restore focus to the tags trigger button once the inline editor closes, + // so keyboard users don't lose their place in the table. + const wasEditing = useRef(editing); + useEffect(() => { + if (wasEditing.current && !editing) { + tagsBtnRef.current?.focus(); + } + wasEditing.current = editing; + }, [editing]); + + return ( + <> + + + + + +
+ {formatTimestamp(run.started_at)} +
+
+ {shortenRunId(run.run_id)} +
+ + + {canEdit ? ( + + ) : tags.length > 0 ? ( +
+ {tags.map((t) => ( + + {t} + + ))} +
+ ) : ( + + )} + + {run.experiment} + {run.target} + + {run.eval_count} + + + + + {avgPct}% + + {editing && ( + + + + + + )} + + ); +} + +/** + * Inline chip-based tag editor. + * + * Local state: a `string[]` staged edit of the run's tags. Chips show the + * current staged tags; an input at the end accepts new tags (commit with + * Enter or comma, delete the last chip with Backspace on an empty input). + * Save persists the whole array; Cancel / Escape discards. + * + * The backend's `writeRunTags` handles deduplication, length limits, and + * control-character rejection, so we only lightly normalize in the UI + * (trim + skip duplicates already in the staged array). + */ +function TagsEditor({ + runId, + currentTags, + benchmarkId, + onClose, +}: { + runId: string; + currentTags: string[]; + benchmarkId?: string; + onClose: () => void; +}) { + const [tags, setTags] = useState(currentTags); + const [input, setInput] = useState(''); + const [err, setErr] = useState(null); + const qc = useQueryClient(); + const inputRef = useRef(null); + + useEffect(() => { + inputRef.current?.focus(); + }, []); + + const saveMut = useMutation({ + mutationFn: () => saveRunTagsApi(runId, tags, benchmarkId), + onSuccess: () => { + qc.invalidateQueries({ queryKey: ['compare'] }); + qc.invalidateQueries({ queryKey: ['runs'] }); + if (benchmarkId) { + qc.invalidateQueries({ queryKey: ['benchmarks', benchmarkId, 'compare'] }); + qc.invalidateQueries({ queryKey: ['benchmarks', benchmarkId, 'runs'] }); + } + onClose(); + }, + onError: (e: Error) => setErr(e.message), + }); + + const clearMut = useMutation({ + mutationFn: () => deleteRunTagsApi(runId, benchmarkId), + onSuccess: () => { + qc.invalidateQueries({ queryKey: ['compare'] }); + qc.invalidateQueries({ queryKey: ['runs'] }); + if (benchmarkId) { + qc.invalidateQueries({ queryKey: ['benchmarks', benchmarkId, 'compare'] }); + qc.invalidateQueries({ queryKey: ['benchmarks', benchmarkId, 'runs'] }); + } + onClose(); + }, + onError: (e: 
Error) => setErr(e.message), + }); + + const busy = saveMut.isPending || clearMut.isPending; + const hasChanges = + tags.length !== currentTags.length || tags.some((t, i) => t !== currentTags[i]); + + const commitInput = () => { + const trimmed = input.trim(); + if (trimmed === '') return; + if (tags.includes(trimmed)) { + setInput(''); + return; + } + setTags([...tags, trimmed]); + setInput(''); + setErr(null); + }; + + const removeTag = (tag: string) => { + setTags(tags.filter((t) => t !== tag)); + }; + + return ( +
+
+ Tag run + + Multi-valued. Enter or comma adds; Backspace removes the last chip. + +
+
+ {tags.map((t) => ( + + {t} + + ))} + { + setErr(null); + setInput(e.target.value); + }} + maxLength={60} + disabled={busy} + onKeyDown={(e) => { + if (e.key === 'Enter' || e.key === ',') { + e.preventDefault(); + commitInput(); + } else if (e.key === 'Backspace' && input === '' && tags.length > 0) { + e.preventDefault(); + setTags(tags.slice(0, -1)); + } else if (e.key === 'Escape') { + onClose(); + } + }} + onBlur={commitInput} + /> +
+
+ + {currentTags.length > 0 && ( + + )} +
+ {err && ( +
+ {err} +
+ )} +
+ ); +} + +function PerRunCompareView({ + runs, + onBack, +}: { + runs: CompareRunEntry[]; + onBack: () => void; +}) { + // Collect all test ids across selected runs (stable order: first run's order, then any extras) + const testIds = useMemo(() => { + const seen = new Set(); + const order: string[] = []; + for (const run of runs) { + for (const t of run.tests) { + if (!seen.has(t.test_id)) { + seen.add(t.test_id); + order.push(t.test_id); + } + } + } + return order; + }, [runs]); + + const testLookup = useMemo(() => { + return runs.map((run) => { + const m = new Map(); + for (const t of run.tests) m.set(t.test_id, t); + return m; + }); + }, [runs]); + + return ( +
+
+ + + {runs.length} runs · {testIds.length} tests + +
+
+ + + + + {runs.map((run) => ( + + ))} + + + + {runs.map((run) => ( + + ))} + + + + {testIds.map((tid) => ( + + + {testLookup.map((lookup, idx) => { + const t = lookup.get(tid); + const runId = runs[idx].run_id; + if (!t) { + return ( + + ); + } + return ( + + ); + })} + + ))} + +
+ Test case + + +
+ Pass rate + + +
+ {tid} + + — + +
+ + + {Math.round(t.score * 100)}% + +
+
+
+
+ ); +} + +function RunColumnHeader({ run }: { run: CompareRunEntry }) { + const tags = run.tags ?? []; + return ( +
+
+ {formatTimestamp(run.started_at)} +
+ {tags.length > 0 && ( +
+ {tags.map((t) => ( + + {t} + + ))} +
+ )} +
+ {run.experiment} · {run.target} +
+
+ ); +} + +// ── Shared bits ───────────────────────────────────────────────────────── + +function Legend() { + return ( +
+ + + + +
+ ); +} + +function LegendSwatch({ className, label }: { className: string; label: string }) { + return ( + + + {label} + + ); +} + +function ErrorPanel({ message }: { message: string }) { + return ( +
+ {message} +
+ ); +} + +function EmptyState() { + return ( + + ); +} + +function Notice({ headline, body }: { headline: string; body: string }) { + return ( +
+

{headline}

+

{body}

); } @@ -216,15 +920,44 @@ function LoadingSkeleton() {
- {['sk-1', 'sk-2', 'sk-3'].map((id) => ( -
+ {['sk-1', 'sk-2', 'sk-3', 'sk-4', 'sk-5'].map((id) => ( +
+
+
+
+
-
-
-
))}
); } + +// ── Helpers ───────────────────────────────────────────────────────────── + +/** Format an ISO timestamp for row / column display. */ +function formatTimestamp(iso: string): string { + try { + const d = new Date(iso); + if (Number.isNaN(d.getTime())) return iso; + const y = d.getFullYear(); + const mo = String(d.getMonth() + 1).padStart(2, '0'); + const da = String(d.getDate()).padStart(2, '0'); + const h = String(d.getHours()).padStart(2, '0'); + const mi = String(d.getMinutes()).padStart(2, '0'); + return `${y}-${mo}-${da} ${h}:${mi}`; + } catch { + return iso; + } +} + +/** Abbreviate the run id for display (keeps the last segment). */ +function shortenRunId(id: string): string { + const parts = id.split('::'); + if (parts.length >= 2) { + const tail = parts[parts.length - 1]; + return tail.length > 22 ? `${tail.slice(0, 10)}…${tail.slice(-8)}` : tail; + } + return id.length > 22 ? `${id.slice(0, 10)}…${id.slice(-8)}` : id; +} diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index 75721d398..c770bb98a 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -28,6 +28,7 @@ import type { RunDetailResponse, RunEvalRequest, RunListResponse, + RunTagsResponse, StudioConfigResponse, SuitesResponse, TargetsResponse, @@ -437,6 +438,45 @@ export async function syncRemoteResultsApi(benchmarkId?: string): Promise; } +// ── Run tag mutations ──────────────────────────────────────────────────── + +/** + * Replace the tags on a run. Tags are stored as a sidecar `tags.json` file + * next to the run's manifest and surface as chips in the compare views. + * Pass an empty array to clear all tags (server deletes the sidecar). + */ +export async function saveRunTagsApi( + runId: string, + tags: string[], + benchmarkId?: string, +): Promise { + const url = benchmarkId + ? 
`${benchmarkApiBase(benchmarkId)}/runs/${encodeURIComponent(runId)}/tags` + : `/api/runs/${encodeURIComponent(runId)}/tags`; + const res = await fetch(url, { + method: 'PUT', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ tags }), + }); + if (!res.ok) { + const err = await res.json().catch(() => ({ error: res.statusText })); + throw new Error((err as { error?: string }).error ?? `Failed to save tags: ${res.status}`); + } + return res.json() as Promise; +} + +/** Remove the tags sidecar for a run. */ +export async function deleteRunTagsApi(runId: string, benchmarkId?: string): Promise { + const url = benchmarkId + ? `${benchmarkApiBase(benchmarkId)}/runs/${encodeURIComponent(runId)}/tags` + : `/api/runs/${encodeURIComponent(runId)}/tags`; + const res = await fetch(url, { method: 'DELETE' }); + if (!res.ok) { + const err = await res.json().catch(() => ({ error: res.statusText })); + throw new Error((err as { error?: string }).error ?? `Failed to delete tags: ${res.status}`); + } +} + export async function saveStudioConfig( config: Partial, ): Promise { diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index a395a8072..1f2d81ec6 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -19,6 +19,8 @@ export interface RunMeta { source: 'local' | 'remote'; project_id?: string; project_name?: string; + /** Optional user-assigned tags from the run's sidecar tags.json. */ + tags?: string[]; } export interface RunListResponse { @@ -148,10 +150,39 @@ export interface CompareCell { tests: CompareTestResult[]; } +/** + * A single evaluation run surfaced in the per-run compare view. + * + * Each run workspace contributes exactly one entry, independent of the + * aggregated `(experiment, target)` cells. Users select multiple runs to + * compare them side-by-side, regardless of whether the runs share an + * experiment or target. 
+ */ +export interface CompareRunEntry { + run_id: string; + started_at: string; + experiment: string; + target: string; + tags?: string[]; + source: 'local' | 'remote'; + eval_count: number; + passed_count: number; + pass_rate: number; + avg_score: number; + tests: CompareTestResult[]; +} + export interface CompareResponse { experiments: string[]; targets: string[]; cells: CompareCell[]; + /** Per-run entries, sorted newest first. */ + runs?: CompareRunEntry[]; +} + +export interface RunTagsResponse { + tags: string[]; + updated_at: string; } export interface TargetSummary { diff --git a/apps/studio/src/routes/index.tsx b/apps/studio/src/routes/index.tsx index 4dd892ee9..136c1123a 100644 --- a/apps/studio/src/routes/index.tsx +++ b/apps/studio/src/routes/index.tsx @@ -274,7 +274,7 @@ function SingleProjectHome() { /> )} {activeTab === 'experiments' && } - {activeTab === 'compare' && } + {activeTab === 'compare' && } {activeTab === 'targets' && } {!isReadOnly && setShowRunEval(false)} />} @@ -282,9 +282,17 @@ function SingleProjectHome() { ); } -function CompareTabContent() { +function CompareTabContent({ readOnly }: { readOnly: boolean }) { const { data, isLoading, isError, error } = useCompare(); - return ; + return ( + + ); } function RunsTabContent({ diff --git a/apps/studio/src/routes/projects/$benchmarkId.tsx b/apps/studio/src/routes/projects/$benchmarkId.tsx index 1d0660cb4..b8834dbb2 100644 --- a/apps/studio/src/routes/projects/$benchmarkId.tsx +++ b/apps/studio/src/routes/projects/$benchmarkId.tsx @@ -91,7 +91,9 @@ function ProjectHomePage() { {activeTab === 'runs' && } {activeTab === 'experiments' && } - {activeTab === 'compare' && } + {activeTab === 'compare' && ( + + )} {activeTab === 'targets' && } {!isReadOnly && ( @@ -209,9 +211,24 @@ function ProjectExperimentsTab({ benchmarkId }: { benchmarkId: string }) { ); } -function ProjectCompareTab({ benchmarkId }: { benchmarkId: string }) { +function ProjectCompareTab({ + benchmarkId, + readOnly, +}: { + 
benchmarkId: string; + readOnly: boolean; +}) { const { data, isLoading, isError, error } = useQuery(benchmarkCompareOptions(benchmarkId)); - return ; + return ( + + ); } function ProjectTargetsTab({ benchmarkId }: { benchmarkId: string }) { diff --git a/apps/web/src/assets/screenshots/studio-compare-aggregated.png b/apps/web/src/assets/screenshots/studio-compare-aggregated.png new file mode 100644 index 000000000..a1814ac36 Binary files /dev/null and b/apps/web/src/assets/screenshots/studio-compare-aggregated.png differ diff --git a/apps/web/src/assets/screenshots/studio-compare-per-run.png b/apps/web/src/assets/screenshots/studio-compare-per-run.png new file mode 100644 index 000000000..1b3a452ce Binary files /dev/null and b/apps/web/src/assets/screenshots/studio-compare-per-run.png differ diff --git a/apps/web/src/assets/screenshots/studio-compare-side-by-side.png b/apps/web/src/assets/screenshots/studio-compare-side-by-side.png new file mode 100644 index 000000000..584514af4 Binary files /dev/null and b/apps/web/src/assets/screenshots/studio-compare-side-by-side.png differ diff --git a/apps/web/src/assets/screenshots/studio-compare.png b/apps/web/src/assets/screenshots/studio-compare.png deleted file mode 100644 index 78c8690de..000000000 Binary files a/apps/web/src/assets/screenshots/studio-compare.png and /dev/null differ diff --git a/apps/web/src/content/docs/docs/tools/studio.mdx b/apps/web/src/content/docs/docs/tools/studio.mdx index 57dd61ff1..ef31b0ab3 100644 --- a/apps/web/src/content/docs/docs/tools/studio.mdx +++ b/apps/web/src/content/docs/docs/tools/studio.mdx @@ -11,7 +11,9 @@ import studioRunDetail from '../../../../assets/screenshots/studio-run-detail.pn import studioExperiments from '../../../../assets/screenshots/studio-experiments.png'; import studioProjects from '../../../../assets/screenshots/studio-projects.png'; import studioProjectsMulti from '../../../../assets/screenshots/studio-projects-multi.png'; -import studioCompare from 
'../../../../assets/screenshots/studio-compare.png'; +import studioCompareAggregated from '../../../../assets/screenshots/studio-compare-aggregated.png'; +import studioComparePerRun from '../../../../assets/screenshots/studio-compare-per-run.png'; +import studioCompareSideBySide from '../../../../assets/screenshots/studio-compare-side-by-side.png'; import studioRunsBench from '../../../../assets/screenshots/studio-runs-bench.png'; The `studio` command launches a web-based dashboard for browsing evaluation runs, inspecting individual test results, and reviewing scores. It shows both local runs and runs synced from a remote results repository. @@ -53,7 +55,7 @@ agentv studio .agentv/results/runs/2026-03-30T11-45-56-989Z - **Targets** — group runs by target (model/agent) - **Run Detail** — drill into a run to see per-test results, scores, and evaluator output - **Human Review** — add feedback annotations to individual test results -- **Comparison Matrix** — experiment × target matrix showing pass rates across dimensions +- **Compare** — two modes: an aggregated experiment × target matrix, and a per-run view for selecting individual runs to compare side-by-side with optional retroactive tags - **Remote Results** — sync and browse runs pushed from other machines or CI (see [Remote Results](#remote-results)) ## Run Detail @@ -68,11 +70,17 @@ The Experiments tab groups runs by experiment name so you can compare the impact AgentV Studio experiments tab comparing with_skills (100%) vs without_skills (60%) pass rates -## Comparison Matrix +## Compare -The **Compare** tab shows a cross-model, cross-experiment performance matrix. Cells are color-coded by pass rate: green (80%+), yellow (50–80%), red (below 50%). The best performer per row has an emerald ring; the worst has a red ring. Click any cell to expand per-test-case results. 
+The **Compare** tab has two modes: **Aggregated** for the classic experiment × target matrix, and **Per run** for selecting individual runs and pitting them side-by-side. Toggle between them from the mode switch on the right of the masthead. -AgentV Studio comparison matrix showing experiment vs target pass rates with color coding +AgentV Studio side-by-side comparison of two runs tagged improved-prompt and baseline, with per-test pass rates + +### Aggregated matrix + +The default view shows a cross-experiment, cross-target performance matrix. Numbers are colour-coded by pass rate — green (80%+), amber (50–80%), red (below 50%) — and each cell shows `passed/total` and the mean score. Click any cell to expand the per-test-case breakdown. + +AgentV Studio aggregated compare matrix showing experiment × target pass rates Run the same eval against multiple providers or experiment variants, then open the Compare tab: @@ -84,7 +92,25 @@ agentv eval my.EVAL.yaml --target gemini --experiment with-caching agentv studio # Compare tab shows 2x2 matrix ``` -The matrix is available per-project under the **Compare** tab. +### Per-run comparison + +Running the same `(experiment, target)` twice no longer collapses into a single cell. Switch to **Per run** mode to see every run as its own row, select two or more, and compare them head-to-head. + +AgentV Studio per-run compare mode listing individual runs with timestamps, tags, experiment, target, and pass rate + +Use per-run mode when you want to: + +- Compare back-to-back runs of the same agent + eval after a prompt or parameter tweak +- Pit a fresh run against a tagged baseline without touching the eval YAML +- Debug flakiness by inspecting two identical-configuration runs side-by-side + +Select 2+ rows with the checkboxes and click the sticky **Compare N** action to open the side-by-side view. Column headers show the run's timestamp, with any assigned tags as chips below it. 
The per-test breakdown reuses the same scoring and color tones as the aggregated matrix.
+
+### Retroactive tags
+
+Click any row's **Tags** cell to tag a run after the fact. Each run can carry multiple free-form tags (max 20, up to 60 characters each); tags are stored in a `tags.json` sidecar next to `index.jsonl` in the run workspace, so they're mutable, non-destructive, and won't touch your eval YAML or run manifest. The chip editor supports Enter/comma to commit a new tag, Backspace to remove the last chip, and **Clear all** to remove every tag (deletes the sidecar). Remote runs are read-only.
+
+Use tags to annotate ad-hoc variants, experiment cross-cuts, or status flags you didn't plan for up front — `baseline`, `v2-prompt`, `slow`, `after-retry-fix`, `regression`, etc. Unlike `experiment` — which groups runs and is baked into the JSONL at eval-run time — tags are mutable, multi-valued, and never touch the original run data.

## Benchmarks Dashboard
diff --git a/docs/plans/1037-per-run-compare.md b/docs/plans/1037-per-run-compare.md
new file mode 100644
index 000000000..cf6c9d18c
--- /dev/null
+++ b/docs/plans/1037-per-run-compare.md
@@ -0,0 +1,103 @@
+# Per-run comparison with retroactive labelling — Issue #1037
+
+## Goal
+Let Studio users compare individual runs (by timestamp / run id) side-by-side,
+independent of the current `(experiment, target)` aggregation. Optional labels
+replace timestamps in compare headers.
+
+## Data model
+
+### Sidecar label file
+- Path: `<run_dir>/label.json` next to `index.jsonl`
+- Content: `{ "label": string, "updated_at": string }`
+- Mutable, non-breaking, trivially reversible. Absent file = no label.
+ +### Wire format extension (non-breaking) +Extend `CompareResponse`: +```ts +interface CompareRunEntry { + run_id: string; // existing run id (experiment::timestamp or timestamp) + started_at: string; // first record timestamp (fallback: manifest meta) + experiment: string; + target: string; + label?: string; + eval_count: number; + passed_count: number; + pass_rate: number; + avg_score: number; + tests: CompareTestResult[]; +} + +interface CompareResponse { + experiments: string[]; + targets: string[]; + cells: CompareCell[]; // unchanged + runs: CompareRunEntry[]; // NEW +} +``` + +Extend `RunMeta` with optional `label?: string`. + +## Backend changes — apps/cli/src/commands/results/serve.ts + +1. `handleCompare` — after building cells, also build per-run entries. + Each run file → compute eval_count, passed_count, pass_rate, avg_score, + tests (cap 100). Read sidecar label. +2. `handleRuns` — already enriches RunMeta; add label sidecar lookup. +3. New `handleRunLabel` (PUT/POST) — writes `label.json`. Unscoped and + benchmark-scoped variants. +4. New `handleRunLabelDelete` (DELETE) — removes `label.json`. + +## Frontend changes + +### Types — apps/studio/src/lib/types.ts +Extend `CompareResponse`, add `CompareRunEntry`, add `label?` to `RunMeta`. + +### API hooks — apps/studio/src/lib/api.ts +- `saveRunLabel(runId, label, benchmarkId?)` — PUT mutation +- `deleteRunLabel(runId, benchmarkId?)` — DELETE mutation +- Invalidate `['compare']`, `['runs']`, `['benchmarks', id, 'compare']` + +### CompareTab redesign — apps/studio/src/components/CompareTab.tsx + +**Aesthetic direction: Editorial data-terminal** +- Display font: Fraunces (variable serif with optical sizing) — for headings +- Data font: JetBrains Mono Variable — tabular numbers, run ids, deltas +- Body: Inter-free. Use system-ui sparingly for secondary text, or DM Sans +- Palette: off-black (#0a0a0b) base, warm ivory (#f4ecd8) text, signal + accents (emerald #10b981, amber #f59e0b, rose #f43f5e). 
Hairline dividers + in warm gray (#2a2622). +- Layout: Asymmetric header (big serif title + mode toggle right-aligned). + Sharp hairline rules. Tabular number columns. Generous vertical rhythm. +- Motion: Staggered fade-in on mount (CSS `@keyframes` with animation-delay). + Hover brings subtle shadow+translate on selectable rows. Mode toggle + slides underline indicator. + +**Modes:** +1. **Aggregated** (default) — existing matrix, re-skinned with the new + aesthetic. Unchanged logic. +2. **Per run** — runs table sorted by timestamp desc with: + - Selectable checkbox (multi-select) + - Columns: `timestamp | label | experiment | target | tests | pass | avg` + - Inline "Edit label" button → popover/inline input + - Sticky footer: "Compare N selected" button (enabled when N ≥ 2) + - Opening compare view renders a side-by-side table: one column per run, + using label or formatted timestamp. Reuses `CompareMatrixCell` rendering + logic for per-test breakdown. + +## Validation plan + +1. Unit-ish: typecheck, lint, build. +2. Backend e2e: `bun apps/cli/src/cli.ts results serve --port 9100` on a + benchmark with ≥2 runs of the same (experiment, target). Hit + `/api/compare` and verify `runs[]` present. Hit `PUT /api/runs/:id/label` + and verify sidecar file is written. +3. Frontend visual: agent-browser with `--cdp 9222` on http://localhost:5173 + (or wherever studio dev runs). Screenshot aggregated mode, per-run mode, + label edit, compare view. Iterate on design until polished. + +## Out of scope +- Eval YAML schema changes +- CLI flags +- Multi-label / tag taxonomy +- Cross-project run compare