EntityProcess · christso · Apr 11, 2026 · Apr 10, 2026 · Apr 10, 2026 · Apr 11, 2026
diff --git a/apps/cli/src/commands/results/run-tags.ts b/apps/cli/src/commands/results/run-tags.ts
@@ -0,0 +1,132 @@
+/**
+ * Per-run tag sidecar file helpers.
+ *
+ * Tags are stored as a `tags.json` sidecar next to the run's `index.jsonl`
+ * manifest. The sidecar is optional, mutable, and non-breaking — absence
+ * means the run has no user-assigned tags.
+ *
+ * Wire format (stored on disk):
+ * ```json
+ * { "tags": ["baseline", "v2-prompt"], "updated_at": "2026-04-10T00:00:00.000Z" }
+ * ```
+ *
+ * Used by the Studio compare API so users can retroactively tag runs
+ * without changing the eval YAML or the run manifest itself. This mirrors
+ * the Langfuse / W&B / GitHub `tags` pattern — a mutable multi-valued
+ * list of free-form labels that lives alongside the immutable run_id.
+ *
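+ * Validation rules (applied on write):
+ *   - Each tag is trimmed; blank tags are dropped, tags over 60 characters are rejected
+ *   - No ASCII control characters (code units below 0x20, or DEL 0x7f)
+ *   - Tags are deduplicated case-sensitively, keeping the first occurrence
+ *   - A run can have at most 20 tags (counted after deduplication)
+ *   - Writing an empty array (or only blank tags) removes the sidecar file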
+ *
+ * To extend (e.g. add colored labels or descriptions): add optional fields
+ * to `RunTagsFile` and keep the schema additive so older files still parse.
+ */
+
+import { existsSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs';
+import path from 'node:path';
+
+/** File name of the tags sidecar, placed in the same directory as the run manifest. */
+export const RUN_TAGS_FILENAME = 'tags.json';
+
+/** Maximum number of tags per run, enforced on write after deduplication. */
+export const MAX_TAGS_PER_RUN = 20;
+/** Maximum length of a single tag after trimming (`String.length`, UTF-16 code units). */
+export const MAX_TAG_LENGTH = 60;
+
+export interface RunTagsFile {
+  /** User-assigned tags in insertion order; trimmed and deduplicated on write. */
+  tags: string[];
+  /** ISO-8601 timestamp of the last write; `''` when a file read from disk lacks it. */
+  updated_at: string;
+}
+
+/** Resolve the sidecar path: `tags.json` in the directory that contains `manifestPath`. */
+export function runTagsPath(manifestPath: string): string {
+  return path.join(path.dirname(manifestPath), RUN_TAGS_FILENAME);
+}
+
+/** Load a run's tags; `undefined` if the sidecar is missing, unreadable, or holds no tags. */
+export function readRunTags(manifestPath: string): RunTagsFile | undefined {
+  const sidecarPath = runTagsPath(manifestPath);
+  if (!existsSync(sidecarPath)) return undefined;
+  try {
+    const payload: unknown = JSON.parse(readFileSync(sidecarPath, 'utf8'));
+    if (typeof payload !== 'object' || payload === null) return undefined;
+    const { tags: rawTags, updated_at: stamp } = payload as Record<string, unknown>;
+    if (!Array.isArray(rawTags)) return undefined;
+    // Keep non-blank strings exactly as stored; anything else in the array is ignored.
+    const tags: string[] = [];
+    for (const item of rawTags) {
+      if (typeof item === 'string' && item.trim() !== '') tags.push(item);
+    }
+    if (tags.length === 0) return undefined;
+    return { tags, updated_at: typeof stamp === 'string' ? stamp : '' };
+  } catch {
+    // An unreadable file or malformed JSON is treated the same as "no tags".
+    return undefined;
+  }
+}
+
+/**
+ * Persist `tags` as the run's complete tag set, overwriting whatever was stored.
+ * The input is normalized first (invalid tags throw). If no tag survives, the
+ * sidecar is removed and `null` is returned; otherwise the written entry is.
+ */
+export function writeRunTags(manifestPath: string, tags: readonly string[]): RunTagsFile | null {
+  const normalized = normalizeTags(tags);
+  if (normalized.length > 0) {
+    const entry: RunTagsFile = { tags: normalized, updated_at: new Date().toISOString() };
+    // Two-space pretty-printed JSON followed by a trailing newline.
+    const serialized = `${JSON.stringify(entry, null, 2)}\n`;
+    writeFileSync(runTagsPath(manifestPath), serialized, 'utf8');
+    return entry;
+  }
+  deleteRunTags(manifestPath);
+  return null;
+}
+
+/** Delete the sidecar holding a run's tags; does nothing when no sidecar exists. */
+export function deleteRunTags(manifestPath: string): void {
+  const sidecarPath = runTagsPath(manifestPath);
+  // Check first so that a missing file is a quiet no-op rather than an unlink error.
+  if (!existsSync(sidecarPath)) return;
+  unlinkSync(sidecarPath);
+}
+
+/**
+ * Normalize an incoming tag list: trim each entry, skip blanks, and drop
+ * repeats (case-sensitive, first occurrence kept) while validating the rest.
+ *
+ * Throws an `Error` whose message is suitable for end users when an entry is
+ * not a string, exceeds `MAX_TAG_LENGTH`, contains a control character, or
+ * when more than `MAX_TAGS_PER_RUN` distinct tags remain.
+ */
+function normalizeTags(tags: readonly string[]): string[] {
+  // A Set keeps insertion order and ignores repeated adds, so it dedupes for us.
+  const unique = new Set<string>();
+  for (const candidate of tags) {
+    if (typeof candidate !== 'string') throw new Error('Tags must be strings');
+    const tag = candidate.trim();
+    if (tag.length === 0) continue;
+    if (tag.length > MAX_TAG_LENGTH) {
+      throw new Error(`Tag "${tag.slice(0, 20)}…" exceeds ${MAX_TAG_LENGTH} characters`);
+    }
+    // Control characters (newlines, tabs, DEL, …) break column headers in
+    // compare views and confuse test assertions, so they are refused.
+    for (const ch of tag) {
+      const code = ch.charCodeAt(0);
+      if (code < 0x20 || code === 0x7f) {
+        throw new Error('Tag must not contain control characters');
+      }
+    }
+    unique.add(tag);
+  }
+  if (unique.size > MAX_TAGS_PER_RUN) {
+    throw new Error(`Too many tags (max ${MAX_TAGS_PER_RUN})`);
+  }
+  return [...unique];
+}
diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
@@ -59,6 +59,7 @@ import {
   listMergedResultFiles,
   syncRemoteResults,
 } from './remote.js';
+import { deleteRunTags, readRunTags, writeRunTags } from './run-tags.js';
 import { type StudioConfig, loadStudioConfig, saveStudioConfig } from './studio-config.js';
 
 // ── Source resolution ────────────────────────────────────────────────────
@@ -273,6 +274,7 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
       } catch {
         // ignore enrichment errors
       }
+      const tagsEntry = readRunTags(m.path);
       return {
         filename: m.filename,
         display_name: m.displayName,
@@ -285,6 +287,7 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
         source: m.source,
         ...(target && { target }),
         ...(experiment && { experiment }),
+        ...(tagsEntry && { tags: tagsEntry.tags }),
       };
     }),
   });
@@ -551,7 +554,7 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
   const { runs: metas } = await listMergedResultFiles(searchDir);
   const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
 
-  // Collect per-test-case results keyed by experiment × target
+  // Collect per-test-case results keyed by experiment × target (aggregated view)
   const cellMap = new Map<
     string,
     {
@@ -569,17 +572,54 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
     }
   >();
 
+  // Per-run entries (per-run view). Each run workspace contributes exactly
+  // one entry, independent of the aggregated matrix.
+  const runEntries: Array<{
+    run_id: string;
+    started_at: string;
+    experiment: string;
+    target: string;
+    tags?: string[];
+    source: 'local' | 'remote';
+    eval_count: number;
+    passed_count: number;
+    pass_rate: number;
+    avg_score: number;
+    tests: Array<{
+      test_id: string;
+      score: number;
+      passed: boolean;
+      execution_status?: string;
+    }>;
+  }> = [];
+
   const experimentsSet = new Set<string>();
   const targetsSet = new Set<string>();
+  const MAX_TESTS_PER_CELL = 100;
 
   for (const m of metas) {
     try {
       const records = loadLightweightResults(m.path);
+      const runTestMap = new Map<
+        string,
+        { test_id: string; score: number; passed: boolean; execution_status?: string }
+      >();
+      let runEvalCount = 0;
+      let runPassedCount = 0;
+      let runScoreSum = 0;
+      let runExperiment = 'default';
+      let runTarget = 'default';
+      let runStartedAt = m.timestamp;
+
       for (const r of records) {
         const experiment = r.experiment ?? 'default';
         const target = r.target ?? 'default';
         experimentsSet.add(experiment);
         targetsSet.add(target);
+        runExperiment = experiment;
+        runTarget = target;
+        if (r.timestamp && r.timestamp < runStartedAt) runStartedAt = r.timestamp;
+
         const key = JSON.stringify([experiment, target]);
         const entry = cellMap.get(key) ?? {
           experiment,
@@ -600,14 +640,41 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
           execution_status: r.executionStatus,
         });
         cellMap.set(key, entry);
+
+        // Per-run accumulation. Dedupe tests within the run by last-wins.
+        runTestMap.set(r.testId, {
+          test_id: r.testId,
+          score: r.score,
+          passed,
+          execution_status: r.executionStatus,
+        });
+        runEvalCount++;
+        if (passed) runPassedCount++;
+        runScoreSum += r.score;
       }
+
+      if (runEvalCount === 0) continue;
+
+      const runTests = [...runTestMap.values()].slice(-MAX_TESTS_PER_CELL);
+      const tagsEntry = readRunTags(m.path);
+      runEntries.push({
+        run_id: m.filename,
+        started_at: runStartedAt,
+        experiment: runExperiment,
+        target: runTarget,
+        ...(tagsEntry && { tags: tagsEntry.tags }),
+        source: m.source,
+        eval_count: runEvalCount,
+        passed_count: runPassedCount,
+        pass_rate: runPassedCount / runEvalCount,
+        avg_score: runScoreSum / runEvalCount,
+        tests: runTests,
+      });
     } catch {
       // skip runs that fail to load
     }
   }
 
-  const MAX_TESTS_PER_CELL = 100;
-
   const cells = [...cellMap.values()].map((entry) => {
     // Deduplicate tests: keep only the latest entry per test_id (last wins by insertion order)
     const dedupMap = new Map<string, (typeof entry.tests)[number]>();
@@ -630,10 +697,14 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
     };
   });
 
+  // Per-run entries sorted by timestamp descending (newest first).
+  runEntries.sort((a, b) => b.started_at.localeCompare(a.started_at));
+
   return c.json({
     experiments: [...experimentsSet].sort(),
     targets: [...targetsSet].sort(),
     cells,
+    runs: runEntries,
   });
 }
 
@@ -702,6 +773,52 @@ function handleFeedbackRead(c: C, { searchDir }: DataContext) {
   return c.json(readFeedback(existsSync(resultsDir) ? resultsDir : searchDir));
 }
 
+/**
+ * PUT tags handler: replaces a local run's tags with the `tags` array from the JSON body.
+ * 404: unknown run. 400: remote run, malformed payload, or invalid tag. 500: sidecar I/O error.
+ */
+async function handleRunTagsPut(c: C, { searchDir }: DataContext) {
+  const filename = c.req.param('filename') ?? '';
+  const meta = await findRunById(searchDir, filename);
+  if (!meta) return c.json({ error: 'Run not found' }, 404);
+  if (meta.source === 'remote') {
+    return c.json({ error: 'Tags can only be set on local runs' }, 400);
+  }
+  let body: unknown;
+  try {
+    body = await c.req.json();
+  } catch {
+    return c.json({ error: 'Invalid JSON' }, 400);
+  }
+  if (!body || typeof body !== 'object') return c.json({ error: 'Invalid payload' }, 400);
+  const tags = (body as Record<string, unknown>).tags;
+  if (!Array.isArray(tags)) return c.json({ error: 'Missing tags array' }, 400);
+  try {
+    const entry = writeRunTags(meta.path, tags as string[]); // elements are validated inside
+    const updatedAt = entry?.updated_at ?? new Date().toISOString();
+    return c.json({ tags: entry?.tags ?? [], updated_at: updatedAt });
+  } catch (err) {
+    // Filesystem errors carry an errno `code` (→ 500); tag validation errors do not (→ 400).
+    const status = err instanceof Error && 'code' in err ? 500 : 400;
+    return c.json({ error: err instanceof Error ? err.message : String(err) }, status);
+  }
+}
+
+/** DELETE tags handler: removes a local run's tags sidecar (404 unknown, 400 remote, 500 on throw). */
+async function handleRunTagsDelete(c: C, { searchDir }: DataContext) {
+  const run = await findRunById(searchDir, c.req.param('filename') ?? '');
+  if (!run) return c.json({ error: 'Run not found' }, 404);
+  if (run.source === 'remote') {
+    return c.json({ error: 'Tags can only be removed on local runs' }, 400);
+  }
+  try {
+    deleteRunTags(run.path);
+    return c.json({ ok: true });
+  } catch (err) {
+    return c.json({ error: (err as Error).message }, 500);
+  }
+}
+
 // ── Hono app factory ─────────────────────────────────────────────────────
 
 /**
@@ -934,6 +1051,18 @@ export function createApp(
   app.get('/api/remote/status', async (c) => c.json(await getRemoteResultsStatus(searchDir)));
   app.post('/api/remote/sync', async (c) => c.json(await syncRemoteResults(searchDir)));
   app.get('/api/runs', (c) => handleRuns(c, defaultCtx));
+  app.put('/api/runs/:filename/tags', (c) => {
+    if (readOnly) {
+      return c.json({ error: 'Studio is running in read-only mode' }, 403);
+    }
+    return handleRunTagsPut(c, defaultCtx);
+  });
+  app.delete('/api/runs/:filename/tags', (c) => {
+    if (readOnly) {
+      return c.json({ error: 'Studio is running in read-only mode' }, 403);
+    }
+    return handleRunTagsDelete(c, defaultCtx);
+  });
   app.get('/api/runs/:filename', (c) => handleRunDetail(c, defaultCtx));
   app.get('/api/runs/:filename/suites', (c) => handleRunSuites(c, defaultCtx));
   app.get('/api/runs/:filename/categories', (c) => handleRunCategories(c, defaultCtx));
@@ -1046,6 +1175,18 @@ export function createApp(
     withBenchmark(c, async (ctx, dataCtx) => ctx.json(await syncRemoteResults(dataCtx.searchDir))),
   );
   app.get('/api/benchmarks/:benchmarkId/runs', (c) => withBenchmark(c, handleRuns));
+  app.put('/api/benchmarks/:benchmarkId/runs/:filename/tags', (c) => {
+    if (readOnly) {
+      return c.json({ error: 'Studio is running in read-only mode' }, 403);
+    }
+    return withBenchmark(c, handleRunTagsPut);
+  });
+  app.delete('/api/benchmarks/:benchmarkId/runs/:filename/tags', (c) => {
+    if (readOnly) {
+      return c.json({ error: 'Studio is running in read-only mode' }, 403);
+    }
+    return withBenchmark(c, handleRunTagsDelete);
+  });
   app.get('/api/benchmarks/:benchmarkId/runs/:filename', (c) => withBenchmark(c, handleRunDetail));
   app.get('/api/benchmarks/:benchmarkId/runs/:filename/suites', (c) =>
     withBenchmark(c, handleRunSuites),