diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 15ec4f4aa..354990faa 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -554,6 +554,19 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { const { runs: metas } = await listMergedResultFiles(searchDir); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); + // Optional tag filter: `?tags=baseline,v2-prompt` keeps only runs that + // carry at least one of the given tags (OR semantics). Empty / missing + // param is a no-op. Filtering is applied before aggregation so it + // propagates through `cells[]`, `runs[]`, `experiments[]`, and + // `targets[]` uniformly. + const tagsParam = c.req.query('tags') ?? ''; + const filterTags = new Set( + tagsParam + .split(',') + .map((t) => t.trim()) + .filter(Boolean), + ); + // Collect per-test-case results keyed by experiment × target (aggregated view) const cellMap = new Map< string, @@ -599,6 +612,14 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { for (const m of metas) { try { + // Read tags before any heavy work so the `?tags=` filter can skip + // non-matching runs without loading their JSONL records. + const tagsEntry = readRunTags(m.path); + if (filterTags.size > 0) { + const runTags = tagsEntry?.tags ?? []; + if (!runTags.some((t) => filterTags.has(t))) continue; + } + const records = loadLightweightResults(m.path); const runTestMap = new Map< string, @@ -656,7 +677,6 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { if (runEvalCount === 0) continue; const runTests = [...runTestMap.values()].slice(-MAX_TESTS_PER_CELL); - const tagsEntry = readRunTags(m.path); runEntries.push({ run_id: m.filename, started_at: runStartedAt, diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 14007b406..6b79b5c19 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -634,6 +634,158 @@ describe('serve app', () => { }); }); + // ── GET /api/compare (tag filter) ─────────────────────────────────── + + describe('GET /api/compare', () => { + function seedCompareFixture() { + // Four runs, each in its own run workspace, with the tags documented + // below. This setup exercises the OR filter semantics used by + // `/api/compare?tags=`. + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); + mkdirSync(runsDir, { recursive: true }); + + const runs: Array<{ + name: string; + experiment: string; + target: string; + score: number; + tags?: string[]; + }> = [ + { + name: '2026-04-01T10-00-00-000Z', + experiment: 'exp-a', + target: 'gpt-4o', + score: 1.0, + tags: ['baseline'], + }, + { + name: '2026-04-02T10-00-00-000Z', + experiment: 'exp-a', + target: 'claude', + score: 0.9, + tags: ['baseline'], + }, + { + name: '2026-04-03T10-00-00-000Z', + experiment: 'exp-b', + target: 'gpt-4o', + score: 0.85, + tags: ['v2-prompt'], + }, + { + // Intentionally untagged — should never match any tag filter. + name: '2026-04-04T10-00-00-000Z', + experiment: 'exp-b', + target: 'claude', + score: 0.7, + }, + ]; + + for (const run of runs) { + const runDir = path.join(runsDir, run.name); + mkdirSync(runDir, { recursive: true }); + writeFileSync( + path.join(runDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + test_id: `test-${run.name}`, + experiment: run.experiment, + target: run.target, + score: run.score, + }), + ); + if (run.tags && run.tags.length > 0) { + writeFileSync( + path.join(runDir, 'tags.json'), + `${JSON.stringify({ tags: run.tags, updated_at: '2026-04-10T00:00:00.000Z' }, null, 2)}\n`, + ); + } + } + } + + type CompareJson = { + experiments: string[]; + targets: string[]; + cells: Array<{ experiment: string; target: string; eval_count: number }>; + runs?: Array<{ run_id: string; experiment: string; target: string; tags?: string[] }>; + }; + + it('returns all runs when no filter is provided', async () => { + seedCompareFixture(); + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const res = await app.request('/api/compare'); + expect(res.status).toBe(200); + const data = (await res.json()) as CompareJson; + + expect(data.runs).toHaveLength(4); + expect(data.experiments.sort()).toEqual(['exp-a', 'exp-b']); + expect(data.targets.sort()).toEqual(['claude', 'gpt-4o']); + expect(data.cells).toHaveLength(4); + }); + + it('filters to a single tag', async () => { + seedCompareFixture(); + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const res = await app.request('/api/compare?tags=baseline'); + expect(res.status).toBe(200); + const data = (await res.json()) as CompareJson; + + expect(data.runs).toHaveLength(2); + for (const run of data.runs ?? []) { + expect(run.tags ?? []).toContain('baseline'); + } + // Only exp-a is represented; targets narrow to the two used by exp-a runs. + expect(data.experiments).toEqual(['exp-a']); + expect(data.targets.sort()).toEqual(['claude', 'gpt-4o']); + expect(data.cells).toHaveLength(2); + }); + + it('applies OR semantics across multiple tags', async () => { + seedCompareFixture(); + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const res = await app.request('/api/compare?tags=baseline,v2-prompt'); + expect(res.status).toBe(200); + const data = (await res.json()) as CompareJson; + + // Three tagged runs; the untagged run is excluded. + expect(data.runs).toHaveLength(3); + expect(data.experiments.sort()).toEqual(['exp-a', 'exp-b']); + // (exp-a, gpt-4o), (exp-a, claude), (exp-b, gpt-4o) — the (exp-b, claude) + // cell is missing because the only contributing run was untagged. + expect(data.cells).toHaveLength(3); + const cellKeys = (data.cells ?? []).map((c) => `${c.experiment}::${c.target}`).sort(); + expect(cellKeys).toEqual(['exp-a::claude', 'exp-a::gpt-4o', 'exp-b::gpt-4o']); + }); + + it('returns empty payload when no runs match the filter', async () => { + seedCompareFixture(); + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const res = await app.request('/api/compare?tags=nonexistent'); + expect(res.status).toBe(200); + const data = (await res.json()) as CompareJson; + + expect(data.runs).toEqual([]); + expect(data.cells).toEqual([]); + expect(data.experiments).toEqual([]); + expect(data.targets).toEqual([]); + }); + + it('ignores whitespace and empty segments in the tags query', async () => { + seedCompareFixture(); + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + // ` , baseline , ` should parse to just ['baseline']. + const res = await app.request('/api/compare?tags=%20,%20baseline%20,%20'); + expect(res.status).toBe(200); + const data = (await res.json()) as CompareJson; + expect(data.runs).toHaveLength(2); + }); + }); + // ── SPA fallback ────────────────────────────────────────────────────── describe('SPA fallback', () => { diff --git a/apps/studio/src/components/CompareTab.tsx b/apps/studio/src/components/CompareTab.tsx index 188d0c413..ffbe73f1b 100644 --- a/apps/studio/src/components/CompareTab.tsx +++ b/apps/studio/src/components/CompareTab.tsx @@ -55,7 +55,99 @@ export function CompareTab({ readOnly, }: CompareTabProps) { const [mode, setMode] = useState('aggregated'); - const runsCount = data?.runs?.length ?? 0; + const [filterTags, setFilterTags] = useState([]); + + // Chip list is derived from the UNFILTERED response so chips stay visible + // even when the active filter would otherwise hide the runs that supplied + // them. Sorted alphabetically for stable UI. + const { allTags, tagCounts } = useMemo(() => { + const counts = new Map(); + for (const run of data?.runs ?? []) { + for (const tag of run.tags ?? []) { + counts.set(tag, (counts.get(tag) ?? 0) + 1); + } + } + return { allTags: [...counts.keys()].sort(), tagCounts: counts }; + }, [data?.runs]); + + // When a filter is active, re-aggregate cells/runs client-side from the + // filtered subset of runs. This avoids a network round-trip on every chip + // click and keeps the backend responsible only for the initial fetch. + // Safe because the server already exposes per-run totals (`eval_count`, + // `passed_count`, `avg_score`); we just sum them per (experiment, target) + // bucket, weighting averages by run eval_count. + const filteredData = useMemo(() => { + if (!data) return data; + if (filterTags.length === 0) return data; + const filterSet = new Set(filterTags); + const filteredRuns = (data.runs ?? []).filter((r) => + (r.tags ?? []).some((t) => filterSet.has(t)), + ); + + type CellAccum = { + experiment: string; + target: string; + eval_count: number; + passed_count: number; + score_sum: number; + tests: CompareTestResult[]; + }; + const cellMap = new Map(); + const experimentsSet = new Set(); + const targetsSet = new Set(); + + for (const run of filteredRuns) { + experimentsSet.add(run.experiment); + targetsSet.add(run.target); + const key = `${run.experiment}::${run.target}`; + const entry = cellMap.get(key) ?? { + experiment: run.experiment, + target: run.target, + eval_count: 0, + passed_count: 0, + score_sum: 0, + tests: [], + }; + entry.eval_count += run.eval_count; + entry.passed_count += run.passed_count; + entry.score_sum += run.avg_score * run.eval_count; + for (const t of run.tests) entry.tests.push(t); + cellMap.set(key, entry); + } + + const cells: CompareCell[] = [...cellMap.values()].map((e) => { + // Dedupe tests by test_id, last-wins (same pattern as the server). + const dedup = new Map(); + for (const t of e.tests) dedup.set(t.test_id, t); + return { + experiment: e.experiment, + target: e.target, + eval_count: e.eval_count, + passed_count: e.passed_count, + pass_rate: e.eval_count > 0 ? e.passed_count / e.eval_count : 0, + avg_score: e.eval_count > 0 ? e.score_sum / e.eval_count : 0, + tests: [...dedup.values()].slice(-100), + }; + }); + + return { + ...data, + experiments: [...experimentsSet].sort(), + targets: [...targetsSet].sort(), + cells, + runs: filteredRuns, + }; + }, [data, filterTags]); + + const toggleFilterTag = (tag: string) => { + setFilterTags((prev) => (prev.includes(tag) ? prev.filter((x) => x !== tag) : [...prev, tag])); + }; + const clearFilterTags = () => setFilterTags([]); + + const runsCount = filteredData?.runs?.length ?? 0; + const underlyingHasData = data && data.cells.length > 0; + const filterYieldsNoRuns = + filterTags.length > 0 && filteredData && (filteredData.runs?.length ?? 0) === 0; return (
@@ -65,12 +157,37 @@ export function CompareTab({ {!isLoading && isError && error && ( )} - {!isLoading && !isError && (!data || data.cells.length === 0) && } - {!isLoading && !isError && data && data.cells.length > 0 && ( + {!isLoading && !isError && !underlyingHasData && } + {!isLoading && !isError && underlyingHasData && ( <> - {mode === 'aggregated' && } - {mode === 'per-run' && ( - + {allTags.length > 0 && ( + + )} + {filterYieldsNoRuns ? ( + `\`${t}\``).join(' + ')}`} + body="Clear the filter or pick a different tag combination." + action={{ label: 'Clear filter', onClick: clearFilterTags }} + /> + ) : ( + filteredData && ( + <> + {mode === 'aggregated' && } + {mode === 'per-run' && ( + + )} + + ) )} )} @@ -78,6 +195,72 @@ export function CompareTab({ ); } +// ── Tag filter bar ────────────────────────────────────────────────────── + +function TagFilterBar({ + allTags, + tagCounts, + selected, + onToggle, + onClear, +}: { + allTags: string[]; + tagCounts: Map; + selected: string[]; + onToggle: (tag: string) => void; + onClear: () => void; +}) { + const selectedSet = new Set(selected); + const anySelected = selected.length > 0; + return ( +
+
+ + Filter by tag + + {allTags.map((tag) => { + const isActive = selectedSet.has(tag); + const count = tagCounts.get(tag) ?? 0; + return ( + + ); + })} + {anySelected && ( + + )} +
+

+ Showing runs with any selected tag. +

+
+ ); +} + // ── Header ────────────────────────────────────────────────────────────── function Header({ @@ -904,11 +1087,28 @@ function EmptyState() { ); } -function Notice({ headline, body }: { headline: string; body: string }) { +function Notice({ + headline, + body, + action, +}: { + headline: string; + body: string; + action?: { label: string; onClick: () => void }; +}) { return (

{headline}

{body}

+ {action && ( + + )}
); } diff --git a/apps/web/src/content/docs/docs/tools/studio.mdx b/apps/web/src/content/docs/docs/tools/studio.mdx index ef31b0ab3..e361722ec 100644 --- a/apps/web/src/content/docs/docs/tools/studio.mdx +++ b/apps/web/src/content/docs/docs/tools/studio.mdx @@ -112,6 +112,12 @@ Click any row's **Tags** cell to tag a run after the fact. Each run can carry mu Use tags to annotate ad-hoc variants, experiment cross-cuts, or status flags you didn't plan for up front — `baseline`, `v2-prompt`, `slow`, `after-retry-fix`, `regression`, etc. Unlike `experiment` — which groups runs and is baked into the JSONL at eval-run time — tags are mutable, multi-valued, and never touch the original run data. +### Filtering by tag + +Once runs are tagged, a chip row appears above the compare view listing every distinct tag with a usage count. Click a chip to narrow both the aggregated matrix and the per-run table to runs carrying at least one of the selected tags (OR semantics — clicking a second chip widens the set). A **Clear** link resets the filter, and filter selections persist as you switch between Aggregated and Per-run modes. + +The same filter is available to API consumers via `GET /api/compare?tags=baseline,v2-prompt`, which returns only the cells and runs whose tags intersect the query. + ## Benchmarks Dashboard