diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
index 15ec4f4aa..354990faa 100644
--- a/apps/cli/src/commands/results/serve.ts
+++ b/apps/cli/src/commands/results/serve.ts
@@ -554,6 +554,19 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
   const { runs: metas } = await listMergedResultFiles(searchDir);
   const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
 
+  // Optional tag filter: `?tags=baseline,v2-prompt` keeps only runs that
+  // carry at least one of the given tags (OR semantics). Empty / missing
+  // param is a no-op. Filtering is applied before aggregation so it
+  // propagates through `cells[]`, `runs[]`, `experiments[]`, and
+  // `targets[]` uniformly.
+  const tagsParam = c.req.query('tags') ?? '';
+  const filterTags = new Set(
+    tagsParam
+      .split(',')
+      .map((t) => t.trim())
+      .filter(Boolean),
+  );
+
   // Collect per-test-case results keyed by experiment × target (aggregated view)
   const cellMap = new Map<
     string,
@@ -599,6 +612,14 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
 
   for (const m of metas) {
     try {
+      // Read tags before any heavy work so the `?tags=` filter can skip
+      // non-matching runs without loading their JSONL records.
+      const tagsEntry = readRunTags(m.path);
+      if (filterTags.size > 0) {
+        const runTags = tagsEntry?.tags ?? [];
+        if (!runTags.some((t) => filterTags.has(t))) continue;
+      }
+
       const records = loadLightweightResults(m.path);
       const runTestMap = new Map<
         string,
@@ -656,7 +677,6 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
       if (runEvalCount === 0) continue;
 
       const runTests = [...runTestMap.values()].slice(-MAX_TESTS_PER_CELL);
-      const tagsEntry = readRunTags(m.path);
       runEntries.push({
         run_id: m.filename,
         started_at: runStartedAt,
diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts
index 14007b406..6b79b5c19 100644
--- a/apps/cli/test/commands/results/serve.test.ts
+++ b/apps/cli/test/commands/results/serve.test.ts
@@ -634,6 +634,158 @@ describe('serve app', () => {
     });
   });
 
+  // ── GET /api/compare (tag filter) ───────────────────────────────────
+
+  describe('GET /api/compare', () => {
+    function seedCompareFixture() {
+      // Four runs, one directory each under `.agentv/results/runs`: two tagged
+      // `baseline`, one tagged `v2-prompt`, one untagged (no tags.json written).
+      // Together they exercise the OR semantics of `/api/compare?tags=`.
+      const runsDir = path.join(tempDir, '.agentv', 'results', 'runs');
+      mkdirSync(runsDir, { recursive: true });
+
+      const runs: Array<{
+        name: string;
+        experiment: string;
+        target: string;
+        score: number;
+        tags?: string[];
+      }> = [
+        {
+          name: '2026-04-01T10-00-00-000Z',
+          experiment: 'exp-a',
+          target: 'gpt-4o',
+          score: 1.0,
+          tags: ['baseline'],
+        },
+        {
+          name: '2026-04-02T10-00-00-000Z',
+          experiment: 'exp-a',
+          target: 'claude',
+          score: 0.9,
+          tags: ['baseline'],
+        },
+        {
+          name: '2026-04-03T10-00-00-000Z',
+          experiment: 'exp-b',
+          target: 'gpt-4o',
+          score: 0.85,
+          tags: ['v2-prompt'],
+        },
+        {
+          // Intentionally untagged — should never match any tag filter.
+          name: '2026-04-04T10-00-00-000Z',
+          experiment: 'exp-b',
+          target: 'claude',
+          score: 0.7,
+        },
+      ];
+
+      for (const run of runs) {
+        const runDir = path.join(runsDir, run.name);
+        mkdirSync(runDir, { recursive: true });
+        writeFileSync(
+          path.join(runDir, 'index.jsonl'),
+          toJsonl({
+            ...RESULT_A,
+            test_id: `test-${run.name}`,
+            experiment: run.experiment,
+            target: run.target,
+            score: run.score,
+          }),
+        );
+        if (run.tags && run.tags.length > 0) {
+          writeFileSync(
+            path.join(runDir, 'tags.json'),
+            `${JSON.stringify({ tags: run.tags, updated_at: '2026-04-10T00:00:00.000Z' }, null, 2)}\n`,
+          );
+        }
+      }
+    }
+
+    type CompareJson = {
+      experiments: string[];
+      targets: string[];
+      cells: Array<{ experiment: string; target: string; eval_count: number }>;
+      runs?: Array<{ run_id: string; experiment: string; target: string; tags?: string[] }>;
+    };
+
+    it('returns all runs when no filter is provided', async () => {
+      seedCompareFixture();
+      const app = createApp([], tempDir, tempDir, undefined, { studioDir });
+
+      const res = await app.request('/api/compare');
+      expect(res.status).toBe(200);
+      const data = (await res.json()) as CompareJson;
+
+      expect(data.runs).toHaveLength(4);
+      expect(data.experiments.sort()).toEqual(['exp-a', 'exp-b']);
+      expect(data.targets.sort()).toEqual(['claude', 'gpt-4o']);
+      expect(data.cells).toHaveLength(4);
+    });
+
+    it('filters to a single tag', async () => {
+      seedCompareFixture();
+      const app = createApp([], tempDir, tempDir, undefined, { studioDir });
+
+      const res = await app.request('/api/compare?tags=baseline');
+      expect(res.status).toBe(200);
+      const data = (await res.json()) as CompareJson;
+
+      expect(data.runs).toHaveLength(2);
+      for (const run of data.runs ?? []) {
+        expect(run.tags ?? []).toContain('baseline');
+      }
+      // Both baseline runs belong to exp-a, so exp-b drops out; both targets remain.
+      expect(data.experiments).toEqual(['exp-a']);
+      expect(data.targets.sort()).toEqual(['claude', 'gpt-4o']);
+      expect(data.cells).toHaveLength(2);
+    });
+
+    it('applies OR semantics across multiple tags', async () => {
+      seedCompareFixture();
+      const app = createApp([], tempDir, tempDir, undefined, { studioDir });
+
+      const res = await app.request('/api/compare?tags=baseline,v2-prompt');
+      expect(res.status).toBe(200);
+      const data = (await res.json()) as CompareJson;
+
+      // Three tagged runs; the untagged run is excluded.
+      expect(data.runs).toHaveLength(3);
+      expect(data.experiments.sort()).toEqual(['exp-a', 'exp-b']);
+      // (exp-a, gpt-4o), (exp-a, claude), (exp-b, gpt-4o) — the (exp-b, claude)
+      // cell is missing because the only contributing run was untagged.
+      expect(data.cells).toHaveLength(3);
+      const cellKeys = (data.cells ?? []).map((c) => `${c.experiment}::${c.target}`).sort();
+      expect(cellKeys).toEqual(['exp-a::claude', 'exp-a::gpt-4o', 'exp-b::gpt-4o']);
+    });
+
+    it('returns empty payload when no runs match the filter', async () => {
+      seedCompareFixture();
+      const app = createApp([], tempDir, tempDir, undefined, { studioDir });
+
+      const res = await app.request('/api/compare?tags=nonexistent');
+      expect(res.status).toBe(200);
+      const data = (await res.json()) as CompareJson;
+
+      expect(data.runs).toEqual([]);
+      expect(data.cells).toEqual([]);
+      expect(data.experiments).toEqual([]);
+      expect(data.targets).toEqual([]);
+    });
+
+    it('ignores whitespace and empty segments in the tags query', async () => {
+      seedCompareFixture();
+      const app = createApp([], tempDir, tempDir, undefined, { studioDir });
+
+      // ` , baseline , ` should parse to just ['baseline'].
+      const res = await app.request('/api/compare?tags=%20,%20baseline%20,%20');
+      expect(res.status).toBe(200);
+      const data = (await res.json()) as CompareJson;
+      expect(data.runs).toHaveLength(2);
+    });
+  });
+
   // ── SPA fallback ──────────────────────────────────────────────────────
 
   describe('SPA fallback', () => {
diff --git a/apps/studio/src/components/CompareTab.tsx b/apps/studio/src/components/CompareTab.tsx
index 188d0c413..ffbe73f1b 100644
--- a/apps/studio/src/components/CompareTab.tsx
+++ b/apps/studio/src/components/CompareTab.tsx
@@ -55,7 +55,99 @@ export function CompareTab({
   readOnly,
 }: CompareTabProps) {
   const [mode, setMode] = useState<ViewMode>('aggregated');
-  const runsCount = data?.runs?.length ?? 0;
+  const [filterTags, setFilterTags] = useState<string[]>([]);
+
+  // Chip list is derived from the UNFILTERED response so chips stay visible
+  // even when the active filter would otherwise hide the runs that supplied
+  // them. Sorted alphabetically for stable UI.
+  const { allTags, tagCounts } = useMemo(() => {
+    const counts = new Map<string, number>();
+    for (const run of data?.runs ?? []) {
+      for (const tag of run.tags ?? []) {
+        counts.set(tag, (counts.get(tag) ?? 0) + 1);
+      }
+    }
+    return { allTags: [...counts.keys()].sort(), tagCounts: counts };
+  }, [data?.runs]);
+
+  // When a filter is active, re-aggregate cells/runs client-side from the
+  // filtered subset of runs. This avoids a network round-trip on every chip
+  // click and keeps the backend responsible only for the initial fetch.
+  // Safe because the server already exposes per-run totals (`eval_count`,
+  // `passed_count`, `avg_score`); we just sum them per (experiment, target)
+  // bucket, weighting averages by run eval_count.
+  const filteredData = useMemo<CompareResponse | undefined>(() => {
+    if (!data) return data;
+    if (filterTags.length === 0) return data;
+    const filterSet = new Set(filterTags);
+    const filteredRuns = (data.runs ?? []).filter((r) =>
+      (r.tags ?? []).some((t) => filterSet.has(t)),
+    );
+
+    type CellAccum = {
+      experiment: string;
+      target: string;
+      eval_count: number;
+      passed_count: number;
+      score_sum: number;
+      tests: CompareTestResult[];
+    };
+    const cellMap = new Map<string, CellAccum>();
+    const experimentsSet = new Set<string>();
+    const targetsSet = new Set<string>();
+
+    for (const run of filteredRuns) {
+      experimentsSet.add(run.experiment);
+      targetsSet.add(run.target);
+      const key = `${run.experiment}::${run.target}`;
+      const entry = cellMap.get(key) ?? {
+        experiment: run.experiment,
+        target: run.target,
+        eval_count: 0,
+        passed_count: 0,
+        score_sum: 0,
+        tests: [],
+      };
+      entry.eval_count += run.eval_count;
+      entry.passed_count += run.passed_count;
+      entry.score_sum += run.avg_score * run.eval_count;
+      for (const t of run.tests) entry.tests.push(t);
+      cellMap.set(key, entry);
+    }
+
+    const cells: CompareCell[] = [...cellMap.values()].map((e) => {
+      // Dedupe tests by test_id, last-wins (same pattern as the server).
+      const dedup = new Map<string, CompareTestResult>();
+      for (const t of e.tests) dedup.set(t.test_id, t);
+      return {
+        experiment: e.experiment,
+        target: e.target,
+        eval_count: e.eval_count,
+        passed_count: e.passed_count,
+        pass_rate: e.eval_count > 0 ? e.passed_count / e.eval_count : 0,
+        avg_score: e.eval_count > 0 ? e.score_sum / e.eval_count : 0,
+        tests: [...dedup.values()].slice(-100),
+      };
+    });
+
+    return {
+      ...data,
+      experiments: [...experimentsSet].sort(),
+      targets: [...targetsSet].sort(),
+      cells,
+      runs: filteredRuns,
+    };
+  }, [data, filterTags]);
+
+  const toggleFilterTag = (tag: string) => {
+    setFilterTags((prev) => (prev.includes(tag) ? prev.filter((x) => x !== tag) : [...prev, tag]));
+  };
+  const clearFilterTags = () => setFilterTags([]);
+
+  const runsCount = filteredData?.runs?.length ?? 0;
+  const underlyingHasData = data && data.cells.length > 0;
+  const filterYieldsNoRuns =
+    filterTags.length > 0 && filteredData && (filteredData.runs?.length ?? 0) === 0;
 
   return (
     <div className="space-y-4">
@@ -65,12 +157,37 @@ export function CompareTab({
       {!isLoading && isError && error && (
         <ErrorPanel message={`Failed to load comparison data: ${error.message}`} />
       )}
-      {!isLoading && !isError && (!data || data.cells.length === 0) && <EmptyState />}
-      {!isLoading && !isError && data && data.cells.length > 0 && (
+      {!isLoading && !isError && !underlyingHasData && <EmptyState />}
+      {!isLoading && !isError && underlyingHasData && (
         <>
-          {mode === 'aggregated' && <AggregatedView data={data} />}
-          {mode === 'per-run' && (
-            <PerRunView data={data} benchmarkId={benchmarkId} readOnly={readOnly ?? false} />
+          {allTags.length > 0 && (
+            <TagFilterBar
+              allTags={allTags}
+              tagCounts={tagCounts}
+              selected={filterTags}
+              onToggle={toggleFilterTag}
+              onClear={clearFilterTags}
+            />
+          )}
+          {filterYieldsNoRuns ? (
+            <Notice
+              headline={`No runs match ${filterTags.map((t) => `\`${t}\``).join(' + ')}`}
+              body="Clear the filter or pick a different tag combination."
+              action={{ label: 'Clear filter', onClick: clearFilterTags }}
+            />
+          ) : (
+            filteredData && (
+              <>
+                {mode === 'aggregated' && <AggregatedView data={filteredData} />}
+                {mode === 'per-run' && (
+                  <PerRunView
+                    data={filteredData}
+                    benchmarkId={benchmarkId}
+                    readOnly={readOnly ?? false}
+                  />
+                )}
+              </>
+            )
           )}
         </>
       )}
@@ -78,6 +195,72 @@ export function CompareTab({
   );
 }
 
+// ── Tag filter bar ──────────────────────────────────────────────────────
+
+/**
+ * Row of toggle chips rendered above the compare views: one chip per entry in
+ * `allTags`, each showing its count from `tagCounts` (0 when the tag has no
+ * entry), plus a "Clear" link that is shown only while something is selected.
+ *
+ * Stateless: the caller owns `selected` and handles `onToggle` / `onClear`.
+ */
+function TagFilterBar(props: {
+  allTags: string[];
+  tagCounts: Map<string, number>;
+  selected: string[];
+  onToggle: (tag: string) => void;
+  onClear: () => void;
+}) {
+  const { allTags, tagCounts, selected, onToggle, onClear } = props;
+  const activeTags = new Set(selected);
+
+  // One chip per tag; an active chip gets the cyan tone on both label and badge.
+  const chips = allTags.map((tag) => {
+    const pressed = activeTags.has(tag);
+    const chipTone = pressed
+      ? 'border-cyan-900/60 bg-cyan-950/30 text-cyan-300 hover:border-cyan-800/80'
+      : 'border-gray-700 text-gray-400 hover:border-gray-600 hover:text-gray-200';
+    const badgeTone = pressed ? 'bg-cyan-900/50 text-cyan-200' : 'bg-gray-800 text-gray-500';
+    return (
+      <button
+        key={tag}
+        type="button"
+        onClick={() => onToggle(tag)}
+        aria-pressed={pressed}
+        className={`inline-flex items-center gap-1.5 rounded-md border px-2 py-0.5 text-xs font-medium transition-colors ${chipTone}`}
+      >
+        <span>{tag}</span>
+        <span className={`rounded px-1 text-[0.65rem] tabular-nums ${badgeTone}`}>
+          {tagCounts.get(tag) ?? 0}
+        </span>
+      </button>
+    );
+  });
+
+  return (
+    <div className="rounded-lg border border-gray-800 bg-gray-900/40 px-4 py-3">
+      <div className="flex flex-wrap items-center gap-2">
+        <span className="text-xs font-medium uppercase tracking-wider text-gray-500">
+          Filter by tag
+        </span>
+        {chips}
+        {selected.length > 0 && (
+          <button
+            type="button"
+            onClick={onClear}
+            className="ml-1 text-xs text-gray-500 underline-offset-2 transition-colors hover:text-gray-300 hover:underline"
+          >
+            Clear
+          </button>
+        )}
+      </div>
+      <p className="mt-2 text-xs text-gray-500">
+        Showing runs with <span className="text-gray-400">any</span> selected tag.
+      </p>
+    </div>
+  );
+}
+
 // ── Header ──────────────────────────────────────────────────────────────
 
 function Header({
@@ -904,11 +1087,28 @@ function EmptyState() {
   );
 }
 
-function Notice({ headline, body }: { headline: string; body: string }) {
+function Notice({
+  headline,
+  body,
+  action,
+}: {
+  headline: string; // large primary line
+  body: string; // smaller explanatory line shown under the headline
+  action?: { label: string; onClick: () => void }; // optional button rendered below the body
+}) {
   return (
     <div className="rounded-lg border border-gray-800 bg-gray-900 p-8 text-center">
       <p className="text-lg text-gray-300">{headline}</p>
       <p className="mt-2 text-sm text-gray-500">{body}</p>
+      {action && (
+        <button
+          type="button"
+          onClick={action.onClick}
+          className="mt-4 inline-flex items-center rounded-md bg-cyan-500 px-3 py-1.5 text-sm font-medium text-gray-950 transition-colors hover:bg-cyan-400"
+        >
+          {action.label}
+        </button>
+      )}
     </div>
   );
 }
diff --git a/apps/web/src/content/docs/docs/tools/studio.mdx b/apps/web/src/content/docs/docs/tools/studio.mdx
index ef31b0ab3..e361722ec 100644
--- a/apps/web/src/content/docs/docs/tools/studio.mdx
+++ b/apps/web/src/content/docs/docs/tools/studio.mdx
@@ -112,6 +112,12 @@ Click any row's **Tags** cell to tag a run after the fact. Each run can carry mu
 
 Use tags to annotate ad-hoc variants, experiment cross-cuts, or status flags you didn't plan for up front — `baseline`, `v2-prompt`, `slow`, `after-retry-fix`, `regression`, etc. Unlike `experiment` — which groups runs and is baked into the JSONL at eval-run time — tags are mutable, multi-valued, and never touch the original run data.
 
+### Filtering by tag
+
+Once at least one run is tagged, a chip row appears above the compare view listing every distinct tag alongside the number of runs that carry it. Click a chip to narrow both the aggregated matrix and the per-run table to runs carrying at least one of the selected tags (OR semantics — selecting a second chip widens the set rather than narrowing it); untagged runs are hidden while any chip is selected. A **Clear** link resets the filter, and the selection persists as you switch between Aggregated and Per-run modes.
+
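+The same filter is available to API consumers via `GET /api/compare?tags=baseline,v2-prompt` (comma-separated; surrounding whitespace and empty segments are ignored). The response includes only runs carrying at least one of the listed tags, and `cells`, `experiments`, and `targets` are computed from those runs alone; omitting `tags` returns every run.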
+
 
 ## Benchmarks Dashboard