From 88be471a58d0635b4dd3c5cf193e3430302daf21 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 9 Apr 2026 11:59:53 +0000 Subject: [PATCH 1/2] fix(studio): runs tab pass rate now respects configured threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The /api/runs endpoint was computing pass_rate using a hardcoded DEFAULT_THRESHOLD (0.8) instead of reading the threshold configured in Studio Settings. This caused the runs list to show stale percentages that never reflected the user's threshold. Fix: load the studio threshold from config in handleRuns() and recalculate pass_rate per run using the lightweight records already loaded for target/experiment enrichment. Also fix the RunList status dot (✓/✗) to use the config threshold instead of the hardcoded 0.8. Closes #1030 Co-Authored-By: Claude Sonnet 4.6 (1M context) --- apps/cli/src/commands/results/serve.ts | 7 ++- apps/cli/test/commands/results/serve.test.ts | 56 ++++++++++++++++++++ apps/studio/src/components/RunList.tsx | 6 ++- 3 files changed, 66 insertions(+), 3 deletions(-) diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index de67aa536..9b4c67161 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -255,17 +255,20 @@ interface DataContext { // biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route type C = Context; -async function handleRuns(c: C, { searchDir }: DataContext) { +async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) { const { runs: metas } = await listMergedResultFiles(searchDir); + const { threshold: passThreshold } = loadStudioConfig(agentvDir); return c.json({ runs: metas.map((m) => { let target: string | undefined; let experiment: string | undefined; + let passRate = m.passRate; try { const records = loadLightweightResults(m.path); if (records.length > 0) { target = records[0].target; experiment = records[0].experiment; + passRate = records.filter((r) => r.score >= passThreshold).length / records.length; } } catch { // ignore enrichment errors @@ -276,7 +279,7 @@ async function handleRuns(c: C, { searchDir }: DataContext) { path: m.path, timestamp: m.timestamp, test_count: m.testCount, - pass_rate: m.passRate, + pass_rate: passRate, avg_score: m.avgScore, size_bytes: m.sizeBytes, source: m.source, diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index e1f788b8e..dd88ec7b0 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -435,6 +435,62 @@ describe('serve app', () => { }); }); + it('computes pass_rate using the configured studio threshold (strict threshold yields lower rate)', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); + mkdirSync(runsDir, { recursive: true }); + const filename = '2026-03-25T10-00-00-000Z'; + const runDir = path.join(runsDir, filename); + mkdirSync(runDir, { recursive: true }); + // Two results: score=0.8 and score=0.6 + // With DEFAULT_THRESHOLD=0.8: score=0.8 passes → 1/2 = 50% + // With threshold=0.9: neither passes → 0% + const resultHigh = { ...RESULT_A, test_id: 'high', score: 0.8 }; + const resultLow = { ...RESULT_B, test_id: 'low', score: 0.6 }; + writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultHigh, resultLow)); + + mkdirSync(path.join(tempDir, '.agentv'), { recursive: true }); + writeFileSync( + path.join(tempDir, '.agentv', 'config.yaml'), + 'studio:\n threshold: 0.9\n', + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request('/api/runs'); + expect(res.status).toBe(200); + const data = (await res.json()) as { runs: Array<{ pass_rate: number }> }; + expect(data.runs).toHaveLength(1); + // With threshold=0.9: neither 0.8 nor 0.6 passes → 0% + expect(data.runs[0].pass_rate).toBe(0); + }); + + it('computes pass_rate using the configured studio threshold (lenient threshold yields higher rate)', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); + mkdirSync(runsDir, { recursive: true }); + const filename = '2026-03-25T12-00-00-000Z'; + const runDir = path.join(runsDir, filename); + mkdirSync(runDir, { recursive: true }); + // Two results: score=0.8 and score=0.6 + // With DEFAULT_THRESHOLD=0.8: score=0.8 passes → 1/2 = 50% + // With threshold=0.5: both pass → 2/2 = 100% + const resultHigh = { ...RESULT_A, test_id: 'high', score: 0.8 }; + const resultLow = { ...RESULT_B, test_id: 'low', score: 0.6 }; + writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultHigh, resultLow)); + + mkdirSync(path.join(tempDir, '.agentv'), { recursive: true }); + writeFileSync( + path.join(tempDir, '.agentv', 'config.yaml'), + 'studio:\n threshold: 0.5\n', + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request('/api/runs'); + expect(res.status).toBe(200); + const data = (await res.json()) as { runs: Array<{ pass_rate: number }> }; + expect(data.runs).toHaveLength(1); + // With threshold=0.5: both 0.8 and 0.6 pass → 100% + expect(data.runs[0].pass_rate).toBe(1); + }); + it('merges cached remote runs and tags them with remote source metadata', async () => { const previousHome = process.env.AGENTV_HOME; process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home'); diff --git a/apps/studio/src/components/RunList.tsx b/apps/studio/src/components/RunList.tsx index 36b5d9be7..6a82b78e4 100644 --- a/apps/studio/src/components/RunList.tsx +++ b/apps/studio/src/components/RunList.tsx @@ -10,6 +10,7 @@ import type React from 'react'; import { Link } from '@tanstack/react-router'; +import { DEFAULT_PASS_THRESHOLD, useStudioConfig } from '~/lib/api'; import type { RunMeta } from '~/lib/types'; import { PassRatePill } from './PassRatePill'; @@ -49,6 +50,9 @@ function runLabel(run: RunMeta): string { } export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) { + const { data: config } = useStudioConfig(); + const passThreshold = config?.threshold ?? DEFAULT_PASS_THRESHOLD; + if (runs.length === 0) { return (
@@ -84,7 +88,7 @@ export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) { {runs.map((run) => { const ts = formatDate(run.timestamp); - const passing = run.pass_rate >= 0.8; + const passing = run.pass_rate >= passThreshold; const label = runLabel(run); const passedCount = Math.round(run.pass_rate * run.test_count); const failedCount = run.test_count - passedCount; From 395c86f098126bb100aebc99d6d4abf180910da5 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 9 Apr 2026 12:01:40 +0000 Subject: [PATCH 2/2] style: fix biome formatting in serve.test.ts Co-Authored-By: Claude Sonnet 4.6 (1M context) --- apps/cli/test/commands/results/serve.test.ts | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index dd88ec7b0..14007b406 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -449,10 +449,7 @@ describe('serve app', () => { writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultHigh, resultLow)); mkdirSync(path.join(tempDir, '.agentv'), { recursive: true }); - writeFileSync( - path.join(tempDir, '.agentv', 'config.yaml'), - 'studio:\n threshold: 0.9\n', - ); + writeFileSync(path.join(tempDir, '.agentv', 'config.yaml'), 'studio:\n threshold: 0.9\n'); const app = createApp([], tempDir, tempDir, undefined, { studioDir }); const res = await app.request('/api/runs'); @@ -477,10 +474,7 @@ describe('serve app', () => { writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultHigh, resultLow)); mkdirSync(path.join(tempDir, '.agentv'), { recursive: true }); - writeFileSync( - path.join(tempDir, '.agentv', 'config.yaml'), - 'studio:\n threshold: 0.5\n', - ); + writeFileSync(path.join(tempDir, '.agentv', 'config.yaml'), 'studio:\n threshold: 0.5\n'); const app = createApp([], tempDir, tempDir, undefined, { studioDir }); const res = await app.request('/api/runs');