diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 121d9eac1..72371a7ab 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -1,5 +1,5 @@ /** - * `agentv studio` — starts the AgentV Studio server, a React SPA for + * `agentv studio` / `agentv serve` — starts the AgentV Studio server, a React SPA for * reviewing evaluation results. * * The server uses Hono for routing and @hono/node-server to listen. @@ -114,6 +114,21 @@ export function loadResults(content: string): EvaluationResult[] { return results; } +export function resolveDashboardMode( + projectCount: number, + options: { multi?: boolean; single?: boolean }, +): { isMultiProject: boolean; showMultiWarning: boolean } { + if (options.single === true) { + return { isMultiProject: false, showMultiWarning: options.multi === true }; + } + + if (options.multi === true) { + return { isMultiProject: true, showMultiWarning: true }; + } + + return { isMultiProject: projectCount > 1, showMultiWarning: false }; +} + // ── Feedback persistence ───────────────────────────────────────────────── interface FeedbackReview { @@ -669,12 +684,13 @@ async function handleTargets(c: C, { searchDir, agentvDir }: DataContext) { function handleConfig( c: C, { agentvDir, searchDir }: DataContext, - options?: { readOnly?: boolean }, + options?: { readOnly?: boolean; multiProjectDashboard?: boolean }, ) { return c.json({ ...loadStudioConfig(agentvDir), read_only: options?.readOnly === true, project_name: path.basename(searchDir), + multi_project_dashboard: options?.multiProjectDashboard === true, }); } @@ -694,7 +710,7 @@ export function createApp( resultDir: string, cwd?: string, sourceFile?: string, - options?: { studioDir?: string; readOnly?: boolean }, + options?: { studioDir?: string; readOnly?: boolean; multiProjectDashboard?: boolean }, ): Hono { const searchDir = cwd ?? resultDir; const agentvDir = path.join(searchDir, '.agentv'); @@ -906,7 +922,12 @@ export function createApp( // ── Data routes (unscoped) ──────────────────────────────────────────── - app.get('/api/config', (c) => handleConfig(c, defaultCtx, { readOnly })); + app.get('/api/config', (c) => + handleConfig(c, defaultCtx, { + readOnly, + multiProjectDashboard: options?.multiProjectDashboard, + }), + ); app.get('/api/remote/status', async (c) => c.json(await getRemoteResultsStatus(searchDir))); app.post('/api/remote/sync', async (c) => c.json(await syncRemoteResults(searchDir))); app.get('/api/runs', (c) => handleRuns(c, defaultCtx)); @@ -1006,7 +1027,12 @@ export function createApp( // Same handlers as above, with project-resolved DataContext via withProject. app.get('/api/projects/:projectId/config', (c) => - withProject(c, (ctx, dataCtx) => handleConfig(ctx, dataCtx, { readOnly })), + withProject(c, (ctx, dataCtx) => + handleConfig(ctx, dataCtx, { + readOnly, + multiProjectDashboard: options?.multiProjectDashboard, + }), + ), ); app.get('/api/projects/:projectId/remote/status', (c) => withProject(c, async (ctx, dataCtx) => @@ -1159,7 +1185,12 @@ export const resultsServeCommand = command({ }), multi: flag({ long: 'multi', - description: 'Launch in multi-project dashboard mode', + description: + 'Launch in multi-project dashboard mode (deprecated; use auto-detect or --single)', + }), + single: flag({ + long: 'single', + description: 'Force single-project dashboard mode', }), add: option({ type: optional(string), @@ -1181,7 +1212,7 @@ export const resultsServeCommand = command({ description: 'Disable write operations and launch Studio in read-only leaderboard mode', }), }, - handler: async ({ source, port, dir, multi, add, remove, discover, readOnly }) => { + handler: async ({ source, port, dir, multi, single, add, remove, discover, readOnly }) => { const cwd = dir ?? process.cwd(); const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117); @@ -1224,7 +1255,10 @@ export const resultsServeCommand = command({ // ── Determine multi-project mode ──────────────────────────────── const registry = loadProjectRegistry(); - const isMultiProject = multi || registry.projects.length > 0; + const { isMultiProject, showMultiWarning } = resolveDashboardMode(registry.projects.length, { + multi, + single, + }); try { let results: EvaluationResult[] = []; @@ -1254,7 +1288,16 @@ export const resultsServeCommand = command({ // Use the run directory for feedback storage (matches #764 behavior) const resultDir = sourceFile ? path.dirname(path.resolve(sourceFile)) : cwd; - const app = createApp(results, resultDir, cwd, sourceFile, { readOnly }); + const app = createApp(results, resultDir, cwd, sourceFile, { + readOnly, + multiProjectDashboard: isMultiProject, + }); + + if (showMultiWarning) { + console.warn( + 'Warning: --multi is deprecated. Studio now auto-detects multi-project mode when multiple projects are registered. Use --single to force the single-project view.', + ); + } if (isMultiProject) { console.log(`Multi-project mode: ${registry.projects.length} project(s) registered`); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 4a4ea4085..e1f788b8e 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -3,7 +3,12 @@ import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync import { tmpdir } from 'node:os'; import path from 'node:path'; -import { createApp, loadResults, resolveSourceFile } from '../../../src/commands/results/serve.js'; +import { + createApp, + loadResults, + resolveDashboardMode, + resolveSourceFile, +} from '../../../src/commands/results/serve.js'; // ── Sample JSONL content (snake_case, matching on-disk format) ────────── @@ -94,6 +99,52 @@ describe('loadResults', () => { }); }); +// ── resolveDashboardMode ─────────────────────────────────────────────── + +describe('resolveDashboardMode', () => { + it('defaults to single-project mode when no projects are registered', () => { + expect(resolveDashboardMode(0, {})).toEqual({ + isMultiProject: false, + showMultiWarning: false, + }); + }); + + it('defaults to single-project mode when exactly one project is registered', () => { + expect(resolveDashboardMode(1, {})).toEqual({ + isMultiProject: false, + showMultiWarning: false, + }); + }); + + it('defaults to multi-project mode when multiple projects are registered', () => { + expect(resolveDashboardMode(2, {})).toEqual({ + isMultiProject: true, + showMultiWarning: false, + }); + }); + + it('forces multi-project mode with a deprecation warning when --multi is used', () => { + expect(resolveDashboardMode(1, { multi: true })).toEqual({ + isMultiProject: true, + showMultiWarning: true, + }); + }); + + it('forces single-project mode when --single is used', () => { + expect(resolveDashboardMode(3, { single: true })).toEqual({ + isMultiProject: false, + showMultiWarning: false, + }); + }); + + it('lets --single override --multi', () => { + expect(resolveDashboardMode(3, { multi: true, single: true })).toEqual({ + isMultiProject: false, + showMultiWarning: true, + }); + }); +}); + // ── Mock studio dist ───────────────────────────────────────────────────── const MOCK_STUDIO_HTML = ` @@ -319,12 +370,17 @@ describe('serve app', () => { const app = createApp(results, tempDir, undefined, undefined, { studioDir, readOnly: true, + multiProjectDashboard: true, }); const res = await app.request('/api/config'); expect(res.status).toBe(200); - const data = (await res.json()) as { read_only?: boolean }; + const data = (await res.json()) as { + read_only?: boolean; + multi_project_dashboard?: boolean; + }; expect(data.read_only).toBe(true); + expect(data.multi_project_dashboard).toBe(true); }); }); diff --git a/apps/studio/src/components/CompareTab.tsx b/apps/studio/src/components/CompareTab.tsx index a448214d3..a633d70ea 100644 --- a/apps/studio/src/components/CompareTab.tsx +++ b/apps/studio/src/components/CompareTab.tsx @@ -67,37 +67,20 @@ export function CompareTab({ data, isLoading, isError, error }: CompareTabProps) cellMap.set(JSON.stringify([cell.experiment, cell.target]), cell); } - // Find best pass rate per row (target) for highlighting - const bestByTarget = new Map(); - const worstByTarget = new Map(); - for (const target of targets) { - let best = -1; - let worst = 2; - for (const experiment of experiments) { - const cell = cellMap.get(JSON.stringify([experiment, target])); - if (cell) { - if (cell.pass_rate > best) best = cell.pass_rate; - if (cell.pass_rate < worst) worst = cell.pass_rate; - } - } - bestByTarget.set(target, best); - worstByTarget.set(target, worst); - } - return (
- - >80% + + 80%+ - - 50-80% + + 50–80% - - <50% + + <50% @@ -124,8 +107,6 @@ export function CompareTab({ data, isLoading, isError, error }: CompareTabProps) target={target} experiments={experiments} cellMap={cellMap} - bestRate={bestByTarget.get(target) ?? 0} - worstRate={worstByTarget.get(target) ?? 0} /> ))} @@ -139,14 +120,10 @@ function CompareRow({ target, experiments, cellMap, - bestRate, - worstRate, }: { target: string; experiments: string[]; cellMap: Map; - bestRate: number; - worstRate: number; }) { return ( @@ -156,15 +133,7 @@ function CompareRow({ return ( {cell ? ( - 1 && cell.pass_rate === bestRate && bestRate !== worstRate - } - isWorst={ - experiments.length > 1 && cell.pass_rate === worstRate && bestRate !== worstRate - } - /> + ) : (
-- @@ -177,10 +146,10 @@ function CompareRow({ ); } -function passRateColorClass(rate: number): string { - if (rate >= 0.8) return 'bg-emerald-900/60 ring-emerald-700/40'; - if (rate >= 0.5) return 'bg-amber-900/40 ring-amber-700/40'; - return 'bg-red-900/40 ring-red-700/40'; +function passRateRingClass(rate: number): string { + if (rate >= 0.8) return 'ring-emerald-500/60'; + if (rate >= 0.5) return 'ring-amber-500/60'; + return 'ring-red-500/60'; } function passRateTextClass(rate: number): string { @@ -189,15 +158,7 @@ function passRateTextClass(rate: number): string { return 'text-red-400'; } -function CompareMatrixCell({ - cell, - isBest, - isWorst, -}: { - cell: CompareCell; - isBest: boolean; - isWorst: boolean; -}) { +function CompareMatrixCell({ cell }: { cell: CompareCell }) { const [expanded, setExpanded] = useState(false); const pct = Math.round(cell.pass_rate * 100); const avgPct = Math.round(cell.avg_score * 100); @@ -208,26 +169,14 @@ function CompareMatrixCell({ type="button" onClick={() => setExpanded(!expanded)} aria-expanded={expanded} - className={`w-full rounded-lg px-3 py-3 text-center ring-1 transition-colors ${passRateColorClass(cell.pass_rate)} hover:brightness-110 ${ - isBest ? 'ring-2 ring-emerald-500/60' : isWorst ? 'ring-2 ring-red-500/40' : '' - }`} + className={`w-full rounded-lg bg-gray-800/60 px-3 py-3 text-center ring-1 transition-colors hover:bg-gray-700/60 ${passRateRingClass(cell.pass_rate)}`} > -
+
{pct}% - {isBest && ( - - ▲ - - )} - {isWorst && ( - - ▼ - - )}
{cell.passed_count}/{cell.eval_count} pass | avg {avgPct}% diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index 474fcb426..62ec615a7 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -366,7 +366,7 @@ function ProjectRunDetailSidebar({
- ← All Projects + ← All Benchmarks

{projectId}

diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index 19f7e96fc..93752599d 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -202,6 +202,7 @@ export interface StudioConfigResponse { pass_threshold?: number; read_only?: boolean; project_name?: string; + multi_project_dashboard?: boolean; } export interface RemoteStatusResponse { diff --git a/apps/studio/src/routes/index.tsx b/apps/studio/src/routes/index.tsx index f3059d44a..082518ad6 100644 --- a/apps/studio/src/routes/index.tsx +++ b/apps/studio/src/routes/index.tsx @@ -1,7 +1,7 @@ /** - * Home route: shows Projects Dashboard when projects are registered, - * or the existing tabbed landing page (Runs, Experiments, Targets) - * when in single-project mode. + * Home route: shows the multi-project dashboard when the server enables it, + * or the existing tabbed landing page (Runs, Experiments, Compare, Targets) + * in single-project mode. * * Uses URL search param `?tab=` for tab persistence. */ @@ -43,13 +43,15 @@ export const Route = createFileRoute('/')({ function HomePage() { const { data: projectData, isLoading: projectsLoading } = useProjectList(); + const { data: config, isLoading: configLoading } = useStudioConfig(); const hasProjects = (projectData?.projects.length ?? 0) > 0; + const multiProjectDashboard = config?.multi_project_dashboard; - if (projectsLoading) { + if (projectsLoading || configLoading) { return ; } - if (hasProjects) { + if (multiProjectDashboard === true || (multiProjectDashboard === undefined && hasProjects)) { return ; } @@ -104,7 +106,7 @@ function ProjectsDashboard() { return (
-

Projects

+

Benchmarks

{!isReadOnly && ( <> @@ -120,7 +122,7 @@ function ProjectsDashboard() { onClick={() => setShowAddForm(!showAddForm)} className="rounded-md bg-cyan-600 px-3 py-1.5 text-sm font-medium text-white hover:bg-cyan-500" > - {showAddForm ? 'Cancel' : 'Add Project'} + {showAddForm ? 'Cancel' : 'Add Benchmark'} )} @@ -140,7 +142,7 @@ function ProjectsDashboard() { type="text" value={addPath} onChange={(e) => setAddPath(e.target.value)} - placeholder="Project path (e.g., /home/user/projects/my-app)" + placeholder="Benchmark path (e.g., /home/user/projects/my-evals)" className="flex-1 rounded-md border border-gray-700 bg-gray-800 px-3 py-1.5 text-sm text-white placeholder-gray-500 focus:border-cyan-600 focus:outline-none" />