Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 52 additions & 9 deletions apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* `agentv studio` — starts the AgentV Studio server, a React SPA for
* `agentv studio` / `agentv serve` — starts the AgentV Studio server, a React SPA for
* reviewing evaluation results.
*
* The server uses Hono for routing and @hono/node-server to listen.
Expand Down Expand Up @@ -114,6 +114,21 @@ export function loadResults(content: string): EvaluationResult[] {
return results;
}

/**
 * Decides whether Studio should present the multi-project dashboard.
 *
 * Precedence: `single` beats `multi`, and `multi` beats auto-detection.
 * With neither flag set, multi-project mode is enabled only when more than
 * one project is registered.
 *
 * @param projectCount - Number of registered projects.
 * @param options - Flag values; only a literal `true` counts as "set".
 * @returns `isMultiProject` — the resolved mode; `showMultiWarning` — `true`
 *   whenever `multi` was passed, including when `single` overrides it.
 */
export function resolveDashboardMode(
  projectCount: number,
  options: { multi?: boolean; single?: boolean },
): { isMultiProject: boolean; showMultiWarning: boolean } {
  const forceSingle = options.single === true;
  const forceMulti = options.multi === true;

  // An explicit --single always wins; otherwise --multi or the registry size decides.
  const isMultiProject = forceSingle ? false : forceMulti || projectCount > 1;

  return { isMultiProject, showMultiWarning: forceMulti };
}

// ── Feedback persistence ─────────────────────────────────────────────────

interface FeedbackReview {
Expand Down Expand Up @@ -669,12 +684,13 @@ async function handleTargets(c: C, { searchDir, agentvDir }: DataContext) {
function handleConfig(
c: C,
{ agentvDir, searchDir }: DataContext,
options?: { readOnly?: boolean },
options?: { readOnly?: boolean; multiProjectDashboard?: boolean },
) {
return c.json({
...loadStudioConfig(agentvDir),
read_only: options?.readOnly === true,
project_name: path.basename(searchDir),
multi_project_dashboard: options?.multiProjectDashboard === true,
});
}

Expand All @@ -694,7 +710,7 @@ export function createApp(
resultDir: string,
cwd?: string,
sourceFile?: string,
options?: { studioDir?: string; readOnly?: boolean },
options?: { studioDir?: string; readOnly?: boolean; multiProjectDashboard?: boolean },
): Hono {
const searchDir = cwd ?? resultDir;
const agentvDir = path.join(searchDir, '.agentv');
Expand Down Expand Up @@ -906,7 +922,12 @@ export function createApp(

// ── Data routes (unscoped) ────────────────────────────────────────────

app.get('/api/config', (c) => handleConfig(c, defaultCtx, { readOnly }));
app.get('/api/config', (c) =>
handleConfig(c, defaultCtx, {
readOnly,
multiProjectDashboard: options?.multiProjectDashboard,
}),
);
app.get('/api/remote/status', async (c) => c.json(await getRemoteResultsStatus(searchDir)));
app.post('/api/remote/sync', async (c) => c.json(await syncRemoteResults(searchDir)));
app.get('/api/runs', (c) => handleRuns(c, defaultCtx));
Expand Down Expand Up @@ -1006,7 +1027,12 @@ export function createApp(
// Same handlers as above, with project-resolved DataContext via withProject.

app.get('/api/projects/:projectId/config', (c) =>
withProject(c, (ctx, dataCtx) => handleConfig(ctx, dataCtx, { readOnly })),
withProject(c, (ctx, dataCtx) =>
handleConfig(ctx, dataCtx, {
readOnly,
multiProjectDashboard: options?.multiProjectDashboard,
}),
),
);
app.get('/api/projects/:projectId/remote/status', (c) =>
withProject(c, async (ctx, dataCtx) =>
Expand Down Expand Up @@ -1159,7 +1185,12 @@ export const resultsServeCommand = command({
}),
multi: flag({
long: 'multi',
description: 'Launch in multi-project dashboard mode',
description:
'Launch in multi-project dashboard mode (deprecated; use auto-detect or --single)',
}),
single: flag({
long: 'single',
description: 'Force single-project dashboard mode',
}),
add: option({
type: optional(string),
Expand All @@ -1181,7 +1212,7 @@ export const resultsServeCommand = command({
description: 'Disable write operations and launch Studio in read-only leaderboard mode',
}),
},
handler: async ({ source, port, dir, multi, add, remove, discover, readOnly }) => {
handler: async ({ source, port, dir, multi, single, add, remove, discover, readOnly }) => {
const cwd = dir ?? process.cwd();
const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117);

Expand Down Expand Up @@ -1224,7 +1255,10 @@ export const resultsServeCommand = command({

// ── Determine multi-project mode ────────────────────────────────
const registry = loadProjectRegistry();
const isMultiProject = multi || registry.projects.length > 0;
const { isMultiProject, showMultiWarning } = resolveDashboardMode(registry.projects.length, {
multi,
single,
});

try {
let results: EvaluationResult[] = [];
Expand Down Expand Up @@ -1254,7 +1288,16 @@ export const resultsServeCommand = command({

// Use the run directory for feedback storage (matches #764 behavior)
const resultDir = sourceFile ? path.dirname(path.resolve(sourceFile)) : cwd;
const app = createApp(results, resultDir, cwd, sourceFile, { readOnly });
const app = createApp(results, resultDir, cwd, sourceFile, {
readOnly,
multiProjectDashboard: isMultiProject,
});

if (showMultiWarning) {
console.warn(
'Warning: --multi is deprecated. Studio now auto-detects multi-project mode when multiple projects are registered. Use --single to force the single-project view.',
);
}

if (isMultiProject) {
console.log(`Multi-project mode: ${registry.projects.length} project(s) registered`);
Expand Down
60 changes: 58 additions & 2 deletions apps/cli/test/commands/results/serve.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@ import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync
import { tmpdir } from 'node:os';
import path from 'node:path';

import { createApp, loadResults, resolveSourceFile } from '../../../src/commands/results/serve.js';
import {
createApp,
loadResults,
resolveDashboardMode,
resolveSourceFile,
} from '../../../src/commands/results/serve.js';

// ── Sample JSONL content (snake_case, matching on-disk format) ──────────

Expand Down Expand Up @@ -94,6 +99,52 @@ describe('loadResults', () => {
});
});

// ── resolveDashboardMode ───────────────────────────────────────────────

describe('resolveDashboardMode', () => {
  // Table of scenarios: registered project count + CLI flags → expected mode.
  // Order matches the precedence being verified: auto-detect, --multi, --single, both.
  const scenarios: ReadonlyArray<{
    title: string;
    projectCount: number;
    flags: { multi?: boolean; single?: boolean };
    expected: { isMultiProject: boolean; showMultiWarning: boolean };
  }> = [
    {
      title: 'defaults to single-project mode when no projects are registered',
      projectCount: 0,
      flags: {},
      expected: { isMultiProject: false, showMultiWarning: false },
    },
    {
      title: 'defaults to single-project mode when exactly one project is registered',
      projectCount: 1,
      flags: {},
      expected: { isMultiProject: false, showMultiWarning: false },
    },
    {
      title: 'defaults to multi-project mode when multiple projects are registered',
      projectCount: 2,
      flags: {},
      expected: { isMultiProject: true, showMultiWarning: false },
    },
    {
      title: 'forces multi-project mode with a deprecation warning when --multi is used',
      projectCount: 1,
      flags: { multi: true },
      expected: { isMultiProject: true, showMultiWarning: true },
    },
    {
      title: 'forces single-project mode when --single is used',
      projectCount: 3,
      flags: { single: true },
      expected: { isMultiProject: false, showMultiWarning: false },
    },
    {
      title: 'lets --single override --multi',
      projectCount: 3,
      flags: { multi: true, single: true },
      expected: { isMultiProject: false, showMultiWarning: true },
    },
  ];

  for (const { title, projectCount, flags, expected } of scenarios) {
    it(title, () => {
      expect(resolveDashboardMode(projectCount, flags)).toEqual(expected);
    });
  }
});

// ── Mock studio dist ─────────────────────────────────────────────────────

const MOCK_STUDIO_HTML = `<!doctype html>
Expand Down Expand Up @@ -319,12 +370,17 @@ describe('serve app', () => {
const app = createApp(results, tempDir, undefined, undefined, {
studioDir,
readOnly: true,
multiProjectDashboard: true,
});

const res = await app.request('/api/config');
expect(res.status).toBe(200);
const data = (await res.json()) as { read_only?: boolean };
const data = (await res.json()) as {
read_only?: boolean;
multi_project_dashboard?: boolean;
};
expect(data.read_only).toBe(true);
expect(data.multi_project_dashboard).toBe(true);
});
});

Expand Down
79 changes: 14 additions & 65 deletions apps/studio/src/components/CompareTab.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -67,37 +67,20 @@ export function CompareTab({ data, isLoading, isError, error }: CompareTabProps)
cellMap.set(JSON.stringify([cell.experiment, cell.target]), cell);
}

// Find best pass rate per row (target) for highlighting
const bestByTarget = new Map<string, number>();
const worstByTarget = new Map<string, number>();
for (const target of targets) {
let best = -1;
let worst = 2;
for (const experiment of experiments) {
const cell = cellMap.get(JSON.stringify([experiment, target]));
if (cell) {
if (cell.pass_rate > best) best = cell.pass_rate;
if (cell.pass_rate < worst) worst = cell.pass_rate;
}
}
bestByTarget.set(target, best);
worstByTarget.set(target, worst);
}

return (
<div className="space-y-4">
<div className="flex items-center gap-4 text-sm text-gray-400">
<span className="flex items-center gap-1.5">
<span className="inline-block h-3 w-3 rounded-sm bg-emerald-900/60 ring-1 ring-emerald-700/40" />
&gt;80%
<span className="inline-block h-3 w-3 rounded-sm bg-gray-800/60 ring-1 ring-emerald-500/60" />
<span className="text-emerald-400">80%+</span>
</span>
<span className="flex items-center gap-1.5">
<span className="inline-block h-3 w-3 rounded-sm bg-amber-900/40 ring-1 ring-amber-700/40" />
50-80%
<span className="inline-block h-3 w-3 rounded-sm bg-gray-800/60 ring-1 ring-amber-500/60" />
<span className="text-amber-400">50–80%</span>
</span>
<span className="flex items-center gap-1.5">
<span className="inline-block h-3 w-3 rounded-sm bg-red-900/40 ring-1 ring-red-700/40" />
&lt;50%
<span className="inline-block h-3 w-3 rounded-sm bg-gray-800/60 ring-1 ring-red-500/60" />
<span className="text-red-400">&lt;50%</span>
</span>
<span className="flex items-center gap-1.5">
<span className="inline-block h-3 w-3 rounded-sm border border-dashed border-gray-700" />
Expand All @@ -124,8 +107,6 @@ export function CompareTab({ data, isLoading, isError, error }: CompareTabProps)
target={target}
experiments={experiments}
cellMap={cellMap}
bestRate={bestByTarget.get(target) ?? 0}
worstRate={worstByTarget.get(target) ?? 0}
/>
))}
</tbody>
Expand All @@ -139,14 +120,10 @@ function CompareRow({
target,
experiments,
cellMap,
bestRate,
worstRate,
}: {
target: string;
experiments: string[];
cellMap: Map<string, CompareCell>;
bestRate: number;
worstRate: number;
}) {
return (
<tr className="transition-colors hover:bg-gray-900/30">
Expand All @@ -156,15 +133,7 @@ function CompareRow({
return (
<td key={exp} className="px-2 py-2">
{cell ? (
<CompareMatrixCell
cell={cell}
isBest={
experiments.length > 1 && cell.pass_rate === bestRate && bestRate !== worstRate
}
isWorst={
experiments.length > 1 && cell.pass_rate === worstRate && bestRate !== worstRate
}
/>
<CompareMatrixCell cell={cell} />
) : (
<div className="flex items-center justify-center rounded-lg border border-dashed border-gray-700 px-3 py-4 text-gray-600">
--
Expand All @@ -177,10 +146,10 @@ function CompareRow({
);
}

function passRateColorClass(rate: number): string {
if (rate >= 0.8) return 'bg-emerald-900/60 ring-emerald-700/40';
if (rate >= 0.5) return 'bg-amber-900/40 ring-amber-700/40';
return 'bg-red-900/40 ring-red-700/40';
/**
 * Maps a pass rate to the ring-colour utility class for a matrix cell.
 *
 * Tiers: `>= 0.8` emerald, `>= 0.5` amber, anything else (including NaN) red.
 *
 * @param rate - Pass rate compared against the 0.8 and 0.5 thresholds.
 * @returns The ring colour class name for the matching tier.
 */
function passRateRingClass(rate: number): string {
  return rate >= 0.8
    ? 'ring-emerald-500/60'
    : rate >= 0.5
      ? 'ring-amber-500/60'
      : 'ring-red-500/60';
}

function passRateTextClass(rate: number): string {
Expand All @@ -189,15 +158,7 @@ function passRateTextClass(rate: number): string {
return 'text-red-400';
}

function CompareMatrixCell({
cell,
isBest,
isWorst,
}: {
cell: CompareCell;
isBest: boolean;
isWorst: boolean;
}) {
function CompareMatrixCell({ cell }: { cell: CompareCell }) {
const [expanded, setExpanded] = useState(false);
const pct = Math.round(cell.pass_rate * 100);
const avgPct = Math.round(cell.avg_score * 100);
Expand All @@ -208,26 +169,14 @@ function CompareMatrixCell({
type="button"
onClick={() => setExpanded(!expanded)}
aria-expanded={expanded}
className={`w-full rounded-lg px-3 py-3 text-center ring-1 transition-colors ${passRateColorClass(cell.pass_rate)} hover:brightness-110 ${
isBest ? 'ring-2 ring-emerald-500/60' : isWorst ? 'ring-2 ring-red-500/40' : ''
}`}
className={`w-full rounded-lg bg-gray-800/60 px-3 py-3 text-center ring-1 transition-colors hover:bg-gray-700/60 ${passRateRingClass(cell.pass_rate)}`}
>
<div className="flex items-center justify-center gap-1">
<div className="flex items-center justify-center">
<span
className={`text-lg font-semibold tabular-nums ${passRateTextClass(cell.pass_rate)}`}
>
{pct}%
</span>
{isBest && (
<span className="text-xs text-emerald-400" title="Best performer">
&#9650;
</span>
)}
{isWorst && (
<span className="text-xs text-red-400" title="Worst performer">
&#9660;
</span>
)}
</div>
<div className="mt-0.5 text-xs text-gray-400">
{cell.passed_count}/{cell.eval_count} pass | avg {avgPct}%
Expand Down
2 changes: 1 addition & 1 deletion apps/studio/src/components/Sidebar.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ function ProjectRunDetailSidebar({

<div className="border-b border-gray-800 px-4 py-2">
<Link to="/" className="text-xs text-gray-400 hover:text-cyan-400">
&larr; All Projects
&larr; All Benchmarks
</Link>
<p className="mt-1 truncate text-sm font-medium text-gray-300">{projectId}</p>
</div>
Expand Down
1 change: 1 addition & 0 deletions apps/studio/src/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ export interface StudioConfigResponse {
pass_threshold?: number;
read_only?: boolean;
project_name?: string;
multi_project_dashboard?: boolean;
}

export interface RemoteStatusResponse {
Expand Down
Loading
Loading