diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts index 47556f7eb..9d5053cdd 100644 --- a/apps/cli/src/commands/results/eval-runner.ts +++ b/apps/cli/src/commands/results/eval-runner.ts @@ -376,7 +376,7 @@ export function registerEvalRoutes( }); // ── Project-scoped variants ──────────────────────────────────────────── - app.get('/api/benchmarks/:projectId/eval/discover', async (c) => { + app.get('/api/benchmarks/:benchmarkId/eval/discover', async (c) => { const cwd = getCwd(c); try { const files = await discoverEvalFiles(cwd); @@ -392,7 +392,7 @@ export function registerEvalRoutes( } }); - app.get('/api/benchmarks/:projectId/eval/targets', async (c) => { + app.get('/api/benchmarks/:benchmarkId/eval/targets', async (c) => { const cwd = getCwd(c); try { const names = await discoverTargetsInProject(cwd); @@ -402,7 +402,7 @@ export function registerEvalRoutes( } }); - app.post('/api/benchmarks/:projectId/eval/run', async (c) => { + app.post('/api/benchmarks/:benchmarkId/eval/run', async (c) => { const cwd = getCwd(c); let body: RunEvalRequest; @@ -476,7 +476,7 @@ export function registerEvalRoutes( } }); - app.get('/api/benchmarks/:projectId/eval/status/:id', (c) => { + app.get('/api/benchmarks/:benchmarkId/eval/status/:id', (c) => { const id = c.req.param('id'); const run = activeRuns.get(id ?? ''); if (!run) return c.json({ error: 'Run not found' }, 404); @@ -492,7 +492,7 @@ export function registerEvalRoutes( }); }); - app.get('/api/benchmarks/:projectId/eval/runs', (c) => { + app.get('/api/benchmarks/:benchmarkId/eval/runs', (c) => { const runs = [...activeRuns.values()].map((r) => ({ id: r.id, status: r.status, @@ -505,7 +505,7 @@ export function registerEvalRoutes( return c.json({ runs }); }); - app.post('/api/benchmarks/:projectId/eval/preview', async (c) => { + app.post('/api/benchmarks/:benchmarkId/eval/preview', async (c) => { let body: RunEvalRequest; try { body = await c.req.json(); diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index acb658c84..de67aa536 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -12,10 +12,10 @@ * - GET /api/feedback — read feedback reviews * - POST /api/feedback — write feedback reviews * - GET /api/benchmarks — list registered benchmarks - * - GET /api/benchmarks/:projectId/runs — benchmark-scoped run list + * - GET /api/benchmarks/:benchmarkId/runs — benchmark-scoped run list * * All data routes (runs, suites, categories, evals, experiments, targets) - * exist in both unscoped (/api/...) and benchmark-scoped (/api/benchmarks/:projectId/...) + * exist in both unscoped (/api/...) and benchmark-scoped (/api/benchmarks/:benchmarkId/...) * variants. They share handler functions via DataContext, differing only in * how searchDir is resolved. * @@ -33,11 +33,11 @@ import { command, flag, number, option, optional, positional, string } from 'cmd import { DEFAULT_CATEGORY, type EvaluationResult, - addProject, - discoverProjects, - getProject, - loadProjectRegistry, - removeProject, + addBenchmark, + discoverBenchmarks, + getBenchmark, + loadBenchmarkRegistry, + removeBenchmark, } from '@agentv/core'; import type { Context } from 'hono'; import { Hono } from 'hono'; @@ -718,19 +718,19 @@ export function createApp( const readOnly = options?.readOnly === true; const app = new Hono(); - // ── Project resolution wrapper ──────────────────────────────────────── - // Resolves projectId → DataContext, returning 404 if not found. - function withProject( + // ── Benchmark resolution wrapper ────────────────────────────────────── + // Resolves benchmarkId → DataContext, returning 404 if not found. + function withBenchmark( c: C, handler: (c: C, ctx: DataContext) => Response | Promise, ): Response | Promise { - const project = getProject(c.req.param('projectId') ?? ''); - if (!project || !existsSync(project.path)) { + const benchmark = getBenchmark(c.req.param('benchmarkId') ?? ''); + if (!benchmark || !existsSync(benchmark.path)) { return c.json({ error: 'Project not found' }, 404); } return handler(c, { - searchDir: project.path, - agentvDir: path.join(project.path, '.agentv'), + searchDir: benchmark.path, + agentvDir: path.join(benchmark.path, '.agentv'), }); } @@ -754,10 +754,10 @@ export function createApp( } }); - // ── Project management endpoints ───────────────────────────────────── + // ── Benchmark management endpoints ─────────────────────────────────── - /** Convert a ProjectEntry to snake_case wire format. */ - function projectEntryToWire(entry: { + /** Convert a BenchmarkEntry to snake_case wire format. */ + function benchmarkEntryToWire(entry: { id: string; name: string; path: string; @@ -774,9 +774,9 @@ export function createApp( } app.get('/api/benchmarks', async (c) => { - const registry = loadProjectRegistry(); - const projects = await Promise.all( - registry.projects.map(async (p) => { + const registry = loadBenchmarkRegistry(); + const benchmarks = await Promise.all( + registry.benchmarks.map(async (p) => { let runCount = 0; let passRate = 0; let lastRun: string | null = null; @@ -789,17 +789,17 @@ export function createApp( lastRun = metas[0].timestamp; } } catch { - // Project path may be missing or inaccessible + // Benchmark path may be missing or inaccessible } return { - ...projectEntryToWire(p), + ...benchmarkEntryToWire(p), run_count: runCount, pass_rate: passRate, last_run: lastRun, }; }), ); - return c.json({ projects }); + return c.json({ projects: benchmarks }); }); app.post('/api/benchmarks', async (c) => { @@ -809,34 +809,34 @@ export function createApp( try { const body = await c.req.json<{ path: string }>(); if (!body.path) return c.json({ error: 'Missing path' }, 400); - const entry = addProject(body.path); - return c.json(projectEntryToWire(entry), 201); + const entry = addBenchmark(body.path); + return c.json(benchmarkEntryToWire(entry), 201); } catch (err) { return c.json({ error: (err as Error).message }, 400); } }); - app.delete('/api/benchmarks/:projectId', (c) => { + app.delete('/api/benchmarks/:benchmarkId', (c) => { if (readOnly) { return c.json({ error: 'Studio is running in read-only mode' }, 403); } - const removed = removeProject(c.req.param('projectId') ?? ''); + const removed = removeBenchmark(c.req.param('benchmarkId') ?? ''); if (!removed) return c.json({ error: 'Project not found' }, 404); return c.json({ ok: true }); }); - app.get('/api/benchmarks/:projectId/summary', async (c) => { - const project = getProject(c.req.param('projectId') ?? ''); - if (!project) return c.json({ error: 'Project not found' }, 404); + app.get('/api/benchmarks/:benchmarkId/summary', async (c) => { + const benchmark = getBenchmark(c.req.param('benchmarkId') ?? ''); + if (!benchmark) return c.json({ error: 'Project not found' }, 404); try { - const { runs: metas } = await listMergedResultFiles(project.path); + const { runs: metas } = await listMergedResultFiles(benchmark.path); const runCount = metas.length; const passRate = runCount > 0 ? metas.reduce((s, m) => s + m.passRate, 0) / runCount : 0; const lastRun = metas.length > 0 ? metas[0].timestamp : null; return c.json({ - id: project.id, - name: project.name, - path: project.path, + id: benchmark.id, + name: benchmark.name, + path: benchmark.path, run_count: runCount, pass_rate: passRate, last_run: lastRun, @@ -853,17 +853,17 @@ export function createApp( try { const body = await c.req.json<{ path: string }>(); if (!body.path) return c.json({ error: 'Missing path' }, 400); - const discovered = discoverProjects(body.path); - const registered = discovered.map((p) => projectEntryToWire(addProject(p))); + const discovered = discoverBenchmarks(body.path); + const registered = discovered.map((p) => benchmarkEntryToWire(addBenchmark(p))); return c.json({ discovered: registered }); } catch (err) { return c.json({ error: (err as Error).message }, 400); } }); - /** Aggregate runs from all registered projects, sorted by timestamp descending. */ + /** Aggregate runs from all registered benchmarks, sorted by timestamp descending. */ app.get('/api/benchmarks/all-runs', async (c) => { - const registry = loadProjectRegistry(); + const registry = loadBenchmarkRegistry(); const allRuns: Array<{ filename: string; display_name: string; @@ -880,7 +880,7 @@ export function createApp( project_name: string; }> = []; - for (const p of registry.projects) { + for (const p of registry.benchmarks) { try { const { runs: metas } = await listMergedResultFiles(p.path); for (const m of metas) { @@ -1023,60 +1023,60 @@ export function createApp( return c.json({ entries }); }); - // ── Data routes (project-scoped) ────────────────────────────────────── - // Same handlers as above, with project-resolved DataContext via withProject. + // ── Data routes (benchmark-scoped) ─────────────────────────────────── + // Same handlers as above, with benchmark-resolved DataContext via withBenchmark. - app.get('/api/benchmarks/:projectId/config', (c) => - withProject(c, (ctx, dataCtx) => + app.get('/api/benchmarks/:benchmarkId/config', (c) => + withBenchmark(c, (ctx, dataCtx) => handleConfig(ctx, dataCtx, { readOnly, multiProjectDashboard: options?.multiProjectDashboard, }), ), ); - app.get('/api/benchmarks/:projectId/remote/status', (c) => - withProject(c, async (ctx, dataCtx) => + app.get('/api/benchmarks/:benchmarkId/remote/status', (c) => + withBenchmark(c, async (ctx, dataCtx) => ctx.json(await getRemoteResultsStatus(dataCtx.searchDir)), ), ); - app.post('/api/benchmarks/:projectId/remote/sync', (c) => - withProject(c, async (ctx, dataCtx) => ctx.json(await syncRemoteResults(dataCtx.searchDir))), + app.post('/api/benchmarks/:benchmarkId/remote/sync', (c) => + withBenchmark(c, async (ctx, dataCtx) => ctx.json(await syncRemoteResults(dataCtx.searchDir))), ); - app.get('/api/benchmarks/:projectId/runs', (c) => withProject(c, handleRuns)); - app.get('/api/benchmarks/:projectId/runs/:filename', (c) => withProject(c, handleRunDetail)); - app.get('/api/benchmarks/:projectId/runs/:filename/suites', (c) => - withProject(c, handleRunSuites), + app.get('/api/benchmarks/:benchmarkId/runs', (c) => withBenchmark(c, handleRuns)); + app.get('/api/benchmarks/:benchmarkId/runs/:filename', (c) => withBenchmark(c, handleRunDetail)); + app.get('/api/benchmarks/:benchmarkId/runs/:filename/suites', (c) => + withBenchmark(c, handleRunSuites), ); - app.get('/api/benchmarks/:projectId/runs/:filename/categories', (c) => - withProject(c, handleRunCategories), + app.get('/api/benchmarks/:benchmarkId/runs/:filename/categories', (c) => + withBenchmark(c, handleRunCategories), ); - app.get('/api/benchmarks/:projectId/runs/:filename/categories/:category/suites', (c) => - withProject(c, handleCategorySuites), + app.get('/api/benchmarks/:benchmarkId/runs/:filename/categories/:category/suites', (c) => + withBenchmark(c, handleCategorySuites), ); - app.get('/api/benchmarks/:projectId/runs/:filename/evals/:evalId', (c) => - withProject(c, handleEvalDetail), + app.get('/api/benchmarks/:benchmarkId/runs/:filename/evals/:evalId', (c) => + withBenchmark(c, handleEvalDetail), ); - app.get('/api/benchmarks/:projectId/runs/:filename/evals/:evalId/files', (c) => - withProject(c, handleEvalFiles), + app.get('/api/benchmarks/:benchmarkId/runs/:filename/evals/:evalId/files', (c) => + withBenchmark(c, handleEvalFiles), ); - app.get('/api/benchmarks/:projectId/runs/:filename/evals/:evalId/files/*', (c) => - withProject(c, handleEvalFileContent), + app.get('/api/benchmarks/:benchmarkId/runs/:filename/evals/:evalId/files/*', (c) => + withBenchmark(c, handleEvalFileContent), ); - app.get('/api/benchmarks/:projectId/experiments', (c) => withProject(c, handleExperiments)); - app.get('/api/benchmarks/:projectId/compare', (c) => withProject(c, handleCompare)); - app.get('/api/benchmarks/:projectId/targets', (c) => withProject(c, handleTargets)); - app.get('/api/benchmarks/:projectId/feedback', (c) => withProject(c, handleFeedbackRead)); + app.get('/api/benchmarks/:benchmarkId/experiments', (c) => withBenchmark(c, handleExperiments)); + app.get('/api/benchmarks/:benchmarkId/compare', (c) => withBenchmark(c, handleCompare)); + app.get('/api/benchmarks/:benchmarkId/targets', (c) => withBenchmark(c, handleTargets)); + app.get('/api/benchmarks/:benchmarkId/feedback', (c) => withBenchmark(c, handleFeedbackRead)); // ── Eval runner routes (discovery, launch, status) ──────────────────── registerEvalRoutes( app, (c) => { - // For project-scoped routes, resolve to project path; otherwise use searchDir - const projectId = c.req.param('projectId'); - if (projectId) { - const project = getProject(projectId); - if (project) return project.path; + // For benchmark-scoped routes, resolve to benchmark path; otherwise use searchDir + const benchmarkId = c.req.param('benchmarkId'); + if (benchmarkId) { + const benchmark = getBenchmark(benchmarkId); + if (benchmark) return benchmark.path; } return searchDir; }, @@ -1218,10 +1218,10 @@ export const resultsServeCommand = command({ const cwd = dir ?? process.cwd(); const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117); - // ── Project management commands (non-server) ───────────────────── + // ── Benchmark management commands (non-server) ─────────────────── if (add) { try { - const entry = addProject(add); + const entry = addBenchmark(add); console.log(`Registered project: ${entry.name} (${entry.id}) at ${entry.path}`); } catch (err) { console.error(`Error: ${(err as Error).message}`); @@ -1231,7 +1231,7 @@ export const resultsServeCommand = command({ } if (remove) { - const removed = removeProject(remove); + const removed = removeBenchmark(remove); if (removed) { console.log(`Unregistered project: ${remove}`); } else { @@ -1242,13 +1242,13 @@ export const resultsServeCommand = command({ } if (discover) { - const discovered = discoverProjects(discover); + const discovered = discoverBenchmarks(discover); if (discovered.length === 0) { console.log(`No projects with .agentv/ found under ${discover}`); return; } for (const p of discovered) { - const entry = addProject(p); + const entry = addBenchmark(p); console.log(`Registered: ${entry.name} (${entry.id}) at ${entry.path}`); } console.log(`\nDiscovered ${discovered.length} project(s).`); @@ -1256,8 +1256,8 @@ export const resultsServeCommand = command({ } // ── Determine multi-project mode ──────────────────────────────── - const registry = loadProjectRegistry(); - const { isMultiProject, showMultiWarning } = resolveDashboardMode(registry.projects.length, { + const registry = loadBenchmarkRegistry(); + const { isMultiProject, showMultiWarning } = resolveDashboardMode(registry.benchmarks.length, { multi, single, }); @@ -1302,7 +1302,7 @@ export const resultsServeCommand = command({ } if (isMultiProject) { - console.log(`Multi-project mode: ${registry.projects.length} project(s) registered`); + console.log(`Multi-project mode: ${registry.benchmarks.length} project(s) registered`); } else if (results.length > 0 && sourceFile) { console.log(`Serving ${results.length} result(s) from ${sourceFile}`); } else { diff --git a/apps/studio/src/components/EvalDetail.tsx b/apps/studio/src/components/EvalDetail.tsx index 4b3414751..a0a2ddfa9 100644 --- a/apps/studio/src/components/EvalDetail.tsx +++ b/apps/studio/src/components/EvalDetail.tsx @@ -10,8 +10,14 @@ import { useState } from 'react'; import { useQuery } from '@tanstack/react-query'; -import { isPassing, useEvalFileContent, useEvalFiles, useStudioConfig } from '~/lib/api'; -import { projectEvalFileContentOptions, projectEvalFilesOptions } from '~/lib/api'; +import { + benchmarkEvalFileContentOptions, + benchmarkEvalFilesOptions, + isPassing, + useEvalFileContent, + useEvalFiles, + useStudioConfig, +} from '~/lib/api'; import type { AssertionEntry, EvalResult, ScoreEntry } from '~/lib/types'; import { FeedbackPanel } from './FeedbackPanel'; @@ -23,7 +29,7 @@ import { ScoreBar } from './ScoreBar'; interface EvalDetailProps { eval: EvalResult; runId: string; - projectId?: string; + benchmarkId?: string; } type Tab = 'checks' | 'files' | 'feedback'; @@ -40,7 +46,7 @@ function findFirstFile(nodes: FileNode[]): string | null { return null; } -export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps) { +export function EvalDetail({ eval: result, runId, benchmarkId }: EvalDetailProps) { const [activeTab, setActiveTab] = useState('checks'); const { data: config } = useStudioConfig(); const isReadOnly = config?.read_only === true; @@ -96,7 +102,7 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps) )} {activeTab === 'files' && (
- +
)} {!isReadOnly && activeTab === 'feedback' && ( @@ -280,13 +286,13 @@ function ChecksTab({ result }: { result: EvalResult }) { function FilesTab({ result, runId, - projectId, -}: { result: EvalResult; runId: string; projectId?: string }) { + benchmarkId, +}: { result: EvalResult; runId: string; benchmarkId?: string }) { const evalId = result.testId; - // Use project-scoped API hooks when projectId is present - const { data: filesData } = projectId - ? useQuery(projectEvalFilesOptions(projectId, runId, evalId)) + // Use benchmark-scoped API hooks when benchmarkId is present + const { data: filesData } = benchmarkId + ? useQuery(benchmarkEvalFilesOptions(benchmarkId, runId, evalId)) : useEvalFiles(runId, evalId); const files = filesData?.files ?? []; @@ -294,8 +300,8 @@ function FilesTab({ const effectivePath = selectedPath ?? (files.length > 0 ? findFirstFile(files) : null); - const { data: fileContentData, isLoading: isLoadingContent } = projectId - ? useQuery(projectEvalFileContentOptions(projectId, runId, evalId, effectivePath ?? '')) + const { data: fileContentData, isLoading: isLoadingContent } = benchmarkId + ? useQuery(benchmarkEvalFileContentOptions(benchmarkId, runId, evalId, effectivePath ?? '')) : useEvalFileContent(runId, evalId, effectivePath ?? ''); if (files.length === 0) { diff --git a/apps/studio/src/components/ProjectCard.tsx b/apps/studio/src/components/ProjectCard.tsx index ff0642dc8..a7af8cebe 100644 --- a/apps/studio/src/components/ProjectCard.tsx +++ b/apps/studio/src/components/ProjectCard.tsx @@ -7,7 +7,7 @@ import { Link } from '@tanstack/react-router'; -import type { ProjectSummary } from '~/lib/types'; +import type { BenchmarkSummary } from '~/lib/types'; function formatTimeAgo(timestamp: string | null): string { if (!timestamp) return 'No runs'; @@ -23,13 +23,13 @@ function formatTimeAgo(timestamp: string | null): string { return `${days}d ago`; } -export function ProjectCard({ project }: { project: ProjectSummary }) { +export function ProjectCard({ project }: { project: BenchmarkSummary }) { const passPercent = Math.round(project.pass_rate * 100); return (
diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index a380b6e88..7e78431a7 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -18,7 +18,7 @@ import { StatsCards } from './StatsCards'; interface RunDetailProps { results: EvalResult[]; runId: string; - projectId?: string; + benchmarkId?: string; } interface SuiteStats { @@ -85,7 +85,7 @@ function buildCategoryGroups(results: EvalResult[], passThreshold: number): Cate .sort((a, b) => a.name.localeCompare(b.name)); } -export function RunDetail({ results, runId, projectId }: RunDetailProps) { +export function RunDetail({ results, runId, benchmarkId }: RunDetailProps) { const { data: config } = useStudioConfig(); const passThreshold = config?.threshold ?? config?.pass_threshold ?? 0.8; @@ -191,10 +191,10 @@ export function RunDetail({ results, runId, projectId }: RunDetailProps) { )} - {projectId ? ( + {benchmarkId ? ( {result.testId} diff --git a/apps/studio/src/components/RunEvalModal.tsx b/apps/studio/src/components/RunEvalModal.tsx index 886f53eb2..5a17fc85e 100644 --- a/apps/studio/src/components/RunEvalModal.tsx +++ b/apps/studio/src/components/RunEvalModal.tsx @@ -28,7 +28,7 @@ import type { RunEvalRequest } from '~/lib/types'; export interface RunEvalModalProps { open: boolean; onClose: () => void; - projectId?: string; + benchmarkId?: string; prefill?: { suiteFilter?: string; testIds?: string[]; @@ -38,7 +38,7 @@ export interface RunEvalModalProps { // ── Component ──────────────────────────────────────────────────────────── -export function RunEvalModal({ open, onClose, projectId, prefill }: RunEvalModalProps) { +export function RunEvalModal({ open, onClose, benchmarkId, prefill }: RunEvalModalProps) { const queryClient = useQueryClient(); // Form state @@ -58,8 +58,8 @@ export function RunEvalModal({ open, onClose, projectId, prefill }: RunEvalModal const [cliPreview, setCliPreview] = useState(null); // Data - const { data: discoverData } = useEvalDiscover(projectId); - const { data: targetsData } = useEvalTargets(projectId); + const { data: discoverData } = useEvalDiscover(benchmarkId); + const { data: targetsData } = useEvalTargets(benchmarkId); const { data: runStatus } = useEvalRunStatus(activeRunId); const evalFiles = useMemo(() => discoverData?.eval_files ?? [], [discoverData]); @@ -110,10 +110,10 @@ export function RunEvalModal({ open, onClose, projectId, prefill }: RunEvalModal setCliPreview(null); return; } - previewEvalCommand(req, projectId) + previewEvalCommand(req, benchmarkId) .then((r) => setCliPreview(r.command)) .catch(() => setCliPreview(null)); - }, [buildRequest, projectId]); + }, [buildRequest, benchmarkId]); // Add a test ID pill function addTestId() { @@ -134,7 +134,7 @@ export function RunEvalModal({ open, onClose, projectId, prefill }: RunEvalModal setLaunching(true); try { const req = buildRequest(); - const result = await launchEvalRun(req, projectId); + const result = await launchEvalRun(req, benchmarkId); setActiveRunId(result.id); } catch (err) { setError((err as Error).message); diff --git a/apps/studio/src/components/RunList.tsx b/apps/studio/src/components/RunList.tsx index 29ee3a8fe..36b5d9be7 100644 --- a/apps/studio/src/components/RunList.tsx +++ b/apps/studio/src/components/RunList.tsx @@ -16,7 +16,7 @@ import { PassRatePill } from './PassRatePill'; interface RunListProps { runs: RunMeta[]; - projectId?: string; + benchmarkId?: string; emptyMessage?: React.ReactNode; } @@ -48,7 +48,7 @@ function runLabel(run: RunMeta): string { return run.display_name ?? run.filename; } -export function RunList({ runs, projectId, emptyMessage }: RunListProps) { +export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) { if (runs.length === 0) { return (
@@ -101,10 +101,10 @@ export function RunList({ runs, projectId, emptyMessage }: RunListProps) { {/* Run name */} - {projectId ? ( + {benchmarkId ? ( {label} diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index 64067076f..a0886eba7 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -18,12 +18,12 @@ import { Link, useLocation, useMatchRoute } from '@tanstack/react-router'; import { isPassing, - useAllProjectRuns, + useAllBenchmarkRuns, + useBenchmarkList, + useBenchmarkRunDetail, + useBenchmarkRunList, useCategorySuites, useExperiments, - useProjectList, - useProjectRunDetail, - useProjectRunList, useRunDetail, useRunList, useStudioConfig, @@ -71,38 +71,42 @@ export function Sidebar() { // ── Project-scoped route matching ──────────────────────────────────── const projectEvalMatch = matchRoute({ - to: '/projects/$projectId/evals/$runId/$evalId', + to: '/projects/$benchmarkId/evals/$runId/$evalId', fuzzy: true, }); const projectRunMatch = matchRoute({ - to: '/projects/$projectId/runs/$runId', + to: '/projects/$benchmarkId/runs/$runId', fuzzy: true, }); const projectMatch = matchRoute({ - to: '/projects/$projectId', + to: '/projects/$benchmarkId', fuzzy: true, }); // Project-scoped eval detail - if (projectEvalMatch && typeof projectEvalMatch === 'object' && 'projectId' in projectEvalMatch) { - const { projectId, runId, evalId } = projectEvalMatch as { - projectId: string; + if ( + projectEvalMatch && + typeof projectEvalMatch === 'object' && + 'benchmarkId' in projectEvalMatch + ) { + const { benchmarkId, runId, evalId } = projectEvalMatch as { + benchmarkId: string; runId: string; evalId: string; }; - return ; + return ; } // Project-scoped run detail - if (projectRunMatch && typeof projectRunMatch === 'object' && 'projectId' in projectRunMatch) { - const { projectId, runId } = projectRunMatch as { projectId: string; runId: string }; - return ; + if (projectRunMatch && typeof projectRunMatch === 'object' && 'benchmarkId' in projectRunMatch) { + const { benchmarkId, runId } = projectRunMatch as { benchmarkId: string; runId: string }; + return ; } // Project home (runs/experiments/targets) - if (projectMatch && typeof projectMatch === 'object' && 'projectId' in projectMatch) { - const { projectId } = projectMatch as { projectId: string }; - return ; + if (projectMatch && typeof projectMatch === 'object' && 'benchmarkId' in projectMatch) { + const { benchmarkId } = projectMatch as { benchmarkId: string }; + return ; } // ── Unscoped route matching ────────────────────────────────────────── @@ -149,7 +153,7 @@ export function Sidebar() { function RunSidebar() { const matchRoute = useMatchRoute(); - const { data: projectData } = useProjectList(); + const { data: projectData } = useBenchmarkList(); const hasProjects = (projectData?.projects.length ?? 0) > 0; const isHome = matchRoute({ to: '/' }); @@ -159,7 +163,7 @@ function RunSidebar() { const useAggregated = hasProjects && isHome !== false; const { data: localData } = useRunList(); - const { data: aggregatedData } = useAllProjectRuns(); + const { data: aggregatedData } = useAllBenchmarkRuns(); const data = useAggregated ? aggregatedData : localData; return ( @@ -188,8 +192,8 @@ function RunSidebar() { return ( @@ -391,13 +395,13 @@ function CategorySidebar({ runId, category }: { runId: string; category: string // ── Project-scoped sidebars ────────────────────────────────────────────── function ProjectRunDetailSidebar({ - projectId, + benchmarkId, currentRunId, }: { - projectId: string; + benchmarkId: string; currentRunId?: string; }) { - const { data } = useProjectRunList(projectId); + const { data } = useBenchmarkRunList(benchmarkId); return ( @@ -411,7 +415,7 @@ function ProjectRunDetailSidebar({ ← All Benchmarks -

{projectId}

+

{benchmarkId}

); } -function ProjectRunsTab({ projectId }: { projectId: string }) { +function ProjectRunsTab({ benchmarkId }: { benchmarkId: string }) { const queryClient = useQueryClient(); - const { data, isLoading, error } = useProjectRunList(projectId); - const { data: remoteStatus } = useRemoteStatus(projectId); + const { data, isLoading, error } = useBenchmarkRunList(benchmarkId); + const { data: remoteStatus } = useRemoteStatus(benchmarkId); const [sourceFilter, setSourceFilter] = useState('all'); const [syncInFlight, setSyncInFlight] = useState(false); @@ -120,13 +120,13 @@ function ProjectRunsTab({ projectId }: { projectId: string }) { async function handleSyncRemote() { setSyncInFlight(true); try { - await syncRemoteResultsApi(projectId); + await syncRemoteResultsApi(benchmarkId); await Promise.all([ - queryClient.invalidateQueries({ queryKey: ['projects', projectId, 'runs'] }), - queryClient.invalidateQueries({ queryKey: ['projects', projectId, 'experiments'] }), - queryClient.invalidateQueries({ queryKey: ['projects', projectId, 'compare'] }), - queryClient.invalidateQueries({ queryKey: ['projects', projectId, 'targets'] }), - queryClient.invalidateQueries({ queryKey: ['remote-status', projectId] }), + queryClient.invalidateQueries({ queryKey: ['benchmarks', benchmarkId, 'runs'] }), + queryClient.invalidateQueries({ queryKey: ['benchmarks', benchmarkId, 'experiments'] }), + queryClient.invalidateQueries({ queryKey: ['benchmarks', benchmarkId, 'compare'] }), + queryClient.invalidateQueries({ queryKey: ['benchmarks', benchmarkId, 'targets'] }), + queryClient.invalidateQueries({ queryKey: ['remote-status', benchmarkId] }), ]); } finally { setSyncInFlight(false); @@ -160,13 +160,13 @@ function ProjectRunsTab({ projectId }: { projectId: string }) { syncInFlight={syncInFlight} onSync={handleSyncRemote} /> - + ); } -function ProjectExperimentsTab({ projectId }: { projectId: string }) { - const { data, isLoading } = useQuery(projectExperimentsOptions(projectId)); +function ProjectExperimentsTab({ benchmarkId }: { benchmarkId: string }) { + const { data, isLoading } = useQuery(benchmarkExperimentsOptions(benchmarkId)); const experiments = (data as ExperimentsResponse | undefined)?.experiments ?? []; if (isLoading) { @@ -209,13 +209,13 @@ function ProjectExperimentsTab({ projectId }: { projectId: string }) { ); } -function ProjectCompareTab({ projectId }: { projectId: string }) { - const { data, isLoading, isError, error } = useQuery(projectCompareOptions(projectId)); +function ProjectCompareTab({ benchmarkId }: { benchmarkId: string }) { + const { data, isLoading, isError, error } = useQuery(benchmarkCompareOptions(benchmarkId)); return ; } -function ProjectTargetsTab({ projectId }: { projectId: string }) { - const { data, isLoading } = useQuery(projectTargetsOptions(projectId)); +function ProjectTargetsTab({ benchmarkId }: { benchmarkId: string }) { + const { data, isLoading } = useQuery(benchmarkTargetsOptions(benchmarkId)); const targets = (data as TargetsResponse | undefined)?.targets ?? []; if (isLoading) { diff --git a/apps/studio/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx b/apps/studio/src/routes/projects/$benchmarkId_/evals/$runId.$evalId.tsx similarity index 84% rename from apps/studio/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx rename to apps/studio/src/routes/projects/$benchmarkId_/evals/$runId.$evalId.tsx index 62242c174..0ad6c533d 100644 --- a/apps/studio/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx +++ b/apps/studio/src/routes/projects/$benchmarkId_/evals/$runId.$evalId.tsx @@ -7,15 +7,15 @@ import { useState } from 'react'; import { EvalDetail } from '~/components/EvalDetail'; import { RunEvalModal } from '~/components/RunEvalModal'; -import { useProjectRunDetail, useStudioConfig } from '~/lib/api'; +import { useBenchmarkRunDetail, useStudioConfig } from '~/lib/api'; -export const Route = createFileRoute('/projects/$projectId_/evals/$runId/$evalId')({ +export const Route = createFileRoute('/projects/$benchmarkId_/evals/$runId/$evalId')({ component: ProjectEvalDetailPage, }); function ProjectEvalDetailPage() { - const { projectId, runId, evalId } = Route.useParams(); - const { data, isLoading, error } = useProjectRunDetail(projectId, runId); + const { benchmarkId, runId, evalId } = Route.useParams(); + const { data, isLoading, error } = useBenchmarkRunDetail(benchmarkId, runId); const { data: config } = useStudioConfig(); const [showRunEval, setShowRunEval] = useState(false); const isReadOnly = config?.read_only === true; @@ -69,12 +69,12 @@ function ProjectEvalDetailPage() { )} - + {!isReadOnly && ( setShowRunEval(false)} - projectId={projectId} + benchmarkId={benchmarkId} prefill={{ testIds: [evalId], target: result.target, diff --git a/apps/studio/src/routes/projects/$projectId_/runs/$runId.tsx b/apps/studio/src/routes/projects/$benchmarkId_/runs/$runId.tsx similarity index 85% rename from apps/studio/src/routes/projects/$projectId_/runs/$runId.tsx rename to apps/studio/src/routes/projects/$benchmarkId_/runs/$runId.tsx index 2d44cb894..d9776d23a 100644 --- a/apps/studio/src/routes/projects/$projectId_/runs/$runId.tsx +++ b/apps/studio/src/routes/projects/$benchmarkId_/runs/$runId.tsx @@ -7,15 +7,15 @@ import { useState } from 'react'; import { RunDetail } from '~/components/RunDetail'; import { RunEvalModal } from '~/components/RunEvalModal'; -import { useProjectRunDetail, useStudioConfig } from '~/lib/api'; +import { useBenchmarkRunDetail, useStudioConfig } from '~/lib/api'; -export const Route = createFileRoute('/projects/$projectId_/runs/$runId')({ +export const Route = createFileRoute('/projects/$benchmarkId_/runs/$runId')({ component: ProjectRunDetailPage, }); function ProjectRunDetailPage() { - const { projectId, runId } = Route.useParams(); - const { data, isLoading, error } = useProjectRunDetail(projectId, runId); + const { benchmarkId, runId } = Route.useParams(); + const { data, isLoading, error } = useBenchmarkRunDetail(benchmarkId, runId); const { data: config } = useStudioConfig(); const [showRunEval, setShowRunEval] = useState(false); const isReadOnly = config?.read_only === true; @@ -78,12 +78,12 @@ function ProjectRunDetailPage() { )} - + {!isReadOnly && ( setShowRunEval(false)} - projectId={projectId} + benchmarkId={benchmarkId} prefill={prefill} /> )} diff --git a/packages/core/src/projects.ts b/packages/core/src/benchmarks.ts similarity index 58% rename from packages/core/src/projects.ts rename to packages/core/src/benchmarks.ts index 5fc58a541..2fb603b12 100644 --- a/packages/core/src/projects.ts +++ b/packages/core/src/benchmarks.ts @@ -1,19 +1,19 @@ /** - * Project registry for AgentV Studio multi-project support. + * Benchmark registry for AgentV Studio multi-benchmark support. * - * A Project = any directory containing a `.agentv/` folder. - * The registry lives at `~/.agentv/projects.yaml` and tracks registered projects. + * A Benchmark = any directory containing a `.agentv/` folder. + * The registry lives at `~/.agentv/projects.yaml` and tracks registered benchmarks. * * YAML format: - * projects: + * benchmarks: * - id: my-app * name: My App * path: /home/user/projects/my-app * addedAt: "2026-03-20T10:00:00Z" * lastOpenedAt: "2026-03-30T14:00:00Z" * - * To extend: use loadProjectRegistry() / saveProjectRegistry() for CRUD, - * discoverProjects() to scan a directory tree for `.agentv/` directories. + * To extend: use loadBenchmarkRegistry() / saveBenchmarkRegistry() for CRUD, + * discoverBenchmarks() to scan a directory tree for `.agentv/` directories. */ import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs'; @@ -25,7 +25,7 @@ import { getAgentvHome } from './paths.js'; // ── Types ─────────────────────────────────────────────────────────────── -export interface ProjectEntry { +export interface BenchmarkEntry { id: string; name: string; path: string; @@ -33,59 +33,59 @@ export interface ProjectEntry { lastOpenedAt: string; } -export interface ProjectRegistry { - projects: ProjectEntry[]; +export interface BenchmarkRegistry { + benchmarks: BenchmarkEntry[]; } // ── Registry path ─────────────────────────────────────────────────────── -export function getProjectsRegistryPath(): string { +export function getBenchmarksRegistryPath(): string { return path.join(getAgentvHome(), 'projects.yaml'); } // ── Load / Save ───────────────────────────────────────────────────────── -export function loadProjectRegistry(): ProjectRegistry { - const registryPath = getProjectsRegistryPath(); +export function loadBenchmarkRegistry(): BenchmarkRegistry { + const registryPath = getBenchmarksRegistryPath(); if (!existsSync(registryPath)) { - return { projects: [] }; + return { benchmarks: [] }; } try { const raw = readFileSync(registryPath, 'utf-8'); const parsed = parseYaml(raw); - if (!parsed || !Array.isArray(parsed.projects)) { - return { projects: [] }; + if (!parsed || !Array.isArray(parsed.benchmarks)) { + return { benchmarks: [] }; } - return { projects: parsed.projects as ProjectEntry[] }; + return { benchmarks: parsed.benchmarks as BenchmarkEntry[] }; } catch { - return { projects: [] }; + return { benchmarks: [] }; } } -export function saveProjectRegistry(registry: ProjectRegistry): void { - const registryPath = getProjectsRegistryPath(); +export function saveBenchmarkRegistry(registry: BenchmarkRegistry): void { + const registryPath = getBenchmarksRegistryPath(); const dir = path.dirname(registryPath); if (!existsSync(dir)) { mkdirSync(dir, { recursive: true }); } - writeFileSync(registryPath, stringifyYaml(registry), 'utf-8'); + writeFileSync(registryPath, stringifyYaml({ benchmarks: registry.benchmarks }), 'utf-8'); } // ── CRUD operations ───────────────────────────────────────────────────── /** - * Derive a URL-safe project ID from a directory path. + * Derive a URL-safe benchmark ID from a directory path. * Uses the directory basename, lowercased, with non-alphanumeric chars replaced by hyphens. * Appends a numeric suffix if the ID already exists in the registry. */ -export function deriveProjectId(dirPath: string, existingIds: string[]): string { +export function deriveBenchmarkId(dirPath: string, existingIds: string[]): string { const base = path .basename(dirPath) .toLowerCase() .replace(/[^a-z0-9-]/g, '-') .replace(/-+/g, '-') .replace(/^-|-$/g, ''); - let candidate = base || 'project'; + let candidate = base || 'benchmark'; let suffix = 2; while (existingIds.includes(candidate)) { candidate = `${base}-${suffix}`; @@ -95,11 +95,11 @@ export function deriveProjectId(dirPath: string, existingIds: string[]): string } /** - * Register a project by path. Returns the new entry, or the existing one if already registered. + * Register a benchmark by path. Returns the new entry, or the existing one if already registered. * Validates that the path exists and contains a `.agentv/` directory. */ -export function addProject(projectPath: string): ProjectEntry { - const absPath = path.resolve(projectPath); +export function addBenchmark(benchmarkPath: string): BenchmarkEntry { + const absPath = path.resolve(benchmarkPath); if (!existsSync(absPath)) { throw new Error(`Directory not found: ${absPath}`); } @@ -107,56 +107,56 @@ export function addProject(projectPath: string): ProjectEntry { throw new Error(`No .agentv/ directory found in ${absPath}. Run an evaluation first.`); } - const registry = loadProjectRegistry(); - const existing = registry.projects.find((p) => p.path === absPath); + const registry = loadBenchmarkRegistry(); + const existing = registry.benchmarks.find((p) => p.path === absPath); if (existing) { return existing; } const now = new Date().toISOString(); - const entry: ProjectEntry = { - id: deriveProjectId( + const entry: BenchmarkEntry = { + id: deriveBenchmarkId( absPath, - registry.projects.map((p) => p.id), + registry.benchmarks.map((p) => p.id), ), name: path.basename(absPath), path: absPath, addedAt: now, lastOpenedAt: now, }; - registry.projects.push(entry); - saveProjectRegistry(registry); + registry.benchmarks.push(entry); + saveBenchmarkRegistry(registry); return entry; } /** - * Remove a project by ID. Returns true if removed, false if not found. + * Remove a benchmark by ID. Returns true if removed, false if not found. */ -export function removeProject(projectId: string): boolean { - const registry = loadProjectRegistry(); - const idx = registry.projects.findIndex((p) => p.id === projectId); +export function removeBenchmark(benchmarkId: string): boolean { + const registry = loadBenchmarkRegistry(); + const idx = registry.benchmarks.findIndex((p) => p.id === benchmarkId); if (idx < 0) return false; - registry.projects.splice(idx, 1); - saveProjectRegistry(registry); + registry.benchmarks.splice(idx, 1); + saveBenchmarkRegistry(registry); return true; } /** - * Look up a project by ID. Returns undefined if not found. + * Look up a benchmark by ID. Returns undefined if not found. */ -export function getProject(projectId: string): ProjectEntry | undefined { - return loadProjectRegistry().projects.find((p) => p.id === projectId); +export function getBenchmark(benchmarkId: string): BenchmarkEntry | undefined { + return loadBenchmarkRegistry().benchmarks.find((p) => p.id === benchmarkId); } /** - * Update lastOpenedAt for a project. + * Update lastOpenedAt for a benchmark. */ -export function touchProject(projectId: string): void { - const registry = loadProjectRegistry(); - const entry = registry.projects.find((p) => p.id === projectId); +export function touchBenchmark(benchmarkId: string): void { + const registry = loadBenchmarkRegistry(); + const entry = registry.benchmarks.find((p) => p.id === benchmarkId); if (entry) { entry.lastOpenedAt = new Date().toISOString(); - saveProjectRegistry(registry); + saveBenchmarkRegistry(registry); } } @@ -164,9 +164,9 @@ export function touchProject(projectId: string): void { /** * Scan a directory tree (up to maxDepth levels) for directories containing `.agentv/`. - * Returns absolute paths of discovered project directories. + * Returns absolute paths of discovered benchmark directories. */ -export function discoverProjects(rootDir: string, maxDepth = 2): string[] { +export function discoverBenchmarks(rootDir: string, maxDepth = 2): string[] { const absRoot = path.resolve(rootDir); if (!existsSync(absRoot) || !statSync(absRoot).isDirectory()) { return []; @@ -177,10 +177,10 @@ export function discoverProjects(rootDir: string, maxDepth = 2): string[] { function scan(dir: string, depth: number) { if (depth > maxDepth) return; - // Check if this directory itself is a project + // Check if this directory itself is a benchmark if (existsSync(path.join(dir, '.agentv'))) { results.push(dir); - return; // Don't scan subdirectories of a project + return; // Don't scan subdirectories of a benchmark } if (depth === maxDepth) return; diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 64a68de23..a44e7d6e7 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -80,18 +80,18 @@ export { getWorkspacePoolRoot, } from './paths.js'; export { - type ProjectEntry, - type ProjectRegistry, - loadProjectRegistry, - saveProjectRegistry, - addProject, - removeProject, - getProject, - touchProject, - discoverProjects, - deriveProjectId, - getProjectsRegistryPath, -} from './projects.js'; + type BenchmarkEntry, + type BenchmarkRegistry, + loadBenchmarkRegistry, + saveBenchmarkRegistry, + addBenchmark, + removeBenchmark, + getBenchmark, + touchBenchmark, + discoverBenchmarks, + deriveBenchmarkId, + getBenchmarksRegistryPath, +} from './benchmarks.js'; export { trimBaselineResult } from './evaluation/baseline.js'; export { DEFAULT_CATEGORY, deriveCategory } from './evaluation/category.js'; export * from './observability/index.js';