diff --git a/AGENTS.md b/AGENTS.md index 7d78e1cc2..13c3cd0cc 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -49,6 +49,16 @@ Before adding features, research how peer frameworks solve the problem. Prefer t ### 5. YAGNI — You Aren't Gonna Need It Don't build features until there's a concrete need. Before adding a new capability, ask: "Is there real demand for this today, or am I anticipating future needs?" Numeric thresholds, extra tracking fields, and configurable knobs should be omitted until users actually request them. Start with the simplest version (e.g., boolean over numeric range) and extend later if needed. +**YAGNI applies to *how* you meet a real request, not just *whether* to meet it.** The common failure mode is not "I built X and nobody wanted it." It's "someone asked for X and I built a bigger X than they asked for." Guard against that with these habits: + +1. **Audit existing primitives before adding new ones.** When an issue asks for capability Y, the first question is not "how do I build Y?" — it's **"what does the codebase already do that addresses Y?"** Grep for existing functions, endpoints, and config shapes. Many requests are satisfied by a behavior that already exists and just needs to be surfaced, configured, or exercised differently. +2. **Treat issue language as a hint, not a spec.** Issues describe problems *and* implementations. "We need a discovery root" is one implementation of "we need the registry to update live." When an issue lists multiple acceptable approaches (or its acceptance criteria don't actually require the implementation it names), pick the one with the least code surface. Summarize the acceptance criteria in your own words, strip out implementation nouns ("discovery root," "watcher," "registry reload"), then match them against existing primitives before designing anything new. +3. 
**Prefer data/config changes over new mechanisms.** If the observable effect is "this list should be editable at runtime," prefer "re-read the file per request" over "add a watcher + a new field + a precedence rule + a new endpoint." Config-driven beats code-driven when both are sufficient. +4. **Stop when scope doubles.** If an implementation's surface area grows to more than ~2× the starting estimate (extra types, extra endpoints, extra invariants), that's a red flag to re-plan, not a sign to push through. Pause and ask: "What would the smallest possible version look like? Does the issue actually require more than that?" +5. **If you are about to add a second mode, two-layer precedence, or an invariant between two optional fields, stop.** `source: manual | discovered`, "pinned wins over discovered," `excluded_paths` filtering the discovered set — every one of these is a sign that you're in complexity territory that a simpler data model would have avoided. + +**Call out existing overengineering.** If, while working on a task, you notice a *current* feature in the repo that looks overengineered relative to what it's used for (multiple modes, optional precedence rules, dead-looking extensibility scaffolding), flag it — don't silently fix it. Open a tracking issue titled "cleanup: simplify X" that lists: the observable behavior today, the simpler model that would cover it, and the migration notes. Link to the code. Do not widen your current PR to absorb the cleanup unless the user asks. + ### 6. Non-Breaking Extensions New fields should be optional. Existing configurations must continue working unchanged. 
@@ -146,36 +156,64 @@ cd ../agentv.worktrees/- ## Wire Format Convention -**All external-facing JSON and JSONL output uses `snake_case` keys.** This applies to: -- JSONL result files on disk (`test_id`, `token_usage`, `duration_ms`) -- Artifact-writer output (`pass_rate`, `tests_run`, `total_tool_calls`) -- CLI command JSON output (`results summary`, `results failures`, `results show`) -- YAML eval config fields +**Everything that crosses a process boundary uses `snake_case` keys. Internal TypeScript uses `camelCase`. Translate at the boundary — never in the middle.** + +The rule is blanket: if the key is going to disk, to a user's editor, into a JSON response, or onto a CLI, it's snake_case. There is no "well this file is internal-ish" carve-out. If in doubt, snake_case. + +### snake_case surfaces +- All YAML files on disk: `*.eval.yaml`, `agentv.config.yaml`, `benchmarks.yaml`, `studio/config.yaml`, any future YAML we add. +- JSONL result files (`test_id`, `token_usage`, `duration_ms`). +- Artifact-writer output (`pass_rate`, `tests_run`, `total_tool_calls`). +- HTTP response bodies from `agentv serve` / Studio (`added_at`, `pass_rate`, `benchmark_id`). +- CLI JSON output (`agentv results summary`, `results failures`, `results show`). +- Anything consumed by non-TS tooling (Python, jq pipelines, external dashboards). + +### camelCase surfaces +- TypeScript source: all variables, parameters, fields, type members. +- Internal in-memory shapes passed between TS modules. -**Internal TypeScript uses `camelCase`** as standard. Convert at the serialization boundary only: +### Translate only at the boundary +Define a second interface for the wire shape and convert in one place — don't smear snake_case through TS internals. 
```typescript -// Interfaces for JSON output use snake_case (they define the wire format) -interface SummaryJson { - total: number; - pass_rate: number; - failed_test_ids: string[]; +// Wire shape — snake_case, matches what hits disk / the network +interface BenchmarkEntryYaml { + id: string; + name: string; + path: string; + added_at: string; + last_opened_at: string; } -// Function internals use camelCase (idiomatic TypeScript) -function formatSummary(results: EvaluationResult[]): SummaryJson { - const passRate = computePassRate(results); - const failedTestIds = findFailed(results); +// Internal shape — camelCase, what every TS call site sees +interface BenchmarkEntry { + id: string; + name: string; + path: string; + addedAt: string; + lastOpenedAt: string; +} + +function fromYaml(e: BenchmarkEntryYaml): BenchmarkEntry { + return { id: e.id, name: e.name, path: e.path, addedAt: e.added_at, lastOpenedAt: e.last_opened_at }; +} - return { - total: results.length, - pass_rate: passRate, - failed_test_ids: failedTestIds, - }; +function toYaml(e: BenchmarkEntry): BenchmarkEntryYaml { + return { id: e.id, name: e.name, path: e.path, added_at: e.addedAt, last_opened_at: e.lastOpenedAt }; } ``` -**Reading back:** `parseJsonlResults()` in `artifact-writer.ts` converts snake_case → camelCase when reading JSONL into TypeScript. +Yes, this is two interfaces and two functions per entity. That's the price of keeping TS idiomatic while staying faithful to the wire contract. Don't skip it — dumping TS objects directly to YAML leaks `addedAt`-style camelCase onto disk and breaks jq/Python consumers. + +### Anti-patterns +- `writeFileSync(path, stringifyYaml(tsObject))` — dumps TS field names verbatim. Wrong. +- `interface Foo { testId: string; ... }` for a JSON response body — `test_id`, always. +- Accepting both `testId` and `test_id` on input "for back-compat" when nothing is shipped yet. Just snake_case. 
+ +### Existing divergences +If you spot a camelCase key already on disk or in a response (e.g. a legacy endpoint), treat it as a bug: migrate it to snake_case in the same PR where you touch that code path. Don't grandfather it in. + +**Reading back:** `parseJsonlResults()` in `artifact-writer.ts` converts snake_case → camelCase when reading JSONL into TypeScript. `fromYaml` / `toYaml` in `packages/core/src/benchmarks.ts` is the model for YAML boundaries. **Why:** Aligns with skill-creator (claude-plugins-official) and broader Python/JSON ecosystem conventions where snake_case is the standard wire format. diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index eeb3a4d62..580d4e1fd 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -12,6 +12,8 @@ * - GET /api/feedback — read feedback reviews * - POST /api/feedback — write feedback reviews * - GET /api/benchmarks — list registered benchmarks + * - POST /api/benchmarks — register a benchmark by path + * - DELETE /api/benchmarks/:benchmarkId — unregister a benchmark * - GET /api/benchmarks/:benchmarkId/runs — benchmark-scoped run list * * All data routes (runs, suites, categories, evals, experiments, targets) @@ -20,7 +22,7 @@ * how searchDir is resolved. * * Before starting the server, the command enforces `required_version` from - * the cwd's `.agentv/config.yaml` (single-project scope) via + * the cwd's `.agentv/config.yaml` (single-benchmark scope) via * `enforceRequiredVersion()`, matching the behavior of `agentv eval`. 
* * Exported functions (for testing): @@ -38,7 +40,6 @@ import { DEFAULT_CATEGORY, type EvaluationResult, addBenchmark, - discoverBenchmarks, getBenchmark, loadBenchmarkRegistry, loadConfig, @@ -123,18 +124,18 @@ export function loadResults(content: string): EvaluationResult[] { } export function resolveDashboardMode( - projectCount: number, + benchmarkCount: number, options: { multi?: boolean; single?: boolean }, -): { isMultiProject: boolean; showMultiWarning: boolean } { +): { isMultiBenchmark: boolean; showMultiWarning: boolean } { if (options.single === true) { - return { isMultiProject: false, showMultiWarning: options.multi === true }; + return { isMultiBenchmark: false, showMultiWarning: options.multi === true }; } if (options.multi === true) { - return { isMultiProject: true, showMultiWarning: true }; + return { isMultiBenchmark: true, showMultiWarning: true }; } - return { isMultiProject: projectCount > 1, showMultiWarning: false }; + return { isMultiBenchmark: benchmarkCount > 1, showMultiWarning: false }; } // ── Feedback persistence ───────────────────────────────────────────────── @@ -252,7 +253,7 @@ function stripHeavyFields(results: readonly EvaluationResult[]) { // ── Shared data-route handlers ─────────────────────────────────────────── // // Each handler takes a Hono Context and a DataContext (resolved directories). -// Both unscoped and project-scoped routes call the same handler, differing +// Both unscoped and benchmark-scoped routes call the same handler, differing // only in how the DataContext is constructed. 
interface DataContext { @@ -815,13 +816,13 @@ async function handleTargets(c: C, { searchDir, agentvDir }: DataContext) { function handleConfig( c: C, { agentvDir, searchDir }: DataContext, - options?: { readOnly?: boolean; multiProjectDashboard?: boolean }, + options?: { readOnly?: boolean; multiBenchmarkDashboard?: boolean }, ) { return c.json({ ...loadStudioConfig(agentvDir), read_only: options?.readOnly === true, - project_name: path.basename(searchDir), - multi_project_dashboard: options?.multiProjectDashboard === true, + benchmark_name: path.basename(searchDir), + multi_benchmark_dashboard: options?.multiBenchmarkDashboard === true, }); } @@ -887,7 +888,7 @@ export function createApp( resultDir: string, cwd?: string, sourceFile?: string, - options?: { studioDir?: string; readOnly?: boolean; multiProjectDashboard?: boolean }, + options?: { studioDir?: string; readOnly?: boolean; multiBenchmarkDashboard?: boolean }, ): Hono { const searchDir = cwd ?? resultDir; const agentvDir = path.join(searchDir, '.agentv'); @@ -896,14 +897,16 @@ export function createApp( const app = new Hono(); // ── Benchmark resolution wrapper ────────────────────────────────────── - // Resolves benchmarkId → DataContext, returning 404 if not found. + // Resolves benchmarkId → DataContext, returning 404 if not found. The + // registry is re-read on every request, so edits to benchmarks.yaml (or + // POST /api/benchmarks) take effect without restarting the server. function withBenchmark( c: C, handler: (c: C, ctx: DataContext) => Response | Promise, ): Response | Promise { const benchmark = getBenchmark(c.req.param('benchmarkId') ?? 
''); if (!benchmark || !existsSync(benchmark.path)) { - return c.json({ error: 'Project not found' }, 404); + return c.json({ error: 'Benchmark not found' }, 404); } return handler(c, { searchDir: benchmark.path, @@ -976,7 +979,7 @@ export function createApp( }; }), ); - return c.json({ projects: benchmarks }); + return c.json({ benchmarks }); }); app.post('/api/benchmarks', async (c) => { @@ -993,18 +996,9 @@ export function createApp( } }); - app.delete('/api/benchmarks/:benchmarkId', (c) => { - if (readOnly) { - return c.json({ error: 'Studio is running in read-only mode' }, 403); - } - const removed = removeBenchmark(c.req.param('benchmarkId') ?? ''); - if (!removed) return c.json({ error: 'Project not found' }, 404); - return c.json({ ok: true }); - }); - app.get('/api/benchmarks/:benchmarkId/summary', async (c) => { const benchmark = getBenchmark(c.req.param('benchmarkId') ?? ''); - if (!benchmark) return c.json({ error: 'Project not found' }, 404); + if (!benchmark) return c.json({ error: 'Benchmark not found' }, 404); try { const { runs: metas } = await listMergedResultFiles(benchmark.path); const runCount = metas.length; @@ -1019,22 +1013,7 @@ export function createApp( last_run: lastRun, }); } catch { - return c.json({ error: 'Failed to read project' }, 500); - } - }); - - app.post('/api/benchmarks/discover', async (c) => { - if (readOnly) { - return c.json({ error: 'Studio is running in read-only mode' }, 403); - } - try { - const body = await c.req.json<{ path: string }>(); - if (!body.path) return c.json({ error: 'Missing path' }, 400); - const discovered = discoverBenchmarks(body.path); - const registered = discovered.map((p) => benchmarkEntryToWire(addBenchmark(p))); - return c.json({ discovered: registered }); - } catch (err) { - return c.json({ error: (err as Error).message }, 400); + return c.json({ error: 'Failed to read benchmark' }, 500); } }); @@ -1053,8 +1032,8 @@ export function createApp( target?: string; experiment?: string; source: 
'local' | 'remote'; - project_id: string; - project_name: string; + benchmark_id: string; + benchmark_name: string; }> = []; for (const p of registry.benchmarks) { @@ -1084,12 +1063,12 @@ export function createApp( source: m.source, ...(target && { target }), ...(experiment && { experiment }), - project_id: p.id, - project_name: p.name, + benchmark_id: p.id, + benchmark_name: p.name, }); } } catch { - // skip inaccessible projects + // skip inaccessible benchmarks } } @@ -1097,12 +1076,21 @@ export function createApp( return c.json({ runs: allRuns }); }); + app.delete('/api/benchmarks/:benchmarkId', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + const removed = removeBenchmark(c.req.param('benchmarkId') ?? ''); + if (!removed) return c.json({ error: 'Benchmark not found' }, 404); + return c.json({ ok: true }); + }); + // ── Data routes (unscoped) ──────────────────────────────────────────── app.get('/api/config', (c) => handleConfig(c, defaultCtx, { readOnly, - multiProjectDashboard: options?.multiProjectDashboard, + multiBenchmarkDashboard: options?.multiBenchmarkDashboard, }), ); app.get('/api/remote/status', async (c) => c.json(await getRemoteResultsStatus(searchDir))); @@ -1219,7 +1207,7 @@ export function createApp( withBenchmark(c, (ctx, dataCtx) => handleConfig(ctx, dataCtx, { readOnly, - multiProjectDashboard: options?.multiProjectDashboard, + multiBenchmarkDashboard: options?.multiBenchmarkDashboard, }), ), ); @@ -1389,33 +1377,28 @@ export const resultsServeCommand = command({ multi: flag({ long: 'multi', description: - 'Launch in multi-project dashboard mode (deprecated; use auto-detect or --single)', + 'Launch in multi-benchmark dashboard mode (deprecated; use auto-detect or --single)', }), single: flag({ long: 'single', - description: 'Force single-project dashboard mode', + description: 'Force single-benchmark dashboard mode', }), add: option({ type: optional(string), long: 'add', - 
description: 'Register a project by path', + description: 'Register a benchmark by path', }), remove: option({ type: optional(string), long: 'remove', - description: 'Unregister a project by ID', - }), - discover: option({ - type: optional(string), - long: 'discover', - description: 'Scan a directory tree for repos with .agentv/', + description: 'Unregister a benchmark by ID', }), readOnly: flag({ long: 'read-only', description: 'Disable write operations and launch Studio in read-only leaderboard mode', }), }, - handler: async ({ source, port, dir, multi, single, add, remove, discover, readOnly }) => { + handler: async ({ source, port, dir, multi, single, add, remove, readOnly }) => { const cwd = dir ?? process.cwd(); const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117); @@ -1423,7 +1406,7 @@ export const resultsServeCommand = command({ if (add) { try { const entry = addBenchmark(add); - console.log(`Registered project: ${entry.name} (${entry.id}) at ${entry.path}`); + console.log(`Registered benchmark: ${entry.name} (${entry.id}) at ${entry.path}`); } catch (err) { console.error(`Error: ${(err as Error).message}`); process.exit(1); @@ -1434,46 +1417,32 @@ export const resultsServeCommand = command({ if (remove) { const removed = removeBenchmark(remove); if (removed) { - console.log(`Unregistered project: ${remove}`); + console.log(`Unregistered benchmark: ${remove}`); } else { - console.error(`Project not found: ${remove}`); + console.error(`Benchmark not found: ${remove}`); process.exit(1); } return; } - if (discover) { - const discovered = discoverBenchmarks(discover); - if (discovered.length === 0) { - console.log(`No projects with .agentv/ found under ${discover}`); - return; - } - for (const p of discovered) { - const entry = addBenchmark(p); - console.log(`Registered: ${entry.name} (${entry.id}) at ${entry.path}`); - } - console.log(`\nDiscovered ${discovered.length} project(s).`); - return; - } - // ── Version check 
──────────────────────────────────────────────── // Enforce `required_version` from .agentv/config.yaml so Studio/serve // match `agentv eval` behavior. Same prompt in TTY, warn+continue - // otherwise. Single-project scope only — when one agentv instance + // otherwise. Single-benchmark scope only — when one agentv instance // serves multiple repos with differing version requirements, a - // per-project local install is required instead. + // per-benchmark local install is required instead. const repoRoot = await findRepoRoot(cwd); const yamlConfig = await loadConfig(path.join(cwd, '_'), repoRoot); if (yamlConfig?.required_version) { await enforceRequiredVersion(yamlConfig.required_version); } - // ── Determine multi-project mode ──────────────────────────────── + // ── Determine multi-benchmark mode ─────────────────────────────── const registry = loadBenchmarkRegistry(); - const { isMultiProject, showMultiWarning } = resolveDashboardMode(registry.benchmarks.length, { - multi, - single, - }); + const { isMultiBenchmark, showMultiWarning } = resolveDashboardMode( + registry.benchmarks.length, + { multi, single }, + ); try { let results: EvaluationResult[] = []; @@ -1505,17 +1474,17 @@ export const resultsServeCommand = command({ const resultDir = sourceFile ? path.dirname(path.resolve(sourceFile)) : cwd; const app = createApp(results, resultDir, cwd, sourceFile, { readOnly, - multiProjectDashboard: isMultiProject, + multiBenchmarkDashboard: isMultiBenchmark, }); if (showMultiWarning) { console.warn( - 'Warning: --multi is deprecated. Studio now auto-detects multi-project mode when multiple projects are registered. Use --single to force the single-project view.', + 'Warning: --multi is deprecated. Studio now auto-detects multi-benchmark mode when multiple benchmarks are registered. 
Use --single to force the single-benchmark view.', ); } - if (isMultiProject) { - console.log(`Multi-project mode: ${registry.benchmarks.length} project(s) registered`); + if (isMultiBenchmark) { + console.log(`Multi-benchmark mode: ${registry.benchmarks.length} benchmark(s) registered`); } else if (results.length > 0 && sourceFile) { console.log(`Serving ${results.length} result(s) from ${sourceFile}`); } else { diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 6b79b5c19..702cd9ece 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -102,44 +102,44 @@ describe('loadResults', () => { // ── resolveDashboardMode ─────────────────────────────────────────────── describe('resolveDashboardMode', () => { - it('defaults to single-project mode when no projects are registered', () => { + it('defaults to single-benchmark mode when no benchmarks are registered', () => { expect(resolveDashboardMode(0, {})).toEqual({ - isMultiProject: false, + isMultiBenchmark: false, showMultiWarning: false, }); }); - it('defaults to single-project mode when exactly one project is registered', () => { + it('defaults to single-benchmark mode when exactly one benchmark is registered', () => { expect(resolveDashboardMode(1, {})).toEqual({ - isMultiProject: false, + isMultiBenchmark: false, showMultiWarning: false, }); }); - it('defaults to multi-project mode when multiple projects are registered', () => { + it('defaults to multi-benchmark mode when multiple benchmarks are registered', () => { expect(resolveDashboardMode(2, {})).toEqual({ - isMultiProject: true, + isMultiBenchmark: true, showMultiWarning: false, }); }); - it('forces multi-project mode with a deprecation warning when --multi is used', () => { + it('forces multi-benchmark mode with a deprecation warning when --multi is used', () => { expect(resolveDashboardMode(1, { multi: true })).toEqual({ - isMultiProject: true, 
+ isMultiBenchmark: true, showMultiWarning: true, }); }); - it('forces single-project mode when --single is used', () => { + it('forces single-benchmark mode when --single is used', () => { expect(resolveDashboardMode(3, { single: true })).toEqual({ - isMultiProject: false, + isMultiBenchmark: false, showMultiWarning: false, }); }); it('lets --single override --multi', () => { expect(resolveDashboardMode(3, { multi: true, single: true })).toEqual({ - isMultiProject: false, + isMultiBenchmark: false, showMultiWarning: true, }); }); @@ -370,17 +370,17 @@ describe('serve app', () => { const app = createApp(results, tempDir, undefined, undefined, { studioDir, readOnly: true, - multiProjectDashboard: true, + multiBenchmarkDashboard: true, }); const res = await app.request('/api/config'); expect(res.status).toBe(200); const data = (await res.json()) as { read_only?: boolean; - multi_project_dashboard?: boolean; + multi_benchmark_dashboard?: boolean; }; expect(data.read_only).toBe(true); - expect(data.multi_project_dashboard).toBe(true); + expect(data.multi_benchmark_dashboard).toBe(true); }); }); diff --git a/apps/studio/src/components/ProjectCard.tsx b/apps/studio/src/components/BenchmarkCard.tsx similarity index 68% rename from apps/studio/src/components/ProjectCard.tsx rename to apps/studio/src/components/BenchmarkCard.tsx index a7af8cebe..a498c1a1a 100644 --- a/apps/studio/src/components/ProjectCard.tsx +++ b/apps/studio/src/components/BenchmarkCard.tsx @@ -1,8 +1,8 @@ /** - * Project card for the multi-project dashboard. + * Benchmark card for the multi-benchmark dashboard. * - * Shows project name, path, run count, pass rate, and last run time. - * Click navigates to the project's run list. + * Shows benchmark name, path, run count, pass rate, and last run time. + * Click navigates to the benchmark's run list. 
*/ import { Link } from '@tanstack/react-router'; @@ -23,34 +23,34 @@ function formatTimeAgo(timestamp: string | null): string { return `${days}d ago`; } -export function ProjectCard({ project }: { project: BenchmarkSummary }) { - const passPercent = Math.round(project.pass_rate * 100); +export function BenchmarkCard({ benchmark }: { benchmark: BenchmarkSummary }) { + const passPercent = Math.round(benchmark.pass_rate * 100); return (

- {project.name} + {benchmark.name}

-

{project.path}

+

{benchmark.path}

Runs

-

{project.run_count}

+

{benchmark.run_count}

Pass Rate

= 80 ? 'text-emerald-400' @@ -59,12 +59,12 @@ export function ProjectCard({ project }: { project: BenchmarkSummary }) { : 'text-red-400' }`} > - {project.run_count > 0 ? `${passPercent}%` : '--'} + {benchmark.run_count > 0 ? `${passPercent}%` : '--'}

Last Run

-

{formatTimeAgo(project.last_run)}

+

{formatTimeAgo(benchmark.last_run)}

diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index 7e78431a7..019885eec 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -193,7 +193,7 @@ export function RunDetail({ results, runId, benchmarkId }: RunDetailProps) { {benchmarkId ? ( diff --git a/apps/studio/src/components/RunEvalModal.tsx b/apps/studio/src/components/RunEvalModal.tsx index db7bd0d13..9c281a9dc 100644 --- a/apps/studio/src/components/RunEvalModal.tsx +++ b/apps/studio/src/components/RunEvalModal.tsx @@ -89,7 +89,7 @@ export function RunEvalModal({ open, onClose, benchmarkId, prefill }: RunEvalMod useEffect(() => { if (runStatus?.status === 'finished' || runStatus?.status === 'failed') { queryClient.invalidateQueries({ queryKey: ['runs'] }); - queryClient.invalidateQueries({ queryKey: ['projects'] }); + queryClient.invalidateQueries({ queryKey: ['benchmarks'] }); } }, [runStatus?.status, queryClient]); diff --git a/apps/studio/src/components/RunList.tsx b/apps/studio/src/components/RunList.tsx index 6a82b78e4..1d3f7be71 100644 --- a/apps/studio/src/components/RunList.tsx +++ b/apps/studio/src/components/RunList.tsx @@ -107,7 +107,7 @@ export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) { {benchmarkId ? 
( diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index 3bc35f0fd..5933d791d 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -70,44 +70,48 @@ function SidebarShell({ children }: { children: ReactNode }) { export function Sidebar() { const matchRoute = useMatchRoute(); - // ── Project-scoped route matching ──────────────────────────────────── - const projectEvalMatch = matchRoute({ - to: '/projects/$benchmarkId/evals/$runId/$evalId', + // ── Benchmark-scoped route matching ────────────────────────────────── + const benchmarkEvalMatch = matchRoute({ + to: '/benchmarks/$benchmarkId/evals/$runId/$evalId', fuzzy: true, }); - const projectRunMatch = matchRoute({ - to: '/projects/$benchmarkId/runs/$runId', + const benchmarkRunMatch = matchRoute({ + to: '/benchmarks/$benchmarkId/runs/$runId', fuzzy: true, }); - const projectMatch = matchRoute({ - to: '/projects/$benchmarkId', + const benchmarkMatch = matchRoute({ + to: '/benchmarks/$benchmarkId', fuzzy: true, }); - // Project-scoped eval detail + // Benchmark-scoped eval detail if ( - projectEvalMatch && - typeof projectEvalMatch === 'object' && - 'benchmarkId' in projectEvalMatch + benchmarkEvalMatch && + typeof benchmarkEvalMatch === 'object' && + 'benchmarkId' in benchmarkEvalMatch ) { - const { benchmarkId, runId, evalId } = projectEvalMatch as { + const { benchmarkId, runId, evalId } = benchmarkEvalMatch as { benchmarkId: string; runId: string; evalId: string; }; - return ; + return ; } - // Project-scoped run detail - if (projectRunMatch && typeof projectRunMatch === 'object' && 'benchmarkId' in projectRunMatch) { - const { benchmarkId, runId } = projectRunMatch as { benchmarkId: string; runId: string }; - return ; + // Benchmark-scoped run detail + if ( + benchmarkRunMatch && + typeof benchmarkRunMatch === 'object' && + 'benchmarkId' in benchmarkRunMatch + ) { + const { benchmarkId, runId } = benchmarkRunMatch as { 
benchmarkId: string; runId: string }; + return ; } - // Project home (runs/experiments/targets) - if (projectMatch && typeof projectMatch === 'object' && 'benchmarkId' in projectMatch) { - const { benchmarkId } = projectMatch as { benchmarkId: string }; - return ; + // Benchmark home (runs/experiments/targets) + if (benchmarkMatch && typeof benchmarkMatch === 'object' && 'benchmarkId' in benchmarkMatch) { + const { benchmarkId } = benchmarkMatch as { benchmarkId: string }; + return ; } // ── Unscoped route matching ────────────────────────────────────────── @@ -154,14 +158,14 @@ export function Sidebar() { function RunSidebar() { const matchRoute = useMatchRoute(); - const { data: projectData } = useBenchmarkList(); - const hasProjects = (projectData?.projects.length ?? 0) > 0; + const { data: benchmarkData } = useBenchmarkList(); + const hasBenchmarks = (benchmarkData?.benchmarks.length ?? 0) > 0; const isHome = matchRoute({ to: '/' }); const runMatch = matchRoute({ to: '/runs/$runId', fuzzy: true }); - // On the projects landing page, show aggregated runs from all projects - const useAggregated = hasProjects && isHome !== false; + // On the benchmarks landing page, show aggregated runs from all benchmarks + const useAggregated = hasBenchmarks && isHome !== false; const { data: localData } = useRunList(); const { data: aggregatedData } = useAllBenchmarkRuns(); @@ -199,15 +203,15 @@ function RunSidebar() { 'runId' in runMatch && (runMatch as { runId: string }).runId === run.filename; - // Aggregated runs link to their project's run detail - if (run.project_id) { + // Aggregated runs link to their benchmark's run detail + if (run.benchmark_id) { return ( {run.display_name ?? 
run.filename} @@ -404,9 +408,9 @@ function CategorySidebar({ runId, category }: { runId: string; category: string ); } -// ── Project-scoped sidebars ────────────────────────────────────────────── +// ── Benchmark-scoped sidebars ──────────────────────────────────────────── -function ProjectRunDetailSidebar({ +function BenchmarkRunDetailSidebar({ benchmarkId, currentRunId, }: { @@ -439,7 +443,7 @@ function ProjectRunDetailSidebar({ return ( @@ -498,7 +502,7 @@ function ProjectEvalSidebar({ return ( ; } @@ -277,24 +277,10 @@ export async function removeBenchmarkApi(benchmarkId: string): Promise { method: 'DELETE', }); if (!res.ok) { - throw new Error(`Failed to remove project: ${res.status}`); + throw new Error(`Failed to remove benchmark: ${res.status}`); } } -export async function discoverBenchmarksApi(dirPath: string): Promise { - const res = await fetch('/api/benchmarks/discover', { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ path: dirPath }), - }); - if (!res.ok) { - const err = (await res.json()) as { error: string }; - throw new Error(err.error || `Failed to discover: ${res.status}`); - } - const data = (await res.json()) as { discovered: BenchmarkEntry[] }; - return data.discovered; -} - /** Build the API base URL for a benchmark-scoped request. */ function benchmarkApiBase(benchmarkId: string): string { return `/api/benchmarks/${encodeURIComponent(benchmarkId)}`; diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index 98dfd03b0..6ca189340 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -17,8 +17,8 @@ export interface RunMeta { target?: string; experiment?: string; source: 'local' | 'remote'; - project_id?: string; - project_name?: string; + benchmark_id?: string; + benchmark_name?: string; /** Optional user-assigned tags from the run's sidecar tags.json. 
*/ tags?: string[]; } @@ -236,8 +236,8 @@ export interface StudioConfigResponse { /** @deprecated Use threshold */ pass_threshold?: number; read_only?: boolean; - project_name?: string; - multi_project_dashboard?: boolean; + benchmark_name?: string; + multi_benchmark_dashboard?: boolean; } export interface RemoteStatusResponse { @@ -267,7 +267,7 @@ export interface BenchmarkSummary { } export interface BenchmarkListResponse { - projects: BenchmarkSummary[]; + benchmarks: BenchmarkSummary[]; } export interface BenchmarkEntry { diff --git a/apps/studio/src/routeTree.gen.ts b/apps/studio/src/routeTree.gen.ts index f1552a6c6..dda530e93 100644 --- a/apps/studio/src/routeTree.gen.ts +++ b/apps/studio/src/routeTree.gen.ts @@ -12,14 +12,14 @@ import { Route as rootRouteImport } from './routes/__root' import { Route as SettingsRouteImport } from './routes/settings' import { Route as IndexRouteImport } from './routes/index' import { Route as RunsRunIdRouteImport } from './routes/runs/$runId' -import { Route as ProjectsBenchmarkIdRouteImport } from './routes/projects/$benchmarkId' import { Route as JobsRunIdRouteImport } from './routes/jobs/$runId' import { Route as ExperimentsExperimentNameRouteImport } from './routes/experiments/$experimentName' +import { Route as BenchmarksBenchmarkIdRouteImport } from './routes/benchmarks/$benchmarkId' import { Route as EvalsRunIdEvalIdRouteImport } from './routes/evals/$runId.$evalId' import { Route as RunsRunIdSuiteSuiteRouteImport } from './routes/runs/$runId_.suite.$suite' import { Route as RunsRunIdCategoryCategoryRouteImport } from './routes/runs/$runId_.category.$category' -import { Route as ProjectsBenchmarkIdRunsRunIdRouteImport } from './routes/projects/$benchmarkId_/runs/$runId' -import { Route as ProjectsBenchmarkIdEvalsRunIdEvalIdRouteImport } from './routes/projects/$benchmarkId_/evals/$runId.$evalId' +import { Route as BenchmarksBenchmarkIdRunsRunIdRouteImport } from './routes/benchmarks/$benchmarkId_/runs/$runId' +import { 
Route as BenchmarksBenchmarkIdEvalsRunIdEvalIdRouteImport } from './routes/benchmarks/$benchmarkId_/evals/$runId.$evalId' const SettingsRoute = SettingsRouteImport.update({ id: '/settings', @@ -36,11 +36,6 @@ const RunsRunIdRoute = RunsRunIdRouteImport.update({ path: '/runs/$runId', getParentRoute: () => rootRouteImport, } as any) -const ProjectsBenchmarkIdRoute = ProjectsBenchmarkIdRouteImport.update({ - id: '/projects/$benchmarkId', - path: '/projects/$benchmarkId', - getParentRoute: () => rootRouteImport, -} as any) const JobsRunIdRoute = JobsRunIdRouteImport.update({ id: '/jobs/$runId', path: '/jobs/$runId', @@ -52,6 +47,11 @@ const ExperimentsExperimentNameRoute = path: '/experiments/$experimentName', getParentRoute: () => rootRouteImport, } as any) +const BenchmarksBenchmarkIdRoute = BenchmarksBenchmarkIdRouteImport.update({ + id: '/benchmarks/$benchmarkId', + path: '/benchmarks/$benchmarkId', + getParentRoute: () => rootRouteImport, +} as any) const EvalsRunIdEvalIdRoute = EvalsRunIdEvalIdRouteImport.update({ id: '/evals/$runId/$evalId', path: '/evals/$runId/$evalId', @@ -68,113 +68,113 @@ const RunsRunIdCategoryCategoryRoute = path: '/runs/$runId/category/$category', getParentRoute: () => rootRouteImport, } as any) -const ProjectsBenchmarkIdRunsRunIdRoute = - ProjectsBenchmarkIdRunsRunIdRouteImport.update({ - id: '/projects/$benchmarkId_/runs/$runId', - path: '/projects/$benchmarkId/runs/$runId', +const BenchmarksBenchmarkIdRunsRunIdRoute = + BenchmarksBenchmarkIdRunsRunIdRouteImport.update({ + id: '/benchmarks/$benchmarkId_/runs/$runId', + path: '/benchmarks/$benchmarkId/runs/$runId', getParentRoute: () => rootRouteImport, } as any) -const ProjectsBenchmarkIdEvalsRunIdEvalIdRoute = - ProjectsBenchmarkIdEvalsRunIdEvalIdRouteImport.update({ - id: '/projects/$benchmarkId_/evals/$runId/$evalId', - path: '/projects/$benchmarkId/evals/$runId/$evalId', +const BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute = + BenchmarksBenchmarkIdEvalsRunIdEvalIdRouteImport.update({ 
+ id: '/benchmarks/$benchmarkId_/evals/$runId/$evalId', + path: '/benchmarks/$benchmarkId/evals/$runId/$evalId', getParentRoute: () => rootRouteImport, } as any) export interface FileRoutesByFullPath { '/': typeof IndexRoute '/settings': typeof SettingsRoute + '/benchmarks/$benchmarkId': typeof BenchmarksBenchmarkIdRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/jobs/$runId': typeof JobsRunIdRoute - '/projects/$benchmarkId': typeof ProjectsBenchmarkIdRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute - '/projects/$benchmarkId/runs/$runId': typeof ProjectsBenchmarkIdRunsRunIdRoute + '/benchmarks/$benchmarkId/runs/$runId': typeof BenchmarksBenchmarkIdRunsRunIdRoute '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute '/runs/$runId/suite/$suite': typeof RunsRunIdSuiteSuiteRoute - '/projects/$benchmarkId/evals/$runId/$evalId': typeof ProjectsBenchmarkIdEvalsRunIdEvalIdRoute + '/benchmarks/$benchmarkId/evals/$runId/$evalId': typeof BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute } export interface FileRoutesByTo { '/': typeof IndexRoute '/settings': typeof SettingsRoute + '/benchmarks/$benchmarkId': typeof BenchmarksBenchmarkIdRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/jobs/$runId': typeof JobsRunIdRoute - '/projects/$benchmarkId': typeof ProjectsBenchmarkIdRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute - '/projects/$benchmarkId/runs/$runId': typeof ProjectsBenchmarkIdRunsRunIdRoute + '/benchmarks/$benchmarkId/runs/$runId': typeof BenchmarksBenchmarkIdRunsRunIdRoute '/runs/$runId/category/$category': typeof RunsRunIdCategoryCategoryRoute '/runs/$runId/suite/$suite': typeof RunsRunIdSuiteSuiteRoute - '/projects/$benchmarkId/evals/$runId/$evalId': typeof ProjectsBenchmarkIdEvalsRunIdEvalIdRoute + '/benchmarks/$benchmarkId/evals/$runId/$evalId': typeof BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute } 
export interface FileRoutesById { __root__: typeof rootRouteImport '/': typeof IndexRoute '/settings': typeof SettingsRoute + '/benchmarks/$benchmarkId': typeof BenchmarksBenchmarkIdRoute '/experiments/$experimentName': typeof ExperimentsExperimentNameRoute '/jobs/$runId': typeof JobsRunIdRoute - '/projects/$benchmarkId': typeof ProjectsBenchmarkIdRoute '/runs/$runId': typeof RunsRunIdRoute '/evals/$runId/$evalId': typeof EvalsRunIdEvalIdRoute - '/projects/$benchmarkId_/runs/$runId': typeof ProjectsBenchmarkIdRunsRunIdRoute + '/benchmarks/$benchmarkId_/runs/$runId': typeof BenchmarksBenchmarkIdRunsRunIdRoute '/runs/$runId_/category/$category': typeof RunsRunIdCategoryCategoryRoute '/runs/$runId_/suite/$suite': typeof RunsRunIdSuiteSuiteRoute - '/projects/$benchmarkId_/evals/$runId/$evalId': typeof ProjectsBenchmarkIdEvalsRunIdEvalIdRoute + '/benchmarks/$benchmarkId_/evals/$runId/$evalId': typeof BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute } export interface FileRouteTypes { fileRoutesByFullPath: FileRoutesByFullPath fullPaths: | '/' | '/settings' + | '/benchmarks/$benchmarkId' | '/experiments/$experimentName' | '/jobs/$runId' - | '/projects/$benchmarkId' | '/runs/$runId' | '/evals/$runId/$evalId' - | '/projects/$benchmarkId/runs/$runId' + | '/benchmarks/$benchmarkId/runs/$runId' | '/runs/$runId/category/$category' | '/runs/$runId/suite/$suite' - | '/projects/$benchmarkId/evals/$runId/$evalId' + | '/benchmarks/$benchmarkId/evals/$runId/$evalId' fileRoutesByTo: FileRoutesByTo to: | '/' | '/settings' + | '/benchmarks/$benchmarkId' | '/experiments/$experimentName' | '/jobs/$runId' - | '/projects/$benchmarkId' | '/runs/$runId' | '/evals/$runId/$evalId' - | '/projects/$benchmarkId/runs/$runId' + | '/benchmarks/$benchmarkId/runs/$runId' | '/runs/$runId/category/$category' | '/runs/$runId/suite/$suite' - | '/projects/$benchmarkId/evals/$runId/$evalId' + | '/benchmarks/$benchmarkId/evals/$runId/$evalId' id: | '__root__' | '/' | '/settings' + | '/benchmarks/$benchmarkId' | 
'/experiments/$experimentName' | '/jobs/$runId' - | '/projects/$benchmarkId' | '/runs/$runId' | '/evals/$runId/$evalId' - | '/projects/$benchmarkId_/runs/$runId' + | '/benchmarks/$benchmarkId_/runs/$runId' | '/runs/$runId_/category/$category' | '/runs/$runId_/suite/$suite' - | '/projects/$benchmarkId_/evals/$runId/$evalId' + | '/benchmarks/$benchmarkId_/evals/$runId/$evalId' fileRoutesById: FileRoutesById } export interface RootRouteChildren { IndexRoute: typeof IndexRoute SettingsRoute: typeof SettingsRoute + BenchmarksBenchmarkIdRoute: typeof BenchmarksBenchmarkIdRoute ExperimentsExperimentNameRoute: typeof ExperimentsExperimentNameRoute JobsRunIdRoute: typeof JobsRunIdRoute - ProjectsBenchmarkIdRoute: typeof ProjectsBenchmarkIdRoute RunsRunIdRoute: typeof RunsRunIdRoute EvalsRunIdEvalIdRoute: typeof EvalsRunIdEvalIdRoute - ProjectsBenchmarkIdRunsRunIdRoute: typeof ProjectsBenchmarkIdRunsRunIdRoute + BenchmarksBenchmarkIdRunsRunIdRoute: typeof BenchmarksBenchmarkIdRunsRunIdRoute RunsRunIdCategoryCategoryRoute: typeof RunsRunIdCategoryCategoryRoute RunsRunIdSuiteSuiteRoute: typeof RunsRunIdSuiteSuiteRoute - ProjectsBenchmarkIdEvalsRunIdEvalIdRoute: typeof ProjectsBenchmarkIdEvalsRunIdEvalIdRoute + BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute: typeof BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute } declare module '@tanstack/react-router' { @@ -200,13 +200,6 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof RunsRunIdRouteImport parentRoute: typeof rootRouteImport } - '/projects/$benchmarkId': { - id: '/projects/$benchmarkId' - path: '/projects/$benchmarkId' - fullPath: '/projects/$benchmarkId' - preLoaderRoute: typeof ProjectsBenchmarkIdRouteImport - parentRoute: typeof rootRouteImport - } '/jobs/$runId': { id: '/jobs/$runId' path: '/jobs/$runId' @@ -221,6 +214,13 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof ExperimentsExperimentNameRouteImport parentRoute: typeof rootRouteImport } + '/benchmarks/$benchmarkId': { + id: 
'/benchmarks/$benchmarkId' + path: '/benchmarks/$benchmarkId' + fullPath: '/benchmarks/$benchmarkId' + preLoaderRoute: typeof BenchmarksBenchmarkIdRouteImport + parentRoute: typeof rootRouteImport + } '/evals/$runId/$evalId': { id: '/evals/$runId/$evalId' path: '/evals/$runId/$evalId' @@ -242,18 +242,18 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof RunsRunIdCategoryCategoryRouteImport parentRoute: typeof rootRouteImport } - '/projects/$benchmarkId_/runs/$runId': { - id: '/projects/$benchmarkId_/runs/$runId' - path: '/projects/$benchmarkId/runs/$runId' - fullPath: '/projects/$benchmarkId/runs/$runId' - preLoaderRoute: typeof ProjectsBenchmarkIdRunsRunIdRouteImport + '/benchmarks/$benchmarkId_/runs/$runId': { + id: '/benchmarks/$benchmarkId_/runs/$runId' + path: '/benchmarks/$benchmarkId/runs/$runId' + fullPath: '/benchmarks/$benchmarkId/runs/$runId' + preLoaderRoute: typeof BenchmarksBenchmarkIdRunsRunIdRouteImport parentRoute: typeof rootRouteImport } - '/projects/$benchmarkId_/evals/$runId/$evalId': { - id: '/projects/$benchmarkId_/evals/$runId/$evalId' - path: '/projects/$benchmarkId/evals/$runId/$evalId' - fullPath: '/projects/$benchmarkId/evals/$runId/$evalId' - preLoaderRoute: typeof ProjectsBenchmarkIdEvalsRunIdEvalIdRouteImport + '/benchmarks/$benchmarkId_/evals/$runId/$evalId': { + id: '/benchmarks/$benchmarkId_/evals/$runId/$evalId' + path: '/benchmarks/$benchmarkId/evals/$runId/$evalId' + fullPath: '/benchmarks/$benchmarkId/evals/$runId/$evalId' + preLoaderRoute: typeof BenchmarksBenchmarkIdEvalsRunIdEvalIdRouteImport parentRoute: typeof rootRouteImport } } @@ -262,16 +262,16 @@ declare module '@tanstack/react-router' { const rootRouteChildren: RootRouteChildren = { IndexRoute: IndexRoute, SettingsRoute: SettingsRoute, + BenchmarksBenchmarkIdRoute: BenchmarksBenchmarkIdRoute, ExperimentsExperimentNameRoute: ExperimentsExperimentNameRoute, JobsRunIdRoute: JobsRunIdRoute, - ProjectsBenchmarkIdRoute: ProjectsBenchmarkIdRoute, 
RunsRunIdRoute: RunsRunIdRoute, EvalsRunIdEvalIdRoute: EvalsRunIdEvalIdRoute, - ProjectsBenchmarkIdRunsRunIdRoute: ProjectsBenchmarkIdRunsRunIdRoute, + BenchmarksBenchmarkIdRunsRunIdRoute: BenchmarksBenchmarkIdRunsRunIdRoute, RunsRunIdCategoryCategoryRoute: RunsRunIdCategoryCategoryRoute, RunsRunIdSuiteSuiteRoute: RunsRunIdSuiteSuiteRoute, - ProjectsBenchmarkIdEvalsRunIdEvalIdRoute: - ProjectsBenchmarkIdEvalsRunIdEvalIdRoute, + BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute: + BenchmarksBenchmarkIdEvalsRunIdEvalIdRoute, } export const routeTree = rootRouteImport ._addFileChildren(rootRouteChildren) diff --git a/apps/studio/src/routes/projects/$benchmarkId.tsx b/apps/studio/src/routes/benchmarks/$benchmarkId.tsx similarity index 89% rename from apps/studio/src/routes/projects/$benchmarkId.tsx rename to apps/studio/src/routes/benchmarks/$benchmarkId.tsx index 3bcdaf281..f8d698190 100644 --- a/apps/studio/src/routes/projects/$benchmarkId.tsx +++ b/apps/studio/src/routes/benchmarks/$benchmarkId.tsx @@ -1,7 +1,7 @@ /** - * Project home route: tabbed view (Runs, Experiments, Analytics, Targets) scoped to a project. + * Benchmark home route: tabbed view (Runs, Experiments, Analytics, Targets) scoped to a benchmark. * - * Mirrors the single-project home page but fetches from project-scoped API endpoints. + * Mirrors the single-benchmark home page but fetches from benchmark-scoped API endpoints. 
*/ import { createFileRoute, useNavigate, useRouterState } from '@tanstack/react-router'; @@ -32,11 +32,11 @@ const tabs: { id: TabId; label: string }[] = [ { id: 'targets', label: 'Targets' }, ]; -export const Route = createFileRoute('/projects/$benchmarkId')({ - component: ProjectHomePage, +export const Route = createFileRoute('/benchmarks/$benchmarkId')({ + component: BenchmarkHomePage, }); -function ProjectHomePage() { +function BenchmarkHomePage() { const { benchmarkId } = Route.useParams(); const routerState = useRouterState(); const searchParams = routerState.location.search as Record; @@ -72,7 +72,7 @@ function ProjectHomePage() { key={t.id} onClick={() => navigate({ - to: '/projects/$benchmarkId', + to: '/benchmarks/$benchmarkId', params: { benchmarkId }, search: { tab: t.id } as Record, }) @@ -89,12 +89,12 @@ function ProjectHomePage() { - {activeTab === 'runs' && } - {activeTab === 'experiments' && } + {activeTab === 'runs' && } + {activeTab === 'experiments' && } {activeTab === 'analytics' && ( - + )} - {activeTab === 'targets' && } + {activeTab === 'targets' && } {!isReadOnly && ( 0; - const multiProjectDashboard = config?.multi_project_dashboard; + const hasBenchmarks = (benchmarkData?.benchmarks.length ?? 
0) > 0; + const multiBenchmarkDashboard = config?.multi_benchmark_dashboard; - if (projectsLoading || configLoading) { + if (benchmarksLoading || configLoading) { return ; } - if (multiProjectDashboard === true || (multiProjectDashboard === undefined && hasProjects)) { - return ; + if ( + multiBenchmarkDashboard === true || + (multiBenchmarkDashboard === undefined && hasBenchmarks) + ) { + return ; } - return ; + return ; } -// ── Projects Dashboard ────────────────────────────────────────────────── +// ── Benchmarks Dashboard ──────────────────────────────────────────────── -function ProjectsDashboard() { +function BenchmarksDashboard() { const { data } = useBenchmarkList(); const { data: config } = useStudioConfig(); const queryClient = useQueryClient(); const [addPath, setAddPath] = useState(''); - const [discoverPath, setDiscoverPath] = useState(''); const [error, setError] = useState(null); const [showAddForm, setShowAddForm] = useState(false); const [showRunEval, setShowRunEval] = useState(false); - const projects = data?.projects ?? []; + const benchmarks = data?.benchmarks ?? []; const isReadOnly = config?.read_only === true; - async function handleAddProject(e: React.FormEvent) { + async function handleAddBenchmark(e: React.FormEvent) { e.preventDefault(); if (!addPath.trim()) return; setError(null); @@ -88,22 +89,6 @@ function ProjectsDashboard() { } } - async function handleDiscover(e: React.FormEvent) { - e.preventDefault(); - if (!discoverPath.trim()) return; - setError(null); - try { - const discovered = await discoverBenchmarksApi(discoverPath.trim()); - setDiscoverPath(''); - if (discovered.length === 0) { - setError('No projects with .agentv/ found in that directory.'); - } - queryClient.invalidateQueries({ queryKey: ['benchmarks'] }); - } catch (err) { - setError((err as Error).message); - } - } - return (
@@ -138,7 +123,7 @@ function ProjectsDashboard() { {!isReadOnly && showAddForm && (
-
+
-
- setDiscoverPath(e.target.value)} - placeholder="Discover benchmarks in directory..." - className="flex-1 rounded-md border border-gray-700 bg-gray-800 px-3 py-1.5 text-sm text-white placeholder-gray-500 focus:border-cyan-600 focus:outline-none" - /> - -
)}
- {projects.map((project) => ( - + {benchmarks.map((benchmark) => ( + ))}
@@ -182,9 +152,9 @@ function ProjectsDashboard() { ); } -// ── Single-project home (existing behavior) ───────────────────────────── +// ── Single-benchmark home (existing behavior) ─────────────────────────── -function SingleProjectHome() { +function SingleBenchmarkHome() { const routerState = useRouterState(); const searchParams = routerState.location.search as Record; const tab = searchParams.tab as TabId | undefined; @@ -225,8 +195,8 @@ function SingleProjectHome() {

Evaluation Runs

- {config?.project_name && ( -

{config.project_name}

+ {config?.benchmark_name && ( +

{config.benchmark_name}

)}
{!isReadOnly && ( diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index b4c333625..49a8947cb 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -409,7 +409,7 @@ The `{timestamp}` placeholder is replaced with an ISO-like timestamp (e.g., `202 ### AGENTV_HOME -Override the data directory for heavy runtime artifacts — workspaces, workspace pool, subagents, trace state, git cache, and downloaded dependencies. Lightweight config and cache files (`version-check.json`, `last-config.json`, `projects.yaml`) always stay in `~/.agentv` regardless of this setting. +Override the data directory for heavy runtime artifacts — workspaces, workspace pool, subagents, trace state, git cache, and downloaded dependencies. Lightweight config and cache files (`version-check.json`, `last-config.json`, `benchmarks.yaml`) always stay in `~/.agentv` regardless of this setting. 
```bash # Linux/macOS diff --git a/apps/web/src/content/docs/docs/tools/studio.mdx b/apps/web/src/content/docs/docs/tools/studio.mdx index 1df0b42a2..33b96455e 100644 --- a/apps/web/src/content/docs/docs/tools/studio.mdx +++ b/apps/web/src/content/docs/docs/tools/studio.mdx @@ -45,11 +45,10 @@ agentv studio .agentv/results/runs/2026-03-30T11-45-56-989Z |--------|-------------| | `--port`, `-p` | Port to listen on (flag > `PORT` env var > 3117) | | `--dir`, `-d` | Working directory (default: current directory) | -| `--multi` | Launch in multi-project dashboard mode (deprecated; use auto-detect or `--single`) | -| `--single` | Force single-project dashboard mode | -| `--add <path>` | Register a project by path | -| `--remove <id>` | Unregister a project by ID | -| `--discover <path>` | Scan a directory tree for repos with `.agentv/` | +| `--multi` | Launch in multi-benchmark dashboard mode (deprecated; use auto-detect or `--single`) | +| `--single` | Force single-benchmark dashboard mode | +| `--add <path>` | Register a benchmark by path | +| `--remove <id>` | Unregister a benchmark by ID | ## Features @@ -152,28 +151,29 @@ agentv studio --add /path/to/my-evals agentv studio --add /path/to/other-evals ``` -Each path must contain a `.agentv/` directory. Registered benchmarks are stored in `~/.agentv/projects.yaml`. +Each path must contain a `.agentv/` directory. Registered benchmarks are stored in `~/.agentv/benchmarks.yaml`. -### Auto-Discovery +### Runtime behavior: no restart needed -Scan a parent directory to find and register all benchmark repos: +`benchmarks.yaml` is the single source of truth. Studio re-reads it on every `/api/benchmarks` request (which the UI polls every ~10 s), so any of these changes appear live without restarting `agentv serve`: -```bash -agentv studio --discover /path/to/repos -``` +- Adding via the UI's **Add Benchmark** form or `POST /api/benchmarks`. +- Removing via the UI's **Remove** button or `DELETE /api/benchmarks/:id`. 
+- Editing `~/.agentv/benchmarks.yaml` directly. +- Mounting the file via a Kubernetes ConfigMap — GitOps the ConfigMap and Studio reflects it within the next poll. -This recursively searches (up to 2 levels deep) for directories containing `.agentv/` and registers them. +This satisfies the 24/7-Studio use case: the server stays up; benchmarks come and go through config edits or API calls. ### Launching the Dashboard Studio auto-detects the mode based on how many benchmarks are registered: -- `0` or `1` registered: single-project view +- `0` or `1` registered: single-benchmark view - `2+` registered: Benchmarks dashboard ```bash agentv studio # auto-detects -agentv studio --single # force single-project view +agentv studio --single # force single-benchmark view ``` The landing page shows a card for each benchmark with run count, pass rate, and last run time. diff --git a/packages/core/src/benchmarks.ts b/packages/core/src/benchmarks.ts index 53a8a2ed2..39fa6b3c6 100644 --- a/packages/core/src/benchmarks.ts +++ b/packages/core/src/benchmarks.ts @@ -2,34 +2,39 @@ * Benchmark registry for AgentV Studio multi-benchmark support. * * A Benchmark = any directory containing a `.agentv/` folder. - * The registry lives at `~/.agentv/projects.yaml` and tracks registered benchmarks. + * The registry lives at `~/.agentv/benchmarks.yaml` and is the single source of + * truth for which benchmarks Studio shows. Studio re-reads the file on every + * `/api/benchmarks` request, so edits (direct, via POST /api/benchmarks, via + * the CLI's --add/--remove, or via a Kubernetes ConfigMap mount) are reflected + * without restarting `agentv serve`. 
* - * YAML format: + * YAML format (all keys snake_case per AGENTS.md §"Wire Format Convention"): * benchmarks: * - id: my-app * name: My App * path: /home/user/projects/my-app - * addedAt: "2026-03-20T10:00:00Z" - * lastOpenedAt: "2026-03-30T14:00:00Z" + * added_at: "2026-03-20T10:00:00Z" + * last_opened_at: "2026-03-30T14:00:00Z" * - * To extend: use loadBenchmarkRegistry() / saveBenchmarkRegistry() for CRUD, - * discoverBenchmarks() to scan a directory tree for `.agentv/` directories. + * Concurrency: the registry assumes a single writer. All mutating calls + * (add/remove/touchBenchmark) do read-modify-write on benchmarks.yaml + * without a lock. Studio's HTTP handlers are serialized by Node's + * single-threaded event loop, which satisfies the 24/7 deployment case. + * Run only one `agentv` process against a given home at a time. + * + * To extend: + * - CRUD: loadBenchmarkRegistry() / saveBenchmarkRegistry() + the + * add/remove/touch helpers. + * - discoverBenchmarks() is a one-shot filesystem utility for bulk + * registration; it does not run in the request path. 
*/ -import { - copyFileSync, - existsSync, - mkdirSync, - readFileSync, - readdirSync, - statSync, - writeFileSync, -} from 'node:fs'; +import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs'; import path from 'node:path'; import { parse as parseYaml, stringify as stringifyYaml } from 'yaml'; -import { getAgentvConfigDir, getAgentvHome } from './paths.js'; +import { getAgentvConfigDir } from './paths.js'; // ── Types ─────────────────────────────────────────────────────────────── @@ -48,41 +53,64 @@ export interface BenchmarkRegistry { // ── Registry path ─────────────────────────────────────────────────────── export function getBenchmarksRegistryPath(): string { - return path.join(getAgentvConfigDir(), 'projects.yaml'); + return path.join(getAgentvConfigDir(), 'benchmarks.yaml'); } -/** - * One-time migration: if projects.yaml exists at the old AGENTV_HOME location - * but not in ~/.agentv, copy it over. This handles the case where users had - * AGENTV_HOME set and projects.yaml was created there before the config/data split. - */ -function migrateProjectsYaml(targetPath: string): void { - const dataHome = getAgentvHome(); - const configDir = getAgentvConfigDir(); - if (dataHome === configDir) return; - const legacyPath = path.join(dataHome, 'projects.yaml'); - if (!existsSync(legacyPath)) return; - mkdirSync(path.dirname(targetPath), { recursive: true }); - copyFileSync(legacyPath, targetPath); +// ── Load / Save ───────────────────────────────────────────────────────── +// YAML uses snake_case per AGENTS.md §"Wire Format Convention"; TypeScript +// internals stay camelCase. fromYaml / toYaml handle the translation; every +// other function in this module works in camelCase only. 
+ +interface BenchmarkEntryYaml { + id: string; + name: string; + path: string; + added_at: string; + last_opened_at: string; } -// ── Load / Save ───────────────────────────────────────────────────────── +function fromYaml(raw: unknown): BenchmarkEntry | null { + if (!raw || typeof raw !== 'object') return null; + const e = raw as Partial; + if (typeof e.id !== 'string' || typeof e.name !== 'string' || typeof e.path !== 'string') { + return null; + } + return { + id: e.id, + name: e.name, + path: e.path, + addedAt: typeof e.added_at === 'string' ? e.added_at : '', + lastOpenedAt: typeof e.last_opened_at === 'string' ? e.last_opened_at : '', + }; +} + +function toYaml(entry: BenchmarkEntry): BenchmarkEntryYaml { + return { + id: entry.id, + name: entry.name, + path: entry.path, + added_at: entry.addedAt, + last_opened_at: entry.lastOpenedAt, + }; +} export function loadBenchmarkRegistry(): BenchmarkRegistry { const registryPath = getBenchmarksRegistryPath(); - if (!existsSync(registryPath)) { - migrateProjectsYaml(registryPath); - } if (!existsSync(registryPath)) { return { benchmarks: [] }; } try { const raw = readFileSync(registryPath, 'utf-8'); const parsed = parseYaml(raw); - if (!parsed || !Array.isArray(parsed.benchmarks)) { + if (!parsed || typeof parsed !== 'object') { return { benchmarks: [] }; } - return { benchmarks: parsed.benchmarks as BenchmarkEntry[] }; + const benchmarks = Array.isArray(parsed.benchmarks) + ? 
(parsed.benchmarks as unknown[]) + .map(fromYaml) + .filter((e): e is BenchmarkEntry => e !== null) + : []; + return { benchmarks }; } catch { return { benchmarks: [] }; } @@ -94,7 +122,8 @@ export function saveBenchmarkRegistry(registry: BenchmarkRegistry): void { if (!existsSync(dir)) { mkdirSync(dir, { recursive: true }); } - writeFileSync(registryPath, stringifyYaml({ benchmarks: registry.benchmarks }), 'utf-8'); + const payload = { benchmarks: registry.benchmarks.map(toYaml) }; + writeFileSync(registryPath, stringifyYaml(payload), 'utf-8'); } // ── CRUD operations ───────────────────────────────────────────────────── @@ -186,11 +215,13 @@ export function touchBenchmark(benchmarkId: string): void { } } -// ── Discovery ─────────────────────────────────────────────────────────── +// ── Discovery utility ─────────────────────────────────────────────────── /** * Scan a directory tree (up to maxDepth levels) for directories containing `.agentv/`. - * Returns absolute paths of discovered benchmark directories. + * Returns absolute paths of discovered benchmark directories, sorted for + * deterministic iteration. This is a one-shot helper for bulk registration; + * Studio does not scan at request time. */ export function discoverBenchmarks(rootDir: string, maxDepth = 2): string[] { const absRoot = path.resolve(rootDir); @@ -224,5 +255,5 @@ export function discoverBenchmarks(rootDir: string, maxDepth = 2): string[] { } scan(absRoot, 0); - return results; + return results.sort(); } diff --git a/packages/core/src/paths.ts b/packages/core/src/paths.ts index e864319ca..f11cd4c8a 100644 --- a/packages/core/src/paths.ts +++ b/packages/core/src/paths.ts @@ -6,7 +6,7 @@ let logged = false; /** * The default config directory (~/.agentv). Always resolves to the user's home * directory regardless of AGENTV_HOME. Used for lightweight, machine-local files - * like version-check.json, last-config.json, and projects.yaml. 
+ * like version-check.json, last-config.json, and benchmarks.yaml. */ export function getAgentvConfigDir(): string { return path.join(os.homedir(), '.agentv'); diff --git a/packages/core/test/benchmarks.test.ts b/packages/core/test/benchmarks.test.ts new file mode 100644 index 000000000..5c9953890 --- /dev/null +++ b/packages/core/test/benchmarks.test.ts @@ -0,0 +1,102 @@ +import { afterEach, beforeEach, describe, expect, it, spyOn } from 'bun:test'; +import { mkdirSync, mkdtempSync, readFileSync, rmSync } from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +import { + addBenchmark, + getBenchmark, + getBenchmarksRegistryPath, + loadBenchmarkRegistry, + removeBenchmark, + touchBenchmark, +} from '../src/benchmarks.js'; + +describe('benchmarks registry', () => { + let fakeHome: string; + let reposRoot: string; + // biome-ignore lint/suspicious/noExplicitAny: spy typing from bun:test is intentionally loose. + let homedirSpy: any; + + beforeEach(() => { + fakeHome = mkdtempSync(path.join(os.tmpdir(), 'agentv-benchmarks-')); + reposRoot = mkdtempSync(path.join(os.tmpdir(), 'agentv-repos-')); + homedirSpy = spyOn(os, 'homedir').mockReturnValue(fakeHome); + }); + + afterEach(() => { + homedirSpy?.mockRestore?.(); + rmSync(fakeHome, { recursive: true, force: true }); + rmSync(reposRoot, { recursive: true, force: true }); + }); + + function makeRepo(name: string): string { + const dir = path.join(reposRoot, name); + mkdirSync(path.join(dir, '.agentv'), { recursive: true }); + return dir; + } + + it('starts empty and surfaces new entries after addBenchmark', () => { + expect(loadBenchmarkRegistry().benchmarks).toEqual([]); + + const repoPath = makeRepo('alpha'); + const entry = addBenchmark(repoPath); + expect(entry.name).toBe('alpha'); + expect(entry.path).toBe(path.resolve(repoPath)); + + // Subsequent load reflects the write (per-request reload model). 
+ expect(loadBenchmarkRegistry().benchmarks).toHaveLength(1); + expect(getBenchmark(entry.id)?.path).toBe(entry.path); + }); + + it('addBenchmark refuses a path with no .agentv/ directory', () => { + const bare = mkdtempSync(path.join(os.tmpdir(), 'agentv-bare-')); + expect(() => addBenchmark(bare)).toThrow(/No \.agentv\/ directory found/); + rmSync(bare, { recursive: true, force: true }); + }); + + it('addBenchmark is idempotent on the same path', () => { + const repoPath = makeRepo('idempotent'); + const first = addBenchmark(repoPath); + const second = addBenchmark(repoPath); + expect(first.id).toBe(second.id); + expect(loadBenchmarkRegistry().benchmarks).toHaveLength(1); + }); + + it('removeBenchmark drops the entry by id', () => { + const entry = addBenchmark(makeRepo('to-remove')); + expect(removeBenchmark(entry.id)).toBe(true); + expect(loadBenchmarkRegistry().benchmarks).toEqual([]); + expect(removeBenchmark(entry.id)).toBe(false); + }); + + it('touchBenchmark updates lastOpenedAt without affecting other entries', () => { + const a = addBenchmark(makeRepo('a')); + const b = addBenchmark(makeRepo('b')); + const originalB = loadBenchmarkRegistry().benchmarks.find((e) => e.id === b.id); + + touchBenchmark(a.id); + const reloadedA = loadBenchmarkRegistry().benchmarks.find((e) => e.id === a.id); + const reloadedB = loadBenchmarkRegistry().benchmarks.find((e) => e.id === b.id); + expect(reloadedA?.lastOpenedAt).not.toBe(a.lastOpenedAt); + expect(reloadedB?.lastOpenedAt).toBe(originalB?.lastOpenedAt); + }); + + it('serializes benchmark entries with snake_case keys on disk', () => { + const entry = addBenchmark(makeRepo('snake')); + + const yamlOnDisk = readFileSync(getBenchmarksRegistryPath(), 'utf-8'); + expect(yamlOnDisk).toContain('added_at:'); + expect(yamlOnDisk).toContain('last_opened_at:'); + expect(yamlOnDisk).not.toContain('addedAt:'); + expect(yamlOnDisk).not.toContain('lastOpenedAt:'); + + // Round-trips cleanly back into the camelCase TS shape. 
+ const reloaded = loadBenchmarkRegistry().benchmarks.find((b) => b.id === entry.id); + expect(reloaded).toMatchObject({ + id: entry.id, + addedAt: entry.addedAt, + lastOpenedAt: entry.lastOpenedAt, + }); + }); +});