From f8f1d6acfb5620162e7b3d9ac379ee707f06e32f Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 8 Apr 2026 13:23:41 +0000 Subject: [PATCH 1/3] feat(studio): add experiment-scoped runs and read-only mode --- apps/cli/src/commands/eval/artifact-writer.ts | 15 ++-- apps/cli/src/commands/eval/commands/run.ts | 6 ++ apps/cli/src/commands/eval/result-layout.ts | 29 ++++++-- apps/cli/src/commands/eval/run-eval.ts | 16 +++-- apps/cli/src/commands/inspect/utils.ts | 55 ++++++++++++--- apps/cli/src/commands/pipeline/input.ts | 4 +- apps/cli/src/commands/pipeline/run.ts | 5 +- apps/cli/src/commands/results/eval-runner.ts | 10 ++- apps/cli/src/commands/results/serve.ts | 65 +++++++++++++----- apps/cli/test/commands/results/serve.test.ts | 35 ++++++++++ apps/cli/test/commands/trace/trace.test.ts | 14 ++++ apps/studio/src/components/Breadcrumbs.tsx | 14 +++- apps/studio/src/components/EvalDetail.tsx | 6 +- apps/studio/src/components/RunList.tsx | 4 +- apps/studio/src/components/Sidebar.tsx | 6 +- apps/studio/src/lib/types.ts | 3 + .../src/routes/evals/$runId.$evalId.tsx | 38 ++++++----- apps/studio/src/routes/index.tsx | 68 ++++++++++++------- .../studio/src/routes/projects/$projectId.tsx | 34 ++++++---- .../$projectId_/evals/$runId.$evalId.tsx | 40 ++++++----- .../projects/$projectId_/runs/$runId.tsx | 34 ++++++---- apps/studio/src/routes/runs/$runId.tsx | 24 ++++--- apps/studio/src/routes/settings.tsx | 21 +++--- 23 files changed, 388 insertions(+), 158 deletions(-) diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 63be38f3f..7bd1e359c 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -61,6 +61,7 @@ export interface BenchmarkArtifact { readonly timestamp: string; readonly targets: readonly string[]; readonly tests_run: readonly string[]; + readonly experiment?: string; }; readonly run_summary: Record< string, @@ -97,6 +98,7 @@ export interface IndexArtifactEntry { readonly suite?: string; readonly category?: string; readonly conversation_id?: string; + readonly experiment?: string; readonly score: number; readonly target: string; readonly scores?: readonly Record[]; @@ -313,6 +315,7 @@ export function buildTimingArtifact(results: readonly EvaluationResult[]): Timin export function buildBenchmarkArtifact( results: readonly EvaluationResult[], evalFile = '', + experiment?: string, ): BenchmarkArtifact { const targetSet = new Set(); const testIdSet = new Set(); @@ -405,6 +408,7 @@ export function buildBenchmarkArtifact( timestamp, targets, tests_run: testIds, + experiment, }, run_summary: runSummary, per_grader_summary: perEvaluatorSummary, @@ -689,7 +693,7 @@ export function parseJsonlResults(content: string): EvaluationResult[] { export async function writeArtifacts( jsonlPath: string, outputDir: string, - options?: { evalFile?: string }, + options?: { evalFile?: string; experiment?: string }, ): Promise<{ testArtifactDir: string; timingPath: string; @@ -705,7 +709,7 @@ export async function writeArtifacts( export async function writeArtifactsFromResults( results: readonly EvaluationResult[], outputDir: string, - options?: { evalFile?: string }, + options?: { evalFile?: string; experiment?: string }, ): Promise<{ testArtifactDir: string; timingPath: string; @@ -746,7 +750,10 @@ export async function writeArtifactsFromResults( ); } - indexRecords.push(buildResultIndexArtifact(result)); + indexRecords.push({ + ...buildResultIndexArtifact(result), + experiment: options?.experiment, + }); } // Write aggregate timing @@ -754,7 +761,7 @@ export async function writeArtifactsFromResults( await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8'); // Write benchmark - const benchmark = buildBenchmarkArtifact(results, options?.evalFile); + const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment); await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8'); await writeJsonlFile(indexPath, indexRecords); diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 098cffa5c..282d8d655 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -60,6 +60,11 @@ export const evalRunCommand = command({ long: 'output-format', description: "[Deprecated] Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)", }), + experiment: option({ + type: optional(string), + long: 'experiment', + description: 'Experiment label for canonical run output (default: default)', + }), export: multioption({ type: array(string), long: 'export', @@ -223,6 +228,7 @@ export const evalRunCommand = command({ out: args.out, output: args.output, outputFormat: args.outputFormat, + experiment: args.experiment, export: args.export, dryRun: args.dryRun, dryRunDelay: args.dryRunDelay, diff --git a/apps/cli/src/commands/eval/result-layout.ts b/apps/cli/src/commands/eval/result-layout.ts index b6e6c57b7..f9acd3e69 100644 --- a/apps/cli/src/commands/eval/result-layout.ts +++ b/apps/cli/src/commands/eval/result-layout.ts @@ -3,17 +3,38 @@ import path from 'node:path'; export const RESULT_INDEX_FILENAME = 'index.jsonl'; export const RESULT_RUNS_DIRNAME = 'runs'; +export const DEFAULT_EXPERIMENT_NAME = 'default'; + +export function normalizeExperimentName(experiment?: string): string { + const trimmed = experiment?.trim(); + if (!trimmed) { + return DEFAULT_EXPERIMENT_NAME; + } + if (!/^[A-Za-z0-9._-]+$/.test(trimmed)) { + throw new Error( + `Invalid experiment name "${trimmed}". Use only letters, numbers, ".", "_" and "-".`, + ); + } + return trimmed; +} export function createRunDirName(timestamp = new Date()): string { return timestamp.toISOString().replace(/[:.]/g, '-'); } -export function buildDefaultRunDir(cwd: string): string { - return path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME, createRunDirName()); +export function buildDefaultRunDir(cwd: string, experiment?: string, timestamp = new Date()): string { + return path.join( + cwd, + '.agentv', + 'results', + RESULT_RUNS_DIRNAME, + normalizeExperimentName(experiment), + createRunDirName(timestamp), + ); } -export function buildDefaultIndexPath(cwd: string): string { - return path.join(buildDefaultRunDir(cwd), RESULT_INDEX_FILENAME); +export function buildDefaultIndexPath(cwd: string, experiment?: string): string { + return path.join(buildDefaultRunDir(cwd, experiment), RESULT_INDEX_FILENAME); } export function resolveRunIndexPath(runDir: string): string { diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 471293a52..c97d3d816 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -33,7 +33,7 @@ import { writeBenchmarkJson } from './benchmark-writer.js'; import { loadEnvFromHierarchy } from './env.js'; import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js'; import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js'; -import { buildDefaultRunDir } from './result-layout.js'; +import { buildDefaultRunDir, normalizeExperimentName } from './result-layout.js'; import { buildExclusionFilter, loadErrorTestIds, @@ -96,6 +96,7 @@ interface NormalizedOptions { readonly tags: readonly string[]; readonly excludeTags: readonly string[]; readonly transcript?: string; + readonly experiment?: string; } function normalizeBoolean(value: unknown): boolean { @@ -363,6 +364,7 @@ function normalizeOptions( tags: normalizeStringArray(rawOptions.tag), excludeTags: normalizeStringArray(rawOptions.excludeTag), transcript: normalizeString(rawOptions.transcript), + experiment: normalizeString(rawOptions.experiment), } satisfies NormalizedOptions; } @@ -374,8 +376,8 @@ async function ensureFileExists(filePath: string, description: string): Promise< } } -function buildDefaultOutputPath(cwd: string): string { - const runDir = buildDefaultRunDir(cwd); +function buildDefaultOutputPathForExperiment(cwd: string, experiment?: string): string { + const runDir = buildDefaultRunDir(cwd, experiment); mkdirSync(runDir, { recursive: true }); return path.join(runDir, 'index.jsonl'); } @@ -868,6 +870,9 @@ export async function runEvalCommand( .replace(/:/g, '-') .replace(/\./g, '-'); } + if (!process.env.AGENTV_EXPERIMENT) { + process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment); + } // Load agentv.config.ts (if present) for default values let config: Awaited> = null; @@ -987,8 +992,8 @@ export async function runEvalCommand( mkdirSync(runDir, { recursive: true }); usesDefaultArtifactWorkspace = false; } else { - // Default: .agentv/results/runs// - outputPath = buildDefaultOutputPath(cwd); + // Default: .agentv/results/runs/// + outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment); runDir = path.dirname(outputPath); usesDefaultArtifactWorkspace = true; } @@ -1426,6 +1431,7 @@ export async function runEvalCommand( indexPath, } = await writeArtifactsFromResults(allResults, runDir, { evalFile, + experiment: normalizeExperimentName(options.experiment), }); console.log(`Artifact workspace written to: ${runDir}`); console.log(` Index: ${indexPath}`); diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts index 01c6f6fc0..0d9689153 100644 --- a/apps/cli/src/commands/inspect/utils.ts +++ b/apps/cli/src/commands/inspect/utils.ts @@ -523,6 +523,7 @@ export function toTraceSummary(result: RawResult): TraceSummary | undefined { export interface ResultFileMeta { path: string; filename: string; + displayName: string; timestamp: string; testCount: number; passRate: number; @@ -530,24 +531,57 @@ export interface ResultFileMeta { sizeBytes: number; } +function buildRunId(relativeRunPath: string): string { + const normalized = relativeRunPath.split(path.sep).join('/'); + const segments = normalized.split('/').filter(Boolean); + if (segments.length >= 2) { + const experiment = segments.slice(0, -1).join('/'); + const timestamp = segments.at(-1); + if (experiment === 'default') { + return timestamp ?? normalized; + } + return `${experiment}::${timestamp}`; + } + return segments[0]; +} + +function collectRunManifestPaths( + runsDir: string, + currentDir: string, + files: { filePath: string; displayName: string; runId: string }[], +): void { + const primaryPath = resolveExistingRunPrimaryPath(currentDir); + if (primaryPath) { + const relativeRunPath = path.relative(runsDir, currentDir); + files.push({ + filePath: primaryPath, + displayName: path.basename(currentDir), + runId: buildRunId(relativeRunPath), + }); + return; + } + + const entries = readdirSync(currentDir, { withFileTypes: true }); + for (const entry of entries) { + if (entry.isDirectory()) { + collectRunManifestPaths(runsDir, path.join(currentDir, entry.name), files); + } + } +} + /** * Enumerate canonical run manifests in `.agentv/results/runs/`. */ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { const runsDir = path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME); - const files: { filePath: string; displayName: string }[] = []; + const files: { filePath: string; displayName: string; runId: string }[] = []; try { const entries = readdirSync(runsDir, { withFileTypes: true }); for (const entry of entries) { - if (!entry.isDirectory()) { - continue; - } - - const primaryPath = resolveExistingRunPrimaryPath(path.join(runsDir, entry.name)); - if (primaryPath) { - files.push({ filePath: primaryPath, displayName: entry.name }); + if (entry.isDirectory()) { + collectRunManifestPaths(runsDir, path.join(runsDir, entry.name), files); } } } catch { @@ -561,7 +595,7 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { const metas: ResultFileMeta[] = []; - for (const { filePath, displayName } of limited) { + for (const { filePath, displayName, runId } of limited) { try { const fileStat = statSync(filePath); const results = loadResultFile(filePath); @@ -576,7 +610,8 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { metas.push({ path: filePath, - filename: displayName, + filename: runId, + displayName, timestamp, testCount, passRate, diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index 3eb7ad0a4..c3a54e20d 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -43,7 +43,7 @@ export const evalInputCommand = command({ type: optional(string), long: 'out', description: - 'Output directory for extracted inputs (default: .agentv/results/runs/)', + 'Output directory for extracted inputs (default: .agentv/results/runs//)', }), experiment: option({ type: optional(string), @@ -53,7 +53,7 @@ export const evalInputCommand = command({ }, handler: async ({ evalPath, out, experiment }) => { const resolvedEvalPath = resolve(evalPath); - const outDir = resolve(out ?? buildDefaultRunDir(process.cwd())); + const outDir = resolve(out ?? buildDefaultRunDir(process.cwd(), experiment)); const repoRoot = await findRepoRoot(dirname(resolvedEvalPath)); const evalDir = dirname(resolvedEvalPath); diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index f91db3dad..86c1b2289 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -73,7 +73,8 @@ export const evalRunCommand = command({ out: option({ type: optional(string), long: 'out', - description: 'Output directory for results (default: .agentv/results/runs/)', + description: + 'Output directory for results (default: .agentv/results/runs//)', }), workers: option({ type: optional(number), @@ -94,7 +95,7 @@ export const evalRunCommand = command({ }, handler: async ({ evalPath, out, workers, experiment, graderType }) => { const resolvedEvalPath = resolve(evalPath); - const outDir = resolve(out ?? buildDefaultRunDir(process.cwd())); + const outDir = resolve(out ?? buildDefaultRunDir(process.cwd(), experiment)); const repoRoot = await findRepoRoot(dirname(resolvedEvalPath)); const evalDir = dirname(resolvedEvalPath); diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts index 1ab92bf9e..0e19e10e1 100644 --- a/apps/cli/src/commands/results/eval-runner.ts +++ b/apps/cli/src/commands/results/eval-runner.ts @@ -185,7 +185,12 @@ function resolveCliPath(cwd: string): { bunPath: string; cliPath: string } | und // biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route type C = Context; -export function registerEvalRoutes(app: Hono, getCwd: (c: C) => string) { +export function registerEvalRoutes( + app: Hono, + getCwd: (c: C) => string, + options?: { readOnly?: boolean }, +) { + const readOnly = options?.readOnly === true; // ── Discovery: eval files ────────────────────────────────────────────── app.get('/api/eval/discover', async (c) => { const cwd = getCwd(c); @@ -216,6 +221,9 @@ export function registerEvalRoutes(app: Hono, getCwd: (c: C) => string) { // ── Launch eval run ──────────────────────────────────────────────────── app.post('/api/eval/run', async (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } const cwd = getCwd(c); let body: RunEvalRequest; diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 6bd2cc503..925a22e6c 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -251,6 +251,7 @@ function handleRuns(c: C, { searchDir }: DataContext) { } return { filename: m.filename, + display_name: m.displayName, path: m.path, timestamp: m.timestamp, test_count: m.testCount, @@ -270,7 +271,7 @@ function handleRunDetail(c: C, { searchDir }: DataContext) { if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = loadManifestResults(meta.path); - return c.json({ results: stripHeavyFields(loaded), source: meta.filename }); + return c.json({ results: stripHeavyFields(loaded), source: meta.displayName }); } catch { return c.json({ error: 'Failed to load run' }, 500); } @@ -565,8 +566,11 @@ function handleTargets(c: C, { searchDir, agentvDir }: DataContext) { return c.json({ targets }); } -function handleConfig(c: C, { agentvDir }: DataContext) { - return c.json(loadStudioConfig(agentvDir)); +function handleConfig(c: C, { agentvDir }: DataContext, options?: { readOnly?: boolean }) { + return c.json({ + ...loadStudioConfig(agentvDir), + read_only: options?.readOnly === true, + }); } function handleFeedbackRead(c: C, { searchDir }: DataContext) { @@ -585,11 +589,12 @@ export function createApp( resultDir: string, cwd?: string, sourceFile?: string, - options?: { studioDir?: string }, + options?: { studioDir?: string; readOnly?: boolean }, ): Hono { const searchDir = cwd ?? resultDir; const agentvDir = path.join(searchDir, '.agentv'); const defaultCtx: DataContext = { searchDir, agentvDir }; + const readOnly = options?.readOnly === true; const app = new Hono(); // ── Project resolution wrapper ──────────────────────────────────────── @@ -611,6 +616,9 @@ export function createApp( // ── Studio configuration ────────────────────────────────────────────── app.post('/api/config', async (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } try { const body = await c.req.json>(); const current = loadStudioConfig(agentvDir); @@ -672,6 +680,9 @@ export function createApp( }); app.post('/api/projects', async (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } try { const body = await c.req.json<{ path: string }>(); if (!body.path) return c.json({ error: 'Missing path' }, 400); @@ -683,6 +694,9 @@ export function createApp( }); app.delete('/api/projects/:projectId', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } const removed = removeProject(c.req.param('projectId') ?? ''); if (!removed) return c.json({ error: 'Project not found' }, 404); return c.json({ ok: true }); @@ -710,6 +724,9 @@ export function createApp( }); app.post('/api/projects/discover', async (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } try { const body = await c.req.json<{ path: string }>(); if (!body.path) return c.json({ error: 'Missing path' }, 400); @@ -726,6 +743,7 @@ export function createApp( const registry = loadProjectRegistry(); const allRuns: Array<{ filename: string; + display_name: string; path: string; timestamp: string; test_count: number; @@ -755,6 +773,7 @@ export function createApp( } allRuns.push({ filename: m.filename, + display_name: m.displayName, path: m.path, timestamp: m.timestamp, test_count: m.testCount, @@ -778,7 +797,7 @@ export function createApp( // ── Data routes (unscoped) ──────────────────────────────────────────── - app.get('/api/config', (c) => handleConfig(c, defaultCtx)); + app.get('/api/config', (c) => handleConfig(c, defaultCtx, { readOnly })); app.get('/api/runs', (c) => handleRuns(c, defaultCtx)); app.get('/api/runs/:filename', (c) => handleRunDetail(c, defaultCtx)); app.get('/api/runs/:filename/suites', (c) => handleRunSuites(c, defaultCtx)); @@ -799,6 +818,9 @@ export function createApp( }); app.post('/api/feedback', async (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } let body: unknown; try { body = await c.req.json(); @@ -857,6 +879,7 @@ export function createApp( } return { run_filename: m.filename, + display_name: m.displayName, test_count: m.testCount, pass_rate: m.passRate, avg_score: m.avgScore, @@ -870,7 +893,9 @@ export function createApp( // ── Data routes (project-scoped) ────────────────────────────────────── // Same handlers as above, with project-resolved DataContext via withProject. - app.get('/api/projects/:projectId/config', (c) => withProject(c, handleConfig)); + app.get('/api/projects/:projectId/config', (c) => + withProject(c, (ctx, dataCtx) => handleConfig(ctx, dataCtx, { readOnly })), + ); app.get('/api/projects/:projectId/runs', (c) => withProject(c, handleRuns)); app.get('/api/projects/:projectId/runs/:filename', (c) => withProject(c, handleRunDetail)); app.get('/api/projects/:projectId/runs/:filename/suites', (c) => withProject(c, handleRunSuites)); @@ -895,15 +920,19 @@ export function createApp( // ── Eval runner routes (discovery, launch, status) ──────────────────── - registerEvalRoutes(app, (c) => { + registerEvalRoutes( + app, + (c) => { // For project-scoped routes, resolve to project path; otherwise use searchDir - const projectId = c.req.param('projectId'); - if (projectId) { - const project = getProject(projectId); - if (project) return project.path; - } - return searchDir; - }); + const projectId = c.req.param('projectId'); + if (projectId) { + const project = getProject(projectId); + if (project) return project.path; + } + return searchDir; + }, + { readOnly }, + ); // ── Static file serving for Studio SPA ──────────────────────────────── @@ -1026,8 +1055,12 @@ export const resultsServeCommand = command({ long: 'discover', description: 'Scan a directory tree for repos with .agentv/', }), + readOnly: flag({ + long: 'read-only', + description: 'Disable write operations and launch Studio in read-only leaderboard mode', + }), }, - handler: async ({ source, port, dir, multi, add, remove, discover }) => { + handler: async ({ source, port, dir, multi, add, remove, discover, readOnly }) => { const cwd = dir ?? process.cwd(); const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117); @@ -1100,7 +1133,7 @@ export const resultsServeCommand = command({ // Use the run directory for feedback storage (matches #764 behavior) const resultDir = sourceFile ? path.dirname(path.resolve(sourceFile)) : cwd; - const app = createApp(results, resultDir, cwd, sourceFile); + const app = createApp(results, resultDir, cwd, sourceFile, { readOnly }); if (isMultiProject) { console.log(`Multi-project mode: ${registry.projects.length} project(s) registered`); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 343625fea..efb5ee370 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -291,6 +291,41 @@ describe('serve app', () => { }); expect(res3.status).toBe(400); }); + + it('returns 403 in read-only mode', async () => { + const content = toJsonl(RESULT_A, RESULT_B); + const results = loadResults(content); + const app = createApp(results, tempDir, undefined, undefined, { + studioDir, + readOnly: true, + }); + + const res = await app.request('/api/feedback', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + reviews: [{ test_id: 'test-greeting', comment: 'blocked' }], + }), + }); + + expect(res.status).toBe(403); + }); + }); + + describe('GET /api/config', () => { + it('includes read_only mode in the config payload', async () => { + const content = toJsonl(RESULT_A, RESULT_B); + const results = loadResults(content); + const app = createApp(results, tempDir, undefined, undefined, { + studioDir, + readOnly: true, + }); + + const res = await app.request('/api/config'); + expect(res.status).toBe(200); + const data = (await res.json()) as { read_only?: boolean }; + expect(data.read_only).toBe(true); + }); }); // ── Empty state (no results) ──────────────────────────────────────── diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts index f49941f6b..808586cbc 100644 --- a/apps/cli/test/commands/trace/trace.test.ts +++ b/apps/cli/test/commands/trace/trace.test.ts @@ -345,6 +345,20 @@ describe('trace utils', () => { expect(metas[0].filename).toBe('2026-02-20T21-38-05-833Z'); }); + it('should discover nested experiment run directories and emit safe run ids', () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); + const runDir = path.join(runsDir, 'with-skills', '2026-02-20T21-38-05-833Z'); + mkdirSync(runDir, { recursive: true }); + + writeFileSync(path.join(runDir, 'index.jsonl'), `${RESULT_WITH_TRACE}\n`); + + const metas = listResultFiles(tempDir); + + expect(metas).toHaveLength(1); + expect(metas[0].filename).toBe('with-skills::2026-02-20T21-38-05-833Z'); + expect(metas[0].displayName).toBe('2026-02-20T21-38-05-833Z'); + }); + it('should skip directories without index.jsonl', () => { const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); const emptyDir = path.join(runsDir, '2026-02-20T21-38-05-833Z'); diff --git a/apps/studio/src/components/Breadcrumbs.tsx b/apps/studio/src/components/Breadcrumbs.tsx index 9dedf70a5..680f7cb5c 100644 --- a/apps/studio/src/components/Breadcrumbs.tsx +++ b/apps/studio/src/components/Breadcrumbs.tsx @@ -12,6 +12,14 @@ interface BreadcrumbSegment { to?: string; } +function formatRunLabel(runId: string | undefined): string { + if (!runId) { + return 'Run'; + } + const [, timestamp] = runId.split('::'); + return timestamp || runId; +} + function deriveSegments(matches: ReturnType): BreadcrumbSegment[] { const segments: BreadcrumbSegment[] = []; @@ -26,7 +34,7 @@ function deriveSegments(matches: ReturnType): BreadcrumbSegme if (routeId.includes('/runs/$runId/category/$category')) { if (!segments.some((s) => s.label === params.runId)) { segments.push({ - label: params.runId ?? 'Run', + label: formatRunLabel(params.runId), to: `/runs/${encodeURIComponent(params.runId)}`, }); } @@ -41,14 +49,14 @@ function deriveSegments(matches: ReturnType): BreadcrumbSegme }); } else if (routeId.includes('/runs/$runId')) { segments.push({ - label: params.runId ?? 'Run', + label: formatRunLabel(params.runId), to: match.pathname, }); } else if (routeId.includes('/evals/$runId/$evalId')) { // For eval pages, show the run as a parent segment too if (!segments.some((s) => s.label === params.runId)) { segments.push({ - label: params.runId ?? 'Run', + label: formatRunLabel(params.runId), to: `/runs/${encodeURIComponent(params.runId)}`, }); } diff --git a/apps/studio/src/components/EvalDetail.tsx b/apps/studio/src/components/EvalDetail.tsx index e28279cec..ee4f9f485 100644 --- a/apps/studio/src/components/EvalDetail.tsx +++ b/apps/studio/src/components/EvalDetail.tsx @@ -41,11 +41,13 @@ function findFirstFile(nodes: FileNode[]): string | null { export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps) { const [activeTab, setActiveTab] = useState('checks'); + const { data: config } = useStudioConfig(); + const isReadOnly = config?.read_only === true; const tabs: { id: Tab; label: string }[] = [ { id: 'checks', label: 'Checks' }, { id: 'files', label: 'Files' }, - { id: 'feedback', label: 'Feedback' }, + ...(isReadOnly ? [] : [{ id: 'feedback' as const, label: 'Feedback' }]), ]; return ( @@ -112,7 +114,7 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps)
{activeTab === 'checks' && } {activeTab === 'files' && } - {activeTab === 'feedback' && } + {!isReadOnly && activeTab === 'feedback' && }
); diff --git a/apps/studio/src/components/RunList.tsx b/apps/studio/src/components/RunList.tsx index 75a11a63f..d102c8225 100644 --- a/apps/studio/src/components/RunList.tsx +++ b/apps/studio/src/components/RunList.tsx @@ -77,7 +77,7 @@ export function RunList({ runs, projectId }: RunListProps) { params={{ projectId, runId: run.filename }} className="font-medium text-cyan-400 hover:text-cyan-300 hover:underline" > - {run.filename} + {run.display_name ?? run.filename} ) : ( - {run.filename} + {run.display_name ?? run.filename} )} diff --git a/apps/studio/src/components/Sidebar.tsx b/apps/studio/src/components/Sidebar.tsx index 461eb74ed..474fcb426 100644 --- a/apps/studio/src/components/Sidebar.tsx +++ b/apps/studio/src/components/Sidebar.tsx @@ -150,7 +150,7 @@ function RunSidebar() { className="mb-0.5 block truncate rounded-md px-2 py-1.5 text-sm text-gray-400 transition-colors hover:bg-gray-800/50 hover:text-gray-200" title={run.project_name} > - {run.filename} + {run.display_name ?? run.filename} ); } @@ -166,7 +166,7 @@ function RunSidebar() { : 'text-gray-400 hover:bg-gray-800/50 hover:text-gray-200' }`} > - {run.filename} + {run.display_name ?? run.filename} ); })} @@ -388,7 +388,7 @@ function ProjectRunDetailSidebar({ : 'text-gray-400 hover:bg-gray-800/50 hover:text-gray-200' }`} > - {run.filename} + {run.display_name ?? run.filename} ); })} diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index 1b735a00e..bc6be5908 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -7,6 +7,7 @@ export interface RunMeta { filename: string; + display_name?: string; path: string; timestamp: string; test_count: number; @@ -91,6 +92,7 @@ export interface EvalDetailResponse { export interface IndexEntry { run_filename: string; + display_name?: string; target?: string; test_count: number; pass_rate: number; @@ -171,6 +173,7 @@ export interface StudioConfigResponse { threshold: number; /** @deprecated Use threshold */ pass_threshold?: number; + read_only?: boolean; } // ── Project types ──────────────────────────────────────────────────────── diff --git a/apps/studio/src/routes/evals/$runId.$evalId.tsx b/apps/studio/src/routes/evals/$runId.$evalId.tsx index 27fb29e60..79ba93fff 100644 --- a/apps/studio/src/routes/evals/$runId.$evalId.tsx +++ b/apps/studio/src/routes/evals/$runId.$evalId.tsx @@ -11,7 +11,7 @@ import { useState } from 'react'; import { EvalDetail } from '~/components/EvalDetail'; import { RunEvalModal } from '~/components/RunEvalModal'; -import { useRunDetail } from '~/lib/api'; +import { useRunDetail, useStudioConfig } from '~/lib/api'; export const Route = createFileRoute('/evals/$runId/$evalId')({ component: EvalDetailPage, @@ -20,7 +20,9 @@ export const Route = createFileRoute('/evals/$runId/$evalId')({ function EvalDetailPage() { const { runId, evalId } = Route.useParams(); const { data, isLoading, error } = useRunDetail(runId); + const { data: config } = useStudioConfig(); const [showRunEval, setShowRunEval] = useState(false); + const isReadOnly = config?.read_only === true; if (isLoading) { return ( @@ -61,23 +63,27 @@ function EvalDetailPage() {

{evalId}

- + {!isReadOnly && ( + + )} - setShowRunEval(false)} - prefill={{ - testIds: [evalId], - target: result.target, - }} - /> + {!isReadOnly && ( + setShowRunEval(false)} + prefill={{ + testIds: [evalId], + target: result.target, + }} + /> + )} ); } diff --git a/apps/studio/src/routes/index.tsx b/apps/studio/src/routes/index.tsx index a700e4825..cf95c22bf 100644 --- a/apps/studio/src/routes/index.tsx +++ b/apps/studio/src/routes/index.tsx @@ -15,7 +15,13 @@ import { ProjectCard } from '~/components/ProjectCard'; import { RunEvalModal } from '~/components/RunEvalModal'; import { RunList } from '~/components/RunList'; import { TargetsTab } from '~/components/TargetsTab'; -import { addProjectApi, discoverProjectsApi, useProjectList, useRunList } from '~/lib/api'; +import { + addProjectApi, + discoverProjectsApi, + useProjectList, + useRunList, + useStudioConfig, +} from '~/lib/api'; type TabId = 'runs' | 'experiments' | 'targets'; @@ -48,6 +54,7 @@ function HomePage() { function ProjectsDashboard() { const { data } = useProjectList(); + const { data: config } = useStudioConfig(); const queryClient = useQueryClient(); const [addPath, setAddPath] = useState(''); const [discoverPath, setDiscoverPath] = useState(''); @@ -56,6 +63,7 @@ function ProjectsDashboard() { const [showRunEval, setShowRunEval] = useState(false); const projects = data?.projects ?? []; + const isReadOnly = config?.read_only === true; async function handleAddProject(e: React.FormEvent) { e.preventDefault(); @@ -92,20 +100,24 @@ function ProjectsDashboard() {

Projects

- - + {!isReadOnly && ( + <> + + + + )}
@@ -115,7 +127,7 @@ function ProjectsDashboard() { )} - {showAddForm && ( + {!isReadOnly && showAddForm && (
- setShowRunEval(false)} /> + {!isReadOnly && setShowRunEval(false)} />}
); } @@ -169,21 +181,25 @@ function SingleProjectHome() { const tab = searchParams.tab as TabId | undefined; const navigate = useNavigate(); const { data, isLoading, error } = useRunList(); + const { data: config } = useStudioConfig(); const [showRunEval, setShowRunEval] = useState(false); + const isReadOnly = config?.read_only === true; - const activeTab: TabId = tabs.some((t) => t.id === tab) ? (tab as TabId) : 'runs'; + const activeTab: TabId = tabs.some((t) => t.id === tab) ? (tab as TabId) : 'experiments'; return (

Evaluation Runs

- + {!isReadOnly && ( + + )}
{/* Tab navigation */} @@ -211,7 +227,7 @@ function SingleProjectHome() { {activeTab === 'experiments' && } {activeTab === 'targets' && } - setShowRunEval(false)} /> + {!isReadOnly && setShowRunEval(false)} />}
); } diff --git a/apps/studio/src/routes/projects/$projectId.tsx b/apps/studio/src/routes/projects/$projectId.tsx index 493f38064..b38d112e4 100644 --- a/apps/studio/src/routes/projects/$projectId.tsx +++ b/apps/studio/src/routes/projects/$projectId.tsx @@ -10,7 +10,7 @@ import { useState } from 'react'; import { useQuery } from '@tanstack/react-query'; import { RunEvalModal } from '~/components/RunEvalModal'; import { RunList } from '~/components/RunList'; -import { useProjectRunList } from '~/lib/api'; +import { useProjectRunList, useStudioConfig } from '~/lib/api'; import { projectExperimentsOptions, projectTargetsOptions } from '~/lib/api'; import type { ExperimentsResponse, TargetsResponse } from '~/lib/types'; @@ -33,20 +33,24 @@ function ProjectHomePage() { const tab = searchParams.tab as TabId | undefined; const navigate = useNavigate(); const [showRunEval, setShowRunEval] = useState(false); + const { data: config } = useStudioConfig(); + const isReadOnly = config?.read_only === true; - const activeTab: TabId = tabs.some((t) => t.id === tab) ? (tab as TabId) : 'runs'; + const activeTab: TabId = tabs.some((t) => t.id === tab) ? (tab as TabId) : 'experiments'; return (

{projectId}

- + {!isReadOnly && ( + + )}
{/* Tab navigation */} @@ -79,11 +83,13 @@ function ProjectHomePage() { {activeTab === 'experiments' && } {activeTab === 'targets' && } - setShowRunEval(false)} - projectId={projectId} - /> + {!isReadOnly && ( + setShowRunEval(false)} + projectId={projectId} + /> + )}
); } diff --git a/apps/studio/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx b/apps/studio/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx index 94499866c..62242c174 100644 --- a/apps/studio/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx +++ b/apps/studio/src/routes/projects/$projectId_/evals/$runId.$evalId.tsx @@ -7,7 +7,7 @@ import { useState } from 'react'; import { EvalDetail } from '~/components/EvalDetail'; import { RunEvalModal } from '~/components/RunEvalModal'; -import { useProjectRunDetail } from '~/lib/api'; +import { useProjectRunDetail, useStudioConfig } from '~/lib/api'; export const Route = createFileRoute('/projects/$projectId_/evals/$runId/$evalId')({ component: ProjectEvalDetailPage, @@ -16,7 +16,9 @@ export const Route = createFileRoute('/projects/$projectId_/evals/$runId/$evalId function ProjectEvalDetailPage() { const { projectId, runId, evalId } = Route.useParams(); const { data, isLoading, error } = useProjectRunDetail(projectId, runId); + const { data: config } = useStudioConfig(); const [showRunEval, setShowRunEval] = useState(false); + const isReadOnly = config?.read_only === true; if (isLoading) { return ( @@ -57,24 +59,28 @@ function ProjectEvalDetailPage() {

{evalId}

- + {!isReadOnly && ( + + )} - setShowRunEval(false)} - projectId={projectId} - prefill={{ - testIds: [evalId], - target: result.target, - }} - /> + {!isReadOnly && ( + setShowRunEval(false)} + projectId={projectId} + prefill={{ + testIds: [evalId], + target: result.target, + }} + /> + )} ); } diff --git a/apps/studio/src/routes/projects/$projectId_/runs/$runId.tsx b/apps/studio/src/routes/projects/$projectId_/runs/$runId.tsx index f23ba6095..f21fed6ae 100644 --- a/apps/studio/src/routes/projects/$projectId_/runs/$runId.tsx +++ b/apps/studio/src/routes/projects/$projectId_/runs/$runId.tsx @@ -7,7 +7,7 @@ import { useState } from 'react'; import { RunDetail } from '~/components/RunDetail'; import { RunEvalModal } from '~/components/RunEvalModal'; -import { useProjectRunDetail } from '~/lib/api'; +import { useProjectRunDetail, useStudioConfig } from '~/lib/api'; export const Route = createFileRoute('/projects/$projectId_/runs/$runId')({ component: ProjectRunDetailPage, @@ -16,7 +16,9 @@ export const Route = createFileRoute('/projects/$projectId_/runs/$runId')({ function ProjectRunDetailPage() { const { projectId, runId } = Route.useParams(); const { data, isLoading, error } = useProjectRunDetail(projectId, runId); + const { data: config } = useStudioConfig(); const [showRunEval, setShowRunEval] = useState(false); + const isReadOnly = config?.read_only === true; if (isLoading) { return ( @@ -49,21 +51,25 @@ function ProjectRunDetailPage() {

Run: {runId}

Source: {data?.source}

- + {!isReadOnly && ( + + )} - setShowRunEval(false)} - projectId={projectId} - prefill={prefill} - /> + {!isReadOnly && ( + setShowRunEval(false)} + projectId={projectId} + prefill={prefill} + /> + )} ); } diff --git a/apps/studio/src/routes/runs/$runId.tsx b/apps/studio/src/routes/runs/$runId.tsx index 1ae307cf8..87f5231ee 100644 --- a/apps/studio/src/routes/runs/$runId.tsx +++ b/apps/studio/src/routes/runs/$runId.tsx @@ -7,7 +7,7 @@ import { useState } from 'react'; import { RunDetail } from '~/components/RunDetail'; import { RunEvalModal } from '~/components/RunEvalModal'; -import { useRunDetail } from '~/lib/api'; +import { useRunDetail, useStudioConfig } from '~/lib/api'; export const Route = createFileRoute('/runs/$runId')({ component: RunDetailPage, @@ -16,7 +16,9 @@ export const Route = createFileRoute('/runs/$runId')({ function RunDetailPage() { const { runId } = Route.useParams(); const { data, isLoading, error } = useRunDetail(runId); + const { data: config } = useStudioConfig(); const [showRunEval, setShowRunEval] = useState(false); + const isReadOnly = config?.read_only === true; if (isLoading) { return ( @@ -50,16 +52,20 @@ function RunDetailPage() {

Run: {runId}

Source: {data?.source}

- + {!isReadOnly && ( + + )} - setShowRunEval(false)} prefill={prefill} /> + {!isReadOnly && ( + setShowRunEval(false)} prefill={prefill} /> + )} ); } diff --git a/apps/studio/src/routes/settings.tsx b/apps/studio/src/routes/settings.tsx index 417657cdc..d00c330ac 100644 --- a/apps/studio/src/routes/settings.tsx +++ b/apps/studio/src/routes/settings.tsx @@ -24,6 +24,7 @@ function SettingsPage() { const currentThreshold = config?.threshold ?? DEFAULT_PASS_THRESHOLD; const displayThreshold = threshold || String(currentThreshold); + const isReadOnly = config?.read_only === true; const handleSave = async () => { const value = Number.parseFloat(threshold || String(currentThreshold)); @@ -87,6 +88,7 @@ function SettingsPage() { step="0.05" value={displayThreshold} onChange={(e) => setThreshold(e.target.value)} + disabled={isReadOnly} className="w-32 rounded-md border border-gray-700 bg-gray-800 px-3 py-2 text-sm text-white placeholder-gray-500 focus:border-cyan-500 focus:outline-none focus:ring-1 focus:ring-cyan-500" /> @@ -97,14 +99,17 @@ function SettingsPage() {
- + {!isReadOnly && ( + + )} + {isReadOnly && Read-only mode is enabled.} {message && ( Date: Wed, 8 Apr 2026 13:24:56 +0000 Subject: [PATCH 2/3] fix(cli): resolve pre-push validation issues --- apps/cli/src/commands/eval/result-layout.ts | 6 +++++- apps/cli/src/commands/eval/run-eval.ts | 6 +++--- apps/cli/src/commands/results/serve.ts | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/apps/cli/src/commands/eval/result-layout.ts b/apps/cli/src/commands/eval/result-layout.ts index f9acd3e69..1373e7089 100644 --- a/apps/cli/src/commands/eval/result-layout.ts +++ b/apps/cli/src/commands/eval/result-layout.ts @@ -22,7 +22,11 @@ export function createRunDirName(timestamp = new Date()): string { return timestamp.toISOString().replace(/[:.]/g, '-'); } -export function buildDefaultRunDir(cwd: string, experiment?: string, timestamp = new Date()): string { +export function buildDefaultRunDir( + cwd: string, + experiment?: string, + timestamp = new Date(), +): string { return path.join( cwd, '.agentv', diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index c97d3d816..7dc02d77d 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -870,9 +870,6 @@ export async function runEvalCommand( .replace(/:/g, '-') .replace(/\./g, '-'); } - if (!process.env.AGENTV_EXPERIMENT) { - process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment); - } // Load agentv.config.ts (if present) for default values let config: Awaited> = null; @@ -899,6 +896,9 @@ export async function runEvalCommand( } let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution); + if (!process.env.AGENTV_EXPERIMENT) { + process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment); + } // Validate --grader-target / --model combinations if (options.graderTarget === 'agentv' && !options.model) { diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 925a22e6c..06a71fa31 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -923,7 +923,7 @@ export function createApp( registerEvalRoutes( app, (c) => { - // For project-scoped routes, resolve to project path; otherwise use searchDir + // For project-scoped routes, resolve to project path; otherwise use searchDir const projectId = c.req.param('projectId'); if (projectId) { const project = getProject(projectId); From 7b84ac2a6c17589b23a95be260c3307d22dbcca2 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 8 Apr 2026 14:41:50 +0000 Subject: [PATCH 3/3] fix(results): support experiment-scoped run workspaces --- apps/cli/src/commands/results/export.ts | 9 +++- apps/cli/src/commands/results/serve.ts | 5 +- apps/cli/src/commands/results/validate.ts | 40 ++++++++++------ apps/cli/test/commands/results/export.test.ts | 18 ++++++++ apps/cli/test/commands/results/serve.test.ts | 35 ++++++++++++++ .../test/commands/results/validate.test.ts | 46 +++++++++++++++++++ 6 files changed, 134 insertions(+), 19 deletions(-) create mode 100644 apps/cli/test/commands/results/validate.test.ts diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts index c31622cc0..bfe82a89d 100644 --- a/apps/cli/src/commands/results/export.ts +++ b/apps/cli/src/commands/results/export.ts @@ -59,7 +59,14 @@ export function deriveOutputDir(cwd: string, sourceFile: string): string { throw new Error(`Expected a run manifest named ${RESULT_INDEX_FILENAME}: ${sourceFile}`); } - const parentDir = path.basename(path.dirname(sourceFile)); + const runDir = path.dirname(sourceFile); + const segments = path.normalize(runDir).split(path.sep).filter(Boolean); + const runsIndex = segments.lastIndexOf('runs'); + if (runsIndex >= 0 && runsIndex < segments.length - 1) { + return path.join(cwd, '.agentv', 'results', 'export', ...segments.slice(runsIndex + 1)); + } + + const parentDir = path.basename(runDir); if (parentDir.startsWith('eval_')) { return path.join(cwd, '.agentv', 'results', 'export', parentDir.slice(5)); } diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 06a71fa31..4e54155b8 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -431,12 +431,11 @@ function handleEvalFiles(c: C, { searchDir }: DataContext) { function handleEvalFileContent(c: C, { searchDir }: DataContext) { const filename = c.req.param('filename'); - const evalId = c.req.param('evalId'); const meta = listResultFiles(searchDir).find((m) => m.filename === filename); if (!meta) return c.json({ error: 'Run not found' }, 404); - // Extract file path from wildcard using a mount-agnostic marker - const marker = `/runs/${filename}/evals/${evalId}/files/`; + // Extract the wildcard suffix without depending on decoded route params. + const marker = '/files/'; const markerIdx = c.req.path.indexOf(marker); const filePath = markerIdx >= 0 ? c.req.path.slice(markerIdx + marker.length) : ''; diff --git a/apps/cli/src/commands/results/validate.ts b/apps/cli/src/commands/results/validate.ts index 0a53fbf94..991ffe7df 100644 --- a/apps/cli/src/commands/results/validate.ts +++ b/apps/cli/src/commands/results/validate.ts @@ -3,7 +3,7 @@ * artifacts compatible with the AgentV dashboard and results commands. * * Checks: - * 1. Directory follows the `runs/` naming convention + * 1. Directory follows the `runs//` naming convention * 2. index.jsonl exists and each line has required fields * 3. Per-test grading.json exists for every entry in the index * 4. Per-test timing.json exists (warning if missing) @@ -43,13 +43,15 @@ interface IndexEntry { function checkDirectoryNaming(runDir: string): Diagnostic[] { const dirName = path.basename(runDir); - const parentName = path.basename(path.dirname(runDir)); + const pathSegments = path.normalize(runDir).split(path.sep).filter(Boolean); + const runsIndex = pathSegments.lastIndexOf('runs'); const diagnostics: Diagnostic[] = []; - if (parentName !== 'runs') { + if (runsIndex < 0 || runsIndex >= pathSegments.length - 1) { diagnostics.push({ severity: 'warning', - message: `Directory is not under a 'runs/' parent (found '${parentName}/'). Expected: .agentv/results/runs/`, + message: + "Directory is not under a 'runs/' tree. Expected: .agentv/results/runs//", }); } @@ -65,6 +67,24 @@ function checkDirectoryNaming(runDir: string): Diagnostic[] { return diagnostics; } +export function validateRunDirectory(runDir: string): { + diagnostics: Diagnostic[]; + entries: IndexEntry[]; +} { + const diagnostics: Diagnostic[] = []; + + diagnostics.push(...checkDirectoryNaming(runDir)); + + const { diagnostics: indexDiags, entries } = checkIndexJsonl(runDir); + diagnostics.push(...indexDiags); + + if (entries.length > 0) { + diagnostics.push(...checkArtifactFiles(runDir, entries)); + } + + return { diagnostics, entries }; +} + function checkIndexJsonl(runDir: string): { diagnostics: Diagnostic[]; entries: IndexEntry[] } { const indexPath = path.join(runDir, 'index.jsonl'); const diagnostics: Diagnostic[] = []; @@ -251,17 +271,7 @@ export const resultsValidateCommand = command({ process.exit(1); } - const allDiagnostics: Diagnostic[] = []; - - // Run all checks - allDiagnostics.push(...checkDirectoryNaming(resolvedDir)); - - const { diagnostics: indexDiags, entries } = checkIndexJsonl(resolvedDir); - allDiagnostics.push(...indexDiags); - - if (entries.length > 0) { - allDiagnostics.push(...checkArtifactFiles(resolvedDir, entries)); - } + const { diagnostics: allDiagnostics, entries } = validateRunDirectory(resolvedDir); // Report const errors = allDiagnostics.filter((d) => d.severity === 'error'); diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index 60d54661a..e37d5fbd4 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -138,6 +138,24 @@ describe('results export', () => { ); }); + it('deriveOutputDir preserves experiment directories for canonical nested runs', () => { + const outputDir = deriveOutputDir( + tempDir, + path.join( + tempDir, + '.agentv', + 'results', + 'runs', + 'with-skills', + '2026-03-18T10-00-00-000Z', + 'index.jsonl', + ), + ); + expect(outputDir).toBe( + path.join(tempDir, '.agentv', 'results', 'export', 'with-skills', '2026-03-18T10-00-00-000Z'), + ); + }); + it('deriveOutputDir rejects non-manifest paths', () => { expect(() => deriveOutputDir(tempDir, path.join(tempDir, 'results.jsonl'))).toThrow( 'Expected a run manifest named index.jsonl', diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index efb5ee370..94841e32f 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -389,6 +389,41 @@ describe('serve app', () => { }); }); + describe('GET /api/runs/:filename/evals/:evalId/files/*', () => { + it('loads file content for experiment-scoped run ids', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'with-skills'); + const runId = 'with-skills::2026-03-25T10-00-00-000Z'; + const timestampDir = path.join(runsDir, '2026-03-25T10-00-00-000Z'); + const responsePath = path.join( + timestampDir, + 'demo', + 'test-greeting', + 'outputs', + 'response.md', + ); + + mkdirSync(path.dirname(responsePath), { recursive: true }); + writeFileSync(responsePath, '@[assistant]:\nHello, Alice!'); + writeFileSync( + path.join(timestampDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'with-skills', + output_path: 'demo/test-greeting/outputs/response.md', + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request( + `/api/runs/${encodeURIComponent(runId)}/evals/test-greeting/files/demo/test-greeting/outputs/response.md`, + ); + + expect(res.status).toBe(200); + const data = (await res.json()) as { content: string }; + expect(data.content).toContain('Hello, Alice!'); + }); + }); + // ── SPA fallback ────────────────────────────────────────────────────── describe('SPA fallback', () => { diff --git a/apps/cli/test/commands/results/validate.test.ts b/apps/cli/test/commands/results/validate.test.ts new file mode 100644 index 000000000..c418ab8e6 --- /dev/null +++ b/apps/cli/test/commands/results/validate.test.ts @@ -0,0 +1,46 @@ +import { describe, expect, it } from 'bun:test'; +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; + +import { validateRunDirectory } from '../../../src/commands/results/validate.js'; + +describe('results validate', () => { + it('accepts experiment-scoped canonical run directories without layout warnings', () => { + const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-validate-test-')); + + try { + const runDir = path.join( + tempDir, + '.agentv', + 'results', + 'runs', + 'with-skills', + '2026-03-27T12-42-24-429Z', + ); + mkdirSync(runDir, { recursive: true }); + writeFileSync( + path.join(runDir, 'index.jsonl'), + `${JSON.stringify({ + timestamp: '2026-03-27T12:42:24.429Z', + test_id: 'test-greeting', + score: 1, + target: 'gpt-4o', + execution_status: 'ok', + })}\n`, + ); + + const { diagnostics } = validateRunDirectory(runDir); + + expect(diagnostics.filter((d) => d.severity === 'error')).toEqual([]); + expect(diagnostics.map((d) => d.message)).not.toContain( + "Directory is not under a 'runs/' tree. Expected: .agentv/results/runs//", + ); + expect( + diagnostics.some((d) => d.message.includes('does not match the expected pattern')), + ).toBe(false); + } finally { + rmSync(tempDir, { recursive: true, force: true }); + } + }); +});