EntityProcess · christso · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -61,6 +61,7 @@ export interface BenchmarkArtifact {
     readonly timestamp: string;
     readonly targets: readonly string[];
     readonly tests_run: readonly string[];
+    readonly experiment?: string;
   };
   readonly run_summary: Record<
     string,
@@ -97,6 +98,7 @@ export interface IndexArtifactEntry {
   readonly suite?: string;
   readonly category?: string;
   readonly conversation_id?: string;
+  readonly experiment?: string;
   readonly score: number;
   readonly target: string;
   readonly scores?: readonly Record<string, unknown>[];
@@ -313,6 +315,7 @@ export function buildTimingArtifact(results: readonly EvaluationResult[]): Timin
 export function buildBenchmarkArtifact(
   results: readonly EvaluationResult[],
   evalFile = '',
+  experiment?: string,
 ): BenchmarkArtifact {
   const targetSet = new Set<string>();
   const testIdSet = new Set<string>();
@@ -405,6 +408,7 @@ export function buildBenchmarkArtifact(
       timestamp,
       targets,
       tests_run: testIds,
+      experiment,
     },
     run_summary: runSummary,
     per_grader_summary: perEvaluatorSummary,
@@ -689,7 +693,7 @@ export function parseJsonlResults(content: string): EvaluationResult[] {
 export async function writeArtifacts(
   jsonlPath: string,
   outputDir: string,
-  options?: { evalFile?: string },
+  options?: { evalFile?: string; experiment?: string },
 ): Promise<{
   testArtifactDir: string;
   timingPath: string;
@@ -705,7 +709,7 @@ export async function writeArtifacts(
 export async function writeArtifactsFromResults(
   results: readonly EvaluationResult[],
   outputDir: string,
-  options?: { evalFile?: string },
+  options?: { evalFile?: string; experiment?: string },
 ): Promise<{
   testArtifactDir: string;
   timingPath: string;
@@ -746,15 +750,18 @@ export async function writeArtifactsFromResults(
       );
     }
 
-    indexRecords.push(buildResultIndexArtifact(result));
+    indexRecords.push({
+      ...buildResultIndexArtifact(result),
+      experiment: options?.experiment,
+    });
   }
 
   // Write aggregate timing
   const timing = buildTimingArtifact(results);
   await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');
 
   // Write benchmark
-  const benchmark = buildBenchmarkArtifact(results, options?.evalFile);
+  const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
   await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
 
   await writeJsonlFile(indexPath, indexRecords);

diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
@@ -60,6 +60,11 @@ export const evalRunCommand = command({
       long: 'output-format',
       description: "[Deprecated] Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)",
     }),
+    experiment: option({
+      type: optional(string),
+      long: 'experiment',
+      description: 'Experiment label for canonical run output (default: default)',
+    }),
     export: multioption({
       type: array(string),
       long: 'export',
@@ -223,6 +228,7 @@ export const evalRunCommand = command({
       out: args.out,
       output: args.output,
       outputFormat: args.outputFormat,
+      experiment: args.experiment,
       export: args.export,
       dryRun: args.dryRun,
       dryRunDelay: args.dryRunDelay,

diff --git a/apps/cli/src/commands/eval/result-layout.ts b/apps/cli/src/commands/eval/result-layout.ts
@@ -3,17 +3,42 @@ import path from 'node:path';
 
 export const RESULT_INDEX_FILENAME = 'index.jsonl';
 export const RESULT_RUNS_DIRNAME = 'runs';
+export const DEFAULT_EXPERIMENT_NAME = 'default';
+
+/**
+ * Validates an experiment label and returns the form that is used as a
+ * directory name below the runs directory.
+ *
+ * Surrounding whitespace is trimmed; a missing or blank label yields
+ * `DEFAULT_EXPERIMENT_NAME`.
+ *
+ * @param experiment Raw label, e.g. the value of the `--experiment` option.
+ * @returns The trimmed label, or the default name when none was given.
+ * @throws Error if the label contains anything other than letters, digits,
+ *   ".", "_" and "-", or if it is exactly "." or "..".
+ */
+export function normalizeExperimentName(experiment?: string): string {
+  const trimmed = experiment?.trim();
+  if (!trimmed) {
+    return DEFAULT_EXPERIMENT_NAME;
+  }
+  if (!/^[A-Za-z0-9._-]+$/.test(trimmed)) {
+    throw new Error(
+      `Invalid experiment name "${trimmed}". Use only letters, numbers, ".", "_" and "-".`,
+    );
+  }
+  // "." and ".." pass the character check but are path navigation entries:
+  // joined into the run path they resolve to the runs directory itself or to
+  // its parent, so results would land outside the per-experiment layout.
+  if (trimmed === '.' || trimmed === '..') {
+    throw new Error(`Invalid experiment name "${trimmed}". "." and ".." are not allowed.`);
+  }
+  return trimmed;
+}
 
+/**
+ * Formats a timestamp as the name of a run directory: the ISO-8601 string
+ * from `Date#toISOString()` with every ":" and "." replaced by "-"
+ * (e.g. `2026-04-08T01-02-03-000Z`).
+ *
+ * @param timestamp Moment to encode; defaults to the time of the call.
+ */
 export function createRunDirName(timestamp = new Date()): string {
   return timestamp.toISOString().replace(/[:.]/g, '-');
 }
 
-export function buildDefaultRunDir(cwd: string): string {
-  return path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME, createRunDirName());
+/**
+ * Computes the default directory for a run:
+ * `<cwd>/.agentv/results/<RESULT_RUNS_DIRNAME>/<experiment>/<timestamp>`.
+ *
+ * Only the path is built; nothing is created on disk.
+ *
+ * @param cwd Base directory the `.agentv` folder lives in.
+ * @param experiment Experiment label; passed through `normalizeExperimentName`,
+ *   so a blank label maps to the default name and an invalid one throws.
+ * @param timestamp Moment encoded into the last path segment; defaults to now.
+ */
+export function buildDefaultRunDir(
+  cwd: string,
+  experiment?: string,
+  timestamp = new Date(),
+): string {
+  const experimentDir = normalizeExperimentName(experiment);
+  const runDirName = createRunDirName(timestamp);
+  const segments = ['.agentv', 'results', RESULT_RUNS_DIRNAME, experimentDir, runDirName];
+  return path.join(cwd, ...segments);
 }
 
-export function buildDefaultIndexPath(cwd: string): string {
-  return path.join(buildDefaultRunDir(cwd), RESULT_INDEX_FILENAME);
+/**
+ * Returns the path of the index manifest (`RESULT_INDEX_FILENAME`) inside the
+ * default run directory for `experiment`.
+ *
+ * The run directory is derived from the current time on every call, so two
+ * calls made at different moments return different paths. Nothing is created
+ * on disk.
+ */
+export function buildDefaultIndexPath(cwd: string, experiment?: string): string {
+  const runDir = buildDefaultRunDir(cwd, experiment);
+  return path.join(runDir, RESULT_INDEX_FILENAME);
 }
 
 export function resolveRunIndexPath(runDir: string): string {

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -33,7 +33,7 @@ import { writeBenchmarkJson } from './benchmark-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
 import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
 import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
-import { buildDefaultRunDir } from './result-layout.js';
+import { buildDefaultRunDir, normalizeExperimentName } from './result-layout.js';
 import {
   buildExclusionFilter,
   loadErrorTestIds,
@@ -96,6 +96,7 @@ interface NormalizedOptions {
   readonly tags: readonly string[];
   readonly excludeTags: readonly string[];
   readonly transcript?: string;
+  readonly experiment?: string;
 }
 
 function normalizeBoolean(value: unknown): boolean {
@@ -363,6 +364,7 @@ function normalizeOptions(
     tags: normalizeStringArray(rawOptions.tag),
     excludeTags: normalizeStringArray(rawOptions.excludeTag),
     transcript: normalizeString(rawOptions.transcript),
+    experiment: normalizeString(rawOptions.experiment),
   } satisfies NormalizedOptions;
 }
 
@@ -374,8 +376,8 @@ async function ensureFileExists(filePath: string, description: string): Promise<
   }
 }
 
-function buildDefaultOutputPath(cwd: string): string {
-  const runDir = buildDefaultRunDir(cwd);
+/**
+ * Creates the default run directory for `experiment` below `cwd` (including
+ * any missing parent directories) and returns the path of the `index.jsonl`
+ * file inside it. The file itself is not created here.
+ */
+function buildDefaultOutputPathForExperiment(cwd: string, experiment?: string): string {
+  const runDir = buildDefaultRunDir(cwd, experiment);
   mkdirSync(runDir, { recursive: true });
   return path.join(runDir, 'index.jsonl');
 }
@@ -894,6 +896,9 @@ export async function runEvalCommand(
   }
 
   let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
+  if (!process.env.AGENTV_EXPERIMENT) {
+    process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment);
+  }
 
   // Validate --grader-target / --model combinations
   if (options.graderTarget === 'agentv' && !options.model) {
@@ -987,8 +992,8 @@ export async function runEvalCommand(
     mkdirSync(runDir, { recursive: true });
     usesDefaultArtifactWorkspace = false;
   } else {
-    // Default: .agentv/results/runs/<timestamp>/
-    outputPath = buildDefaultOutputPath(cwd);
+    // Default: .agentv/results/runs/<experiment>/<timestamp>/
+    outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment);
     runDir = path.dirname(outputPath);
     usesDefaultArtifactWorkspace = true;
   }
@@ -1426,6 +1431,7 @@ export async function runEvalCommand(
         indexPath,
       } = await writeArtifactsFromResults(allResults, runDir, {
         evalFile,
+        experiment: normalizeExperimentName(options.experiment),
       });
       console.log(`Artifact workspace written to: ${runDir}`);
       console.log(`  Index: ${indexPath}`);

diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts
@@ -523,31 +523,65 @@ export function toTraceSummary(result: RawResult): TraceSummary | undefined {
+/** Summary of one run manifest, as returned by `listResultFiles`. */
 export interface ResultFileMeta {
+  /** Path of the run's manifest file. */
   path: string;
+  /**
+   * Run identifier produced by `buildRunId` from the run directory's path
+   * relative to the runs directory (not a plain file name).
+   */
   filename: string;
+  /** Base name of the run directory. */
+  displayName: string;
   timestamp: string;
   testCount: number;
   passRate: number;
   avgScore: number;
   sizeBytes: number;
 }
 
+/**
+ * Derives a run identifier from a run directory's path relative to the runs
+ * directory.
+ *
+ * The last path segment is treated as the timestamp and everything before it
+ * as the experiment:
+ * - `<timestamp>`                -> `<timestamp>`
+ * - `default/<timestamp>`        -> `<timestamp>`
+ * - `<experiment>/<timestamp>`   -> `<experiment>::<timestamp>`
+ *
+ * @param relativeRunPath Run directory path relative to the runs directory,
+ *   using the platform separator.
+ * @returns The run identifier; always a string. For a path without any
+ *   segment (empty or separators only) the separator-normalized input is
+ *   returned instead of `undefined`.
+ */
+function buildRunId(relativeRunPath: string): string {
+  const normalized = relativeRunPath.split(path.sep).join('/');
+  const segments = normalized.split('/').filter(Boolean);
+  const timestamp = segments.at(-1);
+  if (timestamp === undefined) {
+    // No segments at all: indexing the empty array would hand back undefined.
+    return normalized;
+  }
+  const experiment = segments.slice(0, -1).join('/');
+  // 'default' mirrors DEFAULT_EXPERIMENT_NAME in eval/result-layout.ts; runs of
+  // the default experiment keep the bare timestamp as their id.
+  if (experiment === '' || experiment === 'default') {
+    return timestamp;
+  }
+  return `${experiment}::${timestamp}`;
+}
+
+/**
+ * Searches `currentDir` for run directories and appends one entry per run to
+ * `files`.
+ *
+ * A directory counts as a run when `resolveExistingRunPrimaryPath` returns a
+ * path for it; it is then recorded and not descended into. Any other directory
+ * is searched recursively through its subdirectories.
+ *
+ * @param runsDir Root directory that run ids are computed relative to.
+ * @param currentDir Directory to inspect.
+ * @param files Accumulator; mutated in place.
+ */
+function collectRunManifestPaths(
+  runsDir: string,
+  currentDir: string,
+  files: { filePath: string; displayName: string; runId: string }[],
+): void {
+  const manifestPath = resolveExistingRunPrimaryPath(currentDir);
+
+  if (!manifestPath) {
+    // Not a run directory itself: continue the search in its subdirectories.
+    const subdirs = readdirSync(currentDir, { withFileTypes: true }).filter((child) =>
+      child.isDirectory(),
+    );
+    for (const subdir of subdirs) {
+      collectRunManifestPaths(runsDir, path.join(currentDir, subdir.name), files);
+    }
+    return;
+  }
+
+  const runId = buildRunId(path.relative(runsDir, currentDir));
+  files.push({ filePath: manifestPath, displayName: path.basename(currentDir), runId });
+}
+
 /**
  * Enumerate canonical run manifests in `.agentv/results/runs/`.
  */
 export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {
   const runsDir = path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME);
 
-  const files: { filePath: string; displayName: string }[] = [];
+  const files: { filePath: string; displayName: string; runId: string }[] = [];
 
   try {
     const entries = readdirSync(runsDir, { withFileTypes: true });
     for (const entry of entries) {
-      if (!entry.isDirectory()) {
-        continue;
-      }
-
-      const primaryPath = resolveExistingRunPrimaryPath(path.join(runsDir, entry.name));
-      if (primaryPath) {
-        files.push({ filePath: primaryPath, displayName: entry.name });
+      if (entry.isDirectory()) {
+        collectRunManifestPaths(runsDir, path.join(runsDir, entry.name), files);
       }
     }
   } catch {
@@ -561,7 +595,7 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {
 
   const metas: ResultFileMeta[] = [];
 
-  for (const { filePath, displayName } of limited) {
+  for (const { filePath, displayName, runId } of limited) {
     try {
       const fileStat = statSync(filePath);
       const results = loadResultFile(filePath);
@@ -576,7 +610,8 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {
 
       metas.push({
         path: filePath,
-        filename: displayName,
+        filename: runId,
+        displayName,
         timestamp,
         testCount,
         passRate,

diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts
@@ -43,7 +43,7 @@ export const evalInputCommand = command({
       type: optional(string),
       long: 'out',
       description:
-        'Output directory for extracted inputs (default: .agentv/results/runs/<timestamp>)',
+        'Output directory for extracted inputs (default: .agentv/results/runs/<experiment>/<timestamp>)',
     }),
     experiment: option({
       type: optional(string),
@@ -53,7 +53,7 @@ export const evalInputCommand = command({
   },
   handler: async ({ evalPath, out, experiment }) => {
     const resolvedEvalPath = resolve(evalPath);
-    const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
+    const outDir = resolve(out ?? buildDefaultRunDir(process.cwd(), experiment));
     const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
     const evalDir = dirname(resolvedEvalPath);
 

diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
@@ -73,7 +73,8 @@ export const evalRunCommand = command({
     out: option({
       type: optional(string),
       long: 'out',
-      description: 'Output directory for results (default: .agentv/results/runs/<timestamp>)',
+      description:
+        'Output directory for results (default: .agentv/results/runs/<experiment>/<timestamp>)',
     }),
     workers: option({
       type: optional(number),
@@ -94,7 +95,7 @@ export const evalRunCommand = command({
   },
   handler: async ({ evalPath, out, workers, experiment, graderType }) => {
     const resolvedEvalPath = resolve(evalPath);
-    const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
+    const outDir = resolve(out ?? buildDefaultRunDir(process.cwd(), experiment));
     const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
     const evalDir = dirname(resolvedEvalPath);
 

diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts
@@ -185,7 +185,12 @@ function resolveCliPath(cwd: string): { bunPath: string; cliPath: string } | und
 // biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route
 type C = Context<any, any, any>;
 
-export function registerEvalRoutes(app: Hono, getCwd: (c: C) => string) {
+export function registerEvalRoutes(
+  app: Hono,
+  getCwd: (c: C) => string,
+  options?: { readOnly?: boolean },
+) {
+  const readOnly = options?.readOnly === true;
   // ── Discovery: eval files ──────────────────────────────────────────────
   app.get('/api/eval/discover', async (c) => {
     const cwd = getCwd(c);
@@ -216,6 +221,9 @@ export function registerEvalRoutes(app: Hono, getCwd: (c: C) => string) {
 
   // ── Launch eval run ────────────────────────────────────────────────────
   app.post('/api/eval/run', async (c) => {
+    if (readOnly) {
+      return c.json({ error: 'Studio is running in read-only mode' }, 403);
+    }
     const cwd = getCwd(c);
 
     let body: RunEvalRequest;

diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts
@@ -59,7 +59,14 @@ export function deriveOutputDir(cwd: string, sourceFile: string): string {
     throw new Error(`Expected a run manifest named ${RESULT_INDEX_FILENAME}: ${sourceFile}`);
   }
 
-  const parentDir = path.basename(path.dirname(sourceFile));
+  const runDir = path.dirname(sourceFile);
+  const segments = path.normalize(runDir).split(path.sep).filter(Boolean);
+  const runsIndex = segments.lastIndexOf('runs');
+  if (runsIndex >= 0 && runsIndex < segments.length - 1) {
+    return path.join(cwd, '.agentv', 'results', 'export', ...segments.slice(runsIndex + 1));
+  }
+
+  const parentDir = path.basename(runDir);
   if (parentDir.startsWith('eval_')) {
     return path.join(cwd, '.agentv', 'results', 'export', parentDir.slice(5));
   }