Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ export interface BenchmarkArtifact {
readonly timestamp: string;
readonly targets: readonly string[];
readonly tests_run: readonly string[];
readonly experiment?: string;
};
readonly run_summary: Record<
string,
Expand Down Expand Up @@ -97,6 +98,7 @@ export interface IndexArtifactEntry {
readonly suite?: string;
readonly category?: string;
readonly conversation_id?: string;
readonly experiment?: string;
readonly score: number;
readonly target: string;
readonly scores?: readonly Record<string, unknown>[];
Expand Down Expand Up @@ -313,6 +315,7 @@ export function buildTimingArtifact(results: readonly EvaluationResult[]): Timin
export function buildBenchmarkArtifact(
results: readonly EvaluationResult[],
evalFile = '',
experiment?: string,
): BenchmarkArtifact {
const targetSet = new Set<string>();
const testIdSet = new Set<string>();
Expand Down Expand Up @@ -405,6 +408,7 @@ export function buildBenchmarkArtifact(
timestamp,
targets,
tests_run: testIds,
experiment,
},
run_summary: runSummary,
per_grader_summary: perEvaluatorSummary,
Expand Down Expand Up @@ -689,7 +693,7 @@ export function parseJsonlResults(content: string): EvaluationResult[] {
export async function writeArtifacts(
jsonlPath: string,
outputDir: string,
options?: { evalFile?: string },
options?: { evalFile?: string; experiment?: string },
): Promise<{
testArtifactDir: string;
timingPath: string;
Expand All @@ -705,7 +709,7 @@ export async function writeArtifacts(
export async function writeArtifactsFromResults(
results: readonly EvaluationResult[],
outputDir: string,
options?: { evalFile?: string },
options?: { evalFile?: string; experiment?: string },
): Promise<{
testArtifactDir: string;
timingPath: string;
Expand Down Expand Up @@ -746,15 +750,18 @@ export async function writeArtifactsFromResults(
);
}

indexRecords.push(buildResultIndexArtifact(result));
indexRecords.push({
...buildResultIndexArtifact(result),
experiment: options?.experiment,
});
}

// Write aggregate timing
const timing = buildTimingArtifact(results);
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');

// Write benchmark
const benchmark = buildBenchmarkArtifact(results, options?.evalFile);
const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');

await writeJsonlFile(indexPath, indexRecords);
Expand Down
6 changes: 6 additions & 0 deletions apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ export const evalRunCommand = command({
long: 'output-format',
description: "[Deprecated] Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)",
}),
experiment: option({
type: optional(string),
long: 'experiment',
description: 'Experiment label for canonical run output (default: default)',
}),
export: multioption({
type: array(string),
long: 'export',
Expand Down Expand Up @@ -223,6 +228,7 @@ export const evalRunCommand = command({
out: args.out,
output: args.output,
outputFormat: args.outputFormat,
experiment: args.experiment,
export: args.export,
dryRun: args.dryRun,
dryRunDelay: args.dryRunDelay,
Expand Down
33 changes: 29 additions & 4 deletions apps/cli/src/commands/eval/result-layout.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,42 @@ import path from 'node:path';

export const RESULT_INDEX_FILENAME = 'index.jsonl';
export const RESULT_RUNS_DIRNAME = 'runs';
/** Experiment label used when the caller supplies none (or only whitespace). */
export const DEFAULT_EXPERIMENT_NAME = 'default';

/**
 * Validate and normalize an experiment label so it can be used as a single
 * directory segment of the run output path.
 *
 * @param experiment - Raw label; surrounding whitespace is ignored.
 * @returns `DEFAULT_EXPERIMENT_NAME` when the label is missing or blank,
 *   otherwise the trimmed label.
 * @throws Error when the label contains anything other than letters, digits,
 *   ".", "_" or "-", or when it is exactly "." or "..".
 */
export function normalizeExperimentName(experiment?: string): string {
  const trimmed = experiment?.trim();
  if (!trimmed) {
    return DEFAULT_EXPERIMENT_NAME;
  }
  if (!/^[A-Za-z0-9._-]+$/.test(trimmed)) {
    throw new Error(
      `Invalid experiment name "${trimmed}". Use only letters, numbers, ".", "_" and "-".`,
    );
  }
  // "." and ".." pass the character whitelist but are path-navigation segments:
  // joined into a run directory they would resolve to the runs directory itself
  // or to its parent instead of a dedicated experiment folder.
  if (trimmed === '.' || trimmed === '..') {
    throw new Error(
      `Invalid experiment name "${trimmed}". "." and ".." are reserved path segments.`,
    );
  }
  return trimmed;
}

/**
 * Turn a date into a name usable as a run directory.
 *
 * The result is the date's ISO-8601 string with every ":" and "." replaced
 * by "-".
 *
 * @param timestamp - Moment to encode; defaults to the current time.
 * @returns The ISO string with ":" and "." swapped for "-".
 */
export function createRunDirName(timestamp = new Date()): string {
  const isoStamp = timestamp.toISOString();
  const pieces = isoStamp.split(/[:.]/);
  return pieces.join('-');
}

export function buildDefaultRunDir(cwd: string): string {
return path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME, createRunDirName());
/**
 * Compute the default directory for a run:
 * `<cwd>/.agentv/results/<RESULT_RUNS_DIRNAME>/<experiment>/<run dir name>`.
 *
 * @param cwd - Base directory the `.agentv` tree lives under.
 * @param experiment - Optional experiment label; passed through
 *   `normalizeExperimentName` before being used as a path segment.
 * @param timestamp - Moment used for the run directory name; defaults to now.
 * @returns The joined directory path (nothing is created on disk here).
 */
export function buildDefaultRunDir(
  cwd: string,
  experiment?: string,
  timestamp = new Date(),
): string {
  // Resolve the two dynamic segments first, experiment before timestamp.
  const experimentSegment = normalizeExperimentName(experiment);
  const runSegment = createRunDirName(timestamp);
  const segments = [
    cwd,
    '.agentv',
    'results',
    RESULT_RUNS_DIRNAME,
    experimentSegment,
    runSegment,
  ];
  return path.join(...segments);
}

export function buildDefaultIndexPath(cwd: string): string {
return path.join(buildDefaultRunDir(cwd), RESULT_INDEX_FILENAME);
/**
 * Path of the index manifest inside a freshly computed default run directory.
 *
 * @param cwd - Base directory handed to `buildDefaultRunDir`.
 * @param experiment - Optional experiment label, forwarded unchanged.
 * @returns `<default run dir>/<RESULT_INDEX_FILENAME>`.
 */
export function buildDefaultIndexPath(cwd: string, experiment?: string): string {
  const runDirectory = buildDefaultRunDir(cwd, experiment);
  return path.join(runDirectory, RESULT_INDEX_FILENAME);
}

export function resolveRunIndexPath(runDir: string): string {
Expand Down
16 changes: 11 additions & 5 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import { writeBenchmarkJson } from './benchmark-writer.js';
import { loadEnvFromHierarchy } from './env.js';
import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
import { buildDefaultRunDir } from './result-layout.js';
import { buildDefaultRunDir, normalizeExperimentName } from './result-layout.js';
import {
buildExclusionFilter,
loadErrorTestIds,
Expand Down Expand Up @@ -96,6 +96,7 @@ interface NormalizedOptions {
readonly tags: readonly string[];
readonly excludeTags: readonly string[];
readonly transcript?: string;
readonly experiment?: string;
}

function normalizeBoolean(value: unknown): boolean {
Expand Down Expand Up @@ -363,6 +364,7 @@ function normalizeOptions(
tags: normalizeStringArray(rawOptions.tag),
excludeTags: normalizeStringArray(rawOptions.excludeTag),
transcript: normalizeString(rawOptions.transcript),
experiment: normalizeString(rawOptions.experiment),
} satisfies NormalizedOptions;
}

Expand All @@ -374,8 +376,8 @@ async function ensureFileExists(filePath: string, description: string): Promise<
}
}

function buildDefaultOutputPath(cwd: string): string {
const runDir = buildDefaultRunDir(cwd);
/**
 * Create the default run directory for an experiment and return the path of
 * the `index.jsonl` file inside it.
 *
 * @param cwd - Base directory handed to `buildDefaultRunDir`.
 * @param experiment - Optional experiment label, forwarded unchanged.
 * @returns Path to `index.jsonl` within the run directory that was created.
 */
function buildDefaultOutputPathForExperiment(cwd: string, experiment?: string): string {
  const targetDir = buildDefaultRunDir(cwd, experiment);
  // Make sure the directory (and any missing parents) exists before returning.
  mkdirSync(targetDir, { recursive: true });
  const indexFile = path.join(targetDir, 'index.jsonl');
  return indexFile;
}
Expand Down Expand Up @@ -894,6 +896,9 @@ export async function runEvalCommand(
}

let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
if (!process.env.AGENTV_EXPERIMENT) {
process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment);
}

// Validate --grader-target / --model combinations
if (options.graderTarget === 'agentv' && !options.model) {
Expand Down Expand Up @@ -987,8 +992,8 @@ export async function runEvalCommand(
mkdirSync(runDir, { recursive: true });
usesDefaultArtifactWorkspace = false;
} else {
// Default: .agentv/results/runs/<timestamp>/
outputPath = buildDefaultOutputPath(cwd);
// Default: .agentv/results/runs/<experiment>/<timestamp>/
outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment);
runDir = path.dirname(outputPath);
usesDefaultArtifactWorkspace = true;
}
Expand Down Expand Up @@ -1426,6 +1431,7 @@ export async function runEvalCommand(
indexPath,
} = await writeArtifactsFromResults(allResults, runDir, {
evalFile,
experiment: normalizeExperimentName(options.experiment),
});
console.log(`Artifact workspace written to: ${runDir}`);
console.log(` Index: ${indexPath}`);
Expand Down
55 changes: 45 additions & 10 deletions apps/cli/src/commands/inspect/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -523,31 +523,65 @@ export function toTraceSummary(result: RawResult): TraceSummary | undefined {
/**
 * Summary of one run manifest, as assembled by `listResultFiles`.
 */
export interface ResultFileMeta {
/** Path of the manifest file the summary was loaded from. */
path: string;
/**
 * Run identifier. `listResultFiles` fills this with the ID derived from the
 * run directory's path relative to the runs directory, not a bare file name.
 */
filename: string;
/** Base name of the run directory. */
displayName: string;
/** NOTE(review): presumably the run's timestamp string; its derivation is not visible here, so confirm. */
timestamp: string;
/** NOTE(review): presumably the number of results in the manifest; confirm against `listResultFiles`. */
testCount: number;
/** NOTE(review): presumably the fraction of passing results; scale (0-1 vs. percent) not visible here, so confirm. */
passRate: number;
/** NOTE(review): presumably the mean result score; confirm against `listResultFiles`. */
avgScore: number;
/** NOTE(review): presumably the manifest file size in bytes taken from its stat; confirm. */
sizeBytes: number;
}

/**
 * Derive a run identifier from a run directory path given relative to the
 * runs root.
 *
 * - `<timestamp>` yields `<timestamp>`
 * - `default/<timestamp>` yields `<timestamp>`
 * - `<experiment>/<timestamp>` yields `<experiment>::<timestamp>`; with deeper
 *   nesting every leading segment stays in the experiment part, joined by "/".
 *
 * @param relativeRunPath - Path using the platform separator; empty segments
 *   are ignored.
 * @returns The run identifier. When the path has no usable segment (empty or
 *   separators only) the separator-normalized input is returned, so the
 *   result is always a string.
 */
function buildRunId(relativeRunPath: string): string {
  const normalized = relativeRunPath.split(path.sep).join('/');
  const segments = normalized.split('/').filter(Boolean);

  // The last segment is the run (timestamp) directory.
  const timestamp = segments[segments.length - 1];
  if (timestamp === undefined) {
    // No segments at all: never leak `undefined` through the `string` return type.
    return normalized;
  }
  if (segments.length === 1) {
    return timestamp;
  }

  const experiment = segments.slice(0, -1).join('/');
  // Runs under the default experiment keep their bare timestamp as the ID.
  return experiment === 'default' ? timestamp : `${experiment}::${timestamp}`;
}

/**
 * Walk `currentDir` depth-first and append every run manifest found to `files`.
 *
 * A directory for which `resolveExistingRunPrimaryPath` returns a path is
 * recorded as a run and not descended into; any other directory has each of
 * its subdirectories visited in turn.
 *
 * @param runsDir - Root of the runs tree; run IDs are derived from paths
 *   relative to it.
 * @param currentDir - Directory being inspected.
 * @param files - Accumulator, mutated in place.
 */
function collectRunManifestPaths(
  runsDir: string,
  currentDir: string,
  files: { filePath: string; displayName: string; runId: string }[],
): void {
  const manifestPath = resolveExistingRunPrimaryPath(currentDir);

  if (!manifestPath) {
    // Not a run directory itself: look one level deeper.
    const subdirectories = readdirSync(currentDir, { withFileTypes: true }).filter((child) =>
      child.isDirectory(),
    );
    for (const child of subdirectories) {
      collectRunManifestPaths(runsDir, path.join(currentDir, child.name), files);
    }
    return;
  }

  const pathFromRunsRoot = path.relative(runsDir, currentDir);
  const record = {
    filePath: manifestPath,
    displayName: path.basename(currentDir),
    runId: buildRunId(pathFromRunsRoot),
  };
  files.push(record);
}

/**
* Enumerate canonical run manifests in `.agentv/results/runs/`.
*/
export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {
const runsDir = path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME);

const files: { filePath: string; displayName: string }[] = [];
const files: { filePath: string; displayName: string; runId: string }[] = [];

try {
const entries = readdirSync(runsDir, { withFileTypes: true });
for (const entry of entries) {
if (!entry.isDirectory()) {
continue;
}

const primaryPath = resolveExistingRunPrimaryPath(path.join(runsDir, entry.name));
if (primaryPath) {
files.push({ filePath: primaryPath, displayName: entry.name });
if (entry.isDirectory()) {
collectRunManifestPaths(runsDir, path.join(runsDir, entry.name), files);
}
}
} catch {
Expand All @@ -561,7 +595,7 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {

const metas: ResultFileMeta[] = [];

for (const { filePath, displayName } of limited) {
for (const { filePath, displayName, runId } of limited) {
try {
const fileStat = statSync(filePath);
const results = loadResultFile(filePath);
Expand All @@ -576,7 +610,8 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {

metas.push({
path: filePath,
filename: displayName,
filename: runId,
displayName,
timestamp,
testCount,
passRate,
Expand Down
4 changes: 2 additions & 2 deletions apps/cli/src/commands/pipeline/input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ export const evalInputCommand = command({
type: optional(string),
long: 'out',
description:
'Output directory for extracted inputs (default: .agentv/results/runs/<timestamp>)',
'Output directory for extracted inputs (default: .agentv/results/runs/<experiment>/<timestamp>)',
}),
experiment: option({
type: optional(string),
Expand All @@ -53,7 +53,7 @@ export const evalInputCommand = command({
},
handler: async ({ evalPath, out, experiment }) => {
const resolvedEvalPath = resolve(evalPath);
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd(), experiment));
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
const evalDir = dirname(resolvedEvalPath);

Expand Down
5 changes: 3 additions & 2 deletions apps/cli/src/commands/pipeline/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ export const evalRunCommand = command({
out: option({
type: optional(string),
long: 'out',
description: 'Output directory for results (default: .agentv/results/runs/<timestamp>)',
description:
'Output directory for results (default: .agentv/results/runs/<experiment>/<timestamp>)',
}),
workers: option({
type: optional(number),
Expand All @@ -94,7 +95,7 @@ export const evalRunCommand = command({
},
handler: async ({ evalPath, out, workers, experiment, graderType }) => {
const resolvedEvalPath = resolve(evalPath);
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd(), experiment));
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
const evalDir = dirname(resolvedEvalPath);

Expand Down
10 changes: 9 additions & 1 deletion apps/cli/src/commands/results/eval-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,12 @@ function resolveCliPath(cwd: string): { bunPath: string; cliPath: string } | und
// biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route
type C = Context<any, any, any>;

export function registerEvalRoutes(app: Hono, getCwd: (c: C) => string) {
export function registerEvalRoutes(
app: Hono,
getCwd: (c: C) => string,
options?: { readOnly?: boolean },
) {
const readOnly = options?.readOnly === true;
// ── Discovery: eval files ──────────────────────────────────────────────
app.get('/api/eval/discover', async (c) => {
const cwd = getCwd(c);
Expand Down Expand Up @@ -216,6 +221,9 @@ export function registerEvalRoutes(app: Hono, getCwd: (c: C) => string) {

// ── Launch eval run ────────────────────────────────────────────────────
app.post('/api/eval/run', async (c) => {
if (readOnly) {
return c.json({ error: 'Studio is running in read-only mode' }, 403);
}
const cwd = getCwd(c);

let body: RunEvalRequest;
Expand Down
9 changes: 8 additions & 1 deletion apps/cli/src/commands/results/export.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,14 @@ export function deriveOutputDir(cwd: string, sourceFile: string): string {
throw new Error(`Expected a run manifest named ${RESULT_INDEX_FILENAME}: ${sourceFile}`);
}

const parentDir = path.basename(path.dirname(sourceFile));
const runDir = path.dirname(sourceFile);
const segments = path.normalize(runDir).split(path.sep).filter(Boolean);
const runsIndex = segments.lastIndexOf('runs');
if (runsIndex >= 0 && runsIndex < segments.length - 1) {
return path.join(cwd, '.agentv', 'results', 'export', ...segments.slice(runsIndex + 1));
}

const parentDir = path.basename(runDir);
if (parentDir.startsWith('eval_')) {
return path.join(cwd, '.agentv', 'results', 'export', parentDir.slice(5));
}
Expand Down
Loading
Loading