From 56c8e5c8f30c2277605b4a3147ecf1f25e2a702c Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Mon, 6 Apr 2026 22:27:42 +0000
Subject: [PATCH] refactor(cli): consolidate eval run output flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make --output mean "artifact directory" (the single canonical output).
Add --export for secondary file outputs (HTML, XML, YAML, JSON).
Deprecate --out, --output-format, --benchmark-json, --artifacts with
warnings — all continue working for backward compatibility.

- --output <dir>: artifact directory (index.jsonl, benchmark.json,
  per-test grading/timing)
- --export <file>: repeatable flag for additional output formats,
  format inferred from extension
- Primary writer always JSONL; removed dead OutputFormat plumbing
- Updated CI workflow, docs, and skills to use new flags
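
Example invocation (illustrative paths; new flags only):

    agentv eval suite.yaml --output .agentv/run \
      --export run-report.html --export junit.xml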

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .github/workflows/evals.yml                 |   5 +-
 apps/cli/src/commands/eval/commands/run.ts  |  22 ++-
 apps/cli/src/commands/eval/run-eval.ts      | 161 ++++++++++--------
 apps/cli/test/eval.integration.test.ts      |   7 +-
 .../docs/docs/evaluation/running-evals.mdx  |  18 +-
 .../docs/docs/guides/agent-skills-evals.mdx |   5 +-
 .../agentv-dev/skills/agentv-bench/SKILL.md |   2 +-
 7 files changed, 129 insertions(+), 91 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index 5fa81e046..c129ab86e 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -88,9 +88,8 @@ jobs:
             "${TARGET_FLAG[@]}" \
             --workers 3 \
             --threshold ${{ steps.filter.outputs.threshold }} \
-            --output .agentv/ci-results/junit.xml \
-            --benchmark-json .agentv/ci-results/benchmark.json \
-            --artifacts .agentv/ci-results/artifacts
+            --output .agentv/ci-results/artifacts \
+            --export .agentv/ci-results/junit.xml
           EXIT_CODE=$?
           echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT"
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index d46665abe..098cffa5c 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -46,19 +46,25 @@ export const evalRunCommand = command({
     out: option({
       type: optional(string),
       long: 'out',
-      description: 'Write results to the specified path',
+      description: '[Deprecated: use --output] Write results to the specified path',
     }),
-    output: multioption({
-      type: array(string),
+    output: option({
+      type: optional(string),
       long: 'output',
       short: 'o',
       description:
-        'Output file path(s). Format inferred from extension: .jsonl, .json, .xml, .yaml, .html',
+        'Artifact directory for run output (index.jsonl, benchmark.json, per-test grading/timing)',
     }),
     outputFormat: option({
       type: optional(string),
       long: 'output-format',
-      description: "Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)",
+      description: "[Deprecated] Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)",
+    }),
+    export: multioption({
+      type: array(string),
+      long: 'export',
+      description:
+        'Write additional output file(s). Format inferred from extension: .jsonl, .json, .xml, .yaml, .html (repeatable)',
     }),
     dryRun: flag({
       long: 'dry-run',
@@ -151,13 +157,14 @@ export const evalRunCommand = command({
     benchmarkJson: option({
       type: optional(string),
       long: 'benchmark-json',
-      description: 'Write Agent Skills benchmark.json to the specified path',
+      description:
+        '[Deprecated: benchmark.json is included in artifact dir] Write Agent Skills benchmark.json to the specified path',
     }),
     artifacts: option({
       type: optional(string),
       long: 'artifacts',
       description:
-        'Write companion artifacts (index.jsonl, <test>/grading.json, <test>/timing.json, timing.json, benchmark.json) to the specified directory',
+        '[Deprecated: use --output] Write companion artifacts to the specified directory',
     }),
     graderTarget: option({
       type: optional(string),
@@ -216,6 +223,7 @@ export const evalRunCommand = command({
       out: args.out,
       output: args.output,
      outputFormat: args.outputFormat,
+      export: args.export,
       dryRun: args.dryRun,
       dryRunDelay: args.dryRunDelay,
       dryRunDelayMin: args.dryRunDelayMin,
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 5b316c6b2..4b052d32c 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -31,12 +31,7 @@ import { enforceRequiredVersion } from '../../version-check.js';
 import { writeArtifactsFromResults } from './artifact-writer.js';
 import { writeBenchmarkJson } from './benchmark-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
-import {
-  type OutputFormat,
-  type OutputWriter,
-  createMultiWriter,
-  createOutputWriter,
-} from './output-writer.js';
+import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
 import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
 import { buildDefaultRunDir } from './result-layout.js';
 import { loadErrorTestIds, loadNonErrorResults } from './retry-errors.js';
@@ -62,9 +57,12 @@ interface NormalizedOptions {
   readonly targetsPath?: string;
   readonly filter?: string | readonly string[];
   readonly workers?: number;
+  /** --output <dir>: artifact directory (new canonical meaning) */
+  readonly outputDir?: string;
+  /** Legacy --out <path>: deprecated, treated as artifact dir */
   readonly outPath?: string;
-  readonly outputPaths: readonly string[];
-  readonly format: OutputFormat;
+  /** --export <file>: additional output files */
+  readonly exportPaths: readonly string[];
   readonly dryRun: boolean;
   readonly dryRunDelay: number;
   readonly dryRunDelayMin: number;
@@ -82,7 +80,9 @@ interface NormalizedOptions {
   readonly retryErrors?: string;
   readonly workspaceMode?: 'pooled' | 'temp' | 'static';
   readonly workspacePath?: string;
+  /** Deprecated: benchmark.json is always written to artifact dir */
   readonly benchmarkJson?: string;
+  /** Deprecated: use --output instead */
   readonly artifacts?: string;
   readonly graderTarget?: string;
   readonly model?: string;
@@ -247,18 +247,17 @@ function normalizeOptions(
   config?: Awaited>,
   yamlExecution?: ExecutionDefaults,
 ): NormalizedOptions {
-  const cliFormat = normalizeString(rawOptions.outputFormat);
-  const configFormat = config?.output?.format;
-  const formatStr = cliFormat ?? configFormat ?? 'jsonl';
-  const format: OutputFormat = formatStr === 'yaml' ? 'yaml' : 'jsonl';
-
   const cliWorkers = normalizeOptionalNumber(rawOptions.workers);
   const configWorkers = config?.execution?.workers;
   const workers = cliWorkers ?? configWorkers ?? 0;
 
-  const rawOutputPaths = rawOptions.output;
-  const outputPaths: string[] = Array.isArray(rawOutputPaths)
-    ? rawOutputPaths.filter((v): v is string => typeof v === 'string' && v.trim().length > 0)
+  // --output is now a single optional string (artifact directory)
+  const cliOutputDir = normalizeString(rawOptions.output);
+
+  // --export is the new repeatable flag for additional output files
+  const rawExportPaths = rawOptions.export;
+  const exportPaths: string[] = Array.isArray(rawExportPaths)
+    ? rawExportPaths.filter((v): v is string => typeof v === 'string' && v.trim().length > 0)
     : [];
 
   // Normalize --target: can be a string (legacy) or string[] (multioption)
@@ -313,9 +312,9 @@ function normalizeOptions(
     targetsPath: normalizeString(rawOptions.targets),
     filter: normalizeFilter(rawOptions.filter),
     workers: workers > 0 ? workers : undefined,
+    outputDir: cliOutputDir,
     outPath: cliOut ?? configOut,
-    outputPaths,
-    format,
+    exportPaths,
     dryRun: normalizeBoolean(rawOptions.dryRun),
     dryRunDelay: normalizeNumber(rawOptions.dryRunDelay, 0),
     dryRunDelayMin: normalizeNumber(rawOptions.dryRunDelayMin, 0),
@@ -937,8 +936,51 @@ export async function runEvalCommand(
     console.log(`Repository root: ${repoRoot}`);
   }
 
-  const usesDefaultArtifactWorkspace = !options.outPath;
-  const outputPath = options.outPath ? path.resolve(options.outPath) : buildDefaultOutputPath(cwd);
+  // Emit deprecation warnings for legacy flags
+  if (options.outPath) {
+    console.warn('Warning: --out is deprecated. Use --output to set the artifact directory.');
+  }
+  if (options.artifacts) {
+    console.warn(
+      'Warning: --artifacts is deprecated. Use --output to set the artifact directory.',
+    );
+  }
+  if (options.benchmarkJson) {
+    console.warn(
+      'Warning: --benchmark-json is deprecated. benchmark.json is always written to the artifact directory.',
+    );
+  }
+  if (normalizeString(input.rawOptions.outputFormat)) {
+    console.warn(
+      'Warning: --output-format is deprecated. The artifact directory always uses JSONL.',
+    );
+  }
+
+  // Resolve artifact directory (runDir) and primary output path.
+  // Precedence: --output > --artifacts (deprecated) > --out (deprecated) > default
+  const explicitDir = options.outputDir ?? options.artifacts;
+  let runDir: string;
+  let outputPath: string;
+  let usesDefaultArtifactWorkspace: boolean;
+
+  if (explicitDir) {
+    // --output or --artifacts: use as artifact directory
+    runDir = path.resolve(explicitDir);
+    mkdirSync(runDir, { recursive: true });
+    outputPath = path.join(runDir, 'index.jsonl');
+    usesDefaultArtifactWorkspace = true;
+  } else if (options.outPath) {
+    // --out (deprecated): use dirname as artifact dir
+    outputPath = path.resolve(options.outPath);
+    runDir = path.dirname(outputPath);
+    mkdirSync(runDir, { recursive: true });
+    usesDefaultArtifactWorkspace = false;
+  } else {
+    // Default: .agentv/results/runs/<run-id>/
+    outputPath = buildDefaultOutputPath(cwd);
+    runDir = path.dirname(outputPath);
+    usesDefaultArtifactWorkspace = true;
+  }
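+  // Resolution examples (illustrative paths):
+  //   --output ./res           -> runDir ./res, outputPath ./res/index.jsonl
+  //   --out results/base.jsonl -> runDir results, outputPath results/base.jsonl
+  //   (no output flags)        -> default run dir under .agentv/results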
 
   // Initialize OTel exporter if --export-otel flag is set or file export flags are used
   let otelExporter: OtelTraceExporterType | null = null;
@@ -998,23 +1040,13 @@ export async function runEvalCommand(
 
   const primaryWritePath = outputPath;
 
-  // Resolve -o / --output paths (new multi-format support)
-  const extraOutputPaths = options.outputPaths.map((p) => path.resolve(p));
-
-  // Build the primary output writer (from --out / default)
-  // When extra --output paths are provided, combine all into a multi-writer
-  const allOutputPaths =
-    extraOutputPaths.length > 0 ? [primaryWritePath, ...extraOutputPaths] : [primaryWritePath];
-  const uniqueOutputPaths = [...new Set(allOutputPaths)];
-  const reportedOutputPaths =
-    extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
-  const uniqueReportedOutputPaths = [...new Set(reportedOutputPaths)];
+  // Resolve --export paths (additional output files)
+  const resolvedExportPaths = options.exportPaths.map((p: string) => path.resolve(p));
 
-  if (uniqueOutputPaths.length === 1) {
-    console.log(`Output path: ${outputPath}`);
-  } else {
-    console.log('Output paths:');
-    for (const p of uniqueReportedOutputPaths) {
+  console.log(`Artifact directory: ${runDir}`);
+  if (resolvedExportPaths.length > 0) {
+    console.log('Export files:');
+    for (const p of resolvedExportPaths) {
       console.log(`  ${p}`);
     }
   }
@@ -1141,16 +1173,11 @@ export async function runEvalCommand(
     throw new Error('--threshold must be between 0 and 1');
   }
 
-  // Build the output writer (deferred until after threshold is resolved so JUnit
-  // writer can use the resolved threshold for per-test pass/fail decisions)
+  // Build the output writer. Primary output is always JSONL to the artifact directory.
+  // Additional --export paths get their own writers that receive all results after the run.
   const writerOptions =
     resolvedThreshold !== undefined ? { threshold: resolvedThreshold } : undefined;
-  let outputWriter: OutputWriter;
-  if (uniqueOutputPaths.length === 1) {
-    outputWriter = await createOutputWriter(primaryWritePath, options.format);
-  } else {
-    outputWriter = await createMultiWriter(uniqueOutputPaths, writerOptions);
-  }
+  const outputWriter: OutputWriter = await createOutputWriter(primaryWritePath, 'jsonl');
 
   // Detect matrix mode: multiple targets for any file
   const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
@@ -1366,25 +1393,25 @@ export async function runEvalCommand(
     console.log(formatMatrixSummary(allResults));
   }
 
-  // Write Agent Skills benchmark.json if requested
+  // Write Agent Skills benchmark.json if requested (deprecated flag — backward compat)
   if (options.benchmarkJson && allResults.length > 0) {
     const benchmarkPath = path.resolve(options.benchmarkJson);
     await writeBenchmarkJson(benchmarkPath, allResults);
     console.log(`Benchmark written to: ${benchmarkPath}`);
   }
 
-  if (usesDefaultArtifactWorkspace) {
+  // Write artifacts to the run directory (always, not conditional on flags)
+  if (usesDefaultArtifactWorkspace && allResults.length > 0) {
     const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
-    const workspaceDir = path.dirname(outputPath);
     const {
       testArtifactDir,
       timingPath,
       benchmarkPath: workspaceBenchmarkPath,
       indexPath,
-    } = await writeArtifactsFromResults(allResults, workspaceDir, {
+    } = await writeArtifactsFromResults(allResults, runDir, {
       evalFile,
     });
-    console.log(`Artifact workspace written to: ${workspaceDir}`);
+    console.log(`Artifact workspace written to: ${runDir}`);
     console.log(`  Index: ${indexPath}`);
     console.log(
       `  Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
     );
     console.log(`  Timing: ${timingPath}`);
     console.log(`  Benchmark: ${workspaceBenchmarkPath}`);
   }
 
-  // Write companion artifacts (grading, timing, benchmark) if requested
-  if (options.artifacts) {
-    const artifactsDir = path.resolve(options.artifacts);
-    const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
-    const {
-      testArtifactDir,
-      indexPath,
-      timingPath,
-      benchmarkPath: abp,
-    } = await writeArtifactsFromResults(allResults, artifactsDir, {
-      evalFile,
-    });
-    console.log(`Artifacts written to: ${artifactsDir}`);
-    console.log(`  Index: ${indexPath}`);
+  // Write --export output files (additional formats)
+  if (resolvedExportPaths.length > 0 && allResults.length > 0) {
+    for (const exportPath of resolvedExportPaths) {
+      const writer = await createWriterFromPath(exportPath, writerOptions);
+      for (const result of allResults) {
+        await writer.append(result);
+      }
+      await writer.close();
+    }
     console.log(
-      `  Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
+      `Export file(s) written: ${resolvedExportPaths.map((p) => path.relative(cwd, p)).join(', ')}`,
     );
-    console.log(`  Timing: ${timingPath}`);
-    console.log(`  Benchmark: ${abp}`);
   }
 
   // Print workspace paths for failed cases (when preserved for debugging)
@@ -1426,14 +1446,7 @@ export async function runEvalCommand(
   }
 
   if (allResults.length > 0) {
-    if (uniqueReportedOutputPaths.length === 1) {
-      console.log(`\nResults written to: ${outputPath}`);
-    } else {
-      console.log('\nResults written to:');
-      for (const p of uniqueReportedOutputPaths) {
-        console.log(`  ${p}`);
-      }
-    }
+    console.log(`\nResults written to: ${outputPath}`);
 
     // Persist last run path for `agentv results` commands
     await saveRunCache(cwd, outputPath).catch(() => undefined);
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index 5b3a8b8a3..1d8199494 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -165,11 +165,14 @@ async function runCli(
 
 function extractOutputPath(stdout: string): string {
   const lines = stdout.split(/\r?\n/);
-  const outputLine = lines.find((line) => line.startsWith('Output path:'));
+  // Try new format first, then legacy
+  const outputLine =
+    lines.find((line) => line.startsWith('Results written to:')) ??
+    lines.find((line) => line.startsWith('Output path:'));
   if (!outputLine) {
     throw new Error(`Unable to parse output path from CLI output:\n${stdout}`);
   }
-  return outputLine.replace('Output path:', '').trim();
+  return outputLine.replace(/^(Results written to:|Output path:)/, '').trim();
 }
 
 async function readJsonLines(filePath: string): Promise {
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
index 58035377d..b1a2ced9a 100644
--- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -78,10 +78,24 @@ agentv eval --dry-run evals/my-eval.yaml
 Dry-run returns mock responses that don't match evaluator output schemas. Use it only for testing harness flow, not evaluator logic.
 :::
 
-### Output to Specific File
+### Custom Output Directory
+
+Write all artifacts (index.jsonl, benchmark.json, per-test grading/timing) to a specific directory:
 
 ```bash
-agentv eval evals/my-eval.yaml --out results/baseline.jsonl
+agentv eval evals/my-eval.yaml --output ./my-results
+```
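+
+The directory then holds the primary results plus companion artifacts (sketch; exact per-test layout may vary):
+
+```bash
+ls ./my-results
+# index.jsonl  benchmark.json  timing.json  plus one directory per test
+```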
+
+### Export Additional Formats
+
+Write additional output files alongside the artifact directory. Format is inferred from the file extension (`.jsonl`, `.json`, `.xml`, `.yaml`, `.html`):
+
+```bash
+# Export JUnit XML for CI test reporters
+agentv eval evals/my-eval.yaml --export results.xml
+
+# Export multiple formats
+agentv eval evals/my-eval.yaml --output ./my-results --export results.xml --export results.html
 ```
 
 ### Trace Persistence
diff --git a/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx b/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx
index 04bebd443..d93639150 100644
--- a/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx
+++ b/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx
@@ -107,10 +107,11 @@ The rest of the bundle follows the same pattern:
 
 ## Benchmark output
 
-Generate an Agent Skills compatible `benchmark.json` alongside the standard result JSONL:
+Generate an Agent Skills compatible `benchmark.json` alongside the standard result JSONL. The `benchmark.json` is automatically written to the artifact directory:
 
 ```bash
-agentv eval evals.json --target claude --benchmark-json benchmark.json
+agentv eval evals.json --target claude --output ./results
+# benchmark.json is written to ./results/benchmark.json
 ```
 
 The benchmark uses AgentV's pass threshold (score >= 0.8) to map continuous scores to the binary pass/fail that Agent Skills `pass_rate` expects:
diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md
index 416017812..162b2dbd4 100644
--- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md
+++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md
@@ -146,7 +146,7 @@ Set `SUBAGENT_EVAL_MODE` in `.env` at the project root as the default when no mo
 **AgentV CLI mode** (end-to-end, EVAL.yaml):
 
 ```bash
-agentv eval --artifacts .agentv/artifacts/
+agentv eval --output .agentv/artifacts/
 ```
 
 **Subagent mode** — read `references/subagent-pipeline.md` for the detailed procedure. In brief: use `pipeline input` to extract inputs, dispatch one `executor` subagent per test case (all in parallel), then proceed to grading below.
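+
+For CI runs, the same invocation can additionally export a JUnit report via the new flag (illustrative path):
+
+```bash
+agentv eval --output .agentv/artifacts/ --export .agentv/artifacts/junit.xml
+```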