From 56c8e5c8f30c2277605b4a3147ecf1f25e2a702c Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Mon, 6 Apr 2026 22:27:42 +0000
Subject: [PATCH] refactor(cli): consolidate eval run output flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make --output mean "artifact directory" (the single canonical output).
Add --export for secondary file outputs (HTML, XML, YAML, JSON).
Deprecate --out, --output-format, --benchmark-json, --artifacts with
warnings — all continue working for backward compatibility.

- --output <dir>: artifact directory (index.jsonl, benchmark.json,
  per-test grading/timing)
- --export <file>: repeatable flag for additional output formats,
  format inferred from extension
- Primary writer always JSONL; removed dead OutputFormat plumbing
- Updated CI workflow, docs, and skills to use new flags
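
Example invocation (illustrative paths; new flags only):

    agentv eval suite.yaml --output .agentv/run \
      --export run-report.html --export junit.xml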

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .github/workflows/evals.yml                 |   5 +-
 apps/cli/src/commands/eval/commands/run.ts  |  22 ++-
 apps/cli/src/commands/eval/run-eval.ts      | 161 ++++++++++--------
 apps/cli/test/eval.integration.test.ts      |   7 +-
 .../docs/docs/evaluation/running-evals.mdx  |  18 +-
 .../docs/docs/guides/agent-skills-evals.mdx |   5 +-
 .../agentv-dev/skills/agentv-bench/SKILL.md |   2 +-
 7 files changed, 129 insertions(+), 91 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index 5fa81e046..c129ab86e 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -88,9 +88,8 @@ jobs:
             "${TARGET_FLAG[@]}" \
             --workers 3 \
             --threshold ${{ steps.filter.outputs.threshold }} \
-            --output .agentv/ci-results/junit.xml \
-            --benchmark-json .agentv/ci-results/benchmark.json \
-            --artifacts .agentv/ci-results/artifacts
+            --output .agentv/ci-results/artifacts \
+            --export .agentv/ci-results/junit.xml
           EXIT_CODE=$?
           echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT"
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index d46665abe..098cffa5c 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -46,19 +46,25 @@ export const evalRunCommand = command({
     out: option({
       type: optional(string),
       long: 'out',
-      description: 'Write results to the specified path',
+      description: '[Deprecated: use --output] Write results to the specified path',
     }),
-    output: multioption({
-      type: array(string),
+    output: option({
+      type: optional(string),
       long: 'output',
       short: 'o',
       description:
-        'Output file path(s). Format inferred from extension: .jsonl, .json, .xml, .yaml, .html',
+        'Artifact directory for run output (index.jsonl, benchmark.json, per-test grading/timing)',
     }),
     outputFormat: option({
       type: optional(string),
       long: 'output-format',
-      description: "Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)",
+      description: "[Deprecated] Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)",
+    }),
+    export: multioption({
+      type: array(string),
+      long: 'export',
+      description:
+        'Write additional output file(s). Format inferred from extension: .jsonl, .json, .xml, .yaml, .html (repeatable)',
     }),
     dryRun: flag({
       long: 'dry-run',
@@ -151,13 +157,14 @@ export const evalRunCommand = command({
     benchmarkJson: option({
       type: optional(string),
       long: 'benchmark-json',
-      description: 'Write Agent Skills benchmark.json to the specified path',
+      description:
+        '[Deprecated: benchmark.json is included in artifact dir] Write Agent Skills benchmark.json to the specified path',
     }),
     artifacts: option({
       type: optional(string),
       long: 'artifacts',
       description:
-        'Write companion artifacts (index.jsonl, <test>/grading.json, <test>/timing.json, timing.json, benchmark.json) to the specified directory',
+        '[Deprecated: use --output] Write companion artifacts to the specified directory',
     }),
     graderTarget: option({
       type: optional(string),
@@ -216,6 +223,7 @@ export const evalRunCommand = command({
       out: args.out,
       output: args.output,
      outputFormat: args.outputFormat,
+      export: args.export,
       dryRun: args.dryRun,
       dryRunDelay: args.dryRunDelay,
       dryRunDelayMin: args.dryRunDelayMin,
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 5b316c6b2..4b052d32c 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -31,12 +31,7 @@ import { enforceRequiredVersion } from '../../version-check.js';
 import { writeArtifactsFromResults } from './artifact-writer.js';
 import { writeBenchmarkJson } from './benchmark-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
-import {
-  type OutputFormat,
-  type OutputWriter,
-  createMultiWriter,
-  createOutputWriter,
-} from './output-writer.js';
+import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
 import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
 import { buildDefaultRunDir } from './result-layout.js';
 import { loadErrorTestIds, loadNonErrorResults } from './retry-errors.js';
@@ -62,9 +57,12 @@ interface NormalizedOptions {
   readonly targetsPath?: string;
   readonly filter?: string | readonly string[];
   readonly workers?: number;
+  /** --output <dir>: artifact directory (new canonical meaning) */
+  readonly outputDir?: string;
+  /** Legacy --out <path>: deprecated, treated as artifact dir */
   readonly outPath?: string;
-  readonly outputPaths: readonly string[];
-  readonly format: OutputFormat;
+  /** --export <file>: additional output files */
+  readonly exportPaths: readonly string[];
   readonly dryRun: boolean;
   readonly dryRunDelay: number;
   readonly dryRunDelayMin: number;
@@ -82,7 +80,9 @@ interface NormalizedOptions {
   readonly retryErrors?: string;
   readonly workspaceMode?: 'pooled' | 'temp' | 'static';
   readonly workspacePath?: string;
+  /** Deprecated: benchmark.json is always written to artifact dir */
   readonly benchmarkJson?: string;
+  /** Deprecated: use --output instead */
   readonly artifacts?: string;
   readonly graderTarget?: string;
   readonly model?: string;
@@ -247,18 +247,17 @@ function normalizeOptions(
   config?: Awaited>,
   yamlExecution?: ExecutionDefaults,
 ): NormalizedOptions {
-  const cliFormat = normalizeString(rawOptions.outputFormat);
-  const configFormat = config?.output?.format;
-  const formatStr = cliFormat ?? configFormat ?? 'jsonl';
-  const format: OutputFormat = formatStr === 'yaml' ? 'yaml' : 'jsonl';
-
   const cliWorkers = normalizeOptionalNumber(rawOptions.workers);
   const configWorkers = config?.execution?.workers;
   const workers = cliWorkers ?? configWorkers ?? 0;
 
-  const rawOutputPaths = rawOptions.output;
-  const outputPaths: string[] = Array.isArray(rawOutputPaths)
-    ? rawOutputPaths.filter((v): v is string => typeof v === 'string' && v.trim().length > 0)
+  // --output is now a single optional string (artifact directory)
+  const cliOutputDir = normalizeString(rawOptions.output);
+
+  // --export is the new repeatable flag for additional output files
+  const rawExportPaths = rawOptions.export;
+  const exportPaths: string[] = Array.isArray(rawExportPaths)
+    ? rawExportPaths.filter((v): v is string => typeof v === 'string' && v.trim().length > 0)
     : [];
 
   // Normalize --target: can be a string (legacy) or string[] (multioption)
@@ -313,9 +312,9 @@ function normalizeOptions(
     targetsPath: normalizeString(rawOptions.targets),
     filter: normalizeFilter(rawOptions.filter),
     workers: workers > 0 ? workers : undefined,
+    outputDir: cliOutputDir,
     outPath: cliOut ?? configOut,
-    outputPaths,
-    format,
+    exportPaths,
     dryRun: normalizeBoolean(rawOptions.dryRun),
     dryRunDelay: normalizeNumber(rawOptions.dryRunDelay, 0),
     dryRunDelayMin: normalizeNumber(rawOptions.dryRunDelayMin, 0),
@@ -937,8 +936,51 @@ export async function runEvalCommand(
     console.log(`Repository root: ${repoRoot}`);
   }
 
-  const usesDefaultArtifactWorkspace = !options.outPath;
-  const outputPath = options.outPath ? path.resolve(options.outPath) : buildDefaultOutputPath(cwd);
+  // Emit deprecation warnings for legacy flags
+  if (options.outPath) {
+    console.warn('Warning: --out is deprecated. Use --output to set the artifact directory.');
+  }
+  if (options.artifacts) {
+    console.warn(
+      'Warning: --artifacts is deprecated. Use --output to set the artifact directory.',
+    );
+  }
+  if (options.benchmarkJson) {
+    console.warn(
+      'Warning: --benchmark-json is deprecated. benchmark.json is always written to the artifact directory.',
+    );
+  }
+  if (normalizeString(input.rawOptions.outputFormat)) {
+    console.warn(
+      'Warning: --output-format is deprecated. The artifact directory always uses JSONL.',
+    );
+  }
+
+  // Resolve artifact directory (runDir) and primary output path.
+  // Precedence: --output > --artifacts (deprecated) > --out (deprecated) > default
+  const explicitDir = options.outputDir ?? options.artifacts;
+  let runDir: string;
+  let outputPath: string;
+  let usesDefaultArtifactWorkspace: boolean;
+
+  if (explicitDir) {
+    // --output or --artifacts: use as artifact directory
+    runDir = path.resolve(explicitDir);
+    mkdirSync(runDir, { recursive: true });
+    outputPath = path.join(runDir, 'index.jsonl');
+    usesDefaultArtifactWorkspace = true;
+  } else if (options.outPath) {
+    // --out (deprecated): use dirname as artifact dir
+    outputPath = path.resolve(options.outPath);
+    runDir = path.dirname(outputPath);
+    mkdirSync(runDir, { recursive: true });
+    usesDefaultArtifactWorkspace = false;
+  } else {
+    // Default: .agentv/results/runs/<run-id>/
+    outputPath = buildDefaultOutputPath(cwd);
+    runDir = path.dirname(outputPath);
+    usesDefaultArtifactWorkspace = true;
+  }
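+  // Resolution examples (illustrative paths):
+  //   --output ./res           -> runDir ./res, outputPath ./res/index.jsonl
+  //   --out results/base.jsonl -> runDir results, outputPath results/base.jsonl
+  //   (no output flags)        -> default run dir under .agentv/results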
 
   // Initialize OTel exporter if --export-otel flag is set or file export flags are used
   let otelExporter: OtelTraceExporterType | null = null;
@@ -998,23 +1040,13 @@ export async function runEvalCommand(
 
   const primaryWritePath = outputPath;
 
-  // Resolve -o / --output paths (new multi-format support)
-  const extraOutputPaths = options.outputPaths.map((p) => path.resolve(p));
-
-  // Build the primary output writer (from --out / default)
-  // When extra --output paths are provided, combine all into a multi-writer
-  const allOutputPaths =
-    extraOutputPaths.length > 0 ? [primaryWritePath, ...extraOutputPaths] : [primaryWritePath];
-  const uniqueOutputPaths = [...new Set(allOutputPaths)];
-  const reportedOutputPaths =
-    extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
-  const uniqueReportedOutputPaths = [...new Set(reportedOutputPaths)];
+  // Resolve --export paths (additional output files)
+  const resolvedExportPaths = options.exportPaths.map((p: string) => path.resolve(p));
 
-  if (uniqueOutputPaths.length === 1) {
-    console.log(`Output path: ${outputPath}`);
-  } else {
-    console.log('Output paths:');
-    for (const p of uniqueReportedOutputPaths) {
+  console.log(`Artifact directory: ${runDir}`);
+  if (resolvedExportPaths.length > 0) {
+    console.log('Export files:');
+    for (const p of resolvedExportPaths) {
       console.log(`  ${p}`);
     }
   }
@@ -1141,16 +1173,11 @@ export async function runEvalCommand(
     throw new Error('--threshold must be between 0 and 1');
   }
 
-  // Build the output writer (deferred until after threshold is resolved so JUnit
-  // writer can use the resolved threshold for per-test pass/fail decisions)
+  // Build the output writer. Primary output is always JSONL to the artifact directory.
+  // Additional --export paths get their own writers that receive all results after the run.
   const writerOptions =
     resolvedThreshold !== undefined ? { threshold: resolvedThreshold } : undefined;
-  let outputWriter: OutputWriter;
-  if (uniqueOutputPaths.length === 1) {
-    outputWriter = await createOutputWriter(primaryWritePath, options.format);
-  } else {
-    outputWriter = await createMultiWriter(uniqueOutputPaths, writerOptions);
-  }
+  const outputWriter: OutputWriter = await createOutputWriter(primaryWritePath, 'jsonl');
 
   // Detect matrix mode: multiple targets for any file
   const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
@@ -1366,25 +1393,25 @@ export async function runEvalCommand(
     console.log(formatMatrixSummary(allResults));
   }
 
-  // Write Agent Skills benchmark.json if requested
+  // Write Agent Skills benchmark.json if requested (deprecated flag — backward compat)
   if (options.benchmarkJson && allResults.length > 0) {
     const benchmarkPath = path.resolve(options.benchmarkJson);
     await writeBenchmarkJson(benchmarkPath, allResults);
     console.log(`Benchmark written to: ${benchmarkPath}`);
   }
 
-  if (usesDefaultArtifactWorkspace) {
+  // Write artifacts to the run directory (always, not conditional on flags)
+  if (usesDefaultArtifactWorkspace && allResults.length > 0) {
     const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
-    const workspaceDir = path.dirname(outputPath);
     const {
       testArtifactDir,
       timingPath,
       benchmarkPath: workspaceBenchmarkPath,
       indexPath,
-    } = await writeArtifactsFromResults(allResults, workspaceDir, {
+    } = await writeArtifactsFromResults(allResults, runDir, {
       evalFile,
     });
-    console.log(`Artifact workspace written to: ${workspaceDir}`);
+    console.log(`Artifact workspace written to: ${runDir}`);
     console.log(`  Index: ${indexPath}`);
     console.log(
       `  Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
     );
     console.log(`  Timing: ${timingPath}`);
     console.log(`  Benchmark: ${workspaceBenchmarkPath}`);
   }
 
-  // Write companion artifacts (grading, timing, benchmark) if requested
-  if (options.artifacts) {
-    const artifactsDir = path.resolve(options.artifacts);
-    const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
-    const {
-      testArtifactDir,
-      indexPath,
-      timingPath,
-      benchmarkPath: abp,
-    } = await writeArtifactsFromResults(allResults, artifactsDir, {
-      evalFile,
-    });
-    console.log(`Artifacts written to: ${artifactsDir}`);
-    console.log(`  Index: ${indexPath}`);
+  // Write --export output files (additional formats)
+  if (resolvedExportPaths.length > 0 && allResults.length > 0) {
+    for (const exportPath of resolvedExportPaths) {
+      const writer = await createWriterFromPath(exportPath, writerOptions);
+      for (const result of allResults) {
+        await writer.append(result);
+      }
+      await writer.close();
+    }
     console.log(
-      `  Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
+      `Export file(s) written: ${resolvedExportPaths.map((p) => path.relative(cwd, p)).join(', ')}`,
     );
-    console.log(`  Timing: ${timingPath}`);
-    console.log(`  Benchmark: ${abp}`);
   }
 
   // Print workspace paths for failed cases (when preserved for debugging)
@@ -1426,14 +1446,7 @@ export async function runEvalCommand(
   }
 
   if (allResults.length > 0) {
-    if (uniqueReportedOutputPaths.length === 1) {
-      console.log(`\nResults written to: ${outputPath}`);
-    } else {
-      console.log('\nResults written to:');
-      for (const p of uniqueReportedOutputPaths) {
-        console.log(`  ${p}`);
-      }
-    }
+    console.log(`\nResults written to: ${outputPath}`);
 
     // Persist last run path for `agentv results` commands
     await saveRunCache(cwd, outputPath).catch(() => undefined);
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index 5b3a8b8a3..1d8199494 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -165,11 +165,14 @@ async function runCli(
 
 function extractOutputPath(stdout: string): string {
   const lines = stdout.split(/\r?\n/);
-  const outputLine = lines.find((line) => line.startsWith('Output path:'));
+  // Try new format first, then legacy
+  const outputLine =
+    lines.find((line) => line.startsWith('Results written to:')) ??
+    lines.find((line) => line.startsWith('Output path:'));
   if (!outputLine) {
     throw new Error(`Unable to parse output path from CLI output:\n${stdout}`);
   }
-  return outputLine.replace('Output path:', '').trim();
+  return outputLine.replace(/^(Results written to:|Output path:)/, '').trim();
 }
 
 async function readJsonLines(filePath: string): Promise {
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
index 58035377d..b1a2ced9a 100644
--- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -78,10 +78,24 @@ agentv eval --dry-run evals/my-eval.yaml
 Dry-run returns mock responses that don't match evaluator output schemas. Use it only for testing harness flow, not evaluator logic.
 :::
 
-### Output to Specific File
+### Custom Output Directory
+
+Write all artifacts (index.jsonl, benchmark.json, per-test grading/timing) to a specific directory:
 
 ```bash
-agentv eval evals/my-eval.yaml --out results/baseline.jsonl
+agentv eval evals/my-eval.yaml --output ./my-results
+```
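+
+The directory then holds the primary results plus companion artifacts (sketch; exact per-test layout may vary):
+
+```bash
+ls ./my-results
+# index.jsonl  benchmark.json  timing.json  plus one directory per test
+```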
+
+### Export Additional Formats
+
+Write additional output files alongside the artifact directory. Format is inferred from the file extension (`.jsonl`, `.json`, `.xml`, `.yaml`, `.html`):
+
+```bash
+# Export JUnit XML for CI test reporters
+agentv eval evals/my-eval.yaml --export results.xml
+
+# Export multiple formats
+agentv eval evals/my-eval.yaml --output ./my-results --export results.xml --export results.html
 ```
 
 ### Trace Persistence
diff --git a/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx b/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx
index 04bebd443..d93639150 100644
--- a/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx
+++ b/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx
@@ -107,10 +107,11 @@ The rest of the bundle follows the same pattern:
 
 ## Benchmark output
 
-Generate an Agent Skills compatible `benchmark.json` alongside the standard result JSONL:
+Generate an Agent Skills compatible `benchmark.json` alongside the standard result JSONL. The `benchmark.json` is automatically written to the artifact directory:
 
 ```bash
-agentv eval evals.json --target claude --benchmark-json benchmark.json
+agentv eval evals.json --target claude --output ./results
+# benchmark.json is written to ./results/benchmark.json
 ```
 
 The benchmark uses AgentV's pass threshold (score >= 0.8) to map continuous scores to the binary pass/fail that Agent Skills `pass_rate` expects:
diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md
index 416017812..162b2dbd4 100644
--- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md
+++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md
@@ -146,7 +146,7 @@ Set `SUBAGENT_EVAL_MODE` in `.env` at the project root as the default when no mo
 **AgentV CLI mode** (end-to-end, EVAL.yaml):
 
 ```bash
-agentv eval --artifacts .agentv/artifacts/
+agentv eval --output .agentv/artifacts/
 ```
 
 **Subagent mode** — read `references/subagent-pipeline.md` for the detailed procedure. In brief: use `pipeline input` to extract inputs, dispatch one `executor` subagent per test case (all in parallel), then proceed to grading below.
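+
+For CI runs, the same invocation can additionally export a JUnit report via the new flag (illustrative path):
+
+```bash
+agentv eval --output .agentv/artifacts/ --export .agentv/artifacts/junit.xml
+```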