EntityProcess · christso · Apr 12, 2026 · Apr 12, 2026 · Apr 12, 2026 · Apr 12, 2026
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
@@ -41,7 +41,7 @@ export const evalRunCommand = command({
       type: optional(number),
       long: 'workers',
       description:
-        'Number of parallel workers (default: 3, max: 50). Can also be set per-target in targets.yaml',
+        'Number of parallel test cases within each eval file (default: 3, max: 50). Eval files always run sequentially. Can also be set per-target in targets.yaml',
     }),
     out: option({
       type: optional(string),

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -615,25 +615,6 @@ async function prepareFileMetadata(params: {
   };
 }
 
-async function runWithLimit<T>(
-  items: readonly T[],
-  limit: number,
-  task: (item: T) => Promise<void>,
-): Promise<void> {
-  const safeLimit = Math.max(1, limit);
-  let index = 0;
-
-  const workers = Array.from({ length: safeLimit }, async () => {
-    while (index < items.length) {
-      const current = items[index];
-      index += 1;
-      await task(current);
-    }
-  });
-
-  await Promise.all(workers);
-}
-
 async function runSingleEvalFile(params: {
   readonly testFilePath: string;
   readonly cwd: string;
@@ -1088,15 +1069,8 @@ export async function runEvalCommand(
   const seenTestCases = new Set<string>();
   const displayIdTracker = createDisplayIdTracker();
 
-  // Derive file-level concurrency from worker count (global) when provided
-  const totalWorkers = options.workers ?? DEFAULT_WORKERS;
-  const fileConcurrency = Math.min(
-    Math.max(1, totalWorkers),
-    Math.max(1, resolvedTestFiles.length),
-  );
-  const perFileWorkers = options.workers
-    ? Math.max(1, Math.floor(totalWorkers / fileConcurrency))
-    : undefined;
+  // Each file gets the full worker budget — no splitting across files
+  const perFileWorkers = options.workers;
   const fileMetadata = new Map<
     string,
     {
@@ -1228,7 +1202,9 @@ export async function runEvalCommand(
     }
     throw new Error('No tests matched the provided filters.');
   }
-  const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
+  const progressReporter = createProgressReporter(options.workers ?? DEFAULT_WORKERS, {
+    verbose: options.verbose,
+  });
   progressReporter.start();
   progressReporter.setTotal(totalEvalCount);
   const seenCodexLogPaths = new Set<string>();
@@ -1309,8 +1285,11 @@ export async function runEvalCommand(
     );
   }
 
+  // Eval files run sequentially; within each file, --workers N test cases run in parallel.
+  // This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file
+  // workspace races without any grouping complexity.
   try {
-    await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => {
+    for (const testFilePath of activeTestFiles) {
       const targetPrep = fileMetadata.get(testFilePath);
       if (!targetPrep) {
         throw new Error(`Missing metadata for ${testFilePath}`);
@@ -1404,7 +1383,7 @@ export async function runEvalCommand(
       for (const results of targetResults) {
         allResults.push(...results);
       }
-    });
+    }
 
     progressReporter.finish();
 

diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -190,6 +190,20 @@ export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer token"
 agentv eval evals/my-eval.yaml --export-otel
 ```
 
+### Parallelism
+
+The `--workers N` flag controls how many **test cases run in parallel within each eval file** (default: 3, max: 50). Eval files always run sequentially — one file completes before the next starts.
+
+```bash
+agentv eval evals/my-eval.yaml --workers 4
+# Up to 4 test cases from the file run concurrently
+
+agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 3
+# Files run one at a time; within each file, up to 3 test cases run in parallel
+```
+
+This matches the standard model used by eval frameworks (promptfoo, deepeval, OpenAI Evals) and avoids cross-file workspace races without any special configuration.
+
 ### Workspace Modes and Finish Policy
 
 Use workspace mode and finish policies instead of multiple conflicting booleans:

diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
@@ -135,6 +135,8 @@ This creates up to 4 slots (`slot-0` through `slot-3`). PID-based lock files pre
 
 The maximum number of pool slots defaults to 10 (capped at 50). Slots are created on demand — a run with 2 workers only creates 2 slots, even if the pool allows 10.
 
+**Multiple eval files:** When you pass multiple eval files to `agentv eval`, they run sequentially — one file completes before the next starts (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Within each file, concurrent workers still use pool slots as described above.
+
 ## Drift detection
 
 If you change the workspace config (e.g., update a repo URL or checkout ref), the computed fingerprint changes. AgentV detects this drift by comparing the stored `metadata.json` fingerprint against the newly computed one:

diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
@@ -194,6 +194,8 @@ export type EvalSuiteResult = {
   readonly failOnError?: import('./types.js').FailOnError;
   /** Suite-level quality threshold (0-1) — suite fails if mean score is below */
   readonly threshold?: number;
+  /** Resolved workspace.path from the eval YAML (after env-var expansion), if set */
+  readonly workspacePath?: string;
 };
 
 /**
@@ -212,7 +214,11 @@ export async function loadTestSuite(
   if (format === 'agent-skills-json') {
     return { tests: await loadTestsFromAgentSkills(evalFilePath) };
   }
-  const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
+  const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
+    evalFilePath,
+    repoRoot,
+    options,
+  );
   const metadata = parseMetadata(parsed);
   const failOnError = extractFailOnError(parsed);
   const threshold = extractThreshold(parsed);
@@ -226,6 +232,7 @@ export async function loadTestSuite(
     ...(metadata !== undefined && { metadata }),
     ...(failOnError !== undefined && { failOnError }),
     ...(threshold !== undefined && { threshold }),
+    ...(suiteWorkspacePath !== undefined && { workspacePath: suiteWorkspacePath }),
   };
 }
 
@@ -256,7 +263,7 @@ async function loadTestsFromYaml(
   evalFilePath: string,
   repoRoot: URL | string,
   options?: LoadOptions,
-): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject }> {
+): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject; suiteWorkspacePath?: string }> {
   // YAML parsing (existing implementation)
   const verbose = options?.verbose ?? false;
   const filterPattern = options?.filter;
@@ -524,7 +531,7 @@ async function loadTestsFromYaml(
     results.push(testCase);
   }
 
-  return { tests: results, parsed: suite };
+  return { tests: results, parsed: suite, suiteWorkspacePath: suiteWorkspace?.path };
 }
 
 /**