From 3f0aff322cb2362de569af4fe8b3c2f65b156892 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 12 Apr 2026 06:51:18 +0000
Subject: [PATCH 1/8] docs: clarify --workers parallelism scope and
 workspace-mode semantics (#1039)

Answers the questions from #1039:
- --workers N is a global concurrent test-case budget: with a single
  eval file, up to N test cases run in parallel; with M eval files,
  min(N, M) files run concurrently, each with floor(N / min(N, M))
  workers.
- Documents the static workspace race condition when multiple eval files
  share the same path and run concurrently, with --workers 1 as the
  serialization escape hatch.
- Updates workspace-pool.mdx concurrency section to explain multi-file
  slot allocation and the static-workspace cross-file hazard.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/cli/src/commands/eval/commands/run.ts    |  2 +-
 .../docs/docs/evaluation/running-evals.mdx    | 31 +++++++++++++++++++
 .../docs/docs/guides/workspace-pool.mdx       |  4 +++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index 282d8d655..33ce025d4 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -41,7 +41,7 @@ export const evalRunCommand = command({
       type: optional(number),
       long: 'workers',
       description:
-        'Number of parallel workers (default: 3, max: 50). Can also be set per-target in targets.yaml',
+        'Maximum concurrent test cases across the run (default: 3, max: 50). With a single eval file, N test cases run in parallel. With multiple eval files, the budget is split: up to min(N, files) eval files run concurrently, each receiving floor(N / concurrent_files) workers. Use --workers 1 to serialize everything. Can also be set per-target in targets.yaml',
     }),
     out: option({
       type: optional(string),
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
index 341b71d30..df0a6b6db 100644
--- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -190,6 +190,37 @@ export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer token"
 agentv eval evals/my-eval.yaml --export-otel
 ```
 
+### Parallelism
+
+The `--workers N` flag is a **global concurrent test-case budget** for the entire run.
+
+**Single eval file** — up to N test cases run in parallel within that file:
+
+```bash
+agentv eval evals/my-eval.yaml --workers 4
+# Up to 4 test cases run concurrently
+```
+
+**Multiple eval files** — workers are split across files: `min(N, files)` eval files run concurrently, each getting `floor(N / concurrent_files)` workers for intra-file parallelism:
+
+```bash
+agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 6
+# 3 eval files run concurrently, each with 2 workers → up to 6 concurrent test cases total
+```
+
+:::caution[Static workspace race conditions]
+When multiple eval files reference the **same static workspace path** (e.g., via an env var like `EVAL_WORKSPACE_PATH`), concurrent execution can corrupt the workspace — for example, one file resetting a repo while another file's tests are running against it.
+
+To serialize all eval files (one file at a time), use `--workers 1`:
+
+```bash
+agentv eval evals/**/*.yaml --workers 1
+# One file at a time; one test case at a time within each file
+```
+
+Alternatively, use `--workspace-mode pooled` (the default for evals with `repos`). Pooled mode allocates separate slots per worker, so concurrent test cases and concurrent eval files never share a workspace directory. See [Workspace Pool](/docs/guides/workspace-pool/) for details.
+:::
+
 ### Workspace Modes and Finish Policy
 
 Use workspace mode and finish policies instead of multiple conflicting booleans:
diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
index 0969f710b..6a5a99c89 100644
--- a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
+++ b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
@@ -135,6 +135,10 @@ This creates up to 4 slots (`slot-0` through `slot-3`). PID-based lock files pre
 
 The maximum number of pool slots defaults to 10 (capped at 50). Slots are created on demand — a run with 2 workers only creates 2 slots, even if the pool allows 10.
 
+**Multiple eval files:** When you pass multiple eval files to `agentv eval`, they may run concurrently (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Pooled workspaces handle this safely — each active worker (across all eval files) acquires its own slot. Eval files that share the same workspace config (same fingerprint) draw from the same pool; eval files with different configs maintain separate pools. No cross-file workspace contention occurs in pooled mode.
+
+**Static workspaces and multiple eval files:** If you use `--workspace-mode static --workspace-path /path` across multiple eval files, all files will share that single directory concurrently. This can cause race conditions — for example, one file running `git checkout <sha>` while another file's tests read from the same repository. To prevent this, either use `--workers 1` to serialize execution or use pooled mode instead.
+
 ## Drift detection
 
 If you change the workspace config (e.g., update a repo URL or checkout ref), the computed fingerprint changes. AgentV detects this drift by comparing the stored `metadata.json` fingerprint against the newly computed one:

From ece25fce66cb398fe483a9eab59a11701f054f1f Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 12 Apr 2026 07:10:43 +0000
Subject: [PATCH 2/8] feat: warn when multiple eval files share a static
 workspace path and run concurrently

Surfaces the resolved workspace.path on EvalSuiteResult (as
workspacePath) so the CLI can detect cross-file workspace collisions
before starting concurrent execution. Emits a console.warn pointing
users to --workers 1 as the fix.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/cli/src/commands/eval/run-eval.ts      | 32 +++++++++++++++++++++
 packages/core/src/evaluation/yaml-parser.ts |  9 ++++--
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index b7ad4ffa1..e9f4463ac 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -498,6 +498,7 @@ async function prepareFileMetadata(params: {
   readonly failOnError?: FailOnError;
   readonly threshold?: number;
   readonly tags?: readonly string[];
+  readonly workspacePath?: string;
 }> {
   const { testFilePath, repoRoot, cwd, options } = params;
 
@@ -612,6 +613,7 @@ async function prepareFileMetadata(params: {
     failOnError: suite.failOnError,
     threshold: suite.threshold,
     tags: suite.metadata?.tags,
+    workspacePath: suite.workspacePath,
   };
 }
 
@@ -1115,6 +1117,7 @@ export async function runEvalCommand(
       readonly failOnError?: FailOnError;
       readonly threshold?: number;
       readonly tags?: readonly string[];
+      readonly workspacePath?: string;
     }
   >();
   // Separate TypeScript/JS eval files from YAML files.
@@ -1284,6 +1287,35 @@ export async function runEvalCommand(
   // Use only files that survived tag filtering (fileMetadata keys)
   const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
 
+  // Warn when multiple eval files share a static workspace path and will run concurrently,
+  // since concurrent writes to the same directory can corrupt each other's runs.
+  if (fileConcurrency > 1 && activeTestFiles.length > 1) {
+    const cliPath = options.workspacePath;
+    if (cliPath) {
+      console.warn(
+        `Warning: ${activeTestFiles.length} eval files share --workspace-path "${cliPath}" and will run concurrently. ` +
+          `Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`,
+      );
+    } else {
+      const pathToFiles = new Map<string, string[]>();
+      for (const [filePath, meta] of fileMetadata.entries()) {
+        if (meta.workspacePath) {
+          const group = pathToFiles.get(meta.workspacePath) ?? [];
+          group.push(path.relative(cwd, filePath));
+          pathToFiles.set(meta.workspacePath, group);
+        }
+      }
+      for (const [wsPath, files] of pathToFiles.entries()) {
+        if (files.length > 1) {
+          console.warn(
+            `Warning: ${files.length} eval files share workspace path "${wsPath}" and will run concurrently (${files.join(', ')}). ` +
+              `Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`,
+          );
+        }
+      }
+    }
+  }
+
   // --transcript: create a shared TranscriptProvider and validate line count
   let transcriptProviderFactory:
     | ((target: import('@agentv/core').ResolvedTarget) => import('@agentv/core').Provider)
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index 9e6f7de1e..6c6b95ea8 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -194,6 +194,8 @@ export type EvalSuiteResult = {
   readonly failOnError?: import('./types.js').FailOnError;
   /** Suite-level quality threshold (0-1) — suite fails if mean score is below */
   readonly threshold?: number;
+  /** Resolved workspace.path from the eval YAML (after env-var expansion), if set */
+  readonly workspacePath?: string;
 };
 
 /**
@@ -212,7 +214,7 @@ export async function loadTestSuite(
   if (format === 'agent-skills-json') {
     return { tests: await loadTestsFromAgentSkills(evalFilePath) };
   }
-  const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
+  const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
   const metadata = parseMetadata(parsed);
   const failOnError = extractFailOnError(parsed);
   const threshold = extractThreshold(parsed);
@@ -226,6 +228,7 @@ export async function loadTestSuite(
     ...(metadata !== undefined && { metadata }),
     ...(failOnError !== undefined && { failOnError }),
     ...(threshold !== undefined && { threshold }),
+    ...(suiteWorkspacePath !== undefined && { workspacePath: suiteWorkspacePath }),
   };
 }
 
@@ -256,7 +259,7 @@ async function loadTestsFromYaml(
   evalFilePath: string,
   repoRoot: URL | string,
   options?: LoadOptions,
-): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject }> {
+): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject; suiteWorkspacePath?: string }> {
   // YAML parsing (existing implementation)
   const verbose = options?.verbose ?? false;
   const filterPattern = options?.filter;
@@ -524,7 +527,7 @@ async function loadTestsFromYaml(
     results.push(testCase);
   }
 
-  return { tests: results, parsed: suite };
+  return { tests: results, parsed: suite, suiteWorkspacePath: suiteWorkspace?.path };
 }
 
 /**

From 477d4eed9240f5a07b7bc6fc029179d3c4cbe37a Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 12 Apr 2026 07:16:50 +0000
Subject: [PATCH 3/8] =?UTF-8?q?fix:=20lint=20=E2=80=94=20format=20long=20l?=
 =?UTF-8?q?ine=20and=20consolidate=20warn=20strings=20into=20template=20li?=
 =?UTF-8?q?terals?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/cli/src/commands/eval/run-eval.ts      | 6 ++----
 packages/core/src/evaluation/yaml-parser.ts | 6 +++++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index e9f4463ac..d7f518be0 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -1293,8 +1293,7 @@ export async function runEvalCommand(
     const cliPath = options.workspacePath;
     if (cliPath) {
       console.warn(
-        `Warning: ${activeTestFiles.length} eval files share --workspace-path "${cliPath}" and will run concurrently. ` +
-          `Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`,
+        `Warning: ${activeTestFiles.length} eval files share --workspace-path "${cliPath}" and will run concurrently. Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`,
       );
     } else {
       const pathToFiles = new Map<string, string[]>();
@@ -1308,8 +1307,7 @@ export async function runEvalCommand(
       for (const [wsPath, files] of pathToFiles.entries()) {
         if (files.length > 1) {
           console.warn(
-            `Warning: ${files.length} eval files share workspace path "${wsPath}" and will run concurrently (${files.join(', ')}). ` +
-              `Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`,
+            `Warning: ${files.length} eval files share workspace path "${wsPath}" and will run concurrently (${files.join(', ')}). Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`,
           );
         }
       }
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index 6c6b95ea8..66111a9b6 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -214,7 +214,11 @@ export async function loadTestSuite(
   if (format === 'agent-skills-json') {
     return { tests: await loadTestsFromAgentSkills(evalFilePath) };
   }
-  const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
+  const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
+    evalFilePath,
+    repoRoot,
+    options,
+  );
   const metadata = parseMetadata(parsed);
   const failOnError = extractFailOnError(parsed);
   const threshold = extractThreshold(parsed);

From 80f7d8e653e82d86f9c01f044f0ebb7f50e5fe99 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 12 Apr 2026 07:38:03 +0000
Subject: [PATCH 4/8] feat: auto-serialize eval files that share a static
 workspace path

Replace the global worker-budget-split scheduler with workspace-aware
grouping: eval files sharing the same static workspace.path run
sequentially within their group; groups with distinct paths (or no
static workspace) run in parallel. Each file gets the full --workers N
budget with no splitting.

Also removes the now-unnecessary concurrent-workspace warning added
earlier in this series and updates docs + CLI help to reflect the new
semantics.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/cli/src/commands/eval/commands/run.ts    |   2 +-
 apps/cli/src/commands/eval/run-eval.ts        | 245 +++++++++---------
 .../docs/docs/evaluation/running-evals.mdx    |  27 +-
 .../docs/docs/guides/workspace-pool.mdx       |   4 +-
 4 files changed, 127 insertions(+), 151 deletions(-)

diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index 33ce025d4..442edb65a 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -41,7 +41,7 @@ export const evalRunCommand = command({
       type: optional(number),
       long: 'workers',
       description:
-        'Maximum concurrent test cases across the run (default: 3, max: 50). With a single eval file, N test cases run in parallel. With multiple eval files, the budget is split: up to min(N, files) eval files run concurrently, each receiving floor(N / concurrent_files) workers. Use --workers 1 to serialize everything. Can also be set per-target in targets.yaml',
+        'Number of parallel test cases within each eval file (default: 3, max: 50). Eval files run sequentially unless they use distinct workspace paths, in which case they run in parallel. Can also be set per-target in targets.yaml',
     }),
     out: option({
       type: optional(string),
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index d7f518be0..ace651c4c 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -1090,15 +1090,8 @@ export async function runEvalCommand(
   const seenTestCases = new Set<string>();
   const displayIdTracker = createDisplayIdTracker();
 
-  // Derive file-level concurrency from worker count (global) when provided
-  const totalWorkers = options.workers ?? DEFAULT_WORKERS;
-  const fileConcurrency = Math.min(
-    Math.max(1, totalWorkers),
-    Math.max(1, resolvedTestFiles.length),
-  );
-  const perFileWorkers = options.workers
-    ? Math.max(1, Math.floor(totalWorkers / fileConcurrency))
-    : undefined;
+  // Each file gets the full worker budget — no splitting across files
+  const perFileWorkers = options.workers;
   const fileMetadata = new Map<
     string,
     {
@@ -1231,7 +1224,9 @@ export async function runEvalCommand(
     }
     throw new Error('No tests matched the provided filters.');
   }
-  const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
+  const progressReporter = createProgressReporter(options.workers ?? DEFAULT_WORKERS, {
+    verbose: options.verbose,
+  });
   progressReporter.start();
   progressReporter.setTotal(totalEvalCount);
   const seenCodexLogPaths = new Set<string>();
@@ -1287,33 +1282,6 @@ export async function runEvalCommand(
   // Use only files that survived tag filtering (fileMetadata keys)
   const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
 
-  // Warn when multiple eval files share a static workspace path and will run concurrently,
-  // since concurrent writes to the same directory can corrupt each other's runs.
-  if (fileConcurrency > 1 && activeTestFiles.length > 1) {
-    const cliPath = options.workspacePath;
-    if (cliPath) {
-      console.warn(
-        `Warning: ${activeTestFiles.length} eval files share --workspace-path "${cliPath}" and will run concurrently. Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`,
-      );
-    } else {
-      const pathToFiles = new Map<string, string[]>();
-      for (const [filePath, meta] of fileMetadata.entries()) {
-        if (meta.workspacePath) {
-          const group = pathToFiles.get(meta.workspacePath) ?? [];
-          group.push(path.relative(cwd, filePath));
-          pathToFiles.set(meta.workspacePath, group);
-        }
-      }
-      for (const [wsPath, files] of pathToFiles.entries()) {
-        if (files.length > 1) {
-          console.warn(
-            `Warning: ${files.length} eval files share workspace path "${wsPath}" and will run concurrently (${files.join(', ')}). Concurrent writes to the same workspace directory may corrupt results. Use --workers 1 to serialize.`,
-          );
-        }
-      }
-    }
-  }
-
   // --transcript: create a shared TranscriptProvider and validate line count
   let transcriptProviderFactory:
     | ((target: import('@agentv/core').ResolvedTarget) => import('@agentv/core').Provider)
@@ -1339,102 +1307,121 @@ export async function runEvalCommand(
     );
   }
 
-  try {
-    await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => {
-      const targetPrep = fileMetadata.get(testFilePath);
-      if (!targetPrep) {
-        throw new Error(`Missing metadata for ${testFilePath}`);
-      }
+  // Group files by static workspace path. Files sharing the same static workspace run
+  // sequentially to prevent concurrent writes from racing. Files using pooled mode or
+  // no workspace each get their own group and run in parallel with other groups.
+  const workspaceGroups = new Map<string, string[]>();
+  for (const filePath of activeTestFiles) {
+    const staticPath = options.workspacePath ?? fileMetadata.get(filePath)?.workspacePath;
+    // Files with no static workspace get a unique key so they run in parallel
+    const groupKey = staticPath ?? filePath;
+    const group = workspaceGroups.get(groupKey) ?? [];
+    group.push(filePath);
+    workspaceGroups.set(groupKey, group);
+  }
 
-      // Run all targets concurrently (each target has its own worker limit)
-      const targetResults = await Promise.all(
-        targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
-          // Filter test cases to those applicable to this target.
-          const targetName = selection.targetName;
-          const applicableTestCases =
-            targetPrep.selections.length > 1
-              ? targetPrep.testCases.filter((test) => {
-                  if (test.targets && test.targets.length > 0) {
-                    return test.targets.includes(targetName);
-                  }
-                  return true;
-                })
-              : targetPrep.testCases;
-
-          if (applicableTestCases.length === 0) {
-            return [];
+  try {
+    await Promise.all(
+      [...workspaceGroups.values()].map(async (group) => {
+        for (const testFilePath of group) {
+          const targetPrep = fileMetadata.get(testFilePath);
+          if (!targetPrep) {
+            throw new Error(`Missing metadata for ${testFilePath}`);
           }
 
-          try {
-            const result = await runSingleEvalFile({
-              testFilePath,
-              cwd,
-              repoRoot,
-              options,
-              outputWriter,
-              otelExporter,
-              cache,
-              evaluationRunner,
-              workersOverride: perFileWorkers,
-              yamlWorkers: targetPrep.yamlWorkers,
-              progressReporter,
-              seenTestCases,
-              displayIdTracker,
-              selection,
-              inlineTargetLabel,
-              testCases: applicableTestCases,
-              trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
-              matrixMode: targetPrep.selections.length > 1,
-              totalBudgetUsd: targetPrep.totalBudgetUsd,
-              failOnError: targetPrep.failOnError,
-              threshold: resolvedThreshold,
-              providerFactory: transcriptProviderFactory,
-            });
-            const evalFile = path.relative(cwd, testFilePath);
-            const existingSummary = remoteEvalSummaries.find(
-              (summary) => summary.evalFile === evalFile,
-            );
-            if (existingSummary) {
-              existingSummary.results.push(...result.results);
-            } else {
-              remoteEvalSummaries.push({
-                evalFile,
-                results: [...result.results],
-              });
-            }
-
-            return result.results;
-          } catch (fileError) {
-            // before_all or other setup failures should not abort the entire run.
-            // Mark all tests in this file as errors and continue with other files.
-            const message = fileError instanceof Error ? fileError.message : String(fileError);
-            console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`);
-            const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({
-              timestamp: new Date().toISOString(),
-              testId: testCase.id,
-              score: 0,
-              assertions: [],
-              output: [],
-              scores: [],
-              error: message,
-              executionStatus: 'execution_error' as const,
-              failureStage: 'setup' as const,
-              failureReasonCode: 'setup_error' as const,
-              durationMs: 0,
-              tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 },
-              target: selection.targetName,
-            }));
-            for (const errResult of errorResults) {
-              await outputWriter.append(errResult);
-            }
-            return errorResults;
+          // Run all targets concurrently (each target has its own worker limit)
+          const targetResults = await Promise.all(
+            targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
+              // Filter test cases to those applicable to this target.
+              const targetName = selection.targetName;
+              const applicableTestCases =
+                targetPrep.selections.length > 1
+                  ? targetPrep.testCases.filter((test) => {
+                      if (test.targets && test.targets.length > 0) {
+                        return test.targets.includes(targetName);
+                      }
+                      return true;
+                    })
+                  : targetPrep.testCases;
+
+              if (applicableTestCases.length === 0) {
+                return [];
+              }
+
+              try {
+                const result = await runSingleEvalFile({
+                  testFilePath,
+                  cwd,
+                  repoRoot,
+                  options,
+                  outputWriter,
+                  otelExporter,
+                  cache,
+                  evaluationRunner,
+                  workersOverride: perFileWorkers,
+                  yamlWorkers: targetPrep.yamlWorkers,
+                  progressReporter,
+                  seenTestCases,
+                  displayIdTracker,
+                  selection,
+                  inlineTargetLabel,
+                  testCases: applicableTestCases,
+                  trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
+                  matrixMode: targetPrep.selections.length > 1,
+                  totalBudgetUsd: targetPrep.totalBudgetUsd,
+                  failOnError: targetPrep.failOnError,
+                  threshold: resolvedThreshold,
+                  providerFactory: transcriptProviderFactory,
+                });
+                const evalFile = path.relative(cwd, testFilePath);
+                const existingSummary = remoteEvalSummaries.find(
+                  (summary) => summary.evalFile === evalFile,
+                );
+                if (existingSummary) {
+                  existingSummary.results.push(...result.results);
+                } else {
+                  remoteEvalSummaries.push({
+                    evalFile,
+                    results: [...result.results],
+                  });
+                }
+
+                return result.results;
+              } catch (fileError) {
+                // before_all or other setup failures should not abort the entire run.
+                // Mark all tests in this file as errors and continue with other files.
+                const message = fileError instanceof Error ? fileError.message : String(fileError);
+                console.error(
+                  `\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`,
+                );
+                const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({
+                  timestamp: new Date().toISOString(),
+                  testId: testCase.id,
+                  score: 0,
+                  assertions: [],
+                  output: [],
+                  scores: [],
+                  error: message,
+                  executionStatus: 'execution_error' as const,
+                  failureStage: 'setup' as const,
+                  failureReasonCode: 'setup_error' as const,
+                  durationMs: 0,
+                  tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 },
+                  target: selection.targetName,
+                }));
+                for (const errResult of errorResults) {
+                  await outputWriter.append(errResult);
+                }
+                return errorResults;
+              }
+            }),
+          );
+          for (const results of targetResults) {
+            allResults.push(...results);
           }
-        }),
-      );
-      for (const results of targetResults) {
-        allResults.push(...results);
-      }
-    });
+        }
+      }),
+    );
 
     progressReporter.finish();
 
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
index df0a6b6db..b6ca5c233 100644
--- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -192,34 +192,25 @@ agentv eval evals/my-eval.yaml --export-otel
 
 ### Parallelism
 
-The `--workers N` flag is a **global concurrent test-case budget** for the entire run.
-
-**Single eval file** — up to N test cases run in parallel within that file:
+The `--workers N` flag controls how many **test cases run in parallel within each eval file** (default: 3).
 
 ```bash
 agentv eval evals/my-eval.yaml --workers 4
-# Up to 4 test cases run concurrently
-```
-
-**Multiple eval files** — workers are split across files: `min(N, files)` eval files run concurrently, each getting `floor(N / concurrent_files)` workers for intra-file parallelism:
-
-```bash
-agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 6
-# 3 eval files run concurrently, each with 2 workers → up to 6 concurrent test cases total
+# Up to 4 test cases from the file run concurrently
 ```
 
-:::caution[Static workspace race conditions]
-When multiple eval files reference the **same static workspace path** (e.g., via an env var like `EVAL_WORKSPACE_PATH`), concurrent execution can corrupt the workspace — for example, one file resetting a repo while another file's tests are running against it.
+**Multiple eval files** are scheduled by workspace path:
 
-To serialize all eval files (one file at a time), use `--workers 1`:
+- Files that share the **same static workspace path** run **sequentially** — one file completes before the next starts, so they never write to the same directory concurrently.
+- Files with **distinct workspace paths** (or no workspace) run **in parallel**, each with the full `--workers N` budget.
 
 ```bash
-agentv eval evals/**/*.yaml --workers 1
-# One file at a time; one test case at a time within each file
+# file1 and file2 share EVAL_WORKSPACE_PATH → run sequentially, 3 workers each
+# file3 has its own path → runs in parallel with the file1/file2 group
+agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 3
 ```
 
-Alternatively, use `--workspace-mode pooled` (the default for evals with `repos`). Pooled mode allocates separate slots per worker, so concurrent test cases and concurrent eval files never share a workspace directory. See [Workspace Pool](/docs/guides/workspace-pool/) for details.
-:::
+Pooled workspaces (the default for evals with `repos`) are always safe to run in parallel — each worker gets its own pool slot. See [Workspace Pool](/docs/guides/workspace-pool/) for details.
 
 ### Workspace Modes and Finish Policy
 
diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
index 6a5a99c89..72e2b8d94 100644
--- a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
+++ b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
@@ -135,9 +135,7 @@ This creates up to 4 slots (`slot-0` through `slot-3`). PID-based lock files pre
 
 The maximum number of pool slots defaults to 10 (capped at 50). Slots are created on demand — a run with 2 workers only creates 2 slots, even if the pool allows 10.
 
-**Multiple eval files:** When you pass multiple eval files to `agentv eval`, they may run concurrently (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Pooled workspaces handle this safely — each active worker (across all eval files) acquires its own slot. Eval files that share the same workspace config (same fingerprint) draw from the same pool; eval files with different configs maintain separate pools. No cross-file workspace contention occurs in pooled mode.
-
-**Static workspaces and multiple eval files:** If you use `--workspace-mode static --workspace-path /path` across multiple eval files, all files will share that single directory concurrently. This can cause race conditions — for example, one file running `git checkout <sha>` while another file's tests read from the same repository. To prevent this, either use `--workers 1` to serialize execution or use pooled mode instead.
+**Multiple eval files:** When you pass multiple eval files to `agentv eval`, files with distinct workspace paths run in parallel while files that share a static workspace path are automatically serialized (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Pooled workspaces are always safe to run in parallel — each active worker acquires its own slot regardless of which eval file it belongs to. No cross-file workspace contention occurs in pooled mode.
 
 ## Drift detection
 

From 87f381ee8bc0398c1aed6542f1be3615a65bf60d Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 12 Apr 2026 07:44:25 +0000
Subject: [PATCH 5/8] perf: cap parallel workspace groups at --workers to bound
 total concurrency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

runWithLimit caps the number of workspace groups running in parallel at
--workers N. Each file within a group still gets the full --workers
budget for within-file test-case parallelism, so max concurrent test
cases per target is bounded by workers² (N groups, N workers each)
rather than growing unbounded with the number of groups.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/cli/src/commands/eval/run-eval.ts | 192 ++++++++++++-------------
 1 file changed, 95 insertions(+), 97 deletions(-)

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index ace651c4c..166f8dc65 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -1309,7 +1309,9 @@ export async function runEvalCommand(
 
   // Group files by static workspace path. Files sharing the same static workspace run
   // sequentially to prevent concurrent writes from racing. Files using pooled mode or
-  // no workspace each get their own group and run in parallel with other groups.
+  // no workspace each get their own group. Groups run in parallel up to --workers limit,
+  // each file within a group gets the full --workers budget for within-file concurrency.
+  const workers = options.workers ?? DEFAULT_WORKERS;
   const workspaceGroups = new Map<string, string[]>();
   for (const filePath of activeTestFiles) {
     const staticPath = options.workspacePath ?? fileMetadata.get(filePath)?.workspacePath;
@@ -1321,107 +1323,103 @@ export async function runEvalCommand(
   }
 
   try {
-    await Promise.all(
-      [...workspaceGroups.values()].map(async (group) => {
-        for (const testFilePath of group) {
-          const targetPrep = fileMetadata.get(testFilePath);
-          if (!targetPrep) {
-            throw new Error(`Missing metadata for ${testFilePath}`);
-          }
+    await runWithLimit([...workspaceGroups.values()], workers, async (group) => {
+      for (const testFilePath of group) {
+        const targetPrep = fileMetadata.get(testFilePath);
+        if (!targetPrep) {
+          throw new Error(`Missing metadata for ${testFilePath}`);
+        }
 
-          // Run all targets concurrently (each target has its own worker limit)
-          const targetResults = await Promise.all(
-            targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
-              // Filter test cases to those applicable to this target.
-              const targetName = selection.targetName;
-              const applicableTestCases =
-                targetPrep.selections.length > 1
-                  ? targetPrep.testCases.filter((test) => {
-                      if (test.targets && test.targets.length > 0) {
-                        return test.targets.includes(targetName);
-                      }
-                      return true;
-                    })
-                  : targetPrep.testCases;
-
-              if (applicableTestCases.length === 0) {
-                return [];
+        // Run all targets concurrently (each target has its own worker limit)
+        const targetResults = await Promise.all(
+          targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
+            // Filter test cases to those applicable to this target.
+            const targetName = selection.targetName;
+            const applicableTestCases =
+              targetPrep.selections.length > 1
+                ? targetPrep.testCases.filter((test) => {
+                    if (test.targets && test.targets.length > 0) {
+                      return test.targets.includes(targetName);
+                    }
+                    return true;
+                  })
+                : targetPrep.testCases;
+
+            if (applicableTestCases.length === 0) {
+              return [];
+            }
+
+            try {
+              const result = await runSingleEvalFile({
+                testFilePath,
+                cwd,
+                repoRoot,
+                options,
+                outputWriter,
+                otelExporter,
+                cache,
+                evaluationRunner,
+                workersOverride: perFileWorkers,
+                yamlWorkers: targetPrep.yamlWorkers,
+                progressReporter,
+                seenTestCases,
+                displayIdTracker,
+                selection,
+                inlineTargetLabel,
+                testCases: applicableTestCases,
+                trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
+                matrixMode: targetPrep.selections.length > 1,
+                totalBudgetUsd: targetPrep.totalBudgetUsd,
+                failOnError: targetPrep.failOnError,
+                threshold: resolvedThreshold,
+                providerFactory: transcriptProviderFactory,
+              });
+              const evalFile = path.relative(cwd, testFilePath);
+              const existingSummary = remoteEvalSummaries.find(
+                (summary) => summary.evalFile === evalFile,
+              );
+              if (existingSummary) {
+                existingSummary.results.push(...result.results);
+              } else {
+                remoteEvalSummaries.push({
+                  evalFile,
+                  results: [...result.results],
+                });
               }
 
-              try {
-                const result = await runSingleEvalFile({
-                  testFilePath,
-                  cwd,
-                  repoRoot,
-                  options,
-                  outputWriter,
-                  otelExporter,
-                  cache,
-                  evaluationRunner,
-                  workersOverride: perFileWorkers,
-                  yamlWorkers: targetPrep.yamlWorkers,
-                  progressReporter,
-                  seenTestCases,
-                  displayIdTracker,
-                  selection,
-                  inlineTargetLabel,
-                  testCases: applicableTestCases,
-                  trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
-                  matrixMode: targetPrep.selections.length > 1,
-                  totalBudgetUsd: targetPrep.totalBudgetUsd,
-                  failOnError: targetPrep.failOnError,
-                  threshold: resolvedThreshold,
-                  providerFactory: transcriptProviderFactory,
-                });
-                const evalFile = path.relative(cwd, testFilePath);
-                const existingSummary = remoteEvalSummaries.find(
-                  (summary) => summary.evalFile === evalFile,
-                );
-                if (existingSummary) {
-                  existingSummary.results.push(...result.results);
-                } else {
-                  remoteEvalSummaries.push({
-                    evalFile,
-                    results: [...result.results],
-                  });
-                }
-
-                return result.results;
-              } catch (fileError) {
-                // before_all or other setup failures should not abort the entire run.
-                // Mark all tests in this file as errors and continue with other files.
-                const message = fileError instanceof Error ? fileError.message : String(fileError);
-                console.error(
-                  `\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`,
-                );
-                const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({
-                  timestamp: new Date().toISOString(),
-                  testId: testCase.id,
-                  score: 0,
-                  assertions: [],
-                  output: [],
-                  scores: [],
-                  error: message,
-                  executionStatus: 'execution_error' as const,
-                  failureStage: 'setup' as const,
-                  failureReasonCode: 'setup_error' as const,
-                  durationMs: 0,
-                  tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 },
-                  target: selection.targetName,
-                }));
-                for (const errResult of errorResults) {
-                  await outputWriter.append(errResult);
-                }
-                return errorResults;
+              return result.results;
+            } catch (fileError) {
+              // before_all or other setup failures should not abort the entire run.
+              // Mark all tests in this file as errors and continue with other files.
+              const message = fileError instanceof Error ? fileError.message : String(fileError);
+              console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`);
+              const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({
+                timestamp: new Date().toISOString(),
+                testId: testCase.id,
+                score: 0,
+                assertions: [],
+                output: [],
+                scores: [],
+                error: message,
+                executionStatus: 'execution_error' as const,
+                failureStage: 'setup' as const,
+                failureReasonCode: 'setup_error' as const,
+                durationMs: 0,
+                tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 },
+                target: selection.targetName,
+              }));
+              for (const errResult of errorResults) {
+                await outputWriter.append(errResult);
               }
-            }),
-          );
-          for (const results of targetResults) {
-            allResults.push(...results);
-          }
+              return errorResults;
+            }
+          }),
+        );
+        for (const results of targetResults) {
+          allResults.push(...results);
         }
-      }),
-    );
+      }
+    });
 
     progressReporter.finish();
 

From 1d168e12a0df0c68e4c9e4aba8946d66c78b3f44 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 12 Apr 2026 08:04:41 +0000
Subject: [PATCH 6/8] refactor: simplify to sequential eval files with parallel
 test cases

Replace workspace-group scheduler with a plain for-of loop.
Eval files always run sequentially; --workers N controls within-file
test-case parallelism. This matches the standard model used by
promptfoo and convex-evals and eliminates all cross-file workspace
race conditions without any grouping complexity.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/cli/src/commands/eval/commands/run.ts    |   2 +-
 apps/cli/src/commands/eval/run-eval.ts        | 200 ++++++++----------
 .../docs/docs/evaluation/running-evals.mdx    |  14 +-
 .../docs/docs/guides/workspace-pool.mdx       |   2 +-
 4 files changed, 98 insertions(+), 120 deletions(-)

diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
index 442edb65a..8d3f5f18b 100644
--- a/apps/cli/src/commands/eval/commands/run.ts
+++ b/apps/cli/src/commands/eval/commands/run.ts
@@ -41,7 +41,7 @@ export const evalRunCommand = command({
       type: optional(number),
       long: 'workers',
       description:
-        'Number of parallel test cases within each eval file (default: 3, max: 50). Eval files run sequentially unless they use distinct workspace paths, in which case they run in parallel. Can also be set per-target in targets.yaml',
+        'Number of parallel test cases within each eval file (default: 3, max: 50). Eval files always run sequentially. Can also be set per-target in targets.yaml',
     }),
     out: option({
       type: optional(string),
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 166f8dc65..a09356920 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -1307,119 +1307,105 @@ export async function runEvalCommand(
     );
   }
 
-  // Group files by static workspace path. Files sharing the same static workspace run
-  // sequentially to prevent concurrent writes from racing. Files using pooled mode or
-  // no workspace each get their own group. Groups run in parallel up to --workers limit,
-  // each file within a group gets the full --workers budget for within-file concurrency.
-  const workers = options.workers ?? DEFAULT_WORKERS;
-  const workspaceGroups = new Map<string, string[]>();
-  for (const filePath of activeTestFiles) {
-    const staticPath = options.workspacePath ?? fileMetadata.get(filePath)?.workspacePath;
-    // Files with no static workspace get a unique key so they run in parallel
-    const groupKey = staticPath ?? filePath;
-    const group = workspaceGroups.get(groupKey) ?? [];
-    group.push(filePath);
-    workspaceGroups.set(groupKey, group);
-  }
-
+  // Eval files run sequentially; within each file, --workers N test cases run in parallel.
+  // This matches industry practice (promptfoo, convex-evals) and avoids cross-file workspace
+  // races without any grouping complexity.
   try {
-    await runWithLimit([...workspaceGroups.values()], workers, async (group) => {
-      for (const testFilePath of group) {
-        const targetPrep = fileMetadata.get(testFilePath);
-        if (!targetPrep) {
-          throw new Error(`Missing metadata for ${testFilePath}`);
-        }
+    for (const testFilePath of activeTestFiles) {
+      const targetPrep = fileMetadata.get(testFilePath);
+      if (!targetPrep) {
+        throw new Error(`Missing metadata for ${testFilePath}`);
+      }
 
-        // Run all targets concurrently (each target has its own worker limit)
-        const targetResults = await Promise.all(
-          targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
-            // Filter test cases to those applicable to this target.
-            const targetName = selection.targetName;
-            const applicableTestCases =
-              targetPrep.selections.length > 1
-                ? targetPrep.testCases.filter((test) => {
-                    if (test.targets && test.targets.length > 0) {
-                      return test.targets.includes(targetName);
-                    }
-                    return true;
-                  })
-                : targetPrep.testCases;
-
-            if (applicableTestCases.length === 0) {
-              return [];
-            }
+      // Run all targets concurrently (each target has its own worker limit)
+      const targetResults = await Promise.all(
+        targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
+          // Filter test cases to those applicable to this target.
+          const targetName = selection.targetName;
+          const applicableTestCases =
+            targetPrep.selections.length > 1
+              ? targetPrep.testCases.filter((test) => {
+                  if (test.targets && test.targets.length > 0) {
+                    return test.targets.includes(targetName);
+                  }
+                  return true;
+                })
+              : targetPrep.testCases;
+
+          if (applicableTestCases.length === 0) {
+            return [];
+          }
 
-            try {
-              const result = await runSingleEvalFile({
-                testFilePath,
-                cwd,
-                repoRoot,
-                options,
-                outputWriter,
-                otelExporter,
-                cache,
-                evaluationRunner,
-                workersOverride: perFileWorkers,
-                yamlWorkers: targetPrep.yamlWorkers,
-                progressReporter,
-                seenTestCases,
-                displayIdTracker,
-                selection,
-                inlineTargetLabel,
-                testCases: applicableTestCases,
-                trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
-                matrixMode: targetPrep.selections.length > 1,
-                totalBudgetUsd: targetPrep.totalBudgetUsd,
-                failOnError: targetPrep.failOnError,
-                threshold: resolvedThreshold,
-                providerFactory: transcriptProviderFactory,
+          try {
+            const result = await runSingleEvalFile({
+              testFilePath,
+              cwd,
+              repoRoot,
+              options,
+              outputWriter,
+              otelExporter,
+              cache,
+              evaluationRunner,
+              workersOverride: perFileWorkers,
+              yamlWorkers: targetPrep.yamlWorkers,
+              progressReporter,
+              seenTestCases,
+              displayIdTracker,
+              selection,
+              inlineTargetLabel,
+              testCases: applicableTestCases,
+              trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
+              matrixMode: targetPrep.selections.length > 1,
+              totalBudgetUsd: targetPrep.totalBudgetUsd,
+              failOnError: targetPrep.failOnError,
+              threshold: resolvedThreshold,
+              providerFactory: transcriptProviderFactory,
+            });
+            const evalFile = path.relative(cwd, testFilePath);
+            const existingSummary = remoteEvalSummaries.find(
+              (summary) => summary.evalFile === evalFile,
+            );
+            if (existingSummary) {
+              existingSummary.results.push(...result.results);
+            } else {
+              remoteEvalSummaries.push({
+                evalFile,
+                results: [...result.results],
               });
-              const evalFile = path.relative(cwd, testFilePath);
-              const existingSummary = remoteEvalSummaries.find(
-                (summary) => summary.evalFile === evalFile,
-              );
-              if (existingSummary) {
-                existingSummary.results.push(...result.results);
-              } else {
-                remoteEvalSummaries.push({
-                  evalFile,
-                  results: [...result.results],
-                });
-              }
-
-              return result.results;
-            } catch (fileError) {
-              // before_all or other setup failures should not abort the entire run.
-              // Mark all tests in this file as errors and continue with other files.
-              const message = fileError instanceof Error ? fileError.message : String(fileError);
-              console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`);
-              const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({
-                timestamp: new Date().toISOString(),
-                testId: testCase.id,
-                score: 0,
-                assertions: [],
-                output: [],
-                scores: [],
-                error: message,
-                executionStatus: 'execution_error' as const,
-                failureStage: 'setup' as const,
-                failureReasonCode: 'setup_error' as const,
-                durationMs: 0,
-                tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 },
-                target: selection.targetName,
-              }));
-              for (const errResult of errorResults) {
-                await outputWriter.append(errResult);
-              }
-              return errorResults;
             }
-          }),
-        );
-        for (const results of targetResults) {
-          allResults.push(...results);
-        }
+
+            return result.results;
+          } catch (fileError) {
+            // before_all or other setup failures should not abort the entire run.
+            // Mark all tests in this file as errors and continue with other files.
+            const message = fileError instanceof Error ? fileError.message : String(fileError);
+            console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`);
+            const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({
+              timestamp: new Date().toISOString(),
+              testId: testCase.id,
+              score: 0,
+              assertions: [],
+              output: [],
+              scores: [],
+              error: message,
+              executionStatus: 'execution_error' as const,
+              failureStage: 'setup' as const,
+              failureReasonCode: 'setup_error' as const,
+              durationMs: 0,
+              tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 },
+              target: selection.targetName,
+            }));
+            for (const errResult of errorResults) {
+              await outputWriter.append(errResult);
+            }
+            return errorResults;
+          }
+        }),
+      );
+      for (const results of targetResults) {
+        allResults.push(...results);
       }
-    });
+    }
 
     progressReporter.finish();
 
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
index b6ca5c233..ea5eb35f2 100644
--- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -192,25 +192,17 @@ agentv eval evals/my-eval.yaml --export-otel
 
 ### Parallelism
 
-The `--workers N` flag controls how many **test cases run in parallel within each eval file** (default: 3).
+The `--workers N` flag controls how many **test cases run in parallel within each eval file** (default: 3). Eval files always run sequentially — one file completes before the next starts.
 
 ```bash
 agentv eval evals/my-eval.yaml --workers 4
 # Up to 4 test cases from the file run concurrently
-```
-
-**Multiple eval files** are scheduled by workspace path:
 
-- Files that share the **same static workspace path** run **sequentially** — one file completes before the next starts, so they never write to the same directory concurrently.
-- Files with **distinct workspace paths** (or no workspace) run **in parallel**, each with the full `--workers N` budget.
-
-```bash
-# file1 and file2 share EVAL_WORKSPACE_PATH → run sequentially, 3 workers each
-# file3 has its own path → runs in parallel with the file1/file2 group
 agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 3
+# Files run one at a time; within each file, up to 3 test cases run in parallel
 ```
 
-Pooled workspaces (the default for evals with `repos`) are always safe to run in parallel — each worker gets its own pool slot. See [Workspace Pool](/docs/guides/workspace-pool/) for details.
+This matches the standard model used by eval frameworks (promptfoo, convex-evals) and avoids cross-file workspace races without any special configuration.
 
 ### Workspace Modes and Finish Policy
 
diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
index 72e2b8d94..58e024e2b 100644
--- a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
+++ b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
@@ -135,7 +135,7 @@ This creates up to 4 slots (`slot-0` through `slot-3`). PID-based lock files pre
 
 The maximum number of pool slots defaults to 10 (capped at 50). Slots are created on demand — a run with 2 workers only creates 2 slots, even if the pool allows 10.
 
-**Multiple eval files:** When you pass multiple eval files to `agentv eval`, files with distinct workspace paths run in parallel while files that share a static workspace path are automatically serialized (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Pooled workspaces are always safe to run in parallel — each active worker acquires its own slot regardless of which eval file it belongs to. No cross-file workspace contention occurs in pooled mode.
+**Multiple eval files:** When you pass multiple eval files to `agentv eval`, they run sequentially — one file completes before the next starts (see [Parallelism](/docs/evaluation/running-evals/#parallelism)). Within each file, pool slots support concurrent workers as described above.
 
 ## Drift detection
 

From efebcc9f73247b734094d514212a2d471cf4deb2 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 12 Apr 2026 08:10:18 +0000
Subject: [PATCH 7/8] docs: reference deepeval and OpenAI Evals instead of
 convex-evals

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/cli/src/commands/eval/run-eval.ts                      | 4 ++--
 apps/web/src/content/docs/docs/evaluation/running-evals.mdx | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index a09356920..5f0176d9f 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -1308,8 +1308,8 @@ export async function runEvalCommand(
   }
 
   // Eval files run sequentially; within each file, --workers N test cases run in parallel.
-  // This matches industry practice (promptfoo, convex-evals) and avoids cross-file workspace
-  // races without any grouping complexity.
+  // This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file
+  // workspace races without any grouping complexity.
   try {
     for (const testFilePath of activeTestFiles) {
       const targetPrep = fileMetadata.get(testFilePath);
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
index ea5eb35f2..efa3ddc8f 100644
--- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -202,7 +202,7 @@ agentv eval evals/file1.yaml evals/file2.yaml evals/file3.yaml --workers 3
 # Files run one at a time; within each file, up to 3 test cases run in parallel
 ```
 
-This matches the standard model used by eval frameworks (promptfoo, convex-evals) and avoids cross-file workspace races without any special configuration.
+This matches the standard model used by eval frameworks (promptfoo, deepeval, OpenAI Evals) and avoids cross-file workspace races without any special configuration.
 
 ### Workspace Modes and Finish Policy
 

From f09ffa753622cbba576e8da1e6b54c51b9369c8a Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 12 Apr 2026 08:15:21 +0000
Subject: [PATCH 8/8] chore: remove dead code from earlier iterations

Remove runWithLimit (unused after switching to a plain for-of loop) and
the workspacePath field from prepareFileMetadata's return value and the
fileMetadata Map type (set but never read).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/cli/src/commands/eval/run-eval.ts | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 5f0176d9f..c5aa81a2b 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -498,7 +498,6 @@ async function prepareFileMetadata(params: {
   readonly failOnError?: FailOnError;
   readonly threshold?: number;
   readonly tags?: readonly string[];
-  readonly workspacePath?: string;
 }> {
   const { testFilePath, repoRoot, cwd, options } = params;
 
@@ -613,29 +612,9 @@ async function prepareFileMetadata(params: {
     failOnError: suite.failOnError,
     threshold: suite.threshold,
     tags: suite.metadata?.tags,
-    workspacePath: suite.workspacePath,
   };
 }
 
-async function runWithLimit<T>(
-  items: readonly T[],
-  limit: number,
-  task: (item: T) => Promise<void>,
-): Promise<void> {
-  const safeLimit = Math.max(1, limit);
-  let index = 0;
-
-  const workers = Array.from({ length: safeLimit }, async () => {
-    while (index < items.length) {
-      const current = items[index];
-      index += 1;
-      await task(current);
-    }
-  });
-
-  await Promise.all(workers);
-}
-
 async function runSingleEvalFile(params: {
   readonly testFilePath: string;
   readonly cwd: string;
@@ -1110,7 +1089,6 @@ export async function runEvalCommand(
       readonly failOnError?: FailOnError;
       readonly threshold?: number;
       readonly tags?: readonly string[];
-      readonly workspacePath?: string;
     }
   >();
   // Separate TypeScript/JS eval files from YAML files.