From 5db5935fc2621c3d11d84d3065cb8e9205664196 Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Wed, 8 Apr 2026 13:16:20 +0000
Subject: [PATCH 1/3] feat(cli): extend --retry-errors to resume missing cases
 from crashed runs (#975)

Previously --retry-errors only re-ran execution_error test cases. If the
process crashed mid-run, cases that never started were lost. Now uses an
exclusion filter (negating completed test IDs) so both error cases AND
missing cases are re-run, enabling crash recovery without a new checkpoint
system.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/cli/src/commands/eval/retry-errors.ts | 12 +++++++
 apps/cli/src/commands/eval/run-eval.ts     | 32 ++++++++++++------
 apps/cli/test/unit/retry-errors.test.ts    | 38 +++++++++++++++++++++-
 3 files changed, 71 insertions(+), 11 deletions(-)
diff --git a/apps/cli/src/commands/eval/retry-errors.ts b/apps/cli/src/commands/eval/retry-errors.ts
index 65f76c2ea..7ccdaaf8f 100644
--- a/apps/cli/src/commands/eval/retry-errors.ts
+++ b/apps/cli/src/commands/eval/retry-errors.ts
@@ -17,6 +17,18 @@ export async function loadErrorTestIds(jsonlPath: string): Promise<readonly stri
   return [...new Set(ids)];
 }
 
+/**
+ * Load test IDs from an index/results source that completed successfully (non-error).
+ * Used for resume: any test NOT in this set needs to be re-run.
+ */
+export async function loadCompletedTestIds(jsonlPath: string): Promise<readonly string[]> {
+  const ids = (await loadRetrySourceResults(jsonlPath))
+    .filter((result) => result.testId && result.executionStatus !== 'execution_error')
+    .map((result) => result.testId);
+
+  return [...new Set(ids)];
+}
+
 /**
  * Load results from an index/results source that do NOT have executionStatus === 'execution_error'.
  * These are the "good" results that should be preserved when merging retry output.
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 4b052d32c..0015503c6 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -34,7 +34,7 @@ import { loadEnvFromHierarchy } from './env.js';
 import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
 import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
 import { buildDefaultRunDir } from './result-layout.js';
-import { loadErrorTestIds, loadNonErrorResults } from './retry-errors.js';
+import { loadCompletedTestIds, loadErrorTestIds, loadNonErrorResults } from './retry-errors.js';
 import { saveRunCache } from './run-cache.js';
 import { findRepoRoot } from './shared.js';
 import {
@@ -895,23 +895,30 @@ export async function runEvalCommand(
     throw new Error('--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)');
   }
 
-  // --retry-errors: override filter to only re-run execution_error test cases.
+  // --retry-errors: resume from a previous run by re-running execution_error and missing test cases.
+  // Uses an exclusion filter to skip already-completed (non-error) cases, which naturally includes
+  // both error cases and cases that never ran (e.g., due to a crash or interrupt).
   // IMPORTANT: JSONL must be fully loaded here, before the output writer is created below,
   // since the retry source and output destination may refer to the same file.
   let retryNonErrorResults: readonly EvaluationResult[] | undefined;
   if (options.retryErrors) {
     const retryPath = path.resolve(options.retryErrors);
     await ensureFileExists(retryPath, 'Retry-errors JSONL file');
+    const completedIds = await loadCompletedTestIds(retryPath);
     const errorIds = await loadErrorTestIds(retryPath);
-    if (errorIds.length === 0) {
-      console.log('No execution errors found in the previous output. Nothing to retry.');
-      return;
-    }
-    console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
-    // Override the filter to match only error test IDs using micromatch brace expansion
-    const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(',')}}`;
-    options = { ...options, filter: filterPattern };
     retryNonErrorResults = await loadNonErrorResults(retryPath);
+
+    if (errorIds.length > 0) {
+      console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
+    }
+    // Use a negation filter to exclude completed (non-error) cases.
+    // This re-runs both error cases and any cases missing from the previous output (crash recovery).
+    if (completedIds.length > 0) {
+      const excludePattern =
+        completedIds.length === 1 ? `!${completedIds[0]}` : `!{${completedIds.join(',')}}`;
+      options = { ...options, filter: excludePattern };
+      console.log(`Skipping ${completedIds.length} already-completed test(s).`);
+    }
   }
 
   // Validate static workspace path exists and is a directory
@@ -1197,6 +1204,11 @@ export async function runEvalCommand(
   }
 
   if (totalEvalCount === 0) {
+    // When using --retry-errors, all tests being filtered means everything completed successfully
+    if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) {
+      console.log('All tests completed successfully in the previous run. Nothing to retry.');
+      return;
+    }
     throw new Error('No tests matched the provided filters.');
   }
   const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
diff --git a/apps/cli/test/unit/retry-errors.test.ts b/apps/cli/test/unit/retry-errors.test.ts
index 9aca5b16b..1bd644c96 100644
--- a/apps/cli/test/unit/retry-errors.test.ts
+++ b/apps/cli/test/unit/retry-errors.test.ts
@@ -3,7 +3,11 @@ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
 import { tmpdir } from 'node:os';
 import path from 'node:path';
 
-import { loadErrorTestIds, loadNonErrorResults } from '../../src/commands/eval/retry-errors.js';
+import {
+  loadCompletedTestIds,
+  loadErrorTestIds,
+  loadNonErrorResults,
+} from '../../src/commands/eval/retry-errors.js';
 
 describe('retry-errors', () => {
   let tmpDir: string;
@@ -131,6 +135,38 @@ describe('retry-errors', () => {
     expect(ids).toEqual(['case-2']);
   });
 
+  it('loadCompletedTestIds returns only non-error test IDs', async () => {
+    const filePath = createIndexFile([
+      { test_id: 'case-1', execution_status: 'ok', score: 0.9 },
+      { test_id: 'case-2', execution_status: 'execution_error', score: 0, error: 'timeout' },
+      { test_id: 'case-3', execution_status: 'quality_failure', score: 0.3 },
+      { test_id: 'case-4', execution_status: 'execution_error', score: 0, error: 'provider failed' },
+    ]);
+
+    const ids = await loadCompletedTestIds(filePath);
+    expect(ids).toEqual(['case-1', 'case-3']);
+  });
+
+  it('loadCompletedTestIds returns empty array when all are errors', async () => {
+    const filePath = createIndexFile([
+      { test_id: 'case-1', execution_status: 'execution_error', score: 0 },
+      { test_id: 'case-2', execution_status: 'execution_error', score: 0 },
+    ]);
+
+    const ids = await loadCompletedTestIds(filePath);
+    expect(ids).toEqual([]);
+  });
+
+  it('loadCompletedTestIds deduplicates IDs', async () => {
+    const filePath = createIndexFile([
+      { test_id: 'case-1', execution_status: 'ok', score: 0.9 },
+      { test_id: 'case-1', execution_status: 'ok', score: 0.8 },
+    ]);
+
+    const ids = await loadCompletedTestIds(filePath);
+    expect(ids).toEqual(['case-1']);
+  });
+
   it('throws on malformed index.jsonl lines', async () => {
     tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-test-'));
     const filePath = path.join(tmpDir, 'index.jsonl');

From 741b2cd7419889b85b30cc68e90d0687a9fc8efa Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Wed, 8 Apr 2026 13:17:40 +0000
Subject: [PATCH 2/3] style: fix lint formatting in retry-errors test

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/cli/test/unit/retry-errors.test.ts | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/apps/cli/test/unit/retry-errors.test.ts b/apps/cli/test/unit/retry-errors.test.ts
index 1bd644c96..e76f6c4ab 100644
--- a/apps/cli/test/unit/retry-errors.test.ts
+++ b/apps/cli/test/unit/retry-errors.test.ts
@@ -140,7 +140,12 @@ describe('retry-errors', () => {
       { test_id: 'case-1', execution_status: 'ok', score: 0.9 },
       { test_id: 'case-2', execution_status: 'execution_error', score: 0, error: 'timeout' },
       { test_id: 'case-3', execution_status: 'quality_failure', score: 0.3 },
-      { test_id: 'case-4', execution_status: 'execution_error', score: 0, error: 'provider failed' },
+      {
+        test_id: 'case-4',
+        execution_status: 'execution_error',
+        score: 0,
+        error: 'provider failed',
+      },
     ]);
 
     const ids = await loadCompletedTestIds(filePath);

From 9b392e8d4fce86e0e61e4f8bdb713a1ef93b4c8c Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Wed, 8 Apr 2026 13:39:19 +0000
Subject: [PATCH 3/3] fix(cli): address code review findings for --retry-errors
 resume

1. Matrix safety: loadFullyCompletedTestIds only excludes test IDs where
   ALL results are non-error. If case-1 succeeded on target A but errored
   on target B, case-1 re-runs on both targets (conservative).

2. Glob escaping: buildExclusionFilter escapes micromatch metacharacters
   in test IDs so IDs with *, ?, [], {}, ! match literally.

3. Early-return message: changed from "All tests completed successfully"
   to "No execution errors or missing cases" since non-error results
   include quality_failure.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/cli/src/commands/eval/retry-errors.ts | 42 ++++++++++++++++++----
 apps/cli/src/commands/eval/run-eval.ts     | 22 +++++++-----
 apps/cli/test/unit/retry-errors.test.ts    | 35 ++++++++++++------
 3 files changed, 73 insertions(+), 26 deletions(-)

diff --git a/apps/cli/src/commands/eval/retry-errors.ts b/apps/cli/src/commands/eval/retry-errors.ts
index 7ccdaaf8f..37e647533 100644
--- a/apps/cli/src/commands/eval/retry-errors.ts
+++ b/apps/cli/src/commands/eval/retry-errors.ts
@@ -6,6 +6,14 @@ async function loadRetrySourceResults(jsonlPath: string): Promise<readonly Evalu
   return loadManifestResults(resolveResultSourcePath(jsonlPath));
 }
 
+/**
+ * Escape micromatch glob metacharacters in a string so it matches literally.
+ * Characters escaped: * ? [ ] { } ( ) ! @ # + |
+ */
+function escapeGlob(id: string): string {
+  return id.replace(/[*?[\]{}()!@#+|\\]/g, '\\$&');
+}
+
 /**
  * Load test IDs from an index/results source that have executionStatus === 'execution_error'.
  */
@@ -18,15 +26,35 @@ export async function loadErrorTestIds(jsonlPath: string): Promise<readonly stri
 }
 
 /**
- * Load test IDs from an index/results source that completed successfully (non-error).
- * Used for resume: any test NOT in this set needs to be re-run.
+ * Load test IDs that are fully completed (non-error) across ALL targets.
+ * A test ID is only considered "completed" if every result for that ID is non-error.
+ * This is conservative for matrix runs: if case-1 succeeded on target A but errored
+ * on target B, case-1 is NOT excluded (it will re-run on both targets).
  */
-export async function loadCompletedTestIds(jsonlPath: string): Promise<readonly string[]> {
-  const ids = (await loadRetrySourceResults(jsonlPath))
-    .filter((result) => result.testId && result.executionStatus !== 'execution_error')
-    .map((result) => result.testId);
+export async function loadFullyCompletedTestIds(jsonlPath: string): Promise<readonly string[]> {
+  const results = await loadRetrySourceResults(jsonlPath);
+  const allIds = new Set<string>();
+  const errorIds = new Set<string>();
 
-  return [...new Set(ids)];
+  for (const result of results) {
+    if (!result.testId) continue;
+    allIds.add(result.testId);
+    if (result.executionStatus === 'execution_error') {
+      errorIds.add(result.testId);
+    }
+  }
+
+  // Only IDs where every result is non-error
+  return [...allIds].filter((id) => !errorIds.has(id));
+}
+
+/**
+ * Build a micromatch negation pattern that excludes the given test IDs.
+ * Escapes glob metacharacters in IDs to ensure literal matching.
+ */
+export function buildExclusionFilter(completedIds: readonly string[]): string {
+  const escaped = completedIds.map(escapeGlob);
+  return escaped.length === 1 ? `!${escaped[0]}` : `!{${escaped.join(',')}}`;
 }
 
 /**
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 0015503c6..471293a52 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -34,7 +34,12 @@ import { loadEnvFromHierarchy } from './env.js';
 import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
 import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
 import { buildDefaultRunDir } from './result-layout.js';
-import { loadCompletedTestIds, loadErrorTestIds, loadNonErrorResults } from './retry-errors.js';
+import {
+  buildExclusionFilter,
+  loadErrorTestIds,
+  loadFullyCompletedTestIds,
+  loadNonErrorResults,
+} from './retry-errors.js';
 import { saveRunCache } from './run-cache.js';
 import { findRepoRoot } from './shared.js';
 import {
@@ -904,19 +909,18 @@ export async function runEvalCommand(
   if (options.retryErrors) {
     const retryPath = path.resolve(options.retryErrors);
     await ensureFileExists(retryPath, 'Retry-errors JSONL file');
-    const completedIds = await loadCompletedTestIds(retryPath);
+    const completedIds = await loadFullyCompletedTestIds(retryPath);
     const errorIds = await loadErrorTestIds(retryPath);
     retryNonErrorResults = await loadNonErrorResults(retryPath);
 
     if (errorIds.length > 0) {
       console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
     }
-    // Use a negation filter to exclude completed (non-error) cases.
-    // This re-runs both error cases and any cases missing from the previous output (crash recovery).
+    // Use a negation filter to exclude fully-completed (non-error across all targets) cases.
+    // This re-runs error cases, cases missing from the output (crash recovery), and cases
+    // that errored on some targets even if they succeeded on others (matrix safety).
     if (completedIds.length > 0) {
-      const excludePattern =
-        completedIds.length === 1 ? `!${completedIds[0]}` : `!{${completedIds.join(',')}}`;
-      options = { ...options, filter: excludePattern };
+      options = { ...options, filter: buildExclusionFilter(completedIds) };
       console.log(`Skipping ${completedIds.length} already-completed test(s).`);
     }
   }
@@ -1204,9 +1208,9 @@ export async function runEvalCommand(
   }
 
   if (totalEvalCount === 0) {
-    // When using --retry-errors, all tests being filtered means everything completed successfully
+    // When using --retry-errors, all tests being filtered means no errors or missing cases remain
     if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) {
-      console.log('All tests completed successfully in the previous run. Nothing to retry.');
+      console.log('No execution errors or missing cases in the previous run. Nothing to retry.');
       return;
     }
     throw new Error('No tests matched the provided filters.');
diff --git a/apps/cli/test/unit/retry-errors.test.ts b/apps/cli/test/unit/retry-errors.test.ts
index e76f6c4ab..ce3b28dc1 100644
--- a/apps/cli/test/unit/retry-errors.test.ts
+++ b/apps/cli/test/unit/retry-errors.test.ts
@@ -4,8 +4,9 @@ import { tmpdir } from 'node:os';
 import path from 'node:path';
 
 import {
-  loadCompletedTestIds,
+  buildExclusionFilter,
   loadErrorTestIds,
+  loadFullyCompletedTestIds,
   loadNonErrorResults,
 } from '../../src/commands/eval/retry-errors.js';
 
@@ -135,7 +136,7 @@ describe('retry-errors', () => {
     expect(ids).toEqual(['case-2']);
   });
 
-  it('loadCompletedTestIds returns only non-error test IDs', async () => {
+  it('loadFullyCompletedTestIds returns only non-error test IDs', async () => {
     const filePath = createIndexFile([
       { test_id: 'case-1', execution_status: 'ok', score: 0.9 },
       { test_id: 'case-2', execution_status: 'execution_error', score: 0, error: 'timeout' },
@@ -148,28 +149,42 @@ describe('retry-errors', () => {
       },
     ]);
 
-    const ids = await loadCompletedTestIds(filePath);
+    const ids = await loadFullyCompletedTestIds(filePath);
     expect(ids).toEqual(['case-1', 'case-3']);
   });
 
-  it('loadCompletedTestIds returns empty array when all are errors', async () => {
+  it('loadFullyCompletedTestIds returns empty array when all are errors', async () => {
     const filePath = createIndexFile([
       { test_id: 'case-1', execution_status: 'execution_error', score: 0 },
       { test_id: 'case-2', execution_status: 'execution_error', score: 0 },
     ]);
 
-    const ids = await loadCompletedTestIds(filePath);
+    const ids = await loadFullyCompletedTestIds(filePath);
     expect(ids).toEqual([]);
   });
 
-  it('loadCompletedTestIds deduplicates IDs', async () => {
+  it('loadFullyCompletedTestIds excludes IDs that errored on any target (matrix safety)', async () => {
     const filePath = createIndexFile([
-      { test_id: 'case-1', execution_status: 'ok', score: 0.9 },
-      { test_id: 'case-1', execution_status: 'ok', score: 0.8 },
+      { test_id: 'case-1', execution_status: 'ok', score: 0.9, target: 'gpt-4' },
+      { test_id: 'case-1', execution_status: 'execution_error', score: 0, target: 'claude' },
+      { test_id: 'case-2', execution_status: 'ok', score: 0.8, target: 'gpt-4' },
+      { test_id: 'case-2', execution_status: 'ok', score: 0.7, target: 'claude' },
     ]);
 
-    const ids = await loadCompletedTestIds(filePath);
-    expect(ids).toEqual(['case-1']);
+    const ids = await loadFullyCompletedTestIds(filePath);
+    // case-1 errored on claude, so it should NOT be excluded from retry
+    expect(ids).toEqual(['case-2']);
+  });
+
+  it('buildExclusionFilter escapes glob metacharacters in test IDs', () => {
+    expect(buildExclusionFilter(['simple'])).toBe('!simple');
+    expect(buildExclusionFilter(['a', 'b'])).toBe('!{a,b}');
+    // Braces, brackets, stars, question marks should be escaped
+    expect(buildExclusionFilter(['test[1]'])).toBe('!test\\[1\\]');
+    expect(buildExclusionFilter(['test{a}'])).toBe('!test\\{a\\}');
+    expect(buildExclusionFilter(['test*'])).toBe('!test\\*');
+    expect(buildExclusionFilter(['test?'])).toBe('!test\\?');
+    expect(buildExclusionFilter(['!negated'])).toBe('!\\!negated');
   });
 
   it('throws on malformed index.jsonl lines', async () => {