EntityProcess · christso · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/apps/cli/src/commands/eval/retry-errors.ts b/apps/cli/src/commands/eval/retry-errors.ts
@@ -6,6 +6,14 @@ async function loadRetrySourceResults(jsonlPath: string): Promise<readonly Evalu
   return loadManifestResults(resolveResultSourcePath(jsonlPath));
 }
 
+/**
+ * Escape micromatch glob metacharacters in a string so it matches literally.
+ * Characters escaped: * ? [ ] { } ( ) ! @ # + |
+ */
+function escapeGlob(id: string): string {
+  return id.replace(/[*?[\]{}()!@#+|\\]/g, '\\$&');
+}
+
 /**
  * Load test IDs from an index/results source that have executionStatus === 'execution_error'.
  */
@@ -17,6 +25,38 @@ export async function loadErrorTestIds(jsonlPath: string): Promise<readonly stri
   return [...new Set(ids)];
 }
 
+/**
+ * Load test IDs that are fully completed (non-error) across ALL targets.
+ * A test ID is only considered "completed" if every result for that ID is non-error.
+ * This is conservative for matrix runs: if case-1 succeeded on target A but errored
+ * on target B, case-1 is NOT excluded (it will re-run on both targets).
+ */
+export async function loadFullyCompletedTestIds(jsonlPath: string): Promise<readonly string[]> {
+  const results = await loadRetrySourceResults(jsonlPath);
+  const allIds = new Set<string>();
+  const errorIds = new Set<string>();
+
+  for (const result of results) {
+    if (!result.testId) continue;
+    allIds.add(result.testId);
+    if (result.executionStatus === 'execution_error') {
+      errorIds.add(result.testId);
+    }
+  }
+
+  // Only IDs where every result is non-error
+  return [...allIds].filter((id) => !errorIds.has(id));
+}
+
+/**
+ * Build a micromatch negation pattern that excludes the given test IDs.
+ * Escapes glob metacharacters in IDs to ensure literal matching.
+ */
+export function buildExclusionFilter(completedIds: readonly string[]): string {
+  const escaped = completedIds.map(escapeGlob);
+  return escaped.length === 1 ? `!${escaped[0]}` : `!{${escaped.join(',')}}`;
+}
+
 /**
  * Load results from an index/results source that do NOT have executionStatus === 'execution_error'.
  * These are the "good" results that should be preserved when merging retry output.

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -34,7 +34,12 @@ import { loadEnvFromHierarchy } from './env.js';
 import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
 import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
 import { buildDefaultRunDir } from './result-layout.js';
-import { loadErrorTestIds, loadNonErrorResults } from './retry-errors.js';
+import {
+  buildExclusionFilter,
+  loadErrorTestIds,
+  loadFullyCompletedTestIds,
+  loadNonErrorResults,
+} from './retry-errors.js';
 import { saveRunCache } from './run-cache.js';
 import { findRepoRoot } from './shared.js';
 import {
@@ -895,23 +900,29 @@ export async function runEvalCommand(
     throw new Error('--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)');
   }
 
-  // --retry-errors: override filter to only re-run execution_error test cases.
+  // --retry-errors: resume from a previous run by re-running execution_error and missing test cases.
+  // Uses an exclusion filter to skip already-completed (non-error) cases, which naturally includes
+  // both error cases and cases that never ran (e.g., due to a crash or interrupt).
   // IMPORTANT: JSONL must be fully loaded here, before the output writer is created below,
   // since the retry source and output destination may refer to the same file.
   let retryNonErrorResults: readonly EvaluationResult[] | undefined;
   if (options.retryErrors) {
     const retryPath = path.resolve(options.retryErrors);
     await ensureFileExists(retryPath, 'Retry-errors JSONL file');
+    const completedIds = await loadFullyCompletedTestIds(retryPath);
     const errorIds = await loadErrorTestIds(retryPath);
-    if (errorIds.length === 0) {
-      console.log('No execution errors found in the previous output. Nothing to retry.');
-      return;
-    }
-    console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
-    // Override the filter to match only error test IDs using micromatch brace expansion
-    const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(',')}}`;
-    options = { ...options, filter: filterPattern };
     retryNonErrorResults = await loadNonErrorResults(retryPath);
+
+    if (errorIds.length > 0) {
+      console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
+    }
+    // Use a negation filter to exclude fully-completed (non-error across all targets) cases.
+    // This re-runs error cases, cases missing from the output (crash recovery), and cases
+    // that errored on some targets even if they succeeded on others (matrix safety).
+    if (completedIds.length > 0) {
+      options = { ...options, filter: buildExclusionFilter(completedIds) };
+      console.log(`Skipping ${completedIds.length} already-completed test(s).`);
+    }
   }
 
   // Validate static workspace path exists and is a directory
@@ -1197,6 +1208,11 @@ export async function runEvalCommand(
   }
 
   if (totalEvalCount === 0) {
+    // When using --retry-errors, all tests being filtered means no errors or missing cases remain
+    if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) {
+      console.log('No execution errors or missing cases in the previous run. Nothing to retry.');
+      return;
+    }
     throw new Error('No tests matched the provided filters.');
   }
   const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });

diff --git a/apps/cli/test/unit/retry-errors.test.ts b/apps/cli/test/unit/retry-errors.test.ts
@@ -3,7 +3,12 @@ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
 import { tmpdir } from 'node:os';
 import path from 'node:path';
 
-import { loadErrorTestIds, loadNonErrorResults } from '../../src/commands/eval/retry-errors.js';
+import {
+  buildExclusionFilter,
+  loadErrorTestIds,
+  loadFullyCompletedTestIds,
+  loadNonErrorResults,
+} from '../../src/commands/eval/retry-errors.js';
 
 describe('retry-errors', () => {
   let tmpDir: string;
@@ -131,6 +136,57 @@ describe('retry-errors', () => {
     expect(ids).toEqual(['case-2']);
   });
 
+  it('loadFullyCompletedTestIds returns only non-error test IDs', async () => {
+    const filePath = createIndexFile([
+      { test_id: 'case-1', execution_status: 'ok', score: 0.9 },
+      { test_id: 'case-2', execution_status: 'execution_error', score: 0, error: 'timeout' },
+      { test_id: 'case-3', execution_status: 'quality_failure', score: 0.3 },
+      {
+        test_id: 'case-4',
+        execution_status: 'execution_error',
+        score: 0,
+        error: 'provider failed',
+      },
+    ]);
+
+    const ids = await loadFullyCompletedTestIds(filePath);
+    expect(ids).toEqual(['case-1', 'case-3']);
+  });
+
+  it('loadFullyCompletedTestIds returns empty array when all are errors', async () => {
+    const filePath = createIndexFile([
+      { test_id: 'case-1', execution_status: 'execution_error', score: 0 },
+      { test_id: 'case-2', execution_status: 'execution_error', score: 0 },
+    ]);
+
+    const ids = await loadFullyCompletedTestIds(filePath);
+    expect(ids).toEqual([]);
+  });
+
+  it('loadFullyCompletedTestIds excludes IDs that errored on any target (matrix safety)', async () => {
+    const filePath = createIndexFile([
+      { test_id: 'case-1', execution_status: 'ok', score: 0.9, target: 'gpt-4' },
+      { test_id: 'case-1', execution_status: 'execution_error', score: 0, target: 'claude' },
+      { test_id: 'case-2', execution_status: 'ok', score: 0.8, target: 'gpt-4' },
+      { test_id: 'case-2', execution_status: 'ok', score: 0.7, target: 'claude' },
+    ]);
+
+    const ids = await loadFullyCompletedTestIds(filePath);
+    // case-1 errored on claude, so it should NOT be excluded from retry
+    expect(ids).toEqual(['case-2']);
+  });
+
+  it('buildExclusionFilter escapes glob metacharacters in test IDs', () => {
+    expect(buildExclusionFilter(['simple'])).toBe('!simple');
+    expect(buildExclusionFilter(['a', 'b'])).toBe('!{a,b}');
+    // Braces, brackets, stars, question marks should be escaped
+    expect(buildExclusionFilter(['test[1]'])).toBe('!test\\[1\\]');
+    expect(buildExclusionFilter(['test{a}'])).toBe('!test\\{a\\}');
+    expect(buildExclusionFilter(['test*'])).toBe('!test\\*');
+    expect(buildExclusionFilter(['test?'])).toBe('!test\\?');
+    expect(buildExclusionFilter(['!negated'])).toBe('!\\!negated');
+  });
+
   it('throws on malformed index.jsonl lines', async () => {
     tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-test-'));
     const filePath = path.join(tmpDir, 'index.jsonl');