From 5db5935fc2621c3d11d84d3065cb8e9205664196 Mon Sep 17 00:00:00 2001 From: Christopher Date: Wed, 8 Apr 2026 13:16:20 +0000 Subject: [PATCH 1/3] feat(cli): extend --retry-errors to resume missing cases from crashed runs (#975) Previously --retry-errors only re-ran execution_error test cases. If the process crashed mid-run, cases that never started were lost. Now uses an exclusion filter (negating completed test IDs) so both error cases AND missing cases are re-run, enabling crash recovery without a new checkpoint system. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/eval/retry-errors.ts | 12 +++++++ apps/cli/src/commands/eval/run-eval.ts | 32 ++++++++++++------ apps/cli/test/unit/retry-errors.test.ts | 38 +++++++++++++++++++++- 3 files changed, 71 insertions(+), 11 deletions(-) diff --git a/apps/cli/src/commands/eval/retry-errors.ts b/apps/cli/src/commands/eval/retry-errors.ts index 65f76c2ea..7ccdaaf8f 100644 --- a/apps/cli/src/commands/eval/retry-errors.ts +++ b/apps/cli/src/commands/eval/retry-errors.ts @@ -17,6 +17,18 @@ export async function loadErrorTestIds(jsonlPath: string): Promise { + const ids = (await loadRetrySourceResults(jsonlPath)) + .filter((result) => result.testId && result.executionStatus !== 'execution_error') + .map((result) => result.testId); + + return [...new Set(ids)]; +} + /** * Load results from an index/results source that do NOT have executionStatus === 'execution_error'. * These are the "good" results that should be preserved when merging retry output. diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 4b052d32c..0015503c6 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -34,7 +34,7 @@ import { loadEnvFromHierarchy } from './env.js'; import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js'; import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js'; import { buildDefaultRunDir } from './result-layout.js'; -import { loadErrorTestIds, loadNonErrorResults } from './retry-errors.js'; +import { loadCompletedTestIds, loadErrorTestIds, loadNonErrorResults } from './retry-errors.js'; import { saveRunCache } from './run-cache.js'; import { findRepoRoot } from './shared.js'; import { @@ -895,23 +895,30 @@ export async function runEvalCommand( throw new Error('--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)'); } - // --retry-errors: override filter to only re-run execution_error test cases. + // --retry-errors: resume from a previous run by re-running execution_error and missing test cases. + // Uses an exclusion filter to skip already-completed (non-error) cases, which naturally includes + // both error cases and cases that never ran (e.g., due to a crash or interrupt). // IMPORTANT: JSONL must be fully loaded here, before the output writer is created below, // since the retry source and output destination may refer to the same file. let retryNonErrorResults: readonly EvaluationResult[] | undefined; if (options.retryErrors) { const retryPath = path.resolve(options.retryErrors); await ensureFileExists(retryPath, 'Retry-errors JSONL file'); + const completedIds = await loadCompletedTestIds(retryPath); const errorIds = await loadErrorTestIds(retryPath); - if (errorIds.length === 0) { - console.log('No execution errors found in the previous output. Nothing to retry.'); - return; - } - console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`); - // Override the filter to match only error test IDs using micromatch brace expansion - const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(',')}}`; - options = { ...options, filter: filterPattern }; retryNonErrorResults = await loadNonErrorResults(retryPath); + + if (errorIds.length > 0) { + console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`); + } + // Use a negation filter to exclude completed (non-error) cases. + // This re-runs both error cases and any cases missing from the previous output (crash recovery). + if (completedIds.length > 0) { + const excludePattern = + completedIds.length === 1 ? `!${completedIds[0]}` : `!{${completedIds.join(',')}}`; + options = { ...options, filter: excludePattern }; + console.log(`Skipping ${completedIds.length} already-completed test(s).`); + } } // Validate static workspace path exists and is a directory @@ -1197,6 +1204,11 @@ export async function runEvalCommand( } if (totalEvalCount === 0) { + // When using --retry-errors, all tests being filtered means everything completed successfully + if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) { + console.log('All tests completed successfully in the previous run. Nothing to retry.'); + return; + } throw new Error('No tests matched the provided filters.'); } const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose }); diff --git a/apps/cli/test/unit/retry-errors.test.ts b/apps/cli/test/unit/retry-errors.test.ts index 9aca5b16b..1bd644c96 100644 --- a/apps/cli/test/unit/retry-errors.test.ts +++ b/apps/cli/test/unit/retry-errors.test.ts @@ -3,7 +3,11 @@ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; -import { loadErrorTestIds, loadNonErrorResults } from '../../src/commands/eval/retry-errors.js'; +import { + loadCompletedTestIds, + loadErrorTestIds, + loadNonErrorResults, +} from '../../src/commands/eval/retry-errors.js'; describe('retry-errors', () => { let tmpDir: string; @@ -131,6 +135,38 @@ describe('retry-errors', () => { expect(ids).toEqual(['case-2']); }); + it('loadCompletedTestIds returns only non-error test IDs', async () => { + const filePath = createIndexFile([ + { test_id: 'case-1', execution_status: 'ok', score: 0.9 }, + { test_id: 'case-2', execution_status: 'execution_error', score: 0, error: 'timeout' }, + { test_id: 'case-3', execution_status: 'quality_failure', score: 0.3 }, + { test_id: 'case-4', execution_status: 'execution_error', score: 0, error: 'provider failed' }, + ]); + + const ids = await loadCompletedTestIds(filePath); + expect(ids).toEqual(['case-1', 'case-3']); + }); + + it('loadCompletedTestIds returns empty array when all are errors', async () => { + const filePath = createIndexFile([ + { test_id: 'case-1', execution_status: 'execution_error', score: 0 }, + { test_id: 'case-2', execution_status: 'execution_error', score: 0 }, + ]); + + const ids = await loadCompletedTestIds(filePath); + expect(ids).toEqual([]); + }); + + it('loadCompletedTestIds deduplicates IDs', async () => { + const filePath = createIndexFile([ + { test_id: 'case-1', execution_status: 'ok', score: 0.9 }, + { test_id: 'case-1', execution_status: 'ok', score: 0.8 }, + ]); + + const ids = await loadCompletedTestIds(filePath); + expect(ids).toEqual(['case-1']); + }); + it('throws on malformed index.jsonl lines', async () => { tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-test-')); const filePath = path.join(tmpDir, 'index.jsonl'); From 741b2cd7419889b85b30cc68e90d0687a9fc8efa Mon Sep 17 00:00:00 2001 From: Christopher Date: Wed, 8 Apr 2026 13:17:40 +0000 Subject: [PATCH 2/3] style: fix lint formatting in retry-errors test Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/test/unit/retry-errors.test.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/apps/cli/test/unit/retry-errors.test.ts b/apps/cli/test/unit/retry-errors.test.ts index 1bd644c96..e76f6c4ab 100644 --- a/apps/cli/test/unit/retry-errors.test.ts +++ b/apps/cli/test/unit/retry-errors.test.ts @@ -140,7 +140,12 @@ describe('retry-errors', () => { { test_id: 'case-1', execution_status: 'ok', score: 0.9 }, { test_id: 'case-2', execution_status: 'execution_error', score: 0, error: 'timeout' }, { test_id: 'case-3', execution_status: 'quality_failure', score: 0.3 }, - { test_id: 'case-4', execution_status: 'execution_error', score: 0, error: 'provider failed' }, + { + test_id: 'case-4', + execution_status: 'execution_error', + score: 0, + error: 'provider failed', + }, ]); const ids = await loadCompletedTestIds(filePath); From 9b392e8d4fce86e0e61e4f8bdb713a1ef93b4c8c Mon Sep 17 00:00:00 2001 From: Christopher Date: Wed, 8 Apr 2026 13:39:19 +0000 Subject: [PATCH 3/3] fix(cli): address code review findings for --retry-errors resume 1. Matrix safety: loadFullyCompletedTestIds only excludes test IDs where ALL results are non-error. If case-1 succeeded on target A but errored on target B, case-1 re-runs on both targets (conservative). 2. Glob escaping: buildExclusionFilter escapes micromatch metacharacters in test IDs so IDs with *, ?, [], {}, ! match literally. 3. Early-return message: changed from "All tests completed successfully" to "No execution errors or missing cases" since non-error results include quality_failure. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/eval/retry-errors.ts | 42 ++++++++++++++++++---- apps/cli/src/commands/eval/run-eval.ts | 22 +++++++----- apps/cli/test/unit/retry-errors.test.ts | 35 ++++++++++++------ 3 files changed, 73 insertions(+), 26 deletions(-) diff --git a/apps/cli/src/commands/eval/retry-errors.ts b/apps/cli/src/commands/eval/retry-errors.ts index 7ccdaaf8f..37e647533 100644 --- a/apps/cli/src/commands/eval/retry-errors.ts +++ b/apps/cli/src/commands/eval/retry-errors.ts @@ -6,6 +6,14 @@ async function loadRetrySourceResults(jsonlPath: string): Promise { - const ids = (await loadRetrySourceResults(jsonlPath)) - .filter((result) => result.testId && result.executionStatus !== 'execution_error') - .map((result) => result.testId); +export async function loadFullyCompletedTestIds(jsonlPath: string): Promise { + const results = await loadRetrySourceResults(jsonlPath); + const allIds = new Set(); + const errorIds = new Set(); - return [...new Set(ids)]; + for (const result of results) { + if (!result.testId) continue; + allIds.add(result.testId); + if (result.executionStatus === 'execution_error') { + errorIds.add(result.testId); + } + } + + // Only IDs where every result is non-error + return [...allIds].filter((id) => !errorIds.has(id)); +} + +/** + * Build a micromatch negation pattern that excludes the given test IDs. + * Escapes glob metacharacters in IDs to ensure literal matching. + */ +export function buildExclusionFilter(completedIds: readonly string[]): string { + const escaped = completedIds.map(escapeGlob); + return escaped.length === 1 ? `!${escaped[0]}` : `!{${escaped.join(',')}}`; } /** diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 0015503c6..471293a52 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -34,7 +34,12 @@ import { loadEnvFromHierarchy } from './env.js'; import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js'; import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js'; import { buildDefaultRunDir } from './result-layout.js'; -import { loadCompletedTestIds, loadErrorTestIds, loadNonErrorResults } from './retry-errors.js'; +import { + buildExclusionFilter, + loadErrorTestIds, + loadFullyCompletedTestIds, + loadNonErrorResults, +} from './retry-errors.js'; import { saveRunCache } from './run-cache.js'; import { findRepoRoot } from './shared.js'; import { @@ -904,19 +909,18 @@ export async function runEvalCommand( if (options.retryErrors) { const retryPath = path.resolve(options.retryErrors); await ensureFileExists(retryPath, 'Retry-errors JSONL file'); - const completedIds = await loadCompletedTestIds(retryPath); + const completedIds = await loadFullyCompletedTestIds(retryPath); const errorIds = await loadErrorTestIds(retryPath); retryNonErrorResults = await loadNonErrorResults(retryPath); if (errorIds.length > 0) { console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`); } - // Use a negation filter to exclude completed (non-error) cases. - // This re-runs both error cases and any cases missing from the previous output (crash recovery). + // Use a negation filter to exclude fully-completed (non-error across all targets) cases. + // This re-runs error cases, cases missing from the output (crash recovery), and cases + // that errored on some targets even if they succeeded on others (matrix safety). if (completedIds.length > 0) { - const excludePattern = - completedIds.length === 1 ? `!${completedIds[0]}` : `!{${completedIds.join(',')}}`; - options = { ...options, filter: excludePattern }; + options = { ...options, filter: buildExclusionFilter(completedIds) }; console.log(`Skipping ${completedIds.length} already-completed test(s).`); } } @@ -1204,9 +1208,9 @@ export async function runEvalCommand( } if (totalEvalCount === 0) { - // When using --retry-errors, all tests being filtered means everything completed successfully + // When using --retry-errors, all tests being filtered means no errors or missing cases remain if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) { - console.log('All tests completed successfully in the previous run. Nothing to retry.'); + console.log('No execution errors or missing cases in the previous run. Nothing to retry.'); return; } throw new Error('No tests matched the provided filters.'); diff --git a/apps/cli/test/unit/retry-errors.test.ts b/apps/cli/test/unit/retry-errors.test.ts index e76f6c4ab..ce3b28dc1 100644 --- a/apps/cli/test/unit/retry-errors.test.ts +++ b/apps/cli/test/unit/retry-errors.test.ts @@ -4,8 +4,9 @@ import { tmpdir } from 'node:os'; import path from 'node:path'; import { - loadCompletedTestIds, + buildExclusionFilter, loadErrorTestIds, + loadFullyCompletedTestIds, loadNonErrorResults, } from '../../src/commands/eval/retry-errors.js'; @@ -135,7 +136,7 @@ describe('retry-errors', () => { expect(ids).toEqual(['case-2']); }); - it('loadCompletedTestIds returns only non-error test IDs', async () => { + it('loadFullyCompletedTestIds returns only non-error test IDs', async () => { const filePath = createIndexFile([ { test_id: 'case-1', execution_status: 'ok', score: 0.9 }, { test_id: 'case-2', execution_status: 'execution_error', score: 0, error: 'timeout' }, @@ -148,28 +149,42 @@ describe('retry-errors', () => { }, ]); - const ids = await loadCompletedTestIds(filePath); + const ids = await loadFullyCompletedTestIds(filePath); expect(ids).toEqual(['case-1', 'case-3']); }); - it('loadCompletedTestIds returns empty array when all are errors', async () => { + it('loadFullyCompletedTestIds returns empty array when all are errors', async () => { const filePath = createIndexFile([ { test_id: 'case-1', execution_status: 'execution_error', score: 0 }, { test_id: 'case-2', execution_status: 'execution_error', score: 0 }, ]); - const ids = await loadCompletedTestIds(filePath); + const ids = await loadFullyCompletedTestIds(filePath); expect(ids).toEqual([]); }); - it('loadCompletedTestIds deduplicates IDs', async () => { + it('loadFullyCompletedTestIds excludes IDs that errored on any target (matrix safety)', async () => { const filePath = createIndexFile([ - { test_id: 'case-1', execution_status: 'ok', score: 0.9 }, - { test_id: 'case-1', execution_status: 'ok', score: 0.8 }, + { test_id: 'case-1', execution_status: 'ok', score: 0.9, target: 'gpt-4' }, + { test_id: 'case-1', execution_status: 'execution_error', score: 0, target: 'claude' }, + { test_id: 'case-2', execution_status: 'ok', score: 0.8, target: 'gpt-4' }, + { test_id: 'case-2', execution_status: 'ok', score: 0.7, target: 'claude' }, ]); - const ids = await loadCompletedTestIds(filePath); - expect(ids).toEqual(['case-1']); + const ids = await loadFullyCompletedTestIds(filePath); + // case-1 errored on claude, so it should NOT be excluded from retry + expect(ids).toEqual(['case-2']); + }); + + it('buildExclusionFilter escapes glob metacharacters in test IDs', () => { + expect(buildExclusionFilter(['simple'])).toBe('!simple'); + expect(buildExclusionFilter(['a', 'b'])).toBe('!{a,b}'); + // Braces, brackets, stars, question marks should be escaped + expect(buildExclusionFilter(['test[1]'])).toBe('!test\\[1\\]'); + expect(buildExclusionFilter(['test{a}'])).toBe('!test\\{a\\}'); + expect(buildExclusionFilter(['test*'])).toBe('!test\\*'); + expect(buildExclusionFilter(['test?'])).toBe('!test\\?'); + expect(buildExclusionFilter(['!negated'])).toBe('!\\!negated'); }); it('throws on malformed index.jsonl lines', async () => {