diff --git a/apps/cli/src/commands/eval/retry-errors.ts b/apps/cli/src/commands/eval/retry-errors.ts index 65f76c2ea..37e647533 100644 --- a/apps/cli/src/commands/eval/retry-errors.ts +++ b/apps/cli/src/commands/eval/retry-errors.ts @@ -6,6 +6,14 @@ async function loadRetrySourceResults(jsonlPath: string): Promise { + const results = await loadRetrySourceResults(jsonlPath); + const allIds = new Set(); + const errorIds = new Set(); + + for (const result of results) { + if (!result.testId) continue; + allIds.add(result.testId); + if (result.executionStatus === 'execution_error') { + errorIds.add(result.testId); + } + } + + // Only IDs where every result is non-error + return [...allIds].filter((id) => !errorIds.has(id)); +} + +/** + * Build a micromatch negation pattern that excludes the given test IDs. + * Escapes glob metacharacters in IDs to ensure literal matching. + */ +export function buildExclusionFilter(completedIds: readonly string[]): string { + const escaped = completedIds.map(escapeGlob); + return escaped.length === 1 ? `!${escaped[0]}` : `!{${escaped.join(',')}}`; +} + /** * Load results from an index/results source that do NOT have executionStatus === 'execution_error'. * These are the "good" results that should be preserved when merging retry output. diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 4b052d32c..471293a52 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -34,7 +34,12 @@ import { loadEnvFromHierarchy } from './env.js'; import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js'; import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js'; import { buildDefaultRunDir } from './result-layout.js'; -import { loadErrorTestIds, loadNonErrorResults } from './retry-errors.js'; +import { + buildExclusionFilter, + loadErrorTestIds, + loadFullyCompletedTestIds, + loadNonErrorResults, +} from './retry-errors.js'; import { saveRunCache } from './run-cache.js'; import { findRepoRoot } from './shared.js'; import { @@ -895,23 +900,29 @@ export async function runEvalCommand( throw new Error('--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)'); } - // --retry-errors: override filter to only re-run execution_error test cases. + // --retry-errors: resume from a previous run by re-running execution_error and missing test cases. + // Uses an exclusion filter to skip already-completed (non-error) cases, which naturally includes + // both error cases and cases that never ran (e.g., due to a crash or interrupt). // IMPORTANT: JSONL must be fully loaded here, before the output writer is created below, // since the retry source and output destination may refer to the same file. let retryNonErrorResults: readonly EvaluationResult[] | undefined; if (options.retryErrors) { const retryPath = path.resolve(options.retryErrors); await ensureFileExists(retryPath, 'Retry-errors JSONL file'); + const completedIds = await loadFullyCompletedTestIds(retryPath); const errorIds = await loadErrorTestIds(retryPath); - if (errorIds.length === 0) { - console.log('No execution errors found in the previous output. Nothing to retry.'); - return; - } - console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`); - // Override the filter to match only error test IDs using micromatch brace expansion - const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(',')}}`; - options = { ...options, filter: filterPattern }; retryNonErrorResults = await loadNonErrorResults(retryPath); + + if (errorIds.length > 0) { + console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`); + } + // Use a negation filter to exclude fully-completed (non-error across all targets) cases. + // This re-runs error cases, cases missing from the output (crash recovery), and cases + // that errored on some targets even if they succeeded on others (matrix safety). + if (completedIds.length > 0) { + options = { ...options, filter: buildExclusionFilter(completedIds) }; + console.log(`Skipping ${completedIds.length} already-completed test(s).`); + } } // Validate static workspace path exists and is a directory @@ -1197,6 +1208,11 @@ export async function runEvalCommand( } if (totalEvalCount === 0) { + // When using --retry-errors, all tests being filtered means no errors or missing cases remain + if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) { + console.log('No execution errors or missing cases in the previous run. Nothing to retry.'); + return; + } throw new Error('No tests matched the provided filters.'); } const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose }); diff --git a/apps/cli/test/unit/retry-errors.test.ts b/apps/cli/test/unit/retry-errors.test.ts index 9aca5b16b..ce3b28dc1 100644 --- a/apps/cli/test/unit/retry-errors.test.ts +++ b/apps/cli/test/unit/retry-errors.test.ts @@ -3,7 +3,12 @@ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; -import { loadErrorTestIds, loadNonErrorResults } from '../../src/commands/eval/retry-errors.js'; +import { + buildExclusionFilter, + loadErrorTestIds, + loadFullyCompletedTestIds, + loadNonErrorResults, +} from '../../src/commands/eval/retry-errors.js'; describe('retry-errors', () => { let tmpDir: string; @@ -131,6 +136,57 @@ describe('retry-errors', () => { expect(ids).toEqual(['case-2']); }); + it('loadFullyCompletedTestIds returns only non-error test IDs', async () => { + const filePath = createIndexFile([ + { test_id: 'case-1', execution_status: 'ok', score: 0.9 }, + { test_id: 'case-2', execution_status: 'execution_error', score: 0, error: 'timeout' }, + { test_id: 'case-3', execution_status: 'quality_failure', score: 0.3 }, + { + test_id: 'case-4', + execution_status: 'execution_error', + score: 0, + error: 'provider failed', + }, + ]); + + const ids = await loadFullyCompletedTestIds(filePath); + expect(ids).toEqual(['case-1', 'case-3']); + }); + + it('loadFullyCompletedTestIds returns empty array when all are errors', async () => { + const filePath = createIndexFile([ + { test_id: 'case-1', execution_status: 'execution_error', score: 0 }, + { test_id: 'case-2', execution_status: 'execution_error', score: 0 }, + ]); + + const ids = await loadFullyCompletedTestIds(filePath); + expect(ids).toEqual([]); + }); + + it('loadFullyCompletedTestIds excludes IDs that errored on any target (matrix safety)', async () => { + const filePath = createIndexFile([ + { test_id: 'case-1', execution_status: 'ok', score: 0.9, target: 'gpt-4' }, + { test_id: 'case-1', execution_status: 'execution_error', score: 0, target: 'claude' }, + { test_id: 'case-2', execution_status: 'ok', score: 0.8, target: 'gpt-4' }, + { test_id: 'case-2', execution_status: 'ok', score: 0.7, target: 'claude' }, + ]); + + const ids = await loadFullyCompletedTestIds(filePath); + // case-1 errored on claude, so it should NOT be excluded from retry + expect(ids).toEqual(['case-2']); + }); + + it('buildExclusionFilter escapes glob metacharacters in test IDs', () => { + expect(buildExclusionFilter(['simple'])).toBe('!simple'); + expect(buildExclusionFilter(['a', 'b'])).toBe('!{a,b}'); + // Braces, brackets, stars, question marks should be escaped + expect(buildExclusionFilter(['test[1]'])).toBe('!test\\[1\\]'); + expect(buildExclusionFilter(['test{a}'])).toBe('!test\\{a\\}'); + expect(buildExclusionFilter(['test*'])).toBe('!test\\*'); + expect(buildExclusionFilter(['test?'])).toBe('!test\\?'); + expect(buildExclusionFilter(['!negated'])).toBe('!\\!negated'); + }); + it('throws on malformed index.jsonl lines', async () => { tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-test-')); const filePath = path.join(tmpDir, 'index.jsonl');