Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions apps/cli/src/commands/eval/retry-errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@ async function loadRetrySourceResults(jsonlPath: string): Promise<readonly Evalu
return loadManifestResults(resolveResultSourcePath(jsonlPath));
}

/**
* Escape micromatch glob metacharacters in a string so it matches literally.
* Characters escaped: * ? [ ] { } ( ) ! @ # + |
*/
function escapeGlob(id: string): string {
return id.replace(/[*?[\]{}()!@#+|\\]/g, '\\$&');
}

/**
* Load test IDs from an index/results source that have executionStatus === 'execution_error'.
*/
Expand All @@ -17,6 +25,38 @@ export async function loadErrorTestIds(jsonlPath: string): Promise<readonly stri
return [...new Set(ids)];
}

/**
* Load test IDs that are fully completed (non-error) across ALL targets.
* A test ID is only considered "completed" if every result for that ID is non-error.
* This is conservative for matrix runs: if case-1 succeeded on target A but errored
* on target B, case-1 is NOT excluded (it will re-run on both targets).
*/
export async function loadFullyCompletedTestIds(jsonlPath: string): Promise<readonly string[]> {
const results = await loadRetrySourceResults(jsonlPath);
const allIds = new Set<string>();
const errorIds = new Set<string>();

for (const result of results) {
if (!result.testId) continue;
allIds.add(result.testId);
if (result.executionStatus === 'execution_error') {
errorIds.add(result.testId);
}
}

// Only IDs where every result is non-error
return [...allIds].filter((id) => !errorIds.has(id));
}

/**
* Build a micromatch negation pattern that excludes the given test IDs.
* Escapes glob metacharacters in IDs to ensure literal matching.
*/
export function buildExclusionFilter(completedIds: readonly string[]): string {
const escaped = completedIds.map(escapeGlob);
return escaped.length === 1 ? `!${escaped[0]}` : `!{${escaped.join(',')}}`;
}

/**
* Load results from an index/results source that do NOT have executionStatus === 'execution_error'.
* These are the "good" results that should be preserved when merging retry output.
Expand Down
36 changes: 26 additions & 10 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,12 @@ import { loadEnvFromHierarchy } from './env.js';
import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
import { buildDefaultRunDir } from './result-layout.js';
import { loadErrorTestIds, loadNonErrorResults } from './retry-errors.js';
import {
buildExclusionFilter,
loadErrorTestIds,
loadFullyCompletedTestIds,
loadNonErrorResults,
} from './retry-errors.js';
import { saveRunCache } from './run-cache.js';
import { findRepoRoot } from './shared.js';
import {
Expand Down Expand Up @@ -895,23 +900,29 @@ export async function runEvalCommand(
throw new Error('--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)');
}

// --retry-errors: override filter to only re-run execution_error test cases.
// --retry-errors: resume from a previous run by re-running execution_error and missing test cases.
// Uses an exclusion filter to skip already-completed (non-error) cases, which naturally includes
// both error cases and cases that never ran (e.g., due to a crash or interrupt).
// IMPORTANT: JSONL must be fully loaded here, before the output writer is created below,
// since the retry source and output destination may refer to the same file.
let retryNonErrorResults: readonly EvaluationResult[] | undefined;
if (options.retryErrors) {
const retryPath = path.resolve(options.retryErrors);
await ensureFileExists(retryPath, 'Retry-errors JSONL file');
const completedIds = await loadFullyCompletedTestIds(retryPath);
const errorIds = await loadErrorTestIds(retryPath);
if (errorIds.length === 0) {
console.log('No execution errors found in the previous output. Nothing to retry.');
return;
}
console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
// Override the filter to match only error test IDs using micromatch brace expansion
const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(',')}}`;
options = { ...options, filter: filterPattern };
retryNonErrorResults = await loadNonErrorResults(retryPath);

if (errorIds.length > 0) {
console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
}
// Use a negation filter to exclude fully-completed (non-error across all targets) cases.
// This re-runs error cases, cases missing from the output (crash recovery), and cases
// that errored on some targets even if they succeeded on others (matrix safety).
if (completedIds.length > 0) {
options = { ...options, filter: buildExclusionFilter(completedIds) };
console.log(`Skipping ${completedIds.length} already-completed test(s).`);
}
}

// Validate static workspace path exists and is a directory
Expand Down Expand Up @@ -1197,6 +1208,11 @@ export async function runEvalCommand(
}

if (totalEvalCount === 0) {
// When using --retry-errors, all tests being filtered means no errors or missing cases remain
if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) {
console.log('No execution errors or missing cases in the previous run. Nothing to retry.');
return;
}
throw new Error('No tests matched the provided filters.');
}
const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
Expand Down
58 changes: 57 additions & 1 deletion apps/cli/test/unit/retry-errors.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import path from 'node:path';

import { loadErrorTestIds, loadNonErrorResults } from '../../src/commands/eval/retry-errors.js';
import {
buildExclusionFilter,
loadErrorTestIds,
loadFullyCompletedTestIds,
loadNonErrorResults,
} from '../../src/commands/eval/retry-errors.js';

describe('retry-errors', () => {
let tmpDir: string;
Expand Down Expand Up @@ -131,6 +136,57 @@ describe('retry-errors', () => {
expect(ids).toEqual(['case-2']);
});

it('loadFullyCompletedTestIds returns only non-error test IDs', async () => {
const filePath = createIndexFile([
{ test_id: 'case-1', execution_status: 'ok', score: 0.9 },
{ test_id: 'case-2', execution_status: 'execution_error', score: 0, error: 'timeout' },
{ test_id: 'case-3', execution_status: 'quality_failure', score: 0.3 },
{
test_id: 'case-4',
execution_status: 'execution_error',
score: 0,
error: 'provider failed',
},
]);

const ids = await loadFullyCompletedTestIds(filePath);
expect(ids).toEqual(['case-1', 'case-3']);
});

it('loadFullyCompletedTestIds returns empty array when all are errors', async () => {
const filePath = createIndexFile([
{ test_id: 'case-1', execution_status: 'execution_error', score: 0 },
{ test_id: 'case-2', execution_status: 'execution_error', score: 0 },
]);

const ids = await loadFullyCompletedTestIds(filePath);
expect(ids).toEqual([]);
});

it('loadFullyCompletedTestIds excludes IDs that errored on any target (matrix safety)', async () => {
const filePath = createIndexFile([
{ test_id: 'case-1', execution_status: 'ok', score: 0.9, target: 'gpt-4' },
{ test_id: 'case-1', execution_status: 'execution_error', score: 0, target: 'claude' },
{ test_id: 'case-2', execution_status: 'ok', score: 0.8, target: 'gpt-4' },
{ test_id: 'case-2', execution_status: 'ok', score: 0.7, target: 'claude' },
]);

const ids = await loadFullyCompletedTestIds(filePath);
// case-1 errored on claude, so it should NOT be excluded from retry
expect(ids).toEqual(['case-2']);
});

it('buildExclusionFilter escapes glob metacharacters in test IDs', () => {
expect(buildExclusionFilter(['simple'])).toBe('!simple');
expect(buildExclusionFilter(['a', 'b'])).toBe('!{a,b}');
// Braces, brackets, stars, question marks should be escaped
expect(buildExclusionFilter(['test[1]'])).toBe('!test\\[1\\]');
expect(buildExclusionFilter(['test{a}'])).toBe('!test\\{a\\}');
expect(buildExclusionFilter(['test*'])).toBe('!test\\*');
expect(buildExclusionFilter(['test?'])).toBe('!test\\?');
expect(buildExclusionFilter(['!negated'])).toBe('!\\!negated');
});

it('throws on malformed index.jsonl lines', async () => {
tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-test-'));
const filePath = path.join(tmpDir, 'index.jsonl');
Expand Down
Loading