Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 54 additions & 14 deletions apps/cli/src/commands/eval/interactive.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { existsSync } from 'node:fs';
import path from 'node:path';
import { listTargetNames, readTargetDefinitions } from '@agentv/core';
import { checkbox, confirm, number, search, select } from '@inquirer/prompts';
Expand Down Expand Up @@ -42,6 +43,22 @@ export async function launchInteractiveWizard(): Promise<void> {
return;
}

if (action === 'resume' && lastConfig?.outputDir) {
const relativeDir = path.relative(cwd, lastConfig.outputDir) || lastConfig.outputDir;
console.log(`\n${ANSI_DIM}Resuming run at ${relativeDir}...${ANSI_RESET}\n`);
await executeConfig(
{
evalPaths: lastConfig.evalPaths,
target: lastConfig.target,
workers: lastConfig.workers,
dryRun: lastConfig.dryRun,
cache: lastConfig.cache,
},
{ resumeOutputDir: lastConfig.outputDir },
);
return;
}

if (action === 'rerun' && lastConfig) {
console.log(`\n${ANSI_DIM}Rerunning last configuration...${ANSI_RESET}\n`);
await executeConfig({
Expand All @@ -66,26 +83,28 @@ export async function launchInteractiveWizard(): Promise<void> {
return;
}

// Save last config
await saveLastConfig({
timestamp: new Date().toISOString(),
cwd,
evalPaths: config.evalPaths,
target: config.target,
workers: config.workers,
dryRun: config.dryRun,
cache: config.cache,
});

await executeConfig(config);
}

async function promptMainMenu(
lastConfig: LastConfig | undefined,
): Promise<'new' | 'rerun' | 'exit'> {
type MenuChoice = 'new' | 'rerun' | 'exit';
): Promise<'new' | 'rerun' | 'resume' | 'exit'> {
type MenuChoice = 'new' | 'rerun' | 'resume' | 'exit';
const choices: Array<{ name: string; value: MenuChoice; description?: string }> = [];

// Resume entry: only when the prior run has a known artifact dir with an index.jsonl
if (lastConfig?.outputDir) {
const indexPath = path.join(lastConfig.outputDir, 'index.jsonl');
if (existsSync(indexPath)) {
const dirLabel = path.basename(lastConfig.outputDir);
choices.push({
name: '⏯ Resume last run',
value: 'resume',
description: `${dirLabel} (target: ${lastConfig.target})`,
});
}
}

if (lastConfig) {
const evalCount = lastConfig.evalPaths.length;
choices.push({
Expand Down Expand Up @@ -315,12 +334,17 @@ async function promptReviewAndConfirm(config: InteractiveConfig, cwd: string): P
});
}

async function executeConfig(config: InteractiveConfig): Promise<void> {
async function executeConfig(
config: InteractiveConfig,
opts?: { resumeOutputDir?: string },
): Promise<void> {
const cwd = process.cwd();
const rawOptions: Record<string, unknown> = {
target: config.target,
workers: config.workers,
dryRun: config.dryRun,
cache: config.cache,
...(opts?.resumeOutputDir ? { output: opts.resumeOutputDir, resume: true } : {}),
dryRunDelay: 0,
dryRunDelayMin: 0,
dryRunDelayMax: 0,
Expand All @@ -337,6 +361,22 @@ async function executeConfig(config: InteractiveConfig): Promise<void> {
rawOptions,
});

// Persist config with the resolved artifact dir so the wizard can offer
// "Resume last run" on the next invocation. Done after a successful run so
// the saved outputDir always points at a real index.jsonl.
if (result) {
await saveLastConfig({
timestamp: new Date().toISOString(),
cwd,
evalPaths: config.evalPaths,
target: config.target,
workers: config.workers,
dryRun: config.dryRun,
cache: config.cache,
outputDir: path.dirname(result.outputPath),
});
}

// Prompt to retry errors when execution errors were detected in a TTY
if (result && result.executionErrorCount > 0 && process.stdin.isTTY) {
await promptRetryErrors(config, result.outputPath);
Expand Down
6 changes: 6 additions & 0 deletions apps/cli/src/commands/eval/last-config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ export interface LastConfig {
readonly workers: number;
readonly dryRun: boolean;
readonly cache: boolean;
/**
* Resolved artifact directory of the last completed wizard run. Used to
* power the wizard's "Resume last run" entry. Optional for backward
* compatibility with configs saved before this field existed.
*/
readonly outputDir?: string;
}

export async function loadLastConfig(): Promise<LastConfig | undefined> {
Expand Down
16 changes: 16 additions & 0 deletions apps/cli/src/commands/eval/run-cache.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { existsSync } from 'node:fs';
import { mkdir, readFile, writeFile } from 'node:fs/promises';
import path from 'node:path';

Expand Down Expand Up @@ -44,6 +45,21 @@ export async function loadRunCache(cwd: string): Promise<RunCache | undefined> {
}
}

/**
 * Look up the last-run directory recorded in the run cache for `cwd`.
 *
 * `--resume` / `--rerun-failed` call this to pick a default `--output` when
 * the user did not pass one, following the "latest by default" convention of
 * promptfoo (`--resume [evalId]`) and OpenCompass (`-r [timestamp]`).
 *
 * @param cwd - Working directory whose run cache should be consulted.
 * @returns The cached `lastRunDir` if it is set and still present on disk;
 *   `undefined` when no cache is loaded, the cache has no `lastRunDir`, or
 *   the recorded path no longer exists.
 */
export async function resolveCachedRunDir(cwd: string): Promise<string | undefined> {
  const lastRunDir = (await loadRunCache(cwd))?.lastRunDir;
  // A missing/empty pointer and a stale (deleted) path are treated the same way.
  return lastRunDir && existsSync(lastRunDir) ? lastRunDir : undefined;
}

export async function saveRunCache(cwd: string, resultPath: string): Promise<void> {
if (path.basename(resultPath) !== RESULT_INDEX_FILENAME) {
return;
Expand Down
18 changes: 16 additions & 2 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ import {
loadFullyCompletedTestIds,
loadNonErrorResults,
} from './retry-errors.js';
import { saveRunCache } from './run-cache.js';
import { resolveCachedRunDir, saveRunCache } from './run-cache.js';
import { findRepoRoot } from './shared.js';
import {
calculateEvaluationSummary,
Expand Down Expand Up @@ -1031,6 +1031,20 @@ export async function runEvalCommand(
}
}

// --resume / --rerun-failed without an explicit --output: default to the
// last-known run dir for this cwd from .agentv/cache.json. Matches promptfoo's
// `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default"
// convention. The cache pointer is written by saveRunCache after every eval.
if (options.resume && !options.retryErrors && !options.outputDir && !options.artifacts) {
const cachedDir = await resolveCachedRunDir(cwd);
if (cachedDir) {
options = { ...options, outputDir: cachedDir };
const flagLabel = options.rerunFailed ? 'rerun-failed' : 'resume';
const displayDir = path.relative(cwd, cachedDir) || cachedDir;
console.log(`Auto-detected last run dir for --${flagLabel}: ${displayDir}`);
}
}

// --resume / --rerun-failed: skip already-completed tests and append to existing output.
// IMPORTANT: JSONL must be loaded before the output writer is created (same file).
let resumeSkipKeys: Set<string> | undefined;
Expand Down Expand Up @@ -1059,7 +1073,7 @@ export async function runEvalCommand(
}
} else {
console.warn(
'Warning: --resume requires --output <dir> to identify the run directory. Ignoring --resume.',
'Warning: --resume requires --output <dir> (or a cached last run) to identify the run directory. Ignoring --resume.',
);
}
}
Expand Down
17 changes: 8 additions & 9 deletions apps/cli/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,11 @@ export function preprocessArgv(argv: string[]): string[] {
}
}

// Implicit `run` subcommand: `agentv eval <arg>` → `agentv eval run <arg>`
// when the first arg after `eval` is not a known eval subcommand.
// This preserves backward compatibility now that `eval` is a subcommands group.
// Implicit `run` subcommand: `agentv eval [<arg>]` → `agentv eval run [<arg>]`
// when the first arg after `eval` is absent or is not a known eval subcommand.
// Backward-compat: `eval` used to be a direct command; now it is a subcommands group.
// Bare `agentv eval` falls through to the run handler so its TTY check can launch
// the interactive wizard.
// Only applies when `eval` is the top-level subcommand.
// Exception: `--help` / `-h` should show the eval group help, not run's help.
const evalIdx = result.indexOf('eval');
Expand All @@ -106,12 +108,9 @@ export function preprocessArgv(argv: string[]): string[] {
const isTopLevel = !result.slice(0, evalIdx).some((arg) => TOP_LEVEL_COMMANDS.has(arg));
if (isTopLevel) {
const nextArg = result[evalIdx + 1];
if (
nextArg !== undefined &&
!EVAL_SUBCOMMANDS.has(nextArg) &&
nextArg !== '--help' &&
nextArg !== '-h'
) {
const isHelp = nextArg === '--help' || nextArg === '-h';
const isKnownSubcommand = nextArg !== undefined && EVAL_SUBCOMMANDS.has(nextArg);
if (!isHelp && !isKnownSubcommand) {
result.splice(evalIdx + 1, 0, 'run');
}
}
Expand Down
4 changes: 2 additions & 2 deletions apps/cli/test/unit/preprocess-argv.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ describe('preprocessArgv', () => {
expect(preprocessArgv(argv)).toEqual(argv);
});

it('does not insert `run` for bare eval', () => {
it('inserts `run` for bare eval so the run handler can launch the wizard', () => {
const argv = ['node', 'agentv', 'eval'];
expect(preprocessArgv(argv)).toEqual(argv);
expect(preprocessArgv(argv)).toEqual(['node', 'agentv', 'eval', 'run']);
});

it('inserts `run` when eval is followed by a flag', () => {
Expand Down
61 changes: 61 additions & 0 deletions apps/cli/test/unit/run-cache.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import { afterEach, describe, expect, it } from 'bun:test';
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import path from 'node:path';

import { resolveCachedRunDir } from '../../src/commands/eval/run-cache.js';

describe('resolveCachedRunDir', () => {
  // Scratch working directory of the current test; removed again in afterEach.
  let tmpCwd: string;

  afterEach(() => {
    if (!tmpCwd) return;
    rmSync(tmpCwd, { recursive: true, force: true });
  });

  /** Create a fresh temp cwd that already contains an empty `.agentv/` directory. */
  function setupCwd(): string {
    const prefix = path.join(tmpdir(), 'agentv-run-cache-test-');
    tmpCwd = mkdtempSync(prefix);
    mkdirSync(path.join(tmpCwd, '.agentv'), { recursive: true });
    return tmpCwd;
  }

  /** Write `.agentv/cache.json` under `cwd`; `lastRunDir` is omitted when not provided. */
  function writeCache(cwd: string, lastRunDir: string | undefined): void {
    const timestamp = '2026-01-01T00:00:00.000Z';
    const payload = lastRunDir ? { lastRunDir, timestamp } : { timestamp };
    const target = path.join(cwd, '.agentv', 'cache.json');
    writeFileSync(target, JSON.stringify(payload, null, 2));
  }

  it('returns the cached run dir when it exists on disk', async () => {
    const cwd = setupCwd();
    const runDir = path.join(cwd, '.agentv', 'results', 'runs', 'default', '2026-01-01');
    mkdirSync(runDir, { recursive: true });
    writeCache(cwd, runDir);

    const resolved = await resolveCachedRunDir(cwd);
    expect(resolved).toBe(runDir);
  });

  it('returns undefined when no cache file exists', async () => {
    const cwd = setupCwd();
    // Drop the whole .agentv directory so there is nothing to load.
    rmSync(path.join(cwd, '.agentv'), { recursive: true });

    const resolved = await resolveCachedRunDir(cwd);
    expect(resolved).toBeUndefined();
  });

  it('returns undefined when the cache lacks lastRunDir', async () => {
    const cwd = setupCwd();
    writeCache(cwd, undefined);

    const resolved = await resolveCachedRunDir(cwd);
    expect(resolved).toBeUndefined();
  });

  it('returns undefined when the cached dir has been deleted', async () => {
    const cwd = setupCwd();
    // The cache points at a run dir that is never created on disk.
    const staleDir = path.join(cwd, '.agentv', 'results', 'runs', 'default', '2026-01-01');
    writeCache(cwd, staleDir);

    const resolved = await resolveCachedRunDir(cwd);
    expect(resolved).toBeUndefined();
  });
});
26 changes: 23 additions & 3 deletions apps/web/src/content/docs/docs/evaluation/running-evals.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -241,15 +241,35 @@ Notes:
- `hooks.enabled: false` skips all lifecycle hooks (setup, teardown, reset).
- Pool slots are managed separately (`agentv workspace list|clean`).

### Retry Execution Errors
### Resume an Interrupted Run

Re-run only the tests that had infrastructure/execution errors from a previous output:
AgentV ships three flags for picking up a partial run. They differ in **which prior results are skipped** and in **how the prior run is located**; in all three modes the new results are merged with the prior run.

| Flag | What it skips | What it re-runs | Use when |
|------|---------------|-----------------|----------|
| `--resume` | Anything that finished without an `execution_error` (passes, fails, threshold misses) | Errors and missing cases | The run was interrupted (Ctrl-C, crash, OOM) and you just want it to finish |
| `--rerun-failed` | Only cases with `executionStatus === 'ok'` | Errors **and** test failures (assertion misses, threshold misses) | A grader change or model swap means you want to re-grade everything that wasn't already passing |
| `--retry-errors <path>` | Anything that completed without an `execution_error` (same set as `--resume`) | Errors and missing cases | You want to point at an arbitrary prior run/manifest by path, instead of resuming the run dir you're currently writing to |

`--resume` and `--rerun-failed` both append to the existing `index.jsonl`. When `--output <dir>` is given they target that directory; when omitted they default to the **last run dir for the current cwd**, recorded in `.agentv/cache.json` and updated after every eval. This matches promptfoo's `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default" convention. `--retry-errors` takes the prior run's path directly (a directory or an `index.jsonl`).

```bash
# Resume the last run — no args needed; AgentV finds it from .agentv/cache.json
agentv eval evals/my-eval.yaml --resume

# Or target a specific run dir explicitly
agentv eval evals/my-eval.yaml --output .agentv/results/runs/<timestamp> --resume

# Re-run errors AND failed cases against the last run dir
agentv eval evals/my-eval.yaml --rerun-failed

# Re-run only execution errors from any prior run by path
agentv eval evals/my-eval.yaml --retry-errors .agentv/results/runs/<timestamp>/index.jsonl
```

This reads the previous run manifest, filters for `executionStatus === 'execution_error'`, and re-runs only those test cases. Non-error results from the previous run are preserved and merged into the new output.
After any failing run, the CLI prints the exact `--rerun-failed` command for the run dir that just completed — copy/paste it.

The interactive wizard (`agentv eval` with no arguments) remembers the last run's artifact directory and surfaces a **"Resume last run"** entry in the main menu when that directory still contains an `index.jsonl`.

### Execution Error Tolerance

Expand Down
Loading