diff --git a/apps/cli/src/commands/eval/interactive.ts b/apps/cli/src/commands/eval/interactive.ts index 41720d76..dad5efa9 100644 --- a/apps/cli/src/commands/eval/interactive.ts +++ b/apps/cli/src/commands/eval/interactive.ts @@ -1,3 +1,4 @@ +import { existsSync } from 'node:fs'; import path from 'node:path'; import { listTargetNames, readTargetDefinitions } from '@agentv/core'; import { checkbox, confirm, number, search, select } from '@inquirer/prompts'; @@ -42,6 +43,22 @@ export async function launchInteractiveWizard(): Promise { return; } + if (action === 'resume' && lastConfig?.outputDir) { + const relativeDir = path.relative(cwd, lastConfig.outputDir) || lastConfig.outputDir; + console.log(`\n${ANSI_DIM}Resuming run at ${relativeDir}...${ANSI_RESET}\n`); + await executeConfig( + { + evalPaths: lastConfig.evalPaths, + target: lastConfig.target, + workers: lastConfig.workers, + dryRun: lastConfig.dryRun, + cache: lastConfig.cache, + }, + { resumeOutputDir: lastConfig.outputDir }, + ); + return; + } + if (action === 'rerun' && lastConfig) { console.log(`\n${ANSI_DIM}Rerunning last configuration...${ANSI_RESET}\n`); await executeConfig({ @@ -66,26 +83,28 @@ export async function launchInteractiveWizard(): Promise { return; } - // Save last config - await saveLastConfig({ - timestamp: new Date().toISOString(), - cwd, - evalPaths: config.evalPaths, - target: config.target, - workers: config.workers, - dryRun: config.dryRun, - cache: config.cache, - }); - await executeConfig(config); } async function promptMainMenu( lastConfig: LastConfig | undefined, -): Promise<'new' | 'rerun' | 'exit'> { - type MenuChoice = 'new' | 'rerun' | 'exit'; +): Promise<'new' | 'rerun' | 'resume' | 'exit'> { + type MenuChoice = 'new' | 'rerun' | 'resume' | 'exit'; const choices: Array<{ name: string; value: MenuChoice; description?: string }> = []; + // Resume entry: only when the prior run has a known artifact dir with an index.jsonl + if (lastConfig?.outputDir) { + const indexPath = 
path.join(lastConfig.outputDir, 'index.jsonl'); + if (existsSync(indexPath)) { + const dirLabel = path.basename(lastConfig.outputDir); + choices.push({ + name: '⏯ Resume last run', + value: 'resume', + description: `${dirLabel} (target: ${lastConfig.target})`, + }); + } + } + if (lastConfig) { const evalCount = lastConfig.evalPaths.length; choices.push({ @@ -315,12 +334,17 @@ async function promptReviewAndConfirm(config: InteractiveConfig, cwd: string): P }); } -async function executeConfig(config: InteractiveConfig): Promise { +async function executeConfig( + config: InteractiveConfig, + opts?: { resumeOutputDir?: string }, +): Promise { + const cwd = process.cwd(); const rawOptions: Record = { target: config.target, workers: config.workers, dryRun: config.dryRun, cache: config.cache, + ...(opts?.resumeOutputDir ? { output: opts.resumeOutputDir, resume: true } : {}), dryRunDelay: 0, dryRunDelayMin: 0, dryRunDelayMax: 0, @@ -337,6 +361,22 @@ async function executeConfig(config: InteractiveConfig): Promise { rawOptions, }); + // Persist config with the resolved artifact dir so the wizard can offer + // "Resume last run" on the next invocation. Done after a successful run so + // the saved outputDir always points at a real index.jsonl. 
+ if (result) { + await saveLastConfig({ + timestamp: new Date().toISOString(), + cwd, + evalPaths: config.evalPaths, + target: config.target, + workers: config.workers, + dryRun: config.dryRun, + cache: config.cache, + outputDir: path.dirname(result.outputPath), + }); + } + // Prompt to retry errors when execution errors were detected in a TTY if (result && result.executionErrorCount > 0 && process.stdin.isTTY) { await promptRetryErrors(config, result.outputPath); diff --git a/apps/cli/src/commands/eval/last-config.ts b/apps/cli/src/commands/eval/last-config.ts index 44402a4e..fca7a8f1 100644 --- a/apps/cli/src/commands/eval/last-config.ts +++ b/apps/cli/src/commands/eval/last-config.ts @@ -13,6 +13,12 @@ export interface LastConfig { readonly workers: number; readonly dryRun: boolean; readonly cache: boolean; + /** + * Resolved artifact directory of the last completed wizard run. Used to + * power the wizard's "Resume last run" entry. Optional for backward + * compatibility with configs saved before this field existed. + */ + readonly outputDir?: string; } export async function loadLastConfig(): Promise { diff --git a/apps/cli/src/commands/eval/run-cache.ts b/apps/cli/src/commands/eval/run-cache.ts index 50c9e782..36f302da 100644 --- a/apps/cli/src/commands/eval/run-cache.ts +++ b/apps/cli/src/commands/eval/run-cache.ts @@ -1,3 +1,4 @@ +import { existsSync } from 'node:fs'; import { mkdir, readFile, writeFile } from 'node:fs/promises'; import path from 'node:path'; @@ -44,6 +45,21 @@ export async function loadRunCache(cwd: string): Promise { } } +/** + * Resolve the cached last-run directory for a cwd, if it still exists on disk. + * Returns undefined when there is no cache, the cache lacks a `lastRunDir`, + * or the directory has since been deleted. 
Used by `--resume` / `--rerun-failed` + * to default `--output` to the most recent run when no explicit dir is given, + * matching the convention used by promptfoo (`--resume [evalId]`) and + * OpenCompass (`-r [timestamp]`). + */ +export async function resolveCachedRunDir(cwd: string): Promise { + const cache = await loadRunCache(cwd); + if (!cache?.lastRunDir) return undefined; + if (!existsSync(cache.lastRunDir)) return undefined; + return cache.lastRunDir; +} + export async function saveRunCache(cwd: string, resultPath: string): Promise { if (path.basename(resultPath) !== RESULT_INDEX_FILENAME) { return; diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 552db330..db0b0fb1 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -50,7 +50,7 @@ import { loadFullyCompletedTestIds, loadNonErrorResults, } from './retry-errors.js'; -import { saveRunCache } from './run-cache.js'; +import { resolveCachedRunDir, saveRunCache } from './run-cache.js'; import { findRepoRoot } from './shared.js'; import { calculateEvaluationSummary, @@ -1031,6 +1031,20 @@ export async function runEvalCommand( } } + // --resume / --rerun-failed without an explicit --output: default to the + // last-known run dir for this cwd from .agentv/cache.json. Matches promptfoo's + // `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default" + // convention. The cache pointer is written by saveRunCache after every eval. + if (options.resume && !options.retryErrors && !options.outputDir && !options.artifacts) { + const cachedDir = await resolveCachedRunDir(cwd); + if (cachedDir) { + options = { ...options, outputDir: cachedDir }; + const flagLabel = options.rerunFailed ? 
'rerun-failed' : 'resume'; + const displayDir = path.relative(cwd, cachedDir) || cachedDir; + console.log(`Auto-detected last run dir for --${flagLabel}: ${displayDir}`); + } + } + // --resume / --rerun-failed: skip already-completed tests and append to existing output. // IMPORTANT: JSONL must be loaded before the output writer is created (same file). let resumeSkipKeys: Set | undefined; @@ -1059,7 +1073,7 @@ export async function runEvalCommand( } } else { console.warn( - 'Warning: --resume requires --output to identify the run directory. Ignoring --resume.', + 'Warning: --resume requires --output (or a cached last run) to identify the run directory. Ignoring --resume.', ); } } diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index a100a1b8..93d4156a 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -94,9 +94,11 @@ export function preprocessArgv(argv: string[]): string[] { } } - // Implicit `run` subcommand: `agentv eval ` → `agentv eval run ` - // when the first arg after `eval` is not a known eval subcommand. - // This preserves backward compatibility now that `eval` is a subcommands group. + // Implicit `run` subcommand: `agentv eval []` → `agentv eval run []` + // when the first arg after `eval` is absent or is not a known eval subcommand. + // Backward-compat: `eval` used to be a direct command; now it is a subcommands group. + // Bare `agentv eval` falls through to the run handler so its TTY check can launch + // the interactive wizard. // Only applies when `eval` is the top-level subcommand. // Exception: `--help` / `-h` should show the eval group help, not run's help. 
const evalIdx = result.indexOf('eval'); @@ -106,12 +108,9 @@ export function preprocessArgv(argv: string[]): string[] { const isTopLevel = !result.slice(0, evalIdx).some((arg) => TOP_LEVEL_COMMANDS.has(arg)); if (isTopLevel) { const nextArg = result[evalIdx + 1]; - if ( - nextArg !== undefined && - !EVAL_SUBCOMMANDS.has(nextArg) && - nextArg !== '--help' && - nextArg !== '-h' - ) { + const isHelp = nextArg === '--help' || nextArg === '-h'; + const isKnownSubcommand = nextArg !== undefined && EVAL_SUBCOMMANDS.has(nextArg); + if (!isHelp && !isKnownSubcommand) { result.splice(evalIdx + 1, 0, 'run'); } } diff --git a/apps/cli/test/unit/preprocess-argv.test.ts b/apps/cli/test/unit/preprocess-argv.test.ts index d91c31bc..cf70249a 100644 --- a/apps/cli/test/unit/preprocess-argv.test.ts +++ b/apps/cli/test/unit/preprocess-argv.test.ts @@ -31,9 +31,9 @@ describe('preprocessArgv', () => { expect(preprocessArgv(argv)).toEqual(argv); }); - it('does not insert `run` for bare eval', () => { + it('inserts `run` for bare eval so the run handler can launch the wizard', () => { const argv = ['node', 'agentv', 'eval']; - expect(preprocessArgv(argv)).toEqual(argv); + expect(preprocessArgv(argv)).toEqual(['node', 'agentv', 'eval', 'run']); }); it('inserts `run` when eval is followed by a flag', () => { diff --git a/apps/cli/test/unit/run-cache.test.ts b/apps/cli/test/unit/run-cache.test.ts new file mode 100644 index 00000000..11496d05 --- /dev/null +++ b/apps/cli/test/unit/run-cache.test.ts @@ -0,0 +1,61 @@ +import { afterEach, describe, expect, it } from 'bun:test'; +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; + +import { resolveCachedRunDir } from '../../src/commands/eval/run-cache.js'; + +describe('resolveCachedRunDir', () => { + let tmpCwd: string; + + afterEach(() => { + if (tmpCwd) { + rmSync(tmpCwd, { recursive: true, force: true }); + } + }); + + function setupCwd(): string { + 
tmpCwd = mkdtempSync(path.join(tmpdir(), 'agentv-run-cache-test-')); + mkdirSync(path.join(tmpCwd, '.agentv'), { recursive: true }); + return tmpCwd; + } + + function writeCache(cwd: string, lastRunDir: string | undefined): void { + const cachePath = path.join(cwd, '.agentv', 'cache.json'); + const cache = lastRunDir + ? { lastRunDir, timestamp: '2026-01-01T00:00:00.000Z' } + : { timestamp: '2026-01-01T00:00:00.000Z' }; + writeFileSync(cachePath, JSON.stringify(cache, null, 2)); + } + + it('returns the cached run dir when it exists on disk', async () => { + const cwd = setupCwd(); + const runDir = path.join(cwd, '.agentv', 'results', 'runs', 'default', '2026-01-01'); + mkdirSync(runDir, { recursive: true }); + writeCache(cwd, runDir); + + expect(await resolveCachedRunDir(cwd)).toBe(runDir); + }); + + it('returns undefined when no cache file exists', async () => { + const cwd = setupCwd(); + rmSync(path.join(cwd, '.agentv'), { recursive: true }); + + expect(await resolveCachedRunDir(cwd)).toBeUndefined(); + }); + + it('returns undefined when the cache lacks lastRunDir', async () => { + const cwd = setupCwd(); + writeCache(cwd, undefined); + + expect(await resolveCachedRunDir(cwd)).toBeUndefined(); + }); + + it('returns undefined when the cached dir has been deleted', async () => { + const cwd = setupCwd(); + const staleDir = path.join(cwd, '.agentv', 'results', 'runs', 'default', '2026-01-01'); + writeCache(cwd, staleDir); + + expect(await resolveCachedRunDir(cwd)).toBeUndefined(); + }); +}); diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index 49a8947c..f0bc287b 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -241,15 +241,35 @@ Notes: - `hooks.enabled: false` skips all lifecycle hooks (setup, teardown, reset). - Pool slots are managed separately (`agentv workspace list|clean`). 
-### Retry Execution Errors +### Resume an Interrupted Run -Re-run only the tests that had infrastructure/execution errors from a previous output: +AgentV ships three flags for picking up a partial run. They differ only in **which prior results are skipped**; in all three modes the new results are merged with the prior run. + +| Flag | What it skips | What it re-runs | Use when | +|------|---------------|-----------------|----------| +| `--resume` | Anything that finished without an `execution_error` (passes, fails, threshold misses) | Errors and missing cases | The run was interrupted (Ctrl-C, crash, OOM) and you just want it to finish | +| `--rerun-failed` | Only cases with `executionStatus === 'ok'` | Errors **and** test failures (assertion misses, threshold misses) | A grader change or model swap means you want to re-grade everything that wasn't already passing | +| `--retry-errors <path>` | Anything that completed without an `execution_error` (same set as `--resume`) | Errors and missing cases | You want to point at an arbitrary prior run/manifest by path, instead of resuming the run dir you're currently writing to | + +`--resume` and `--rerun-failed` both append to the existing `index.jsonl`. When `--output <dir>` is given they target that directory; when omitted they default to the **last run dir for the current cwd**, recorded in `.agentv/cache.json` and updated after every eval. This matches promptfoo's `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default" convention. `--retry-errors` takes the prior run's path directly (a directory or an `index.jsonl`).
```bash +# Resume the last run — no args needed; AgentV finds it from .agentv/cache.json +agentv eval evals/my-eval.yaml --resume + +# Or target a specific run dir explicitly +agentv eval evals/my-eval.yaml --output .agentv/results/runs/<timestamp> --resume + +# Re-run errors AND failed cases against the last run dir +agentv eval evals/my-eval.yaml --rerun-failed + +# Re-run only execution errors from any prior run by path agentv eval evals/my-eval.yaml --retry-errors .agentv/results/runs/<timestamp>/index.jsonl ``` -This reads the previous run manifest, filters for `executionStatus === 'execution_error'`, and re-runs only those test cases. Non-error results from the previous run are preserved and merged into the new output. +After any failing run, the CLI prints the exact `--rerun-failed` command for the run dir that just completed — copy/paste it. + +The interactive wizard (`agentv eval` with no arguments) remembers the last run's artifact directory and surfaces a **"Resume last run"** entry in the main menu when one exists. ### Execution Error Tolerance