EntityProcess · christso · May 6, 2026 · May 6, 2026 · May 6, 2026
diff --git a/apps/cli/src/commands/eval/interactive.ts b/apps/cli/src/commands/eval/interactive.ts
@@ -1,3 +1,4 @@
+import { existsSync } from 'node:fs';
 import path from 'node:path';
 import { listTargetNames, readTargetDefinitions } from '@agentv/core';
 import { checkbox, confirm, number, search, select } from '@inquirer/prompts';
@@ -42,6 +43,22 @@ export async function launchInteractiveWizard(): Promise<void> {
     return;
   }
 
+  if (action === 'resume' && lastConfig?.outputDir) {
+    const relativeDir = path.relative(cwd, lastConfig.outputDir) || lastConfig.outputDir;
+    console.log(`\n${ANSI_DIM}Resuming run at ${relativeDir}...${ANSI_RESET}\n`);
+    await executeConfig(
+      {
+        evalPaths: lastConfig.evalPaths,
+        target: lastConfig.target,
+        workers: lastConfig.workers,
+        dryRun: lastConfig.dryRun,
+        cache: lastConfig.cache,
+      },
+      { resumeOutputDir: lastConfig.outputDir },
+    );
+    return;
+  }
+
   if (action === 'rerun' && lastConfig) {
     console.log(`\n${ANSI_DIM}Rerunning last configuration...${ANSI_RESET}\n`);
     await executeConfig({
@@ -66,26 +83,28 @@ export async function launchInteractiveWizard(): Promise<void> {
     return;
   }
 
-  // Save last config
-  await saveLastConfig({
-    timestamp: new Date().toISOString(),
-    cwd,
-    evalPaths: config.evalPaths,
-    target: config.target,
-    workers: config.workers,
-    dryRun: config.dryRun,
-    cache: config.cache,
-  });
-
   await executeConfig(config);
 }
 
 async function promptMainMenu(
   lastConfig: LastConfig | undefined,
-): Promise<'new' | 'rerun' | 'exit'> {
-  type MenuChoice = 'new' | 'rerun' | 'exit';
+): Promise<'new' | 'rerun' | 'resume' | 'exit'> {
+  type MenuChoice = 'new' | 'rerun' | 'resume' | 'exit';
   const choices: Array<{ name: string; value: MenuChoice; description?: string }> = [];
 
+  // Resume entry: only when the prior run has a known artifact dir with an index.jsonl
+  if (lastConfig?.outputDir) {
+    const indexPath = path.join(lastConfig.outputDir, 'index.jsonl');
+    if (existsSync(indexPath)) {
+      const dirLabel = path.basename(lastConfig.outputDir);
+      choices.push({
+        name: '⏯  Resume last run',
+        value: 'resume',
+        description: `${dirLabel} (target: ${lastConfig.target})`,
+      });
+    }
+  }
+
   if (lastConfig) {
     const evalCount = lastConfig.evalPaths.length;
     choices.push({
@@ -315,12 +334,17 @@ async function promptReviewAndConfirm(config: InteractiveConfig, cwd: string): P
   });
 }
 
-async function executeConfig(config: InteractiveConfig): Promise<void> {
+async function executeConfig(
+  config: InteractiveConfig,
+  opts?: { resumeOutputDir?: string },
+): Promise<void> {
+  const cwd = process.cwd();
   const rawOptions: Record<string, unknown> = {
     target: config.target,
     workers: config.workers,
     dryRun: config.dryRun,
     cache: config.cache,
+    ...(opts?.resumeOutputDir ? { output: opts.resumeOutputDir, resume: true } : {}),
     dryRunDelay: 0,
     dryRunDelayMin: 0,
     dryRunDelayMax: 0,
@@ -337,6 +361,22 @@ async function executeConfig(config: InteractiveConfig): Promise<void> {
     rawOptions,
   });
 
+  // Persist config with the resolved artifact dir so the wizard can offer
+  // "Resume last run" next time. Done only once the run returns a result (even
+  // with execution errors), so outputDir comes from that run's real outputPath.
+  if (result) {
+    await saveLastConfig({
+      timestamp: new Date().toISOString(),
+      cwd,
+      evalPaths: config.evalPaths,
+      target: config.target,
+      workers: config.workers,
+      dryRun: config.dryRun,
+      cache: config.cache,
+      outputDir: path.dirname(result.outputPath),
+    });
+  }
+
   // Prompt to retry errors when execution errors were detected in a TTY
   if (result && result.executionErrorCount > 0 && process.stdin.isTTY) {
     await promptRetryErrors(config, result.outputPath);

diff --git a/apps/cli/src/commands/eval/last-config.ts b/apps/cli/src/commands/eval/last-config.ts
@@ -13,6 +13,12 @@ export interface LastConfig {
   readonly workers: number;
   readonly dryRun: boolean;
   readonly cache: boolean;
+  /**
+   * Resolved artifact directory of the last completed wizard run. Used to
+   * power the wizard's "Resume last run" entry. Optional for backward
+   * compatibility with configs saved before this field existed.
+   */
+  readonly outputDir?: string;
 }
 
 export async function loadLastConfig(): Promise<LastConfig | undefined> {

diff --git a/apps/cli/src/commands/eval/run-cache.ts b/apps/cli/src/commands/eval/run-cache.ts
@@ -1,3 +1,4 @@
+import { existsSync } from 'node:fs';
 import { mkdir, readFile, writeFile } from 'node:fs/promises';
 import path from 'node:path';
 
@@ -44,6 +45,21 @@ export async function loadRunCache(cwd: string): Promise<RunCache | undefined> {
   }
 }
 
+/**
+ * Resolve the cached last-run directory for `cwd`, if it still exists on disk.
+ * Used by `--resume` / `--rerun-failed` to default `--output` to the most recent
+ * run, matching promptfoo (`--resume [evalId]`) and OpenCompass (`-r [timestamp]`).
+ * @param cwd - working directory whose run cache is read via `loadRunCache`.
+ * @returns `lastRunDir`, or undefined when there is no cache, the cache lacks a
+ * `lastRunDir`, or that path is gone (only existence is checked, not contents).
+ */
+export async function resolveCachedRunDir(cwd: string): Promise<string | undefined> {
+  const cache = await loadRunCache(cwd);
+  if (!cache?.lastRunDir) return undefined;
+  if (!existsSync(cache.lastRunDir)) return undefined;
+  return cache.lastRunDir;
+}
+
 export async function saveRunCache(cwd: string, resultPath: string): Promise<void> {
   if (path.basename(resultPath) !== RESULT_INDEX_FILENAME) {
     return;

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -50,7 +50,7 @@ import {
   loadFullyCompletedTestIds,
   loadNonErrorResults,
 } from './retry-errors.js';
-import { saveRunCache } from './run-cache.js';
+import { resolveCachedRunDir, saveRunCache } from './run-cache.js';
 import { findRepoRoot } from './shared.js';
 import {
   calculateEvaluationSummary,
@@ -1031,6 +1031,20 @@ export async function runEvalCommand(
     }
   }
 
+  // --resume / --rerun-failed without an explicit --output: default to the
+  // last-known run dir for this cwd from .agentv/cache.json. Matches promptfoo's
+  // `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default"
+  // convention. The cache pointer is written by saveRunCache after every eval.
+  if (options.resume && !options.retryErrors && !options.outputDir && !options.artifacts) {
+    const cachedDir = await resolveCachedRunDir(cwd);
+    if (cachedDir) {
+      options = { ...options, outputDir: cachedDir };
+      const flagLabel = options.rerunFailed ? 'rerun-failed' : 'resume';
+      const displayDir = path.relative(cwd, cachedDir) || cachedDir;
+      console.log(`Auto-detected last run dir for --${flagLabel}: ${displayDir}`);
+    }
+  }
+
   // --resume / --rerun-failed: skip already-completed tests and append to existing output.
   // IMPORTANT: JSONL must be loaded before the output writer is created (same file).
   let resumeSkipKeys: Set<string> | undefined;
@@ -1059,7 +1073,7 @@ export async function runEvalCommand(
       }
     } else {
       console.warn(
-        'Warning: --resume requires --output <dir> to identify the run directory. Ignoring --resume.',
+        'Warning: --resume requires --output <dir> (or a cached last run) to identify the run directory. Ignoring --resume.',
       );
     }
   }

diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts
@@ -94,9 +94,11 @@ export function preprocessArgv(argv: string[]): string[] {
     }
   }
 
-  // Implicit `run` subcommand: `agentv eval <arg>` → `agentv eval run <arg>`
-  // when the first arg after `eval` is not a known eval subcommand.
-  // This preserves backward compatibility now that `eval` is a subcommands group.
+  // Implicit `run` subcommand: `agentv eval [<arg>]` → `agentv eval run [<arg>]`
+  // when the first arg after `eval` is absent or is not a known eval subcommand.
+  // Backward-compat: `eval` used to be a direct command; now it is a subcommands group.
+  // Bare `agentv eval` falls through to the run handler so its TTY check can launch
+  // the interactive wizard.
   // Only applies when `eval` is the top-level subcommand.
   // Exception: `--help` / `-h` should show the eval group help, not run's help.
   const evalIdx = result.indexOf('eval');
@@ -106,12 +108,9 @@ export function preprocessArgv(argv: string[]): string[] {
     const isTopLevel = !result.slice(0, evalIdx).some((arg) => TOP_LEVEL_COMMANDS.has(arg));
     if (isTopLevel) {
       const nextArg = result[evalIdx + 1];
-      if (
-        nextArg !== undefined &&
-        !EVAL_SUBCOMMANDS.has(nextArg) &&
-        nextArg !== '--help' &&
-        nextArg !== '-h'
-      ) {
+      const isHelp = nextArg === '--help' || nextArg === '-h';
+      const isKnownSubcommand = nextArg !== undefined && EVAL_SUBCOMMANDS.has(nextArg);
+      if (!isHelp && !isKnownSubcommand) {
         result.splice(evalIdx + 1, 0, 'run');
       }
     }

diff --git a/apps/cli/test/unit/preprocess-argv.test.ts b/apps/cli/test/unit/preprocess-argv.test.ts
@@ -31,9 +31,9 @@ describe('preprocessArgv', () => {
       expect(preprocessArgv(argv)).toEqual(argv);
     });
 
-    it('does not insert `run` for bare eval', () => {
+    it('inserts `run` for bare eval so the run handler can launch the wizard', () => {
       const argv = ['node', 'agentv', 'eval'];
-      expect(preprocessArgv(argv)).toEqual(argv);
+      expect(preprocessArgv(argv)).toEqual(['node', 'agentv', 'eval', 'run']);
     });
 
     it('inserts `run` when eval is followed by a flag', () => {

diff --git a/apps/cli/test/unit/run-cache.test.ts b/apps/cli/test/unit/run-cache.test.ts
@@ -0,0 +1,61 @@
+import { afterEach, describe, expect, it } from 'bun:test';
+import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import path from 'node:path';
+
+import { resolveCachedRunDir } from '../../src/commands/eval/run-cache.js';
+
+describe('resolveCachedRunDir', () => { // one test per return path, each in a throwaway cwd
+  let tmpCwd: string; // assigned by setupCwd(); removed again in afterEach
+
+  afterEach(() => {
+    if (tmpCwd) {
+      rmSync(tmpCwd, { recursive: true, force: true });
+    }
+  });
+
+  function setupCwd(): string {
+    tmpCwd = mkdtempSync(path.join(tmpdir(), 'agentv-run-cache-test-'));
+    mkdirSync(path.join(tmpCwd, '.agentv'), { recursive: true }); // directory that holds cache.json
+    return tmpCwd;
+  }
+
+  function writeCache(cwd: string, lastRunDir: string | undefined): void {
+    const cachePath = path.join(cwd, '.agentv', 'cache.json');
+    const cache = lastRunDir
+      ? { lastRunDir, timestamp: '2026-01-01T00:00:00.000Z' }
+      : { timestamp: '2026-01-01T00:00:00.000Z' }; // cache entry without a lastRunDir key
+    writeFileSync(cachePath, JSON.stringify(cache, null, 2));
+  }
+
+  it('returns the cached run dir when it exists on disk', async () => {
+    const cwd = setupCwd();
+    const runDir = path.join(cwd, '.agentv', 'results', 'runs', 'default', '2026-01-01');
+    mkdirSync(runDir, { recursive: true }); // the run dir must really exist for this case
+    writeCache(cwd, runDir);
+
+    expect(await resolveCachedRunDir(cwd)).toBe(runDir);
+  });
+
+  it('returns undefined when no cache file exists', async () => {
+    const cwd = setupCwd();
+    rmSync(path.join(cwd, '.agentv'), { recursive: true }); // drop .agentv so there is no cache.json
+
+    expect(await resolveCachedRunDir(cwd)).toBeUndefined();
+  });
+
+  it('returns undefined when the cache lacks lastRunDir', async () => {
+    const cwd = setupCwd();
+    writeCache(cwd, undefined);
+
+    expect(await resolveCachedRunDir(cwd)).toBeUndefined();
+  });
+
+  it('returns undefined when the cached dir has been deleted', async () => {
+    const cwd = setupCwd();
+    const staleDir = path.join(cwd, '.agentv', 'results', 'runs', 'default', '2026-01-01');
+    writeCache(cwd, staleDir); // staleDir is never created, so the cache points at a missing path
+
+    expect(await resolveCachedRunDir(cwd)).toBeUndefined();
+  });
+});
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -241,15 +241,35 @@ Notes:
 - `hooks.enabled: false` skips all lifecycle hooks (setup, teardown, reset).
 - Pool slots are managed separately (`agentv workspace list|clean`).
 
-### Retry Execution Errors
+### Resume an Interrupted Run
 
-Re-run only the tests that had infrastructure/execution errors from a previous output:
+AgentV ships three flags for picking up a partial run. They differ in **which prior results are skipped** and in **how the prior run is located**; in all three modes the new results are merged with the prior run.
+
+| Flag | What it skips | What it re-runs | Use when |
+|------|---------------|-----------------|----------|
+| `--resume` | Anything that finished without an `execution_error` (passes, fails, threshold misses) | Errors and missing cases | The run was interrupted (Ctrl-C, crash, OOM) and you just want it to finish |
+| `--rerun-failed` | Only cases with `executionStatus === 'ok'` | Errors **and** test failures (assertion misses, threshold misses) | A grader change or model swap means you want to re-grade everything that wasn't already passing |
+| `--retry-errors <path>` | Anything that completed without an `execution_error` (same set as `--resume`) | Errors and missing cases | You want to point at an arbitrary prior run/manifest by path, instead of resuming the run dir you're currently writing to |
+
+`--resume` and `--rerun-failed` both append to the existing `index.jsonl`. When `--output <dir>` is given they target that directory; when omitted they default to the **last run dir for the current cwd**, recorded in `.agentv/cache.json` and updated after every eval. This matches promptfoo's `--resume [evalId]` and OpenCompass's `-r [timestamp]` "latest by default" convention. `--retry-errors` takes the prior run's path directly (a directory or an `index.jsonl`).
 
 ```bash
+# Resume the last run — no --output needed; AgentV finds the run dir from .agentv/cache.json
+agentv eval evals/my-eval.yaml --resume
+
+# Or target a specific run dir explicitly
+agentv eval evals/my-eval.yaml --output .agentv/results/runs/<timestamp> --resume
+
+# Re-run errors AND failed cases against the last run dir
+agentv eval evals/my-eval.yaml --rerun-failed
+
+# Re-run only execution errors from any prior run by path
 agentv eval evals/my-eval.yaml --retry-errors .agentv/results/runs/<timestamp>/index.jsonl
 ```
 
-This reads the previous run manifest, filters for `executionStatus === 'execution_error'`, and re-runs only those test cases. Non-error results from the previous run are preserved and merged into the new output.
+After any failing run, the CLI prints the exact `--rerun-failed` command for the run dir that just completed — copy/paste it.
+
+The interactive wizard (`agentv eval` with no arguments) remembers the last run's artifact directory and surfaces a **"Resume last run"** entry in the main menu when one exists.
 
 ### Execution Error Tolerance