diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index c7844ccf..c5cf1b26 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -10,6 +10,47 @@ import { import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; import { RESULT_INDEX_FILENAME } from './result-layout.js'; +export function buildTestTargetKey(testId?: string, target?: string): string { + return `${testId ?? 'unknown'}::${target ?? 'unknown'}`; +} + +// Deduplication helper — keeps the last entry per (test_id, target) pair. +export function deduplicateByTestIdTarget(results: readonly EvaluationResult[]): EvaluationResult[] { + const seen = new Map<string, number>(); + for (let i = 0; i < results.length; i++) { + seen.set(buildTestTargetKey(results[i].testId, results[i].target), i); + } + const deduped: EvaluationResult[] = []; + for (let i = 0; i < results.length; i++) { + const key = buildTestTargetKey(results[i].testId, results[i].target); + if (seen.get(key) === i) { + deduped.push(results[i]); + } + } + return deduped; +} + +export async function aggregateRunDir( + runDir: string, + options?: { evalFile?: string; experiment?: string }, +): Promise<{ benchmarkPath: string; timingPath: string; testCount: number; targetCount: number }> { + const indexPath = path.join(runDir, RESULT_INDEX_FILENAME); + const content = await readFile(indexPath, 'utf8'); + const allResults = parseJsonlResults(content); + const results = deduplicateByTestIdTarget(allResults); + + const timing = buildTimingArtifact(results); + const timingPath = path.join(runDir, 'timing.json'); + await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8'); + + const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment); + const benchmarkPath = path.join(runDir, 'benchmark.json'); + await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8'); + + const targetSet = 
new Set(results.map((r) => r.target ?? 'unknown')); + return { benchmarkPath, timingPath, testCount: results.length, targetCount: targetSet.size }; +} + // --------------------------------------------------------------------------- // Artifact interfaces (snake_case to match skill-creator conventions) // --------------------------------------------------------------------------- @@ -739,6 +780,45 @@ function buildTranscriptMessageLines(results: readonly EvaluationResult[]): stri return lines.length > 0 ? `${lines.join('\n')}\n` : ''; } +export async function writePerTestArtifacts( + results: readonly EvaluationResult[], + outputDir: string, + options?: { experiment?: string }, +): Promise<void> { + await mkdir(outputDir, { recursive: true }); + for (const result of results) { + const grading = buildGradingArtifact(result); + const timing = buildTimingArtifact([result]); + const artifactSubdir = buildArtifactSubdir(result); + const testDir = path.join(outputDir, artifactSubdir); + await mkdir(testDir, { recursive: true }); + await writeFile( + path.join(testDir, 'grading.json'), + `${JSON.stringify(grading, null, 2)}\n`, + 'utf8', + ); + await writeFile( + path.join(testDir, 'timing.json'), + `${JSON.stringify(timing, null, 2)}\n`, + 'utf8', + ); + + const input = extractInput(result); + if (input) { + await writeFile(path.join(testDir, 'input.md'), input, 'utf8'); + } + if (result.output && result.output.length > 0) { + const outputsDir = path.join(testDir, 'outputs'); + await mkdir(outputsDir, { recursive: true }); + await writeFile( + path.join(outputsDir, 'response.md'), + formatOutputMarkdown(result.output), + 'utf8', + ); + } + } +} + +export async function writeArtifactsFromResults( + results: readonly EvaluationResult[], + outputDir: string, diff --git a/apps/cli/src/commands/eval/commands/aggregate.ts b/apps/cli/src/commands/eval/commands/aggregate.ts new file mode 100644 index 00000000..7483b841 --- /dev/null +++ b/apps/cli/src/commands/eval/commands/aggregate.ts @@ 
-0,0 +1,24 @@ +import path from 'node:path'; +import { command, positional, string } from 'cmd-ts'; + +import { aggregateRunDir } from '../artifact-writer.js'; + +export const evalAggregateCommand = command({ + name: 'aggregate', + description: + 'Recompute benchmark.json and timing.json from a run directory. Deduplicates by (test_id, target), keeping the last entry.', + args: { + runDir: positional({ + type: string, + displayName: 'run-dir', + description: 'Path to a run directory containing index.jsonl', + }), + }, + handler: async (args) => { + const runDir = path.resolve(args.runDir); + const { benchmarkPath, timingPath, testCount, targetCount } = await aggregateRunDir(runDir); + console.log(`Aggregated ${testCount} test result(s) across ${targetCount} target(s)`); + console.log(` Benchmark: ${benchmarkPath}`); + console.log(` Timing: ${timingPath}`); + }, +}); diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index d8c2722e..18668aa5 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -160,6 +160,16 @@ export const evalRunCommand = command({ description: 'Path to a previous run workspace or index.jsonl manifest — re-run only execution_error test cases', }), + resume: flag({ + long: 'resume', + description: + 'Resume an interrupted run: skip already-completed tests and append new results to --output dir', + }), + rerunFailed: flag({ + long: 'rerun-failed', + description: + 'Rerun failed/errored tests while keeping passing results. 
Implies --resume semantics', + }), strict: flag({ long: 'strict', description: 'Exit with error on version mismatch (instead of warning)', @@ -254,6 +264,8 @@ export const evalRunCommand = command({ otelCaptureContent: args.otelCaptureContent, otelGroupTurns: args.otelGroupTurns, retryErrors: args.retryErrors, + resume: args.resume, + rerunFailed: args.rerunFailed, strict: args.strict, benchmarkJson: args.benchmarkJson, artifacts: args.artifacts, diff --git a/apps/cli/src/commands/eval/index.ts b/apps/cli/src/commands/eval/index.ts index 305590d1..d40b92ca 100644 --- a/apps/cli/src/commands/eval/index.ts +++ b/apps/cli/src/commands/eval/index.ts @@ -1,5 +1,6 @@ import { subcommands } from 'cmd-ts'; +import { evalAggregateCommand } from './commands/aggregate.js'; import { evalAssertCommand } from './commands/assert.js'; import { evalRunCommand } from './commands/run.js'; @@ -9,5 +10,6 @@ export const evalCommand = subcommands({ cmds: { run: evalRunCommand, assert: evalAssertCommand, + aggregate: evalAggregateCommand, }, }); diff --git a/apps/cli/src/commands/eval/jsonl-writer.ts b/apps/cli/src/commands/eval/jsonl-writer.ts index dc1c7d41..827bb964 100644 --- a/apps/cli/src/commands/eval/jsonl-writer.ts +++ b/apps/cli/src/commands/eval/jsonl-writer.ts @@ -15,9 +15,10 @@ export class JsonlWriter { this.stream = stream; } - static async open(filePath: string): Promise<JsonlWriter> { + static async open(filePath: string, options?: { append?: boolean }): Promise<JsonlWriter> { await mkdir(path.dirname(filePath), { recursive: true }); - const stream = createWriteStream(filePath, { flags: 'w', encoding: 'utf8' }); + const flags = options?.append ? 
'a' : 'w'; + const stream = createWriteStream(filePath, { flags, encoding: 'utf8' }); return new JsonlWriter(stream); } diff --git a/apps/cli/src/commands/eval/output-writer.ts b/apps/cli/src/commands/eval/output-writer.ts index e4d2cebd..f61a70f0 100644 --- a/apps/cli/src/commands/eval/output-writer.ts +++ b/apps/cli/src/commands/eval/output-writer.ts @@ -22,10 +22,11 @@ export interface WriterOptions { export async function createOutputWriter( filePath: string, format: OutputFormat, + options?: { append?: boolean }, ): Promise<OutputWriter> { switch (format) { case 'jsonl': - return JsonlWriter.open(filePath); + return JsonlWriter.open(filePath, { append: options?.append }); case 'yaml': return YamlWriter.open(filePath); case 'html': diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index e9f106c7..b1737f19 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -1,5 +1,5 @@ -import { constants, mkdirSync } from 'node:fs'; -import { access } from 'node:fs/promises'; +import { constants, existsSync, mkdirSync } from 'node:fs'; +import { access, readFile } from 'node:fs/promises'; import path from 'node:path'; import { pathToFileURL } from 'node:url'; @@ -30,7 +30,13 @@ import { import { enforceRequiredVersion } from '../../version-check.js'; import { maybeAutoExportRunArtifacts } from '../results/remote.js'; -import { writeArtifactsFromResults } from './artifact-writer.js'; +import { + aggregateRunDir, + buildTestTargetKey, + deduplicateByTestIdTarget, + parseJsonlResults, + writeArtifactsFromResults, +} from './artifact-writer.js'; import { writeBenchmarkJson } from './benchmark-writer.js'; import { loadEnvFromHierarchy } from './env.js'; import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js'; @@ -53,6 +59,16 @@ import { type TargetSelection, selectMultipleTargets, selectTarget } from './tar const DEFAULT_WORKERS = 3; +function 
shouldSkipExistingResultForResume( + result: Pick<EvaluationResult, 'executionStatus'>, + rerunFailed: boolean, +): boolean { + if (rerunFailed) { + return result.executionStatus === 'ok'; + } + return result.executionStatus !== 'execution_error'; +} + interface RunEvalCommandInput { readonly testFiles: readonly string[]; readonly rawOptions: Record<string, unknown>; @@ -85,6 +101,8 @@ interface NormalizedOptions { readonly otelCaptureContent: boolean; readonly otelGroupTurns: boolean; readonly retryErrors?: string; + readonly resume: boolean; + readonly rerunFailed: boolean; readonly workspaceMode?: 'pooled' | 'temp' | 'static'; readonly workspacePath?: string; readonly keepWorkspaces: boolean; @@ -356,6 +374,8 @@ function normalizeOptions( otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns) || yamlExecution?.otel_group_turns === true, retryErrors: normalizeString(rawOptions.retryErrors), + resume: normalizeBoolean(rawOptions.resume) || normalizeBoolean(rawOptions.rerunFailed), + rerunFailed: normalizeBoolean(rawOptions.rerunFailed), workspaceMode, workspacePath, // Precedence: CLI > YAML config > TS config @@ -946,6 +966,39 @@ export async function runEvalCommand( } } + // --resume / --rerun-failed: skip already-completed tests and append to existing output. + // IMPORTANT: JSONL must be loaded before the output writer is created (same file). + let resumeSkipKeys: Set<string> | undefined; + let isResumeAppend = false; + if (options.resume && !options.retryErrors) { + const explicitResumeDir = options.outputDir ?? 
options.artifacts; + if (explicitResumeDir) { + const resumeIndexPath = path.join(path.resolve(explicitResumeDir), 'index.jsonl'); + if (existsSync(resumeIndexPath)) { + const content = await readFile(resumeIndexPath, 'utf8'); + const existingResults = parseJsonlResults(content); + resumeSkipKeys = new Set<string>(); + for (const r of existingResults) { + if (shouldSkipExistingResultForResume(r, options.rerunFailed)) { + resumeSkipKeys.add(buildTestTargetKey(r.testId, r.target)); + } + } + isResumeAppend = true; + const modeLabel = options.rerunFailed ? 'Rerun-failed' : 'Resume'; + console.log( + `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${resumeSkipKeys.size} completed.`, + ); + } else { + // No existing index.jsonl — behave like a normal run + console.log('Resume: no existing index.jsonl found, starting fresh run.'); + } + } else { + console.warn( + 'Warning: --resume requires --output to identify the run directory. Ignoring --resume.', + ); + } + } + // Validate static workspace path exists and is a directory if (options.workspacePath) { const resolvedWorkspace = path.resolve(options.workspacePath); @@ -1203,13 +1256,17 @@ export async function runEvalCommand( // Additional --export paths get their own writers that receive all results after the run. const writerOptions = resolvedThreshold !== undefined ? 
{ threshold: resolvedThreshold } : undefined; - const outputWriter: OutputWriter = await createOutputWriter(primaryWritePath, 'jsonl'); + const outputWriter: OutputWriter = await createOutputWriter(primaryWritePath, 'jsonl', { + append: isResumeAppend, + }); // Detect matrix mode: multiple targets for any file const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1); // In matrix mode, total eval count is tests × targets (accounting for per-test target overrides) + // When resuming, subtract tests that will be skipped let totalEvalCount = 0; + let resumeSkippedCount = 0; for (const meta of fileMetadata.values()) { const suiteTargetNames = meta.selections.map((s) => s.selection.targetName); for (const test of meta.testCases) { @@ -1218,7 +1275,15 @@ export async function runEvalCommand( test.targets && test.targets.length > 0 ? test.targets.filter((t) => suiteTargetNames.includes(t)) : suiteTargetNames; - totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1; + const effectiveTargets = testTargetNames.length > 0 ? testTargetNames : ['unknown']; + for (const tn of effectiveTargets) { + const key = `${test.id}::${tn}`; + if (resumeSkipKeys?.has(key)) { + resumeSkippedCount++; + } else { + totalEvalCount++; + } + } } } @@ -1228,6 +1293,11 @@ export async function runEvalCommand( console.log('No execution errors or missing cases in the previous run. Nothing to retry.'); return; } + // When using --resume, all tests being completed means nothing to resume + if (resumeSkipKeys && resumeSkippedCount > 0) { + console.log(`Nothing to resume — all ${resumeSkippedCount} test(s) already completed.`); + return; + } throw new Error('No tests matched the provided filters.'); } const progressReporter = createProgressReporter(options.workers ?? 
DEFAULT_WORKERS, { @@ -1338,7 +1408,14 @@ export async function runEvalCommand( }) : targetPrep.testCases; - if (applicableTestCases.length === 0) { + // --resume / --rerun-failed: skip tests that are already completed + const filteredTestCases = resumeSkipKeys + ? applicableTestCases.filter( + (test) => !resumeSkipKeys.has(buildTestTargetKey(test.id, targetName)), + ) + : applicableTestCases; + + if (filteredTestCases.length === 0) { return []; } @@ -1359,7 +1436,7 @@ export async function runEvalCommand( displayIdTracker, selection, inlineTargetLabel, - testCases: applicableTestCases, + testCases: filteredTestCases, trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig, matrixMode: targetPrep.selections.length > 1, totalBudgetUsd: targetPrep.totalBudgetUsd, @@ -1388,7 +1465,7 @@ export async function runEvalCommand( console.error( `\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`, ); - const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({ + const errorResults: EvaluationResult[] = filteredTestCases.map((testCase) => ({ timestamp: new Date().toISOString(), testId: testCase.id, score: 0, @@ -1428,9 +1505,19 @@ export async function runEvalCommand( ); } + // Flush the output writer so all results are on disk before we read back. + await outputWriter.close().catch(() => undefined); + + // When resuming, compute summary from ALL results (old + new, deduplicated) + let summaryResults = allResults; + if (isResumeAppend && usesDefaultArtifactWorkspace) { + const content = await readFile(outputPath, 'utf8'); + summaryResults = deduplicateByTestIdTarget(parseJsonlResults(content)); + } + const thresholdOpts = resolvedThreshold !== undefined ? 
{ threshold: resolvedThreshold } : undefined; - const summary = calculateEvaluationSummary(allResults, thresholdOpts); + const summary = calculateEvaluationSummary(summaryResults, thresholdOpts); console.log(formatEvaluationSummary(summary, thresholdOpts)); // Exit code: 2 when all tests are execution errors (no evaluation performed), @@ -1439,8 +1526,8 @@ export async function runEvalCommand( const thresholdFailed = resolvedThreshold !== undefined && summary.qualityFailureCount > 0; // Print matrix summary when multiple targets were evaluated - if (isMatrixMode && allResults.length > 0) { - console.log(formatMatrixSummary(allResults)); + if (isMatrixMode && summaryResults.length > 0) { + console.log(formatMatrixSummary(summaryResults)); } // Write Agent Skills benchmark.json if requested (deprecated flag — backward compat) @@ -1453,22 +1540,41 @@ export async function runEvalCommand( // Write artifacts to the run directory (always, not conditional on flags) if (usesDefaultArtifactWorkspace && allResults.length > 0) { const evalFile = activeTestFiles.length === 1 ? 
activeTestFiles[0] : ''; - const { - testArtifactDir, - timingPath, - benchmarkPath: workspaceBenchmarkPath, - indexPath, - } = await writeArtifactsFromResults(allResults, runDir, { - evalFile, - experiment: normalizeExperimentName(options.experiment), - }); - console.log(`Artifact workspace written to: ${runDir}`); - console.log(` Index: ${indexPath}`); - console.log( - ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`, - ); - console.log(` Timing: ${timingPath}`); - console.log(` Benchmark: ${workspaceBenchmarkPath}`); + if (isResumeAppend) { + // Resume mode: write per-test artifacts for newly-run tests, then aggregate + // from the full index.jsonl (old + new results with deduplication) + const { writePerTestArtifacts } = await import('./artifact-writer.js'); + await writePerTestArtifacts(allResults, runDir, { + experiment: normalizeExperimentName(options.experiment), + }); + const { benchmarkPath: workspaceBenchmarkPath, timingPath } = await aggregateRunDir( + runDir, + { evalFile, experiment: normalizeExperimentName(options.experiment) }, + ); + const indexPath = path.join(runDir, 'index.jsonl'); + console.log(`Artifact workspace updated: ${runDir}`); + console.log(` Index: ${indexPath}`); + console.log(` Per-test artifacts: ${runDir} (${allResults.length} new test directories)`); + console.log(` Timing: ${timingPath}`); + console.log(` Benchmark: ${workspaceBenchmarkPath}`); + } else { + const { + testArtifactDir, + timingPath, + benchmarkPath: workspaceBenchmarkPath, + indexPath, + } = await writeArtifactsFromResults(allResults, runDir, { + evalFile, + experiment: normalizeExperimentName(options.experiment), + }); + console.log(`Artifact workspace written to: ${runDir}`); + console.log(` Index: ${indexPath}`); + console.log( + ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`, + ); + console.log(` Timing: ${timingPath}`); + console.log(` Benchmark: ${workspaceBenchmarkPath}`); + } } // Write 
--export output files (additional formats) @@ -1541,14 +1647,14 @@ export async function runEvalCommand( }); } - // Suggest retry-errors command when execution errors are detected - if (summary.executionErrorCount > 0 && !options.retryErrors) { + // Suggest resume commands when execution errors are detected + if (summary.executionErrorCount > 0 && !options.retryErrors && !options.resume) { const evalFileArgs = activeTestFiles.map((f) => path.relative(cwd, f)).join(' '); const targetFlag = options.target ? ` --target ${options.target}` : ''; - const relativeOutputPath = path.relative(cwd, outputPath); + const relativeRunDir = path.relative(cwd, runDir); console.log( `\nTip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:\n` + - ` agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`, + ` agentv eval run ${evalFileArgs}${targetFlag} --output ${relativeRunDir} --rerun-failed`, ); } diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index 78561b03..59500586 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -48,7 +48,7 @@ export const app = subcommands({ * Known eval subcommand names — used to decide whether to inject the * implicit `run` subcommand for backward-compatible `agentv eval `. */ -const EVAL_SUBCOMMANDS = new Set(['run', 'assert']); +const EVAL_SUBCOMMANDS = new Set(['run', 'assert', 'aggregate']); /** * Top-level CLI command names (excluding `eval` itself). 
diff --git a/apps/cli/test/commands/eval/aggregate.test.ts b/apps/cli/test/commands/eval/aggregate.test.ts new file mode 100644 index 00000000..c79bb707 --- /dev/null +++ b/apps/cli/test/commands/eval/aggregate.test.ts @@ -0,0 +1,192 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; + +import type { EvaluationResult } from '@agentv/core'; +import { toSnakeCaseDeep } from '../../../src/utils/case-conversion.js'; + +import { + aggregateRunDir, + deduplicateByTestIdTarget, + parseJsonlResults, + writePerTestArtifacts, +} from '../../../src/commands/eval/artifact-writer.js'; + +function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult { + return { + timestamp: '2026-04-13T00:00:00.000Z', + testId: 'test-1', + score: 0.9, + assertions: [{ text: 'criterion-1', passed: true }], + output: [{ role: 'assistant' as const, content: 'test answer' }], + target: 'test-target', + executionStatus: 'ok', + ...overrides, + } as EvaluationResult; +} + +function writeJsonlIndex(dir: string, results: Partial<EvaluationResult>[]): string { + const indexPath = path.join(dir, 'index.jsonl'); + const lines = results.map((r) => JSON.stringify(toSnakeCaseDeep(makeResult(r)))).join('\n'); + writeFileSync(indexPath, `${lines}\n`); + return indexPath; +} + +// --------------------------------------------------------------------------- +// deduplicateByTestIdTarget +// --------------------------------------------------------------------------- + +describe('deduplicateByTestIdTarget', () => { + it('keeps last entry per (testId, target) pair', () => { + const results = [ + makeResult({ testId: 'a', target: 'x', score: 0.1 }), + makeResult({ testId: 'a', target: 'x', score: 0.9 }), + makeResult({ testId: 'b', target: 'x', score: 0.5 }), + ]; + const deduped = deduplicateByTestIdTarget(results); + expect(deduped).toHaveLength(2); + 
expect(deduped[0].testId).toBe('a'); + expect(deduped[0].score).toBe(0.9); + expect(deduped[1].testId).toBe('b'); + }); + + it('keeps entries with different targets', () => { + const results = [ + makeResult({ testId: 'a', target: 'x', score: 0.3 }), + makeResult({ testId: 'a', target: 'y', score: 0.7 }), + ]; + const deduped = deduplicateByTestIdTarget(results); + expect(deduped).toHaveLength(2); + }); + + it('handles empty input', () => { + expect(deduplicateByTestIdTarget([])).toHaveLength(0); + }); + + it('preserves order with no duplicates', () => { + const results = [ + makeResult({ testId: 'a', target: 'x' }), + makeResult({ testId: 'b', target: 'x' }), + makeResult({ testId: 'c', target: 'x' }), + ]; + const deduped = deduplicateByTestIdTarget(results); + expect(deduped.map((r) => r.testId)).toEqual(['a', 'b', 'c']); + }); + + it('deduplicates multiple duplicate pairs', () => { + const results = [ + makeResult({ testId: 'a', target: 'x', score: 0.1 }), + makeResult({ testId: 'b', target: 'x', score: 0.2 }), + makeResult({ testId: 'a', target: 'x', score: 0.3 }), + makeResult({ testId: 'b', target: 'x', score: 0.4 }), + ]; + const deduped = deduplicateByTestIdTarget(results); + expect(deduped).toHaveLength(2); + expect(deduped[0].score).toBe(0.3); + expect(deduped[1].score).toBe(0.4); + }); +}); + +// --------------------------------------------------------------------------- +// aggregateRunDir +// --------------------------------------------------------------------------- + +describe('aggregateRunDir', () => { + let tmpDir: string; + + beforeEach(() => { + tmpDir = mkdtempSync(path.join(tmpdir(), 'aggregate-test-')); + }); + + afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); + }); + + it('reads index.jsonl, deduplicates, writes benchmark.json and timing.json', async () => { + writeJsonlIndex(tmpDir, [ + { testId: 'a', target: 'x', score: 0.1, executionStatus: 'execution_error' }, + { testId: 'a', target: 'x', score: 0.9, 
executionStatus: 'ok' }, + { testId: 'b', target: 'x', score: 0.8, executionStatus: 'ok' }, + ]); + + const result = await aggregateRunDir(tmpDir); + expect(result.testCount).toBe(2); + expect(result.targetCount).toBe(1); + + const benchmark = JSON.parse(readFileSync(result.benchmarkPath, 'utf8')); + expect(benchmark.metadata.tests_run).toContain('a'); + expect(benchmark.metadata.tests_run).toContain('b'); + expect(benchmark.run_summary.x).toBeDefined(); + + const timing = JSON.parse(readFileSync(result.timingPath, 'utf8')); + expect(timing.total_tokens).toBeGreaterThanOrEqual(0); + }); + + it('uses last entry for duplicates in benchmark stats', async () => { + writeJsonlIndex(tmpDir, [ + { testId: 'a', target: 'x', score: 0.0, executionStatus: 'execution_error' }, + { testId: 'a', target: 'x', score: 1.0, executionStatus: 'ok' }, + ]); + + const result = await aggregateRunDir(tmpDir); + expect(result.testCount).toBe(1); + + const benchmark = JSON.parse(readFileSync(result.benchmarkPath, 'utf8')); + // Should have 100% pass rate since the last entry is ok with score 1.0 + expect(benchmark.run_summary.x.pass_rate.mean).toBe(1); + }); + + it('handles multi-target results', async () => { + writeJsonlIndex(tmpDir, [ + { testId: 'a', target: 'x', score: 0.9 }, + { testId: 'a', target: 'y', score: 0.8 }, + ]); + + const result = await aggregateRunDir(tmpDir); + expect(result.testCount).toBe(2); + expect(result.targetCount).toBe(2); + }); +}); + +// --------------------------------------------------------------------------- +// writePerTestArtifacts +// --------------------------------------------------------------------------- + +describe('writePerTestArtifacts', () => { + let tmpDir: string; + + beforeEach(() => { + tmpDir = mkdtempSync(path.join(tmpdir(), 'per-test-artifacts-')); + }); + + afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); + }); + + it('writes grading.json and timing.json for each result', async () => { + const results = 
[makeResult({ testId: 'test-1' }), makeResult({ testId: 'test-2' })]; + + await writePerTestArtifacts(results, tmpDir); + + const grading1 = JSON.parse(readFileSync(path.join(tmpDir, 'test-1', 'grading.json'), 'utf8')); + expect(grading1.assertions).toHaveLength(1); + + const timing1 = JSON.parse(readFileSync(path.join(tmpDir, 'test-1', 'timing.json'), 'utf8')); + expect(timing1.total_tokens).toBeGreaterThanOrEqual(0); + + const grading2 = JSON.parse(readFileSync(path.join(tmpDir, 'test-2', 'grading.json'), 'utf8')); + expect(grading2.assertions).toHaveLength(1); + }); + + it('writes response.md for results with output', async () => { + const results = [ + makeResult({ testId: 'test-1', output: [{ role: 'assistant' as const, content: 'hello' }] }), + ]; + + await writePerTestArtifacts(results, tmpDir); + + const response = readFileSync(path.join(tmpDir, 'test-1', 'outputs', 'response.md'), 'utf8'); + expect(response).toContain('hello'); + }); +});