diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index c7844ccf..c5cf1b26 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -10,6 +10,47 @@ import { import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; import { RESULT_INDEX_FILENAME } from './result-layout.js'; +export function buildTestTargetKey(testId?: string, target?: string): string { + return `${testId ?? 'unknown'}::${target ?? 'unknown'}`; +} + +// Deduplication helper — keeps the last entry per (test_id, target) pair. +export function deduplicateByTestIdTarget(results: readonly EvaluationResult[]): EvaluationResult[] { + const seen = new Map<string, number>(); + for (let i = 0; i < results.length; i++) { + seen.set(buildTestTargetKey(results[i].testId, results[i].target), i); + } + const deduped: EvaluationResult[] = []; + for (let i = 0; i < results.length; i++) { + const key = buildTestTargetKey(results[i].testId, results[i].target); + if (seen.get(key) === i) { + deduped.push(results[i]); + } + } + return deduped; +} + +export async function aggregateRunDir( + runDir: string, + options?: { evalFile?: string; experiment?: string }, +): Promise<{ benchmarkPath: string; timingPath: string; testCount: number; targetCount: number }> { + const indexPath = path.join(runDir, RESULT_INDEX_FILENAME); + const content = await readFile(indexPath, 'utf8'); + const allResults = parseJsonlResults(content); + const results = deduplicateByTestIdTarget(allResults); + + const timing = buildTimingArtifact(results); + const timingPath = path.join(runDir, 'timing.json'); + await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8'); + + const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment); + const benchmarkPath = path.join(runDir, 'benchmark.json'); + await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8'); + + const targetSet = 
new Set(results.map((r) => r.target ?? 'unknown')); + return { benchmarkPath, timingPath, testCount: results.length, targetCount: targetSet.size }; +} + // --------------------------------------------------------------------------- // Artifact interfaces (snake_case to match skill-creator conventions) // --------------------------------------------------------------------------- @@ -739,6 +780,45 @@ function buildTranscriptMessageLines(results: readonly EvaluationResult[]): stri return lines.length > 0 ? `${lines.join('\n')}\n` : ''; } +export async function writePerTestArtifacts( + results: readonly EvaluationResult[], + outputDir: string, + options?: { experiment?: string }, +): Promise<void> { + await mkdir(outputDir, { recursive: true }); + for (const result of results) { + const grading = buildGradingArtifact(result); + const timing = buildTimingArtifact([result]); + const artifactSubdir = buildArtifactSubdir(result); + const testDir = path.join(outputDir, artifactSubdir); + await mkdir(testDir, { recursive: true }); + await writeFile( + path.join(testDir, 'grading.json'), + `${JSON.stringify(grading, null, 2)}\n`, + 'utf8', + ); + await writeFile( + path.join(testDir, 'timing.json'), + `${JSON.stringify(timing, null, 2)}\n`, + 'utf8', + ); + + const input = extractInput(result); + if (input) { + await writeFile(path.join(testDir, 'input.md'), input, 'utf8'); + } + if (result.output && result.output.length > 0) { + const outputsDir = path.join(testDir, 'outputs'); + await mkdir(outputsDir, { recursive: true }); + await writeFile( + path.join(outputsDir, 'response.md'), + formatOutputMarkdown(result.output), + 'utf8', + ); + } + } +} + +export async function writeArtifactsFromResults( + results: readonly EvaluationResult[], + outputDir: string, diff --git a/apps/cli/src/commands/eval/commands/aggregate.ts b/apps/cli/src/commands/eval/commands/aggregate.ts new file mode 100644 index 00000000..7483b841 --- /dev/null +++ b/apps/cli/src/commands/eval/commands/aggregate.ts @@ 
-0,0 +1,24 @@ +import path from 'node:path'; +import { command, positional, string } from 'cmd-ts'; + +import { aggregateRunDir } from '../artifact-writer.js'; + +export const evalAggregateCommand = command({ + name: 'aggregate', + description: + 'Recompute benchmark.json and timing.json from a run directory. Deduplicates by (test_id, target), keeping the last entry.', + args: { + runDir: positional({ + type: string, + displayName: 'run-dir', + description: 'Path to a run directory containing index.jsonl', + }), + }, + handler: async (args) => { + const runDir = path.resolve(args.runDir); + const { benchmarkPath, timingPath, testCount, targetCount } = await aggregateRunDir(runDir); + console.log(`Aggregated ${testCount} test result(s) across ${targetCount} target(s)`); + console.log(` Benchmark: ${benchmarkPath}`); + console.log(` Timing: ${timingPath}`); + }, +}); diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index d8c2722e..18668aa5 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -160,6 +160,16 @@ export const evalRunCommand = command({ description: 'Path to a previous run workspace or index.jsonl manifest — re-run only execution_error test cases', }), + resume: flag({ + long: 'resume', + description: + 'Resume an interrupted run: skip already-completed tests and append new results to --output dir', + }), + rerunFailed: flag({ + long: 'rerun-failed', + description: + 'Rerun failed/errored tests while keeping passing results. 
Implies --resume semantics', + }), strict: flag({ long: 'strict', description: 'Exit with error on version mismatch (instead of warning)', @@ -254,6 +264,8 @@ export const evalRunCommand = command({ otelCaptureContent: args.otelCaptureContent, otelGroupTurns: args.otelGroupTurns, retryErrors: args.retryErrors, + resume: args.resume, + rerunFailed: args.rerunFailed, strict: args.strict, benchmarkJson: args.benchmarkJson, artifacts: args.artifacts, diff --git a/apps/cli/src/commands/eval/index.ts b/apps/cli/src/commands/eval/index.ts index 305590d1..d40b92ca 100644 --- a/apps/cli/src/commands/eval/index.ts +++ b/apps/cli/src/commands/eval/index.ts @@ -1,5 +1,6 @@ import { subcommands } from 'cmd-ts'; +import { evalAggregateCommand } from './commands/aggregate.js'; import { evalAssertCommand } from './commands/assert.js'; import { evalRunCommand } from './commands/run.js'; @@ -9,5 +10,6 @@ export const evalCommand = subcommands({ cmds: { run: evalRunCommand, assert: evalAssertCommand, + aggregate: evalAggregateCommand, }, }); diff --git a/apps/cli/src/commands/eval/jsonl-writer.ts b/apps/cli/src/commands/eval/jsonl-writer.ts index dc1c7d41..827bb964 100644 --- a/apps/cli/src/commands/eval/jsonl-writer.ts +++ b/apps/cli/src/commands/eval/jsonl-writer.ts @@ -15,9 +15,10 @@ export class JsonlWriter { this.stream = stream; } - static async open(filePath: string): Promise<JsonlWriter> { + static async open(filePath: string, options?: { append?: boolean }): Promise<JsonlWriter> { await mkdir(path.dirname(filePath), { recursive: true }); - const stream = createWriteStream(filePath, { flags: 'w', encoding: 'utf8' }); + const flags = options?.append ? 
'a' : 'w'; + const stream = createWriteStream(filePath, { flags, encoding: 'utf8' }); return new JsonlWriter(stream); } diff --git a/apps/cli/src/commands/eval/output-writer.ts b/apps/cli/src/commands/eval/output-writer.ts index e4d2cebd..f61a70f0 100644 --- a/apps/cli/src/commands/eval/output-writer.ts +++ b/apps/cli/src/commands/eval/output-writer.ts @@ -22,10 +22,11 @@ export interface WriterOptions { export async function createOutputWriter( filePath: string, format: OutputFormat, + options?: { append?: boolean }, ): Promise<OutputWriter> { switch (format) { case 'jsonl': - return JsonlWriter.open(filePath); + return JsonlWriter.open(filePath, { append: options?.append }); case 'yaml': return YamlWriter.open(filePath); case 'html': diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index e9f106c7..b1737f19 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -1,5 +1,5 @@ -import { constants, mkdirSync } from 'node:fs'; -import { access } from 'node:fs/promises'; +import { constants, existsSync, mkdirSync } from 'node:fs'; +import { access, readFile } from 'node:fs/promises'; import path from 'node:path'; import { pathToFileURL } from 'node:url'; @@ -30,7 +30,13 @@ import { import { enforceRequiredVersion } from '../../version-check.js'; import { maybeAutoExportRunArtifacts } from '../results/remote.js'; -import { writeArtifactsFromResults } from './artifact-writer.js'; +import { + aggregateRunDir, + buildTestTargetKey, + deduplicateByTestIdTarget, + parseJsonlResults, + writeArtifactsFromResults, +} from './artifact-writer.js'; import { writeBenchmarkJson } from './benchmark-writer.js'; import { loadEnvFromHierarchy } from './env.js'; import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js'; @@ -53,6 +59,16 @@ import { type TargetSelection, selectMultipleTargets, selectTarget } from './tar const DEFAULT_WORKERS = 3; +function 
shouldSkipExistingResultForResume( + result: Pick<EvaluationResult, 'executionStatus'>, + rerunFailed: boolean, +): boolean { + if (rerunFailed) { + return result.executionStatus === 'ok'; + } + return result.executionStatus !== 'execution_error'; +} + interface RunEvalCommandInput { readonly testFiles: readonly string[]; readonly rawOptions: Record<string, unknown>; @@ -85,6 +101,8 @@ interface NormalizedOptions { readonly otelCaptureContent: boolean; readonly otelGroupTurns: boolean; readonly retryErrors?: string; + readonly resume: boolean; + readonly rerunFailed: boolean; readonly workspaceMode?: 'pooled' | 'temp' | 'static'; readonly workspacePath?: string; readonly keepWorkspaces: boolean; @@ -356,6 +374,8 @@ function normalizeOptions( otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns) || yamlExecution?.otel_group_turns === true, retryErrors: normalizeString(rawOptions.retryErrors), + resume: normalizeBoolean(rawOptions.resume) || normalizeBoolean(rawOptions.rerunFailed), + rerunFailed: normalizeBoolean(rawOptions.rerunFailed), workspaceMode, workspacePath, // Precedence: CLI > YAML config > TS config @@ -946,6 +966,39 @@ export async function runEvalCommand( } } + // --resume / --rerun-failed: skip already-completed tests and append to existing output. + // IMPORTANT: JSONL must be loaded before the output writer is created (same file). + let resumeSkipKeys: Set<string> | undefined; + let isResumeAppend = false; + if (options.resume && !options.retryErrors) { + const explicitResumeDir = options.outputDir ?? 
options.artifacts; + if (explicitResumeDir) { + const resumeIndexPath = path.join(path.resolve(explicitResumeDir), 'index.jsonl'); + if (existsSync(resumeIndexPath)) { + const content = await readFile(resumeIndexPath, 'utf8'); + const existingResults = parseJsonlResults(content); + resumeSkipKeys = new Set<string>(); + for (const r of existingResults) { + if (shouldSkipExistingResultForResume(r, options.rerunFailed)) { + resumeSkipKeys.add(buildTestTargetKey(r.testId, r.target)); + } + } + isResumeAppend = true; + const modeLabel = options.rerunFailed ? 'Rerun-failed' : 'Resume'; + console.log( + `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${resumeSkipKeys.size} completed.`, + ); + } else { + // No existing index.jsonl — behave like a normal run + console.log('Resume: no existing index.jsonl found, starting fresh run.'); + } + } else { + console.warn( + 'Warning: --resume requires --output to identify the run directory. Ignoring --resume.', + ); + } + } + // Validate static workspace path exists and is a directory if (options.workspacePath) { const resolvedWorkspace = path.resolve(options.workspacePath); @@ -1203,13 +1256,17 @@ export async function runEvalCommand( // Additional --export paths get their own writers that receive all results after the run. const writerOptions = resolvedThreshold !== undefined ? 
{ threshold: resolvedThreshold } : undefined; - const outputWriter: OutputWriter = await createOutputWriter(primaryWritePath, 'jsonl'); + const outputWriter: OutputWriter = await createOutputWriter(primaryWritePath, 'jsonl', { + append: isResumeAppend, + }); // Detect matrix mode: multiple targets for any file const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1); // In matrix mode, total eval count is tests × targets (accounting for per-test target overrides) + // When resuming, subtract tests that will be skipped let totalEvalCount = 0; + let resumeSkippedCount = 0; for (const meta of fileMetadata.values()) { const suiteTargetNames = meta.selections.map((s) => s.selection.targetName); for (const test of meta.testCases) { @@ -1218,7 +1275,15 @@ export async function runEvalCommand( test.targets && test.targets.length > 0 ? test.targets.filter((t) => suiteTargetNames.includes(t)) : suiteTargetNames; - totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1; + const effectiveTargets = testTargetNames.length > 0 ? testTargetNames : ['unknown']; + for (const tn of effectiveTargets) { + const key = `${test.id}::${tn}`; + if (resumeSkipKeys?.has(key)) { + resumeSkippedCount++; + } else { + totalEvalCount++; + } + } } } @@ -1228,6 +1293,11 @@ export async function runEvalCommand( console.log('No execution errors or missing cases in the previous run. Nothing to retry.'); return; } + // When using --resume, all tests being completed means nothing to resume + if (resumeSkipKeys && resumeSkippedCount > 0) { + console.log(`Nothing to resume — all ${resumeSkippedCount} test(s) already completed.`); + return; + } throw new Error('No tests matched the provided filters.'); } const progressReporter = createProgressReporter(options.workers ?? 
DEFAULT_WORKERS, { @@ -1338,7 +1408,14 @@ export async function runEvalCommand( }) : targetPrep.testCases; - if (applicableTestCases.length === 0) { + // --resume / --rerun-failed: skip tests that are already completed + const filteredTestCases = resumeSkipKeys + ? applicableTestCases.filter( + (test) => !resumeSkipKeys.has(buildTestTargetKey(test.id, targetName)), + ) + : applicableTestCases; + + if (filteredTestCases.length === 0) { return []; } @@ -1359,7 +1436,7 @@ export async function runEvalCommand( displayIdTracker, selection, inlineTargetLabel, - testCases: applicableTestCases, + testCases: filteredTestCases, trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig, matrixMode: targetPrep.selections.length > 1, totalBudgetUsd: targetPrep.totalBudgetUsd, @@ -1388,7 +1465,7 @@ export async function runEvalCommand( console.error( `\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`, ); - const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({ + const errorResults: EvaluationResult[] = filteredTestCases.map((testCase) => ({ timestamp: new Date().toISOString(), testId: testCase.id, score: 0, @@ -1428,9 +1505,19 @@ export async function runEvalCommand( ); } + // Flush the output writer so all results are on disk before we read back. + await outputWriter.close().catch(() => undefined); + + // When resuming, compute summary from ALL results (old + new, deduplicated) + let summaryResults = allResults; + if (isResumeAppend && usesDefaultArtifactWorkspace) { + const content = await readFile(outputPath, 'utf8'); + summaryResults = deduplicateByTestIdTarget(parseJsonlResults(content)); + } + const thresholdOpts = resolvedThreshold !== undefined ? 
{ threshold: resolvedThreshold } : undefined; - const summary = calculateEvaluationSummary(allResults, thresholdOpts); + const summary = calculateEvaluationSummary(summaryResults, thresholdOpts); console.log(formatEvaluationSummary(summary, thresholdOpts)); // Exit code: 2 when all tests are execution errors (no evaluation performed), @@ -1439,8 +1526,8 @@ export async function runEvalCommand( const thresholdFailed = resolvedThreshold !== undefined && summary.qualityFailureCount > 0; // Print matrix summary when multiple targets were evaluated - if (isMatrixMode && allResults.length > 0) { - console.log(formatMatrixSummary(allResults)); + if (isMatrixMode && summaryResults.length > 0) { + console.log(formatMatrixSummary(summaryResults)); } // Write Agent Skills benchmark.json if requested (deprecated flag — backward compat) @@ -1453,22 +1540,41 @@ export async function runEvalCommand( // Write artifacts to the run directory (always, not conditional on flags) if (usesDefaultArtifactWorkspace && allResults.length > 0) { const evalFile = activeTestFiles.length === 1 ? 
activeTestFiles[0] : ''; - const { - testArtifactDir, - timingPath, - benchmarkPath: workspaceBenchmarkPath, - indexPath, - } = await writeArtifactsFromResults(allResults, runDir, { - evalFile, - experiment: normalizeExperimentName(options.experiment), - }); - console.log(`Artifact workspace written to: ${runDir}`); - console.log(` Index: ${indexPath}`); - console.log( - ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`, - ); - console.log(` Timing: ${timingPath}`); - console.log(` Benchmark: ${workspaceBenchmarkPath}`); + if (isResumeAppend) { + // Resume mode: write per-test artifacts for newly-run tests, then aggregate + // from the full index.jsonl (old + new results with deduplication) + const { writePerTestArtifacts } = await import('./artifact-writer.js'); + await writePerTestArtifacts(allResults, runDir, { + experiment: normalizeExperimentName(options.experiment), + }); + const { benchmarkPath: workspaceBenchmarkPath, timingPath } = await aggregateRunDir( + runDir, + { evalFile, experiment: normalizeExperimentName(options.experiment) }, + ); + const indexPath = path.join(runDir, 'index.jsonl'); + console.log(`Artifact workspace updated: ${runDir}`); + console.log(` Index: ${indexPath}`); + console.log(` Per-test artifacts: ${runDir} (${allResults.length} new test directories)`); + console.log(` Timing: ${timingPath}`); + console.log(` Benchmark: ${workspaceBenchmarkPath}`); + } else { + const { + testArtifactDir, + timingPath, + benchmarkPath: workspaceBenchmarkPath, + indexPath, + } = await writeArtifactsFromResults(allResults, runDir, { + evalFile, + experiment: normalizeExperimentName(options.experiment), + }); + console.log(`Artifact workspace written to: ${runDir}`); + console.log(` Index: ${indexPath}`); + console.log( + ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`, + ); + console.log(` Timing: ${timingPath}`); + console.log(` Benchmark: ${workspaceBenchmarkPath}`); + } } // Write 
--export output files (additional formats) @@ -1541,14 +1647,14 @@ export async function runEvalCommand( }); } - // Suggest retry-errors command when execution errors are detected - if (summary.executionErrorCount > 0 && !options.retryErrors) { + // Suggest resume commands when execution errors are detected + if (summary.executionErrorCount > 0 && !options.retryErrors && !options.resume) { const evalFileArgs = activeTestFiles.map((f) => path.relative(cwd, f)).join(' '); const targetFlag = options.target ? ` --target ${options.target}` : ''; - const relativeOutputPath = path.relative(cwd, outputPath); + const relativeRunDir = path.relative(cwd, runDir); console.log( `\nTip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:\n` + - ` agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`, + ` agentv eval run ${evalFileArgs}${targetFlag} --output ${relativeRunDir} --rerun-failed`, ); } diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index 78561b03..59500586 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -48,7 +48,7 @@ export const app = subcommands({ * Known eval subcommand names — used to decide whether to inject the * implicit `run` subcommand for backward-compatible `agentv eval `. */ -const EVAL_SUBCOMMANDS = new Set(['run', 'assert']); +const EVAL_SUBCOMMANDS = new Set(['run', 'assert', 'aggregate']); /** * Top-level CLI command names (excluding `eval` itself). 
diff --git a/apps/cli/test/commands/eval/aggregate.test.ts b/apps/cli/test/commands/eval/aggregate.test.ts new file mode 100644 index 00000000..c79bb707 --- /dev/null +++ b/apps/cli/test/commands/eval/aggregate.test.ts @@ -0,0 +1,192 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; + +import type { EvaluationResult } from '@agentv/core'; +import { toSnakeCaseDeep } from '../../../src/utils/case-conversion.js'; + +import { + aggregateRunDir, + deduplicateByTestIdTarget, + parseJsonlResults, + writePerTestArtifacts, +} from '../../../src/commands/eval/artifact-writer.js'; + +function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult { + return { + timestamp: '2026-04-13T00:00:00.000Z', + testId: 'test-1', + score: 0.9, + assertions: [{ text: 'criterion-1', passed: true }], + output: [{ role: 'assistant' as const, content: 'test answer' }], + target: 'test-target', + executionStatus: 'ok', + ...overrides, + } as EvaluationResult; +} + +function writeJsonlIndex(dir: string, results: Partial<EvaluationResult>[]): string { + const indexPath = path.join(dir, 'index.jsonl'); + const lines = results.map((r) => JSON.stringify(toSnakeCaseDeep(makeResult(r)))).join('\n'); + writeFileSync(indexPath, `${lines}\n`); + return indexPath; +} + +// --------------------------------------------------------------------------- +// deduplicateByTestIdTarget +// --------------------------------------------------------------------------- + +describe('deduplicateByTestIdTarget', () => { + it('keeps last entry per (testId, target) pair', () => { + const results = [ + makeResult({ testId: 'a', target: 'x', score: 0.1 }), + makeResult({ testId: 'a', target: 'x', score: 0.9 }), + makeResult({ testId: 'b', target: 'x', score: 0.5 }), + ]; + const deduped = deduplicateByTestIdTarget(results); + expect(deduped).toHaveLength(2); + 
expect(deduped[0].testId).toBe('a'); + expect(deduped[0].score).toBe(0.9); + expect(deduped[1].testId).toBe('b'); + }); + + it('keeps entries with different targets', () => { + const results = [ + makeResult({ testId: 'a', target: 'x', score: 0.3 }), + makeResult({ testId: 'a', target: 'y', score: 0.7 }), + ]; + const deduped = deduplicateByTestIdTarget(results); + expect(deduped).toHaveLength(2); + }); + + it('handles empty input', () => { + expect(deduplicateByTestIdTarget([])).toHaveLength(0); + }); + + it('preserves order with no duplicates', () => { + const results = [ + makeResult({ testId: 'a', target: 'x' }), + makeResult({ testId: 'b', target: 'x' }), + makeResult({ testId: 'c', target: 'x' }), + ]; + const deduped = deduplicateByTestIdTarget(results); + expect(deduped.map((r) => r.testId)).toEqual(['a', 'b', 'c']); + }); + + it('deduplicates multiple duplicate pairs', () => { + const results = [ + makeResult({ testId: 'a', target: 'x', score: 0.1 }), + makeResult({ testId: 'b', target: 'x', score: 0.2 }), + makeResult({ testId: 'a', target: 'x', score: 0.3 }), + makeResult({ testId: 'b', target: 'x', score: 0.4 }), + ]; + const deduped = deduplicateByTestIdTarget(results); + expect(deduped).toHaveLength(2); + expect(deduped[0].score).toBe(0.3); + expect(deduped[1].score).toBe(0.4); + }); +}); + +// --------------------------------------------------------------------------- +// aggregateRunDir +// --------------------------------------------------------------------------- + +describe('aggregateRunDir', () => { + let tmpDir: string; + + beforeEach(() => { + tmpDir = mkdtempSync(path.join(tmpdir(), 'aggregate-test-')); + }); + + afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); + }); + + it('reads index.jsonl, deduplicates, writes benchmark.json and timing.json', async () => { + writeJsonlIndex(tmpDir, [ + { testId: 'a', target: 'x', score: 0.1, executionStatus: 'execution_error' }, + { testId: 'a', target: 'x', score: 0.9, 
executionStatus: 'ok' }, + { testId: 'b', target: 'x', score: 0.8, executionStatus: 'ok' }, + ]); + + const result = await aggregateRunDir(tmpDir); + expect(result.testCount).toBe(2); + expect(result.targetCount).toBe(1); + + const benchmark = JSON.parse(readFileSync(result.benchmarkPath, 'utf8')); + expect(benchmark.metadata.tests_run).toContain('a'); + expect(benchmark.metadata.tests_run).toContain('b'); + expect(benchmark.run_summary.x).toBeDefined(); + + const timing = JSON.parse(readFileSync(result.timingPath, 'utf8')); + expect(timing.total_tokens).toBeGreaterThanOrEqual(0); + }); + + it('uses last entry for duplicates in benchmark stats', async () => { + writeJsonlIndex(tmpDir, [ + { testId: 'a', target: 'x', score: 0.0, executionStatus: 'execution_error' }, + { testId: 'a', target: 'x', score: 1.0, executionStatus: 'ok' }, + ]); + + const result = await aggregateRunDir(tmpDir); + expect(result.testCount).toBe(1); + + const benchmark = JSON.parse(readFileSync(result.benchmarkPath, 'utf8')); + // Should have 100% pass rate since the last entry is ok with score 1.0 + expect(benchmark.run_summary.x.pass_rate.mean).toBe(1); + }); + + it('handles multi-target results', async () => { + writeJsonlIndex(tmpDir, [ + { testId: 'a', target: 'x', score: 0.9 }, + { testId: 'a', target: 'y', score: 0.8 }, + ]); + + const result = await aggregateRunDir(tmpDir); + expect(result.testCount).toBe(2); + expect(result.targetCount).toBe(2); + }); +}); + +// --------------------------------------------------------------------------- +// writePerTestArtifacts +// --------------------------------------------------------------------------- + +describe('writePerTestArtifacts', () => { + let tmpDir: string; + + beforeEach(() => { + tmpDir = mkdtempSync(path.join(tmpdir(), 'per-test-artifacts-')); + }); + + afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); + }); + + it('writes grading.json and timing.json for each result', async () => { + const results = 
[makeResult({ testId: 'test-1' }), makeResult({ testId: 'test-2' })]; + + await writePerTestArtifacts(results, tmpDir); + + const grading1 = JSON.parse(readFileSync(path.join(tmpDir, 'test-1', 'grading.json'), 'utf8')); + expect(grading1.assertions).toHaveLength(1); + + const timing1 = JSON.parse(readFileSync(path.join(tmpDir, 'test-1', 'timing.json'), 'utf8')); + expect(timing1.total_tokens).toBeGreaterThanOrEqual(0); + + const grading2 = JSON.parse(readFileSync(path.join(tmpDir, 'test-2', 'grading.json'), 'utf8')); + expect(grading2.assertions).toHaveLength(1); + }); + + it('writes response.md for results with output', async () => { + const results = [ + makeResult({ testId: 'test-1', output: [{ role: 'assistant' as const, content: 'hello' }] }), + ]; + + await writePerTestArtifacts(results, tmpDir); + + const response = readFileSync(path.join(tmpDir, 'test-1', 'outputs', 'response.md'), 'utf8'); + expect(response).toContain('hello'); + }); +});