Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,47 @@ import {
import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
import { RESULT_INDEX_FILENAME } from './result-layout.js';

/**
 * Builds the composite lookup key for a (test id, target) pair.
 *
 * Either part falls back to the literal `'unknown'` when it is `null` or
 * `undefined`; an empty string is kept as-is. The two parts are joined with
 * `::` and are not escaped, so ids that themselves contain `::` can produce
 * the same key as a different pair.
 *
 * @param testId Identifier of the test case, if known.
 * @param target Name of the evaluation target, if known.
 * @returns The key in the form `<testId>::<target>`.
 */
export function buildTestTargetKey(testId?: string, target?: string): string {
  const idPart = testId ?? 'unknown';
  const targetPart = target ?? 'unknown';
  return [idPart, targetPart].join('::');
}

/**
 * Collapses repeated results so each (test_id, target) pair appears once,
 * keeping the entry that occurs last in `results`.
 *
 * Surviving entries stay in their original relative order, i.e. they are
 * ordered by the position of each pair's final occurrence. The input array
 * is not modified.
 *
 * @param results Results in the order they were recorded.
 * @returns A new array holding only the last entry for every pair.
 */
export function deduplicateByTestIdTarget(results: readonly EvaluationResult[]): EvaluationResult[] {
  const keyOf = (entry: EvaluationResult): string =>
    buildTestTargetKey(entry.testId, entry.target);

  // Later entries overwrite earlier ones, leaving the final index per key.
  const lastIndexByKey = new Map<string, number>();
  for (const [index, entry] of results.entries()) {
    lastIndexByKey.set(keyOf(entry), index);
  }

  // An entry survives only if it sits at the final index recorded for its key.
  return results.filter((entry, index) => lastIndexByKey.get(keyOf(entry)) === index);
}

/**
 * Recomputes the aggregate artifacts of a run directory from its result index.
 *
 * Reads the index file (`RESULT_INDEX_FILENAME`) inside `runDir`, parses it
 * as JSONL results, deduplicates them by (test_id, target) keeping the last
 * entry, and then writes `timing.json` followed by `benchmark.json` into
 * `runDir` as 2-space-indented JSON with a trailing newline.
 *
 * @param runDir Directory that contains the result index file.
 * @param options Optional `evalFile` and `experiment` values, forwarded to
 *   `buildBenchmarkArtifact`.
 * @returns The paths of both written files, the number of deduplicated
 *   results (`testCount`), and the number of distinct targets among them
 *   (`targetCount`; a missing target counts as `'unknown'`).
 */
export async function aggregateRunDir(
  runDir: string,
  options?: { evalFile?: string; experiment?: string },
): Promise<{ benchmarkPath: string; timingPath: string; testCount: number; targetCount: number }> {
  const serialize = (artifact: unknown): string => `${JSON.stringify(artifact, null, 2)}\n`;

  const rawIndex = await readFile(path.join(runDir, RESULT_INDEX_FILENAME), 'utf8');
  const results = deduplicateByTestIdTarget(parseJsonlResults(rawIndex));

  // timing.json is written first, then benchmark.json.
  const timingPath = path.join(runDir, 'timing.json');
  await writeFile(timingPath, serialize(buildTimingArtifact(results)), 'utf8');

  const benchmarkPath = path.join(runDir, 'benchmark.json');
  const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
  await writeFile(benchmarkPath, serialize(benchmark), 'utf8');

  const targetNames = results.map(({ target }) => target ?? 'unknown');
  const targetCount = new Set(targetNames).size;

  return { benchmarkPath, timingPath, testCount: results.length, targetCount };
}

// ---------------------------------------------------------------------------
// Artifact interfaces (snake_case to match skill-creator conventions)
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -739,6 +780,45 @@ function buildTranscriptMessageLines(results: readonly EvaluationResult[]): stri
return lines.length > 0 ? `${lines.join('\n')}\n` : '';
}

/**
 * Writes the per-test artifact files for every result, one result at a time.
 *
 * For each result a subdirectory (named by `buildArtifactSubdir`) is created
 * under `outputDir` and filled with:
 * - `grading.json` and `timing.json` (2-space-indented JSON, trailing newline),
 * - `input.md`, only when `extractInput` returns a truthy value,
 * - `outputs/response.md`, only when the result has a non-empty `output`.
 *
 * @param results Results to write artifacts for.
 * @param outputDir Root directory; created recursively if missing.
 * @param options Accepted for interface compatibility; this function body
 *   does not read it.
 */
export async function writePerTestArtifacts(
  results: readonly EvaluationResult[],
  outputDir: string,
  options?: { experiment?: string },
): Promise<void> {
  const asJsonFile = (value: unknown): string => `${JSON.stringify(value, null, 2)}\n`;

  await mkdir(outputDir, { recursive: true });

  for (const result of results) {
    const gradingArtifact = buildGradingArtifact(result);
    const timingArtifact = buildTimingArtifact([result]);
    const testDir = path.join(outputDir, buildArtifactSubdir(result));

    await mkdir(testDir, { recursive: true });
    await writeFile(path.join(testDir, 'grading.json'), asJsonFile(gradingArtifact), 'utf8');
    await writeFile(path.join(testDir, 'timing.json'), asJsonFile(timingArtifact), 'utf8');

    const input = extractInput(result);
    if (input) {
      await writeFile(path.join(testDir, 'input.md'), input, 'utf8');
    }

    // Results without any output get no outputs/ directory at all.
    const response = result.output;
    if (!response || !(response.length > 0)) {
      continue;
    }

    const outputsDir = path.join(testDir, 'outputs');
    await mkdir(outputsDir, { recursive: true });
    await writeFile(path.join(outputsDir, 'response.md'), formatOutputMarkdown(response), 'utf8');
  }
}

export async function writeArtifactsFromResults(
results: readonly EvaluationResult[],
outputDir: string,
Expand Down
24 changes: 24 additions & 0 deletions apps/cli/src/commands/eval/commands/aggregate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import path from 'node:path';
import { command, positional, string } from 'cmd-ts';

import { aggregateRunDir } from '../artifact-writer.js';

/**
 * `aggregate` subcommand: takes a run directory as its single positional
 * argument, resolves it to an absolute path, delegates to `aggregateRunDir`,
 * and prints the result counts plus the paths of the two written files.
 */
export const evalAggregateCommand = command({
  name: 'aggregate',
  description:
    'Recompute benchmark.json and timing.json from a run directory. Deduplicates by (test_id, target), keeping the last entry.',
  args: {
    runDir: positional({
      type: string,
      displayName: 'run-dir',
      description: 'Path to a run directory containing index.jsonl',
    }),
  },
  handler: async ({ runDir }) => {
    const summary = await aggregateRunDir(path.resolve(runDir));

    console.log(`Aggregated ${summary.testCount} test result(s) across ${summary.targetCount} target(s)`);
    console.log(` Benchmark: ${summary.benchmarkPath}`);
    console.log(` Timing: ${summary.timingPath}`);
  },
});
12 changes: 12 additions & 0 deletions apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,16 @@ export const evalRunCommand = command({
description:
'Path to a previous run workspace or index.jsonl manifest — re-run only execution_error test cases',
}),
resume: flag({
long: 'resume',
description:
'Resume an interrupted run: skip already-completed tests and append new results to --output dir',
}),
rerunFailed: flag({
long: 'rerun-failed',
description:
'Rerun failed/errored tests while keeping passing results. Implies --resume semantics',
}),
strict: flag({
long: 'strict',
description: 'Exit with error on version mismatch (instead of warning)',
Expand Down Expand Up @@ -254,6 +264,8 @@ export const evalRunCommand = command({
otelCaptureContent: args.otelCaptureContent,
otelGroupTurns: args.otelGroupTurns,
retryErrors: args.retryErrors,
resume: args.resume,
rerunFailed: args.rerunFailed,
strict: args.strict,
benchmarkJson: args.benchmarkJson,
artifacts: args.artifacts,
Expand Down
2 changes: 2 additions & 0 deletions apps/cli/src/commands/eval/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { subcommands } from 'cmd-ts';

import { evalAggregateCommand } from './commands/aggregate.js';
import { evalAssertCommand } from './commands/assert.js';
import { evalRunCommand } from './commands/run.js';

Expand All @@ -9,5 +10,6 @@ export const evalCommand = subcommands({
cmds: {
run: evalRunCommand,
assert: evalAssertCommand,
aggregate: evalAggregateCommand,
},
});
5 changes: 3 additions & 2 deletions apps/cli/src/commands/eval/jsonl-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@ export class JsonlWriter {
this.stream = stream;
}

static async open(filePath: string): Promise<JsonlWriter> {
static async open(filePath: string, options?: { append?: boolean }): Promise<JsonlWriter> {
await mkdir(path.dirname(filePath), { recursive: true });
const stream = createWriteStream(filePath, { flags: 'w', encoding: 'utf8' });
const flags = options?.append ? 'a' : 'w';
const stream = createWriteStream(filePath, { flags, encoding: 'utf8' });
return new JsonlWriter(stream);
}

Expand Down
3 changes: 2 additions & 1 deletion apps/cli/src/commands/eval/output-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,11 @@ export interface WriterOptions {
export async function createOutputWriter(
filePath: string,
format: OutputFormat,
options?: { append?: boolean },
): Promise<OutputWriter> {
switch (format) {
case 'jsonl':
return JsonlWriter.open(filePath);
return JsonlWriter.open(filePath, { append: options?.append });
case 'yaml':
return YamlWriter.open(filePath);
case 'html':
Expand Down
Loading
Loading