Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 30 additions & 34 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import {
DEFAULT_THRESHOLD,
type EvaluationResult,
type EvaluatorResult,
type TranscriptJsonLine,
toTranscriptJsonLines,
} from '@agentv/core';
import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
import { RESULT_INDEX_FILENAME } from './result-layout.js';
Expand Down Expand Up @@ -711,6 +711,34 @@ export async function writeArtifacts(
return writeArtifactsFromResults(results, outputDir, options);
}

/**
 * Builds the text body of `transcript.jsonl` from a list of evaluation results.
 *
 * For each result, its input messages followed by its output messages are passed to
 * `toTranscriptJsonLines` together with the result's provider, session, timing, token-usage and
 * cost metadata. Every line object returned is JSON-encoded onto its own line.
 *
 * @param results - Evaluation results to serialize, in the order they should appear in the file.
 * @returns Newline-delimited JSON terminated by a trailing newline, or `''` when no lines were
 *   produced.
 */
function buildTranscriptMessageLines(results: readonly EvaluationResult[]): string {
  const lines: string[] = [];

  for (const result of results) {
    // NOTE(review): guards against a plain-string `input`. Spreading a string would yield one
    // array element per character, so it is recorded as a single user message instead.
    // Confirm against the EvaluationResult type whether a string can still occur here.
    const inputMessages =
      typeof result.input === 'string'
        ? [{ role: 'user' as const, content: result.input }]
        : (result.input ?? []);

    const transcriptLines = toTranscriptJsonLines(
      {
        messages: [...inputMessages, ...result.output],
        source: {
          provider: result.target,
          // Fall back to the test id so every line still carries a session identifier.
          sessionId: result.conversationId ?? result.testId,
          startedAt: result.timestamp,
        },
        tokenUsage: result.tokenUsage,
        durationMs: result.durationMs,
        costUsd: result.costUsd,
      },
      {
        testId: result.testId,
        target: result.target,
      },
    );

    // Push one at a time: `push(...array)` passes every element as a call argument, which can
    // exceed engine argument limits for very large transcripts.
    for (const line of transcriptLines) {
      lines.push(JSON.stringify(line));
    }
  }

  return lines.length > 0 ? `${lines.join('\n')}\n` : '';
}

export async function writeArtifactsFromResults(
results: readonly EvaluationResult[],
outputDir: string,
Expand Down Expand Up @@ -773,39 +801,7 @@ export async function writeArtifactsFromResults(

// Write transcript JSONL (auto-generated on every eval run)
const transcriptPath = path.join(outputDir, 'transcript.jsonl');
const transcriptLines: TranscriptJsonLine[] = results.map((result) => {
let inputText = '';
if (typeof result.input === 'string') {
inputText = result.input;
} else if (Array.isArray(result.input)) {
const firstUserMsg = result.input.find((m) => m.role === 'user');
inputText = typeof firstUserMsg?.content === 'string' ? firstUserMsg.content : '';
}
return {
input: inputText,
output: result.output,
token_usage: result.tokenUsage
? {
input: result.tokenUsage.input,
output: result.tokenUsage.output,
cached: result.tokenUsage.cached,
}
: undefined,
duration_ms: result.durationMs,
cost_usd: result.costUsd,
source: {
provider: result.target,
session_id: result.conversationId ?? result.testId,
timestamp: result.timestamp,
},
};
});
await writeFile(
transcriptPath,
transcriptLines.map((line) => JSON.stringify(line)).join('\n') +
(transcriptLines.length ? '\n' : ''),
'utf8',
);
await writeFile(transcriptPath, buildTranscriptMessageLines(results), 'utf8');

return { testArtifactDir, timingPath, benchmarkPath, indexPath };
}
6 changes: 3 additions & 3 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1260,22 +1260,22 @@ export async function runEvalCommand(
// Use only files that survived tag filtering (fileMetadata keys)
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));

// --transcript: create a shared TranscriptProvider and validate line count
// --transcript: create a shared TranscriptProvider and validate entry count
let transcriptProviderFactory:
| ((target: import('@agentv/core').ResolvedTarget) => import('@agentv/core').Provider)
| undefined;
if (options.transcript) {
const { TranscriptProvider } = await import('@agentv/core');
const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);

// Validate: transcript lines must match total test cases across all files
// Validate: transcript entries must match total test cases across all files
const totalTests = [...fileMetadata.values()].reduce(
(sum, meta) => sum + meta.testCases.length,
0,
);
if (transcriptProvider.lineCount !== totalTests) {
throw new Error(
`Transcript has ${transcriptProvider.lineCount} entry(s) but eval defines ${totalTests} test(s). Each transcript line maps positionally to one test case.`,
`Transcript has ${transcriptProvider.lineCount} entr${transcriptProvider.lineCount === 1 ? 'y' : 'ies'} but eval defines ${totalTests} test(s). Each transcript entry maps positionally to one test case.`,
);
}

Expand Down
12 changes: 8 additions & 4 deletions apps/cli/src/commands/import/claude.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import {
discoverClaudeSessions,
parseClaudeSession,
readTranscriptFile,
toTranscriptJsonLine,
toTranscriptJsonLines,
} from '@agentv/core';
import { command, flag, option, optional, string } from 'cmd-ts';

Expand Down Expand Up @@ -94,9 +94,13 @@ export const importClaudeCommand = command({
// Ensure output directory exists
await mkdir(path.dirname(outputPath), { recursive: true });

// Write transcript as JSONL (one line per test case, snake_case wire format)
const jsonLine = toTranscriptJsonLine(transcript);
await writeFile(outputPath, `${JSON.stringify(jsonLine)}\n`, 'utf8');
// Write transcript as JSONL (one message per line, grouped by test_id)
const jsonLines = toTranscriptJsonLines(transcript);
await writeFile(
outputPath,
`${jsonLines.map((line) => JSON.stringify(line)).join('\n')}\n`,
'utf8',
);

const msgCount = transcript.messages.length;
const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 0), 0);
Expand Down
12 changes: 8 additions & 4 deletions apps/cli/src/commands/import/codex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import {
discoverCodexSessions,
parseCodexSession,
readTranscriptFile,
toTranscriptJsonLine,
toTranscriptJsonLines,
} from '@agentv/core';
import { command, flag, option, optional, string } from 'cmd-ts';

Expand Down Expand Up @@ -91,9 +91,13 @@ export const importCodexCommand = command({
// Ensure output directory exists
await mkdir(path.dirname(outputPath), { recursive: true });

// Write transcript as JSONL (snake_case wire format)
const jsonLine = toTranscriptJsonLine(transcript);
await writeFile(outputPath, `${JSON.stringify(jsonLine)}\n`, 'utf8');
// Write transcript as JSONL (one message per line, grouped by test_id)
const jsonLines = toTranscriptJsonLines(transcript);
await writeFile(
outputPath,
`${jsonLines.map((line) => JSON.stringify(line)).join('\n')}\n`,
'utf8',
);

const msgCount = transcript.messages.length;
const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 0), 0);
Expand Down
12 changes: 8 additions & 4 deletions apps/cli/src/commands/import/copilot.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { mkdir, readFile, writeFile } from 'node:fs/promises';
import path from 'node:path';
import { discoverCopilotSessions, parseCopilotEvents, toTranscriptJsonLine } from '@agentv/core';
import { discoverCopilotSessions, parseCopilotEvents, toTranscriptJsonLines } from '@agentv/core';
import { command, flag, option, optional, string } from 'cmd-ts';

export const importCopilotCommand = command({
Expand Down Expand Up @@ -99,9 +99,13 @@ export const importCopilotCommand = command({
// Ensure output directory exists
await mkdir(path.dirname(outputPath), { recursive: true });

// Write transcript as JSONL (snake_case wire format)
const jsonLine = toTranscriptJsonLine(transcript);
await writeFile(outputPath, `${JSON.stringify(jsonLine)}\n`, 'utf8');
// Write transcript as JSONL (one message per line, grouped by test_id)
const jsonLines = toTranscriptJsonLines(transcript);
await writeFile(
outputPath,
`${jsonLines.map((line) => JSON.stringify(line)).join('\n')}\n`,
'utf8',
);

const msgCount = transcript.messages.length;
const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 0), 0);
Expand Down
74 changes: 74 additions & 0 deletions apps/cli/test/commands/eval/artifact-writer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,80 @@ describe('writeArtifactsFromResults', () => {
expect(timingOne.duration_ms).toBe(0);
});

it('writes transcript.jsonl as one message object per line', async () => {
  // One evaluated case: a user prompt answered by a single assistant turn with one tool call.
  const evaluated = makeResult({
    testId: 'transcript-case',
    target: 'codex',
    conversationId: 'session-123',
    durationMs: 4200,
    costUsd: 0.25,
    tokenUsage: { input: 100, output: 40, cached: 10, reasoning: 5 },
    input: [{ role: 'user' as const, content: 'Inspect artifact output' }],
    output: [
      {
        role: 'assistant' as const,
        content: 'Reading artifact-writer.ts',
        toolCalls: [
          {
            tool: 'Read',
            input: { file_path: 'apps/cli/src/commands/eval/artifact-writer.ts' },
            output: 'file contents',
          },
        ],
      },
    ],
  });

  await writeArtifactsFromResults([evaluated], testDir);

  // Read the artifact back and decode each JSONL line independently.
  const rawTranscript = await readFile(path.join(testDir, 'transcript.jsonl'), 'utf8');
  const parsedLines = rawTranscript
    .trim()
    .split('\n')
    .map((line) => JSON.parse(line));

  // Fields that every message line of this case is expected to repeat.
  const sharedFields = {
    test_id: 'transcript-case',
    target: 'codex',
    transcript_token_usage: { input: 100, output: 40, cached: 10, reasoning: 5 },
    transcript_duration_ms: 4200,
    transcript_cost_usd: 0.25,
    source: {
      provider: 'codex',
      session_id: 'session-123',
      timestamp: '2026-03-13T00:00:00.000Z',
    },
  };

  expect(parsedLines).toEqual([
    {
      ...sharedFields,
      message_index: 0,
      role: 'user',
      content: 'Inspect artifact output',
    },
    {
      ...sharedFields,
      message_index: 1,
      role: 'assistant',
      content: 'Reading artifact-writer.ts',
      tool_calls: [
        {
          tool: 'Read',
          input: { file_path: 'apps/cli/src/commands/eval/artifact-writer.ts' },
          output: 'file contents',
        },
      ],
    },
  ]);
});

it('sanitizes test IDs for directory names', async () => {
const results = [makeResult({ testId: 'path/to:test*1' })];
await writeArtifactsFromResults(results, testDir);
Expand Down
4 changes: 3 additions & 1 deletion packages/core/src/import/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@ export {
} from './session-discovery.js';
export { TranscriptProvider } from './transcript-provider.js';
export {
groupTranscriptJsonLines,
readTranscriptFile,
readTranscriptJsonl,
toTranscriptJsonLine,
toTranscriptJsonLines,
type TranscriptEntry,
type TranscriptJsonLine,
type TranscriptReplayEntry,
type TranscriptSource,
} from './types.js';

Expand Down
42 changes: 22 additions & 20 deletions packages/core/src/import/transcript-provider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,21 @@
*/

import type { Provider, ProviderRequest, ProviderResponse } from '../evaluation/providers/types.js';
import type { TranscriptJsonLine } from './types.js';
import { readTranscriptJsonl } from './types.js';
import type { TranscriptReplayEntry } from './types.js';
import { groupTranscriptJsonLines, readTranscriptJsonl } from './types.js';

export class TranscriptProvider implements Provider {
readonly id: string;
readonly kind = 'transcript' as const;
readonly targetName: string;

private lines: TranscriptJsonLine[];
private entries: TranscriptReplayEntry[];
private cursor = 0;

constructor(targetName: string, lines: TranscriptJsonLine[]) {
constructor(targetName: string, entries: TranscriptReplayEntry[]) {
this.targetName = targetName;
this.id = `transcript:${targetName}`;
this.lines = lines;
this.entries = entries;
}

/**
Expand All @@ -40,36 +40,38 @@ export class TranscriptProvider implements Provider {
if (lines.length === 0) {
throw new Error(`Transcript file is empty: ${filePath}`);
}
const providerName = lines[0].source.provider ?? 'transcript';
return new TranscriptProvider(providerName, lines);
const entries = groupTranscriptJsonLines(lines);
const providerName = entries[0]?.source.provider ?? 'transcript';
return new TranscriptProvider(providerName, entries);
}

get lineCount(): number {
return this.lines.length;
return this.entries.length;
}

async invoke(_request: ProviderRequest): Promise<ProviderResponse> {
if (this.cursor >= this.lines.length) {
if (this.cursor >= this.entries.length) {
throw new Error(
`Transcript exhausted: ${this.lines.length} line(s) available but ` +
`${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`,
`Transcript exhausted: ${this.entries.length} entr${this.entries.length === 1 ? 'y' : 'ies'} available but ` +
`${this.cursor + 1} invocations attempted. Each transcript entry maps to one test case.`,
);
}

const line = this.lines[this.cursor++];
const entry = this.entries[this.cursor++];

return {
output: line.output,
tokenUsage: line.token_usage
output: entry.messages,
tokenUsage: entry.tokenUsage
? {
input: line.token_usage.input,
output: line.token_usage.output,
cached: line.token_usage.cached,
input: entry.tokenUsage.input,
output: entry.tokenUsage.output,
cached: entry.tokenUsage.cached,
reasoning: entry.tokenUsage.reasoning,
}
: undefined,
durationMs: line.duration_ms,
costUsd: line.cost_usd ?? undefined,
startTime: line.source.timestamp,
durationMs: entry.durationMs,
costUsd: entry.costUsd ?? undefined,
startTime: entry.source.startedAt,
};
}
}
Loading
Loading