Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions apps/cli/src/commands/results/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { subcommands } from 'cmd-ts';

import { resultsExportCommand } from './export.js';
import { resultsFailuresCommand } from './failures.js';
import { resultsReportCommand } from './report.js';
import { resultsShowCommand } from './show.js';
import { resultsSummaryCommand } from './summary.js';
import { resultsValidateCommand } from './validate.js';
Expand All @@ -11,6 +12,7 @@ export const resultsCommand = subcommands({
description: 'Inspect, export, and manage evaluation results',
cmds: {
export: resultsExportCommand,
report: resultsReportCommand,
summary: resultsSummaryCommand,
failures: resultsFailuresCommand,
show: resultsShowCommand,
Expand Down
2 changes: 2 additions & 0 deletions apps/cli/src/commands/results/report-template.ts

Large diffs are not rendered by default.

184 changes: 184 additions & 0 deletions apps/cli/src/commands/results/report.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
import path from 'node:path';

import { command, option, optional, string } from 'cmd-ts';

import type { EvaluationResult } from '@agentv/core';

import { loadManifestResults, parseResultManifest, resolveResultSourcePath } from './manifest.js';
import { RESULTS_REPORT_TEMPLATE } from './report-template.js';
import { resolveSourceFile, sourceArg } from './shared.js';

/**
 * The slice of a parsed result-manifest record that the report code reads.
 * Manifest rows are cast to this shape after parsing; other fields on the row
 * are ignored here.
 */
interface ReportManifestRecord {
// Optional eval file label/path stored on the row; it is normalized before use.
readonly eval_file?: string;
}

/**
 * The slice of `benchmark.json` (read from the directory of the source file)
 * that the report code reads. The JSON is cast to this shape without validation.
 */
interface BenchmarkMetadata {
readonly metadata?: {
// Eval file the run was produced from; used as a fallback label for report rows.
readonly eval_file?: string;
};
}

/**
 * Turns an eval file path or name into a short display label.
 *
 * The trimmed value is reduced to its basename, then the following trailing
 * suffixes are stripped one after another (case-insensitive):
 * `.results.jsonl`, `.eval.yaml`/`.eval.yml`, `.yaml`/`.yml`, `.jsonl`.
 *
 * @param value - Raw eval file path or name; may be undefined.
 * @returns The label, or `undefined` when the value is missing or blank.
 */
function normalizeEvalFileLabel(value: string | undefined): string | undefined {
  const candidate = value?.trim();
  if (candidate === undefined || candidate === '') {
    return undefined;
  }

  // Order matters: the compound suffixes must be removed before the bare ones.
  const suffixPatterns = [/\.results\.jsonl$/i, /\.eval\.ya?ml$/i, /\.ya?ml$/i, /\.jsonl$/i];

  let label = path.basename(candidate);
  for (const pattern of suffixPatterns) {
    label = label.replace(pattern, '');
  }
  return label;
}

/**
 * Looks for a `benchmark.json` next to the given source file and returns the
 * normalized `metadata.eval_file` label from it.
 *
 * This is best-effort: a missing file, unreadable file, or malformed JSON all
 * yield `undefined` rather than an error.
 *
 * @param sourceFile - Path of the result source file; only its directory is used.
 * @returns The normalized eval file label, or `undefined` when unavailable.
 */
function readBenchmarkEvalFile(sourceFile: string): string | undefined {
  const runDir = path.dirname(sourceFile);
  const candidatePath = path.join(runDir, 'benchmark.json');

  try {
    if (existsSync(candidatePath)) {
      const rawJson = readFileSync(candidatePath, 'utf8');
      const parsed = JSON.parse(rawJson) as BenchmarkMetadata;
      return normalizeEvalFileLabel(parsed.metadata?.eval_file);
    }
  } catch {
    // Deliberately ignored: benchmark metadata is optional for the report.
  }

  return undefined;
}

/**
 * Computes the default report location: `report.html` in the same directory
 * as the given source file.
 *
 * @param sourceFile - Path of the result source file.
 * @returns Path to `report.html` alongside `sourceFile`.
 */
export function deriveReportPath(sourceFile: string): string {
  const runDir = path.dirname(sourceFile);
  const reportPath = path.join(runDir, 'report.html');
  return reportPath;
}

/**
 * Converts one evaluation result into the snake_case row object that gets
 * embedded in the HTML report.
 *
 * The row's `eval_file` is chosen from the first available of:
 * 1. the normalized `eval_file` of the matching manifest record,
 * 2. the eval file label read from benchmark metadata,
 * 3. the normalized suite name of the result,
 * 4. the name of the directory containing `sourceFile`.
 *
 * @param result - The evaluation result to serialize.
 * @param sourceFile - Path of the result source file (used for the last fallback).
 * @param manifestRecord - Manifest record paired with this result, if any.
 * @param benchmarkEvalFile - Already-normalized label from benchmark metadata, if any.
 * @returns A plain object ready for JSON serialization.
 */
function serializeReportResult(
  result: EvaluationResult,
  sourceFile: string,
  manifestRecord?: ReportManifestRecord,
  benchmarkEvalFile?: string,
): Record<string, unknown> {
  // Walk the fallback chain step by step; each later source is only consulted
  // when every earlier one produced null/undefined.
  let evalFile: string | undefined = normalizeEvalFileLabel(manifestRecord?.eval_file);
  if (evalFile == null) {
    evalFile = benchmarkEvalFile;
  }
  if (evalFile == null) {
    evalFile = normalizeEvalFileLabel(result.suite);
  }
  if (evalFile == null) {
    evalFile = path.basename(path.dirname(sourceFile));
  }

  // Key order is kept stable because it determines the serialized JSON layout.
  const row: Record<string, unknown> = {
    timestamp: result.timestamp,
    test_id: result.testId,
    suite: result.suite,
    category: result.category,
    target: result.target,
    score: result.score,
    scores: result.scores,
    execution_status: result.executionStatus,
    error: result.error,
    duration_ms: result.durationMs,
    token_usage: result.tokenUsage,
    cost_usd: result.costUsd,
    input: result.input,
    output: result.output,
    assertions: result.assertions,
    eval_file: evalFile,
  };
  return row;
}

/**
 * Resolves a report source and loads everything needed to render a report.
 *
 * @param source - User-supplied source argument; may be undefined.
 * @param cwd - Directory against which the source is resolved.
 * @returns The resolved source file path, the loaded results, the parsed
 *   manifest records, and the benchmark eval file label when one is available.
 * @throws Error when the resolved source yields no results.
 */
export async function loadReportSource(
  source: string | undefined,
  cwd: string,
): Promise<{
  sourceFile: string;
  results: EvaluationResult[];
  records: readonly ReportManifestRecord[];
  benchmarkEvalFile?: string;
}> {
  const located = await resolveSourceFile(source, cwd);
  const manifestPath = resolveResultSourcePath(located.sourceFile, cwd);

  // The manifest text is parsed for per-row metadata, while the results
  // themselves are loaded separately from the same path.
  const manifestText = readFileSync(manifestPath, 'utf8');
  const records = parseResultManifest(manifestText) as ReportManifestRecord[];
  const results = loadManifestResults(manifestPath);

  const hasResults = results.length > 0;
  if (!hasResults) {
    throw new Error(`No results found in ${manifestPath}`);
  }

  const benchmarkEvalFile = readBenchmarkEvalFile(manifestPath);
  return { sourceFile: manifestPath, results, records, benchmarkEvalFile };
}

/**
 * Renders the static HTML report by embedding the serialized results into the
 * report template in place of `__DATA_PLACEHOLDER__`.
 *
 * @param results - Evaluation results to include, one report row each.
 * @param sourceFile - Path of the result source file (used for label fallbacks).
 * @param records - Manifest records; `records[i]` is paired with `results[i]`.
 * @param benchmarkEvalFile - Eval file label from benchmark metadata, if any.
 * @returns The complete report HTML.
 * @throws Error when the template does not contain the data placeholder.
 */
export function renderResultsReport(
  results: readonly EvaluationResult[],
  sourceFile: string,
  records: readonly ReportManifestRecord[],
  benchmarkEvalFile?: string,
): string {
  if (!RESULTS_REPORT_TEMPLATE.includes('__DATA_PLACEHOLDER__')) {
    throw new Error('Report template is missing __DATA_PLACEHOLDER__');
  }

  // NOTE(review): results and records are paired purely by index; this assumes
  // both loaders preserve manifest order and length — confirm against manifest.js.
  const rows = results.map((result, index) =>
    serializeReportResult(result, sourceFile, records[index], benchmarkEvalFile),
  );

  // Escape every `<` (not just `</`): besides `</script>`, sequences such as
  // `<!--` followed by `<script` inside the data can also derail the HTML
  // parser while it is inside a script element. In JSON output `<` can only
  // occur inside string literals, where `\u003c` is an equivalent spelling.
  const dataJson = JSON.stringify(rows).replace(/</g, '\\u003c');

  // Use a replacer function so `$&`, `$'`, `$$`, etc. occurring in the data are
  // inserted literally instead of being treated as replacement patterns.
  return RESULTS_REPORT_TEMPLATE.replace('__DATA_PLACEHOLDER__', () => dataJson);
}

/**
 * Loads a report source, renders the HTML report, and writes it to disk.
 *
 * The output location is `outputPath` when given (relative paths are resolved
 * against `cwd`), otherwise `report.html` next to the resolved source file.
 * Missing parent directories are created. After writing, the file is read back
 * and checked for a leftover data placeholder.
 *
 * @param source - User-supplied source argument; may be undefined.
 * @param outputPath - Optional destination for the HTML file.
 * @param cwd - Directory used to resolve the source and a relative `outputPath`.
 * @returns The resolved source file, the final output path, and the HTML as
 *   read back from disk.
 * @throws Error when the written file still contains `__DATA_PLACEHOLDER__`,
 *   or when loading/rendering fails.
 */
export async function writeResultsReport(
  source: string | undefined,
  outputPath: string | undefined,
  cwd: string,
): Promise<{ sourceFile: string; outputPath: string; html: string }> {
  const loaded = await loadReportSource(source, cwd);

  let destination: string;
  if (!outputPath) {
    destination = deriveReportPath(loaded.sourceFile);
  } else if (path.isAbsolute(outputPath)) {
    destination = outputPath;
  } else {
    destination = path.resolve(cwd, outputPath);
  }

  const html = renderResultsReport(
    loaded.results,
    loaded.sourceFile,
    loaded.records,
    loaded.benchmarkEvalFile,
  );

  const destinationDir = path.dirname(destination);
  mkdirSync(destinationDir, { recursive: true });
  writeFileSync(destination, html, 'utf8');

  // Sanity check against what actually landed on disk.
  const written = readFileSync(destination, 'utf8');
  const placeholderLeft = written.includes('__DATA_PLACEHOLDER__');
  if (placeholderLeft) {
    throw new Error('Report placeholder substitution failed');
  }

  return { sourceFile: loaded.sourceFile, outputPath: destination, html: written };
}

/**
 * `report` subcommand: generates a static HTML report from a result source.
 *
 * Options:
 * - `--out` / `-o`: output HTML file (defaults to `report.html` in the run directory).
 * - `--dir` / `-d`: working directory (defaults to the current directory).
 *
 * On success the output and source paths are printed. On failure the error is
 * printed to stderr and the process exits with code 1.
 */
export const resultsReportCommand = command({
  name: 'report',
  description: 'Generate a static HTML report from a run workspace or index.jsonl manifest',
  args: {
    source: sourceArg,
    out: option({
      type: optional(string),
      long: 'out',
      short: 'o',
      description: 'Output HTML file (defaults to <run-dir>/report.html)',
    }),
    dir: option({
      type: optional(string),
      long: 'dir',
      short: 'd',
      description: 'Working directory (default: current directory)',
    }),
  },
  handler: async ({ source, out, dir }) => {
    const cwd = dir ?? process.cwd();

    try {
      const { sourceFile, outputPath } = await writeResultsReport(source, out, cwd);
      console.log(`Report written to ${outputPath}`);
      console.log(`Source: ${sourceFile}`);
    } catch (error) {
      // Caught values are not guaranteed to be Error instances; narrow before
      // reading `.message` so non-Error throws do not print "Error: undefined".
      const message = error instanceof Error ? error.message : String(error);
      console.error(`Error: ${message}`);
      process.exit(1);
    }
  },
});
174 changes: 174 additions & 0 deletions apps/cli/test/commands/results/report.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import path from 'node:path';
import vm from 'node:vm';

import type { EvaluationResult, EvaluatorResult } from '@agentv/core';

import { writeArtifactsFromResults } from '../../../src/commands/eval/artifact-writer.js';
import {
deriveReportPath,
loadReportSource,
writeResultsReport,
} from '../../../src/commands/results/report.js';

/**
 * Test helper: builds an evaluator result whose verdict is derived from the
 * score (`'pass'` when the score is at least 0.5, otherwise `'fail'`).
 */
function makeScore(
  name: string,
  type: string,
  score: number,
  assertions: EvaluatorResult['assertions'],
): EvaluatorResult {
  const passed = score >= 0.5;
  const evaluatorResult: EvaluatorResult = {
    name,
    type,
    score,
    assertions,
    verdict: passed ? 'pass' : 'fail',
  };
  return evaluatorResult;
}

/**
 * Test helper: returns a passing evaluation result with fixed default values,
 * with any fields in `overrides` replacing the defaults.
 */
function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult {
  const defaults: EvaluationResult = {
    timestamp: '2026-04-15T01:00:00.000Z',
    testId: 'test-1',
    suite: 'default',
    score: 1,
    assertions: [{ text: 'fallback assertion', passed: true, evidence: 'ok' }],
    output: [{ role: 'assistant', content: 'answer' }],
    input: [{ role: 'user', content: 'question' }],
    target: 'default',
    executionStatus: 'ok',
    tokenUsage: { input: 100, output: 50 },
    durationMs: 1200,
  };

  // Overrides are spread last so they win over the defaults.
  return { ...defaults, ...overrides };
}

// Test suite for the results report module: default path derivation, source
// loading, HTML generation, and validity of the emitted inline script.
describe('results report', () => {
// Scratch directory, recreated for every test.
let tempDir: string;

beforeEach(() => {
tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-report-test-'));
});

afterEach(() => {
// `force: true` keeps cleanup from failing if the directory is already gone.
rmSync(tempDir, { recursive: true, force: true });
});

it('derives default report path from the run workspace', () => {
const sourceFile = path.join(tempDir, 'run', 'index.jsonl');
expect(deriveReportPath(sourceFile)).toBe(path.join(tempDir, 'run', 'report.html'));
});

it('loads benchmark eval file metadata from a run workspace', async () => {
const runDir = path.join(tempDir, 'run');
await writeArtifactsFromResults([makeResult()], runDir, { evalFile: 'evals/demo.eval.yaml' });

const loaded = await loadReportSource(runDir, tempDir);

expect(loaded.results).toHaveLength(1);
// The directory and `.eval.yaml` suffix are expected to be stripped from the label.
expect(loaded.benchmarkEvalFile).toBe('demo');
});

it('writes a static HTML report with grouped eval files and assertion type badges', async () => {
const runDir = path.join(tempDir, 'run');
// Write one passing and one failing result, each with a single typed score.
await writeArtifactsFromResults(
[
makeResult({
testId: 'registry-pass',
target: 'claude-sonnet',
scores: [
makeScore('contains', 'contains', 1, [
{ text: 'mentions registry', passed: true, evidence: 'registry present' },
]),
],
}),
makeResult({
testId: 'billing-fail',
target: 'gpt-5.4',
score: 0.2,
executionStatus: 'quality_failure',
scores: [
makeScore('regex', 'regex', 0.2, [
{ text: 'matches invoice pattern', passed: false, evidence: 'no invoice id' },
]),
],
}),
],
runDir,
{ evalFile: 'evals/demo.eval.yaml' },
);

// Rewrite index.jsonl so each row carries its own distinct eval_file value;
// the report is then expected to contain both labels.
const indexPath = path.join(runDir, 'index.jsonl');
const lines = readFileSync(indexPath, 'utf8')
.trim()
.split('\n')
.map((line) => JSON.parse(line) as Record<string, unknown>);
lines[0].eval_file = 'cw-freight-boolean-registry';
lines[1].eval_file = 'cw-freight-billing';
writeFileSync(indexPath, `${lines.map((line) => JSON.stringify(line)).join('\n')}\n`, 'utf8');

// No explicit output path: the report should land next to the manifest.
const { outputPath } = await writeResultsReport(runDir, undefined, tempDir);
const html = readFileSync(outputPath, 'utf8');

expect(outputPath).toBe(path.join(runDir, 'report.html'));
expect(html).not.toContain('__DATA_PLACEHOLDER__');
// NOTE(review): '#030712' is presumably a color from the template's stylesheet — confirm.
expect(html).toContain('#030712');
expect(html).toContain('cw-freight-boolean-registry');
expect(html).toContain('cw-freight-billing');
expect(html).toContain('contains');
expect(html).toContain('regex');
expect(html).toContain('AgentV Evaluation Report');
expect(html).not.toContain('<th>Progress</th>');
expect(html).not.toContain('metric-stack');
expect(html).toContain('<span class="pass-rate-track">');
// The next two expectations are single-quoted strings, so `${...}` is matched
// literally against the template's script source rather than interpolated here.
expect(html).toContain('<span class="pass-rate-label">${formatPercent(rate)}</span>');
expect(html).toContain(
'<span class="metric-value">${escapeHtml(formatPercent(group.stats.pass_rate))}</span>',
);
expect(html).toContain('Assertions');
expect(html).toContain('assertion-badge');
expect(html).not.toContain('Grader Results');
expect(html).not.toContain('Evaluator Results');
});

it('emits an inline report script that parses successfully', async () => {
const runDir = path.join(tempDir, 'run');
await writeArtifactsFromResults([makeResult()], runDir, { evalFile: 'evals/demo.eval.yaml' });

const { outputPath } = await writeResultsReport(runDir, undefined, tempDir);
const html = readFileSync(outputPath, 'utf8');
// Greedy match: captures from the first `<script>` through the last `</script>`.
const script = html.match(/<script>([\s\S]*)<\/script>/)?.[1];

expect(script).toBeString();

// Minimal stand-ins for the DOM nodes the script is given by the stubbed document below.
const app = { innerHTML: '' };
const headerMeta = { innerHTML: '' };
const tabNav = { classList: { add: () => undefined, remove: () => undefined } };
const tabButton = {
getAttribute: () => 'overview',
classList: { toggle: () => undefined },
addEventListener: () => undefined,
};

// Run the extracted script in a fresh VM context with a stubbed `document`;
// the test passes as long as evaluating the script does not throw.
expect(() =>
vm.runInNewContext(script as string, {
console,
document: {
documentElement: { classList: { contains: () => false, toggle: () => undefined } },
getElementById(id: string) {
if (id === 'app') return app;
if (id === 'header-meta') return headerMeta;
if (id === 'tab-nav') return tabNav;
if (id === 'theme-btn') return { addEventListener: () => undefined };
return null;
},
querySelectorAll(selector: string) {
return selector === '.tab' ? [tabButton] : [];
},
},
}),
).not.toThrow();
});
});
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading