diff --git a/apps/cli/src/commands/results/index.ts b/apps/cli/src/commands/results/index.ts index c09d51d3e..5dc90626a 100644 --- a/apps/cli/src/commands/results/index.ts +++ b/apps/cli/src/commands/results/index.ts @@ -2,6 +2,7 @@ import { subcommands } from 'cmd-ts'; import { resultsExportCommand } from './export.js'; import { resultsFailuresCommand } from './failures.js'; +import { resultsReportCommand } from './report.js'; import { resultsShowCommand } from './show.js'; import { resultsSummaryCommand } from './summary.js'; import { resultsValidateCommand } from './validate.js'; @@ -11,6 +12,7 @@ export const resultsCommand = subcommands({ description: 'Inspect, export, and manage evaluation results', cmds: { export: resultsExportCommand, + report: resultsReportCommand, summary: resultsSummaryCommand, failures: resultsFailuresCommand, show: resultsShowCommand, diff --git a/apps/cli/src/commands/results/report-template.ts b/apps/cli/src/commands/results/report-template.ts new file mode 100644 index 000000000..983728d61 --- /dev/null +++ b/apps/cli/src/commands/results/report-template.ts @@ -0,0 +1,2 @@ +export const RESULTS_REPORT_TEMPLATE = + "\n\n\n \n \n AgentV Evaluation Report\n \n\n\n
\n
\n
\n
AgentV static export
\n

Evaluation Report

\n

Studio-themed HTML generated from an existing AgentV results workspace.

\n
\n
\n
\n\n \n\n
\n
\n\n \n\n\n"; diff --git a/apps/cli/src/commands/results/report.ts b/apps/cli/src/commands/results/report.ts new file mode 100644 index 000000000..5158cabfc --- /dev/null +++ b/apps/cli/src/commands/results/report.ts @@ -0,0 +1,184 @@ +import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'; +import path from 'node:path'; + +import { command, option, optional, string } from 'cmd-ts'; + +import type { EvaluationResult } from '@agentv/core'; + +import { loadManifestResults, parseResultManifest, resolveResultSourcePath } from './manifest.js'; +import { RESULTS_REPORT_TEMPLATE } from './report-template.js'; +import { resolveSourceFile, sourceArg } from './shared.js'; + +interface ReportManifestRecord { + readonly eval_file?: string; +} + +interface BenchmarkMetadata { + readonly metadata?: { + readonly eval_file?: string; + }; +} + +function normalizeEvalFileLabel(value: string | undefined): string | undefined { + const trimmed = value?.trim(); + if (!trimmed) { + return undefined; + } + + return path + .basename(trimmed) + .replace(/\.results\.jsonl$/i, '') + .replace(/\.eval\.ya?ml$/i, '') + .replace(/\.ya?ml$/i, '') + .replace(/\.jsonl$/i, ''); +} + +function readBenchmarkEvalFile(sourceFile: string): string | undefined { + const benchmarkPath = path.join(path.dirname(sourceFile), 'benchmark.json'); + if (!existsSync(benchmarkPath)) { + return undefined; + } + + try { + const benchmark = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as BenchmarkMetadata; + return normalizeEvalFileLabel(benchmark.metadata?.eval_file); + } catch { + return undefined; + } +} + +export function deriveReportPath(sourceFile: string): string { + return path.join(path.dirname(sourceFile), 'report.html'); +} + +function serializeReportResult( + result: EvaluationResult, + sourceFile: string, + manifestRecord?: ReportManifestRecord, + benchmarkEvalFile?: string, +): Record { + const fallbackEvalFile = + normalizeEvalFileLabel(manifestRecord?.eval_file) ?? 
+ benchmarkEvalFile ?? + normalizeEvalFileLabel(result.suite) ?? + path.basename(path.dirname(sourceFile)); + + return { + timestamp: result.timestamp, + test_id: result.testId, + suite: result.suite, + category: result.category, + target: result.target, + score: result.score, + scores: result.scores, + execution_status: result.executionStatus, + error: result.error, + duration_ms: result.durationMs, + token_usage: result.tokenUsage, + cost_usd: result.costUsd, + input: result.input, + output: result.output, + assertions: result.assertions, + eval_file: fallbackEvalFile, + }; +} + +export async function loadReportSource( + source: string | undefined, + cwd: string, +): Promise<{ + sourceFile: string; + results: EvaluationResult[]; + records: readonly ReportManifestRecord[]; + benchmarkEvalFile?: string; +}> { + const { sourceFile } = await resolveSourceFile(source, cwd); + const resolvedSourceFile = resolveResultSourcePath(sourceFile, cwd); + const content = readFileSync(resolvedSourceFile, 'utf8'); + const records = parseResultManifest(content) as ReportManifestRecord[]; + const results = loadManifestResults(resolvedSourceFile); + + if (results.length === 0) { + throw new Error(`No results found in ${resolvedSourceFile}`); + } + + return { + sourceFile: resolvedSourceFile, + results, + records, + benchmarkEvalFile: readBenchmarkEvalFile(resolvedSourceFile), + }; +} + +/** Serializes each result into a report row (pairing it with the manifest record at the same index) and injects the rows as JSON into the HTML template. Throws if the template has no __DATA_PLACEHOLDER__ token. */ export function renderResultsReport( + results: readonly EvaluationResult[], + sourceFile: string, + records: readonly ReportManifestRecord[], + benchmarkEvalFile?: string, +): string { + if (!RESULTS_REPORT_TEMPLATE.includes('__DATA_PLACEHOLDER__')) { + throw new Error('Report template is missing __DATA_PLACEHOLDER__'); + } + + const rows = results.map((result, index) => + serializeReportResult(result, sourceFile, records[index], benchmarkEvalFile), + ); + const dataJson = JSON.stringify(rows).replace(/<\//g, '<\\/'); /* keep a closing script tag inside the data from ending the inline script */ + return RESULTS_REPORT_TEMPLATE.replace('__DATA_PLACEHOLDER__', () => dataJson); /* replacer function: a plain string replacement would expand dollar patterns such as $& or $$ that occur in result data */ +} +
+export async function writeResultsReport( + source: string | undefined, + outputPath: string | undefined, + cwd: string, +): Promise<{ sourceFile: string; outputPath: string; html: string }> { + const { sourceFile, results, records, benchmarkEvalFile } = await loadReportSource(source, cwd); + const resolvedOutputPath = outputPath + ? path.isAbsolute(outputPath) + ? outputPath + : path.resolve(cwd, outputPath) + : deriveReportPath(sourceFile); + const html = renderResultsReport(results, sourceFile, records, benchmarkEvalFile); + + mkdirSync(path.dirname(resolvedOutputPath), { recursive: true }); + writeFileSync(resolvedOutputPath, html, 'utf8'); + + const written = readFileSync(resolvedOutputPath, 'utf8'); + if (written.includes('__DATA_PLACEHOLDER__')) { + throw new Error('Report placeholder substitution failed'); + } + + return { sourceFile, outputPath: resolvedOutputPath, html: written }; +} + +export const resultsReportCommand = command({ + name: 'report', + description: 'Generate a static HTML report from a run workspace or index.jsonl manifest', + args: { + source: sourceArg, + out: option({ + type: optional(string), + long: 'out', + short: 'o', + description: 'Output HTML file (defaults to /report.html)', + }), + dir: option({ + type: optional(string), + long: 'dir', + short: 'd', + description: 'Working directory (default: current directory)', + }), + }, + handler: async ({ source, out, dir }) => { + const cwd = dir ?? 
process.cwd(); + + try { + const { sourceFile, outputPath } = await writeResultsReport(source, out, cwd); + console.log(`Report written to ${outputPath}`); + console.log(`Source: ${sourceFile}`); + } catch (error) { + console.error(`Error: ${error instanceof Error ? error.message : String(error)}`); /* narrow first: a thrown value is not guaranteed to be an Error instance */ + process.exit(1); + } + }, +}); diff --git a/apps/cli/test/commands/results/report.test.ts b/apps/cli/test/commands/results/report.test.ts new file mode 100644 index 000000000..e33b5de87 --- /dev/null +++ b/apps/cli/test/commands/results/report.test.ts @@ -0,0 +1,174 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import vm from 'node:vm'; + +import type { EvaluationResult, EvaluatorResult } from '@agentv/core'; + +import { writeArtifactsFromResults } from '../../../src/commands/eval/artifact-writer.js'; +import { + deriveReportPath, + loadReportSource, + writeResultsReport, +} from '../../../src/commands/results/report.js'; + +function makeScore( + name: string, + type: string, + score: number, + assertions: EvaluatorResult['assertions'], +): EvaluatorResult { + return { + name, + type, + score, + assertions, + verdict: score >= 0.5 ?
'pass' : 'fail', + }; +} + +function makeResult(overrides: Partial = {}): EvaluationResult { + return { + timestamp: '2026-04-15T01:00:00.000Z', + testId: 'test-1', + suite: 'default', + score: 1, + assertions: [{ text: 'fallback assertion', passed: true, evidence: 'ok' }], + output: [{ role: 'assistant', content: 'answer' }], + input: [{ role: 'user', content: 'question' }], + target: 'default', + executionStatus: 'ok', + tokenUsage: { input: 100, output: 50 }, + durationMs: 1200, + ...overrides, + }; +} + +describe('results report', () => { + let tempDir: string; + + beforeEach(() => { + tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-report-test-')); + }); + + afterEach(() => { + rmSync(tempDir, { recursive: true, force: true }); + }); + + it('derives default report path from the run workspace', () => { + const sourceFile = path.join(tempDir, 'run', 'index.jsonl'); + expect(deriveReportPath(sourceFile)).toBe(path.join(tempDir, 'run', 'report.html')); + }); + + it('loads benchmark eval file metadata from a run workspace', async () => { + const runDir = path.join(tempDir, 'run'); + await writeArtifactsFromResults([makeResult()], runDir, { evalFile: 'evals/demo.eval.yaml' }); + + const loaded = await loadReportSource(runDir, tempDir); + + expect(loaded.results).toHaveLength(1); + expect(loaded.benchmarkEvalFile).toBe('demo'); + }); + + it('writes a static HTML report with grouped eval files and assertion type badges', async () => { + const runDir = path.join(tempDir, 'run'); + await writeArtifactsFromResults( + [ + makeResult({ + testId: 'registry-pass', + target: 'claude-sonnet', + scores: [ + makeScore('contains', 'contains', 1, [ + { text: 'mentions registry', passed: true, evidence: 'registry present' }, + ]), + ], + }), + makeResult({ + testId: 'billing-fail', + target: 'gpt-5.4', + score: 0.2, + executionStatus: 'quality_failure', + scores: [ + makeScore('regex', 'regex', 0.2, [ + { text: 'matches invoice pattern', passed: false, evidence: 'no invoice id' 
}, + ]), + ], + }), + ], + runDir, + { evalFile: 'evals/demo.eval.yaml' }, + ); + + const indexPath = path.join(runDir, 'index.jsonl'); + const lines = readFileSync(indexPath, 'utf8') + .trim() + .split('\n') + .map((line) => JSON.parse(line) as Record); + lines[0].eval_file = 'cw-freight-boolean-registry'; + lines[1].eval_file = 'cw-freight-billing'; + writeFileSync(indexPath, `${lines.map((line) => JSON.stringify(line)).join('\n')}\n`, 'utf8'); + + const { outputPath } = await writeResultsReport(runDir, undefined, tempDir); + const html = readFileSync(outputPath, 'utf8'); + + expect(outputPath).toBe(path.join(runDir, 'report.html')); + expect(html).not.toContain('__DATA_PLACEHOLDER__'); + expect(html).toContain('#030712'); + expect(html).toContain('cw-freight-boolean-registry'); + expect(html).toContain('cw-freight-billing'); + expect(html).toContain('contains'); + expect(html).toContain('regex'); + expect(html).toContain('AgentV Evaluation Report'); + expect(html).not.toContain('Progress'); + expect(html).not.toContain('metric-stack'); + expect(html).toContain(''); + expect(html).toContain('${formatPercent(rate)}'); + expect(html).toContain( + '${escapeHtml(formatPercent(group.stats.pass_rate))}', + ); + expect(html).toContain('Assertions'); + expect(html).toContain('assertion-badge'); + expect(html).not.toContain('Grader Results'); + expect(html).not.toContain('Evaluator Results'); + }); + + it('emits an inline report script that parses successfully', async () => { + const runDir = path.join(tempDir, 'run'); + await writeArtifactsFromResults([makeResult()], runDir, { evalFile: 'evals/demo.eval.yaml' }); + + const { outputPath } = await writeResultsReport(runDir, undefined, tempDir); + const html = readFileSync(outputPath, 'utf8'); + const script = html.match(/