diff --git a/apps/cli/src/commands/results/index.ts b/apps/cli/src/commands/results/index.ts
index c09d51d3e..5dc90626a 100644
--- a/apps/cli/src/commands/results/index.ts
+++ b/apps/cli/src/commands/results/index.ts
@@ -2,6 +2,7 @@ import { subcommands } from 'cmd-ts';
import { resultsExportCommand } from './export.js';
import { resultsFailuresCommand } from './failures.js';
+import { resultsReportCommand } from './report.js';
import { resultsShowCommand } from './show.js';
import { resultsSummaryCommand } from './summary.js';
import { resultsValidateCommand } from './validate.js';
@@ -11,6 +12,7 @@ export const resultsCommand = subcommands({
description: 'Inspect, export, and manage evaluation results',
cmds: {
export: resultsExportCommand,
+ report: resultsReportCommand,
summary: resultsSummaryCommand,
failures: resultsFailuresCommand,
show: resultsShowCommand,
diff --git a/apps/cli/src/commands/results/report-template.ts b/apps/cli/src/commands/results/report-template.ts
new file mode 100644
index 000000000..983728d61
--- /dev/null
+++ b/apps/cli/src/commands/results/report-template.ts
@@ -0,0 +1,17 @@
+// NOTE(review): the original inline HTML/CSS/JS of this template was garbled in
+// transit (angle-bracket content stripped), leaving an unterminated string
+// literal with no __DATA_PLACEHOLDER__ token. Reconstructed below as a minimal
+// valid document that preserves the contract report.ts depends on: the string
+// contains the literal token __DATA_PLACEHOLDER__ exactly once, inside an
+// inline script, plus the "AgentV Evaluation Report" title.
+// TODO: restore the full styled template from version control.
+export const RESULTS_REPORT_TEMPLATE = `<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <title>AgentV Evaluation Report</title>
+  </head>
+  <body>
+    <script>
+      const REPORT_DATA = __DATA_PLACEHOLDER__;
+    </script>
+  </body>
+</html>`;
diff --git a/apps/cli/src/commands/results/report.ts b/apps/cli/src/commands/results/report.ts
new file mode 100644
index 000000000..5158cabfc
--- /dev/null
+++ b/apps/cli/src/commands/results/report.ts
@@ -0,0 +1,184 @@
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
+import path from 'node:path';
+
+import { command, option, optional, string } from 'cmd-ts';
+
+import type { EvaluationResult } from '@agentv/core';
+
+import { loadManifestResults, parseResultManifest, resolveResultSourcePath } from './manifest.js';
+import { RESULTS_REPORT_TEMPLATE } from './report-template.js';
+import { resolveSourceFile, sourceArg } from './shared.js';
+
+/** Shape of one index.jsonl manifest record; only the field the report reads. */
+interface ReportManifestRecord {
+  readonly eval_file?: string;
+}
+
+/** Shape of the optional benchmark.json sidecar written next to the manifest. */
+interface BenchmarkMetadata {
+  readonly metadata?: {
+    readonly eval_file?: string;
+  };
+}
+
+/**
+ * Reduces an eval-file path to a short display label: basename with known
+ * suffixes stripped (.results.jsonl, .eval.yaml/.eval.yml, .yaml/.yml,
+ * .jsonl). Returns undefined for missing or blank input.
+ */
+function normalizeEvalFileLabel(value: string | undefined): string | undefined {
+  const trimmed = value?.trim();
+  if (!trimmed) {
+    return undefined;
+  }
+
+  return path
+    .basename(trimmed)
+    // Compound suffixes (.results.jsonl, .eval.yaml) must be stripped before
+    // the bare .yaml/.jsonl patterns, hence the ordering of these replaces.
+    .replace(/\.results\.jsonl$/i, '')
+    .replace(/\.eval\.ya?ml$/i, '')
+    .replace(/\.ya?ml$/i, '')
+    .replace(/\.jsonl$/i, '');
+}
+
+/**
+ * Reads benchmark.json next to the source manifest and returns its normalized
+ * metadata.eval_file label. Returns undefined when the file is absent or
+ * unparseable — report generation is best-effort and must not fail on bad
+ * sidecar metadata.
+ */
+function readBenchmarkEvalFile(sourceFile: string): string | undefined {
+  const benchmarkPath = path.join(path.dirname(sourceFile), 'benchmark.json');
+  if (!existsSync(benchmarkPath)) {
+    return undefined;
+  }
+
+  try {
+    const benchmark = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as BenchmarkMetadata;
+    return normalizeEvalFileLabel(benchmark.metadata?.eval_file);
+  } catch {
+    // Malformed JSON is treated the same as a missing file.
+    return undefined;
+  }
+}
+
+/** Default report location: report.html alongside the source manifest file. */
+export function deriveReportPath(sourceFile: string): string {
+  return path.join(path.dirname(sourceFile), 'report.html');
+}
+
+/**
+ * Converts one EvaluationResult into the snake_case row object embedded in the
+ * report's JSON payload. eval_file resolves through fallbacks, in order:
+ * manifest record -> benchmark.json metadata -> result.suite -> parent
+ * directory name of the source manifest.
+ */
+function serializeReportResult(
+  result: EvaluationResult,
+  sourceFile: string,
+  manifestRecord?: ReportManifestRecord,
+  benchmarkEvalFile?: string,
+): Record<string, unknown> {
+  const fallbackEvalFile =
+    normalizeEvalFileLabel(manifestRecord?.eval_file) ??
+    benchmarkEvalFile ??
+    normalizeEvalFileLabel(result.suite) ??
+    path.basename(path.dirname(sourceFile));
+
+  return {
+    timestamp: result.timestamp,
+    test_id: result.testId,
+    suite: result.suite,
+    category: result.category,
+    target: result.target,
+    score: result.score,
+    scores: result.scores,
+    execution_status: result.executionStatus,
+    error: result.error,
+    duration_ms: result.durationMs,
+    token_usage: result.tokenUsage,
+    cost_usd: result.costUsd,
+    input: result.input,
+    output: result.output,
+    assertions: result.assertions,
+    eval_file: fallbackEvalFile,
+  };
+}
+
+/**
+ * Resolves the report source (run workspace directory or index.jsonl path),
+ * then loads the raw manifest records, the parsed results, and the optional
+ * benchmark.json eval-file label.
+ * @throws Error when the manifest contains no results.
+ */
+export async function loadReportSource(
+  source: string | undefined,
+  cwd: string,
+): Promise<{
+  sourceFile: string;
+  results: EvaluationResult[];
+  records: readonly ReportManifestRecord[];
+  benchmarkEvalFile?: string;
+}> {
+  const { sourceFile } = await resolveSourceFile(source, cwd);
+  const resolvedSourceFile = resolveResultSourcePath(sourceFile, cwd);
+  const content = readFileSync(resolvedSourceFile, 'utf8');
+  const records = parseResultManifest(content) as ReportManifestRecord[];
+  const results = loadManifestResults(resolvedSourceFile);
+
+  if (results.length === 0) {
+    throw new Error(`No results found in ${resolvedSourceFile}`);
+  }
+
+  return {
+    sourceFile: resolvedSourceFile,
+    results,
+    records,
+    benchmarkEvalFile: readBenchmarkEvalFile(resolvedSourceFile),
+  };
+}
+
+/**
+ * Renders the HTML report by substituting the serialized result rows for the
+ * template's __DATA_PLACEHOLDER__ token. Fails fast if the template has lost
+ * the token (guards against a corrupted template build). Assumes records[i]
+ * corresponds to results[i] — same ordering as the manifest.
+ */
+export function renderResultsReport(
+  results: readonly EvaluationResult[],
+  sourceFile: string,
+  records: readonly ReportManifestRecord[],
+  benchmarkEvalFile?: string,
+): string {
+  if (!RESULTS_REPORT_TEMPLATE.includes('__DATA_PLACEHOLDER__')) {
+    throw new Error('Report template is missing __DATA_PLACEHOLDER__');
+  }
+
+  const rows = results.map((result, index) =>
+    serializeReportResult(result, sourceFile, records[index], benchmarkEvalFile),
+  );
+  // Escape "</" so embedded data can never close the inline script element.
+  const dataJson = JSON.stringify(rows).replace(/<\//g, '<\\/');
+  return RESULTS_REPORT_TEMPLATE.replace('__DATA_PLACEHOLDER__', dataJson);
+}
+
+/**
+ * End-to-end report generation: loads results, renders the HTML, and writes it
+ * to disk. The output path defaults to report.html beside the source manifest;
+ * a relative outputPath is resolved against cwd. The written file is re-read
+ * to verify that placeholder substitution actually landed on disk.
+ * @returns the resolved source file, output path, and the written HTML.
+ */
+export async function writeResultsReport(
+  source: string | undefined,
+  outputPath: string | undefined,
+  cwd: string,
+): Promise<{ sourceFile: string; outputPath: string; html: string }> {
+  const { sourceFile, results, records, benchmarkEvalFile } = await loadReportSource(source, cwd);
+  const resolvedOutputPath = outputPath
+    ? path.isAbsolute(outputPath)
+      ? outputPath
+      : path.resolve(cwd, outputPath)
+    : deriveReportPath(sourceFile);
+  const html = renderResultsReport(results, sourceFile, records, benchmarkEvalFile);
+
+  mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
+  writeFileSync(resolvedOutputPath, html, 'utf8');
+
+  // Paranoia check: read back the file to confirm the substitution persisted.
+  const written = readFileSync(resolvedOutputPath, 'utf8');
+  if (written.includes('__DATA_PLACEHOLDER__')) {
+    throw new Error('Report placeholder substitution failed');
+  }
+
+  return { sourceFile, outputPath: resolvedOutputPath, html: written };
+}
+
+/**
+ * `results report` subcommand: generates a static HTML report from a run
+ * workspace or an index.jsonl manifest. Prints destination and source on
+ * success; prints the error message and exits 1 on failure.
+ */
+export const resultsReportCommand = command({
+  name: 'report',
+  description: 'Generate a static HTML report from a run workspace or index.jsonl manifest',
+  args: {
+    source: sourceArg,
+    out: option({
+      type: optional(string),
+      long: 'out',
+      short: 'o',
+      // NOTE(review): the bracketed placeholder in this help text was stripped
+      // in transit; reconstructed as <run-dir> — confirm the original wording.
+      description: 'Output HTML file (defaults to <run-dir>/report.html)',
+    }),
+    dir: option({
+      type: optional(string),
+      long: 'dir',
+      short: 'd',
+      description: 'Working directory (default: current directory)',
+    }),
+  },
+  handler: async ({ source, out, dir }) => {
+    const cwd = dir ?? process.cwd();
+
+    try {
+      const { sourceFile, outputPath } = await writeResultsReport(source, out, cwd);
+      console.log(`Report written to ${outputPath}`);
+      console.log(`Source: ${sourceFile}`);
+    } catch (error) {
+      // Surface a clean one-line message instead of a stack trace for CLI users.
+      console.error(`Error: ${(error as Error).message}`);
+      process.exit(1);
+    }
+  },
+});
diff --git a/apps/cli/test/commands/results/report.test.ts b/apps/cli/test/commands/results/report.test.ts
new file mode 100644
index 000000000..e33b5de87
--- /dev/null
+++ b/apps/cli/test/commands/results/report.test.ts
@@ -0,0 +1,174 @@
+import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
+import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import path from 'node:path';
+import vm from 'node:vm';
+
+import type { EvaluationResult, EvaluatorResult } from '@agentv/core';
+
+import { writeArtifactsFromResults } from '../../../src/commands/eval/artifact-writer.js';
+import {
+ deriveReportPath,
+ loadReportSource,
+ writeResultsReport,
+} from '../../../src/commands/results/report.js';
+
+/** Builds an EvaluatorResult fixture; verdict is derived from score >= 0.5. */
+function makeScore(
+  name: string,
+  type: string,
+  score: number,
+  assertions: EvaluatorResult['assertions'],
+): EvaluatorResult {
+  return {
+    name,
+    type,
+    score,
+    assertions,
+    verdict: score >= 0.5 ? 'pass' : 'fail',
+  };
+}
+
+/** Builds a passing EvaluationResult fixture; overrides merge over defaults. */
+function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult {
+  return {
+    timestamp: '2026-04-15T01:00:00.000Z',
+    testId: 'test-1',
+    suite: 'default',
+    score: 1,
+    assertions: [{ text: 'fallback assertion', passed: true, evidence: 'ok' }],
+    output: [{ role: 'assistant', content: 'answer' }],
+    input: [{ role: 'user', content: 'question' }],
+    target: 'default',
+    executionStatus: 'ok',
+    tokenUsage: { input: 100, output: 50 },
+    durationMs: 1200,
+    ...overrides,
+  };
+}
+
+describe('results report', () => {
+  let tempDir: string;
+
+  // Fresh temp workspace per test, removed afterwards, to keep runs hermetic.
+  beforeEach(() => {
+    tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-report-test-'));
+  });
+
+  afterEach(() => {
+    rmSync(tempDir, { recursive: true, force: true });
+  });
+
+  it('derives default report path from the run workspace', () => {
+    const sourceFile = path.join(tempDir, 'run', 'index.jsonl');
+    // report.html must land beside the manifest, inside the run directory.
+    expect(deriveReportPath(sourceFile)).toBe(path.join(tempDir, 'run', 'report.html'));
+  });
+
+  it('loads benchmark eval file metadata from a run workspace', async () => {
+    const runDir = path.join(tempDir, 'run');
+    await writeArtifactsFromResults([makeResult()], runDir, { evalFile: 'evals/demo.eval.yaml' });
+
+    const loaded = await loadReportSource(runDir, tempDir);
+
+    expect(loaded.results).toHaveLength(1);
+    // 'evals/demo.eval.yaml' normalizes to its basename minus the .eval.yaml suffix.
+    expect(loaded.benchmarkEvalFile).toBe('demo');
+  });
+
+  it('writes a static HTML report with grouped eval files and assertion type badges', async () => {
+    const runDir = path.join(tempDir, 'run');
+    // Two results: one pass and one quality failure, each with a typed score.
+    await writeArtifactsFromResults(
+      [
+        makeResult({
+          testId: 'registry-pass',
+          target: 'claude-sonnet',
+          scores: [
+            makeScore('contains', 'contains', 1, [
+              { text: 'mentions registry', passed: true, evidence: 'registry present' },
+            ]),
+          ],
+        }),
+        makeResult({
+          testId: 'billing-fail',
+          target: 'gpt-5.4',
+          score: 0.2,
+          executionStatus: 'quality_failure',
+          scores: [
+            makeScore('regex', 'regex', 0.2, [
+              { text: 'matches invoice pattern', passed: false, evidence: 'no invoice id' },
+            ]),
+          ],
+        }),
+      ],
+      runDir,
+      { evalFile: 'evals/demo.eval.yaml' },
+    );
+
+    // Rewrite the manifest's eval_file fields so grouping can be asserted.
+    const indexPath = path.join(runDir, 'index.jsonl');
+    const lines = readFileSync(indexPath, 'utf8')
+      .trim()
+      .split('\n')
+      .map((line) => JSON.parse(line) as Record<string, unknown>);
+    lines[0].eval_file = 'cw-freight-boolean-registry';
+    lines[1].eval_file = 'cw-freight-billing';
+    writeFileSync(indexPath, `${lines.map((line) => JSON.stringify(line)).join('\n')}\n`, 'utf8');
+
+    const { outputPath } = await writeResultsReport(runDir, undefined, tempDir);
+    const html = readFileSync(outputPath, 'utf8');
+
+    expect(outputPath).toBe(path.join(runDir, 'report.html'));
+    expect(html).not.toContain('__DATA_PLACEHOLDER__');
+    expect(html).toContain('#030712');
+    expect(html).toContain('cw-freight-boolean-registry');
+    expect(html).toContain('cw-freight-billing');
+    expect(html).toContain('contains');
+    expect(html).toContain('regex');
+    expect(html).toContain('AgentV Evaluation Report');
+    expect(html).not.toContain('Progress | ');
+    expect(html).not.toContain('metric-stack');
+    // NOTE(review): the next four expectations lost their HTML/markup content
+    // in transit (angle-bracket spans stripped); toContain('') is vacuously
+    // true as written. Restore the original literals from version control.
+    expect(html).toContain('');
+    expect(html).toContain('${formatPercent(rate)}');
+    expect(html).toContain(
+      '${escapeHtml(formatPercent(group.stats.pass_rate))}',
+    );
+    expect(html).toContain('Assertions');
+    expect(html).toContain('assertion-badge');
+    expect(html).not.toContain('Grader Results');
+    expect(html).not.toContain('Evaluator Results');
+  });
+
+ it('emits an inline report script that parses successfully', async () => {
+ const runDir = path.join(tempDir, 'run');
+ await writeArtifactsFromResults([makeResult()], runDir, { evalFile: 'evals/demo.eval.yaml' });
+
+ const { outputPath } = await writeResultsReport(runDir, undefined, tempDir);
+ const html = readFileSync(outputPath, 'utf8');
+ const script = html.match(/