EntityProcess · christso · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026
diff --git a/apps/cli/src/commands/results/index.ts b/apps/cli/src/commands/results/index.ts
@@ -2,6 +2,7 @@ import { subcommands } from 'cmd-ts';
 
 import { resultsExportCommand } from './export.js';
 import { resultsFailuresCommand } from './failures.js';
+import { resultsReportCommand } from './report.js';
 import { resultsShowCommand } from './show.js';
 import { resultsSummaryCommand } from './summary.js';
 import { resultsValidateCommand } from './validate.js';
@@ -11,6 +12,7 @@ export const resultsCommand = subcommands({
   description: 'Inspect, export, and manage evaluation results',
   cmds: {
     export: resultsExportCommand,
+    report: resultsReportCommand,
     summary: resultsSummaryCommand,
     failures: resultsFailuresCommand,
     show: resultsShowCommand,

diff --git a/apps/cli/src/commands/results/report-template.ts b/apps/cli/src/commands/results/report-template.ts
diff --git a/apps/cli/src/commands/results/report.ts b/apps/cli/src/commands/results/report.ts
@@ -0,0 +1,184 @@
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
+import path from 'node:path';
+
+import { command, option, optional, string } from 'cmd-ts';
+
+import type { EvaluationResult } from '@agentv/core';
+
+import { loadManifestResults, parseResultManifest, resolveResultSourcePath } from './manifest.js';
+import { RESULTS_REPORT_TEMPLATE } from './report-template.js';
+import { resolveSourceFile, sourceArg } from './shared.js';
+
+/** The single manifest-record field that the report generator reads. */
+type ReportManifestRecord = Readonly<{
+  eval_file?: string;
+}>;
+
+/** Portion of a parsed `benchmark.json` that is consulted for the eval file name. */
+type BenchmarkMetadata = Readonly<{
+  metadata?: Readonly<{ eval_file?: string }>;
+}>;
+
+/**
+ * Reduces an eval-file reference to a short display label: surrounding
+ * whitespace is trimmed, directories are dropped, and known result/eval
+ * file suffixes are removed (case-insensitively).
+ *
+ * @param value - Raw eval-file path or name; may be undefined.
+ * @returns The label, or `undefined` when `value` is missing or blank.
+ */
+function normalizeEvalFileLabel(value: string | undefined): string | undefined {
+  const candidate = (value ?? '').trim();
+  if (candidate.length === 0) {
+    return undefined;
+  }
+
+  // Applied in order, so compound suffixes are removed before their shorter tails.
+  const suffixes = [/\.results\.jsonl$/i, /\.eval\.ya?ml$/i, /\.ya?ml$/i, /\.jsonl$/i];
+  let label = path.basename(candidate);
+  for (const suffix of suffixes) {
+    label = label.replace(suffix, '');
+  }
+  return label;
+}
+
+/**
+ * Looks for a `benchmark.json` in the same directory as `sourceFile` and
+ * returns the normalized label of its `metadata.eval_file` entry.
+ *
+ * @param sourceFile - Path of the results file; only its directory is used.
+ * @returns The label, or `undefined` when the file does not exist, cannot be
+ *   read or parsed, or carries no usable `eval_file`.
+ */
+function readBenchmarkEvalFile(sourceFile: string): string | undefined {
+  const runDir = path.dirname(sourceFile);
+  const candidate = path.join(runDir, 'benchmark.json');
+
+  if (existsSync(candidate)) {
+    try {
+      const raw = readFileSync(candidate, 'utf8');
+      const parsed = JSON.parse(raw) as BenchmarkMetadata;
+      return normalizeEvalFileLabel(parsed.metadata?.eval_file);
+    } catch {
+      // Best effort: any read/parse failure falls through to `undefined`.
+    }
+  }
+
+  return undefined;
+}
+
+/**
+ * Computes the default report location: `report.html` in the directory that
+ * contains `sourceFile`.
+ *
+ * @param sourceFile - Path of the results source file.
+ * @returns Path of the sibling `report.html`.
+ */
+export function deriveReportPath(sourceFile: string): string {
+  const runDir = path.dirname(sourceFile);
+  return path.join(runDir, 'report.html');
+}
+
+/**
+ * Converts one evaluation result into the snake_case row embedded in the
+ * report, attaching an `eval_file` label.
+ *
+ * The label is taken from the first source that yields a value: the manifest
+ * record, the benchmark metadata, the result's suite, and finally the name of
+ * the directory containing `sourceFile`.
+ *
+ * @param result - Evaluation result to serialize.
+ * @param sourceFile - Results source path; its directory name is the last-resort label.
+ * @param manifestRecord - Manifest record associated with this result, if any.
+ * @param benchmarkEvalFile - Already-normalized label from benchmark metadata, if any.
+ * @returns Plain object ready for JSON serialization.
+ */
+function serializeReportResult(
+  result: EvaluationResult,
+  sourceFile: string,
+  manifestRecord?: ReportManifestRecord,
+  benchmarkEvalFile?: string,
+): Record<string, unknown> {
+  const pickEvalFile = (): string => {
+    const fromManifest = normalizeEvalFileLabel(manifestRecord?.eval_file);
+    if (fromManifest != null) {
+      return fromManifest;
+    }
+    if (benchmarkEvalFile != null) {
+      return benchmarkEvalFile;
+    }
+    const fromSuite = normalizeEvalFileLabel(result.suite);
+    if (fromSuite != null) {
+      return fromSuite;
+    }
+    return path.basename(path.dirname(sourceFile));
+  };
+  const evalFile = pickEvalFile();
+
+  const {
+    timestamp,
+    testId,
+    suite,
+    category,
+    target,
+    score,
+    scores,
+    executionStatus,
+    error,
+    durationMs,
+    tokenUsage,
+    costUsd,
+    input,
+    output,
+    assertions,
+  } = result;
+
+  // Key order is kept stable because it determines the order in the serialized JSON.
+  return {
+    timestamp,
+    test_id: testId,
+    suite,
+    category,
+    target,
+    score,
+    scores,
+    execution_status: executionStatus,
+    error,
+    duration_ms: durationMs,
+    token_usage: tokenUsage,
+    cost_usd: costUsd,
+    input,
+    output,
+    assertions,
+    eval_file: evalFile,
+  };
+}
+
+/**
+ * Resolves the report input to a results source path and loads everything
+ * needed to render a report from it.
+ *
+ * @param source - User-supplied source argument, forwarded to `resolveSourceFile`.
+ * @param cwd - Working directory used when resolving the source.
+ * @returns The resolved source path, the loaded results, the parsed manifest
+ *   records, and the eval-file label from a sibling `benchmark.json` (if any).
+ * @throws Error when the source yields no results.
+ */
+export async function loadReportSource(
+  source: string | undefined,
+  cwd: string,
+): Promise<{
+  sourceFile: string;
+  results: EvaluationResult[];
+  records: readonly ReportManifestRecord[];
+  benchmarkEvalFile?: string;
+}> {
+  const resolved = await resolveSourceFile(source, cwd);
+  const manifestPath = resolveResultSourcePath(resolved.sourceFile, cwd);
+
+  const manifestText = readFileSync(manifestPath, 'utf8');
+  const records = parseResultManifest(manifestText) as ReportManifestRecord[];
+  const results = loadManifestResults(manifestPath);
+  if (results.length < 1) {
+    throw new Error(`No results found in ${manifestPath}`);
+  }
+
+  const benchmarkEvalFile = readBenchmarkEvalFile(manifestPath);
+  return { sourceFile: manifestPath, results, records, benchmarkEvalFile };
+}
+
+/**
+ * Renders the static HTML report by embedding the serialized results into the
+ * report template in place of `__DATA_PLACEHOLDER__`.
+ *
+ * Results and manifest records are paired by index.
+ * NOTE(review): this assumes both lists come from the same manifest in the
+ * same order — confirm against `loadManifestResults` / `parseResultManifest`.
+ *
+ * @param results - Evaluation results to embed.
+ * @param sourceFile - Results source path, used for the fallback eval-file label.
+ * @param records - Manifest records, looked up by result index.
+ * @param benchmarkEvalFile - Eval-file label from benchmark metadata, if any.
+ * @returns The complete HTML document.
+ * @throws Error when the template does not contain the placeholder.
+ */
+export function renderResultsReport(
+  results: readonly EvaluationResult[],
+  sourceFile: string,
+  records: readonly ReportManifestRecord[],
+  benchmarkEvalFile?: string,
+): string {
+  const placeholder = '__DATA_PLACEHOLDER__';
+  if (!RESULTS_REPORT_TEMPLATE.includes(placeholder)) {
+    throw new Error('Report template is missing __DATA_PLACEHOLDER__');
+  }
+
+  const rows = results.map((result, index) =>
+    serializeReportResult(result, sourceFile, records[index], benchmarkEvalFile),
+  );
+
+  // Escape every `<` so that no sequence in the data (`</script>`, `<!--`,
+  // `<script`) can affect how the HTML parser delimits the inline script.
+  // `\u003c` is a valid escape in both JSON and JavaScript string literals.
+  const dataJson = JSON.stringify(rows).replace(/</g, '\\u003c');
+
+  // A replacer function is required: with a plain string replacement,
+  // `$&`, `$'`, `` $` `` and `$$` occurring in the data would be interpreted
+  // as substitution patterns and corrupt the embedded JSON.
+  return RESULTS_REPORT_TEMPLATE.replace(placeholder, () => dataJson);
+}
+
+/**
+ * Loads the results source, renders the HTML report and writes it to disk.
+ *
+ * @param source - User-supplied source argument, forwarded to `loadReportSource`.
+ * @param outputPath - Destination file; relative paths are resolved against
+ *   `cwd`. When omitted (or empty), the report is written next to the source
+ *   as `report.html`.
+ * @param cwd - Working directory for source and output resolution.
+ * @returns The resolved source path, the path written, and the HTML content.
+ * @throws Error when the rendered HTML still contains `__DATA_PLACEHOLDER__`;
+ *   in that case nothing is written.
+ */
+export async function writeResultsReport(
+  source: string | undefined,
+  outputPath: string | undefined,
+  cwd: string,
+): Promise<{ sourceFile: string; outputPath: string; html: string }> {
+  const { sourceFile, results, records, benchmarkEvalFile } = await loadReportSource(source, cwd);
+  const resolvedOutputPath = outputPath
+    ? path.isAbsolute(outputPath)
+      ? outputPath
+      : path.resolve(cwd, outputPath)
+    : deriveReportPath(sourceFile);
+  const html = renderResultsReport(results, sourceFile, records, benchmarkEvalFile);
+
+  // Validate before touching the filesystem so a failed substitution never
+  // leaves a broken report behind, and the file need not be read back.
+  if (html.includes('__DATA_PLACEHOLDER__')) {
+    throw new Error('Report placeholder substitution failed');
+  }
+
+  mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
+  writeFileSync(resolvedOutputPath, html, 'utf8');
+
+  return { sourceFile, outputPath: resolvedOutputPath, html };
+}
+
+/**
+ * `results report` subcommand: generates a static HTML report and prints the
+ * output and source paths. On failure it prints the error to stderr and exits
+ * with status 1.
+ */
+export const resultsReportCommand = command({
+  name: 'report',
+  description: 'Generate a static HTML report from a run workspace or index.jsonl manifest',
+  args: {
+    source: sourceArg,
+    out: option({
+      type: optional(string),
+      long: 'out',
+      short: 'o',
+      description: 'Output HTML file (defaults to <run-dir>/report.html)',
+    }),
+    dir: option({
+      type: optional(string),
+      long: 'dir',
+      short: 'd',
+      description: 'Working directory (default: current directory)',
+    }),
+  },
+  handler: async ({ source, out, dir }) => {
+    const cwd = dir ?? process.cwd();
+
+    try {
+      const { sourceFile, outputPath } = await writeResultsReport(source, out, cwd);
+      console.log(`Report written to ${outputPath}`);
+      console.log(`Source: ${sourceFile}`);
+    } catch (error) {
+      // Anything can be thrown; narrow instead of asserting so that non-Error
+      // values are not reported as "Error: undefined".
+      const message = error instanceof Error ? error.message : String(error);
+      console.error(`Error: ${message}`);
+      process.exit(1);
+    }
+  },
+});
diff --git a/apps/cli/test/commands/results/report.test.ts b/apps/cli/test/commands/results/report.test.ts
@@ -0,0 +1,174 @@
+import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
+import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import path from 'node:path';
+import vm from 'node:vm';
+
+import type { EvaluationResult, EvaluatorResult } from '@agentv/core';
+
+import { writeArtifactsFromResults } from '../../../src/commands/eval/artifact-writer.js';
+import {
+  deriveReportPath,
+  loadReportSource,
+  writeResultsReport,
+} from '../../../src/commands/results/report.js';
+
+/**
+ * Test helper: builds an evaluator result whose verdict is `'pass'` when the
+ * score is at least 0.5 and `'fail'` otherwise.
+ */
+function makeScore(
+  name: string,
+  type: string,
+  score: number,
+  assertions: EvaluatorResult['assertions'],
+): EvaluatorResult {
+  const verdict = score >= 0.5 ? 'pass' : 'fail';
+  return { name, type, score, assertions, verdict };
+}
+
+/**
+ * Test helper: returns a passing evaluation result with fixed default values,
+ * with any field replaceable through `overrides`.
+ */
+function makeResult(overrides: Partial<EvaluationResult> = {}): EvaluationResult {
+  // Built on every call so that callers never share nested default objects.
+  const defaults: EvaluationResult = {
+    timestamp: '2026-04-15T01:00:00.000Z',
+    testId: 'test-1',
+    suite: 'default',
+    score: 1,
+    assertions: [{ text: 'fallback assertion', passed: true, evidence: 'ok' }],
+    output: [{ role: 'assistant', content: 'answer' }],
+    input: [{ role: 'user', content: 'question' }],
+    target: 'default',
+    executionStatus: 'ok',
+    tokenUsage: { input: 100, output: 50 },
+    durationMs: 1200,
+  };
+
+  return { ...defaults, ...overrides };
+}
+
+// Tests for the `results report` command helpers. Each test works in its own
+// temporary directory, created before and removed after the test.
+describe('results report', () => {
+  let tempDir: string;
+
+  beforeEach(() => {
+    tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-report-test-'));
+  });
+
+  afterEach(() => {
+    rmSync(tempDir, { recursive: true, force: true });
+  });
+
+  it('derives default report path from the run workspace', () => {
+    const sourceFile = path.join(tempDir, 'run', 'index.jsonl');
+    expect(deriveReportPath(sourceFile)).toBe(path.join(tempDir, 'run', 'report.html'));
+  });
+
+  it('loads benchmark eval file metadata from a run workspace', async () => {
+    const runDir = path.join(tempDir, 'run');
+    await writeArtifactsFromResults([makeResult()], runDir, { evalFile: 'evals/demo.eval.yaml' });
+
+    const loaded = await loadReportSource(runDir, tempDir);
+
+    expect(loaded.results).toHaveLength(1);
+    // The label is the eval file's base name with the `.eval.yaml` suffix removed.
+    expect(loaded.benchmarkEvalFile).toBe('demo');
+  });
+
+  it('writes a static HTML report with grouped eval files and assertion type badges', async () => {
+    const runDir = path.join(tempDir, 'run');
+    // Two results: one passing `contains` check and one failing `regex` check.
+    await writeArtifactsFromResults(
+      [
+        makeResult({
+          testId: 'registry-pass',
+          target: 'claude-sonnet',
+          scores: [
+            makeScore('contains', 'contains', 1, [
+              { text: 'mentions registry', passed: true, evidence: 'registry present' },
+            ]),
+          ],
+        }),
+        makeResult({
+          testId: 'billing-fail',
+          target: 'gpt-5.4',
+          score: 0.2,
+          executionStatus: 'quality_failure',
+          scores: [
+            makeScore('regex', 'regex', 0.2, [
+              { text: 'matches invoice pattern', passed: false, evidence: 'no invoice id' },
+            ]),
+          ],
+        }),
+      ],
+      runDir,
+      { evalFile: 'evals/demo.eval.yaml' },
+    );
+
+    // Rewrite the manifest on disk so that each record carries its own
+    // `eval_file` label; the report must pick these per-record labels up.
+    const indexPath = path.join(runDir, 'index.jsonl');
+    const lines = readFileSync(indexPath, 'utf8')
+      .trim()
+      .split('\n')
+      .map((line) => JSON.parse(line) as Record<string, unknown>);
+    lines[0].eval_file = 'cw-freight-boolean-registry';
+    lines[1].eval_file = 'cw-freight-billing';
+    writeFileSync(indexPath, `${lines.map((line) => JSON.stringify(line)).join('\n')}\n`, 'utf8');
+
+    // No explicit output path: the report is expected next to the manifest.
+    const { outputPath } = await writeResultsReport(runDir, undefined, tempDir);
+    const html = readFileSync(outputPath, 'utf8');
+
+    expect(outputPath).toBe(path.join(runDir, 'report.html'));
+    expect(html).not.toContain('__DATA_PLACEHOLDER__');
+    // The remaining checks pin literal strings that must (or must not) appear
+    // in the generated document, including the embedded per-record labels.
+    expect(html).toContain('#030712');
+    expect(html).toContain('cw-freight-boolean-registry');
+    expect(html).toContain('cw-freight-billing');
+    expect(html).toContain('contains');
+    expect(html).toContain('regex');
+    expect(html).toContain('AgentV Evaluation Report');
+    expect(html).not.toContain('<th>Progress</th>');
+    expect(html).not.toContain('metric-stack');
+    expect(html).toContain('<span class="pass-rate-track">');
+    // Single-quoted on purpose: `${...}` is matched literally as source text
+    // of the report's inline script, not interpolated here.
+    expect(html).toContain('<span class="pass-rate-label">${formatPercent(rate)}</span>');
+    expect(html).toContain(
+      '<span class="metric-value">${escapeHtml(formatPercent(group.stats.pass_rate))}</span>',
+    );
+    expect(html).toContain('Assertions');
+    expect(html).toContain('assertion-badge');
+    expect(html).not.toContain('Grader Results');
+    expect(html).not.toContain('Evaluator Results');
+  });
+
+  it('emits an inline report script that parses successfully', async () => {
+    const runDir = path.join(tempDir, 'run');
+    await writeArtifactsFromResults([makeResult()], runDir, { evalFile: 'evals/demo.eval.yaml' });
+
+    const { outputPath } = await writeResultsReport(runDir, undefined, tempDir);
+    const html = readFileSync(outputPath, 'utf8');
+    // Greedy match: captures everything from the first `<script>` to the last
+    // `</script>` (presumably the template has a single inline script — verify).
+    const script = html.match(/<script>([\s\S]*)<\/script>/)?.[1];
+
+    expect(script).toBeString();
+
+    // Minimal DOM stand-ins covering only what the lookups below return.
+    const app = { innerHTML: '' };
+    const headerMeta = { innerHTML: '' };
+    const tabNav = { classList: { add: () => undefined, remove: () => undefined } };
+    const tabButton = {
+      getAttribute: () => 'overview',
+      classList: { toggle: () => undefined },
+      addEventListener: () => undefined,
+    };
+
+    // Run the script in a fresh vm context that exposes only `console` and the
+    // stubbed `document`; the test passes if evaluation does not throw.
+    expect(() =>
+      vm.runInNewContext(script as string, {
+        console,
+        document: {
+          documentElement: { classList: { contains: () => false, toggle: () => undefined } },
+          getElementById(id: string) {
+            if (id === 'app') return app;
+            if (id === 'header-meta') return headerMeta;
+            if (id === 'tab-nav') return tabNav;
+            if (id === 'theme-btn') return { addEventListener: () => undefined };
+            return null;
+          },
+          querySelectorAll(selector: string) {
+            return selector === '.tab' ? [tabButton] : [];
+          },
+        },
+      }),
+    ).not.toThrow();
+  });
+});
diff --git a/apps/web/src/assets/screenshots/results-report-details.png b/apps/web/src/assets/screenshots/results-report-details.png
diff --git a/apps/web/src/assets/screenshots/results-report-overview.png b/apps/web/src/assets/screenshots/results-report-overview.png