diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 7a4e3d721..fb3b4e7a4 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -193,6 +193,7 @@ export function loadManifestResults(sourceFile: string): EvaluationResult[] { export interface LightweightResultRecord { readonly testId: string; + readonly dataset?: string; readonly target?: string; readonly experiment?: string; readonly score: number; @@ -209,6 +210,7 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec if (isIndexManifestPath(resolvedSourceFile)) { return parseResultManifest(content).map((record) => ({ testId: record.test_id ?? record.eval_id ?? 'unknown', + dataset: record.dataset, target: record.target, experiment: record.experiment, score: record.score, @@ -244,6 +246,7 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec records.push({ testId: rawTestId, + dataset: typeof record.dataset === 'string' ? record.dataset : undefined, target: typeof record.target === 'string' ? 
record.target : undefined, score: record.score, scores: Array.isArray(record.scores) diff --git a/apps/cli/src/commands/trend/index.ts b/apps/cli/src/commands/trend/index.ts new file mode 100644 index 000000000..edd616d77 --- /dev/null +++ b/apps/cli/src/commands/trend/index.ts @@ -0,0 +1,500 @@ +import path from 'node:path'; + +import { command, flag, number, oneOf, option, optional, restPositionals, string } from 'cmd-ts'; + +import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; +import { RESULT_INDEX_FILENAME } from '../eval/result-layout.js'; +import { + type LightweightResultRecord, + loadLightweightResults, + resolveResultSourcePath, +} from '../results/manifest.js'; +import { listResultFiles } from '../trace/utils.js'; + +const colors = { + reset: '\x1b[0m', + bold: '\x1b[1m', + dim: '\x1b[2m', + green: '\x1b[32m', + red: '\x1b[31m', + yellow: '\x1b[33m', + cyan: '\x1b[36m', + gray: '\x1b[90m', +}; + +const noColor = process.env.NO_COLOR !== undefined || !process.stdout.isTTY; +const c = noColor ? 
Object.fromEntries(Object.keys(colors).map((k) => [k, ''])) : colors; +const ansiPattern = new RegExp(`${String.fromCharCode(27)}\\[[0-9;]*m`, 'g'); + +export interface TrendRunRecord extends LightweightResultRecord { + readonly sourcePath: string; +} + +export interface TrendRunPoint { + readonly label: string; + readonly path: string; + readonly timestamp?: string; + readonly matchedTestCount: number; + readonly meanScore: number; +} + +export interface TrendFilters { + readonly dataset?: string; + readonly target?: string; + readonly allowMissingTests: boolean; +} + +export interface TrendSummary { + readonly runCount: number; + readonly matchedTestCount: number; + readonly dateRange: { + readonly start?: string; + readonly end?: string; + }; + readonly slope: number; + readonly intercept: number; + readonly rSquared: number; + readonly direction: 'degrading' | 'improving' | 'stable'; +} + +export interface TrendRegression { + readonly slopeThreshold: number; + readonly failOnDegrading: boolean; + readonly triggered: boolean; +} + +export interface TrendOutput { + readonly runs: readonly TrendRunPoint[]; + readonly filters: TrendFilters; + readonly summary: TrendSummary; + readonly regression: TrendRegression; +} + +interface RegressionStats { + readonly slope: number; + readonly intercept: number; + readonly rSquared: number; +} + +function stripAnsi(str: string): string { + return str.replace(ansiPattern, ''); +} + +function padRight(str: string, len: number): string { + const plainLen = stripAnsi(str).length; + return str + ' '.repeat(Math.max(0, len - plainLen)); +} + +function padLeft(str: string, len: number): string { + const plainLen = stripAnsi(str).length; + return ' '.repeat(Math.max(0, len - plainLen)) + str; +} + +function formatSignedNumber(value: number, digits = 3): string { + const sign = value >= 0 ? 
'+' : ''; + return `${sign}${value.toFixed(digits)}`; +} + +function colorizeDirection(direction: TrendSummary['direction']): string { + switch (direction) { + case 'improving': + return `${c.green}${direction}${c.reset}`; + case 'degrading': + return `${c.red}${direction}${c.reset}`; + case 'stable': + return `${c.gray}${direction}${c.reset}`; + } +} + +function colorizeSlope(value: number): string { + if (value > 0) { + return `${c.green}${formatSignedNumber(value)}${c.reset}`; + } + if (value < 0) { + return `${c.red}${formatSignedNumber(value)}${c.reset}`; + } + return `${c.gray}${formatSignedNumber(value)}${c.reset}`; +} + +function ensureTrendIndexPath(source: string, cwd: string): string { + const resolved = resolveResultSourcePath(source, cwd); + if (path.basename(resolved) !== RESULT_INDEX_FILENAME) { + throw new Error( + `Unsupported result source for trend: ${source}. Use a run workspace directory or ${RESULT_INDEX_FILENAME} manifest.`, + ); + } + return resolved; +} + +export function resolveTrendSources( + cwd: string, + sources: readonly string[], + last?: number, +): string[] { + if (sources.length > 0 && last !== undefined) { + throw new Error('Use either explicit run sources or --last, not both'); + } + + if (sources.length > 0) { + return sources.map((source) => ensureTrendIndexPath(source, cwd)); + } + + if (last === undefined) { + throw new Error('Provide one or more run workspaces or use --last '); + } + + if (last < 2) { + throw new Error('--last must be at least 2'); + } + + const metas = listResultFiles(cwd) + .filter((meta) => path.basename(meta.path) === RESULT_INDEX_FILENAME) + .slice(0, last); + + if (metas.length < 2) { + throw new Error( + 'Trend analysis requires at least 2 canonical run workspaces in .agentv/results/runs/', + ); + } + + return metas.map((meta) => meta.path).reverse(); +} + +function filterRunRecords( + records: readonly LightweightResultRecord[], + sourcePath: string, + dataset?: string, + target?: string, +): 
TrendRunRecord[] { + return records + .filter((record) => (dataset ? record.dataset === dataset : true)) + .filter((record) => (target ? record.target === target : true)) + .map((record) => ({ ...record, sourcePath })); +} + +function getRunLabel(sourcePath: string, timestamp?: string): string { + if (timestamp) { + return timestamp; + } + return path.basename(path.dirname(sourcePath)); +} + +function getRunSortKey(sourcePath: string, timestamp?: string): string { + return timestamp ?? path.basename(path.dirname(sourcePath)); +} + +function mean(values: readonly number[]): number { + return values.reduce((sum, value) => sum + value, 0) / values.length; +} + +function roundMetric(value: number, digits = 6): number { + return Number(value.toFixed(digits)); +} + +export function computeMatchedTestIds( + runs: readonly TrendRunRecord[][], + allowMissingTests: boolean, +): string[] | undefined { + if (allowMissingTests) { + return undefined; + } + + const [firstRun, ...rest] = runs; + const intersection = new Set(firstRun.map((record) => record.testId)); + + for (const run of rest) { + const runIds = new Set(run.map((record) => record.testId)); + for (const testId of intersection) { + if (!runIds.has(testId)) { + intersection.delete(testId); + } + } + } + + return [...intersection].sort(); +} + +export function computeRegressionStats(values: readonly number[]): RegressionStats { + if (values.length < 2) { + throw new Error('Trend analysis requires at least 2 runs'); + } + + const n = values.length; + const meanX = (n - 1) / 2; + const meanY = mean(values); + + let numerator = 0; + let denominator = 0; + for (let i = 0; i < n; i++) { + const dx = i - meanX; + numerator += dx * (values[i] - meanY); + denominator += dx * dx; + } + + const slope = denominator === 0 ? 
0 : numerator / denominator; + const intercept = meanY - slope * meanX; + + let ssTot = 0; + let ssRes = 0; + for (let i = 0; i < n; i++) { + const predicted = intercept + slope * i; + ssTot += (values[i] - meanY) ** 2; + ssRes += (values[i] - predicted) ** 2; + } + + const rSquared = ssTot === 0 ? 1 : 1 - ssRes / ssTot; + return { slope, intercept, rSquared }; +} + +export function classifyTrendDirection( + slope: number, + slopeThreshold: number, +): TrendSummary['direction'] { + if (slope <= -slopeThreshold) { + return 'degrading'; + } + if (slope >= slopeThreshold) { + return 'improving'; + } + return 'stable'; +} + +export function determineTrendExitCode( + direction: TrendSummary['direction'], + failOnDegrading: boolean, +): number { + return failOnDegrading && direction === 'degrading' ? 1 : 0; +} + +export function analyzeTrend(params: { + readonly sourcePaths: readonly string[]; + readonly dataset?: string; + readonly target?: string; + readonly slopeThreshold: number; + readonly allowMissingTests: boolean; + readonly failOnDegrading: boolean; +}): TrendOutput { + const { sourcePaths, dataset, target, slopeThreshold, allowMissingTests, failOnDegrading } = + params; + + if (sourcePaths.length < 2) { + throw new Error('Trend analysis requires at least 2 runs'); + } + + const filteredRuns = sourcePaths.map((sourcePath) => { + const records = filterRunRecords( + loadLightweightResults(sourcePath), + sourcePath, + dataset, + target, + ); + if (records.length === 0) { + const filters = [dataset ? `dataset=${dataset}` : '', target ? `target=${target}` : ''] + .filter(Boolean) + .join(', '); + const suffix = filters ? 
` after filtering by ${filters}` : ''; + throw new Error(`Run has no matching records${suffix}: ${sourcePath}`); + } + return records; + }); + + const chronologicalRuns = filteredRuns + .map((records, index) => ({ + sourcePath: sourcePaths[index], + records, + sortKey: getRunSortKey(sourcePaths[index], records[0]?.timestamp), + })) + .sort((a, b) => a.sortKey.localeCompare(b.sortKey)); + + const matchedTestIds = computeMatchedTestIds( + chronologicalRuns.map((run) => run.records), + allowMissingTests, + ); + if (!allowMissingTests && (!matchedTestIds || matchedTestIds.length === 0)) { + throw new Error('No shared test IDs remain across the selected runs after filtering'); + } + + const runs = chronologicalRuns.map(({ records, sourcePath }) => { + const applicableRecords = + matchedTestIds === undefined + ? records + : records.filter((record) => matchedTestIds.includes(record.testId)); + + if (applicableRecords.length === 0) { + throw new Error(`Run has no matched tests after intersection: ${sourcePath}`); + } + + return { + label: getRunLabel(sourcePath, applicableRecords[0]?.timestamp ?? records[0]?.timestamp), + path: sourcePath, + timestamp: applicableRecords[0]?.timestamp ?? records[0]?.timestamp, + matchedTestCount: applicableRecords.length, + meanScore: roundMetric(mean(applicableRecords.map((record) => record.score))), + } satisfies TrendRunPoint; + }); + + const regressionStats = computeRegressionStats(runs.map((run) => run.meanScore)); + const direction = classifyTrendDirection(regressionStats.slope, slopeThreshold); + + return { + runs, + filters: { + dataset, + target, + allowMissingTests, + }, + summary: { + runCount: runs.length, + matchedTestCount: + matchedTestIds?.length ?? 
Math.min(...runs.map((run) => run.matchedTestCount)), + dateRange: { + start: runs[0]?.timestamp, + end: runs.at(-1)?.timestamp, + }, + slope: roundMetric(regressionStats.slope), + intercept: roundMetric(regressionStats.intercept), + rSquared: roundMetric(regressionStats.rSquared), + direction, + }, + regression: { + slopeThreshold, + failOnDegrading, + triggered: failOnDegrading && direction === 'degrading', + }, + }; +} + +export function formatTrendTable(output: TrendOutput): string { + const lines: string[] = []; + const runLabelWidth = Math.max(3, ...output.runs.map((run) => run.label.length)); + const scoreWidth = Math.max(10, ...output.runs.map((run) => run.meanScore.toFixed(3).length)); + const matchWidth = Math.max(7, ...output.runs.map((run) => String(run.matchedTestCount).length)); + + lines.push(''); + lines.push(`${c.bold}Trend Analysis${c.reset}`); + lines.push(''); + lines.push( + `${c.bold}Runs:${c.reset} ${output.summary.runCount} | ${c.bold}Range:${c.reset} ${output.summary.dateRange.start ?? 'unknown'} → ${output.summary.dateRange.end ?? 'unknown'}`, + ); + lines.push( + `${c.bold}Filters:${c.reset} dataset=${output.filters.dataset ?? '*'} target=${output.filters.target ?? '*'} mode=${output.filters.allowMissingTests ? 
'independent' : 'matched-tests'}`, + ); + lines.push( + `${c.bold}Matched Tests:${c.reset} ${output.summary.matchedTestCount} | ${c.bold}Verdict:${c.reset} ${colorizeDirection(output.summary.direction)}`, + ); + lines.push(''); + + const header = ` ${padRight('Run', runLabelWidth)} ${padLeft('Tests', matchWidth)} ${padLeft('Mean Score', scoreWidth)}`; + lines.push(`${c.dim}${header}${c.reset}`); + lines.push( + `${c.dim} ${'─'.repeat(runLabelWidth)} ${'─'.repeat(matchWidth)} ${'─'.repeat(scoreWidth)}${c.reset}`, + ); + + for (const run of output.runs) { + lines.push( + ` ${padRight(run.label, runLabelWidth)} ${padLeft(String(run.matchedTestCount), matchWidth)} ${padLeft(run.meanScore.toFixed(3), scoreWidth)}`, + ); + } + + lines.push(''); + lines.push( + `${c.bold}Summary:${c.reset} slope=${colorizeSlope(output.summary.slope)} intercept=${output.summary.intercept.toFixed(3)} r²=${output.summary.rSquared.toFixed(3)}`, + ); + lines.push( + `${c.bold}Regression Gate:${c.reset} threshold=${output.regression.slopeThreshold.toFixed(3)} fail_on_degrading=${output.regression.failOnDegrading ? 'true' : 'false'} triggered=${output.regression.triggered ? 
`${c.red}true${c.reset}` : 'false'}`, + ); + lines.push(''); + + return lines.join('\n'); +} + +export const trendCommand = command({ + name: 'trend', + description: 'Analyze score drift across multiple historical run manifests', + args: { + runs: restPositionals({ + type: string, + displayName: 'runs', + description: 'Run workspace directories or index.jsonl manifest paths', + }), + last: option({ + type: optional(number), + long: 'last', + description: 'Use the most recent N runs from .agentv/results/runs/', + }), + dataset: option({ + type: optional(string), + long: 'dataset', + description: 'Filter records to a dataset name', + }), + target: option({ + type: optional(string), + long: 'target', + description: 'Filter records to a target name', + }), + slopeThreshold: option({ + type: optional(number), + long: 'slope-threshold', + description: 'Minimum absolute slope required to classify improving or degrading', + }), + failOnDegrading: flag({ + long: 'fail-on-degrading', + description: 'Exit non-zero when the detected trend is degrading beyond the slope threshold', + }), + allowMissingTests: flag({ + long: 'allow-missing-tests', + description: 'Aggregate each run independently instead of intersecting test IDs across runs', + }), + format: option({ + type: optional(oneOf(['table', 'json'])), + long: 'format', + short: 'f', + description: 'Output format: table (default) or json', + }), + json: flag({ + long: 'json', + description: 'Output JSON format (shorthand for --format=json)', + }), + }, + handler: async ({ + runs, + last, + dataset, + target, + slopeThreshold, + failOnDegrading, + allowMissingTests, + format, + json, + }) => { + const outputFormat = json ? 'json' : (format ?? 'table'); + const effectiveSlopeThreshold = slopeThreshold ?? 
0.01; + + try { + if (effectiveSlopeThreshold < 0) { + throw new Error('--slope-threshold must be non-negative'); + } + + const sourcePaths = resolveTrendSources(process.cwd(), runs, last); + const output = analyzeTrend({ + sourcePaths, + dataset, + target, + slopeThreshold: effectiveSlopeThreshold, + allowMissingTests, + failOnDegrading, + }); + + if (outputFormat === 'json') { + console.log(JSON.stringify(toSnakeCaseDeep(output), null, 2)); + } else { + console.log(formatTrendTable(output)); + } + + process.exit(determineTrendExitCode(output.summary.direction, failOnDegrading)); + } catch (error) { + console.error(`Error: ${(error as Error).message}`); + process.exit(1); + } + }, +}); diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index a1dea9d5c..18cf70feb 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -13,6 +13,7 @@ import { resultsServeCommand } from './commands/results/serve.js'; import { selfCommand } from './commands/self/index.js'; import { traceCommand } from './commands/trace/index.js'; import { transpileCommand } from './commands/transpile/index.js'; +import { trendCommand } from './commands/trend/index.js'; import { trimCommand } from './commands/trim/index.js'; import { validateCommand } from './commands/validate/index.js'; import { workspaceCommand } from './commands/workspace/index.js'; @@ -35,6 +36,7 @@ export const app = subcommands({ serve: resultsServeCommand, studio: resultsServeCommand, trace: traceCommand, + trend: trendCommand, transpile: transpileCommand, trim: trimCommand, validate: validateCommand, @@ -64,6 +66,7 @@ const TOP_LEVEL_COMMANDS = new Set([ 'serve', 'studio', 'trace', + 'trend', 'transpile', 'trim', 'validate', diff --git a/apps/cli/test/commands/trend/trend.test.ts b/apps/cli/test/commands/trend/trend.test.ts new file mode 100644 index 000000000..2f32e184e --- /dev/null +++ b/apps/cli/test/commands/trend/trend.test.ts @@ -0,0 +1,546 @@ +import { afterEach, describe, expect, it } from 'bun:test'; 
+import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { execa } from 'execa'; + +import { + analyzeTrend, + classifyTrendDirection, + computeRegressionStats, + determineTrendExitCode, + resolveTrendSources, +} from '../../../src/commands/trend/index.js'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); +const projectRoot = path.resolve(__dirname, '../../../../..'); +const CLI_ENTRY = path.join(projectRoot, 'apps/cli/src/cli.ts'); + +interface RunRecordInput { + readonly test_id: string; + readonly score: number; + readonly dataset?: string; + readonly target?: string; + readonly timestamp?: string; +} + +async function createTempDir(): Promise { + return mkdtemp(path.join(tmpdir(), 'agentv-trend-test-')); +} + +async function createRunWorkspace( + rootDir: string, + runName: string, + records: readonly RunRecordInput[], +): Promise<{ runDir: string; indexPath: string }> { + const runDir = path.join(rootDir, '.agentv', 'results', 'runs', runName); + await mkdir(runDir, { recursive: true }); + const indexPath = path.join(runDir, 'index.jsonl'); + await writeFile( + indexPath, + `${records.map((record) => JSON.stringify(record)).join('\n')}\n`, + 'utf8', + ); + return { runDir, indexPath }; +} + +describe('trend command', () => { + const cleanupDirs: string[] = []; + + afterEach(async () => { + await Promise.all( + cleanupDirs.splice(0).map((dir) => rm(dir, { recursive: true, force: true })), + ); + }); + + it('computes a degrading trend over matched tests after dataset and target filtering', async () => { + const cwd = await createTempDir(); + cleanupDirs.push(cwd); + + const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'code-review', + target: 'claude-sonnet', + score: 0.95, + timestamp: '2026-03-01T10:00:00.000Z', + }, + { + test_id: 
't2', + dataset: 'code-review', + target: 'claude-sonnet', + score: 0.85, + timestamp: '2026-03-01T10:00:00.000Z', + }, + { + test_id: 't1', + dataset: 'code-review', + target: 'gpt-5', + score: 0.7, + timestamp: '2026-03-01T10:00:00.000Z', + }, + ]); + const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'code-review', + target: 'claude-sonnet', + score: 0.85, + timestamp: '2026-03-08T10:00:00.000Z', + }, + { + test_id: 't2', + dataset: 'code-review', + target: 'claude-sonnet', + score: 0.75, + timestamp: '2026-03-08T10:00:00.000Z', + }, + { + test_id: 't1', + dataset: 'code-review', + target: 'gpt-5', + score: 0.8, + timestamp: '2026-03-08T10:00:00.000Z', + }, + ]); + const run3 = await createRunWorkspace(cwd, '2026-03-15T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'code-review', + target: 'claude-sonnet', + score: 0.75, + timestamp: '2026-03-15T10:00:00.000Z', + }, + { + test_id: 't2', + dataset: 'code-review', + target: 'claude-sonnet', + score: 0.65, + timestamp: '2026-03-15T10:00:00.000Z', + }, + { + test_id: 't1', + dataset: 'code-review', + target: 'gpt-5', + score: 0.9, + timestamp: '2026-03-15T10:00:00.000Z', + }, + ]); + + const output = analyzeTrend({ + sourcePaths: [run1.indexPath, run2.indexPath, run3.indexPath], + dataset: 'code-review', + target: 'claude-sonnet', + slopeThreshold: 0.01, + allowMissingTests: false, + failOnDegrading: false, + }); + + expect(output.runs).toHaveLength(3); + expect(output.runs[0]?.meanScore).toBeCloseTo(0.9, 10); + expect(output.runs[1]?.meanScore).toBeCloseTo(0.8, 10); + expect(output.runs[2]?.meanScore).toBeCloseTo(0.7, 10); + expect(output.summary.matchedTestCount).toBe(2); + expect(output.summary.slope).toBeCloseTo(-0.1, 10); + expect(output.summary.direction).toBe('degrading'); + expect(output.regression.triggered).toBe(false); + }); + + it('supports independent run aggregation when missing tests are allowed', async () => { + const cwd = await 
createTempDir(); + cleanupDirs.push(cwd); + + const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'alpha', + score: 0.8, + timestamp: '2026-03-01T10:00:00.000Z', + }, + { + test_id: 't2', + dataset: 'suite', + target: 'alpha', + score: 0.6, + timestamp: '2026-03-01T10:00:00.000Z', + }, + ]); + const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'alpha', + score: 0.9, + timestamp: '2026-03-08T10:00:00.000Z', + }, + ]); + + const output = analyzeTrend({ + sourcePaths: [run1.indexPath, run2.indexPath], + dataset: 'suite', + target: 'alpha', + slopeThreshold: 0.01, + allowMissingTests: true, + failOnDegrading: false, + }); + + expect(output.filters.allowMissingTests).toBe(true); + expect(output.runs.map((run) => run.matchedTestCount)).toEqual([2, 1]); + expect(output.summary.direction).toBe('improving'); + }); + + it('rejects runs that have no matching records after target filtering', async () => { + const cwd = await createTempDir(); + cleanupDirs.push(cwd); + + const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'alpha', + score: 0.8, + timestamp: '2026-03-01T10:00:00.000Z', + }, + ]); + const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'beta', + score: 0.7, + timestamp: '2026-03-08T10:00:00.000Z', + }, + ]); + + expect(() => + analyzeTrend({ + sourcePaths: [run1.indexPath, run2.indexPath], + dataset: 'suite', + target: 'alpha', + slopeThreshold: 0.01, + allowMissingTests: false, + failOnDegrading: false, + }), + ).toThrow('Run has no matching records'); + }); + + it('rejects legacy flat jsonl inputs', async () => { + const cwd = await createTempDir(); + cleanupDirs.push(cwd); + + const flatFile = path.join(cwd, 'results.jsonl'); + await writeFile(flatFile, `${JSON.stringify({ 
test_id: 't1', score: 0.9 })}\n`, 'utf8'); + + expect(() => resolveTrendSources(cwd, [flatFile])).toThrow( + 'Unsupported result source for trend', + ); + }); + + it('discovers canonical run workspaces with --last ordering oldest to newest', async () => { + const cwd = await createTempDir(); + cleanupDirs.push(cwd); + + await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ + { test_id: 't1', score: 0.8, timestamp: '2026-03-01T10:00:00.000Z' }, + ]); + await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ + { test_id: 't1', score: 0.85, timestamp: '2026-03-08T10:00:00.000Z' }, + ]); + await createRunWorkspace(cwd, '2026-03-15T10-00-00-000Z', [ + { test_id: 't1', score: 0.9, timestamp: '2026-03-15T10:00:00.000Z' }, + ]); + + const sources = resolveTrendSources(cwd, [], 2); + expect(sources).toHaveLength(2); + expect(sources[0]).toContain('2026-03-08T10-00-00-000Z'); + expect(sources[1]).toContain('2026-03-15T10-00-00-000Z'); + }); + + it('classifies direction and exit code using the slope threshold', () => { + const stats = computeRegressionStats([0.9, 0.8, 0.7]); + const direction = classifyTrendDirection(stats.slope, 0.01); + + expect(direction).toBe('degrading'); + expect(determineTrendExitCode(direction, false)).toBe(0); + expect(determineTrendExitCode(direction, true)).toBe(1); + }); + + it('emits JSON output for explicit run inputs', async () => { + const cwd = await createTempDir(); + cleanupDirs.push(cwd); + + const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'alpha', + score: 0.9, + timestamp: '2026-03-01T10:00:00.000Z', + }, + { + test_id: 't2', + dataset: 'suite', + target: 'alpha', + score: 0.8, + timestamp: '2026-03-01T10:00:00.000Z', + }, + ]); + const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'alpha', + score: 0.8, + timestamp: '2026-03-08T10:00:00.000Z', + }, + { + test_id: 't2', + dataset: 
'suite', + target: 'alpha', + score: 0.7, + timestamp: '2026-03-08T10:00:00.000Z', + }, + ]); + const run3 = await createRunWorkspace(cwd, '2026-03-15T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'alpha', + score: 0.7, + timestamp: '2026-03-15T10:00:00.000Z', + }, + { + test_id: 't2', + dataset: 'suite', + target: 'alpha', + score: 0.6, + timestamp: '2026-03-15T10:00:00.000Z', + }, + ]); + + const result = await execa( + 'bun', + [ + '--no-env-file', + CLI_ENTRY, + 'trend', + run1.runDir, + run2.indexPath, + run3.runDir, + '--dataset', + 'suite', + '--target', + 'alpha', + '--json', + ], + { cwd, reject: false }, + ); + + expect(result.exitCode).toBe(0); + const parsed = JSON.parse(result.stdout) as Record; + expect(parsed.filters).toEqual({ + dataset: 'suite', + target: 'alpha', + allow_missing_tests: false, + }); + expect((parsed.summary as Record).direction).toBe('degrading'); + expect((parsed.summary as Record).matched_test_count).toBe(2); + }); + + it('normalizes explicit run inputs to chronological order before analysis', async () => { + const cwd = await createTempDir(); + cleanupDirs.push(cwd); + + const run1 = await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'alpha', + score: 0.9, + timestamp: '2026-03-01T10:00:00.000Z', + }, + { + test_id: 't2', + dataset: 'suite', + target: 'alpha', + score: 0.8, + timestamp: '2026-03-01T10:00:00.000Z', + }, + ]); + const run2 = await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'alpha', + score: 0.8, + timestamp: '2026-03-08T10:00:00.000Z', + }, + { + test_id: 't2', + dataset: 'suite', + target: 'alpha', + score: 0.7, + timestamp: '2026-03-08T10:00:00.000Z', + }, + ]); + const run3 = await createRunWorkspace(cwd, '2026-03-15T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'alpha', + score: 0.7, + timestamp: '2026-03-15T10:00:00.000Z', + }, + { + test_id: 
't2', + dataset: 'suite', + target: 'alpha', + score: 0.6, + timestamp: '2026-03-15T10:00:00.000Z', + }, + ]); + + const output = analyzeTrend({ + sourcePaths: [run3.runDir, run1.indexPath, run2.runDir], + dataset: 'suite', + target: 'alpha', + slopeThreshold: 0.01, + allowMissingTests: false, + failOnDegrading: false, + }); + + expect(output.runs.map((run) => run.timestamp)).toEqual([ + '2026-03-01T10:00:00.000Z', + '2026-03-08T10:00:00.000Z', + '2026-03-15T10:00:00.000Z', + ]); + expect(output.summary.dateRange).toEqual({ + start: '2026-03-01T10:00:00.000Z', + end: '2026-03-15T10:00:00.000Z', + }); + expect(output.summary.direction).toBe('degrading'); + }); + + it('uses --last discovery and fails CI gating on sustained degradation', async () => { + const cwd = await createTempDir(); + cleanupDirs.push(cwd); + + await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'alpha', + score: 0.95, + timestamp: '2026-03-01T10:00:00.000Z', + }, + { + test_id: 't2', + dataset: 'suite', + target: 'alpha', + score: 0.85, + timestamp: '2026-03-01T10:00:00.000Z', + }, + ]); + await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'alpha', + score: 0.85, + timestamp: '2026-03-08T10:00:00.000Z', + }, + { + test_id: 't2', + dataset: 'suite', + target: 'alpha', + score: 0.75, + timestamp: '2026-03-08T10:00:00.000Z', + }, + ]); + await createRunWorkspace(cwd, '2026-03-15T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'alpha', + score: 0.75, + timestamp: '2026-03-15T10:00:00.000Z', + }, + { + test_id: 't2', + dataset: 'suite', + target: 'alpha', + score: 0.65, + timestamp: '2026-03-15T10:00:00.000Z', + }, + ]); + + const result = await execa( + 'bun', + [ + '--no-env-file', + CLI_ENTRY, + 'trend', + '--last', + '3', + '--dataset', + 'suite', + '--target', + 'alpha', + '--fail-on-degrading', + '--slope-threshold', + '0.01', + ], + { cwd, reject: false }, 
+ ); + + expect(result.exitCode).toBe(1); + expect(result.stdout).toContain('Trend Analysis'); + expect(result.stdout).toContain('degrading'); + }); + + it('errors when target filtering leaves a selected run empty in CLI mode', async () => { + const cwd = await createTempDir(); + cleanupDirs.push(cwd); + + await createRunWorkspace(cwd, '2026-03-01T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'alpha', + score: 0.8, + timestamp: '2026-03-01T10:00:00.000Z', + }, + ]); + await createRunWorkspace(cwd, '2026-03-08T10-00-00-000Z', [ + { + test_id: 't1', + dataset: 'suite', + target: 'beta', + score: 0.7, + timestamp: '2026-03-08T10:00:00.000Z', + }, + ]); + + const result = await execa( + 'bun', + [ + '--no-env-file', + CLI_ENTRY, + 'trend', + '--last', + '2', + '--dataset', + 'suite', + '--target', + 'alpha', + ], + { cwd, reject: false }, + ); + + expect(result.exitCode).toBe(1); + expect(result.stderr).toContain('Run has no matching records'); + }); +}); diff --git a/apps/web/src/content/docs/docs/tools/trend.mdx b/apps/web/src/content/docs/docs/tools/trend.mdx new file mode 100644 index 000000000..0d832ece4 --- /dev/null +++ b/apps/web/src/content/docs/docs/tools/trend.mdx @@ -0,0 +1,158 @@ +--- +title: Trend +description: Analyze score drift across multiple historical eval runs +sidebar: + order: 2 +--- + +The `trend` command analyzes score movement across multiple historical run manifests and reports whether quality is improving, degrading, or stable over time. + +Use it when pairwise `compare` is too narrow and you want to detect gradual drift across a sequence of runs. + +## Usage + +Analyze the last 8 canonical runs in the current workspace: + +```bash +agentv trend --last 8 +``` + +This is the primary day-to-day workflow. In most cases, users should start with `--last`. 
+ +Filter to one dataset and target: + +```bash +agentv trend --last 8 --dataset code-review --target claude-sonnet +``` + +Point directly at run workspaces or `index.jsonl` manifests when you need a specific historical slice or want a reproducible example: + +```bash +agentv trend \ + .agentv/results/runs/2026-03-01T10-00-00-000Z/ \ + .agentv/results/runs/2026-03-08T10-00-00-000Z/index.jsonl \ + .agentv/results/runs/2026-03-15T10-00-00-000Z/ +``` + +Concrete regression-gating example: + +```bash +agentv trend --last 8 --dataset code-review --target claude-sonnet \ + --fail-on-degrading --slope-threshold 0.01 +``` + +## Supported Inputs + +`trend` only accepts canonical run workspaces: + +- `.agentv/results/runs/<timestamp>/` +- `.agentv/results/runs/<timestamp>/index.jsonl` + +Legacy flat `results.jsonl` files are rejected. The command stays on lightweight `index.jsonl` manifests and does not require per-test artifact hydration. + +## Options + +| Option | Description | +|--------|-------------| +| `--last <n>` | Use the most recent `n` runs from `.agentv/results/runs/` | +| `--dataset <name>` | Filter records to one dataset | +| `--target <name>` | Filter records to one target inside each run | +| `--slope-threshold <value>` | Minimum absolute slope required to classify improving or degrading (default: `0.01`) | +| `--fail-on-degrading` | Exit non-zero when the detected trend is degrading beyond the threshold | +| `--allow-missing-tests` | Aggregate each run independently instead of intersecting test IDs across runs | +| `--format`, `-f` | Output format: `table` (default) or `json` | +| `--json` | Shorthand for `--format=json` | + +## How It Works + +1. Loads each selected `index.jsonl` manifest. +2. Applies `dataset` and `target` filters per record. +3. By default, reduces every run to the intersection of test IDs present in all selected runs. +4. Computes one mean score per run. +5. Fits a simple linear regression over run index `0..N-1` (runs ordered oldest to newest), so the slope is the change in mean score per run. +6. Classifies the slope as `improving`, `degrading`, or `stable` by comparing it against `--slope-threshold`. 
+ +Strict matched-test analysis is the default because changing test composition across runs can create false drift signals. + +## Worked Example + +Suppose three historical runs for `dataset=code-review` and `target=claude-sonnet` produce matched mean scores of `0.92`, `0.86`, and `0.80`. + +- The slope is negative. +- The command reports `direction=degrading`. +- With `--fail-on-degrading --slope-threshold 0.01`, the command exits with code `1`. + +This is the intended CI workflow for detecting slow drift that a single pairwise comparison can miss. + +## Output + +### Table format + +```text +Trend Analysis + +Runs: 3 | Range: 2026-03-01T10:00:00.000Z → 2026-03-15T10:00:00.000Z +Filters: dataset=code-review target=claude-sonnet mode=matched-tests +Matched Tests: 42 | Verdict: degrading + + Run Tests Mean Score + ---------------------------- ----- ---------- + 2026-03-01T10:00:00.000Z 42 0.920 + 2026-03-08T10:00:00.000Z 42 0.905 + 2026-03-15T10:00:00.000Z 42 0.892 + +Summary: slope=-0.014 intercept=0.920 r²=0.943 +Regression Gate: threshold=0.010 fail_on_degrading=true triggered=true +``` + +### JSON format + +```json +{ + "runs": [ + { + "label": "2026-03-01T10:00:00.000Z", + "path": "/repo/.agentv/results/runs/2026-03-01T10-00-00-000Z/index.jsonl", + "timestamp": "2026-03-01T10:00:00.000Z", + "matched_test_count": 42, + "mean_score": 0.92 + } + ], + "filters": { + "dataset": "code-review", + "target": "claude-sonnet", + "allow_missing_tests": false + }, + "summary": { + "run_count": 3, + "matched_test_count": 42, + "date_range": { + "start": "2026-03-01T10:00:00.000Z", + "end": "2026-03-15T10:00:00.000Z" + }, + "slope": -0.014, + "intercept": 0.92, + "r_squared": 0.943, + "direction": "degrading" + }, + "regression": { + "slope_threshold": 0.01, + "fail_on_degrading": true, + "triggered": true + } +} +``` + +## Exit Codes + +| Code | Meaning | +|------|---------| +| `0` | Informational mode, or no degrading trend triggered | +| `1` | Invalid input, analysis 
error, or `--fail-on-degrading` detected a degrading trend | + +## Compare vs Trend + +- `compare` answers: "Did this run beat that run?" +- `trend` answers: "Across many runs, are scores drifting up or down?" + +Use `compare` for pairwise regressions. Use `trend` for longitudinal drift detection. diff --git a/examples/features/trend/README.md b/examples/features/trend/README.md new file mode 100644 index 000000000..83c70afe2 --- /dev/null +++ b/examples/features/trend/README.md @@ -0,0 +1,95 @@ +# Trend Analysis Example + +This example demonstrates `agentv trend` on three historical runs for the same dataset and target. + +Scenario: + +- Dataset: `code-review` +- Target: `claude-sonnet` +- Test IDs tracked across runs: `summary-accuracy`, `tool-selection` +- Outcome: scores degrade steadily from `0.92` to `0.86` to `0.80` + +## Files + +Tracked sample runs live in: + +```text +sample-runs/ + 2026-03-01T10-00-00-000Z/index.jsonl + 2026-03-08T10-00-00-000Z/index.jsonl + 2026-03-15T10-00-00-000Z/index.jsonl +``` + +These are canonical run directories with `index.jsonl`. + +## End-User Flow + +Most real users will run `trend` against their latest eval history with `--last`. 
+ +To reproduce that flow from this example directory, first copy the sample runs into the normal runtime layout: + +```bash +mkdir -p .agentv/results/runs +cp -R sample-runs/* .agentv/results/runs/ +``` + +Then run: + +```bash +bun ../../../apps/cli/src/cli.ts trend --last 3 --dataset code-review --target claude-sonnet +``` + +Expected output: + +```text +Trend Analysis + +Runs: 3 | Range: 2026-03-01T10:00:00.000Z → 2026-03-15T10:00:00.000Z +Filters: dataset=code-review target=claude-sonnet mode=matched-tests +Matched Tests: 2 | Verdict: degrading + + Run Tests Mean Score + ──────────────────────── ─────── ────────── + 2026-03-01T10:00:00.000Z 2 0.920 + 2026-03-08T10:00:00.000Z 2 0.860 + 2026-03-15T10:00:00.000Z 2 0.800 + +Summary: slope=-0.060 intercept=0.920 r²=1.000 +Regression Gate: threshold=0.010 fail_on_degrading=false triggered=false +``` + +Interpretation: + +- The command auto-discovers the most recent three runs. +- It filters to `dataset=code-review` and `target=claude-sonnet`. +- It intersects matched test IDs across runs and detects a steady downward score trend. + +## Explicit Inputs + +If you want to see the same analysis without copying files into `.agentv/results/runs/`, point `trend` at the sample runs directly: + +```bash +bun ../../../apps/cli/src/cli.ts trend \ + sample-runs/2026-03-01T10-00-00-000Z \ + sample-runs/2026-03-08T10-00-00-000Z \ + sample-runs/2026-03-15T10-00-00-000Z \ + --dataset code-review \ + --target claude-sonnet +``` + +## CI Gate Example + +To turn the same analysis into a failure signal: + +```bash +bun ../../../apps/cli/src/cli.ts trend \ + sample-runs/2026-03-01T10-00-00-000Z \ + sample-runs/2026-03-08T10-00-00-000Z \ + sample-runs/2026-03-15T10-00-00-000Z \ + --dataset code-review \ + --target claude-sonnet \ + --fail-on-degrading \ + --slope-threshold 0.01 +``` + +This exits with code `1` because the degrading slope magnitude exceeds `0.01`. 
diff --git a/examples/features/trend/sample-runs/2026-03-01T10-00-00-000Z/index.jsonl b/examples/features/trend/sample-runs/2026-03-01T10-00-00-000Z/index.jsonl new file mode 100644 index 000000000..8379d80e6 --- /dev/null +++ b/examples/features/trend/sample-runs/2026-03-01T10-00-00-000Z/index.jsonl @@ -0,0 +1,3 @@ +{"timestamp":"2026-03-01T10:00:00.000Z","test_id":"summary-accuracy","dataset":"code-review","target":"claude-sonnet","score":0.94} +{"timestamp":"2026-03-01T10:00:00.000Z","test_id":"tool-selection","dataset":"code-review","target":"claude-sonnet","score":0.90} +{"timestamp":"2026-03-01T10:00:00.000Z","test_id":"summary-accuracy","dataset":"code-review","target":"gpt-5","score":0.88} diff --git a/examples/features/trend/sample-runs/2026-03-08T10-00-00-000Z/index.jsonl b/examples/features/trend/sample-runs/2026-03-08T10-00-00-000Z/index.jsonl new file mode 100644 index 000000000..3a41da3b0 --- /dev/null +++ b/examples/features/trend/sample-runs/2026-03-08T10-00-00-000Z/index.jsonl @@ -0,0 +1,3 @@ +{"timestamp":"2026-03-08T10:00:00.000Z","test_id":"summary-accuracy","dataset":"code-review","target":"claude-sonnet","score":0.88} +{"timestamp":"2026-03-08T10:00:00.000Z","test_id":"tool-selection","dataset":"code-review","target":"claude-sonnet","score":0.84} +{"timestamp":"2026-03-08T10:00:00.000Z","test_id":"summary-accuracy","dataset":"code-review","target":"gpt-5","score":0.90} diff --git a/examples/features/trend/sample-runs/2026-03-15T10-00-00-000Z/index.jsonl b/examples/features/trend/sample-runs/2026-03-15T10-00-00-000Z/index.jsonl new file mode 100644 index 000000000..75dc05a21 --- /dev/null +++ b/examples/features/trend/sample-runs/2026-03-15T10-00-00-000Z/index.jsonl @@ -0,0 +1,3 @@ +{"timestamp":"2026-03-15T10:00:00.000Z","test_id":"summary-accuracy","dataset":"code-review","target":"claude-sonnet","score":0.82} +{"timestamp":"2026-03-15T10:00:00.000Z","test_id":"tool-selection","dataset":"code-review","target":"claude-sonnet","score":0.78} 
+{"timestamp":"2026-03-15T10:00:00.000Z","test_id":"summary-accuracy","dataset":"code-review","target":"gpt-5","score":0.91} diff --git a/packages/core/test/evaluation/providers/cli-schema.test.ts b/packages/core/test/evaluation/providers/cli-schema.test.ts index acff93e1e..a2646b7d4 100644 --- a/packages/core/test/evaluation/providers/cli-schema.test.ts +++ b/packages/core/test/evaluation/providers/cli-schema.test.ts @@ -101,7 +101,6 @@ describe('CliTargetInputSchema', () => { const input = { provider: 'cli', command: 'agent run {PROMPT}' }; expect(CliTargetInputSchema.safeParse(input).success).toBe(false); }); - }); describe('CliHealthcheckSchema (strict)', () => {