From 634d19d46e3721ed29d5d146b4fe0815e3ee2342 Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 05:35:07 +0000
Subject: [PATCH 1/8] feat(cli): add [INFO] log prefix, fix verdict icons,
 show score as percentage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add [INFO] prefix to all eval CLI output lines so framework messages
are distinguishable from interleaved provider logs (Copilot, Codex,
Pi).

Fix verdict icons: completed tests with FAIL verdict now show ❌
instead of ✅, and ERROR shows ⚠️.

Show scores as percentages (75%) instead of decimals (0.750).

Closes #1073

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../cli/src/commands/eval/progress-display.ts | 26 +++++---
 apps/cli/src/commands/eval/run-eval.ts        | 60 +++++++++----------
 apps/cli/src/commands/eval/statistics.ts      |  5 +-
 apps/cli/test/eval.integration.test.ts        |  6 +-
 4 files changed, 53 insertions(+), 44 deletions(-)

diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts
index 38b8edfc9..b30ca7d8c 100644
--- a/apps/cli/src/commands/eval/progress-display.ts
+++ b/apps/cli/src/commands/eval/progress-display.ts
@@ -1,5 +1,8 @@
 export type Verdict = 'PASS' | 'FAIL' | 'ERROR';
 
+/** Prefix for all AgentV framework log lines, making them distinguishable from provider output. */
+export const LOG_PREFIX = '[INFO]';
+
 export interface WorkerProgress {
   workerId: number;
   testId: string;
@@ -27,7 +30,7 @@ function formatVerdict(score: number | undefined, verdict: Verdict | undefined):
   if (verdict === undefined) return '';
 
   const colors = useColors();
-  const scoreStr = score !== undefined ? score.toFixed(3) : '';
+  const scoreStr = score !== undefined ? `${Math.round(score * 100)}%` : '';
   const verdictLabel = verdict === 'ERROR' ? 'ERROR' : `${scoreStr} ${verdict}`;
 
   if (!colors) return ` | ${verdictLabel}`;
@@ -87,25 +90,30 @@
       case 'pending':
         // Only print pending in verbose mode (just shows the queue)
         if (this.verbose && !previous) {
-          console.log(`${countPrefix} ⏳ ${progress.testId}${targetSuffix}`);
+          console.log(`${LOG_PREFIX} ${countPrefix} ⏳ ${progress.testId}${targetSuffix}`);
         }
         break;
       case 'running':
         // Always print running - useful feedback for long-running agents
         if (!previous || previous.status === 'pending') {
-          console.log(`${countPrefix} 🔄 ${progress.testId}${targetSuffix}`);
+          console.log(`${LOG_PREFIX} ${countPrefix} 🔄 ${progress.testId}${targetSuffix}`);
         }
         break;
-      case 'completed':
+      case 'completed': {
+        // Pick icon based on verdict: ✅ PASS, ❌ FAIL, ⚠️ ERROR
+        const icon = progress.verdict === 'FAIL' ? '❌' : progress.verdict === 'ERROR' ? '⚠️' : '✅';
         console.log(
-          `${countPrefix} ✅ ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
+          `${LOG_PREFIX} ${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
         );
         break;
-      case 'failed':
+      }
+      case 'failed': {
+        const failIcon = progress.verdict === 'ERROR' ? '⚠️' : '❌';
         console.log(
-          `${countPrefix} ❌ ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
+          `${LOG_PREFIX} ${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
        );
         break;
+      }
     }
   }
 
@@ -133,13 +141,13 @@
           : provider === 'copilot'
             ? 'Copilot CLI'
            : 'Codex CLI';
-      console.log(`${label} logs:`);
+      console.log(`${LOG_PREFIX} ${label} logs:`);
       this.hasPrintedLogHeader = true;
     }
 
     const startIndex = this.logPaths.length - newPaths.length;
     newPaths.forEach((path, offset) => {
-      console.log(`${startIndex + offset + 1}. ${path}`);
+      console.log(`${LOG_PREFIX} ${startIndex + offset + 1}. ${path}`);
     });
   }
 
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index ada1ae382..bce5b4b77 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -34,7 +34,7 @@ import { writeArtifactsFromResults } from './artifact-writer.js';
 import { writeBenchmarkJson } from './benchmark-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
 import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
-import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
+import { LOG_PREFIX, ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
 import { buildDefaultRunDir, normalizeExperimentName } from './result-layout.js';
 import {
   buildExclusionFilter,
@@ -684,7 +684,7 @@ async function runSingleEvalFile(params: {
       ? `Using target (${resolvedTargetSelection.targetSource}): ${resolvedTargetSelection.targetName} ${buildTargetLabelSuffix(providerLabel, resolvedTargetSelection.resolvedTarget)} via ${resolvedTargetSelection.targetsFilePath}`
       : `Using target: ${inlineTargetLabel}`;
   if (!progressReporter.isInteractive || options.verbose) {
-    console.log(targetMessage);
+    console.log(`${LOG_PREFIX} ${targetMessage}`);
   }
 
   const agentTimeoutMs =
@@ -748,7 +748,7 @@
     const targetConfig = resolvedTargetSelection.resolvedTarget.config as Record<string, unknown>;
     if (shouldSkipCacheForTemperature(targetConfig)) {
       if (options.verbose) {
-        console.log('Cache skipped: target temperature > 0');
+        console.log(`${LOG_PREFIX} Cache skipped: target temperature > 0`);
       }
       return false;
     }
@@ -914,14 +914,14 @@ export async function runEvalCommand(
     retryNonErrorResults = await loadNonErrorResults(retryPath);
 
     if (errorIds.length > 0) {
-      console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
+      console.log(`${LOG_PREFIX} Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
     }
     // Use a negation filter to exclude fully-completed (non-error across all targets) cases.
     // This re-runs error cases, cases missing from the output (crash recovery), and cases
     // that errored on some targets even if they succeeded on others (matrix safety).
     if (completedIds.length > 0) {
       options = { ...options, filter: buildExclusionFilter(completedIds) };
-      console.log(`Skipping ${completedIds.length} already-completed test(s).`);
+      console.log(`${LOG_PREFIX} Skipping ${completedIds.length} already-completed test(s).`);
     }
   }
 
@@ -944,7 +944,7 @@
   }
 
   if (options.verbose) {
-    console.log(`Repository root: ${repoRoot}`);
+    console.log(`${LOG_PREFIX} Repository root: ${repoRoot}`);
   }
 
   // Emit deprecation warnings for legacy flags
@@ -1054,18 +1054,18 @@
   // Resolve --export paths (additional output files)
   const resolvedExportPaths = options.exportPaths.map((p: string) => path.resolve(p));
 
-  console.log(`Artifact directory: ${runDir}`);
+  console.log(`${LOG_PREFIX} Artifact directory: ${runDir}`);
   if (resolvedExportPaths.length > 0) {
-    console.log('Export files:');
+    console.log(`${LOG_PREFIX} Export files:`);
     for (const p of resolvedExportPaths) {
-      console.log(`  ${p}`);
+      console.log(`${LOG_PREFIX} ${p}`);
     }
   }
 
   // Log file export paths
   const resolvedTestFiles = input.testFiles.map((file) => path.resolve(file));
   if (options.otelFile) {
-    console.log(`OTLP JSON file: ${path.resolve(options.otelFile)}`);
+    console.log(`${LOG_PREFIX} OTLP JSON file: ${path.resolve(options.otelFile)}`);
   }
 
   // Determine cache state after loading file metadata (need YAML config)
@@ -1144,11 +1144,11 @@
   }
   if (skippedFiles.length > 0 && options.verbose) {
     console.log(
-      `Skipped ${skippedFiles.length} eval file(s) by tag filter: ${skippedFiles.join(', ')}`,
+      `${LOG_PREFIX} Skipped ${skippedFiles.length} eval file(s) by tag filter: ${skippedFiles.join(', ')}`,
     );
   }
   if (fileMetadata.size === 0) {
-    console.log('No eval files matched the tag filters. Nothing to run.');
+    console.log(`${LOG_PREFIX} No eval files matched the tag filters. Nothing to run.`);
     return;
   }
 }
@@ -1168,7 +1168,7 @@
     : undefined;
 
   if (cacheEnabled) {
-    console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`);
+    console.log(`${LOG_PREFIX} Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`);
   }
 
   // Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
@@ -1204,7 +1204,7 @@
   if (totalEvalCount === 0) {
     // When using --retry-errors, all tests being filtered means no errors or missing cases remain
     if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) {
-      console.log('No execution errors or missing cases in the previous run. Nothing to retry.');
+      console.log(`${LOG_PREFIX} No execution errors or missing cases in the previous run. Nothing to retry.`);
       return;
     }
     throw new Error('No tests matched the provided filters.');
@@ -1288,7 +1288,7 @@
     transcriptProviderFactory = () => transcriptProvider;
 
     console.log(
-      `Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`,
+      `${LOG_PREFIX} Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`,
     );
   }
 
@@ -1364,7 +1364,7 @@
       // before_all or other setup failures should not abort the entire run.
       // Mark all tests in this file as errors and continue with other files.
       const message = fileError instanceof Error ? fileError.message : String(fileError);
-      console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`);
+      console.error(`\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`);
       const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({
         timestamp: new Date().toISOString(),
         testId: testCase.id,
@@ -1401,7 +1401,7 @@
     }
     allResults.push(...retryNonErrorResults);
     console.log(
-      `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`,
+      `${LOG_PREFIX} Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`,
     );
   }
 
@@ -1424,7 +1424,7 @@
   if (options.benchmarkJson && allResults.length > 0) {
     const benchmarkPath = path.resolve(options.benchmarkJson);
     await writeBenchmarkJson(benchmarkPath, allResults);
-    console.log(`Benchmark written to: ${benchmarkPath}`);
+    console.log(`${LOG_PREFIX} Benchmark written to: ${benchmarkPath}`);
   }
 
   // Write artifacts to the run directory (always, not conditional on flags)
@@ -1439,13 +1439,13 @@
       evalFile,
       experiment: normalizeExperimentName(options.experiment),
     });
-    console.log(`Artifact workspace written to: ${runDir}`);
-    console.log(`  Index: ${indexPath}`);
+    console.log(`${LOG_PREFIX} Artifact workspace written to: ${runDir}`);
+    console.log(`${LOG_PREFIX} Index: ${indexPath}`);
     console.log(
-      `  Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
+      `${LOG_PREFIX} Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
     );
-    console.log(`  Timing: ${timingPath}`);
-    console.log(`  Benchmark: ${workspaceBenchmarkPath}`);
+    console.log(`${LOG_PREFIX} Timing: ${timingPath}`);
+    console.log(`${LOG_PREFIX} Benchmark: ${workspaceBenchmarkPath}`);
   }
 
   // Write --export output files (additional formats)
@@ -1458,7 +1458,7 @@
       await writer.close();
     }
     console.log(
-      `Export file(s) written: ${resolvedExportPaths.map((p) => path.relative(cwd, p)).join(', ')}`,
+      `${LOG_PREFIX} Export file(s) written: ${resolvedExportPaths.map((p) => path.relative(cwd, p)).join(', ')}`,
    );
   }
 
@@ -1469,9 +1469,9 @@
     : resultsWithWorkspaces.filter((r) => r.error || r.score < 0.5);
 
   if (preservedWorkspaces.length > 0) {
-    console.log('\nPreserved workspaces:');
+    console.log(`\n${LOG_PREFIX} Preserved workspaces:`);
     for (const result of preservedWorkspaces) {
-      console.log(`  ${result.testId} -> ${result.workspacePath}`);
+      console.log(`${LOG_PREFIX} ${result.testId} -> ${result.workspacePath}`);
     }
   }
 
@@ -1480,11 +1480,11 @@
     resultsWithWorkspaces.length > 0 ||
     (options.workspaceMode && options.workspaceMode !== 'static');
   if (!options.keepWorkspaces && usedWorkspaces) {
-    console.log('Use --keep-workspaces to preserve all workspaces for inspection.');
+    console.log(`${LOG_PREFIX} Use --keep-workspaces to preserve all workspaces for inspection.`);
   }
 
   if (allResults.length > 0) {
-    console.log(`\nResults written to: ${outputPath}`);
+    console.log(`\n${LOG_PREFIX} Results written to: ${outputPath}`);
 
     // Persist last run path for `agentv results` commands
     await saveRunCache(cwd, outputPath).catch(() => undefined);
@@ -1524,8 +1524,8 @@
     const targetFlag = options.target ? ` --target ${options.target}` : '';
     const relativeOutputPath = path.relative(cwd, outputPath);
     console.log(
-      `\nTip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:\n` +
-        `  agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`,
+      `\n${LOG_PREFIX} Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:\n` +
+        `${LOG_PREFIX} agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`,
     );
   }
 
diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts
index 2d584c55d..7765114b4 100644
--- a/apps/cli/src/commands/eval/statistics.ts
+++ b/apps/cli/src/commands/eval/statistics.ts
@@ -1,4 +1,5 @@
 import type { EvaluationResult } from '@agentv/core';
+import { LOG_PREFIX } from './progress-display.js';
 
 export interface HistogramBin {
   readonly range: readonly [number, number];
@@ -296,7 +297,7 @@ export function formatEvaluationSummary(
     }
   }
 
-  return lines.join('\n');
+  return lines.map((line) => (line === '' ? '' : `${LOG_PREFIX} ${line}`)).join('\n');
 }
 
 /**
@@ -359,5 +360,5 @@ export function formatMatrixSummary(results: readonly EvaluationResult[]): strin
   });
   lines.push(`${'Average'.padEnd(testIdColWidth)} ${avgCells.join(' ')}`);
 
-  return lines.join('\n');
+  return lines.map((line) => (line === '' ? '' : `${LOG_PREFIX} ${line}`)).join('\n');
 }
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index 1d8199494..51901c844 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -167,12 +167,12 @@ function extractOutputPath(stdout: string): string {
   const lines = stdout.split(/\r?\n/);
   // Try new format first, then legacy
   const outputLine =
-    lines.find((line) => line.startsWith('Results written to:')) ??
-    lines.find((line) => line.startsWith('Output path:'));
+    lines.find((line) => line.includes('Results written to:')) ??
+    lines.find((line) => line.includes('Output path:'));
   if (!outputLine) {
     throw new Error(`Unable to parse output path from CLI output:\n${stdout}`);
   }
-  return outputLine.replace(/^(Results written to:|Output path:)/, '').trim();
+  return outputLine.replace(/^.*?(Results written to:|Output path:)/, '').trim();
 }
 
 async function readJsonLines(filePath: string): Promise<unknown[]> {

From 025497b0ba8fb697ecf3e5b1ee2935a1d3269ae9 Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 05:36:03 +0000
Subject: [PATCH 2/8] style: fix biome formatting in run-eval.ts

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/eval/run-eval.ts | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index bce5b4b77..e4933b5d2 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -34,7 +34,12 @@ import { writeArtifactsFromResults } from './artifact-writer.js';
 import { writeBenchmarkJson } from './benchmark-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
 import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
-import { LOG_PREFIX, ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
+import {
+  LOG_PREFIX,
+  ProgressDisplay,
+  type Verdict,
+  type WorkerProgress,
+} from './progress-display.js';
 import { buildDefaultRunDir, normalizeExperimentName } from './result-layout.js';
 import {
   buildExclusionFilter,
@@ -914,7 +919,9 @@ export async function runEvalCommand(
     retryNonErrorResults = await loadNonErrorResults(retryPath);
 
     if (errorIds.length > 0) {
-      console.log(`${LOG_PREFIX} Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
+      console.log(
+        `${LOG_PREFIX} Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`,
+      );
     }
     // Use a negation filter to exclude fully-completed (non-error across all targets) cases.
     // This re-runs error cases, cases missing from the output (crash recovery), and cases
@@ -1168,7 +1175,9 @@
     : undefined;
 
   if (cacheEnabled) {
-    console.log(`${LOG_PREFIX} Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`);
+    console.log(
+      `${LOG_PREFIX} Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`,
+    );
   }
 
   // Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
@@ -1204,7 +1213,9 @@
   if (totalEvalCount === 0) {
     // When using --retry-errors, all tests being filtered means no errors or missing cases remain
     if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) {
-      console.log(`${LOG_PREFIX} No execution errors or missing cases in the previous run. Nothing to retry.`);
+      console.log(
+        `${LOG_PREFIX} No execution errors or missing cases in the previous run. Nothing to retry.`,
+      );
       return;
     }
     throw new Error('No tests matched the provided filters.');
@@ -1364,7 +1375,9 @@
       // before_all or other setup failures should not abort the entire run.
      // Mark all tests in this file as errors and continue with other files.
       const message = fileError instanceof Error ? fileError.message : String(fileError);
-      console.error(`\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`);
+      console.error(
+        `\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`,
+      );
       const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({
         timestamp: new Date().toISOString(),
         testId: testCase.id,

From 763a946542f28cf9e7eeed0596a360f8c5e798ef Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 05:39:22 +0000
Subject: =?UTF-8?q?fix:=20swap=20verdict=20icons=20=E2=80=94?=
 =?UTF-8?q?=20=E2=9A=A0=EF=B8=8F=20for=20FAIL,=20=E2=9D=8C=20for=20ERROR?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

FAIL is a quality warning (test ran but scored below threshold). ERROR
is a hard failure (execution broke). Icons now match severity.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/eval/progress-display.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts
index b30ca7d8c..516de38f6 100644
--- a/apps/cli/src/commands/eval/progress-display.ts
+++ b/apps/cli/src/commands/eval/progress-display.ts
@@ -100,15 +100,15 @@
         }
         break;
       case 'completed': {
-        // Pick icon based on verdict: ✅ PASS, ❌ FAIL, ⚠️ ERROR
-        const icon = progress.verdict === 'FAIL' ? '❌' : progress.verdict === 'ERROR' ? '⚠️' : '✅';
+        // Pick icon based on verdict: ✅ PASS, ⚠️ FAIL, ❌ ERROR
+        const icon = progress.verdict === 'FAIL' ? '⚠️' : progress.verdict === 'ERROR' ? '❌' : '✅';
         console.log(
           `${LOG_PREFIX} ${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
         );
         break;
       }
       case 'failed': {
-        const failIcon = progress.verdict === 'ERROR' ? '⚠️' : '❌';
+        const failIcon = progress.verdict === 'ERROR' ? '❌' : '⚠️';
         console.log(
           `${LOG_PREFIX} ${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
         );

From 1cf29227d840641ae2ac8adb45086c49984fa178 Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 05:41:35 +0000
Subject: [PATCH 4/8] refactor: remove [INFO] prefix from summary blocks

The evaluation summary and matrix table are already delimited with
===== borders and print as a single block after all tests complete.
The prefix is only needed for progress lines interleaved with provider
output mid-run.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/eval/statistics.ts | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts
index 7765114b4..2d584c55d 100644
--- a/apps/cli/src/commands/eval/statistics.ts
+++ b/apps/cli/src/commands/eval/statistics.ts
@@ -1,5 +1,4 @@
 import type { EvaluationResult } from '@agentv/core';
-import { LOG_PREFIX } from './progress-display.js';
 
 export interface HistogramBin {
   readonly range: readonly [number, number];
@@ -297,7 +296,7 @@ export function formatEvaluationSummary(
     }
   }
 
-  return lines.map((line) => (line === '' ? '' : `${LOG_PREFIX} ${line}`)).join('\n');
+  return lines.join('\n');
 }
 
 /**
@@ -360,5 +359,5 @@ export function formatMatrixSummary(results: readonly EvaluationResult[]): strin
   });
   lines.push(`${'Average'.padEnd(testIdColWidth)} ${avgCells.join(' ')}`);
 
-  return lines.map((line) => (line === '' ? '' : `${LOG_PREFIX} ${line}`)).join('\n');
+  return lines.join('\n');
 }

From 1789d05ee9df5f48ea7c6876091a34fb2db6653d Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 05:55:19 +0000
Subject: [PATCH 5/8] refactor: replace log heading with per-line "Log
 created:" prefix

The "Copilot CLI logs:" heading lost context when interleaved with
provider output. Each log path now prints as its own self-contained
line:

  [INFO] Log created: /path/to/log

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../cli/src/commands/eval/progress-display.ts | 20 +++-----------------
 apps/cli/src/commands/eval/run-eval.ts        | 13 ++++++------
 2 files changed, 9 insertions(+), 24 deletions(-)

diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts
index 516de38f6..753eafadd 100644
--- a/apps/cli/src/commands/eval/progress-display.ts
+++ b/apps/cli/src/commands/eval/progress-display.ts
@@ -51,7 +51,6 @@
   private completedTests = 0;
   private readonly logPaths: string[] = [];
   private readonly logPathSet = new Set<string>();
-  private hasPrintedLogHeader = false;
   private started = false;
   private finished = false;
   private readonly verbose: boolean;
@@ -117,7 +116,7 @@
     }
   }
 
-  addLogPaths(paths: readonly string[], provider?: 'codex' | 'pi' | 'copilot'): void {
+  addLogPaths(paths: readonly string[]): void {
     const newPaths: string[] = [];
     for (const path of paths) {
       if (this.logPathSet.has(path)) {
@@ -133,22 +132,9 @@
 
     this.logPaths.push(...newPaths);
 
-    if (!this.hasPrintedLogHeader) {
-      console.log('');
-      const label =
-        provider === 'pi'
-          ? 'Pi Coding Agent'
-          : provider === 'copilot'
-            ? 'Copilot CLI'
-            : 'Codex CLI';
-      console.log(`${LOG_PREFIX} ${label} logs:`);
-      this.hasPrintedLogHeader = true;
+    for (const p of newPaths) {
+      console.log(`${LOG_PREFIX} Log created: ${p}`);
     }
-
-    const startIndex = this.logPaths.length - newPaths.length;
-    newPaths.forEach((path, offset) => {
-      console.log(`${LOG_PREFIX} ${startIndex + offset + 1}. ${path}`);
-    });
   }
 
   finish(): void {
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index e4933b5d2..e82ec225f 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -401,7 +401,7 @@ type ProgressReporter = {
   setTotal(total: number): void;
   update(workerId: number, progress: WorkerProgress): void;
   finish(): void;
-  addLogPaths(paths: readonly string[], provider?: 'codex' | 'pi' | 'copilot'): void;
+  addLogPaths(paths: readonly string[]): void;
 };
 
 function createProgressReporter(
@@ -416,8 +416,7 @@ function createProgressReporter(
     update: (workerId: number, progress: WorkerProgress) =>
       display.updateWorker({ ...progress, workerId }),
     finish: () => display.finish(),
-    addLogPaths: (paths: readonly string[], provider?: 'codex' | 'pi' | 'copilot') =>
-      display.addLogPaths(paths, provider),
+    addLogPaths: (paths: readonly string[]) => display.addLogPaths(paths),
   };
 }
 
@@ -1231,7 +1230,7 @@ export async function runEvalCommand(
       return;
     }
     seenCodexLogPaths.add(entry.filePath);
-    progressReporter.addLogPaths([entry.filePath], 'codex');
+    progressReporter.addLogPaths([entry.filePath]);
   });
   const seenPiLogPaths = new Set<string>();
   const unsubscribePiLogs = subscribeToPiLogEntries((entry) => {
@@ -1239,7 +1238,7 @@
      return;
     }
     seenPiLogPaths.add(entry.filePath);
-    progressReporter.addLogPaths([entry.filePath], 'pi');
+    progressReporter.addLogPaths([entry.filePath]);
   });
   const seenCopilotLogPaths = new Set<string>();
   const unsubscribeCopilotSdkLogs = subscribeToCopilotSdkLogEntries((entry) => {
@@ -1247,14 +1246,14 @@
       return;
     }
     seenCopilotLogPaths.add(entry.filePath);
-    progressReporter.addLogPaths([entry.filePath], 'copilot');
+    progressReporter.addLogPaths([entry.filePath]);
   });
   const unsubscribeCopilotCliLogs = subscribeToCopilotCliLogEntries((entry) => {
     if (!entry.filePath || seenCopilotLogPaths.has(entry.filePath)) {
       return;
     }
     seenCopilotLogPaths.add(entry.filePath);
-    progressReporter.addLogPaths([entry.filePath], 'copilot');
+    progressReporter.addLogPaths([entry.filePath]);
   });
   for (const [testFilePath, meta] of fileMetadata.entries()) {
     for (const { selection, inlineTargetLabel } of meta.selections) {

From cf502ad715fdb12249029dba909df1ec01294ec3 Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 06:17:05 +0000
Subject: [PATCH 6/8] fix: rename "Log created" to "Provider log" for clarity

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/eval/progress-display.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts
index 753eafadd..f9fc1fd6b 100644
--- a/apps/cli/src/commands/eval/progress-display.ts
+++ b/apps/cli/src/commands/eval/progress-display.ts
@@ -133,7 +133,7 @@
     this.logPaths.push(...newPaths);
 
     for (const p of newPaths) {
-      console.log(`${LOG_PREFIX} Log created: ${p}`);
+      console.log(`${LOG_PREFIX} Provider log: ${p}`);
     }
   }
 

From f4460527d7503507be37e7686b5cf33bfd6911df Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 06:20:16 +0000
Subject: [PATCH 7/8] feat(cli): show resolved target name when default is a
 use_target redirect
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When "default" in targets.yaml delegates via use_target (e.g., to
"copilot"), the progress lines now show "default → copilot" instead of
just "default", so the user sees which provider is actually running.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/eval/run-eval.ts | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index e82ec225f..554cdd306 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -424,6 +424,14 @@ function makeTestCaseKey(testFilePath: string, testId: string): string {
   return `${path.resolve(testFilePath)}::${testId}`;
 }
 
+/** Show the resolved target name when `default` is a `use_target` redirect. */
+function resolveTargetLabel(requestedName: string, resolvedName: string): string {
+  if (resolvedName !== requestedName) {
+    return `${requestedName} → ${resolvedName}`;
+  }
+  return requestedName;
+}
+
 function createDisplayIdTracker(): { getOrAssign(testCaseKey: string): number } {
   const map = new Map<string, number>();
   let nextId = 1;
@@ -583,7 +591,7 @@ async function prepareFileMetadata(params: {
 
     selections = multiSelections.map((sel) => ({
       selection: sel,
-      inlineTargetLabel: sel.targetName,
+      inlineTargetLabel: resolveTargetLabel(sel.targetName, sel.resolvedTarget.name),
     }));
   } else {
     // Single target mode (legacy path)
@@ -603,7 +611,10 @@ async function prepareFileMetadata(params: {
     selections = [
       {
         selection,
-        inlineTargetLabel: selection.targetName,
+        inlineTargetLabel: resolveTargetLabel(
+          selection.targetName,
+          selection.resolvedTarget.name,
+        ),
       },
     ];
   }

From 3c438483a7b4a8b6eeaedc7a6e6462a3b0f03664 Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 06:32:12 +0000
Subject: =?UTF-8?q?refactor:=20remove=20[INFO]=20prefix=20?=
 =?UTF-8?q?=E2=80=94=20icons=20and=20resolved=20target=20are=20sufficient?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With verdict icons (✅/⚠️/❌), percentages, and resolved target names
(default → copilot), every line is already self-describing. The [INFO]
prefix was just noise.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../cli/src/commands/eval/progress-display.ts | 13 ++--
 apps/cli/src/commands/eval/run-eval.ts        | 69 ++++++++-----------
 2 files changed, 34 insertions(+), 48 deletions(-)

diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts
index f9fc1fd6b..90b70e6cc 100644
--- a/apps/cli/src/commands/eval/progress-display.ts
+++ b/apps/cli/src/commands/eval/progress-display.ts
@@ -1,8 +1,5 @@
 export type Verdict = 'PASS' | 'FAIL' | 'ERROR';
 
-/** Prefix for all AgentV framework log lines, making them distinguishable from provider output. */
-export const LOG_PREFIX = '[INFO]';
-
 export interface WorkerProgress {
   workerId: number;
   testId: string;
@@ -89,27 +86,27 @@
       case 'pending':
         // Only print pending in verbose mode (just shows the queue)
         if (this.verbose && !previous) {
-          console.log(`${LOG_PREFIX} ${countPrefix} ⏳ ${progress.testId}${targetSuffix}`);
+          console.log(`${countPrefix} ⏳ ${progress.testId}${targetSuffix}`);
         }
         break;
       case 'running':
         // Always print running - useful feedback for long-running agents
         if (!previous || previous.status === 'pending') {
-          console.log(`${LOG_PREFIX} ${countPrefix} 🔄 ${progress.testId}${targetSuffix}`);
+          console.log(`${countPrefix} 🔄 ${progress.testId}${targetSuffix}`);
         }
         break;
       case 'completed': {
         // Pick icon based on verdict: ✅ PASS, ⚠️ FAIL, ❌ ERROR
         const icon = progress.verdict === 'FAIL' ? '⚠️' : progress.verdict === 'ERROR' ? '❌' : '✅';
         console.log(
-          `${LOG_PREFIX} ${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
+          `${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
         );
         break;
       }
       case 'failed': {
         const failIcon = progress.verdict === 'ERROR' ? '❌' : '⚠️';
         console.log(
-          `${LOG_PREFIX} ${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
+          `${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
        );
         break;
       }
@@ -133,7 +130,7 @@
     this.logPaths.push(...newPaths);
 
     for (const p of newPaths) {
-      console.log(`${LOG_PREFIX} Provider log: ${p}`);
+      console.log(`Provider log: ${p}`);
     }
   }
 
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 554cdd306..bbf64e2bb 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -34,12 +34,7 @@ import { writeArtifactsFromResults } from './artifact-writer.js';
 import { writeBenchmarkJson } from './benchmark-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
 import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
-import {
-  LOG_PREFIX,
-  ProgressDisplay,
-  type Verdict,
-  type WorkerProgress,
-} from './progress-display.js';
+import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
 import { buildDefaultRunDir, normalizeExperimentName } from './result-layout.js';
 import {
   buildExclusionFilter,
@@ -699,7 +694,7 @@ async function runSingleEvalFile(params: {
       ? `Using target (${resolvedTargetSelection.targetSource}): ${resolvedTargetSelection.targetName} ${buildTargetLabelSuffix(providerLabel, resolvedTargetSelection.resolvedTarget)} via ${resolvedTargetSelection.targetsFilePath}`
       : `Using target: ${inlineTargetLabel}`;
   if (!progressReporter.isInteractive || options.verbose) {
-    console.log(`${LOG_PREFIX} ${targetMessage}`);
+    console.log(`${targetMessage}`);
   }
 
   const agentTimeoutMs =
@@ -763,7 +758,7 @@
     const targetConfig = resolvedTargetSelection.resolvedTarget.config as Record<string, unknown>;
     if (shouldSkipCacheForTemperature(targetConfig)) {
       if (options.verbose) {
-        console.log(`${LOG_PREFIX} Cache skipped: target temperature > 0`);
+        console.log('Cache skipped: target temperature > 0');
       }
       return false;
     }
@@ -929,16 +924,14 @@ export async function runEvalCommand(
     retryNonErrorResults = await loadNonErrorResults(retryPath);
 
     if (errorIds.length > 0) {
-      console.log(
-        `${LOG_PREFIX} Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`,
-      );
+      console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
     }
     // Use a negation filter to exclude fully-completed (non-error across all targets) cases.
     // This re-runs error cases, cases missing from the output (crash recovery), and cases
     // that errored on some targets even if they succeeded on others (matrix safety).
     if (completedIds.length > 0) {
       options = { ...options, filter: buildExclusionFilter(completedIds) };
-      console.log(`${LOG_PREFIX} Skipping ${completedIds.length} already-completed test(s).`);
+      console.log(`Skipping ${completedIds.length} already-completed test(s).`);
     }
   }
 
@@ -961,7 +954,7 @@
   }
 
   if (options.verbose) {
-    console.log(`${LOG_PREFIX} Repository root: ${repoRoot}`);
+    console.log(`Repository root: ${repoRoot}`);
   }
 
   // Emit deprecation warnings for legacy flags
@@ -1071,18 +1064,18 @@
   // Resolve --export paths (additional output files)
   const resolvedExportPaths = options.exportPaths.map((p: string) => path.resolve(p));
 
-  console.log(`${LOG_PREFIX} Artifact directory: ${runDir}`);
+  console.log(`Artifact directory: ${runDir}`);
   if (resolvedExportPaths.length > 0) {
-    console.log(`${LOG_PREFIX} Export files:`);
+    console.log('Export files:');
     for (const p of resolvedExportPaths) {
-      console.log(`${LOG_PREFIX} ${p}`);
+      console.log(`  ${p}`);
     }
   }
 
   // Log file export paths
   const resolvedTestFiles = input.testFiles.map((file) => path.resolve(file));
   if (options.otelFile) {
-    console.log(`${LOG_PREFIX} OTLP JSON file: ${path.resolve(options.otelFile)}`);
+    console.log(`OTLP JSON file: ${path.resolve(options.otelFile)}`);
   }
 
   // Determine cache state after loading file metadata (need YAML config)
@@ -1161,11 +1154,11 @@
   }
   if (skippedFiles.length > 0 && options.verbose) {
     console.log(
-      `${LOG_PREFIX} Skipped ${skippedFiles.length} eval file(s) by tag filter: ${skippedFiles.join(', ')}`,
+      `Skipped ${skippedFiles.length} eval file(s) by tag filter: ${skippedFiles.join(', ')}`,
     );
   }
   if (fileMetadata.size === 0) {
-    console.log(`${LOG_PREFIX} No eval files matched the tag filters. Nothing to run.`);
+    console.log('No eval files matched the tag filters. Nothing to run.');
     return;
   }
 }
@@ -1185,9 +1178,7 @@
     : undefined;
 
   if (cacheEnabled) {
-    console.log(
-      `${LOG_PREFIX} Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`,
-    );
+    console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`);
   }
 
   // Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
@@ -1223,9 +1214,7 @@
   if (totalEvalCount === 0) {
     // When using --retry-errors, all tests being filtered means no errors or missing cases remain
     if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) {
-      console.log(
-        `${LOG_PREFIX} No execution errors or missing cases in the previous run. Nothing to retry.`,
-      );
+      console.log('No execution errors or missing cases in the previous run. Nothing to retry.');
       return;
     }
     throw new Error('No tests matched the provided filters.');
@@ -1309,7 +1298,7 @@
     transcriptProviderFactory = () => transcriptProvider;
 
     console.log(
-      `${LOG_PREFIX} Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`,
+      `Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`,
     );
   }
 
@@ -1424,7 +1413,7 @@
     }
     allResults.push(...retryNonErrorResults);
     console.log(
-      `${LOG_PREFIX} Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`,
+      `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`,
     );
   }
 
@@ -1447,7 +1436,7 @@
   if (options.benchmarkJson && allResults.length > 0) {
     const benchmarkPath = path.resolve(options.benchmarkJson);
     await writeBenchmarkJson(benchmarkPath, allResults);
-    console.log(`${LOG_PREFIX} Benchmark written to: ${benchmarkPath}`);
+    console.log(`Benchmark written to: ${benchmarkPath}`);
   }
 
   // Write artifacts to the run directory (always, not conditional on flags)
@@ -1462,13 +1451,13 @@
       evalFile,
       experiment: normalizeExperimentName(options.experiment),
     });
-    console.log(`${LOG_PREFIX} Artifact workspace written to: ${runDir}`);
-    console.log(`${LOG_PREFIX} Index: ${indexPath}`);
+    console.log(`Artifact workspace written to: ${runDir}`);
+    console.log(`  Index: ${indexPath}`);
     console.log(
-      `${LOG_PREFIX} Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
+      `  Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
     );
-    console.log(`${LOG_PREFIX} Timing: ${timingPath}`);
-    console.log(`${LOG_PREFIX} Benchmark: ${workspaceBenchmarkPath}`);
+    console.log(`  Timing: ${timingPath}`);
+    console.log(`  Benchmark: ${workspaceBenchmarkPath}`);
   }
 
   // Write --export output files (additional formats)
@@ -1481,7 +1470,7 @@
       await writer.close();
     }
     console.log(
-      `${LOG_PREFIX} Export file(s) written: ${resolvedExportPaths.map((p) => path.relative(cwd, p)).join(', ')}`,
+      `Export file(s) written: ${resolvedExportPaths.map((p) => path.relative(cwd, p)).join(', ')}`,
     );
   }
 
@@ -1492,9 +1481,9 @@
     : resultsWithWorkspaces.filter((r) => r.error || r.score < 0.5);
 
   if (preservedWorkspaces.length > 0) {
-    console.log(`\n${LOG_PREFIX} Preserved workspaces:`);
+    console.log('\nPreserved workspaces:');
     for (const result of preservedWorkspaces) {
-      console.log(`${LOG_PREFIX} ${result.testId} -> ${result.workspacePath}`);
+      console.log(`  ${result.testId} -> ${result.workspacePath}`);
     }
   }
 
@@ -1503,11 +1492,11 @@
     resultsWithWorkspaces.length > 0 ||
     (options.workspaceMode && options.workspaceMode !== 'static');
   if (!options.keepWorkspaces && usedWorkspaces) {
-    console.log(`${LOG_PREFIX} Use --keep-workspaces to preserve all workspaces for inspection.`);
+    console.log('Use --keep-workspaces to preserve all workspaces for inspection.');
   }
 
   if (allResults.length > 0) {
-    console.log(`\n${LOG_PREFIX} Results written to: ${outputPath}`);
+    console.log(`\nResults written to: ${outputPath}`);
 
     // Persist last run path for `agentv results` commands
     await saveRunCache(cwd, outputPath).catch(() => undefined);
@@ -1547,8 +1536,8 @@
     const targetFlag = options.target ? ` --target ${options.target}` : '';
     const relativeOutputPath = path.relative(cwd, outputPath);
     console.log(
-      `\n${LOG_PREFIX} Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:\n` +
-        `${LOG_PREFIX} agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`,
+      `\nTip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:\n` +
+        `  agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`,
     );
   }
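
Reviewer note, appended after the series rather than inside any patch: a
self-contained TypeScript sketch of the progress-line shape the patches
converge on after PATCH 8/8 — icon chosen by verdict severity (✅ PASS,
⚠️ FAIL, ❌ ERROR) and score printed as a whole percentage. The verdict
and score expressions mirror the patched progress-display.ts; renderLine
and the sample test id and target are illustrative assumptions, not code
from the series.

// Sketch only: mirrors the post-series line format in progress-display.ts.
type Verdict = 'PASS' | 'FAIL' | 'ERROR';

// Severity mapping from PATCH 3/8: FAIL ran but scored low (⚠️); ERROR broke (❌).
function iconFor(verdict: Verdict): string {
  return verdict === 'FAIL' ? '⚠️' : verdict === 'ERROR' ? '❌' : '✅';
}

// Percentage formatting from PATCH 1/8: 0.75 prints as "75%", never "0.750".
// renderLine is a hypothetical helper; the real code logs inline per status.
function renderLine(count: string, testId: string, score: number, verdict: Verdict): string {
  const scoreStr = `${Math.round(score * 100)}%`;
  const label = verdict === 'ERROR' ? 'ERROR' : `${scoreStr} ${verdict}`;
  return `${count} ${iconFor(verdict)} ${testId} | ${label}`;
}

// Hypothetical example; the "default → copilot" label comes from PATCH 7/8.
// Prints: [3/12] ⚠️ refund-flow (default → copilot) | 75% FAIL
console.log(renderLine('[3/12]', 'refund-flow (default → copilot)', 0.75, 'FAIL'));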