diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts index 38b8edfc..90b70e6c 100644 --- a/apps/cli/src/commands/eval/progress-display.ts +++ b/apps/cli/src/commands/eval/progress-display.ts @@ -27,7 +27,7 @@ function formatVerdict(score: number | undefined, verdict: Verdict | undefined): if (verdict === undefined) return ''; const colors = useColors(); - const scoreStr = score !== undefined ? score.toFixed(3) : ''; + const scoreStr = score !== undefined ? `${Math.round(score * 100)}%` : ''; const verdictLabel = verdict === 'ERROR' ? 'ERROR' : `${scoreStr} ${verdict}`; if (!colors) return ` | ${verdictLabel}`; @@ -48,7 +48,6 @@ export class ProgressDisplay { private completedTests = 0; private readonly logPaths: string[] = []; private readonly logPathSet = new Set(); - private hasPrintedLogHeader = false; private started = false; private finished = false; private readonly verbose: boolean; @@ -96,20 +95,25 @@ export class ProgressDisplay { console.log(`${countPrefix} 🔄 ${progress.testId}${targetSuffix}`); } break; - case 'completed': + case 'completed': { + // Pick icon based on verdict: ✅ PASS, ⚠️ FAIL, ❌ ERROR + const icon = progress.verdict === 'FAIL' ? '⚠️' : progress.verdict === 'ERROR' ? '❌' : '✅'; console.log( - `${countPrefix} ✅ ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`, + `${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`, ); break; - case 'failed': + } + case 'failed': { + const failIcon = progress.verdict === 'ERROR' ? '❌' : '⚠️'; console.log( - `${countPrefix} ❌ ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? 
`: ${progress.error}` : ''}`, ); break; + } } } - addLogPaths(paths: readonly string[], provider?: 'codex' | 'pi' | 'copilot'): void { + addLogPaths(paths: readonly string[]): void { const newPaths: string[] = []; for (const path of paths) { if (this.logPathSet.has(path)) { @@ -125,22 +129,9 @@ export class ProgressDisplay { this.logPaths.push(...newPaths); - if (!this.hasPrintedLogHeader) { - console.log(''); - const label = - provider === 'pi' - ? 'Pi Coding Agent' - : provider === 'copilot' - ? 'Copilot CLI' - : 'Codex CLI'; - console.log(`${label} logs:`); - this.hasPrintedLogHeader = true; + for (const p of newPaths) { + console.log(`Provider log: ${p}`); } - - const startIndex = this.logPaths.length - newPaths.length; - newPaths.forEach((path, offset) => { - console.log(`${startIndex + offset + 1}. ${path}`); - }); } finish(): void { diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index ada1ae38..bbf64e2b 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -396,7 +396,7 @@ type ProgressReporter = { setTotal(total: number): void; update(workerId: number, progress: WorkerProgress): void; finish(): void; - addLogPaths(paths: readonly string[], provider?: 'codex' | 'pi' | 'copilot'): void; + addLogPaths(paths: readonly string[]): void; }; function createProgressReporter( @@ -411,8 +411,7 @@ function createProgressReporter( update: (workerId: number, progress: WorkerProgress) => display.updateWorker({ ...progress, workerId }), finish: () => display.finish(), - addLogPaths: (paths: readonly string[], provider?: 'codex' | 'pi' | 'copilot') => - display.addLogPaths(paths, provider), + addLogPaths: (paths: readonly string[]) => display.addLogPaths(paths), }; } @@ -420,6 +419,14 @@ function makeTestCaseKey(testFilePath: string, testId: string): string { return `${path.resolve(testFilePath)}::${testId}`; } +/** Show the resolved target name when `default` is a `use_target` 
redirect. */ +function resolveTargetLabel(requestedName: string, resolvedName: string): string { + if (resolvedName !== requestedName) { + return `${requestedName} → ${resolvedName}`; + } + return requestedName; +} + function createDisplayIdTracker(): { getOrAssign(testCaseKey: string): number } { const map = new Map(); let nextId = 1; @@ -579,7 +586,7 @@ async function prepareFileMetadata(params: { selections = multiSelections.map((sel) => ({ selection: sel, - inlineTargetLabel: sel.targetName, + inlineTargetLabel: resolveTargetLabel(sel.targetName, sel.resolvedTarget.name), })); } else { // Single target mode (legacy path) @@ -599,7 +606,10 @@ async function prepareFileMetadata(params: { selections = [ { selection, - inlineTargetLabel: selection.targetName, + inlineTargetLabel: resolveTargetLabel( + selection.targetName, + selection.resolvedTarget.name, + ), }, ]; } @@ -684,7 +694,7 @@ async function runSingleEvalFile(params: { ? `Using target (${resolvedTargetSelection.targetSource}): ${resolvedTargetSelection.targetName} ${buildTargetLabelSuffix(providerLabel, resolvedTargetSelection.resolvedTarget)} via ${resolvedTargetSelection.targetsFilePath}` : `Using target: ${inlineTargetLabel}`; if (!progressReporter.isInteractive || options.verbose) { - console.log(targetMessage); + console.log(`${targetMessage}`); } const agentTimeoutMs = @@ -1220,7 +1230,7 @@ export async function runEvalCommand( return; } seenCodexLogPaths.add(entry.filePath); - progressReporter.addLogPaths([entry.filePath], 'codex'); + progressReporter.addLogPaths([entry.filePath]); }); const seenPiLogPaths = new Set(); const unsubscribePiLogs = subscribeToPiLogEntries((entry) => { @@ -1228,7 +1238,7 @@ export async function runEvalCommand( return; } seenPiLogPaths.add(entry.filePath); - progressReporter.addLogPaths([entry.filePath], 'pi'); + progressReporter.addLogPaths([entry.filePath]); }); const seenCopilotLogPaths = new Set(); const unsubscribeCopilotSdkLogs = 
subscribeToCopilotSdkLogEntries((entry) => { @@ -1236,14 +1246,14 @@ export async function runEvalCommand( return; } seenCopilotLogPaths.add(entry.filePath); - progressReporter.addLogPaths([entry.filePath], 'copilot'); + progressReporter.addLogPaths([entry.filePath]); }); const unsubscribeCopilotCliLogs = subscribeToCopilotCliLogEntries((entry) => { if (!entry.filePath || seenCopilotLogPaths.has(entry.filePath)) { return; } seenCopilotLogPaths.add(entry.filePath); - progressReporter.addLogPaths([entry.filePath], 'copilot'); + progressReporter.addLogPaths([entry.filePath]); }); for (const [testFilePath, meta] of fileMetadata.entries()) { for (const { selection, inlineTargetLabel } of meta.selections) { @@ -1364,7 +1374,9 @@ export async function runEvalCommand( // before_all or other setup failures should not abort the entire run. // Mark all tests in this file as errors and continue with other files. const message = fileError instanceof Error ? fileError.message : String(fileError); - console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`); + console.error( + `\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`, + ); const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({ timestamp: new Date().toISOString(), testId: testCase.id, diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index 1d819949..51901c84 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -167,12 +167,12 @@ function extractOutputPath(stdout: string): string { const lines = stdout.split(/\r?\n/); // Try new format first, then legacy const outputLine = - lines.find((line) => line.startsWith('Results written to:')) ?? - lines.find((line) => line.startsWith('Output path:')); + lines.find((line) => line.includes('Results written to:')) ?? 
+ lines.find((line) => line.includes('Output path:')); if (!outputLine) { throw new Error(`Unable to parse output path from CLI output:\n${stdout}`); } - return outputLine.replace(/^(Results written to:|Output path:)/, '').trim(); + return outputLine.replace(/^.*?(Results written to:|Output path:)/, '').trim(); } async function readJsonLines(filePath: string): Promise {