Skip to content
35 changes: 13 additions & 22 deletions apps/cli/src/commands/eval/progress-display.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ function formatVerdict(score: number | undefined, verdict: Verdict | undefined):
if (verdict === undefined) return '';

const colors = useColors();
const scoreStr = score !== undefined ? score.toFixed(3) : '';
const scoreStr = score !== undefined ? `${Math.round(score * 100)}%` : '';
const verdictLabel = verdict === 'ERROR' ? 'ERROR' : `${scoreStr} ${verdict}`;

if (!colors) return ` | ${verdictLabel}`;
Expand All @@ -48,7 +48,6 @@ export class ProgressDisplay {
private completedTests = 0;
private readonly logPaths: string[] = [];
private readonly logPathSet = new Set<string>();
private hasPrintedLogHeader = false;
private started = false;
private finished = false;
private readonly verbose: boolean;
Expand Down Expand Up @@ -96,20 +95,25 @@ export class ProgressDisplay {
console.log(`${countPrefix} 🔄 ${progress.testId}${targetSuffix}`);
}
break;
case 'completed':
case 'completed': {
// Pick icon based on verdict: ✅ PASS, ⚠️ FAIL, ❌ ERROR
const icon = progress.verdict === 'FAIL' ? '⚠️' : progress.verdict === 'ERROR' ? '❌' : '✅';
console.log(
`${countPrefix} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
`${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
);
break;
case 'failed':
}
case 'failed': {
const failIcon = progress.verdict === 'ERROR' ? '❌' : '⚠️';
console.log(
`${countPrefix} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
`${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
);
break;
}
}
}

addLogPaths(paths: readonly string[], provider?: 'codex' | 'pi' | 'copilot'): void {
addLogPaths(paths: readonly string[]): void {
const newPaths: string[] = [];
for (const path of paths) {
if (this.logPathSet.has(path)) {
Expand All @@ -125,22 +129,9 @@ export class ProgressDisplay {

this.logPaths.push(...newPaths);

if (!this.hasPrintedLogHeader) {
console.log('');
const label =
provider === 'pi'
? 'Pi Coding Agent'
: provider === 'copilot'
? 'Copilot CLI'
: 'Codex CLI';
console.log(`${label} logs:`);
this.hasPrintedLogHeader = true;
for (const p of newPaths) {
console.log(`Provider log: ${p}`);
}

const startIndex = this.logPaths.length - newPaths.length;
newPaths.forEach((path, offset) => {
console.log(`${startIndex + offset + 1}. ${path}`);
});
}

finish(): void {
Expand Down
34 changes: 23 additions & 11 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ type ProgressReporter = {
setTotal(total: number): void;
update(workerId: number, progress: WorkerProgress): void;
finish(): void;
addLogPaths(paths: readonly string[], provider?: 'codex' | 'pi' | 'copilot'): void;
addLogPaths(paths: readonly string[]): void;
};

function createProgressReporter(
Expand All @@ -411,15 +411,22 @@ function createProgressReporter(
update: (workerId: number, progress: WorkerProgress) =>
display.updateWorker({ ...progress, workerId }),
finish: () => display.finish(),
addLogPaths: (paths: readonly string[], provider?: 'codex' | 'pi' | 'copilot') =>
display.addLogPaths(paths, provider),
addLogPaths: (paths: readonly string[]) => display.addLogPaths(paths),
};
}

/**
 * Builds the lookup key for a test case.
 *
 * The file path is resolved to an absolute path first, so the same file
 * referenced through different relative paths produces the same key.
 *
 * @param testFilePath - Path to the eval file (relative or absolute).
 * @param testId - Identifier of the test within that file.
 * @returns `<absolute file path>::<testId>`.
 */
function makeTestCaseKey(testFilePath: string, testId: string): string {
  const absoluteFilePath = path.resolve(testFilePath);
  return absoluteFilePath + '::' + testId;
}

/**
 * Builds the label displayed for a target.
 *
 * When the resolved name differs from the requested one (e.g. `default` is a
 * `use_target` redirect), both are shown as `requested → resolved`; otherwise
 * the requested name is returned unchanged.
 *
 * @param requestedName - Target name as requested.
 * @param resolvedName - Name of the target it resolved to.
 * @returns The label to display.
 */
function resolveTargetLabel(requestedName: string, resolvedName: string): string {
  const isRedirect = resolvedName !== requestedName;
  return isRedirect ? `${requestedName} → ${resolvedName}` : requestedName;
}

function createDisplayIdTracker(): { getOrAssign(testCaseKey: string): number } {
const map = new Map<string, number>();
let nextId = 1;
Expand Down Expand Up @@ -579,7 +586,7 @@ async function prepareFileMetadata(params: {

selections = multiSelections.map((sel) => ({
selection: sel,
inlineTargetLabel: sel.targetName,
inlineTargetLabel: resolveTargetLabel(sel.targetName, sel.resolvedTarget.name),
}));
} else {
// Single target mode (legacy path)
Expand All @@ -599,7 +606,10 @@ async function prepareFileMetadata(params: {
selections = [
{
selection,
inlineTargetLabel: selection.targetName,
inlineTargetLabel: resolveTargetLabel(
selection.targetName,
selection.resolvedTarget.name,
),
},
];
}
Expand Down Expand Up @@ -684,7 +694,7 @@ async function runSingleEvalFile(params: {
? `Using target (${resolvedTargetSelection.targetSource}): ${resolvedTargetSelection.targetName} ${buildTargetLabelSuffix(providerLabel, resolvedTargetSelection.resolvedTarget)} via ${resolvedTargetSelection.targetsFilePath}`
: `Using target: ${inlineTargetLabel}`;
if (!progressReporter.isInteractive || options.verbose) {
console.log(targetMessage);
console.log(`${targetMessage}`);
}

const agentTimeoutMs =
Expand Down Expand Up @@ -1220,30 +1230,30 @@ export async function runEvalCommand(
return;
}
seenCodexLogPaths.add(entry.filePath);
progressReporter.addLogPaths([entry.filePath], 'codex');
progressReporter.addLogPaths([entry.filePath]);
});
const seenPiLogPaths = new Set<string>();
const unsubscribePiLogs = subscribeToPiLogEntries((entry) => {
if (!entry.filePath || seenPiLogPaths.has(entry.filePath)) {
return;
}
seenPiLogPaths.add(entry.filePath);
progressReporter.addLogPaths([entry.filePath], 'pi');
progressReporter.addLogPaths([entry.filePath]);
});
const seenCopilotLogPaths = new Set<string>();
const unsubscribeCopilotSdkLogs = subscribeToCopilotSdkLogEntries((entry) => {
if (!entry.filePath || seenCopilotLogPaths.has(entry.filePath)) {
return;
}
seenCopilotLogPaths.add(entry.filePath);
progressReporter.addLogPaths([entry.filePath], 'copilot');
progressReporter.addLogPaths([entry.filePath]);
});
const unsubscribeCopilotCliLogs = subscribeToCopilotCliLogEntries((entry) => {
if (!entry.filePath || seenCopilotLogPaths.has(entry.filePath)) {
return;
}
seenCopilotLogPaths.add(entry.filePath);
progressReporter.addLogPaths([entry.filePath], 'copilot');
progressReporter.addLogPaths([entry.filePath]);
});
for (const [testFilePath, meta] of fileMetadata.entries()) {
for (const { selection, inlineTargetLabel } of meta.selections) {
Expand Down Expand Up @@ -1364,7 +1374,9 @@ export async function runEvalCommand(
// before_all or other setup failures should not abort the entire run.
// Mark all tests in this file as errors and continue with other files.
const message = fileError instanceof Error ? fileError.message : String(fileError);
console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`);
console.error(
`\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`,
);
const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({
timestamp: new Date().toISOString(),
testId: testCase.id,
Expand Down
6 changes: 3 additions & 3 deletions apps/cli/test/eval.integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -167,12 +167,12 @@ function extractOutputPath(stdout: string): string {
const lines = stdout.split(/\r?\n/);
// Try new format first, then legacy
const outputLine =
lines.find((line) => line.startsWith('Results written to:')) ??
lines.find((line) => line.startsWith('Output path:'));
lines.find((line) => line.includes('Results written to:')) ??
lines.find((line) => line.includes('Output path:'));
if (!outputLine) {
throw new Error(`Unable to parse output path from CLI output:\n${stdout}`);
}
return outputLine.replace(/^(Results written to:|Output path:)/, '').trim();
return outputLine.replace(/^.*?(Results written to:|Output path:)/, '').trim();
}

async function readJsonLines(filePath: string): Promise<readonly unknown[]> {
Expand Down
Loading