From 634d19d46e3721ed29d5d146b4fe0815e3ee2342 Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 05:35:07 +0000
Subject: [PATCH 1/8] feat(cli): add [INFO] log prefix, fix verdict icons,
 show score as percentage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add [INFO] prefix to all eval CLI output lines so framework messages
are distinguishable from interleaved provider logs (Copilot, Codex,
Pi).

Fix verdict icons: completed tests with FAIL verdict now show ❌
instead of ✅, and ERROR shows ⚠️.

Show scores as percentages (75%) instead of decimals (0.750).

Closes #1073

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../cli/src/commands/eval/progress-display.ts | 26 +++++---
 apps/cli/src/commands/eval/run-eval.ts        | 60 +++++++++----------
 apps/cli/src/commands/eval/statistics.ts      |  5 +-
 apps/cli/test/eval.integration.test.ts        |  6 +-
 4 files changed, 53 insertions(+), 44 deletions(-)

diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts
index 38b8edfc9..b30ca7d8c 100644
--- a/apps/cli/src/commands/eval/progress-display.ts
+++ b/apps/cli/src/commands/eval/progress-display.ts
@@ -1,5 +1,8 @@
 export type Verdict = 'PASS' | 'FAIL' | 'ERROR';
 
+/** Prefix for all AgentV framework log lines, making them distinguishable from provider output. */
+export const LOG_PREFIX = '[INFO]';
+
 export interface WorkerProgress {
   workerId: number;
   testId: string;
@@ -27,7 +30,7 @@ function formatVerdict(score: number | undefined, verdict: Verdict | undefined):
   if (verdict === undefined) return '';
 
   const colors = useColors();
-  const scoreStr = score !== undefined ? score.toFixed(3) : '';
+  const scoreStr = score !== undefined ? `${Math.round(score * 100)}%` : '';
   const verdictLabel = verdict === 'ERROR' ? 'ERROR' : `${scoreStr} ${verdict}`;
 
   if (!colors) return ` | ${verdictLabel}`;
@@ -87,25 +90,30 @@
       case 'pending':
         // Only print pending in verbose mode (just shows the queue)
         if (this.verbose && !previous) {
-          console.log(`${countPrefix} ⏳ ${progress.testId}${targetSuffix}`);
+          console.log(`${LOG_PREFIX} ${countPrefix} ⏳ ${progress.testId}${targetSuffix}`);
         }
         break;
       case 'running':
         // Always print running - useful feedback for long-running agents
         if (!previous || previous.status === 'pending') {
-          console.log(`${countPrefix} 🔄 ${progress.testId}${targetSuffix}`);
+          console.log(`${LOG_PREFIX} ${countPrefix} 🔄 ${progress.testId}${targetSuffix}`);
         }
         break;
-      case 'completed':
+      case 'completed': {
+        // Pick icon based on verdict: ✅ PASS, ❌ FAIL, ⚠️ ERROR
+        const icon = progress.verdict === 'FAIL' ? '❌' : progress.verdict === 'ERROR' ? '⚠️' : '✅';
         console.log(
-          `${countPrefix} ✅ ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
+          `${LOG_PREFIX} ${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
         );
         break;
-      case 'failed':
+      }
+      case 'failed': {
+        const failIcon = progress.verdict === 'ERROR' ? '⚠️' : '❌';
         console.log(
-          `${countPrefix} ❌ ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
+          `${LOG_PREFIX} ${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
        );
         break;
+      }
     }
   }
 
@@ -133,13 +141,13 @@
           : provider === 'copilot'
             ? 'Copilot CLI'
            : 'Codex CLI';
-      console.log(`${label} logs:`);
+      console.log(`${LOG_PREFIX} ${label} logs:`);
       this.hasPrintedLogHeader = true;
     }
 
     const startIndex = this.logPaths.length - newPaths.length;
     newPaths.forEach((path, offset) => {
-      console.log(`${startIndex + offset + 1}. ${path}`);
+      console.log(`${LOG_PREFIX} ${startIndex + offset + 1}. ${path}`);
     });
   }
 
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index ada1ae382..bce5b4b77 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -34,7 +34,7 @@ import { writeArtifactsFromResults } from './artifact-writer.js';
 import { writeBenchmarkJson } from './benchmark-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
 import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
-import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
+import { LOG_PREFIX, ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
 import { buildDefaultRunDir, normalizeExperimentName } from './result-layout.js';
 import {
   buildExclusionFilter,
@@ -684,7 +684,7 @@ async function runSingleEvalFile(params: {
       ? `Using target (${resolvedTargetSelection.targetSource}): ${resolvedTargetSelection.targetName} ${buildTargetLabelSuffix(providerLabel, resolvedTargetSelection.resolvedTarget)} via ${resolvedTargetSelection.targetsFilePath}`
       : `Using target: ${inlineTargetLabel}`;
   if (!progressReporter.isInteractive || options.verbose) {
-    console.log(targetMessage);
+    console.log(`${LOG_PREFIX} ${targetMessage}`);
   }
 
   const agentTimeoutMs =
@@ -748,7 +748,7 @@
     const targetConfig = resolvedTargetSelection.resolvedTarget.config as Record<string, unknown>;
     if (shouldSkipCacheForTemperature(targetConfig)) {
       if (options.verbose) {
-        console.log('Cache skipped: target temperature > 0');
+        console.log(`${LOG_PREFIX} Cache skipped: target temperature > 0`);
       }
       return false;
     }
@@ -914,14 +914,14 @@ export async function runEvalCommand(
     retryNonErrorResults = await loadNonErrorResults(retryPath);
 
     if (errorIds.length > 0) {
-      console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
+      console.log(`${LOG_PREFIX} Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
     }
     // Use a negation filter to exclude fully-completed (non-error across all targets) cases.
     // This re-runs error cases, cases missing from the output (crash recovery), and cases
     // that errored on some targets even if they succeeded on others (matrix safety).
     if (completedIds.length > 0) {
       options = { ...options, filter: buildExclusionFilter(completedIds) };
-      console.log(`Skipping ${completedIds.length} already-completed test(s).`);
+      console.log(`${LOG_PREFIX} Skipping ${completedIds.length} already-completed test(s).`);
     }
   }
 
@@ -944,7 +944,7 @@
   }
 
   if (options.verbose) {
-    console.log(`Repository root: ${repoRoot}`);
+    console.log(`${LOG_PREFIX} Repository root: ${repoRoot}`);
   }
 
   // Emit deprecation warnings for legacy flags
@@ -1054,18 +1054,18 @@
   // Resolve --export paths (additional output files)
   const resolvedExportPaths = options.exportPaths.map((p: string) => path.resolve(p));
 
-  console.log(`Artifact directory: ${runDir}`);
+  console.log(`${LOG_PREFIX} Artifact directory: ${runDir}`);
   if (resolvedExportPaths.length > 0) {
-    console.log('Export files:');
+    console.log(`${LOG_PREFIX} Export files:`);
     for (const p of resolvedExportPaths) {
-      console.log(`  ${p}`);
+      console.log(`${LOG_PREFIX} ${p}`);
     }
   }
 
   // Log file export paths
   const resolvedTestFiles = input.testFiles.map((file) => path.resolve(file));
   if (options.otelFile) {
-    console.log(`OTLP JSON file: ${path.resolve(options.otelFile)}`);
+    console.log(`${LOG_PREFIX} OTLP JSON file: ${path.resolve(options.otelFile)}`);
   }
 
   // Determine cache state after loading file metadata (need YAML config)
@@ -1144,11 +1144,11 @@
   }
   if (skippedFiles.length > 0 && options.verbose) {
     console.log(
-      `Skipped ${skippedFiles.length} eval file(s) by tag filter: ${skippedFiles.join(', ')}`,
+      `${LOG_PREFIX} Skipped ${skippedFiles.length} eval file(s) by tag filter: ${skippedFiles.join(', ')}`,
     );
   }
   if (fileMetadata.size === 0) {
-    console.log('No eval files matched the tag filters. Nothing to run.');
+    console.log(`${LOG_PREFIX} No eval files matched the tag filters. Nothing to run.`);
     return;
   }
 }
@@ -1168,7 +1168,7 @@
     : undefined;
 
   if (cacheEnabled) {
-    console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`);
+    console.log(`${LOG_PREFIX} Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`);
   }
 
   // Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
@@ -1204,7 +1204,7 @@
   if (totalEvalCount === 0) {
     // When using --retry-errors, all tests being filtered means no errors or missing cases remain
     if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) {
-      console.log('No execution errors or missing cases in the previous run. Nothing to retry.');
+      console.log(`${LOG_PREFIX} No execution errors or missing cases in the previous run. Nothing to retry.`);
       return;
     }
     throw new Error('No tests matched the provided filters.');
@@ -1288,7 +1288,7 @@
     transcriptProviderFactory = () => transcriptProvider;
 
     console.log(
-      `Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`,
+      `${LOG_PREFIX} Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`,
     );
   }
 
@@ -1364,7 +1364,7 @@
       // before_all or other setup failures should not abort the entire run.
       // Mark all tests in this file as errors and continue with other files.
       const message = fileError instanceof Error ? fileError.message : String(fileError);
-      console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`);
+      console.error(`\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`);
       const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({
         timestamp: new Date().toISOString(),
         testId: testCase.id,
@@ -1401,7 +1401,7 @@
     }
     allResults.push(...retryNonErrorResults);
     console.log(
-      `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`,
+      `${LOG_PREFIX} Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`,
     );
   }
 
@@ -1424,7 +1424,7 @@
   if (options.benchmarkJson && allResults.length > 0) {
     const benchmarkPath = path.resolve(options.benchmarkJson);
     await writeBenchmarkJson(benchmarkPath, allResults);
-    console.log(`Benchmark written to: ${benchmarkPath}`);
+    console.log(`${LOG_PREFIX} Benchmark written to: ${benchmarkPath}`);
   }
 
   // Write artifacts to the run directory (always, not conditional on flags)
@@ -1439,13 +1439,13 @@
       evalFile,
       experiment: normalizeExperimentName(options.experiment),
     });
-    console.log(`Artifact workspace written to: ${runDir}`);
-    console.log(`  Index: ${indexPath}`);
+    console.log(`${LOG_PREFIX} Artifact workspace written to: ${runDir}`);
+    console.log(`${LOG_PREFIX} Index: ${indexPath}`);
     console.log(
-      `  Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
+      `${LOG_PREFIX} Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
     );
-    console.log(`  Timing: ${timingPath}`);
-    console.log(`  Benchmark: ${workspaceBenchmarkPath}`);
+    console.log(`${LOG_PREFIX} Timing: ${timingPath}`);
+    console.log(`${LOG_PREFIX} Benchmark: ${workspaceBenchmarkPath}`);
   }
 
   // Write --export output files (additional formats)
@@ -1458,7 +1458,7 @@
       await writer.close();
     }
     console.log(
-      `Export file(s) written: ${resolvedExportPaths.map((p) => path.relative(cwd, p)).join(', ')}`,
+      `${LOG_PREFIX} Export file(s) written: ${resolvedExportPaths.map((p) => path.relative(cwd, p)).join(', ')}`,
    );
   }
 
@@ -1469,9 +1469,9 @@
     : resultsWithWorkspaces.filter((r) => r.error || r.score < 0.5);
 
   if (preservedWorkspaces.length > 0) {
-    console.log('\nPreserved workspaces:');
+    console.log(`\n${LOG_PREFIX} Preserved workspaces:`);
     for (const result of preservedWorkspaces) {
-      console.log(`  ${result.testId} -> ${result.workspacePath}`);
+      console.log(`${LOG_PREFIX} ${result.testId} -> ${result.workspacePath}`);
     }
   }
 
@@ -1480,11 +1480,11 @@
     resultsWithWorkspaces.length > 0 ||
     (options.workspaceMode && options.workspaceMode !== 'static');
   if (!options.keepWorkspaces && usedWorkspaces) {
-    console.log('Use --keep-workspaces to preserve all workspaces for inspection.');
+    console.log(`${LOG_PREFIX} Use --keep-workspaces to preserve all workspaces for inspection.`);
   }
 
   if (allResults.length > 0) {
-    console.log(`\nResults written to: ${outputPath}`);
+    console.log(`\n${LOG_PREFIX} Results written to: ${outputPath}`);
 
     // Persist last run path for `agentv results` commands
     await saveRunCache(cwd, outputPath).catch(() => undefined);
@@ -1524,8 +1524,8 @@
     const targetFlag = options.target ? ` --target ${options.target}` : '';
     const relativeOutputPath = path.relative(cwd, outputPath);
     console.log(
-      `\nTip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:\n` +
-        `  agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`,
+      `\n${LOG_PREFIX} Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:\n` +
+        `${LOG_PREFIX} agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`,
     );
   }
 
diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts
index 2d584c55d..7765114b4 100644
--- a/apps/cli/src/commands/eval/statistics.ts
+++ b/apps/cli/src/commands/eval/statistics.ts
@@ -1,4 +1,5 @@
 import type { EvaluationResult } from '@agentv/core';
+import { LOG_PREFIX } from './progress-display.js';
 
 export interface HistogramBin {
   readonly range: readonly [number, number];
@@ -296,7 +297,7 @@ export function formatEvaluationSummary(
     }
   }
 
-  return lines.join('\n');
+  return lines.map((line) => (line === '' ? '' : `${LOG_PREFIX} ${line}`)).join('\n');
 }
 
 /**
@@ -359,5 +360,5 @@ export function formatMatrixSummary(results: readonly EvaluationResult[]): strin
   });
   lines.push(`${'Average'.padEnd(testIdColWidth)} ${avgCells.join(' ')}`);
 
-  return lines.join('\n');
+  return lines.map((line) => (line === '' ? '' : `${LOG_PREFIX} ${line}`)).join('\n');
 }
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index 1d8199494..51901c844 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -167,12 +167,12 @@ function extractOutputPath(stdout: string): string {
   const lines = stdout.split(/\r?\n/);
   // Try new format first, then legacy
   const outputLine =
-    lines.find((line) => line.startsWith('Results written to:')) ??
-    lines.find((line) => line.startsWith('Output path:'));
+    lines.find((line) => line.includes('Results written to:')) ??
+    lines.find((line) => line.includes('Output path:'));
   if (!outputLine) {
     throw new Error(`Unable to parse output path from CLI output:\n${stdout}`);
   }
-  return outputLine.replace(/^(Results written to:|Output path:)/, '').trim();
+  return outputLine.replace(/^.*?(Results written to:|Output path:)/, '').trim();
 }
 
 async function readJsonLines(filePath: string): Promise<unknown[]> {

From 025497b0ba8fb697ecf3e5b1ee2935a1d3269ae9 Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 05:36:03 +0000
Subject: [PATCH 2/8] style: fix biome formatting in run-eval.ts

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/eval/run-eval.ts | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index bce5b4b77..e4933b5d2 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -34,7 +34,12 @@ import { writeArtifactsFromResults } from './artifact-writer.js';
 import { writeBenchmarkJson } from './benchmark-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
 import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
-import { LOG_PREFIX, ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
+import {
+  LOG_PREFIX,
+  ProgressDisplay,
+  type Verdict,
+  type WorkerProgress,
+} from './progress-display.js';
 import { buildDefaultRunDir, normalizeExperimentName } from './result-layout.js';
 import {
   buildExclusionFilter,
@@ -914,7 +919,9 @@ export async function runEvalCommand(
     retryNonErrorResults = await loadNonErrorResults(retryPath);
 
     if (errorIds.length > 0) {
-      console.log(`${LOG_PREFIX} Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
+      console.log(
+        `${LOG_PREFIX} Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`,
+      );
     }
     // Use a negation filter to exclude fully-completed (non-error across all targets) cases.
     // This re-runs error cases, cases missing from the output (crash recovery), and cases
@@ -1168,7 +1175,9 @@
     : undefined;
 
   if (cacheEnabled) {
-    console.log(`${LOG_PREFIX} Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`);
+    console.log(
+      `${LOG_PREFIX} Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`,
+    );
   }
 
   // Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
@@ -1204,7 +1213,9 @@
   if (totalEvalCount === 0) {
     // When using --retry-errors, all tests being filtered means no errors or missing cases remain
     if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) {
-      console.log(`${LOG_PREFIX} No execution errors or missing cases in the previous run. Nothing to retry.`);
+      console.log(
+        `${LOG_PREFIX} No execution errors or missing cases in the previous run. Nothing to retry.`,
+      );
       return;
     }
     throw new Error('No tests matched the provided filters.');
@@ -1364,7 +1375,9 @@
       // before_all or other setup failures should not abort the entire run.
      // Mark all tests in this file as errors and continue with other files.
       const message = fileError instanceof Error ? fileError.message : String(fileError);
-      console.error(`\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`);
+      console.error(
+        `\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`,
+      );
       const errorResults: EvaluationResult[] = applicableTestCases.map((testCase) => ({
         timestamp: new Date().toISOString(),
         testId: testCase.id,

From 763a946542f28cf9e7eeed0596a360f8c5e798ef Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 05:39:22 +0000
Subject: =?UTF-8?q?fix:=20swap=20verdict=20icons=20=E2=80=94?=
 =?UTF-8?q?=20=E2=9A=A0=EF=B8=8F=20for=20FAIL,=20=E2=9D=8C=20for=20ERROR?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

FAIL is a quality warning (test ran but scored below threshold). ERROR
is a hard failure (execution broke). Icons now match severity.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/eval/progress-display.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts
index b30ca7d8c..516de38f6 100644
--- a/apps/cli/src/commands/eval/progress-display.ts
+++ b/apps/cli/src/commands/eval/progress-display.ts
@@ -100,15 +100,15 @@
         }
         break;
       case 'completed': {
-        // Pick icon based on verdict: ✅ PASS, ❌ FAIL, ⚠️ ERROR
-        const icon = progress.verdict === 'FAIL' ? '❌' : progress.verdict === 'ERROR' ? '⚠️' : '✅';
+        // Pick icon based on verdict: ✅ PASS, ⚠️ FAIL, ❌ ERROR
+        const icon = progress.verdict === 'FAIL' ? '⚠️' : progress.verdict === 'ERROR' ? '❌' : '✅';
         console.log(
           `${LOG_PREFIX} ${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
         );
         break;
       }
       case 'failed': {
-        const failIcon = progress.verdict === 'ERROR' ? '⚠️' : '❌';
+        const failIcon = progress.verdict === 'ERROR' ? '❌' : '⚠️';
         console.log(
           `${LOG_PREFIX} ${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
         );

From 1cf29227d840641ae2ac8adb45086c49984fa178 Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 05:41:35 +0000
Subject: [PATCH 4/8] refactor: remove [INFO] prefix from summary blocks

The evaluation summary and matrix table are already delimited with
===== borders and print as a single block after all tests complete.
The prefix is only needed for progress lines interleaved with provider
output mid-run.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/eval/statistics.ts | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts
index 7765114b4..2d584c55d 100644
--- a/apps/cli/src/commands/eval/statistics.ts
+++ b/apps/cli/src/commands/eval/statistics.ts
@@ -1,5 +1,4 @@
 import type { EvaluationResult } from '@agentv/core';
-import { LOG_PREFIX } from './progress-display.js';
 
 export interface HistogramBin {
   readonly range: readonly [number, number];
@@ -297,7 +296,7 @@ export function formatEvaluationSummary(
     }
   }
 
-  return lines.map((line) => (line === '' ? '' : `${LOG_PREFIX} ${line}`)).join('\n');
+  return lines.join('\n');
 }
 
 /**
@@ -360,5 +359,5 @@ export function formatMatrixSummary(results: readonly EvaluationResult[]): strin
   });
   lines.push(`${'Average'.padEnd(testIdColWidth)} ${avgCells.join(' ')}`);
 
-  return lines.map((line) => (line === '' ? '' : `${LOG_PREFIX} ${line}`)).join('\n');
+  return lines.join('\n');
 }

From 1789d05ee9df5f48ea7c6876091a34fb2db6653d Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 05:55:19 +0000
Subject: [PATCH 5/8] refactor: replace log heading with per-line "Log
 created:" prefix

The "Copilot CLI logs:" heading lost context when interleaved with
provider output. Each log path now prints as its own self-contained
line:

  [INFO] Log created: /path/to/log

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../cli/src/commands/eval/progress-display.ts | 20 +++-----------------
 apps/cli/src/commands/eval/run-eval.ts        | 13 ++++++------
 2 files changed, 9 insertions(+), 24 deletions(-)

diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts
index 516de38f6..753eafadd 100644
--- a/apps/cli/src/commands/eval/progress-display.ts
+++ b/apps/cli/src/commands/eval/progress-display.ts
@@ -51,7 +51,6 @@
   private completedTests = 0;
   private readonly logPaths: string[] = [];
   private readonly logPathSet = new Set<string>();
-  private hasPrintedLogHeader = false;
   private started = false;
   private finished = false;
   private readonly verbose: boolean;
@@ -117,7 +116,7 @@
     }
   }
 
-  addLogPaths(paths: readonly string[], provider?: 'codex' | 'pi' | 'copilot'): void {
+  addLogPaths(paths: readonly string[]): void {
     const newPaths: string[] = [];
     for (const path of paths) {
       if (this.logPathSet.has(path)) {
@@ -133,22 +132,9 @@
 
     this.logPaths.push(...newPaths);
 
-    if (!this.hasPrintedLogHeader) {
-      console.log('');
-      const label =
-        provider === 'pi'
-          ? 'Pi Coding Agent'
-          : provider === 'copilot'
-            ? 'Copilot CLI'
-            : 'Codex CLI';
-      console.log(`${LOG_PREFIX} ${label} logs:`);
-      this.hasPrintedLogHeader = true;
+    for (const p of newPaths) {
+      console.log(`${LOG_PREFIX} Log created: ${p}`);
     }
-
-    const startIndex = this.logPaths.length - newPaths.length;
-    newPaths.forEach((path, offset) => {
-      console.log(`${LOG_PREFIX} ${startIndex + offset + 1}. ${path}`);
-    });
   }
 
   finish(): void {
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index e4933b5d2..e82ec225f 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -401,7 +401,7 @@ type ProgressReporter = {
   setTotal(total: number): void;
   update(workerId: number, progress: WorkerProgress): void;
   finish(): void;
-  addLogPaths(paths: readonly string[], provider?: 'codex' | 'pi' | 'copilot'): void;
+  addLogPaths(paths: readonly string[]): void;
 };
 
 function createProgressReporter(
@@ -416,8 +416,7 @@ function createProgressReporter(
     update: (workerId: number, progress: WorkerProgress) =>
       display.updateWorker({ ...progress, workerId }),
     finish: () => display.finish(),
-    addLogPaths: (paths: readonly string[], provider?: 'codex' | 'pi' | 'copilot') =>
-      display.addLogPaths(paths, provider),
+    addLogPaths: (paths: readonly string[]) => display.addLogPaths(paths),
   };
 }
 
@@ -1231,7 +1230,7 @@ export async function runEvalCommand(
       return;
     }
     seenCodexLogPaths.add(entry.filePath);
-    progressReporter.addLogPaths([entry.filePath], 'codex');
+    progressReporter.addLogPaths([entry.filePath]);
   });
   const seenPiLogPaths = new Set<string>();
   const unsubscribePiLogs = subscribeToPiLogEntries((entry) => {
@@ -1239,7 +1238,7 @@
      return;
     }
     seenPiLogPaths.add(entry.filePath);
-    progressReporter.addLogPaths([entry.filePath], 'pi');
+    progressReporter.addLogPaths([entry.filePath]);
   });
   const seenCopilotLogPaths = new Set<string>();
   const unsubscribeCopilotSdkLogs = subscribeToCopilotSdkLogEntries((entry) => {
@@ -1247,14 +1246,14 @@
       return;
     }
     seenCopilotLogPaths.add(entry.filePath);
-    progressReporter.addLogPaths([entry.filePath], 'copilot');
+    progressReporter.addLogPaths([entry.filePath]);
   });
   const unsubscribeCopilotCliLogs = subscribeToCopilotCliLogEntries((entry) => {
     if (!entry.filePath || seenCopilotLogPaths.has(entry.filePath)) {
       return;
     }
     seenCopilotLogPaths.add(entry.filePath);
-    progressReporter.addLogPaths([entry.filePath], 'copilot');
+    progressReporter.addLogPaths([entry.filePath]);
   });
   for (const [testFilePath, meta] of fileMetadata.entries()) {
     for (const { selection, inlineTargetLabel } of meta.selections) {

From cf502ad715fdb12249029dba909df1ec01294ec3 Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 06:17:05 +0000
Subject: [PATCH 6/8] fix: rename "Log created" to "Provider log" for clarity

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/eval/progress-display.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts
index 753eafadd..f9fc1fd6b 100644
--- a/apps/cli/src/commands/eval/progress-display.ts
+++ b/apps/cli/src/commands/eval/progress-display.ts
@@ -133,7 +133,7 @@
     this.logPaths.push(...newPaths);
 
     for (const p of newPaths) {
-      console.log(`${LOG_PREFIX} Log created: ${p}`);
+      console.log(`${LOG_PREFIX} Provider log: ${p}`);
     }
   }
 

From f4460527d7503507be37e7686b5cf33bfd6911df Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 06:20:16 +0000
Subject: [PATCH 7/8] feat(cli): show resolved target name when default is a
 use_target redirect
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When "default" in targets.yaml delegates via use_target (e.g., to
"copilot"), the progress lines now show "default → copilot" instead of
just "default", so the user sees which provider is actually running.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/eval/run-eval.ts | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index e82ec225f..554cdd306 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -424,6 +424,14 @@ function makeTestCaseKey(testFilePath: string, testId: string): string {
   return `${path.resolve(testFilePath)}::${testId}`;
 }
 
+/** Show the resolved target name when `default` is a `use_target` redirect. */
+function resolveTargetLabel(requestedName: string, resolvedName: string): string {
+  if (resolvedName !== requestedName) {
+    return `${requestedName} → ${resolvedName}`;
+  }
+  return requestedName;
+}
+
 function createDisplayIdTracker(): { getOrAssign(testCaseKey: string): number } {
   const map = new Map<string, number>();
   let nextId = 1;
@@ -583,7 +591,7 @@ async function prepareFileMetadata(params: {
 
     selections = multiSelections.map((sel) => ({
       selection: sel,
-      inlineTargetLabel: sel.targetName,
+      inlineTargetLabel: resolveTargetLabel(sel.targetName, sel.resolvedTarget.name),
     }));
   } else {
     // Single target mode (legacy path)
@@ -603,7 +611,10 @@ async function prepareFileMetadata(params: {
     selections = [
       {
         selection,
-        inlineTargetLabel: selection.targetName,
+        inlineTargetLabel: resolveTargetLabel(
+          selection.targetName,
+          selection.resolvedTarget.name,
+        ),
       },
     ];
   }

From 3c438483a7b4a8b6eeaedc7a6e6462a3b0f03664 Mon Sep 17 00:00:00 2001
From: Christopher
Date: Mon, 13 Apr 2026 06:32:12 +0000
Subject: =?UTF-8?q?refactor:=20remove=20[INFO]=20prefix=20?=
 =?UTF-8?q?=E2=80=94=20icons=20and=20resolved=20target=20are=20sufficient?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With verdict icons (✅/⚠️/❌), percentages, and resolved target names
(default → copilot), every line is already self-describing. The [INFO]
prefix was just noise.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../cli/src/commands/eval/progress-display.ts | 13 ++--
 apps/cli/src/commands/eval/run-eval.ts        | 69 ++++++++-----------
 2 files changed, 34 insertions(+), 48 deletions(-)

diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts
index f9fc1fd6b..90b70e6cc 100644
--- a/apps/cli/src/commands/eval/progress-display.ts
+++ b/apps/cli/src/commands/eval/progress-display.ts
@@ -1,8 +1,5 @@
 export type Verdict = 'PASS' | 'FAIL' | 'ERROR';
 
-/** Prefix for all AgentV framework log lines, making them distinguishable from provider output. */
-export const LOG_PREFIX = '[INFO]';
-
 export interface WorkerProgress {
   workerId: number;
   testId: string;
@@ -89,27 +86,27 @@
       case 'pending':
         // Only print pending in verbose mode (just shows the queue)
         if (this.verbose && !previous) {
-          console.log(`${LOG_PREFIX} ${countPrefix} ⏳ ${progress.testId}${targetSuffix}`);
+          console.log(`${countPrefix} ⏳ ${progress.testId}${targetSuffix}`);
         }
         break;
       case 'running':
         // Always print running - useful feedback for long-running agents
         if (!previous || previous.status === 'pending') {
-          console.log(`${LOG_PREFIX} ${countPrefix} 🔄 ${progress.testId}${targetSuffix}`);
+          console.log(`${countPrefix} 🔄 ${progress.testId}${targetSuffix}`);
         }
         break;
       case 'completed': {
         // Pick icon based on verdict: ✅ PASS, ⚠️ FAIL, ❌ ERROR
         const icon = progress.verdict === 'FAIL' ? '⚠️' : progress.verdict === 'ERROR' ? '❌' : '✅';
         console.log(
-          `${LOG_PREFIX} ${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
+          `${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
         );
         break;
       }
       case 'failed': {
         const failIcon = progress.verdict === 'ERROR' ? '❌' : '⚠️';
         console.log(
-          `${LOG_PREFIX} ${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
+          `${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
        );
         break;
       }
@@ -133,7 +130,7 @@
     this.logPaths.push(...newPaths);
 
     for (const p of newPaths) {
-      console.log(`${LOG_PREFIX} Provider log: ${p}`);
+      console.log(`Provider log: ${p}`);
     }
   }
 
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 554cdd306..bbf64e2bb 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -34,12 +34,7 @@ import { writeArtifactsFromResults } from './artifact-writer.js';
 import { writeBenchmarkJson } from './benchmark-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
 import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
-import {
-  LOG_PREFIX,
-  ProgressDisplay,
-  type Verdict,
-  type WorkerProgress,
-} from './progress-display.js';
+import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
 import { buildDefaultRunDir, normalizeExperimentName } from './result-layout.js';
 import {
   buildExclusionFilter,
@@ -699,7 +694,7 @@ async function runSingleEvalFile(params: {
       ? `Using target (${resolvedTargetSelection.targetSource}): ${resolvedTargetSelection.targetName} ${buildTargetLabelSuffix(providerLabel, resolvedTargetSelection.resolvedTarget)} via ${resolvedTargetSelection.targetsFilePath}`
       : `Using target: ${inlineTargetLabel}`;
   if (!progressReporter.isInteractive || options.verbose) {
-    console.log(`${LOG_PREFIX} ${targetMessage}`);
+    console.log(`${targetMessage}`);
   }
 
   const agentTimeoutMs =
@@ -763,7 +758,7 @@
     const targetConfig = resolvedTargetSelection.resolvedTarget.config as Record<string, unknown>;
     if (shouldSkipCacheForTemperature(targetConfig)) {
       if (options.verbose) {
-        console.log(`${LOG_PREFIX} Cache skipped: target temperature > 0`);
+        console.log('Cache skipped: target temperature > 0');
       }
       return false;
     }
@@ -929,16 +924,14 @@ export async function runEvalCommand(
     retryNonErrorResults = await loadNonErrorResults(retryPath);
 
     if (errorIds.length > 0) {
-      console.log(
-        `${LOG_PREFIX} Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`,
-      );
+      console.log(`Found ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
     }
     // Use a negation filter to exclude fully-completed (non-error across all targets) cases.
     // This re-runs error cases, cases missing from the output (crash recovery), and cases
     // that errored on some targets even if they succeeded on others (matrix safety).
     if (completedIds.length > 0) {
       options = { ...options, filter: buildExclusionFilter(completedIds) };
-      console.log(`${LOG_PREFIX} Skipping ${completedIds.length} already-completed test(s).`);
+      console.log(`Skipping ${completedIds.length} already-completed test(s).`);
     }
   }
 
@@ -961,7 +954,7 @@
   }
 
   if (options.verbose) {
-    console.log(`${LOG_PREFIX} Repository root: ${repoRoot}`);
+    console.log(`Repository root: ${repoRoot}`);
   }
 
   // Emit deprecation warnings for legacy flags
@@ -1071,18 +1064,18 @@
   // Resolve --export paths (additional output files)
   const resolvedExportPaths = options.exportPaths.map((p: string) => path.resolve(p));
 
-  console.log(`${LOG_PREFIX} Artifact directory: ${runDir}`);
+  console.log(`Artifact directory: ${runDir}`);
   if (resolvedExportPaths.length > 0) {
-    console.log(`${LOG_PREFIX} Export files:`);
+    console.log('Export files:');
     for (const p of resolvedExportPaths) {
-      console.log(`${LOG_PREFIX} ${p}`);
+      console.log(`  ${p}`);
     }
   }
 
   // Log file export paths
   const resolvedTestFiles = input.testFiles.map((file) => path.resolve(file));
   if (options.otelFile) {
-    console.log(`${LOG_PREFIX} OTLP JSON file: ${path.resolve(options.otelFile)}`);
+    console.log(`OTLP JSON file: ${path.resolve(options.otelFile)}`);
   }
 
   // Determine cache state after loading file metadata (need YAML config)
@@ -1161,11 +1154,11 @@
   }
   if (skippedFiles.length > 0 && options.verbose) {
     console.log(
-      `${LOG_PREFIX} Skipped ${skippedFiles.length} eval file(s) by tag filter: ${skippedFiles.join(', ')}`,
+      `Skipped ${skippedFiles.length} eval file(s) by tag filter: ${skippedFiles.join(', ')}`,
     );
   }
   if (fileMetadata.size === 0) {
-    console.log(`${LOG_PREFIX} No eval files matched the tag filters. Nothing to run.`);
+    console.log('No eval files matched the tag filters. Nothing to run.');
     return;
   }
 }
@@ -1185,9 +1178,7 @@
     : undefined;
 
   if (cacheEnabled) {
-    console.log(
-      `${LOG_PREFIX} Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`,
-    );
+    console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ''}`);
   }
 
   // Resolve suite-level threshold: CLI --threshold takes precedence over YAML execution.threshold.
@@ -1223,9 +1214,7 @@
   if (totalEvalCount === 0) {
     // When using --retry-errors, all tests being filtered means no errors or missing cases remain
     if (options.retryErrors && retryNonErrorResults && retryNonErrorResults.length > 0) {
-      console.log(
-        `${LOG_PREFIX} No execution errors or missing cases in the previous run. Nothing to retry.`,
-      );
+      console.log('No execution errors or missing cases in the previous run. Nothing to retry.');
       return;
     }
     throw new Error('No tests matched the provided filters.');
@@ -1309,7 +1298,7 @@
     transcriptProviderFactory = () => transcriptProvider;
 
     console.log(
-      `${LOG_PREFIX} Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`,
+      `Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`,
     );
   }
 
@@ -1424,7 +1413,7 @@
     }
     allResults.push(...retryNonErrorResults);
     console.log(
-      `${LOG_PREFIX} Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`,
+      `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`,
     );
   }
 
@@ -1447,7 +1436,7 @@
   if (options.benchmarkJson && allResults.length > 0) {
     const benchmarkPath = path.resolve(options.benchmarkJson);
     await writeBenchmarkJson(benchmarkPath, allResults);
-    console.log(`${LOG_PREFIX} Benchmark written to: ${benchmarkPath}`);
+    console.log(`Benchmark written to: ${benchmarkPath}`);
   }
 
   // Write artifacts to the run directory (always, not conditional on flags)
@@ -1462,13 +1451,13 @@
       evalFile,
       experiment: normalizeExperimentName(options.experiment),
     });
-    console.log(`${LOG_PREFIX} Artifact workspace written to: ${runDir}`);
-    console.log(`${LOG_PREFIX} Index: ${indexPath}`);
+    console.log(`Artifact workspace written to: ${runDir}`);
+    console.log(`  Index: ${indexPath}`);
     console.log(
-      `${LOG_PREFIX} Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
+      `  Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`,
     );
-    console.log(`${LOG_PREFIX} Timing: ${timingPath}`);
-    console.log(`${LOG_PREFIX} Benchmark: ${workspaceBenchmarkPath}`);
+    console.log(`  Timing: ${timingPath}`);
+    console.log(`  Benchmark: ${workspaceBenchmarkPath}`);
   }
 
   // Write --export output files (additional formats)
@@ -1481,7 +1470,7 @@
       await writer.close();
     }
     console.log(
-      `${LOG_PREFIX} Export file(s) written: ${resolvedExportPaths.map((p) => path.relative(cwd, p)).join(', ')}`,
+      `Export file(s) written: ${resolvedExportPaths.map((p) => path.relative(cwd, p)).join(', ')}`,
     );
   }
 
@@ -1492,9 +1481,9 @@
     : resultsWithWorkspaces.filter((r) => r.error || r.score < 0.5);
 
   if (preservedWorkspaces.length > 0) {
-    console.log(`\n${LOG_PREFIX} Preserved workspaces:`);
+    console.log('\nPreserved workspaces:');
     for (const result of preservedWorkspaces) {
-      console.log(`${LOG_PREFIX} ${result.testId} -> ${result.workspacePath}`);
+      console.log(`  ${result.testId} -> ${result.workspacePath}`);
     }
   }
 
@@ -1503,11 +1492,11 @@
     resultsWithWorkspaces.length > 0 ||
     (options.workspaceMode && options.workspaceMode !== 'static');
   if (!options.keepWorkspaces && usedWorkspaces) {
-    console.log(`${LOG_PREFIX} Use --keep-workspaces to preserve all workspaces for inspection.`);
+    console.log('Use --keep-workspaces to preserve all workspaces for inspection.');
   }
 
   if (allResults.length > 0) {
-    console.log(`\n${LOG_PREFIX} Results written to: ${outputPath}`);
+    console.log(`\nResults written to: ${outputPath}`);
 
     // Persist last run path for `agentv results` commands
     await saveRunCache(cwd, outputPath).catch(() => undefined);
@@ -1547,8 +1536,8 @@
     const targetFlag = options.target ? ` --target ${options.target}` : '';
     const relativeOutputPath = path.relative(cwd, outputPath);
     console.log(
-      `\n${LOG_PREFIX} Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:\n` +
-        `${LOG_PREFIX} agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`,
+      `\nTip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:\n` +
+        `  agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath}`,
     );
   }
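
Reviewer note, appended after the series rather than inside any patch: a
self-contained TypeScript sketch of the progress-line shape the patches
converge on after PATCH 8/8 — icon chosen by verdict severity (✅ PASS,
⚠️ FAIL, ❌ ERROR) and score printed as a whole percentage. The verdict
and score expressions mirror the patched progress-display.ts; renderLine
and the sample test id and target are illustrative assumptions, not code
from the series.

// Sketch only: mirrors the post-series line format in progress-display.ts.
type Verdict = 'PASS' | 'FAIL' | 'ERROR';

// Severity mapping from PATCH 3/8: FAIL ran but scored low (⚠️); ERROR broke (❌).
function iconFor(verdict: Verdict): string {
  return verdict === 'FAIL' ? '⚠️' : verdict === 'ERROR' ? '❌' : '✅';
}

// Percentage formatting from PATCH 1/8: 0.75 prints as "75%", never "0.750".
// renderLine is a hypothetical helper; the real code logs inline per status.
function renderLine(count: string, testId: string, score: number, verdict: Verdict): string {
  const scoreStr = `${Math.round(score * 100)}%`;
  const label = verdict === 'ERROR' ? 'ERROR' : `${scoreStr} ${verdict}`;
  return `${count} ${iconFor(verdict)} ${testId} | ${label}`;
}

// Hypothetical example; the "default → copilot" label comes from PATCH 7/8.
// Prints: [3/12] ⚠️ refund-flow (default → copilot) | 75% FAIL
console.log(renderLine('[3/12]', 'refund-flow (default → copilot)', 0.75, 'FAIL'));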