Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
26a1950
feat(results): export remote runs and sync Studio
christso Apr 9, 2026
9c52d6c
fix(results): sync follow-up polish
christso Apr 9, 2026
275ea36
fix(results): satisfy remote repo exports and lint
christso Apr 9, 2026
7221f6a
fix(pipeline): use DEFAULT_THRESHOLD in bench export summary instead …
christso Apr 9, 2026
2af3f08
fix(studio): context-aware source toolbar and empty states
christso Apr 9, 2026
ec86828
docs(studio): add remote results docs and optimized screenshots
christso Apr 9, 2026
d2548e8
docs(agents): register image-compress-and-docs skill in AGENTS.md
christso Apr 9, 2026
18c7a2a
feat(skills): add image-compress-and-docs skill
christso Apr 9, 2026
f268ec7
docs(agents): fix skill location path in AGENTS.md
christso Apr 9, 2026
b9c66d8
feat(plugins): add agentv-self plugin with image-compress-and-docs skill
christso Apr 9, 2026
18e3774
fix(studio): replace Status column with ERR badge in score cell on ex…
christso Apr 9, 2026
ad0df1d
fix(studio): show ERR badge instead of 0% score on execution errors; …
christso Apr 9, 2026
d554083
feat(studio): cleaner run list and detail UI inspired by Convex Evals
christso Apr 9, 2026
e4041ff
feat(core): add category field to eval YAML schema; update screenshots
christso Apr 9, 2026
3a914c6
chore: apply biome formatting to eval-schema.json
christso Apr 9, 2026
3357265
fix(studio): move date to last column, rename to When
christso Apr 9, 2026
b50718d
docs(studio): update screenshots with When column and category breakdown
christso Apr 9, 2026
32292cc
fix(studio): solid pill style for pass-rate badges
christso Apr 9, 2026
e83ada3
fix(studio): muted pill style for pass-rate badges
christso Apr 9, 2026
8f1f06a
docs(studio): update screenshots with muted pill style
christso Apr 9, 2026
c3ca550
feat(studio): progress-bar pill with fill + text inside
christso Apr 9, 2026
9d298f8
feat(studio): compact inline stats bar matching table width; update s…
christso Apr 9, 2026
ea83509
feat(studio): experiment·target heading with muted metadata subheadin…
christso Apr 9, 2026
2d5eebb
fix(studio): use single blue fill for pass-rate pills
christso Apr 9, 2026
0627be5
feat(studio): gradient blue pills; muted stat colors; emoji tabs
christso Apr 9, 2026
07242cd
feat(studio): show project name in heading; drop Source column from r…
christso Apr 9, 2026
8296384
feat(studio): add Passed/Failed/Total columns to run list
christso Apr 9, 2026
836a4a5
fix(studio): restore semantic colors for stats and status dots; add P…
christso Apr 9, 2026
bd1f3d9
feat(studio): extract PassRatePill; use pill everywhere; Evals column…
christso Apr 9, 2026
3996958
docs(studio): update all three screenshots
christso Apr 9, 2026
cb32942
fix(studio): use unambiguous date format (09 Apr 2026); update screen…
christso Apr 9, 2026
a615522
fix(studio): use en-AU locale for day-first date format (09 Apr 2026)
christso Apr 9, 2026
3b350e3
fix(studio): manually format date as DD MMM YYYY (guaranteed day-first)
christso Apr 9, 2026
ce01810
fix(studio): match Convex date format — relative time for same day, l…
christso Apr 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -497,3 +497,4 @@ bun run promote:latest 2.18.0

## Python Scripts
When running Python scripts, always use: `uv run <script.py>`

48 changes: 48 additions & 0 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import path from 'node:path';
import { pathToFileURL } from 'node:url';

import {
DEFAULT_THRESHOLD,
type EvalTest,
type EvaluationCache,
type EvaluationResult,
Expand All @@ -28,6 +29,7 @@ import {
} from '@agentv/core';

import { enforceRequiredVersion } from '../../version-check.js';
import { maybeAutoExportRunArtifacts } from '../results/remote.js';
import { writeArtifactsFromResults } from './artifact-writer.js';
import { writeBenchmarkJson } from './benchmark-writer.js';
import { loadEnvFromHierarchy } from './env.js';
Expand Down Expand Up @@ -858,6 +860,11 @@ export interface RunEvalResult {
readonly allExecutionErrors?: boolean;
}

/**
 * Accumulator pairing an eval file (path relative to cwd) with the evaluation
 * results collected for it across the run. `results` is intentionally mutable:
 * when the same eval file is processed again, new results are pushed into the
 * existing entry before the per-file summaries are exported.
 */
interface RemoteEvalSummaryInput {
  // Eval file path relative to cwd (see `path.relative(cwd, testFilePath)` at the call site).
  readonly evalFile: string;
  // Mutable on purpose — merged in place via `existingSummary.results.push(...)`.
  readonly results: EvaluationResult[];
}

export async function runEvalCommand(
input: RunEvalCommandInput,
): Promise<RunEvalResult | undefined> {
Expand Down Expand Up @@ -1077,6 +1084,7 @@ export async function runEvalCommand(
// We defer cache creation until after file metadata is loaded
const evaluationRunner = await resolveEvaluationRunner();
const allResults: EvaluationResult[] = [];
const remoteEvalSummaries: RemoteEvalSummaryInput[] = [];
const seenTestCases = new Set<string>();
const displayIdTracker = createDisplayIdTracker();

Expand Down Expand Up @@ -1352,6 +1360,18 @@ export async function runEvalCommand(
threshold: resolvedThreshold,
providerFactory: transcriptProviderFactory,
});
const evalFile = path.relative(cwd, testFilePath);
const existingSummary = remoteEvalSummaries.find(
(summary) => summary.evalFile === evalFile,
);
if (existingSummary) {
existingSummary.results.push(...result.results);
} else {
remoteEvalSummaries.push({
evalFile,
results: [...result.results],
});
}

return result.results;
} catch (fileError) {
Expand Down Expand Up @@ -1472,6 +1492,34 @@ export async function runEvalCommand(

// Persist last run path for `agentv results` commands
await saveRunCache(cwd, outputPath).catch(() => undefined);

await maybeAutoExportRunArtifacts({
cwd,
run_dir: runDir,
test_files: activeTestFiles,
results: allResults,
eval_summaries: remoteEvalSummaries.map((summary) => ({
eval_file: summary.evalFile,
total: summary.results.length,
passed: summary.results.filter((result) => result.score >= DEFAULT_THRESHOLD).length,
avg_score:
summary.results.length > 0
? summary.results.reduce((sum, result) => sum + result.score, 0) /
summary.results.length
: 0,
results: summary.results.map((result) => ({
test_id: result.testId,
score: result.score,
status:
result.executionStatus === 'execution_error' || result.error
? 'ERROR'
: result.score >= DEFAULT_THRESHOLD
? 'PASS'
: 'FAIL',
})),
})),
experiment: normalizeExperimentName(options.experiment),
});
}

// Suggest retry-errors command when execution errors are detected
Expand Down
17 changes: 11 additions & 6 deletions apps/cli/src/commands/inspect/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -569,12 +569,7 @@ function collectRunManifestPaths(
}
}

/**
* Enumerate canonical run manifests in `.agentv/results/runs/`.
*/
export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {
const runsDir = path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME);

export function listResultFilesFromRunsDir(runsDir: string, limit?: number): ResultFileMeta[] {
const files: { filePath: string; displayName: string; runId: string }[] = [];

try {
Expand Down Expand Up @@ -626,6 +621,16 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {
return metas;
}

/**
 * Enumerate canonical run manifests in `.agentv/results/runs/`.
 *
 * Thin convenience wrapper: resolves the runs directory under `cwd` and
 * delegates the actual enumeration to {@link listResultFilesFromRunsDir}.
 */
export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {
  const runsDir = path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME);
  return listResultFilesFromRunsDir(runsDir, limit);
}

/**
* Extract ISO timestamp from eval filename like eval_2026-02-20T21-38-05-833Z.jsonl
*/
Expand Down
45 changes: 45 additions & 0 deletions apps/cli/src/commands/pipeline/bench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ import { join } from 'node:path';

import { command, positional, string } from 'cmd-ts';

import { DEFAULT_THRESHOLD, type EvaluationResult } from '@agentv/core';
import { maybeAutoExportRunArtifacts } from '../results/remote.js';

interface EvaluatorScore {
readonly name: string;
readonly type: string;
Expand Down Expand Up @@ -223,6 +226,48 @@ export const evalBenchCommand = command({
);

console.log(`Benchmark: ${testIds.length} test(s), pass_rate=${passRateStats.mean}`);

const results = indexLines.map((line) => JSON.parse(line)) as Array<{
test_id: string;
score: number;
execution_status?: string;
target?: string;
timestamp?: string;
}>;
await maybeAutoExportRunArtifacts({
cwd: process.cwd(),
run_dir: exportDir,
experiment,
test_files: manifest.eval_file ? [manifest.eval_file] : [],
results: results.map((result) => ({
testId: result.test_id,
score: result.score,
executionStatus: result.execution_status,
target: result.target,
timestamp: result.timestamp,
})) as EvaluationResult[],
eval_summaries: [
{
eval_file: manifest.eval_file ?? 'pipeline',
total: results.length,
passed: results.filter((result) => result.score >= DEFAULT_THRESHOLD).length,
avg_score:
results.length > 0
? results.reduce((sum, result) => sum + result.score, 0) / results.length
: 0,
results: results.map((result) => ({
test_id: result.test_id,
score: result.score,
status:
result.execution_status === 'execution_error'
? 'ERROR'
: result.score >= DEFAULT_THRESHOLD
? 'PASS'
: 'FAIL',
})),
},
],
});
},
});

Expand Down
Loading
Loading