diff --git a/AGENTS.md b/AGENTS.md index b97b917ca..a437cd20f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -497,3 +497,4 @@ bun run promote:latest 2.18.0 ## Python Scripts When running Python scripts, always use: `uv run ` + diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 7dc02d77d..b7ad4ffa1 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -4,6 +4,7 @@ import path from 'node:path'; import { pathToFileURL } from 'node:url'; import { + DEFAULT_THRESHOLD, type EvalTest, type EvaluationCache, type EvaluationResult, @@ -28,6 +29,7 @@ import { } from '@agentv/core'; import { enforceRequiredVersion } from '../../version-check.js'; +import { maybeAutoExportRunArtifacts } from '../results/remote.js'; import { writeArtifactsFromResults } from './artifact-writer.js'; import { writeBenchmarkJson } from './benchmark-writer.js'; import { loadEnvFromHierarchy } from './env.js'; @@ -858,6 +860,11 @@ export interface RunEvalResult { readonly allExecutionErrors?: boolean; } +interface RemoteEvalSummaryInput { + readonly evalFile: string; + readonly results: EvaluationResult[]; +} + export async function runEvalCommand( input: RunEvalCommandInput, ): Promise { @@ -1077,6 +1084,7 @@ export async function runEvalCommand( // We defer cache creation until after file metadata is loaded const evaluationRunner = await resolveEvaluationRunner(); const allResults: EvaluationResult[] = []; + const remoteEvalSummaries: RemoteEvalSummaryInput[] = []; const seenTestCases = new Set(); const displayIdTracker = createDisplayIdTracker(); @@ -1352,6 +1360,18 @@ export async function runEvalCommand( threshold: resolvedThreshold, providerFactory: transcriptProviderFactory, }); + const evalFile = path.relative(cwd, testFilePath); + const existingSummary = remoteEvalSummaries.find( + (summary) => summary.evalFile === evalFile, + ); + if (existingSummary) { + existingSummary.results.push(...result.results); 
+ } else { + remoteEvalSummaries.push({ + evalFile, + results: [...result.results], + }); + } return result.results; } catch (fileError) { @@ -1472,6 +1492,34 @@ export async function runEvalCommand( // Persist last run path for `agentv results` commands await saveRunCache(cwd, outputPath).catch(() => undefined); + + await maybeAutoExportRunArtifacts({ + cwd, + run_dir: runDir, + test_files: activeTestFiles, + results: allResults, + eval_summaries: remoteEvalSummaries.map((summary) => ({ + eval_file: summary.evalFile, + total: summary.results.length, + passed: summary.results.filter((result) => result.score >= DEFAULT_THRESHOLD).length, + avg_score: + summary.results.length > 0 + ? summary.results.reduce((sum, result) => sum + result.score, 0) / + summary.results.length + : 0, + results: summary.results.map((result) => ({ + test_id: result.testId, + score: result.score, + status: + result.executionStatus === 'execution_error' || result.error + ? 'ERROR' + : result.score >= DEFAULT_THRESHOLD + ? 'PASS' + : 'FAIL', + })), + })), + experiment: normalizeExperimentName(options.experiment), + }); } // Suggest retry-errors command when execution errors are detected diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts index 0d9689153..3592d34e8 100644 --- a/apps/cli/src/commands/inspect/utils.ts +++ b/apps/cli/src/commands/inspect/utils.ts @@ -569,12 +569,7 @@ function collectRunManifestPaths( } } -/** - * Enumerate canonical run manifests in `.agentv/results/runs/`. 
- */ -export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { - const runsDir = path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME); - +export function listResultFilesFromRunsDir(runsDir: string, limit?: number): ResultFileMeta[] { const files: { filePath: string; displayName: string; runId: string }[] = []; try { @@ -626,6 +621,16 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { return metas; } +/** + * Enumerate canonical run manifests in `.agentv/results/runs/`. + */ +export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] { + return listResultFilesFromRunsDir( + path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME), + limit, + ); +} + /** * Extract ISO timestamp from eval filename like eval_2026-02-20T21-38-05-833Z.jsonl */ diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index cb4bcc4ad..6686c4d61 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -15,6 +15,9 @@ import { join } from 'node:path'; import { command, positional, string } from 'cmd-ts'; +import { DEFAULT_THRESHOLD, type EvaluationResult } from '@agentv/core'; +import { maybeAutoExportRunArtifacts } from '../results/remote.js'; + interface EvaluatorScore { readonly name: string; readonly type: string; @@ -223,6 +226,48 @@ export const evalBenchCommand = command({ ); console.log(`Benchmark: ${testIds.length} test(s), pass_rate=${passRateStats.mean}`); + + const results = indexLines.map((line) => JSON.parse(line)) as Array<{ + test_id: string; + score: number; + execution_status?: string; + target?: string; + timestamp?: string; + }>; + await maybeAutoExportRunArtifacts({ + cwd: process.cwd(), + run_dir: exportDir, + experiment, + test_files: manifest.eval_file ? 
[manifest.eval_file] : [], + results: results.map((result) => ({ + testId: result.test_id, + score: result.score, + executionStatus: result.execution_status, + target: result.target, + timestamp: result.timestamp, + })) as EvaluationResult[], + eval_summaries: [ + { + eval_file: manifest.eval_file ?? 'pipeline', + total: results.length, + passed: results.filter((result) => result.score >= DEFAULT_THRESHOLD).length, + avg_score: + results.length > 0 + ? results.reduce((sum, result) => sum + result.score, 0) / results.length + : 0, + results: results.map((result) => ({ + test_id: result.test_id, + score: result.score, + status: + result.execution_status === 'execution_error' + ? 'ERROR' + : result.score >= DEFAULT_THRESHOLD + ? 'PASS' + : 'FAIL', + })), + }, + ], + }); }, }); diff --git a/apps/cli/src/commands/results/remote.ts b/apps/cli/src/commands/results/remote.ts new file mode 100644 index 000000000..f87c47a80 --- /dev/null +++ b/apps/cli/src/commands/results/remote.ts @@ -0,0 +1,323 @@ +import path from 'node:path'; + +import { + DEFAULT_THRESHOLD, + type EvaluationResult, + type ResultsExportConfig, + type ResultsRepoStatus, + commitAndPushResultsBranch, + createDraftResultsPr, + directorySizeBytes, + getResultsRepoStatus, + loadConfig, + prepareResultsRepoBranch, + resolveResultsRepoRunsDir, + stageResultsArtifacts, + syncResultsRepo, +} from '@agentv/core'; + +import { findRepoRoot } from '../eval/shared.js'; +import { + type ResultFileMeta, + listResultFiles, + listResultFilesFromRunsDir, +} from '../inspect/utils.js'; + +export type RunSource = 'local' | 'remote'; + +export interface SourcedResultFileMeta extends ResultFileMeta { + readonly source: RunSource; + readonly raw_filename: string; +} + +export interface RemoteEvalSummary { + readonly eval_file: string; + readonly total: number; + readonly passed: number; + readonly avg_score: number; + readonly results: Array<{ + readonly test_id: string; + readonly score: number; + readonly status: 'PASS' | 
'FAIL' | 'ERROR'; + }>; +} + +export interface RemoteExportPayload { + readonly cwd: string; + readonly run_dir: string; + readonly test_files: readonly string[]; + readonly results: readonly EvaluationResult[]; + readonly eval_summaries: readonly RemoteEvalSummary[]; + readonly experiment?: string; +} + +export interface RemoteResultsStatus extends ResultsRepoStatus { + readonly run_count: number; +} + +const REMOTE_RUN_PREFIX = 'remote::'; +const SIZE_WARNING_BYTES = 10 * 1024 * 1024; + +function getStatusMessage(error: unknown): string { + return error instanceof Error ? error.message : String(error); +} + +function normalizeResultsExportConfig(config: ResultsExportConfig): Required { + return { + repo: config.repo, + path: config.path, + auto_push: config.auto_push === true, + branch_prefix: config.branch_prefix?.trim() || 'eval-results', + }; +} + +function statusForResult(result: EvaluationResult): 'PASS' | 'FAIL' | 'ERROR' { + if (result.executionStatus === 'execution_error' || result.error) { + return 'ERROR'; + } + return result.score >= DEFAULT_THRESHOLD ? 'PASS' : 'FAIL'; +} + +function slugify(value: string): string { + return value + .trim() + .replace(/[^A-Za-z0-9._/-]+/g, '-') + .replace(/\/+/g, '/') + .replace(/^-+|-+$/g, '') + .slice(0, 120); +} + +function getRelativeRunPath(cwd: string, runDir: string): string { + const relative = path.relative(path.join(cwd, '.agentv', 'results', 'runs'), runDir); + if (!relative.startsWith('..') && !path.isAbsolute(relative)) { + return relative; + } + + const experiment = path.basename(path.dirname(runDir)); + const runName = path.basename(runDir); + return experiment && experiment !== runName ? path.join(experiment, runName) : runName; +} + +function buildBranchName( + config: Required, + payload: RemoteExportPayload, +): string { + const timestamp = path.basename(payload.run_dir); + const evalStem = + payload.test_files.length === 1 + ? 
path + .basename(payload.test_files[0]) + .replace(/\.eval\.ya?ml$/i, '') + .replace(/\.[^.]+$/i, '') + : `${payload.test_files.length}-evals`; + const experiment = slugify(payload.experiment ?? 'default'); + const branchLeaf = slugify(`${experiment}-${evalStem}-${timestamp}`) || timestamp; + return `${config.branch_prefix}/${branchLeaf}`; +} + +function buildCommitTitle(payload: RemoteExportPayload): string { + const passed = payload.results.filter((result) => result.score >= DEFAULT_THRESHOLD).length; + const avgScore = + payload.results.length > 0 + ? payload.results.reduce((sum, result) => sum + result.score, 0) / payload.results.length + : 0; + const experiment = payload.experiment ?? 'default'; + return `feat(results): ${experiment} - ${passed}/${payload.results.length} PASS (${avgScore.toFixed(3)})`; +} + +function buildPrBody(payload: RemoteExportPayload): string { + const sections = payload.eval_summaries + .map((summary) => { + const table = summary.results + .map((result) => `| ${result.test_id} | ${result.score.toFixed(3)} | ${result.status} |`) + .join('\n'); + return [ + `### ${summary.eval_file}`, + '', + `Summary: ${summary.passed}/${summary.total} PASS (${summary.avg_score.toFixed(3)})`, + '', + '| Test | Score | Status |', + '|---|---|---|', + table || '| (no results) | 0.000 | ERROR |', + ].join('\n'); + }) + .join('\n\n'); + + return [ + '## Results', + '', + sections, + '', + `Run: ${path.basename(payload.run_dir)}`, + `Experiment: ${payload.experiment ?? 'default'}`, + `Eval Files: ${payload.test_files.join(', ')}`, + ].join('\n'); +} + +async function maybeWarnLargeArtifact(runDir: string): Promise { + const sizeBytes = await directorySizeBytes(runDir); + if (sizeBytes > SIZE_WARNING_BYTES) { + console.warn( + `Warning: run artifacts total ${(sizeBytes / (1024 * 1024)).toFixed(1)}MB. 
Export will continue.`, + ); + } +} + +async function loadNormalizedResultsConfig( + cwd: string, +): Promise | undefined> { + const repoRoot = (await findRepoRoot(cwd)) ?? cwd; + const config = await loadConfig(path.join(cwd, '_'), repoRoot); + if (!config?.results?.export) { + return undefined; + } + return normalizeResultsExportConfig(config.results.export); +} + +export function encodeRemoteRunId(filename: string): string { + return `${REMOTE_RUN_PREFIX}${filename}`; +} + +export function isRemoteRunId(filename: string): boolean { + return filename.startsWith(REMOTE_RUN_PREFIX); +} + +export function decodeRemoteRunId(filename: string): string { + return filename.replace(REMOTE_RUN_PREFIX, ''); +} + +export async function getRemoteResultsStatus(cwd: string): Promise { + const config = await loadNormalizedResultsConfig(cwd); + const status = getResultsRepoStatus(config); + const runCount = + config && status.available + ? listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).length + : 0; + return { + ...status, + run_count: runCount, + }; +} + +export async function syncRemoteResults(cwd: string): Promise { + const config = await loadNormalizedResultsConfig(cwd); + if (!config) { + return { + ...getResultsRepoStatus(), + run_count: 0, + }; + } + + try { + await syncResultsRepo(config); + } catch (error) { + return { + ...getResultsRepoStatus(config), + run_count: 0, + last_error: getStatusMessage(error), + }; + } + + return getRemoteResultsStatus(cwd); +} + +export async function listMergedResultFiles( + cwd: string, + limit?: number, +): Promise<{ runs: SourcedResultFileMeta[]; remote_status: RemoteResultsStatus }> { + const localRuns = listResultFiles(cwd).map( + (meta) => + ({ + ...meta, + source: 'local' as const, + raw_filename: meta.filename, + }) satisfies SourcedResultFileMeta, + ); + + const remoteStatus = await getRemoteResultsStatus(cwd); + const config = await loadNormalizedResultsConfig(cwd); + if (!config || !remoteStatus.available) { + 
return { + runs: limit !== undefined && limit > 0 ? localRuns.slice(0, limit) : localRuns, + remote_status: remoteStatus, + }; + } + + const remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map( + (meta) => + ({ + ...meta, + filename: encodeRemoteRunId(meta.filename), + raw_filename: meta.filename, + source: 'remote' as const, + }) satisfies SourcedResultFileMeta, + ); + + const merged = [...localRuns, ...remoteRuns].sort((a, b) => + b.timestamp.localeCompare(a.timestamp), + ); + return { + runs: limit !== undefined && limit > 0 ? merged.slice(0, limit) : merged, + remote_status: remoteStatus, + }; +} + +export async function findRunById( + cwd: string, + runId: string, +): Promise { + const { runs } = await listMergedResultFiles(cwd); + return runs.find((run) => run.filename === runId); +} + +export async function maybeAutoExportRunArtifacts(payload: RemoteExportPayload): Promise { + const config = await loadNormalizedResultsConfig(payload.cwd); + if (!config?.auto_push) { + return; + } + + try { + await maybeWarnLargeArtifact(payload.run_dir); + + const branchName = buildBranchName(config, payload); + const prepared = await prepareResultsRepoBranch(config, branchName); + + try { + const relativeRunPath = getRelativeRunPath(payload.cwd, payload.run_dir); + const destinationDir = path.join(prepared.repoDir, config.path, relativeRunPath); + await stageResultsArtifacts({ + repoDir: prepared.repoDir, + sourceDir: payload.run_dir, + destinationDir, + }); + + const commitTitle = buildCommitTitle(payload); + const changed = await commitAndPushResultsBranch({ + repoDir: prepared.repoDir, + branchName, + commitMessage: commitTitle, + }); + + if (!changed) { + console.warn('Warning: results export produced no git changes. 
Skipping PR creation.'); + return; + } + + const prUrl = await createDraftResultsPr({ + repo: config.repo, + repoDir: prepared.repoDir, + baseBranch: prepared.baseBranch, + branchName, + title: commitTitle, + body: buildPrBody(payload), + }); + + console.log(`Remote results draft PR created: ${prUrl}`); + } finally { + await prepared.cleanup(); + } + } catch (error) { + console.warn(`Warning: skipping results export: ${getStatusMessage(error)}`); + console.warn("Warning: Run 'gh auth login' if GitHub authentication is missing."); + } +} diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 57686a897..121d9eac1 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -53,6 +53,12 @@ import { parseResultManifest, resolveResultSourcePath, } from './manifest.js'; +import { + findRunById, + getRemoteResultsStatus, + listMergedResultFiles, + syncRemoteResults, +} from './remote.js'; import { type StudioConfig, loadStudioConfig, saveStudioConfig } from './studio-config.js'; // ── Source resolution ──────────────────────────────────────────────────── @@ -234,8 +240,8 @@ interface DataContext { // biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route type C = Context; -function handleRuns(c: C, { searchDir }: DataContext) { - const metas = listResultFiles(searchDir); +async function handleRuns(c: C, { searchDir }: DataContext) { + const { runs: metas } = await listMergedResultFiles(searchDir); return c.json({ runs: metas.map((m) => { let target: string | undefined; @@ -258,6 +264,7 @@ function handleRuns(c: C, { searchDir }: DataContext) { pass_rate: m.passRate, avg_score: m.avgScore, size_bytes: m.sizeBytes, + source: m.source, ...(target && { target }), ...(experiment && { experiment }), }; @@ -265,21 +272,25 @@ function handleRuns(c: C, { searchDir }: DataContext) { }); } -function handleRunDetail(c: C, { searchDir }: DataContext) { - const filename = 
c.req.param('filename'); - const meta = listResultFiles(searchDir).find((m) => m.filename === filename); +async function handleRunDetail(c: C, { searchDir }: DataContext) { + const filename = c.req.param('filename') ?? ''; + const meta = await findRunById(searchDir, filename); if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = loadManifestResults(meta.path); - return c.json({ results: stripHeavyFields(loaded), source: meta.displayName }); + return c.json({ + results: stripHeavyFields(loaded), + source: meta.source, + source_label: meta.displayName, + }); } catch { return c.json({ error: 'Failed to load run' }, 500); } } -function handleRunSuites(c: C, { searchDir, agentvDir }: DataContext) { - const filename = c.req.param('filename'); - const meta = listResultFiles(searchDir).find((m) => m.filename === filename); +async function handleRunSuites(c: C, { searchDir, agentvDir }: DataContext) { + const filename = c.req.param('filename') ?? ''; + const meta = await findRunById(searchDir, filename); if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = loadManifestResults(meta.path); @@ -306,9 +317,9 @@ function handleRunSuites(c: C, { searchDir, agentvDir }: DataContext) { } } -function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) { - const filename = c.req.param('filename'); - const meta = listResultFiles(searchDir).find((m) => m.filename === filename); +async function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) { + const filename = c.req.param('filename') ?? 
''; + const meta = await findRunById(searchDir, filename); if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = loadManifestResults(meta.path); @@ -345,10 +356,10 @@ function handleRunCategories(c: C, { searchDir, agentvDir }: DataContext) { } } -function handleCategorySuites(c: C, { searchDir, agentvDir }: DataContext) { - const filename = c.req.param('filename'); +async function handleCategorySuites(c: C, { searchDir, agentvDir }: DataContext) { + const filename = c.req.param('filename') ?? ''; const category = decodeURIComponent(c.req.param('category') ?? ''); - const meta = listResultFiles(searchDir).find((m) => m.filename === filename); + const meta = await findRunById(searchDir, filename); if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = loadManifestResults(meta.path); @@ -376,10 +387,10 @@ function handleCategorySuites(c: C, { searchDir, agentvDir }: DataContext) { } } -function handleEvalDetail(c: C, { searchDir }: DataContext) { - const filename = c.req.param('filename'); +async function handleEvalDetail(c: C, { searchDir }: DataContext) { + const filename = c.req.param('filename') ?? ''; const evalId = c.req.param('evalId'); - const meta = listResultFiles(searchDir).find((m) => m.filename === filename); + const meta = await findRunById(searchDir, filename); if (!meta) return c.json({ error: 'Run not found' }, 404); try { const loaded = loadManifestResults(meta.path); @@ -391,10 +402,10 @@ function handleEvalDetail(c: C, { searchDir }: DataContext) { } } -function handleEvalFiles(c: C, { searchDir }: DataContext) { - const filename = c.req.param('filename'); +async function handleEvalFiles(c: C, { searchDir }: DataContext) { + const filename = c.req.param('filename') ?? 
''; const evalId = c.req.param('evalId'); - const meta = listResultFiles(searchDir).find((m) => m.filename === filename); + const meta = await findRunById(searchDir, filename); if (!meta) return c.json({ error: 'Run not found' }, 404); try { const content = readFileSync(meta.path, 'utf8'); @@ -429,9 +440,9 @@ function handleEvalFiles(c: C, { searchDir }: DataContext) { } } -function handleEvalFileContent(c: C, { searchDir }: DataContext) { - const filename = c.req.param('filename'); - const meta = listResultFiles(searchDir).find((m) => m.filename === filename); +async function handleEvalFileContent(c: C, { searchDir }: DataContext) { + const filename = c.req.param('filename') ?? ''; + const meta = await findRunById(searchDir, filename); if (!meta) return c.json({ error: 'Run not found' }, 404); // Extract the wildcard suffix without depending on decoded route params. @@ -465,8 +476,8 @@ function handleEvalFileContent(c: C, { searchDir }: DataContext) { } } -function handleExperiments(c: C, { searchDir, agentvDir }: DataContext) { - const metas = listResultFiles(searchDir); +async function handleExperiments(c: C, { searchDir, agentvDir }: DataContext) { + const { runs: metas } = await listMergedResultFiles(searchDir); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); const experimentMap = new Map< string, @@ -518,8 +529,8 @@ function handleExperiments(c: C, { searchDir, agentvDir }: DataContext) { return c.json({ experiments }); } -function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { - const metas = listResultFiles(searchDir); +async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { + const { runs: metas } = await listMergedResultFiles(searchDir); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); // Collect per-test-case results keyed by experiment × target @@ -608,8 +619,8 @@ function handleCompare(c: C, { searchDir, agentvDir }: DataContext) { }); } -function handleTargets(c: C, { searchDir, 
agentvDir }: DataContext) { - const metas = listResultFiles(searchDir); +async function handleTargets(c: C, { searchDir, agentvDir }: DataContext) { + const { runs: metas } = await listMergedResultFiles(searchDir); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); const targetMap = new Map< string, @@ -655,10 +666,15 @@ function handleTargets(c: C, { searchDir, agentvDir }: DataContext) { return c.json({ targets }); } -function handleConfig(c: C, { agentvDir }: DataContext, options?: { readOnly?: boolean }) { +function handleConfig( + c: C, + { agentvDir, searchDir }: DataContext, + options?: { readOnly?: boolean }, +) { return c.json({ ...loadStudioConfig(agentvDir), read_only: options?.readOnly === true, + project_name: path.basename(searchDir), }); } @@ -741,30 +757,32 @@ export function createApp( }; } - app.get('/api/projects', (c) => { + app.get('/api/projects', async (c) => { const registry = loadProjectRegistry(); - const projects = registry.projects.map((p) => { - let runCount = 0; - let passRate = 0; - let lastRun: string | null = null; - try { - const metas = listResultFiles(p.path); - runCount = metas.length; - if (metas.length > 0) { - const totalPassRate = metas.reduce((sum, m) => sum + m.passRate, 0); - passRate = totalPassRate / metas.length; - lastRun = metas[0].timestamp; + const projects = await Promise.all( + registry.projects.map(async (p) => { + let runCount = 0; + let passRate = 0; + let lastRun: string | null = null; + try { + const { runs: metas } = await listMergedResultFiles(p.path); + runCount = metas.length; + if (metas.length > 0) { + const totalPassRate = metas.reduce((sum, m) => sum + m.passRate, 0); + passRate = totalPassRate / metas.length; + lastRun = metas[0].timestamp; + } + } catch { + // Project path may be missing or inaccessible } - } catch { - // Project path may be missing or inaccessible - } - return { - ...projectEntryToWire(p), - run_count: runCount, - pass_rate: passRate, - last_run: lastRun, - }; - 
}); + return { + ...projectEntryToWire(p), + run_count: runCount, + pass_rate: passRate, + last_run: lastRun, + }; + }), + ); return c.json({ projects }); }); @@ -791,11 +809,11 @@ export function createApp( return c.json({ ok: true }); }); - app.get('/api/projects/:projectId/summary', (c) => { + app.get('/api/projects/:projectId/summary', async (c) => { const project = getProject(c.req.param('projectId') ?? ''); if (!project) return c.json({ error: 'Project not found' }, 404); try { - const metas = listResultFiles(project.path); + const { runs: metas } = await listMergedResultFiles(project.path); const runCount = metas.length; const passRate = runCount > 0 ? metas.reduce((s, m) => s + m.passRate, 0) / runCount : 0; const lastRun = metas.length > 0 ? metas[0].timestamp : null; @@ -828,7 +846,7 @@ export function createApp( }); /** Aggregate runs from all registered projects, sorted by timestamp descending. */ - app.get('/api/projects/all-runs', (c) => { + app.get('/api/projects/all-runs', async (c) => { const registry = loadProjectRegistry(); const allRuns: Array<{ filename: string; @@ -841,13 +859,14 @@ export function createApp( size_bytes: number; target?: string; experiment?: string; + source: 'local' | 'remote'; project_id: string; project_name: string; }> = []; for (const p of registry.projects) { try { - const metas = listResultFiles(p.path); + const { runs: metas } = await listMergedResultFiles(p.path); for (const m of metas) { let target: string | undefined; let experiment: string | undefined; @@ -869,6 +888,7 @@ export function createApp( pass_rate: m.passRate, avg_score: m.avgScore, size_bytes: m.sizeBytes, + source: m.source, ...(target && { target }), ...(experiment && { experiment }), project_id: p.id, @@ -887,6 +907,8 @@ export function createApp( // ── Data routes (unscoped) ──────────────────────────────────────────── app.get('/api/config', (c) => handleConfig(c, defaultCtx, { readOnly })); + app.get('/api/remote/status', async (c) => c.json(await 
getRemoteResultsStatus(searchDir))); + app.post('/api/remote/sync', async (c) => c.json(await syncRemoteResults(searchDir))); app.get('/api/runs', (c) => handleRuns(c, defaultCtx)); app.get('/api/runs/:filename', (c) => handleRunDetail(c, defaultCtx)); app.get('/api/runs/:filename/suites', (c) => handleRunSuites(c, defaultCtx)); @@ -957,8 +979,8 @@ export function createApp( }); // Aggregated index (unscoped only) - app.get('/api/index', (c) => { - const metas = listResultFiles(searchDir); + app.get('/api/index', async (c) => { + const { runs: metas } = await listMergedResultFiles(searchDir); const entries = metas.map((m) => { let totalCostUsd = 0; try { @@ -986,6 +1008,14 @@ export function createApp( app.get('/api/projects/:projectId/config', (c) => withProject(c, (ctx, dataCtx) => handleConfig(ctx, dataCtx, { readOnly })), ); + app.get('/api/projects/:projectId/remote/status', (c) => + withProject(c, async (ctx, dataCtx) => + ctx.json(await getRemoteResultsStatus(dataCtx.searchDir)), + ), + ); + app.post('/api/projects/:projectId/remote/sync', (c) => + withProject(c, async (ctx, dataCtx) => ctx.json(await syncRemoteResults(dataCtx.searchDir))), + ); app.get('/api/projects/:projectId/runs', (c) => withProject(c, handleRuns)); app.get('/api/projects/:projectId/runs/:filename', (c) => withProject(c, handleRunDetail)); app.get('/api/projects/:projectId/runs/:filename/suites', (c) => withProject(c, handleRunSuites)); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 94841e32f..4a4ea4085 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -358,6 +358,105 @@ describe('serve app', () => { const data = (await res.json()) as { runs: unknown[] }; expect(data.runs).toEqual([]); }); + + it('tags local runs with source metadata', async () => { + const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); + mkdirSync(runsDir, { recursive: true }); + 
const filename = '2026-03-25T10-00-00-000Z'; + const runDir = path.join(runsDir, filename); + mkdirSync(runDir, { recursive: true }); + writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(RESULT_A)); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request('/api/runs'); + + expect(res.status).toBe(200); + const data = (await res.json()) as { runs: Array<{ filename: string; source: string }> }; + expect(data.runs).toHaveLength(1); + expect(data.runs[0]).toMatchObject({ + filename, + source: 'local', + }); + }); + + it('merges cached remote runs and tags them with remote source metadata', async () => { + const previousHome = process.env.AGENTV_HOME; + process.env.AGENTV_HOME = path.join(tempDir, 'agentv-home'); + + try { + mkdirSync(path.join(tempDir, '.agentv'), { recursive: true }); + writeFileSync( + path.join(tempDir, '.agentv', 'config.yaml'), + `results: + export: + repo: EntityProcess/agentv-evals + path: autopilot-dev/runs +`, + ); + + const remoteRunDir = path.join( + process.env.AGENTV_HOME, + 'cache', + 'results-repo', + 'EntityProcess-agentv-evals', + 'repo', + 'autopilot-dev', + 'runs', + 'default', + '2026-03-26T10-00-00-000Z', + ); + mkdirSync(remoteRunDir, { recursive: true }); + writeFileSync(path.join(remoteRunDir, 'index.jsonl'), toJsonl(RESULT_A)); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request('/api/runs'); + + expect(res.status).toBe(200); + const data = (await res.json()) as { + runs: Array<{ filename: string; source: string }>; + }; + expect(data.runs).toHaveLength(1); + expect(data.runs[0]).toMatchObject({ + filename: 'remote::2026-03-26T10-00-00-000Z', + source: 'remote', + }); + } finally { + if (previousHome === undefined) { + process.env.AGENTV_HOME = undefined; + } else { + process.env.AGENTV_HOME = previousHome; + } + } + }); + }); + + describe('GET /api/remote/status', () => { + it('reports configured remote status with 
graceful local-only fallback', async () => { + mkdirSync(path.join(tempDir, '.agentv'), { recursive: true }); + writeFileSync( + path.join(tempDir, '.agentv', 'config.yaml'), + `results: + export: + repo: EntityProcess/agentv-evals + path: autopilot-dev/runs +`, + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const res = await app.request('/api/remote/status'); + + expect(res.status).toBe(200); + const data = (await res.json()) as { + configured: boolean; + available: boolean; + repo: string; + path: string; + }; + expect(data.configured).toBe(true); + expect(data.available).toBe(false); + expect(data.repo).toBe('EntityProcess/agentv-evals'); + expect(data.path).toBe('autopilot-dev/runs'); + }); }); // ── GET /api/runs/:filename ───────────────────────────────────────── @@ -382,10 +481,15 @@ describe('serve app', () => { const app = createApp([], tempDir, tempDir, undefined, { studioDir }); const res = await app.request(`/api/runs/${filename}`); expect(res.status).toBe(200); - const data = (await res.json()) as { results: { testId: string }[]; source: string }; + const data = (await res.json()) as { + results: { testId: string }[]; + source: 'local' | 'remote'; + source_label: string; + }; expect(data.results).toHaveLength(2); expect(data.results[0].testId).toBe('test-greeting'); - expect(data.source).toBe(filename); + expect(data.source).toBe('local'); + expect(data.source_label).toBe(filename); }); }); diff --git a/apps/studio/src/components/ExperimentsTab.tsx b/apps/studio/src/components/ExperimentsTab.tsx index 59335e2ff..90fab6534 100644 --- a/apps/studio/src/components/ExperimentsTab.tsx +++ b/apps/studio/src/components/ExperimentsTab.tsx @@ -10,7 +10,7 @@ import { Link } from '@tanstack/react-router'; import { useExperiments } from '~/lib/api'; import type { ExperimentSummary } from '~/lib/types'; -import { ScoreBar } from './ScoreBar'; +import { PassRatePill } from './PassRatePill'; export function ExperimentsTab() { const 
{ data, isLoading } = useExperiments(); @@ -40,7 +40,8 @@ export function ExperimentsTab() { Experiment Runs Targets - Pass Rate + Evals + Pass Rate Last Run @@ -60,10 +61,17 @@ export function ExperimentsTab() { {exp.target_count} + + {exp.passed_count} + / + {exp.eval_count} + - + + + + {formatTimestamp(exp.last_run).date} - {formatTimestamp(exp.last_run)} ))} @@ -72,14 +80,23 @@ export function ExperimentsTab() { ); } -function formatTimestamp(ts: string | undefined | null): string { - if (!ts) return 'N/A'; +function formatTimestamp(ts: string | undefined | null): { date: string; full: string } { + if (!ts) return { date: 'N/A', full: 'N/A' }; try { const d = new Date(ts); - if (Number.isNaN(d.getTime())) return 'N/A'; - return d.toLocaleString(); + if (Number.isNaN(d.getTime())) return { date: 'N/A', full: 'N/A' }; + const full = d.toLocaleString(); + const diffMs = Date.now() - d.getTime(); + const diffMin = Math.floor(diffMs / 60_000); + const diffHour = Math.floor(diffMs / 3_600_000); + let date: string; + if (diffMin < 1) date = 'just now'; + else if (diffMin < 60) date = `${diffMin} min ago`; + else if (diffHour < 24) date = `${diffHour} hour${diffHour === 1 ? '' : 's'} ago`; + else date = d.toLocaleDateString(); + return { date, full }; } catch { - return 'N/A'; + return { date: 'N/A', full: 'N/A' }; } } diff --git a/apps/studio/src/components/PassRatePill.tsx b/apps/studio/src/components/PassRatePill.tsx new file mode 100644 index 000000000..f9c6762ff --- /dev/null +++ b/apps/studio/src/components/PassRatePill.tsx @@ -0,0 +1,23 @@ +/** + * Progress-bar pill: gradient blue fill proportional to rate, percentage text inside. + * Used for pass rate and per-test score throughout Studio. + */ + +interface PassRatePillProps { + rate: number; +} + +export function PassRatePill({ rate }: PassRatePillProps) { + const pct = Math.round(rate * 100); + return ( +
+
+ + {pct}% + +
+ ); +} diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx index baa5b8526..a380b6e88 100644 --- a/apps/studio/src/components/RunDetail.tsx +++ b/apps/studio/src/components/RunDetail.tsx @@ -1,17 +1,18 @@ /** * Run detail component showing per-eval breakdown with score bars. * - * Groups results by category (from file path), then by suite within each category. - * Categories are shown as collapsible sections with suite cards inside. + * Groups results by category, then by suite within each category. + * Category Breakdown is shown as a clean table with coloured pass-rate pills. + * The All Evals table shows ERR badge instead of 0% for execution errors. */ import { Link } from '@tanstack/react-router'; -import { useState } from 'react'; import type { EvalResult } from '~/lib/types'; import { isPassing, useStudioConfig } from '~/lib/api'; -import { ScoreBar } from './ScoreBar'; + +import { PassRatePill } from './PassRatePill'; import { StatsCards } from './StatsCards'; interface RunDetailProps { @@ -95,7 +96,6 @@ export function RunDetail({ results, runId, projectId }: RunDetailProps) { const totalCost = results.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); const categories = buildCategoryGroups(results, passThreshold); - const hasMultipleCategories = categories.length > 1; if (total === 0) { return ( @@ -119,162 +119,121 @@ export function RunDetail({ results, runId, projectId }: RunDetailProps) { totalCost={totalCost > 0 ? totalCost : undefined} /> - {hasMultipleCategories ? ( -
-

Categories

- {categories.map((cat) => ( - - ))} -
- ) : ( -
-

Suites

-
- {categories[0]?.suites.map((ds) => ( - - ))} -
-
- )} - -
- - - - - - - - - - - - - {results.map((result, idx) => ( - - - - - - - + {/* Category Breakdown */} +
+

Category Breakdown

+
+
Test IDTargetScoreStatusDurationCost
- {projectId ? ( - - {result.testId} - - ) : ( - - {result.testId} - - )} - {result.target ?? '-'} - - - - - {result.durationMs != null ? `${(result.durationMs / 1000).toFixed(1)}s` : '-'} - - {result.costUsd != null ? `$${result.costUsd.toFixed(4)}` : '-'} -
+ + + + + + + - ))} - -
CategoryPass RatePassedFailedTotal
+ + + {categories.map((cat) => ( + + {cat.name} + + 0 ? cat.passed / cat.total : 0} /> + + + {cat.passed} + + + {cat.failed > 0 ? cat.failed : 0} + + {cat.total} + + ))} + + +
- - ); -} - -function CategorySection({ category, runId }: { category: CategoryGroup; runId: string }) { - const [expanded, setExpanded] = useState(true); - return ( -
- - {expanded && ( -
-
- {category.suites.map((ds) => ( - - ))} -
+ {/* All Evals */} +
+

All Evals

+
+ + + + + + + + + + + + {results.map((result, idx) => { + const isError = result.executionStatus === 'execution_error'; + const passing = isPassing(result.score, passThreshold); + return ( + + {/* Status dot */} + + + + + + + + ); + })} + +
+ Test IDTargetScoreDurationCost
+ {isError ? ( + ! + ) : ( + + {passing ? '✓' : '✗'} + + )} + + {projectId ? ( + + {result.testId} + + ) : ( + + {result.testId} + + )} + {result.target ?? '-'} + {isError ? ( + + ERR + + ) : ( + + )} + + {result.durationMs != null + ? `${(result.durationMs / 1000).toFixed(1)}s` + : '-'} + + {result.costUsd != null ? `$${result.costUsd.toFixed(4)}` : '-'} +
- )} -
- ); -} - -function SuiteCard({ suite, runId }: { suite: SuiteStats; runId: string }) { - return ( - -
- {suite.name} - - {suite.passed}/{suite.total} -
-
- -
-
- {suite.passed} passed - {suite.failed > 0 && {suite.failed} failed} -
- - ); -} - -function StatusBadge({ status }: { status?: string }) { - if (!status) return -; - - const isSuccess = status === 'success' || status === 'completed'; - const isError = status === 'error' || status === 'failed'; - - return ( - - {status} - +
); } diff --git a/apps/studio/src/components/RunList.tsx b/apps/studio/src/components/RunList.tsx index d102c8225..29ee3a8fe 100644 --- a/apps/studio/src/components/RunList.tsx +++ b/apps/studio/src/components/RunList.tsx @@ -1,44 +1,68 @@ /** * Sortable run table component. * - * Displays all available runs with filename, timestamp, test count, - * pass rate score bar, and avg score. Clicking a row navigates to - * the run detail view. + * Displays all available runs with a pass/fail status dot, human-readable name, + * source badge, date, test count, and coloured pass-rate pill. + * Clicking a row navigates to the run detail view. */ +import type React from 'react'; + import { Link } from '@tanstack/react-router'; import type { RunMeta } from '~/lib/types'; -import { ScoreBar } from './ScoreBar'; +import { PassRatePill } from './PassRatePill'; interface RunListProps { runs: RunMeta[]; projectId?: string; + emptyMessage?: React.ReactNode; } -function formatTimestamp(ts: string | undefined | null): string { - if (!ts) return 'N/A'; +function formatDate(ts: string | undefined | null): { date: string; full: string } { + if (!ts) return { date: 'N/A', full: 'N/A' }; try { const d = new Date(ts); - if (Number.isNaN(d.getTime())) return 'N/A'; - return d.toLocaleString(); + if (Number.isNaN(d.getTime())) return { date: 'N/A', full: 'N/A' }; + const full = d.toLocaleString(); + const diffMs = Date.now() - d.getTime(); + const diffMin = Math.floor(diffMs / 60_000); + const diffHour = Math.floor(diffMs / 3_600_000); + let date: string; + if (diffMin < 1) date = 'just now'; + else if (diffMin < 60) date = `${diffMin} min ago`; + else if (diffHour < 24) date = `${diffHour} hour${diffHour === 1 ? '' : 's'} ago`; + else date = d.toLocaleDateString(); + return { date, full }; } catch { - return 'N/A'; + return { date: 'N/A', full: 'N/A' }; } } -export function RunList({ runs, projectId }: RunListProps) { +/** Human-readable run label: "target · experiment" or filename fallback. 
*/ +function runLabel(run: RunMeta): string { + const parts = [run.target, run.experiment].filter((p) => p && p !== 'default' && p !== '-'); + if (parts.length > 0) return parts.join(' · '); + if (run.target) return run.target; + return run.display_name ?? run.filename; +} + +export function RunList({ runs, projectId, emptyMessage }: RunListProps) { if (runs.length === 0) { return (
-

No evaluation runs found.

-

- Run an evaluation first:{' '} - - agentv eval <eval-file> - -

+ {emptyMessage ?? ( + <> +

No evaluation runs found.

+

+ Run an evaluation first:{' '} + + agentv eval <eval-file> + +

+ + )}
); } @@ -48,59 +72,77 @@ export function RunList({ runs, projectId }: RunListProps) { + - - - - - - + + + + + - {runs.map((run) => ( - - + {/* Status dot */} + - - - - - - - - ))} + {passing ? '✓' : '✗'} + + + + {/* Run name */} + + + {/* Passed / Failed / Total */} + + + + + {/* Pass rate pill */} + + + {/* When */} + + + ); + })}
RunTargetExperimentTimestampTests - Tests Passing - - Mean Score - PassedFailedTotalPass RateWhen
- {projectId ? ( - - {run.display_name ?? run.filename} - - ) : ( - { + const ts = formatDate(run.timestamp); + const passing = run.pass_rate >= 0.8; + const label = runLabel(run); + const passedCount = Math.round(run.pass_rate * run.test_count); + const failedCount = run.test_count - passedCount; + return ( +
+ - {run.display_name ?? run.filename} - - )} - {run.target ?? '-'}{run.experiment ?? '-'}{formatTimestamp(run.timestamp)}{run.test_count} - - - {(run.avg_score * 100).toFixed(1)}% -
+ {projectId ? ( + + {label} + + ) : ( + + {label} + + )} + + {passedCount} + + {failedCount > 0 ? failedCount : 0} + + {run.test_count} + + + + {ts.date} +
diff --git a/apps/studio/src/components/RunSourceToolbar.tsx b/apps/studio/src/components/RunSourceToolbar.tsx new file mode 100644 index 000000000..f28254bcf --- /dev/null +++ b/apps/studio/src/components/RunSourceToolbar.tsx @@ -0,0 +1,98 @@ +import type { RemoteStatusResponse } from '~/lib/types'; + +export type RunSourceFilter = 'all' | 'local' | 'remote'; + +interface RunSourceToolbarProps { + filter: RunSourceFilter; + onFilterChange: (filter: RunSourceFilter) => void; + remoteStatus?: RemoteStatusResponse; + syncInFlight?: boolean; + onSync?: () => void; +} + +function formatLastSynced(timestamp?: string): string { + if (!timestamp) { + return 'Never synced'; + } + + const parsed = new Date(timestamp); + if (Number.isNaN(parsed.getTime())) { + return 'Never synced'; + } + + return `Last synced ${parsed.toLocaleString()}`; +} + +export function RunSourceToolbar({ + filter, + onFilterChange, + remoteStatus, + syncInFlight, + onSync, +}: RunSourceToolbarProps) { + const remoteConfigured = remoteStatus?.configured === true; + const remoteUnavailable = remoteConfigured && remoteStatus?.available !== true; + + return ( +
+
+
+ {(['all', 'local', 'remote'] as const).map((value) => { + const dimmed = value === 'remote' && !remoteConfigured; + return ( + + ); + })} +
+ + {remoteConfigured && onSync ? ( + + ) : null} +
+ + {remoteConfigured ? ( +
+ {formatLastSynced(remoteStatus?.last_synced_at)} + {remoteStatus?.repo ? Repo: {remoteStatus.repo} : null} + {remoteUnavailable ? ( + Remote cache unavailable + ) : null} +
+ ) : filter === 'all' ? ( +

+ Remote results are not configured. Add{' '} + results.export to{' '} + .agentv/config.yaml to + enable. +

+ ) : null} + + {remoteStatus?.last_error ? ( +
+ {remoteStatus.last_error} +
+ ) : null} +
+ ); +} diff --git a/apps/studio/src/components/StatsCards.tsx b/apps/studio/src/components/StatsCards.tsx index 6883e5442..3fca80e85 100644 --- a/apps/studio/src/components/StatsCards.tsx +++ b/apps/studio/src/components/StatsCards.tsx @@ -1,7 +1,7 @@ /** - * Overview stat cards for a run or the global index. + * Overview stat bar for a run — compact inline layout matching table width. * - * Shows: total evals, passed, failed, pass rate, and total cost. + * Shows: pass rate, passed, failed, total (and optional cost) in a single row. */ interface StatsCardsProps { @@ -12,27 +12,46 @@ interface StatsCardsProps { totalCost?: number; } -function Card({ label, value, accent }: { label: string; value: string; accent?: string }) { +export function StatsCards({ total, passed, failed, passRate, totalCost }: StatsCardsProps) { + const pct = Math.round(passRate * 100); + const rateColor = pct >= 80 ? 'text-cyan-400' : pct >= 60 ? 'text-amber-400' : 'text-red-400'; + return ( -
-

{label}

-

- {value} -

+
+ +
+ + + + {totalCost !== undefined && ( + <> +
+ + + )}
); } -export function StatsCards({ total, passed, failed, passRate, totalCost }: StatsCardsProps) { +function Stat({ + label, + value, + accent, + large, +}: { + label: string; + value: string; + accent?: string; + large?: boolean; +}) { return ( -
- - - - - {totalCost !== undefined && ( - - )} +
+ {label} + + {value} +
); } diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index 0ffc8a868..fa7a43fe7 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -23,6 +23,7 @@ import type { IndexResponse, ProjectEntry, ProjectListResponse, + RemoteStatusResponse, RunDetailResponse, RunEvalRequest, RunListResponse, @@ -147,6 +148,15 @@ export const studioConfigOptions = queryOptions({ staleTime: 5_000, }); +export function remoteStatusOptions(projectId?: string) { + const url = projectId ? `${projectApiBase(projectId)}/remote/status` : '/api/remote/status'; + return queryOptions({ + queryKey: ['remote-status', projectId ?? ''], + queryFn: () => fetchJson(url), + staleTime: 5_000, + }); +} + // ── Hooks ─────────────────────────────────────────────────────────────── export function useRunList() { @@ -205,6 +215,10 @@ export function useStudioConfig() { return useQuery(studioConfigOptions); } +export function useRemoteStatus(projectId?: string) { + return useQuery(remoteStatusOptions(projectId)); +} + /** Default pass threshold matching @agentv/core DEFAULT_THRESHOLD */ export const DEFAULT_PASS_THRESHOLD = 0.8; @@ -407,6 +421,17 @@ export function projectConfigOptions(projectId: string) { }); } +export async function syncRemoteResultsApi(projectId?: string): Promise { + const url = projectId ? 
`${projectApiBase(projectId)}/remote/sync` : '/api/remote/sync'; + const res = await fetch(url, { + method: 'POST', + }); + if (!res.ok) { + throw new Error(`Failed to sync remote results: ${res.status}`); + } + return res.json() as Promise; +} + export async function saveStudioConfig( config: Partial, ): Promise { diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index 97300baa1..19f7e96fc 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -16,6 +16,7 @@ export interface RunMeta { size_bytes: number; target?: string; experiment?: string; + source: 'local' | 'remote'; project_id?: string; project_name?: string; } @@ -71,7 +72,8 @@ export interface EvalResult { export interface RunDetailResponse { results: EvalResult[]; - source: string; + source: 'local' | 'remote'; + source_label?: string; } export interface SuiteSummary { @@ -119,6 +121,8 @@ export interface ExperimentSummary { name: string; run_count: number; target_count: number; + eval_count: number; + passed_count: number; pass_rate: number; last_run: string; } @@ -197,6 +201,20 @@ export interface StudioConfigResponse { /** @deprecated Use threshold */ pass_threshold?: number; read_only?: boolean; + project_name?: string; +} + +export interface RemoteStatusResponse { + configured: boolean; + available: boolean; + repo?: string; + cache_dir?: string; + path?: string; + auto_push?: boolean; + branch_prefix?: string; + run_count?: number; + last_synced_at?: string; + last_error?: string; } // ── Project types ──────────────────────────────────────────────────────── diff --git a/apps/studio/src/routes/index.tsx b/apps/studio/src/routes/index.tsx index 85eb71c30..f3059d44a 100644 --- a/apps/studio/src/routes/index.tsx +++ b/apps/studio/src/routes/index.tsx @@ -15,12 +15,15 @@ import { ExperimentsTab } from '~/components/ExperimentsTab'; import { ProjectCard } from '~/components/ProjectCard'; import { RunEvalModal } from '~/components/RunEvalModal'; import { 
RunList } from '~/components/RunList'; +import { type RunSourceFilter, RunSourceToolbar } from '~/components/RunSourceToolbar'; import { TargetsTab } from '~/components/TargetsTab'; import { addProjectApi, discoverProjectsApi, + syncRemoteResultsApi, useCompare, useProjectList, + useRemoteStatus, useRunList, useStudioConfig, } from '~/lib/api'; @@ -28,10 +31,10 @@ import { type TabId = 'runs' | 'experiments' | 'compare' | 'targets'; const tabs: { id: TabId; label: string }[] = [ - { id: 'runs', label: 'Recent Runs' }, - { id: 'experiments', label: 'Experiments' }, - { id: 'compare', label: 'Compare' }, - { id: 'targets', label: 'Targets' }, + { id: 'runs', label: '🏃 Recent Runs' }, + { id: 'experiments', label: '🧪 Experiments' }, + { id: 'compare', label: '📊 Compare' }, + { id: 'targets', label: '🤖 Targets' }, ]; export const Route = createFileRoute('/')({ @@ -183,17 +186,46 @@ function SingleProjectHome() { const searchParams = routerState.location.search as Record; const tab = searchParams.tab as TabId | undefined; const navigate = useNavigate(); + const queryClient = useQueryClient(); const { data, isLoading, error } = useRunList(); + const { data: remoteStatus } = useRemoteStatus(); const { data: config } = useStudioConfig(); const [showRunEval, setShowRunEval] = useState(false); + const [sourceFilter, setSourceFilter] = useState('all'); + const [syncInFlight, setSyncInFlight] = useState(false); const isReadOnly = config?.read_only === true; const activeTab: TabId = tabs.some((t) => t.id === tab) ? (tab as TabId) : 'experiments'; + const filteredRuns = + sourceFilter === 'all' + ? (data?.runs ?? []) + : (data?.runs ?? 
[]).filter((run) => run.source === sourceFilter); + + async function handleSyncRemote() { + setSyncInFlight(true); + try { + await syncRemoteResultsApi(); + await Promise.all([ + queryClient.invalidateQueries({ queryKey: ['runs'] }), + queryClient.invalidateQueries({ queryKey: ['experiments'] }), + queryClient.invalidateQueries({ queryKey: ['compare'] }), + queryClient.invalidateQueries({ queryKey: ['targets'] }), + queryClient.invalidateQueries({ queryKey: ['remote-status', ''] }), + ]); + } finally { + setSyncInFlight(false); + } + } return (
-

Evaluation Runs

+
+

Evaluation Runs

+ {config?.project_name && ( +

{config.project_name}

+ )} +
{!isReadOnly && (
); } - -function StatusBadge({ status }: { status?: string }) { - if (!status) return -; - - const isSuccess = status === 'success' || status === 'completed'; - const isError = status === 'error' || status === 'failed'; - - return ( - - {status} - - ); -} diff --git a/apps/web/src/assets/screenshots/studio-experiments.png b/apps/web/src/assets/screenshots/studio-experiments.png new file mode 100644 index 000000000..cd7ae05ef Binary files /dev/null and b/apps/web/src/assets/screenshots/studio-experiments.png differ diff --git a/apps/web/src/assets/screenshots/studio-run-detail.png b/apps/web/src/assets/screenshots/studio-run-detail.png new file mode 100644 index 000000000..06cb1cd07 Binary files /dev/null and b/apps/web/src/assets/screenshots/studio-run-detail.png differ diff --git a/apps/web/src/assets/screenshots/studio-runs.png b/apps/web/src/assets/screenshots/studio-runs.png index 1c0022087..7a8721b1d 100644 Binary files a/apps/web/src/assets/screenshots/studio-runs.png and b/apps/web/src/assets/screenshots/studio-runs.png differ diff --git a/apps/web/src/content/docs/docs/tools/studio.mdx b/apps/web/src/content/docs/docs/tools/studio.mdx index 2365b11ef..233ae4c5d 100644 --- a/apps/web/src/content/docs/docs/tools/studio.mdx +++ b/apps/web/src/content/docs/docs/tools/studio.mdx @@ -7,11 +7,13 @@ sidebar: import { Image } from 'astro:assets'; import studioRuns from '../../../../assets/screenshots/studio-runs.png'; +import studioRunDetail from '../../../../assets/screenshots/studio-run-detail.png'; +import studioExperiments from '../../../../assets/screenshots/studio-experiments.png'; import studioProjects from '../../../../assets/screenshots/studio-projects.png'; -The `studio` command launches a web-based dashboard for browsing evaluation runs, inspecting individual test results, and reviewing scores. +The `studio` command launches a web-based dashboard for browsing evaluation runs, inspecting individual test results, and reviewing scores. 
It shows both local runs and runs synced from a remote results repository. -AgentV Studio showing evaluation runs with pass rates and scores +AgentV Studio showing evaluation runs with source badges, pass rates, and scores ## Usage @@ -42,11 +44,24 @@ agentv studio .agentv/results/runs/2026-03-30T11-45-56-989Z ## Features -- **Recent Runs** -- table of all evaluation runs with target, experiment, timestamp, test count, pass rate, and mean score -- **Experiments** -- group and compare runs by experiment name -- **Targets** -- group runs by target (model/agent) -- **Run Detail** -- drill into a run to see per-test results, scores, and evaluator output -- **Human Review** -- add feedback annotations to individual test results +- **Recent Runs** — table of all evaluation runs with source badge (`local` / `remote`), target, experiment, timestamp, test count, pass rate, and mean score +- **Experiments** — group and compare runs by experiment name +- **Targets** — group runs by target (model/agent) +- **Run Detail** — drill into a run to see per-test results, scores, and evaluator output +- **Human Review** — add feedback annotations to individual test results +- **Remote Results** — sync and browse runs pushed from other machines or CI (see [Remote Results](#remote-results)) + +## Run Detail + +Click any run to see a breakdown by suite, per-test scores, target, duration, and cost. The source label (`local` or `remote`) tells you where the run came from. + +AgentV Studio run detail showing 100% pass rate across 5 tests with scores and duration + +## Experiments + +The Experiments tab groups runs by experiment name so you can compare the impact of changes — for example, `with_skills` vs `without_skills`. 
+ +AgentV Studio experiments tab comparing with_skills (100%) vs without_skills (60%) pass rates ## Multi-Project Dashboard @@ -94,3 +109,32 @@ agentv studio --remove my-project ``` Project IDs are derived from the directory name (e.g., `/home/user/repos/my-project` becomes `my-project`). + +## Remote Results + +Studio can display runs pushed to a remote git repository by other machines or CI — alongside your local runs. Each run in the list carries a source badge: **local** (green) or **remote** (amber). + +### Configuration + +Add a `results.export` block to `.agentv/config.yaml`: + +```yaml +results: + export: + repo: EntityProcess/agentv-evals # GitHub repo (owner/repo or full URL) + path: runs # Directory within the repo + auto_push: true # Push automatically after every eval run + branch_prefix: eval-results # Branch naming prefix (default: eval-results) +``` + +With `auto_push: true`, every `agentv eval run` or `agentv pipeline bench` automatically creates a draft PR in the configured repo with a structured results table. + +### Authentication + +Uses `gh` CLI and `git` credentials already configured on the machine. If authentication is missing, AgentV warns and skips the export — the eval run itself is never blocked. + +### Syncing in Studio + +Once configured, Studio fetches remote runs on load. Use the **Sync Remote Results** button in the source toolbar to pull the latest. The toolbar also shows when results were last synced and the configured repo. + +Use the **All Sources / Local Only / Remote Only** filter to narrow the run list by origin. 
diff --git a/packages/core/src/evaluation/loaders/config-loader.ts b/packages/core/src/evaluation/loaders/config-loader.ts index 54505cddc..ea521fe03 100644 --- a/packages/core/src/evaluation/loaders/config-loader.ts +++ b/packages/core/src/evaluation/loaders/config-loader.ts @@ -27,10 +27,20 @@ export type ExecutionDefaults = { readonly pool_slots?: number; }; +export type ResultsExportConfig = { + readonly repo: string; + readonly path: string; + readonly auto_push?: boolean; + readonly branch_prefix?: string; +}; + export type AgentVConfig = { readonly required_version?: string; readonly eval_patterns?: readonly string[]; readonly execution?: ExecutionDefaults; + readonly results?: { + readonly export?: ResultsExportConfig; + }; }; /** @@ -82,11 +92,13 @@ export async function loadConfig( (parsed as Record).execution, configPath, ); + const results = parseResultsConfig((parsed as Record).results, configPath); return { required_version: requiredVersion as string | undefined, eval_patterns: evalPatterns as readonly string[] | undefined, execution: executionDefaults, + results, }; } catch (error) { logWarning( @@ -435,6 +447,77 @@ export function parseExecutionDefaults( return Object.keys(result).length > 0 ? 
(result as ExecutionDefaults) : undefined; } +export function parseResultsConfig( + raw: unknown, + configPath: string, +): AgentVConfig['results'] | undefined { + if (raw === undefined || raw === null) { + return undefined; + } + if (typeof raw !== 'object' || Array.isArray(raw)) { + logWarning(`Invalid results in ${configPath}, expected object`); + return undefined; + } + + const obj = raw as Record; + const exportConfig = parseResultsExportConfig(obj.export, configPath); + if (!exportConfig) { + return undefined; + } + + return { export: exportConfig }; +} + +export function parseResultsExportConfig( + raw: unknown, + configPath: string, +): ResultsExportConfig | undefined { + if (raw === undefined || raw === null) { + return undefined; + } + if (typeof raw !== 'object' || Array.isArray(raw)) { + logWarning(`Invalid results.export in ${configPath}, expected object`); + return undefined; + } + + const obj = raw as Record; + const repo = typeof obj.repo === 'string' ? obj.repo.trim() : ''; + const exportPath = typeof obj.path === 'string' ? 
obj.path.trim() : ''; + + if (!repo) { + logWarning(`Invalid results.export.repo in ${configPath}, expected non-empty string`); + return undefined; + } + + if (!exportPath) { + logWarning(`Invalid results.export.path in ${configPath}, expected non-empty string`); + return undefined; + } + + if (obj.auto_push !== undefined && typeof obj.auto_push !== 'boolean') { + logWarning(`Invalid results.export.auto_push in ${configPath}, expected boolean`); + return undefined; + } + + let branchPrefix: string | undefined; + if (obj.branch_prefix !== undefined) { + if (typeof obj.branch_prefix !== 'string' || obj.branch_prefix.trim().length === 0) { + logWarning( + `Invalid results.export.branch_prefix in ${configPath}, expected non-empty string`, + ); + return undefined; + } + branchPrefix = obj.branch_prefix.trim(); + } + + return { + repo, + path: exportPath, + ...(typeof obj.auto_push === 'boolean' && { auto_push: obj.auto_push }), + ...(branchPrefix && { branch_prefix: branchPrefix }), + }; +} + function logWarning(message: string): void { console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`); } diff --git a/packages/core/src/evaluation/results-repo.ts b/packages/core/src/evaluation/results-repo.ts new file mode 100644 index 000000000..976e6dcbd --- /dev/null +++ b/packages/core/src/evaluation/results-repo.ts @@ -0,0 +1,398 @@ +import { execFile } from 'node:child_process'; +import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { cp, mkdtemp, readdir, rm, stat } from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; +import { promisify } from 'node:util'; + +import { getAgentvHome } from '../paths.js'; +import type { ResultsExportConfig } from './loaders/config-loader.js'; + +const execFileAsync = promisify(execFile); + +export interface ResultsRepoCachePaths { + readonly rootDir: string; + readonly repoDir: string; + readonly statusFile: string; +} + +export interface ResultsRepoStatus { + 
readonly configured: boolean; + readonly available: boolean; + readonly repo?: string; + readonly path?: string; + readonly auto_push?: boolean; + readonly branch_prefix?: string; + readonly cache_dir?: string; + readonly last_synced_at?: string; + readonly last_error?: string; +} + +export interface CheckedOutResultsRepoBranch { + readonly branchName: string; + readonly baseBranch: string; + readonly repoDir: string; +} + +export interface PreparedResultsRepoBranch extends CheckedOutResultsRepoBranch { + readonly cleanup: () => Promise; +} + +type PersistedStatus = { + readonly last_synced_at?: string; + readonly last_error?: string; +}; + +function sanitizeRepoSlug(repo: string): string { + return repo.trim().replace(/[^A-Za-z0-9._-]+/g, '-'); +} + +function withFriendlyGitHubAuthError(error: unknown): Error { + const message = error instanceof Error ? error.message : String(error); + const lower = message.toLowerCase(); + if ( + lower.includes('authentication failed') || + lower.includes('could not read username') || + lower.includes('permission denied') || + lower.includes('not logged into any github hosts') + ) { + return new Error(`${message}. 
Run 'gh auth login' to authenticate.`); + } + return new Error(message); +} + +export function normalizeResultsExportConfig( + config: ResultsExportConfig, +): Required { + return { + repo: config.repo.trim(), + path: config.path.trim().replace(/^\/+|\/+$/g, ''), + auto_push: config.auto_push === true, + branch_prefix: config.branch_prefix?.trim() || 'eval-results', + }; +} + +export function resolveResultsRepoUrl(repo: string): string { + if (repo.includes('://') || repo.startsWith('git@')) { + return repo; + } + return `https://github.com/${repo}.git`; +} + +export function getResultsRepoCachePaths(repo: string): ResultsRepoCachePaths { + const rootDir = path.join(getAgentvHome(), 'cache', 'results-repo', sanitizeRepoSlug(repo)); + return { + rootDir, + repoDir: path.join(rootDir, 'repo'), + statusFile: path.join(rootDir, 'status.json'), + }; +} + +function readPersistedStatus(statusFile: string): PersistedStatus { + if (!existsSync(statusFile)) { + return {}; + } + + try { + return JSON.parse(readFileSync(statusFile, 'utf8')) as PersistedStatus; + } catch { + return {}; + } +} + +function writePersistedStatus(statusFile: string, status: PersistedStatus): void { + mkdirSync(path.dirname(statusFile), { recursive: true }); + writeFileSync(statusFile, `${JSON.stringify(status, null, 2)}\n`, 'utf8'); +} + +async function runCommand( + executable: string, + args: readonly string[], + options?: { cwd?: string; check?: boolean }, +): Promise<{ stdout: string; stderr: string }> { + try { + const { stdout, stderr } = await execFileAsync(executable, [...args], { + cwd: options?.cwd, + env: process.env, + }); + return { stdout, stderr }; + } catch (error) { + if (options?.check === false && error && typeof error === 'object') { + const execError = error as { stdout?: string; stderr?: string }; + return { + stdout: execError.stdout ?? '', + stderr: execError.stderr ?? 
'', + }; + } + throw withFriendlyGitHubAuthError(error); + } +} + +async function runGit( + args: readonly string[], + options?: { cwd?: string; check?: boolean }, +): Promise<{ stdout: string; stderr: string }> { + return runCommand('git', args, options); +} + +async function runGh( + args: readonly string[], + options?: { cwd?: string }, +): Promise<{ stdout: string; stderr: string }> { + return runCommand('gh', args, options); +} + +async function resolveDefaultBranch(repoDir: string): Promise { + try { + const { stdout } = await runGit(['symbolic-ref', 'refs/remotes/origin/HEAD'], { cwd: repoDir }); + const ref = stdout.trim(); + const prefix = 'refs/remotes/origin/'; + if (ref.startsWith(prefix)) { + return ref.slice(prefix.length); + } + } catch { + // Fall through to main/master probing. + } + + for (const candidate of ['main', 'master']) { + try { + await runGit(['rev-parse', '--verify', `origin/${candidate}`], { cwd: repoDir }); + return candidate; + } catch { + // Try next candidate. 
+ } + } + + return 'main'; +} + +async function updateCacheRepo(repoDir: string, baseBranch: string): Promise { + await runGit(['fetch', 'origin', '--prune'], { cwd: repoDir }); + await runGit(['checkout', baseBranch], { cwd: repoDir }); + await runGit(['pull', '--ff-only', 'origin', baseBranch], { cwd: repoDir }); +} + +function updateStatusFile(config: ResultsExportConfig, patch: PersistedStatus): void { + const cachePaths = getResultsRepoCachePaths(config.repo); + const current = readPersistedStatus(cachePaths.statusFile); + writePersistedStatus(cachePaths.statusFile, { + ...current, + ...patch, + }); +} + +export async function ensureResultsRepoClone(config: ResultsExportConfig): Promise { + const normalized = normalizeResultsExportConfig(config); + const cachePaths = getResultsRepoCachePaths(normalized.repo); + mkdirSync(cachePaths.rootDir, { recursive: true }); + + if (!existsSync(cachePaths.repoDir)) { + try { + await runGit([ + 'clone', + '--filter=blob:none', + resolveResultsRepoUrl(normalized.repo), + cachePaths.repoDir, + ]); + return cachePaths.repoDir; + } catch (error) { + updateStatusFile(normalized, { last_error: withFriendlyGitHubAuthError(error).message }); + throw withFriendlyGitHubAuthError(error); + } + } + + if (!existsSync(path.join(cachePaths.repoDir, '.git'))) { + throw new Error(`Results repo cache is not a git repository: ${cachePaths.repoDir}`); + } + + return cachePaths.repoDir; +} + +export function getResultsRepoStatus(config?: ResultsExportConfig): ResultsRepoStatus { + if (!config) { + return { + configured: false, + available: false, + repo: '', + cache_dir: '', + }; + } + + const normalized = normalizeResultsExportConfig(config); + const cachePaths = getResultsRepoCachePaths(normalized.repo); + const persisted = readPersistedStatus(cachePaths.statusFile); + + return { + configured: true, + available: existsSync(cachePaths.repoDir), + repo: normalized.repo, + path: normalized.path, + auto_push: normalized.auto_push, + 
branch_prefix: normalized.branch_prefix, + cache_dir: cachePaths.repoDir, + last_synced_at: persisted.last_synced_at, + last_error: persisted.last_error, + }; +} + +export async function syncResultsRepo(config: ResultsExportConfig): Promise { + const normalized = normalizeResultsExportConfig(config); + + try { + const repoDir = await ensureResultsRepoClone(normalized); + const baseBranch = await resolveDefaultBranch(repoDir); + await updateCacheRepo(repoDir, baseBranch); + updateStatusFile(normalized, { + last_synced_at: new Date().toISOString(), + last_error: undefined, + }); + } catch (error) { + updateStatusFile(normalized, { + last_error: withFriendlyGitHubAuthError(error).message, + }); + throw withFriendlyGitHubAuthError(error); + } + + return getResultsRepoStatus(normalized); +} + +export async function checkoutResultsRepoBranch( + config: ResultsExportConfig, + branchName: string, +): Promise { + const normalized = normalizeResultsExportConfig(config); + const repoDir = await ensureResultsRepoClone(normalized); + const baseBranch = await resolveDefaultBranch(repoDir); + await updateCacheRepo(repoDir, baseBranch); + await runGit(['checkout', '-B', branchName, `origin/${baseBranch}`], { cwd: repoDir }); + updateStatusFile(normalized, { last_error: undefined }); + return { + branchName, + baseBranch, + repoDir, + }; +} + +export async function prepareResultsRepoBranch( + config: ResultsExportConfig, + branchName: string, +): Promise { + const normalized = normalizeResultsExportConfig(config); + const cloneDir = await ensureResultsRepoClone(normalized); + const baseBranch = await resolveDefaultBranch(cloneDir); + await updateCacheRepo(cloneDir, baseBranch); + + const worktreeRoot = await mkdtemp(path.join(os.tmpdir(), 'agentv-results-repo-')); + const worktreeDir = path.join(worktreeRoot, 'repo'); + await runGit(['worktree', 'add', '-B', branchName, worktreeDir, `origin/${baseBranch}`], { + cwd: cloneDir, + }); + + return { + branchName, + baseBranch, + 
repoDir: worktreeDir, + cleanup: async () => { + try { + await runGit(['worktree', 'remove', '--force', worktreeDir], { cwd: cloneDir }); + } finally { + await rm(worktreeRoot, { recursive: true, force: true }).catch(() => undefined); + } + }, + }; +} + +export async function stageResultsArtifacts(params: { + readonly repoDir: string; + readonly sourceDir: string; + readonly destinationDir: string; +}): Promise { + rmSync(params.destinationDir, { recursive: true, force: true }); + mkdirSync(path.dirname(params.destinationDir), { recursive: true }); + await cp(params.sourceDir, params.destinationDir, { recursive: true }); +} + +export function resolveResultsRepoRunsDir(config: ResultsExportConfig): string { + const normalized = normalizeResultsExportConfig(config); + return path.join( + getResultsRepoCachePaths(normalized.repo).repoDir, + ...normalized.path.split('/'), + ); +} + +export async function directorySizeBytes(targetPath: string): Promise { + const entry = await stat(targetPath); + if (entry.isFile()) { + return entry.size; + } + + let total = 0; + for (const child of await readdir(targetPath, { withFileTypes: true })) { + total += await directorySizeBytes(path.join(targetPath, child.name)); + } + return total; +} + +export async function commitAndPushResultsBranch(params: { + readonly repoDir: string; + readonly branchName: string; + readonly commitMessage: string; +}): Promise { + await runGit(['add', '--all'], { cwd: params.repoDir }); + + const { stdout: diffStdout } = await runGit(['status', '--porcelain'], { + cwd: params.repoDir, + check: false, + }); + if (diffStdout.trim().length === 0) { + return false; + } + + await runGit(['commit', '-m', params.commitMessage], { cwd: params.repoDir }); + await runGit(['push', '-u', 'origin', params.branchName], { cwd: params.repoDir }); + return true; +} + +export async function pushResultsRepoBranch( + config: ResultsExportConfig, + branchName: string, + cwd?: string, +): Promise { + const normalized = 
normalizeResultsExportConfig(config); + await runGit(['push', '-u', 'origin', branchName], { + cwd: cwd ?? getResultsRepoCachePaths(normalized.repo).repoDir, + }); + updateStatusFile(normalized, { + last_synced_at: new Date().toISOString(), + last_error: undefined, + }); +} + +export async function createDraftResultsPr(params: { + readonly repo: string; + readonly repoDir: string; + readonly baseBranch: string; + readonly branchName: string; + readonly title: string; + readonly body: string; +}): Promise { + const { stdout } = await runGh( + [ + 'pr', + 'create', + '--draft', + '--repo', + params.repo, + '--base', + params.baseBranch, + '--head', + params.branchName, + '--title', + params.title, + '--body', + params.body, + ], + { cwd: params.repoDir }, + ); + return stdout.trim(); +} diff --git a/packages/core/src/evaluation/validation/config-validator.ts b/packages/core/src/evaluation/validation/config-validator.ts index d15306cfb..50d55f601 100644 --- a/packages/core/src/evaluation/validation/config-validator.ts +++ b/packages/core/src/evaluation/validation/config-validator.ts @@ -66,11 +66,81 @@ export async function validateConfigFile(filePath: string): Promise).export; + if (exportConfig !== undefined) { + if ( + typeof exportConfig !== 'object' || + exportConfig === null || + Array.isArray(exportConfig) + ) { + errors.push({ + severity: 'error', + filePath, + location: 'results.export', + message: "Field 'results.export' must be an object", + }); + } else { + const exportRecord = exportConfig as Record; + if (typeof exportRecord.repo !== 'string' || exportRecord.repo.trim().length === 0) { + errors.push({ + severity: 'error', + filePath, + location: 'results.export.repo', + message: "Field 'results.export.repo' must be a non-empty string", + }); + } + if (typeof exportRecord.path !== 'string' || exportRecord.path.trim().length === 0) { + errors.push({ + severity: 'error', + filePath, + location: 'results.export.path', + message: "Field 'results.export.path' 
must be a non-empty string", + }); + } + if ( + exportRecord.auto_push !== undefined && + typeof exportRecord.auto_push !== 'boolean' + ) { + errors.push({ + severity: 'error', + filePath, + location: 'results.export.auto_push', + message: "Field 'results.export.auto_push' must be a boolean", + }); + } + if ( + exportRecord.branch_prefix !== undefined && + (typeof exportRecord.branch_prefix !== 'string' || + exportRecord.branch_prefix.trim().length === 0) + ) { + errors.push({ + severity: 'error', + filePath, + location: 'results.export.branch_prefix', + message: "Field 'results.export.branch_prefix' must be a non-empty string", + }); + } + } + } + } + } + const allowedFields = new Set([ '$schema', 'eval_patterns', 'required_version', 'execution', + 'results', 'studio', ]); const unexpectedFields = Object.keys(config).filter((key) => !allowedFields.has(key)); diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 46b1ecd1f..c980e6baf 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -379,6 +379,7 @@ export const EvalFileSchema = z.object({ .regex(/^[a-z0-9-]+$/) .optional(), description: z.string().optional(), + category: z.string().optional(), version: z.string().optional(), author: z.string().optional(), tags: z.array(z.string()).optional(), diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index b0c58305b..9e6f7de1e 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -106,6 +106,7 @@ type RawTestSuite = JsonObject & { // Suite-level metadata fields readonly name?: JsonValue; readonly description?: JsonValue; + readonly category?: string; readonly version?: JsonValue; readonly author?: JsonValue; readonly tags?: JsonValue; @@ -503,7 +504,7 @@ async function loadTestsFromYaml( 
const testCase: EvalTest = { id, suite: suiteName, - category: options?.category, + category: suite.category ?? options?.category, conversation_id: conversationId, question: question, input: inputMessages, diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 37bb2e8a2..64a68de23 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -6,6 +6,11 @@ export { isAgentSkillsFormat, parseAgentSkillsEvals, } from './evaluation/loaders/agent-skills-parser.js'; +export { + loadConfig, + type AgentVConfig as AgentVYamlConfig, + type ResultsExportConfig, +} from './evaluation/loaders/config-loader.js'; export { transpileEvalYaml, transpileEvalYamlFile, @@ -47,6 +52,26 @@ export { shouldSkipCacheForTemperature, } from './evaluation/cache/response-cache.js'; export { toSnakeCaseDeep, toCamelCaseDeep } from './evaluation/case-conversion.js'; +export { + ensureResultsRepoClone, + syncResultsRepo, + getResultsRepoCachePaths, + getResultsRepoStatus, + normalizeResultsExportConfig, + resolveResultsRepoRunsDir, + resolveResultsRepoUrl, + prepareResultsRepoBranch, + checkoutResultsRepoBranch, + stageResultsArtifacts, + directorySizeBytes, + commitAndPushResultsBranch, + pushResultsRepoBranch, + createDraftResultsPr, + type CheckedOutResultsRepoBranch, + type PreparedResultsRepoBranch, + type ResultsRepoCachePaths, + type ResultsRepoStatus, +} from './evaluation/results-repo.js'; export { getAgentvHome, getWorkspacesRoot, diff --git a/packages/core/test/evaluation/loaders/config-loader.test.ts b/packages/core/test/evaluation/loaders/config-loader.test.ts index ac68e0eb9..50b69d772 100644 --- a/packages/core/test/evaluation/loaders/config-loader.test.ts +++ b/packages/core/test/evaluation/loaders/config-loader.test.ts @@ -9,6 +9,7 @@ import { extractTotalBudgetUsd, extractTrialsConfig, parseExecutionDefaults, + parseResultsConfig, } from '../../../src/evaluation/loaders/config-loader.js'; import type { JsonObject } from 
'../../../src/evaluation/types.js'; @@ -134,6 +135,45 @@ describe('extractTrialsConfig', () => { }); }); +describe('parseResultsConfig', () => { + it('parses valid results.export config', () => { + const result = parseResultsConfig( + { + export: { + repo: 'EntityProcess/agentv-evals', + path: 'autopilot-dev/runs', + auto_push: true, + branch_prefix: 'eval-results', + }, + }, + '/tmp/.agentv/config.yaml', + ); + + expect(result).toEqual({ + export: { + repo: 'EntityProcess/agentv-evals', + path: 'autopilot-dev/runs', + auto_push: true, + branch_prefix: 'eval-results', + }, + }); + }); + + it('returns undefined for invalid export config', () => { + const result = parseResultsConfig( + { + export: { + repo: '', + path: 'autopilot-dev/runs', + }, + }, + '/tmp/.agentv/config.yaml', + ); + + expect(result).toBeUndefined(); + }); +}); + describe('extractTargetFromSuite', () => { it('extracts target from execution.target', () => { const suite: JsonObject = { execution: { target: 'my-target' } }; @@ -464,3 +504,42 @@ describe('parseExecutionDefaults', () => { }); }); }); + +describe('parseResultsConfig', () => { + it('parses valid results export configuration', () => { + expect( + parseResultsConfig( + { + export: { + repo: 'EntityProcess/agentv-evals', + path: 'autopilot-dev/runs', + auto_push: true, + branch_prefix: 'eval-results', + }, + }, + '/test/.agentv/config.yaml', + ), + ).toEqual({ + export: { + repo: 'EntityProcess/agentv-evals', + path: 'autopilot-dev/runs', + auto_push: true, + branch_prefix: 'eval-results', + }, + }); + }); + + it('returns undefined for invalid results export configuration', () => { + expect( + parseResultsConfig( + { + export: { + repo: 123, + path: 'autopilot-dev/runs', + }, + }, + '/test/.agentv/config.yaml', + ), + ).toBeUndefined(); + }); +}); diff --git a/packages/core/test/evaluation/validation/config-validator.test.ts b/packages/core/test/evaluation/validation/config-validator.test.ts index 98d7a5946..42679df59 100644 --- 
a/packages/core/test/evaluation/validation/config-validator.test.ts +++ b/packages/core/test/evaluation/validation/config-validator.test.ts @@ -46,6 +46,25 @@ describe('validateConfigFile', () => { expect(result.errors).toHaveLength(0); }); + it('accepts results.export field without warnings', async () => { + const filePath = path.join(tempDir, 'config-results.yaml'); + await writeFile( + filePath, + `results: + export: + repo: EntityProcess/agentv-evals + path: autopilot-dev/runs + auto_push: true + branch_prefix: eval-results +`, + ); + + const result = await validateConfigFile(filePath); + + expect(result.valid).toBe(true); + expect(result.errors).toHaveLength(0); + }); + it('errors on invalid required_version type', async () => { const filePath = path.join(tempDir, 'config-bad-version.yaml'); await writeFile(filePath, 'required_version: 3\n'); diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index a7f142c04..1f96f0f2b 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -17,6 +17,9 @@ "description": { "type": "string" }, + "category": { + "type": "string" + }, "version": { "type": "string" }, diff --git a/plugins/agentv-self/skills/image-compress-and-docs/SKILL.md b/plugins/agentv-self/skills/image-compress-and-docs/SKILL.md new file mode 100644 index 000000000..6370976c6 --- /dev/null +++ b/plugins/agentv-self/skills/image-compress-and-docs/SKILL.md @@ -0,0 +1,140 @@ +--- +name: image-compress-and-docs +description: Capture, optimize, and publish screenshots to Astro docs. Use when asked to take screenshots for docs, update doc images, compress PNG assets, or add visual documentation to the agentv.dev docs site. 
Triggers on "add screenshots to docs", "update docs images", "compress screenshots", "optimize PNG", "document with screenshots". +--- + +# Image Compression & Docs Update + +Capture browser screenshots, optimize them for the web, and publish to the Astro docs site at `apps/web/src/content/docs/`. + +## Prerequisites + +Install optimization tools if not present: +```bash +# Ubuntu/Debian (usually pre-installed) +sudo apt-get install -y pngquant optipng + +# macOS +brew install pngquant optipng +``` + +Verify: +```bash +which pngquant optipng +``` + +## Step 1 — Capture Screenshots + +Use `agent-browser` with a named session and 1440×860 viewport for docs-quality screenshots. Always use `--session` to isolate, never `--headed`. + +```bash +# Start the target server first (e.g., Studio) +bun apps/cli/src/cli.ts studio --port 14800 & +sleep 3 + +# Open, set viewport, navigate, screenshot +agent-browser --session docs-shots open http://localhost:14800 +agent-browser --session docs-shots wait --load networkidle +agent-browser --session docs-shots set viewport 1440 860 +agent-browser --session docs-shots snapshot -i # discover refs +agent-browser --session docs-shots click <ref> # navigate if needed +agent-browser --session docs-shots wait --load networkidle +agent-browser --session docs-shots screenshot # saved to /run/user/1000/agent-browser/tmp/screenshots/ + +# Clean up +agent-browser --session docs-shots close +kill $(lsof -ti:14800) 2>/dev/null +``` + +**Screenshots with realistic data:** Studio screenshots must have populated data — multiple runs with varying pass rates and real targets. If results are sparse, create synthetic JSONL files in `.agentv/results/runs/<experiment>/<timestamp>/index.jsonl` with realistic fields before launching Studio. 
+ +Synthetic JSONL record format: +```json +{"test_id": "my-test", "score": 0.95, "target": "claude-sonnet", "experiment": "default", "timestamp": "2026-04-08T09:15:44.003Z", "execution_status": "success", "suite": "my-suite", "category": "default", "duration_ms": 3500, "token_usage": {"input_tokens": 1200, "output_tokens": 400}, "scores": [{"type": "llm-grader", "score": 0.95, "passed": true}], "error": null} +``` + +## Step 2 — Optimize + +Always apply both passes: **pngquant** (lossy, 50–70% savings) then **optipng** (lossless polish). + +```bash +SHOT="/run/user/1000/agent-browser/tmp/screenshots/screenshot-.png" +OUT="/home/christso/projects/agentv/apps/web/src/assets/screenshots/my-feature.png" + +# Pass 1: lossy quantization (creates -fs8.png or use --output) +pngquant --quality 80-95 --force --output /tmp/opt.png "$SHOT" + +# Pass 2: lossless polish +optipng -o5 -quiet /tmp/opt.png + +# Copy to docs assets +cp /tmp/opt.png "$OUT" + +# Check savings +ls -lh "$SHOT" "$OUT" +``` + +**Typical results:** 116 KB raw → 44 KB optimized (62% reduction). 
+ +For multiple files: +```bash +SHOTS_DIR="/run/user/1000/agent-browser/tmp/screenshots" +ASSETS_DIR="/home/christso/projects/agentv/apps/web/src/assets/screenshots" + +for f in shot1.png shot2.png shot3.png; do + pngquant --quality 80-95 --force --output "$SHOTS_DIR/opt-$f" "$SHOTS_DIR/$f" + optipng -o5 -quiet "$SHOTS_DIR/opt-$f" + cp "$SHOTS_DIR/opt-$f" "$ASSETS_DIR/$f" +done +ls -lh "$ASSETS_DIR" +``` + +## Step 3 — Update Astro Docs + +Docs live at: `apps/web/src/content/docs/docs/` +Assets live at: `apps/web/src/assets/screenshots/` + +**Import pattern** (Astro `<Image>` for automatic optimization): +```mdx +import { Image } from 'astro:assets'; +import myFeature from '../../../../assets/screenshots/my-feature.png'; +import myDetail from '../../../../assets/screenshots/my-detail.png'; + +<Image src={myFeature} alt="Descriptive alt text for accessibility" /> +``` + +**Alt text rules:** +- Describe what the screenshot shows, not just what the feature is +- Include key data visible in the image (e.g., "showing 100% pass rate across 5 tests") +- Never use "screenshot of" — just describe the content + +**Placement:** +- Put the hero image directly after the intro paragraph (before ## Usage) +- Put feature-specific images directly after the section that describes them +- Don't cluster all images at the top or bottom + +## Step 4 — Commit + +```bash +# Feature branch: UI changes +cd /path/to/worktree +git add apps/studio/... +git commit -m "fix(studio): ..." 
+ +# Main repo: docs changes +cd /home/christso/projects/agentv +git add apps/web/src/assets/screenshots/ apps/web/src/content/docs/ +git commit -m "docs(<scope>): add screenshots and update documentation" +git push +``` + +## Checklist + +- [ ] Screenshots show realistic data (multiple runs, real targets, varying scores) +- [ ] Viewport set to 1440×860 before capturing +- [ ] Both pngquant and optipng applied +- [ ] File size verified (target: <50 KB per screenshot) +- [ ] Alt text is descriptive and specific +- [ ] Image placed close to the content it illustrates +- [ ] Astro `<Image>` component used (not raw `<img>`) +- [ ] Docs committed separately from code changes