Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
26a1950
feat(results): export remote runs and sync Studio
christso Apr 9, 2026
9c52d6c
fix(results): sync follow-up polish
christso Apr 9, 2026
275ea36
fix(results): satisfy remote repo exports and lint
christso Apr 9, 2026
7221f6a
fix(pipeline): use DEFAULT_THRESHOLD in bench export summary instead …
christso Apr 9, 2026
2af3f08
fix(studio): context-aware source toolbar and empty states
christso Apr 9, 2026
ec86828
docs(studio): add remote results docs and optimized screenshots
christso Apr 9, 2026
d2548e8
docs(agents): register image-compress-and-docs skill in AGENTS.md
christso Apr 9, 2026
18c7a2a
feat(skills): add image-compress-and-docs skill
christso Apr 9, 2026
f268ec7
docs(agents): fix skill location path in AGENTS.md
christso Apr 9, 2026
b9c66d8
feat(plugins): add agentv-self plugin with image-compress-and-docs skill
christso Apr 9, 2026
18e3774
fix(studio): replace Status column with ERR badge in score cell on ex…
christso Apr 9, 2026
ad0df1d
fix(studio): show ERR badge instead of 0% score on execution errors; …
christso Apr 9, 2026
d554083
feat(studio): cleaner run list and detail UI inspired by Convex Evals
christso Apr 9, 2026
e4041ff
feat(core): add category field to eval YAML schema; update screenshots
christso Apr 9, 2026
3a914c6
chore: apply biome formatting to eval-schema.json
christso Apr 9, 2026
3357265
fix(studio): move date to last column, rename to When
christso Apr 9, 2026
b50718d
docs(studio): update screenshots with When column and category breakdown
christso Apr 9, 2026
32292cc
fix(studio): solid pill style for pass-rate badges
christso Apr 9, 2026
e83ada3
fix(studio): muted pill style for pass-rate badges
christso Apr 9, 2026
8f1f06a
docs(studio): update screenshots with muted pill style
christso Apr 9, 2026
c3ca550
feat(studio): progress-bar pill with fill + text inside
christso Apr 9, 2026
9d298f8
feat(studio): compact inline stats bar matching table width; update s…
christso Apr 9, 2026
ea83509
feat(studio): experiment·target heading with muted metadata subheadin…
christso Apr 9, 2026
2d5eebb
fix(studio): use single blue fill for pass-rate pills
christso Apr 9, 2026
0627be5
feat(studio): gradient blue pills; muted stat colors; emoji tabs
christso Apr 9, 2026
07242cd
feat(studio): show project name in heading; drop Source column from r…
christso Apr 9, 2026
8296384
feat(studio): add Passed/Failed/Total columns to run list
christso Apr 9, 2026
836a4a5
fix(studio): restore semantic colors for stats and status dots; add P…
christso Apr 9, 2026
bd1f3d9
feat(studio): extract PassRatePill; use pill everywhere; Evals column…
christso Apr 9, 2026
3996958
docs(studio): update all three screenshots
christso Apr 9, 2026
cb32942
fix(studio): use unambiguous date format (09 Apr 2026); update screen…
christso Apr 9, 2026
a615522
fix(studio): use en-AU locale for day-first date format (09 Apr 2026)
christso Apr 9, 2026
3b350e3
fix(studio): manually format date as DD MMM YYYY (guaranteed day-first)
christso Apr 9, 2026
ce01810
fix(studio): match Convex date format — relative time for same day, l…
christso Apr 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -497,3 +497,4 @@ bun run promote:latest 2.18.0

## Python Scripts
When running Python scripts, always use: `uv run <script.py>`

48 changes: 48 additions & 0 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import path from 'node:path';
import { pathToFileURL } from 'node:url';

import {
DEFAULT_THRESHOLD,
type EvalTest,
type EvaluationCache,
type EvaluationResult,
Expand All @@ -28,6 +29,7 @@ import {
} from '@agentv/core';

import { enforceRequiredVersion } from '../../version-check.js';
import { maybeAutoExportRunArtifacts } from '../results/remote.js';
import { writeArtifactsFromResults } from './artifact-writer.js';
import { writeBenchmarkJson } from './benchmark-writer.js';
import { loadEnvFromHierarchy } from './env.js';
Expand Down Expand Up @@ -858,6 +860,11 @@ export interface RunEvalResult {
readonly allExecutionErrors?: boolean;
}

/**
 * Accumulator pairing an eval file (path relative to cwd) with the evaluation
 * results collected for it across the run. `results` is intentionally mutable:
 * when the same eval file is processed again, new results are pushed into the
 * existing entry before the per-file summaries are exported.
 */
interface RemoteEvalSummaryInput {
  // Eval file path relative to cwd (see `path.relative(cwd, testFilePath)` at the call site).
  readonly evalFile: string;
  // Mutable on purpose — merged in place via `existingSummary.results.push(...)`.
  readonly results: EvaluationResult[];
}

export async function runEvalCommand(
input: RunEvalCommandInput,
): Promise<RunEvalResult | undefined> {
Expand Down Expand Up @@ -1077,6 +1084,7 @@ export async function runEvalCommand(
// We defer cache creation until after file metadata is loaded
const evaluationRunner = await resolveEvaluationRunner();
const allResults: EvaluationResult[] = [];
const remoteEvalSummaries: RemoteEvalSummaryInput[] = [];
const seenTestCases = new Set<string>();
const displayIdTracker = createDisplayIdTracker();

Expand Down Expand Up @@ -1352,6 +1360,18 @@ export async function runEvalCommand(
threshold: resolvedThreshold,
providerFactory: transcriptProviderFactory,
});
const evalFile = path.relative(cwd, testFilePath);
const existingSummary = remoteEvalSummaries.find(
(summary) => summary.evalFile === evalFile,
);
if (existingSummary) {
existingSummary.results.push(...result.results);
} else {
remoteEvalSummaries.push({
evalFile,
results: [...result.results],
});
}

return result.results;
} catch (fileError) {
Expand Down Expand Up @@ -1472,6 +1492,34 @@ export async function runEvalCommand(

// Persist last run path for `agentv results` commands
await saveRunCache(cwd, outputPath).catch(() => undefined);

await maybeAutoExportRunArtifacts({
cwd,
run_dir: runDir,
test_files: activeTestFiles,
results: allResults,
eval_summaries: remoteEvalSummaries.map((summary) => ({
eval_file: summary.evalFile,
total: summary.results.length,
passed: summary.results.filter((result) => result.score >= DEFAULT_THRESHOLD).length,
avg_score:
summary.results.length > 0
? summary.results.reduce((sum, result) => sum + result.score, 0) /
summary.results.length
: 0,
results: summary.results.map((result) => ({
test_id: result.testId,
score: result.score,
status:
result.executionStatus === 'execution_error' || result.error
? 'ERROR'
: result.score >= DEFAULT_THRESHOLD
? 'PASS'
: 'FAIL',
})),
})),
experiment: normalizeExperimentName(options.experiment),
});
}

// Suggest retry-errors command when execution errors are detected
Expand Down
17 changes: 11 additions & 6 deletions apps/cli/src/commands/inspect/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -569,12 +569,7 @@ function collectRunManifestPaths(
}
}

/**
* Enumerate canonical run manifests in `.agentv/results/runs/`.
*/
export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {
const runsDir = path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME);

export function listResultFilesFromRunsDir(runsDir: string, limit?: number): ResultFileMeta[] {
const files: { filePath: string; displayName: string; runId: string }[] = [];

try {
Expand Down Expand Up @@ -626,6 +621,16 @@ export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {
return metas;
}

/**
 * Enumerate canonical run manifests in `.agentv/results/runs/`.
 *
 * Thin convenience wrapper: resolves the runs directory under `cwd` and
 * delegates the actual enumeration to {@link listResultFilesFromRunsDir}.
 */
export function listResultFiles(cwd: string, limit?: number): ResultFileMeta[] {
  const runsDir = path.join(cwd, '.agentv', 'results', RESULT_RUNS_DIRNAME);
  return listResultFilesFromRunsDir(runsDir, limit);
}

/**
* Extract ISO timestamp from eval filename like eval_2026-02-20T21-38-05-833Z.jsonl
*/
Expand Down
45 changes: 45 additions & 0 deletions apps/cli/src/commands/pipeline/bench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ import { join } from 'node:path';

import { command, positional, string } from 'cmd-ts';

import { DEFAULT_THRESHOLD, type EvaluationResult } from '@agentv/core';
import { maybeAutoExportRunArtifacts } from '../results/remote.js';

interface EvaluatorScore {
readonly name: string;
readonly type: string;
Expand Down Expand Up @@ -223,6 +226,48 @@ export const evalBenchCommand = command({
);

console.log(`Benchmark: ${testIds.length} test(s), pass_rate=${passRateStats.mean}`);

const results = indexLines.map((line) => JSON.parse(line)) as Array<{
test_id: string;
score: number;
execution_status?: string;
target?: string;
timestamp?: string;
}>;
await maybeAutoExportRunArtifacts({
cwd: process.cwd(),
run_dir: exportDir,
experiment,
test_files: manifest.eval_file ? [manifest.eval_file] : [],
results: results.map((result) => ({
testId: result.test_id,
score: result.score,
executionStatus: result.execution_status,
target: result.target,
timestamp: result.timestamp,
})) as EvaluationResult[],
eval_summaries: [
{
eval_file: manifest.eval_file ?? 'pipeline',
total: results.length,
passed: results.filter((result) => result.score >= DEFAULT_THRESHOLD).length,
avg_score:
results.length > 0
? results.reduce((sum, result) => sum + result.score, 0) / results.length
: 0,
results: results.map((result) => ({
test_id: result.test_id,
score: result.score,
status:
result.execution_status === 'execution_error'
? 'ERROR'
: result.score >= DEFAULT_THRESHOLD
? 'PASS'
: 'FAIL',
})),
},
],
});
},
});

Expand Down
Loading
Loading