3 changes: 3 additions & 0 deletions apps/cli/src/commands/pipeline/bench.ts
@@ -46,6 +46,7 @@ export const evalBenchCommand = command({
const testIds: string[] = manifest.test_ids;
const targetName: string = manifest.target?.name ?? 'unknown';
const evalSet: string = manifest.eval_set ?? '';
const experiment: string | undefined = manifest.experiment;
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : '';

// Read LLM scores from file or stdin
@@ -189,6 +190,7 @@ export const evalBenchCommand = command({
timestamp: manifest.timestamp,
test_id: testId,
eval_set: evalSet || undefined,
experiment: experiment || undefined,
score: Math.round(weightedScore * 1000) / 1000,
target: targetName,
scores,
@@ -213,6 +215,7 @@
metadata: {
eval_file: manifest.eval_file,
timestamp: manifest.timestamp,
experiment: experiment || undefined,
targets: [targetName],
tests_run: testIds,
},
8 changes: 7 additions & 1 deletion apps/cli/src/commands/pipeline/input.ts
@@ -45,8 +45,13 @@ export const evalInputCommand = command({
description:
'Output directory for extracted inputs (default: .agentv/results/runs/<timestamp>)',
}),
experiment: option({
type: optional(string),
long: 'experiment',
description: 'Experiment label (e.g. with_skills, without_skills)',
}),
},
handler: async ({ evalPath, out }) => {
handler: async ({ evalPath, out, experiment }) => {
const resolvedEvalPath = resolve(evalPath);
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
@@ -155,6 +160,7 @@ export const evalInputCommand = command({
await writeJson(join(outDir, 'manifest.json'), {
eval_file: resolvedEvalPath,
eval_set: evalSetName || undefined,
experiment: experiment || undefined,
timestamp: new Date().toISOString(),
target: {
name: targetName,
8 changes: 7 additions & 1 deletion apps/cli/src/commands/pipeline/run.ts
@@ -67,8 +67,13 @@ export const evalRunCommand = command({
long: 'workers',
description: 'Parallel workers for target invocation (default: all tests)',
}),
experiment: option({
type: optional(string),
long: 'experiment',
description: 'Experiment label (e.g. with_skills, without_skills)',
}),
},
handler: async ({ evalPath, out, workers }) => {
handler: async ({ evalPath, out, workers, experiment }) => {
const resolvedEvalPath = resolve(evalPath);
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
@@ -174,6 +179,7 @@ export const evalRunCommand = command({
await writeJson(join(outDir, 'manifest.json'), {
eval_file: resolvedEvalPath,
eval_set: evalSetName || undefined,
experiment: experiment || undefined,
timestamp: new Date().toISOString(),
target: { name: targetName, kind: targetKind },
test_ids: testIds,
3 changes: 3 additions & 0 deletions apps/cli/src/commands/results/manifest.ts
@@ -12,6 +12,7 @@ export interface ResultManifestRecord {
readonly test_id?: string;
readonly eval_id?: string;
readonly eval_set?: string;
readonly experiment?: string;
readonly target?: string;
readonly score: number;
readonly scores?: readonly Record<string, unknown>[];
@@ -191,6 +192,7 @@ export function loadManifestResults(sourceFile: string): EvaluationResult[] {
export interface LightweightResultRecord {
readonly testId: string;
readonly target?: string;
readonly experiment?: string;
readonly score: number;
readonly scores?: readonly Record<string, unknown>[];
readonly executionStatus?: string;
@@ -206,6 +208,7 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRecord[] {
return parseResultManifest(content).map((record) => ({
testId: record.test_id ?? record.eval_id ?? 'unknown',
target: record.target,
experiment: record.experiment,
score: record.score,
scores: record.scores,
executionStatus: record.execution_status,
36 changes: 36 additions & 0 deletions apps/cli/test/commands/eval/pipeline/bench.test.ts
@@ -113,4 +113,40 @@ describe('pipeline bench', () => {
expect(benchmark.metadata.targets).toContain('test-target');
expect(benchmark.run_summary['test-target']).toBeDefined();
});

it('propagates experiment from manifest to index.jsonl and benchmark.json', async () => {
// Overwrite manifest with experiment field
await writeFile(
join(OUT_DIR, 'manifest.json'),
JSON.stringify({
eval_file: 'test.eval.yaml',
timestamp: new Date().toISOString(),
experiment: 'without_skills',
target: { name: 'test-target', kind: 'cli' },
test_ids: ['test-01'],
}),
);

const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: '{}' });

const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
const entry = JSON.parse(indexContent.trim().split('\n')[0]);
expect(entry.experiment).toBe('without_skills');

const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8'));
expect(benchmark.metadata.experiment).toBe('without_skills');
});

it('omits experiment from output when manifest has no experiment', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: '{}' });

const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
const entry = JSON.parse(indexContent.trim().split('\n')[0]);
expect(entry.experiment).toBeUndefined();

const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8'));
expect(benchmark.metadata.experiment).toBeUndefined();
});
});
25 changes: 25 additions & 0 deletions apps/cli/test/commands/eval/pipeline/input.test.ts
@@ -77,4 +77,29 @@ describe('pipeline input', () => {
);
expect(invoke.kind).toBeDefined();
});

it('writes experiment to manifest when --experiment is provided', async () => {
const { execa } = await import('execa');
await execa('bun', [
CLI_ENTRY,
'pipeline',
'input',
EVAL_PATH,
'--out',
OUT_DIR,
'--experiment',
'without_skills',
]);

const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8'));
expect(manifest.experiment).toBe('without_skills');
});

it('omits experiment from manifest when --experiment is not provided', async () => {
const { execa } = await import('execa');
await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]);

const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8'));
expect(manifest.experiment).toBeUndefined();
});
});
15 changes: 13 additions & 2 deletions apps/web/src/content/docs/evaluation/running-evals.mdx
@@ -11,7 +11,7 @@ sidebar:
agentv eval evals/my-eval.yaml
```

Results are written to `.agentv/results/eval_<timestamp>.jsonl`. Each line is a JSON object with one result per test case.
Results are written to `.agentv/results/<timestamp>.jsonl`. Each line is a JSON object with one result per test case.
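
A result line is a flat JSON object. A rough sketch, with illustrative values (the exact field set depends on the run):

```json
{"test_id": "review-null-check", "eval_set": "my-eval", "target": "claude-cli", "score": 0.917, "execution_status": "completed"}
```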

Each `scores[]` entry includes per-grader timing:

@@ -47,6 +47,17 @@ Run against a different target than specified in the eval file:
agentv eval --target azure-base evals/**/*.yaml
```

### Experiment Label

Tag a pipeline run with an experiment name to track different conditions (e.g. with vs without skills):

```bash
agentv pipeline run evals/my-eval.yaml --experiment with_skills
agentv pipeline run evals/my-eval.yaml --experiment without_skills
```

The experiment label is written to `manifest.json` and propagated to each entry in `index.jsonl` by `pipeline bench`. The eval file stays the same across experiments — what changes is the environment. Dashboards can filter and compare results by experiment.
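
For reference, the `manifest.json` for such a run carries the label alongside the usual run metadata. A sketch with illustrative values:

```json
{
  "eval_file": "/path/to/evals/my-eval.yaml",
  "eval_set": "my-eval",
  "experiment": "with_skills",
  "timestamp": "2026-03-28T10:00:00.000Z",
  "target": { "name": "claude-cli", "kind": "cli" },
  "test_ids": ["test-01"]
}
```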

### Run Specific Test

Run a single test by ID:
@@ -82,7 +93,7 @@ result-oriented workflows. For full-fidelity span inspection, export OTLP JSON e

```bash
# Summary-level inspection from the run manifest
agentv trace stats .agentv/results/runs/eval_<timestamp>/index.jsonl
agentv trace stats .agentv/results/runs/<timestamp>/index.jsonl

# Full-fidelity OTLP JSON trace (importable by OTel backends like Jaeger, Grafana)
agentv eval evals/my-eval.yaml --otel-file traces/eval.otlp.json
16 changes: 16 additions & 0 deletions apps/web/src/content/docs/guides/skill-improvement-workflow.mdx
@@ -275,6 +275,22 @@ If you've been using the Agent Skills skill-creator workflow, AgentV reads your

**Key takeaway:** You do not need to rewrite your `evals.json`. AgentV reads it directly and adds a richer evaluation engine on top.

## Using Experiments for Baseline vs Candidate

The `--experiment` flag provides a structured way to label baseline and candidate runs without separate eval files:

```bash
# Baseline: run without skills installed
agentv pipeline run evals/my-eval.yaml --experiment without_skills

# Candidate: run with skills installed
agentv pipeline run evals/my-eval.yaml --experiment with_skills
```

Both runs use the same eval file and produce separate run directories. The experiment label is recorded in `manifest.json` and `index.jsonl`, making it easy to filter and compare in dashboards.
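
Until a dashboard is wired up, a quick local comparison works with standard tools. For example, grouping graded `index.jsonl` entries by experiment with `jq` (a sketch; assumes `jq` is installed and both runs are graded):

```bash
# Mean score per experiment across all runs (illustrative; adjust the glob to your runs)
cat .agentv/results/runs/*/index.jsonl | jq -s '
  group_by(.experiment)
  | map({experiment: .[0].experiment, mean_score: (map(.score) | add / length)})'
```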

This replaces the need for separate `--target baseline` / `--target candidate` configurations when the only difference between runs is the workspace setup (skills, config, etc.) rather than the target harness.

## Baseline Comparison Best Practices

### Discovery-path contamination
67 changes: 67 additions & 0 deletions examples/features/experiments/README.md
@@ -0,0 +1,67 @@
# Experiments

Demonstrates using the `--experiment` flag to compare evaluation runs under different conditions while keeping test cases identical.

## What This Shows

- Running the same eval file with different experiment labels
- Comparing results across experiments (e.g. with vs without skills)
- One run = one target × one experiment, recorded in `manifest.json`

## Concept

An **experiment** is a run-level label that records the conditions under which an eval was executed. The eval file stays the same — what changes is the environment (skills installed, web search enabled, different system prompt, etc.).

| Experiment | What changes | Eval file |
|---|---|---|
| `with_skills` | Skills installed in workspace | Same `coding-ability.eval.yaml` |
| `without_skills` | No skills in workspace | Same file |
| `web_search` | Web search tool enabled | Same file |

## Running

```bash
# From repository root

# Run with skills (set up workspace with skills first, then run)
agentv pipeline run examples/features/experiments/evals/coding-ability.eval.yaml \
--experiment with_skills

# Run without skills (same eval, clean workspace)
agentv pipeline run examples/features/experiments/evals/coding-ability.eval.yaml \
--experiment without_skills
```

The experiment label is written to `manifest.json` and propagated to `index.jsonl` entries by `pipeline bench`. This enables dashboards to filter and compare results by experiment.
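
After grading, each `index.jsonl` entry carries the label. A sketch with illustrative values:

```json
{"test_id": "review-null-check", "experiment": "with_skills", "score": 0.917, "target": "claude-cli", "execution_status": "completed"}
```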

## Output

Each run produces a separate directory. The experiment is metadata, not a path segment:

```
.agentv/results/runs/
├── 2026-03-28T10-00-00-000Z/ # with_skills run
│ ├── manifest.json # { "experiment": "with_skills", ... }
│ └── coding-ability/
│ ├── review-null-check/
│ └── review-clean-function/
└── 2026-03-28T10-05-00-000Z/ # without_skills run
├── manifest.json # { "experiment": "without_skills", ... }
└── coding-ability/
├── review-null-check/
└── review-clean-function/
```

## Comparing Experiments

After both runs complete and are graded:

```bash
# Compare the two runs
agentv compare .agentv/results/runs/<with-skills-ts>/index.jsonl \
.agentv/results/runs/<without-skills-ts>/index.jsonl
```

## Key Files

- `evals/coding-ability.eval.yaml` - Shared test cases (same for all experiments)
30 changes: 30 additions & 0 deletions examples/features/experiments/evals/coding-ability.eval.yaml
@@ -0,0 +1,30 @@
name: coding-ability
tests:
- id: review-null-check
input: |
Review this TypeScript function for bugs:

function getUser(users: Map<string, User>, id: string) {
return users.get(id).name;
}
criteria: Identifies the potential undefined access when the key is missing from the map
assertions:
- name: mentions_undefined
type: contains
value: "undefined"
- name: suggests_fix
type: llm-grader
prompt: Does the review identify that users.get(id) can return undefined and suggest a fix?

- id: review-clean-function
input: |
Review this TypeScript function for bugs:

function add(a: number, b: number): number {
return a + b;
}
criteria: Recognizes the function is correct and does not flag false issues
assertions:
- name: no_false_positives
type: llm-grader
prompt: Does the review correctly identify this function as simple and correct without flagging false issues?