From 61c2c7afddf417a235e1c15721496fceb2e65211 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 05:31:14 +0000 Subject: [PATCH] feat(pipeline): add --experiment flag, feature example, and docs Add --experiment option to pipeline input and pipeline run commands. The label is written to manifest.json and propagated through pipeline bench into index.jsonl entries and benchmark.json metadata. - pipeline input: accepts --experiment, writes to manifest - pipeline run: accepts --experiment, writes to manifest - pipeline bench: reads manifest.experiment, includes in index entries - New feature example: examples/features/experiments/ - Docs: add experiment section to running-evals.mdx - Docs: add experiments workflow to skill-improvement-workflow.mdx - Tests: 2 new tests for experiment flag presence/absence Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/pipeline/bench.ts | 3 + apps/cli/src/commands/pipeline/input.ts | 8 ++- apps/cli/src/commands/pipeline/run.ts | 8 ++- apps/cli/src/commands/results/manifest.ts | 3 + .../test/commands/eval/pipeline/bench.test.ts | 36 ++++++++++ .../test/commands/eval/pipeline/input.test.ts | 25 +++++++ .../content/docs/evaluation/running-evals.mdx | 15 ++++- .../guides/skill-improvement-workflow.mdx | 16 +++++ examples/features/experiments/README.md | 67 +++++++++++++++++++ .../evals/coding-ability.eval.yaml | 30 +++++++++ 10 files changed, 207 insertions(+), 4 deletions(-) create mode 100644 examples/features/experiments/README.md create mode 100644 examples/features/experiments/evals/coding-ability.eval.yaml diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index 2587913aa..547d926a0 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -46,6 +46,7 @@ export const evalBenchCommand = command({ const testIds: string[] = manifest.test_ids; const targetName: string = manifest.target?.name ?? 'unknown'; const evalSet: string = manifest.eval_set ?? ''; + const experiment: string | undefined = manifest.experiment; const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : ''; // Read LLM scores from file or stdin @@ -189,6 +190,7 @@ export const evalBenchCommand = command({ timestamp: manifest.timestamp, test_id: testId, eval_set: evalSet || undefined, + experiment: experiment || undefined, score: Math.round(weightedScore * 1000) / 1000, target: targetName, scores, @@ -213,6 +215,7 @@ export const evalBenchCommand = command({ metadata: { eval_file: manifest.eval_file, timestamp: manifest.timestamp, + experiment: experiment || undefined, targets: [targetName], tests_run: testIds, }, diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index d33737e6a..745947e76 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -45,8 +45,13 @@ export const evalInputCommand = command({ description: 'Output directory for extracted inputs (default: .agentv/results/runs/)', }), + experiment: option({ + type: optional(string), + long: 'experiment', + description: 'Experiment label (e.g. with_skills, without_skills)', + }), }, - handler: async ({ evalPath, out }) => { + handler: async ({ evalPath, out, experiment }) => { const resolvedEvalPath = resolve(evalPath); const outDir = resolve(out ?? 
buildDefaultRunDir(process.cwd())); const repoRoot = await findRepoRoot(dirname(resolvedEvalPath)); @@ -155,6 +160,7 @@ export const evalInputCommand = command({ await writeJson(join(outDir, 'manifest.json'), { eval_file: resolvedEvalPath, eval_set: evalSetName || undefined, + experiment: experiment || undefined, timestamp: new Date().toISOString(), target: { name: targetName, diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 482fe0ed8..e5cbabfe2 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -67,8 +67,13 @@ export const evalRunCommand = command({ long: 'workers', description: 'Parallel workers for target invocation (default: all tests)', }), + experiment: option({ + type: optional(string), + long: 'experiment', + description: 'Experiment label (e.g. with_skills, without_skills)', + }), }, - handler: async ({ evalPath, out, workers }) => { + handler: async ({ evalPath, out, workers, experiment }) => { const resolvedEvalPath = resolve(evalPath); const outDir = resolve(out ?? buildDefaultRunDir(process.cwd())); const repoRoot = await findRepoRoot(dirname(resolvedEvalPath)); @@ -174,6 +179,7 @@ export const evalRunCommand = command({ await writeJson(join(outDir, 'manifest.json'), { eval_file: resolvedEvalPath, eval_set: evalSetName || undefined, + experiment: experiment || undefined, timestamp: new Date().toISOString(), target: { name: targetName, kind: targetKind }, test_ids: testIds, diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 7e640e75f..2fcd22a47 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -12,6 +12,7 @@ export interface ResultManifestRecord { readonly test_id?: string; readonly eval_id?: string; readonly eval_set?: string; + readonly experiment?: string; readonly target?: string; readonly score: number; readonly scores?: readonly Record[]; @@ -191,6 +192,7 @@ export function loadManifestResults(sourceFile: string): EvaluationResult[] { export interface LightweightResultRecord { readonly testId: string; readonly target?: string; + readonly experiment?: string; readonly score: number; readonly scores?: readonly Record[]; readonly executionStatus?: string; @@ -206,6 +208,7 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec return parseResultManifest(content).map((record) => ({ testId: record.test_id ?? record.eval_id ?? 
'unknown', target: record.target, + experiment: record.experiment, score: record.score, scores: record.scores, executionStatus: record.execution_status, diff --git a/apps/cli/test/commands/eval/pipeline/bench.test.ts b/apps/cli/test/commands/eval/pipeline/bench.test.ts index 3c3005aa2..f6225bbcb 100644 --- a/apps/cli/test/commands/eval/pipeline/bench.test.ts +++ b/apps/cli/test/commands/eval/pipeline/bench.test.ts @@ -113,4 +113,40 @@ describe('pipeline bench', () => { expect(benchmark.metadata.targets).toContain('test-target'); expect(benchmark.run_summary['test-target']).toBeDefined(); }); + + it('propagates experiment from manifest to index.jsonl and benchmark.json', async () => { + // Overwrite manifest with experiment field + await writeFile( + join(OUT_DIR, 'manifest.json'), + JSON.stringify({ + eval_file: 'test.eval.yaml', + timestamp: new Date().toISOString(), + experiment: 'without_skills', + target: { name: 'test-target', kind: 'cli' }, + test_ids: ['test-01'], + }), + ); + + const { execa } = await import('execa'); + await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: '{}' }); + + const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); + const entry = JSON.parse(indexContent.trim().split('\n')[0]); + expect(entry.experiment).toBe('without_skills'); + + const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8')); + expect(benchmark.metadata.experiment).toBe('without_skills'); + }); + + it('omits experiment from output when manifest has no experiment', async () => { + const { execa } = await import('execa'); + await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: '{}' }); + + const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); + const entry = JSON.parse(indexContent.trim().split('\n')[0]); + expect(entry.experiment).toBeUndefined(); + + const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8')); + expect(benchmark.metadata.experiment).toBeUndefined(); + }); }); diff --git a/apps/cli/test/commands/eval/pipeline/input.test.ts b/apps/cli/test/commands/eval/pipeline/input.test.ts index 47e9a9ff9..12194f4dc 100644 --- a/apps/cli/test/commands/eval/pipeline/input.test.ts +++ b/apps/cli/test/commands/eval/pipeline/input.test.ts @@ -77,4 +77,29 @@ describe('pipeline input', () => { ); expect(invoke.kind).toBeDefined(); }); + + it('writes experiment to manifest when --experiment is provided', async () => { + const { execa } = await import('execa'); + await execa('bun', [ + CLI_ENTRY, + 'pipeline', + 'input', + EVAL_PATH, + '--out', + OUT_DIR, + '--experiment', + 'without_skills', + ]); + + const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8')); + expect(manifest.experiment).toBe('without_skills'); + }); + + it('omits experiment from manifest when --experiment is not provided', async () => { + const { execa } = await import('execa'); + await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]); + + const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8')); + expect(manifest.experiment).toBeUndefined(); + }); }); diff --git a/apps/web/src/content/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/evaluation/running-evals.mdx index 6c34ec549..08f93c66c 100644 --- a/apps/web/src/content/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/evaluation/running-evals.mdx @@ -11,7 +11,7 @@ sidebar: agentv eval evals/my-eval.yaml ``` -Results are written to 
`.agentv/results/eval_.jsonl`. Each line is a JSON object with one result per test case. +Results are written to `.agentv/results/.jsonl`. Each line is a JSON object with one result per test case. Each `scores[]` entry includes per-grader timing: @@ -47,6 +47,17 @@ Run against a different target than specified in the eval file: agentv eval --target azure-base evals/**/*.yaml ``` +### Experiment Label + +Tag a pipeline run with an experiment name to track different conditions (e.g. with vs without skills): + +```bash +agentv pipeline run evals/my-eval.yaml --experiment with_skills +agentv pipeline run evals/my-eval.yaml --experiment without_skills +``` + +The experiment label is written to `manifest.json` and propagated to each entry in `index.jsonl` by `pipeline bench`. The eval file stays the same across experiments — what changes is the environment. Dashboards can filter and compare results by experiment. + ### Run Specific Test Run a single test by ID: @@ -82,7 +93,7 @@ result-oriented workflows. For full-fidelity span inspection, export OTLP JSON e ```bash # Summary-level inspection from the run manifest -agentv trace stats .agentv/results/runs/eval_/index.jsonl +agentv trace stats .agentv/results/runs//index.jsonl # Full-fidelity OTLP JSON trace (importable by OTel backends like Jaeger, Grafana) agentv eval evals/my-eval.yaml --otel-file traces/eval.otlp.json diff --git a/apps/web/src/content/docs/guides/skill-improvement-workflow.mdx b/apps/web/src/content/docs/guides/skill-improvement-workflow.mdx index 39b009965..6843e5b81 100644 --- a/apps/web/src/content/docs/guides/skill-improvement-workflow.mdx +++ b/apps/web/src/content/docs/guides/skill-improvement-workflow.mdx @@ -275,6 +275,22 @@ If you've been using the Agent Skills skill-creator workflow, AgentV reads your **Key takeaway:** You do not need to rewrite your `evals.json`. AgentV reads it directly and adds a richer evaluation engine on top. +## Using Experiments for Baseline vs Candidate + +The `--experiment` flag provides a structured way to label baseline and candidate runs without separate eval files: + +```bash +# Baseline: run without skills installed +agentv pipeline run evals/my-eval.yaml --experiment without_skills + +# Candidate: run with skills installed +agentv pipeline run evals/my-eval.yaml --experiment with_skills +``` + +Both runs use the same eval file and produce separate run directories. The experiment label is recorded in `manifest.json` and `index.jsonl`, making it easy to filter and compare in dashboards. + +This replaces the need for separate `--target baseline` / `--target candidate` configurations when the only difference between runs is the workspace setup (skills, config, etc.) rather than the target harness. + ## Baseline Comparison Best Practices ### Discovery-path contamination diff --git a/examples/features/experiments/README.md b/examples/features/experiments/README.md new file mode 100644 index 000000000..cda2f0eda --- /dev/null +++ b/examples/features/experiments/README.md @@ -0,0 +1,67 @@ +# Experiments + +Demonstrates using the `--experiment` flag to compare evaluation runs under different conditions while keeping test cases identical. + +## What This Shows + +- Running the same eval file with different experiment labels +- Comparing results across experiments (e.g. with vs without skills) +- One run = one target x one experiment, recorded in `manifest.json` + +## Concept + +An **experiment** is a run-level label that records the conditions under which an eval was executed. 
The eval file stays the same — what changes is the environment (skills installed, web search enabled, different system prompt, etc.). + +| Experiment | What changes | Eval file | +|---|---|---| +| `with_skills` | Skills installed in workspace | Same `coding-ability.eval.yaml` | +| `without_skills` | No skills in workspace | Same file | +| `web_search` | Web search tool enabled | Same file | + +## Running + +```bash +# From repository root + +# Run with skills (set up workspace with skills first, then run) +agentv pipeline run examples/features/experiments/evals/coding-ability.eval.yaml \ + --experiment with_skills + +# Run without skills (same eval, clean workspace) +agentv pipeline run examples/features/experiments/evals/coding-ability.eval.yaml \ + --experiment without_skills +``` + +The experiment label is written to `manifest.json` and propagated to `index.jsonl` entries by `pipeline bench`. This enables dashboards to filter and compare results by experiment. + +## Output + +Each run produces a separate directory. The experiment is metadata, not a path segment: + +``` +.agentv/results/runs/ +├── 2026-03-28T10-00-00-000Z/ # with_skills run +│ ├── manifest.json # { "experiment": "with_skills", ... } +│ └── coding-ability/ +│ ├── review-null-check/ +│ └── review-clean-function/ +└── 2026-03-28T10-05-00-000Z/ # without_skills run + ├── manifest.json # { "experiment": "without_skills", ... } + └── coding-ability/ + ├── review-null-check/ + └── review-clean-function/ +``` + +## Comparing experiments + +After both runs complete and are graded: + +```bash +# Compare the two runs +agentv compare .agentv/results/runs//index.jsonl \ + .agentv/results/runs//index.jsonl +``` + +## Key Files + +- `evals/coding-ability.eval.yaml` - Shared test cases (same for all experiments) diff --git a/examples/features/experiments/evals/coding-ability.eval.yaml b/examples/features/experiments/evals/coding-ability.eval.yaml new file mode 100644 index 000000000..5441cf147 --- /dev/null +++ b/examples/features/experiments/evals/coding-ability.eval.yaml @@ -0,0 +1,30 @@ +name: coding-ability +tests: + - id: review-null-check + input: | + Review this TypeScript function for bugs: + + function getUser(users: Map, id: string) { + return users.get(id).name; + } + criteria: Identifies the potential undefined access when the key is missing from the map + assertions: + - name: mentions_undefined + type: contains + value: "undefined" + - name: suggests_fix + type: llm-grader + prompt: Does the review identify that users.get(id) can return undefined and suggest a fix? + + - id: review-clean-function + input: | + Review this TypeScript function for bugs: + + function add(a: number, b: number): number { + return a + b; + } + criteria: Recognizes the function is correct and does not flag false issues + assertions: + - name: no_false_positives + type: llm-grader + prompt: Does the review correctly identify this function as simple and correct without flagging false issues?
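
Since each `index.jsonl` entry now carries the optional `experiment` field, downstream tooling can aggregate scores across runs by label. A minimal sketch of such a consumer, assuming only the entry shape written by `pipeline bench` above (`experiment`, `score`, `test_id`, `target`); the path argument and the aggregation itself are illustrative, not an agentv command:

```typescript
// Illustrative sketch: group index.jsonl entries by experiment label and
// print a mean score per experiment. Assumes the entry shape emitted by
// `pipeline bench` in this patch; the CLI argument handling is hypothetical.
import { readFileSync } from 'node:fs';

interface IndexEntry {
  experiment?: string;
  score: number;
  test_id: string;
  target: string;
}

const path = process.argv[2] ?? 'index.jsonl';
const lines = readFileSync(path, 'utf8').trim().split('\n').filter(Boolean);

const byExperiment = new Map<string, number[]>();
for (const line of lines) {
  const entry = JSON.parse(line) as IndexEntry;
  const key = entry.experiment ?? '(none)';
  const scores = byExperiment.get(key) ?? [];
  scores.push(entry.score);
  byExperiment.set(key, scores);
}

for (const [experiment, scores] of byExperiment) {
  const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
  console.log(`${experiment}: mean ${mean.toFixed(3)} over ${scores.length} tests`);
}
```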