From 61c2c7afddf417a235e1c15721496fceb2e65211 Mon Sep 17 00:00:00 2001 From: Christopher Date: Sat, 28 Mar 2026 05:31:14 +0000 Subject: [PATCH] feat(pipeline): add --experiment flag, feature example, and docs Add --experiment option to pipeline input and pipeline run commands. The label is written to manifest.json and propagated through pipeline bench into index.jsonl entries and benchmark.json metadata. - pipeline input: accepts --experiment, writes to manifest - pipeline run: accepts --experiment, writes to manifest - pipeline bench: reads manifest.experiment, includes in index entries - New feature example: examples/features/experiments/ - Docs: add experiment section to running-evals.mdx - Docs: add experiments workflow to skill-improvement-workflow.mdx - Tests: 2 new tests for experiment flag presence/absence Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/pipeline/bench.ts | 3 + apps/cli/src/commands/pipeline/input.ts | 8 ++- apps/cli/src/commands/pipeline/run.ts | 8 ++- apps/cli/src/commands/results/manifest.ts | 3 + .../test/commands/eval/pipeline/bench.test.ts | 36 ++++++++++ .../test/commands/eval/pipeline/input.test.ts | 25 +++++++ .../content/docs/evaluation/running-evals.mdx | 15 ++++- .../guides/skill-improvement-workflow.mdx | 16 +++++ examples/features/experiments/README.md | 67 +++++++++++++++++++ .../evals/coding-ability.eval.yaml | 30 +++++++++ 10 files changed, 207 insertions(+), 4 deletions(-) create mode 100644 examples/features/experiments/README.md create mode 100644 examples/features/experiments/evals/coding-ability.eval.yaml diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index 2587913aa..547d926a0 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -46,6 +46,7 @@ export const evalBenchCommand = command({ const testIds: string[] = manifest.test_ids; const targetName: string = manifest.target?.name ?? 'unknown'; const evalSet: string = manifest.eval_set ?? ''; + const experiment: string | undefined = manifest.experiment; const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, '_') : ''; // Read LLM scores from file or stdin @@ -189,6 +190,7 @@ export const evalBenchCommand = command({ timestamp: manifest.timestamp, test_id: testId, eval_set: evalSet || undefined, + experiment: experiment || undefined, score: Math.round(weightedScore * 1000) / 1000, target: targetName, scores, @@ -213,6 +215,7 @@ export const evalBenchCommand = command({ metadata: { eval_file: manifest.eval_file, timestamp: manifest.timestamp, + experiment: experiment || undefined, targets: [targetName], tests_run: testIds, }, diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index d33737e6a..745947e76 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -45,8 +45,13 @@ export const evalInputCommand = command({ description: 'Output directory for extracted inputs (default: .agentv/results/runs/)', }), + experiment: option({ + type: optional(string), + long: 'experiment', + description: 'Experiment label (e.g. with_skills, without_skills)', + }), }, - handler: async ({ evalPath, out }) => { + handler: async ({ evalPath, out, experiment }) => { const resolvedEvalPath = resolve(evalPath); const outDir = resolve(out ?? 
buildDefaultRunDir(process.cwd())); const repoRoot = await findRepoRoot(dirname(resolvedEvalPath)); @@ -155,6 +160,7 @@ export const evalInputCommand = command({ await writeJson(join(outDir, 'manifest.json'), { eval_file: resolvedEvalPath, eval_set: evalSetName || undefined, + experiment: experiment || undefined, timestamp: new Date().toISOString(), target: { name: targetName, diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 482fe0ed8..e5cbabfe2 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -67,8 +67,13 @@ export const evalRunCommand = command({ long: 'workers', description: 'Parallel workers for target invocation (default: all tests)', }), + experiment: option({ + type: optional(string), + long: 'experiment', + description: 'Experiment label (e.g. with_skills, without_skills)', + }), }, - handler: async ({ evalPath, out, workers }) => { + handler: async ({ evalPath, out, workers, experiment }) => { const resolvedEvalPath = resolve(evalPath); const outDir = resolve(out ?? buildDefaultRunDir(process.cwd())); const repoRoot = await findRepoRoot(dirname(resolvedEvalPath)); @@ -174,6 +179,7 @@ export const evalRunCommand = command({ await writeJson(join(outDir, 'manifest.json'), { eval_file: resolvedEvalPath, eval_set: evalSetName || undefined, + experiment: experiment || undefined, timestamp: new Date().toISOString(), target: { name: targetName, kind: targetKind }, test_ids: testIds, diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 7e640e75f..2fcd22a47 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -12,6 +12,7 @@ export interface ResultManifestRecord { readonly test_id?: string; readonly eval_id?: string; readonly eval_set?: string; + readonly experiment?: string; readonly target?: string; readonly score: number; readonly scores?: readonly Record[]; @@ -191,6 +192,7 @@ export function loadManifestResults(sourceFile: string): EvaluationResult[] { export interface LightweightResultRecord { readonly testId: string; readonly target?: string; + readonly experiment?: string; readonly score: number; readonly scores?: readonly Record[]; readonly executionStatus?: string; @@ -206,6 +208,7 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec return parseResultManifest(content).map((record) => ({ testId: record.test_id ?? record.eval_id ?? 
'unknown', target: record.target, + experiment: record.experiment, score: record.score, scores: record.scores, executionStatus: record.execution_status, diff --git a/apps/cli/test/commands/eval/pipeline/bench.test.ts b/apps/cli/test/commands/eval/pipeline/bench.test.ts index 3c3005aa2..f6225bbcb 100644 --- a/apps/cli/test/commands/eval/pipeline/bench.test.ts +++ b/apps/cli/test/commands/eval/pipeline/bench.test.ts @@ -113,4 +113,40 @@ describe('pipeline bench', () => { expect(benchmark.metadata.targets).toContain('test-target'); expect(benchmark.run_summary['test-target']).toBeDefined(); }); + + it('propagates experiment from manifest to index.jsonl and benchmark.json', async () => { + // Overwrite manifest with experiment field + await writeFile( + join(OUT_DIR, 'manifest.json'), + JSON.stringify({ + eval_file: 'test.eval.yaml', + timestamp: new Date().toISOString(), + experiment: 'without_skills', + target: { name: 'test-target', kind: 'cli' }, + test_ids: ['test-01'], + }), + ); + + const { execa } = await import('execa'); + await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: '{}' }); + + const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); + const entry = JSON.parse(indexContent.trim().split('\n')[0]); + expect(entry.experiment).toBe('without_skills'); + + const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8')); + expect(benchmark.metadata.experiment).toBe('without_skills'); + }); + + it('omits experiment from output when manifest has no experiment', async () => { + const { execa } = await import('execa'); + await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR], { input: '{}' }); + + const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); + const entry = JSON.parse(indexContent.trim().split('\n')[0]); + expect(entry.experiment).toBeUndefined(); + + const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8')); + expect(benchmark.metadata.experiment).toBeUndefined(); + }); }); diff --git a/apps/cli/test/commands/eval/pipeline/input.test.ts b/apps/cli/test/commands/eval/pipeline/input.test.ts index 47e9a9ff9..12194f4dc 100644 --- a/apps/cli/test/commands/eval/pipeline/input.test.ts +++ b/apps/cli/test/commands/eval/pipeline/input.test.ts @@ -77,4 +77,29 @@ describe('pipeline input', () => { ); expect(invoke.kind).toBeDefined(); }); + + it('writes experiment to manifest when --experiment is provided', async () => { + const { execa } = await import('execa'); + await execa('bun', [ + CLI_ENTRY, + 'pipeline', + 'input', + EVAL_PATH, + '--out', + OUT_DIR, + '--experiment', + 'without_skills', + ]); + + const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8')); + expect(manifest.experiment).toBe('without_skills'); + }); + + it('omits experiment from manifest when --experiment is not provided', async () => { + const { execa } = await import('execa'); + await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]); + + const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8')); + expect(manifest.experiment).toBeUndefined(); + }); }); diff --git a/apps/web/src/content/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/evaluation/running-evals.mdx index 6c34ec549..08f93c66c 100644 --- a/apps/web/src/content/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/evaluation/running-evals.mdx @@ -11,7 +11,7 @@ sidebar: agentv eval evals/my-eval.yaml ``` -Results are written to 
`.agentv/results/eval_.jsonl`. Each line is a JSON object with one result per test case. +Results are written to `.agentv/results/.jsonl`. Each line is a JSON object with one result per test case. Each `scores[]` entry includes per-grader timing: @@ -47,6 +47,17 @@ Run against a different target than specified in the eval file: agentv eval --target azure-base evals/**/*.yaml ``` +### Experiment Label + +Tag a pipeline run with an experiment name to track different conditions (e.g. with vs without skills): + +```bash +agentv pipeline run evals/my-eval.yaml --experiment with_skills +agentv pipeline run evals/my-eval.yaml --experiment without_skills +``` + +The experiment label is written to `manifest.json` and propagated to each entry in `index.jsonl` by `pipeline bench`. The eval file stays the same across experiments — what changes is the environment. Dashboards can filter and compare results by experiment. + ### Run Specific Test Run a single test by ID: @@ -82,7 +93,7 @@ result-oriented workflows. For full-fidelity span inspection, export OTLP JSON e ```bash # Summary-level inspection from the run manifest -agentv trace stats .agentv/results/runs/eval_/index.jsonl +agentv trace stats .agentv/results/runs//index.jsonl # Full-fidelity OTLP JSON trace (importable by OTel backends like Jaeger, Grafana) agentv eval evals/my-eval.yaml --otel-file traces/eval.otlp.json diff --git a/apps/web/src/content/docs/guides/skill-improvement-workflow.mdx b/apps/web/src/content/docs/guides/skill-improvement-workflow.mdx index 39b009965..6843e5b81 100644 --- a/apps/web/src/content/docs/guides/skill-improvement-workflow.mdx +++ b/apps/web/src/content/docs/guides/skill-improvement-workflow.mdx @@ -275,6 +275,22 @@ If you've been using the Agent Skills skill-creator workflow, AgentV reads your **Key takeaway:** You do not need to rewrite your `evals.json`. AgentV reads it directly and adds a richer evaluation engine on top. +## Using Experiments for Baseline vs Candidate + +The `--experiment` flag provides a structured way to label baseline and candidate runs without separate eval files: + +```bash +# Baseline: run without skills installed +agentv pipeline run evals/my-eval.yaml --experiment without_skills + +# Candidate: run with skills installed +agentv pipeline run evals/my-eval.yaml --experiment with_skills +``` + +Both runs use the same eval file and produce separate run directories. The experiment label is recorded in `manifest.json` and `index.jsonl`, making it easy to filter and compare in dashboards. + +This replaces the need for separate `--target baseline` / `--target candidate` configurations when the only difference between runs is the workspace setup (skills, config, etc.) rather than the target harness. + ## Baseline Comparison Best Practices ### Discovery-path contamination diff --git a/examples/features/experiments/README.md b/examples/features/experiments/README.md new file mode 100644 index 000000000..cda2f0eda --- /dev/null +++ b/examples/features/experiments/README.md @@ -0,0 +1,67 @@ +# Experiments + +Demonstrates using the `--experiment` flag to compare evaluation runs under different conditions while keeping test cases identical. + +## What This Shows + +- Running the same eval file with different experiment labels +- Comparing results across experiments (e.g. with vs without skills) +- One run = one target x one experiment, recorded in `manifest.json` + +## Concept + +An **experiment** is a run-level label that records the conditions under which an eval was executed. 
The eval file stays the same — what changes is the environment (skills installed, web search enabled, different system prompt, etc.). + +| Experiment | What changes | Eval file | +|---|---|---| +| `with_skills` | Skills installed in workspace | Same `coding-ability.eval.yaml` | +| `without_skills` | No skills in workspace | Same file | +| `web_search` | Web search tool enabled | Same file | + +## Running + +```bash +# From repository root + +# Run with skills (set up workspace with skills first, then run) +agentv pipeline run examples/features/experiments/evals/coding-ability.eval.yaml \ + --experiment with_skills + +# Run without skills (same eval, clean workspace) +agentv pipeline run examples/features/experiments/evals/coding-ability.eval.yaml \ + --experiment without_skills +``` + +The experiment label is written to `manifest.json` and propagated to `index.jsonl` entries by `pipeline bench`. This enables dashboards to filter and compare results by experiment. + +## Output + +Each run produces a separate directory. The experiment is metadata, not a path segment: + +``` +.agentv/results/runs/ +├── 2026-03-28T10-00-00-000Z/ # with_skills run +│ ├── manifest.json # { "experiment": "with_skills", ... } +│ └── coding-ability/ +│ ├── review-null-check/ +│ └── review-clean-function/ +└── 2026-03-28T10-05-00-000Z/ # without_skills run + ├── manifest.json # { "experiment": "without_skills", ... } + └── coding-ability/ + ├── review-null-check/ + └── review-clean-function/ +``` + +## Comparing experiments + +After both runs complete and are graded: + +```bash +# Compare the two runs +agentv compare .agentv/results/runs//index.jsonl \ + .agentv/results/runs//index.jsonl +``` + +## Key Files + +- `evals/coding-ability.eval.yaml` - Shared test cases (same for all experiments) diff --git a/examples/features/experiments/evals/coding-ability.eval.yaml b/examples/features/experiments/evals/coding-ability.eval.yaml new file mode 100644 index 000000000..5441cf147 --- /dev/null +++ b/examples/features/experiments/evals/coding-ability.eval.yaml @@ -0,0 +1,30 @@ +name: coding-ability +tests: + - id: review-null-check + input: | + Review this TypeScript function for bugs: + + function getUser(users: Map, id: string) { + return users.get(id).name; + } + criteria: Identifies the potential undefined access when the key is missing from the map + assertions: + - name: mentions_undefined + type: contains + value: "undefined" + - name: suggests_fix + type: llm-grader + prompt: Does the review identify that users.get(id) can return undefined and suggest a fix? + + - id: review-clean-function + input: | + Review this TypeScript function for bugs: + + function add(a: number, b: number): number { + return a + b; + } + criteria: Recognizes the function is correct and does not flag false issues + assertions: + - name: no_false_positives + type: llm-grader + prompt: Does the review correctly identify this function as simple and correct without flagging false issues?
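
Since each `index.jsonl` entry now carries the optional `experiment` field, downstream tooling can aggregate scores across runs by label. A minimal sketch of such a consumer, assuming only the entry shape written by `pipeline bench` above (`experiment`, `score`, `test_id`, `target`); the path argument and the aggregation itself are illustrative, not an agentv command:

```typescript
// Illustrative sketch: group index.jsonl entries by experiment label and
// print a mean score per experiment. Assumes the entry shape emitted by
// `pipeline bench` in this patch; the CLI argument handling is hypothetical.
import { readFileSync } from 'node:fs';

interface IndexEntry {
  experiment?: string;
  score: number;
  test_id: string;
  target: string;
}

const path = process.argv[2] ?? 'index.jsonl';
const lines = readFileSync(path, 'utf8').trim().split('\n').filter(Boolean);

const byExperiment = new Map<string, number[]>();
for (const line of lines) {
  const entry = JSON.parse(line) as IndexEntry;
  const key = entry.experiment ?? '(none)';
  const scores = byExperiment.get(key) ?? [];
  scores.push(entry.score);
  byExperiment.set(key, scores);
}

for (const [experiment, scores] of byExperiment) {
  const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
  console.log(`${experiment}: mean ${mean.toFixed(3)} over ${scores.length} tests`);
}
```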