Merged
5 changes: 4 additions & 1 deletion apps/cli/src/commands/pipeline/input.ts
@@ -134,7 +134,10 @@ export const evalInputCommand = command({
// No targets file found — subagent-as-target mode
}

- const suiteName = suite.metadata?.name?.trim() ?? '';
+ // Use tests[0].suite — loaders (yaml-parser, jsonl-parser) already apply the
+ // metadata.name → filename-basename → 'eval' fallback. This keeps subagent-mode
+ // artifact layout aligned with CLI mode (artifact-writer.ts:buildArtifactSubdir).
+ const suiteName = tests[0]?.suite?.trim() ?? '';
const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';

const testIds: string[] = [];
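As context for the change above, the combined fallback-and-sanitize behavior can be sketched as follows. Function names here are hypothetical; the real fallback lives in the loaders (yaml-parser, jsonl-parser), and the sanitization mirrors the regex in the diff:

```typescript
import { basename } from "node:path";

// Hypothetical sketch of the loaders' fallback chain:
// metadata.name → eval file basename → 'eval'.
function resolveSuiteName(metadataName: string | undefined, evalPath: string): string {
  const fromMetadata = metadataName?.trim();
  if (fromMetadata) return fromMetadata;
  const base = basename(evalPath).replace(/\.eval\.(ya?ml|jsonl)$/, "");
  return base || "eval";
}

// Same character class as the diff: replace characters that are
// invalid in directory names on common filesystems.
function toSafeSuiteName(suiteName: string): string {
  return suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, "_") : "";
}

console.log(resolveSuiteName(undefined, "my-suite.eval.yaml")); // "my-suite"
```

Because `tests[0]?.suite` already carries the resolved name, the command only needs the sanitize step.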
5 changes: 4 additions & 1 deletion apps/cli/src/commands/pipeline/run.ts
@@ -158,7 +158,10 @@ export const evalRunCommand = command({
// No targets file — subagent-as-target mode
}

- const suiteName = suite.metadata?.name?.trim() ?? '';
+ // Use tests[0].suite — loaders (yaml-parser, jsonl-parser) already apply the
+ // metadata.name → filename-basename → 'eval' fallback. This keeps subagent-mode
+ // artifact layout aligned with CLI mode (artifact-writer.ts:buildArtifactSubdir).
+ const suiteName = tests[0]?.suite?.trim() ?? '';
const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';

const testIds: string[] = [];
@@ -0,0 +1,8 @@
+ tests:
+   - id: test-01
+     input: hello world
+     criteria: Response echoes the input
+     assertions:
+       - name: contains_hello
+         type: contains
+         value: hello
14 changes: 14 additions & 0 deletions apps/cli/test/commands/eval/pipeline/input.test.ts
@@ -128,4 +128,18 @@ describe('pipeline input', () => {
expect(regexGrader.type).toBe('regex');
expect(regexGrader.value).toBe('h[aeiou]llo');
});

+   it('falls back to eval file basename for suite directory when name is absent', async () => {
+     const { execa } = await import('execa');
+     const noNameEvalPath = join(FIXTURE_DIR, 'no-name.eval.yaml');
+     await execa('bun', [CLI_ENTRY, 'pipeline', 'input', noNameEvalPath, '--out', OUT_DIR]);
+
+     const input = JSON.parse(
+       await readFile(join(OUT_DIR, 'no-name', 'test-01', 'input.json'), 'utf8'),
+     );
+     expect(input.input[0].content).toBe('hello world');
+
+     const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8'));
+     expect(manifest.suite).toBe('no-name');
+   });
});
2 changes: 1 addition & 1 deletion plugins/agentv-dev/skills/agentv-bench/SKILL.md
@@ -193,7 +193,7 @@ This evaluates all deterministic assertions against `response.md` files. Two typ

Both types are configured by `pipeline input` into `code_graders/<name>.json` and graded by `pipeline grade`. Results are written to `<test-id>/code_grader_results/<name>.json`. Alternatively, pass `--grader-type code` to `pipeline run` to run these inline.

- **Do not dispatch LLM grader subagents for tests that only have `contains`, `regex`, or other built-in assertions** — `pipeline grade` handles them entirely, at zero cost.
+ **Do not dispatch LLM grader subagents for tests that only have `contains`, `regex`, or other built-in assertions** — `pipeline grade` handles them entirely, at zero cost. To detect which tests need Phase 2, check whether `<test-id>/llm_graders/` contains any `.json` config files — `pipeline input` only writes there for `llm-grader` assertions. Tests with an empty (or missing) `llm_graders/` directory are done after Phase 1.

**Phase 2: LLM grading** (semantic — do NOT skip this phase)

10 changes: 7 additions & 3 deletions plugins/agentv-dev/skills/agentv-bench/agents/grader.md
@@ -18,7 +18,7 @@ You are the grader for an AgentV evaluation test case. You have two jobs: **grad
- `eval-path`: Path to the eval YAML file
- `test-id`: The test case ID
- `response-file`: Path to the executor's response (e.g., `response.md`)
- - `bench-dir`: Path to the bench run directory (e.g., `.agentv/results/export/<timestamp>/`)
+ - `bench-dir`: Path to the test's parent directory — the run directory qualified by evalset name. Example: `.agentv/results/runs/<experiment>/<timestamp>/<evalset-name>/`. The evalset name comes from the eval.yaml `name` field; when absent, it falls back to the eval file's basename (e.g. `my-suite.eval.yaml` → `my-suite`), matching CLI mode. The grader writes results under `{bench-dir}/{test-id}/...`.
- `timing-file`: Path to `timing.json` (for execution-metrics/latency/cost assertions)

## Process
@@ -196,10 +196,14 @@

### Field Descriptions

`pipeline bench` consumes only `score` and `assertions[]` from this file when merging into the canonical `grading.json`. The remaining fields are preserved on disk for human review and downstream tooling, but do not flow into the merged output.

**Consumed by `pipeline bench`:**
- **score**: Weighted overall score for this grader (0.0-1.0)
- **assertions**: Array of per-assertion results — `text` (assertion description), `passed` (boolean), `evidence` (cited quote or description)

**Kept for traceability (not merged):**
- **summary**: Aggregate stats — `passed`, `failed`, `total`, `pass_rate` (0.0-1.0)
- **execution_metrics**: From executor metrics/timing — tool call counts, output size. Omit if not available.
- **timing**: From `timing-file` — executor and total duration in seconds. Omit if not available.
- **claims**: Extracted and verified claims — `claim` (statement), `type` (factual/process/quality), `verified` (boolean), `evidence`
- **user_notes_summary**: Issues from executor notes — `uncertainties[]`, `needs_review[]`, `workarounds[]`. Empty arrays if no notes found.
- **eval_feedback**: Suggestions for improving the evals — `suggestions[]` (array of `{assertion?, reason}`), `overall` (brief assessment)
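Tying the field descriptions together, a minimal grader result file consistent with them might look like the following. The filename, values, and evidence strings are illustrative only — they are not taken from the repository — and the optional `execution_metrics`/`timing` fields are omitted as the descriptions allow:

```json
{
  "score": 0.75,
  "assertions": [
    { "text": "Response echoes the input", "passed": true, "evidence": "\"hello world\"" }
  ],
  "summary": { "passed": 3, "failed": 1, "total": 4, "pass_rate": 0.75 },
  "claims": [
    { "claim": "The response repeats the input verbatim", "type": "factual", "verified": true, "evidence": "\"hello world\"" }
  ],
  "user_notes_summary": { "uncertainties": [], "needs_review": [], "workarounds": [] },
  "eval_feedback": { "suggestions": [], "overall": "criteria are clear and testable" }
}
```

Only `score` and `assertions` from such a file flow into the merged `grading.json`; everything else stays on disk for review.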
@@ -147,11 +147,11 @@ The path hierarchy mirrors the CLI mode: `<evalset-name>` comes from the `name`
the eval.yaml. The target is recorded in `manifest.json` — one run = one target.

```
- .agentv/results/runs/<timestamp>/
+ .agentv/results/runs/<experiment>/<timestamp>/
├── manifest.json ← eval metadata, target, test_ids
├── index.jsonl ← per-test scores
├── benchmark.json ← aggregate statistics
- └── <evalset-name>/                ← from eval.yaml "name" field (omitted if absent)
+ └── <evalset-name>/                ← eval.yaml "name" field, or eval file basename if absent (same as CLI mode)
└── <test-id>/ ← test case id
├── input.json ← test input text + messages
├── invoke.json ← target command or agent instructions