diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index 5f24a87f..2bdbb217 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -134,7 +134,10 @@ export const evalInputCommand = command({ // No targets file found — subagent-as-target mode } - const suiteName = suite.metadata?.name?.trim() ?? ''; + // Use tests[0].suite — loaders (yaml-parser, jsonl-parser) already apply the + // metadata.name → filename-basename → 'eval' fallback. This keeps subagent-mode + // artifact layout aligned with CLI mode (artifact-writer.ts:buildArtifactSubdir). + const suiteName = tests[0]?.suite?.trim() ?? ''; const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : ''; const testIds: string[] = []; diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 5e2758fb..161c69cd 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -158,7 +158,10 @@ export const evalRunCommand = command({ // No targets file — subagent-as-target mode } - const suiteName = suite.metadata?.name?.trim() ?? ''; + // Use tests[0].suite — loaders (yaml-parser, jsonl-parser) already apply the + // metadata.name → filename-basename → 'eval' fallback. This keeps subagent-mode + // artifact layout aligned with CLI mode (artifact-writer.ts:buildArtifactSubdir). + const suiteName = tests[0]?.suite?.trim() ?? ''; const safeSuiteName = suiteName ? 
suiteName.replace(/[\/\\:*?"<>|]/g, '_') : ''; const testIds: string[] = []; diff --git a/apps/cli/test/commands/eval/pipeline/fixtures/no-name.eval.yaml b/apps/cli/test/commands/eval/pipeline/fixtures/no-name.eval.yaml new file mode 100644 index 00000000..a793e9c2 --- /dev/null +++ b/apps/cli/test/commands/eval/pipeline/fixtures/no-name.eval.yaml @@ -0,0 +1,8 @@ +tests: + - id: test-01 + input: hello world + criteria: Response echoes the input + assertions: + - name: contains_hello + type: contains + value: hello diff --git a/apps/cli/test/commands/eval/pipeline/input.test.ts b/apps/cli/test/commands/eval/pipeline/input.test.ts index c7546e45..cbb4e54a 100644 --- a/apps/cli/test/commands/eval/pipeline/input.test.ts +++ b/apps/cli/test/commands/eval/pipeline/input.test.ts @@ -128,4 +128,18 @@ describe('pipeline input', () => { expect(regexGrader.type).toBe('regex'); expect(regexGrader.value).toBe('h[aeiou]llo'); }); + + it('falls back to eval file basename for suite directory when name is absent', async () => { + const { execa } = await import('execa'); + const noNameEvalPath = join(FIXTURE_DIR, 'no-name.eval.yaml'); + await execa('bun', [CLI_ENTRY, 'pipeline', 'input', noNameEvalPath, '--out', OUT_DIR]); + + const input = JSON.parse( + await readFile(join(OUT_DIR, 'no-name', 'test-01', 'input.json'), 'utf8'), + ); + expect(input.input[0].content).toBe('hello world'); + + const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8')); + expect(manifest.suite).toBe('no-name'); + }); }); diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md index 89d108e7..67df5b62 100644 --- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md @@ -193,7 +193,7 @@ This evaluates all deterministic assertions against `response.md` files. Two typ Both types are configured by `pipeline input` into `code_graders/.json` and graded by `pipeline grade`. 
Results are written to `/code_grader_results/.json`. Alternatively, pass `--grader-type code` to `pipeline run` to run these inline. -**Do not dispatch LLM grader subagents for tests that only have `contains`, `regex`, or other built-in assertions** — `pipeline grade` handles them entirely, at zero cost. +**Do not dispatch LLM grader subagents for tests that only have `contains`, `regex`, or other built-in assertions** — `pipeline grade` handles them entirely, at zero cost. To detect which tests need Phase 2, check whether `/llm_graders/` contains any `.json` config files — `pipeline input` only writes there for `llm-grader` assertions. Tests with an empty (or missing) `llm_graders/` directory are done after Phase 1. **Phase 2: LLM grading** (semantic — do NOT skip this phase) diff --git a/plugins/agentv-dev/skills/agentv-bench/agents/grader.md b/plugins/agentv-dev/skills/agentv-bench/agents/grader.md index 1f176b57..c34cd879 100644 --- a/plugins/agentv-dev/skills/agentv-bench/agents/grader.md +++ b/plugins/agentv-dev/skills/agentv-bench/agents/grader.md @@ -18,7 +18,7 @@ You are the grader for an AgentV evaluation test case. You have two jobs: **grad - `eval-path`: Path to the eval YAML file - `test-id`: The test case ID - `response-file`: Path to the executor's response (e.g., `response.md`) -- `bench-dir`: Path to the bench run directory (e.g., `.agentv/results/export//`) +- `bench-dir`: Path to the test's parent directory — the run directory qualified by evalset name. Example: `.agentv/results/runs////`. The evalset name comes from the eval.yaml `name` field; when absent, it falls back to the eval file's basename (e.g. `my-suite.eval.yaml` → `my-suite`), matching CLI mode. The grader writes results under `{bench-dir}/{test-id}/...`. 
- `timing-file`: Path to `timing.json` (for execution-metrics/latency/cost assertions) ## Process @@ -196,10 +196,14 @@ Do **NOT** write directly to `grading.json` — that file is produced by `agentv ### Field Descriptions +`pipeline bench` consumes only `score` and `assertions[]` from this file when merging into the canonical `grading.json`. The remaining fields are preserved on disk for human review and downstream tooling, but do not flow into the merged output. + +**Consumed by `pipeline bench`:** +- **score**: Weighted overall score for this grader (0.0-1.0) - **assertions**: Array of per-assertion results — `text` (assertion description), `passed` (boolean), `evidence` (cited quote or description) + +**Kept for traceability (not merged):** - **summary**: Aggregate stats — `passed`, `failed`, `total`, `pass_rate` (0.0-1.0) -- **execution_metrics**: From executor metrics/timing — tool call counts, output size. Omit if not available. -- **timing**: From `timing-file` — executor and total duration in seconds. Omit if not available. - **claims**: Extracted and verified claims — `claim` (statement), `type` (factual/process/quality), `verified` (boolean), `evidence` - **user_notes_summary**: Issues from executor notes — `uncertainties[]`, `needs_review[]`, `workarounds[]`. Empty arrays if no notes found. - **eval_feedback**: Suggestions for improving the evals — `suggestions[]` (array of `{assertion?, reason}`), `overall` (brief assessment) diff --git a/plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md b/plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md index 23994de9..fabff41b 100644 --- a/plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md +++ b/plugins/agentv-dev/skills/agentv-bench/references/subagent-pipeline.md @@ -147,11 +147,11 @@ The path hierarchy mirrors the CLI mode: `` comes from the `name` the eval.yaml. The target is recorded in `manifest.json` — one run = one target. 
```
-.agentv/results/runs/<run-id>/
+.agentv/results/runs/<run-id>/<evalset>/
 ├── manifest.json        ← eval metadata, target, test_ids
 ├── index.jsonl          ← per-test scores
 ├── benchmark.json       ← aggregate statistics
-└── <evalset>/           ← from eval.yaml "name" field (omitted if absent)
+└── <evalset>/           ← eval.yaml "name" field, or eval file basename if absent (same as CLI mode)
     └── <test-id>/       ← test case id
         ├── input.json   ← test input text + messages
         ├── invoke.json  ← target command or agent instructions