From 6abac5454f9cf89cb900c69e4d84efc06e93edd0 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 9 Apr 2026 10:08:26 +0000 Subject: [PATCH 1/2] refactor(artifact-writer): rename evaluators to graders in grading.json Renames the `evaluators` field to `graders` in the GradingArtifact interface and all grading.json output, aligning the artifact schema with the existing `per_grader_summary` in benchmark.json and the llm-grader/code-grader type naming convention throughout the codebase. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- apps/cli/src/commands/eval/artifact-writer.ts | 8 ++++---- apps/cli/src/commands/pipeline/bench.ts | 2 +- apps/cli/src/commands/results/export.ts | 2 +- apps/cli/src/commands/results/manifest.ts | 2 +- apps/cli/test/commands/eval/artifact-writer.test.ts | 10 +++++----- apps/cli/test/commands/eval/pipeline/bench.test.ts | 2 +- .../test/commands/eval/pipeline/pipeline-e2e.test.ts | 2 +- .../test/commands/results/export-e2e-providers.test.ts | 4 ++-- apps/cli/test/commands/results/export.test.ts | 8 ++++---- .../skills/agentv-bench/references/eval-yaml-spec.md | 2 +- 10 files changed, 21 insertions(+), 21 deletions(-) diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 7bd1e359c..31a3c794f 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -26,7 +26,7 @@ export interface GradingArtifact { readonly total_tool_calls: number; readonly errors_encountered: number; }; - readonly evaluators?: readonly { + readonly graders?: readonly { readonly name: string; readonly type: string; readonly score: number; @@ -219,12 +219,12 @@ function buildAssertions(result: EvaluationResult): GradingArtifact['assertions' } // --------------------------------------------------------------------------- -// Build evaluators list +// Build graders list // --------------------------------------------------------------------------- function buildEvaluators( scores: readonly EvaluatorResult[] | undefined, -): GradingArtifact['evaluators'] { +): GradingArtifact['graders'] { if (!scores || scores.length === 0) { return undefined; } @@ -267,7 +267,7 @@ export function buildGradingArtifact(result: EvaluationResult): GradingArtifact total_tool_calls: totalToolCalls, errors_encountered: errorsEncountered, }, - evaluators: buildEvaluators(result.scores), + graders: buildEvaluators(result.scores), workspace_changes: parseWorkspaceChanges(result.fileChanges), conversation: result.conversationId ? { diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index 6686c4d61..691e506d8 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -133,7 +133,7 @@ export const evalBenchCommand = command({ assertions: allAssertions, summary: { passed, failed, total: allAssertions.length, pass_rate: passRate }, execution_metrics: { tool_calls: {}, total_tool_calls: 0, errors_encountered: 0 }, - evaluators: evaluators.map((e) => ({ + graders: evaluators.map((e) => ({ name: e.name, type: e.type, score: e.score, diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts index bfe82a89d..9157dd7ed 100644 --- a/apps/cli/src/commands/results/export.ts +++ b/apps/cli/src/commands/results/export.ts @@ -7,7 +7,7 @@ * benchmark.json — aggregate scores, pass/fail counts, timing * index.jsonl — per-test manifest with artifact pointers * / - * grading.json — per-test grading artifact (assertions, evaluators) + * grading.json — per-test grading artifact (assertions, graders) * timing.json — per-test timing artifact * outputs/ * response.md — human-readable agent response for this test diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 98e8a5527..af47e14cf 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -135,7 +135,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E evidence: assertion.evidence, })), scores: - grading?.evaluators?.map((evaluator) => ({ + grading?.graders?.map((evaluator) => ({ name: evaluator.name, type: evaluator.type, score: evaluator.score, diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 9826d2be1..a601415c3 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -127,10 +127,10 @@ describe('buildGradingArtifact', () => { const grading = buildGradingArtifact(result); - expect(grading.evaluators).toHaveLength(2); - expect(grading.evaluators?.[0].name).toBe('format-check'); - expect(grading.evaluators?.[0].type).toBe('code-grader'); - expect(grading.evaluators?.[1].score).toBe(0.7); + expect(grading.graders).toHaveLength(2); + expect(grading.graders?.[0].name).toBe('format-check'); + expect(grading.graders?.[0].type).toBe('code-grader'); + expect(grading.graders?.[1].score).toBe(0.7); }); it('records error as errors_encountered', () => { @@ -150,7 +150,7 @@ describe('buildGradingArtifact', () => { total: 0, pass_rate: 0, }); - expect(grading.evaluators).toBeUndefined(); + expect(grading.graders).toBeUndefined(); }); it('includes workspace_changes when fileChanges present', () => { diff --git a/apps/cli/test/commands/eval/pipeline/bench.test.ts b/apps/cli/test/commands/eval/pipeline/bench.test.ts index 8d6b6be8a..770b04ae0 100644 --- a/apps/cli/test/commands/eval/pipeline/bench.test.ts +++ b/apps/cli/test/commands/eval/pipeline/bench.test.ts @@ -75,7 +75,7 @@ describe('pipeline bench', () => { const grading = JSON.parse(await readFile(join(OUT_DIR, 'test-01', 'grading.json'), 'utf8')); expect(grading.summary.pass_rate).toBeGreaterThan(0); expect(grading.assertions.length).toBeGreaterThan(0); - expect(grading.evaluators).toHaveLength(2); + expect(grading.graders).toHaveLength(2); }); it('writes index.jsonl with one entry per test', async () => { diff --git a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts index c0e7422b6..23f94f023 100644 --- a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts +++ b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts @@ -49,7 +49,7 @@ describe('eval pipeline e2e', () => { const grading = JSON.parse( await readFile(join(OUT_DIR, 'input-test', 'test-01', 'grading.json'), 'utf8'), ); - expect(grading.evaluators).toHaveLength(2); + expect(grading.graders).toHaveLength(2); expect(grading.summary.pass_rate).toBeGreaterThan(0); const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts index 320fd524f..fafcec1a7 100644 --- a/apps/cli/test/commands/results/export-e2e-providers.test.ts +++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts @@ -472,8 +472,8 @@ describe('export e2e — multi-provider metrics verification', () => { expect(grading.execution_metrics.tool_calls.Write).toBe(1); // Evaluators - expect(grading.evaluators).toHaveLength(1); - expect(grading.evaluators?.[0].name).toBe('accuracy'); + expect(grading.graders).toHaveLength(1); + expect(grading.graders?.[0].name).toBe('accuracy'); }); it('should produce correct grading for Copilot CLI result with mixed assertions', async () => { diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index e37d5fbd4..2ef02e04c 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -261,10 +261,10 @@ describe('results export', () => { expect(grading.execution_metrics).toBeDefined(); // Has evaluators - expect(grading.evaluators).toBeDefined(); - expect(grading.evaluators).toHaveLength(1); - expect(grading.evaluators?.[0].name).toBe('greeting_quality'); - expect(grading.evaluators?.[0].type).toBe('llm-grader'); + expect(grading.graders).toBeDefined(); + expect(grading.graders).toHaveLength(1); + expect(grading.graders?.[0].name).toBe('greeting_quality'); + expect(grading.graders?.[0].type).toBe('llm-grader'); const perTestTimingPath = path.join(artifactDir(outputDir, RESULT_FULL), 'timing.json'); expect(existsSync(perTestTimingPath)).toBe(true); diff --git a/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md b/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md index e7c908efc..a8a3c632b 100644 --- a/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md +++ b/plugins/agentv-dev/skills/agentv-bench/references/eval-yaml-spec.md @@ -323,7 +323,7 @@ LLM grader results are read from disk at `/llm_grader_results/.js ``` **Output:** -- `/grading.json` — merged grading with `evaluators`, `assertions`, `summary.pass_rate` +- `/grading.json` — merged grading with `graders`, `assertions`, `summary.pass_rate` - `index.jsonl` — one JSON line per test: `{test_id, score, pass, evaluators: [...]}` - `benchmark.json` — aggregate stats: `{metadata: {targets}, run_summary: {: {mean, stddev, n}}}` From 9a9c585e05e9b17604c5fc89616fa0ae917f4525 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 9 Apr 2026 10:16:02 +0000 Subject: [PATCH 2/2] fix(manifest): read legacy evaluators field from old grading.json artifacts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a backwards-compat shim so grading.json files written before the v4.13 rename (evaluators → graders) still hydrate correctly when read by agentv inspect, agentv trend, agentv compare, agentv results, etc. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- apps/cli/src/commands/results/manifest.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index af47e14cf..fbc82b640 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -135,7 +135,13 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E evidence: assertion.evidence, })), scores: - grading?.graders?.map((evaluator) => ({ + // `evaluators` was renamed to `graders` in v4.13 — read both for backwards compat with old artifacts. + // TODO: remove `evaluators` fallback once old run directories are no longer in use. + ( + grading?.graders ?? + (grading as (GradingArtifact & { evaluators?: GradingArtifact['graders'] }) | undefined) + ?.evaluators + )?.map((evaluator) => ({ name: evaluator.name, type: evaluator.type, score: evaluator.score,