From 99e804ac4b57a41879d35236710105df9ec7e8b4 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 4 May 2026 05:30:33 +0200 Subject: [PATCH 1/3] fix(core): derive score from assertions when score is absent in code-grader When a code-grader script returns `{ assertions }` without an explicit `score`, the harness now computes score as passing/total instead of defaulting to 0. Also removes redundant manual score computations from six example scripts that already had assertions covering the same logic. Closes #1211 Co-Authored-By: Claude Sonnet 4.6 --- .../scripts/verify-attachments.ts | 8 +------ .../graders/transcript-quality.ts | 5 +---- .../scripts/check-metrics-present.ts | 13 ++--------- .../scripts/check-file-changes.ts | 5 ++--- .../graders/transcript-quality.ts | 6 +---- .../scripts/check-csv-artifact.ts | 7 ++---- .../src/evaluation/graders/code-grader.ts | 19 +++++++++++----- .../graders/code-grader-plain-text.test.ts | 22 +++++++++++++++++++ 8 files changed, 44 insertions(+), 41 deletions(-) diff --git a/examples/features/code-grader-sdk/scripts/verify-attachments.ts b/examples/features/code-grader-sdk/scripts/verify-attachments.ts index 2fec360b1..430b7a66a 100755 --- a/examples/features/code-grader-sdk/scripts/verify-attachments.ts +++ b/examples/features/code-grader-sdk/scripts/verify-attachments.ts @@ -58,11 +58,5 @@ export default defineCodeGrader(({ expectedOutput, output, inputFiles }) => { } } - const passed = assertions.filter((a) => a.passed).length; - const score = assertions.length === 0 ? 
0 : passed / assertions.length; - - return { - score, - assertions, - }; + return { assertions }; }); diff --git a/examples/features/copilot-log-eval/graders/transcript-quality.ts b/examples/features/copilot-log-eval/graders/transcript-quality.ts index 87c9f329a..b9ea5d000 100644 --- a/examples/features/copilot-log-eval/graders/transcript-quality.ts +++ b/examples/features/copilot-log-eval/graders/transcript-quality.ts @@ -91,8 +91,5 @@ export default defineCodeGrader(({ output }) => { }); } - const passed = assertions.filter((a) => a.passed).length; - const score = assertions.length > 0 ? passed / assertions.length : 0; - - return { score, assertions }; + return { assertions }; }); diff --git a/examples/features/execution-metrics/scripts/check-metrics-present.ts b/examples/features/execution-metrics/scripts/check-metrics-present.ts index 1aacdea13..c85926439 100644 --- a/examples/features/execution-metrics/scripts/check-metrics-present.ts +++ b/examples/features/execution-metrics/scripts/check-metrics-present.ts @@ -17,10 +17,7 @@ export default defineCodeGrader(({ trace, tokenUsage, costUsd, durationMs }) => const assertions: Array<{ text: string; passed: boolean }> = []; if (!trace) { - return { - score: 0, - assertions: [{ text: 'No trace provided', passed: false }], - }; + return { assertions: [{ text: 'No trace provided', passed: false }] }; } // Check for tokenUsage @@ -47,11 +44,5 @@ export default defineCodeGrader(({ trace, tokenUsage, costUsd, durationMs }) => assertions.push({ text: 'durationMs not present', passed: false }); } - const passed = assertions.filter((a) => a.passed).length; - const score = passed / assertions.length; - - return { - score, - assertions, - }; + return { assertions }; }); diff --git a/examples/features/file-changes-with-repos/scripts/check-file-changes.ts b/examples/features/file-changes-with-repos/scripts/check-file-changes.ts index b198b7a35..a150c3fdf 100644 --- 
a/examples/features/file-changes-with-repos/scripts/check-file-changes.ts +++ b/examples/features/file-changes-with-repos/scripts/check-file-changes.ts @@ -22,7 +22,7 @@ if (!fileChanges || fileChanges.trim().length === 0) { passed: false, evidence: 'file_changes is empty — workspace not configured or file tracking failed', }); - console.log(JSON.stringify({ score: 0, assertions })); + console.log(JSON.stringify({ assertions })); process.exit(0); } @@ -56,5 +56,4 @@ assertions.push({ evidence: hasAddFn ? undefined : 'add() function not found in diff', }); -const passed = assertions.filter((a) => a.passed).length; -console.log(JSON.stringify({ score: passed / assertions.length, assertions })); +console.log(JSON.stringify({ assertions })); diff --git a/examples/features/import-claude/graders/transcript-quality.ts b/examples/features/import-claude/graders/transcript-quality.ts index 91e61ad04..9dc7b0ef5 100644 --- a/examples/features/import-claude/graders/transcript-quality.ts +++ b/examples/features/import-claude/graders/transcript-quality.ts @@ -28,9 +28,5 @@ export default defineCodeGrader(({ output }) => { passed: emptyAssistant.length === 0, }); - const passed = assertions.filter((a) => a.passed).length; - return { - score: assertions.length > 0 ? 
passed / assertions.length : 0, - assertions, - }; + return { assertions }; }); diff --git a/examples/features/workspace-artifact/scripts/check-csv-artifact.ts b/examples/features/workspace-artifact/scripts/check-csv-artifact.ts index 43809cdea..7836489ee 100644 --- a/examples/features/workspace-artifact/scripts/check-csv-artifact.ts +++ b/examples/features/workspace-artifact/scripts/check-csv-artifact.ts @@ -25,7 +25,7 @@ if (!fileChanges || fileChanges.trim().length === 0) { passed: false, evidence: 'file_changes is empty — workspace snapshot or git baseline may not be configured', }); - console.log(JSON.stringify({ score: 0, assertions })); + console.log(JSON.stringify({ assertions })); process.exit(0); } @@ -65,7 +65,4 @@ assertions.push({ evidence: hasDataRow ? undefined : 'No data rows found after the header', }); -const passed = assertions.filter((a) => a.passed).length; -const score = passed / assertions.length; - -console.log(JSON.stringify({ score, assertions })); +console.log(JSON.stringify({ assertions })); diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts index 62e9a7f37..b672ab32d 100644 --- a/packages/core/src/evaluation/graders/code-grader.ts +++ b/packages/core/src/evaluation/graders/code-grader.ts @@ -262,12 +262,6 @@ export class CodeGrader implements Grader { // Plain-text fallback: exit code is pass/fail, stdout is the assertion text. // For numeric scores or multi-aspect results, use the JSON protocol instead. const passed = exitCode === 0; - const score = - parsed != null - ? clampScore(typeof parsed.score === 'number' ? parsed.score : 0) - : passed - ? 1 - : 0; const assertions: AssertionEntry[] = parsed != null && Array.isArray(parsed?.assertions) ? parsed.assertions @@ -285,6 +279,19 @@ export class CodeGrader implements Grader { : parsed == null ? [{ text: stdout.trim() || (passed ? 
'exit 0' : `exit ${exitCode}`), passed }] : []; + // When the script omits `score` but returns `assertions`, derive score as passing/total. + const score = + parsed != null + ? clampScore( + typeof parsed.score === 'number' + ? parsed.score + : assertions.length > 0 + ? assertions.filter((a) => a.passed).length / assertions.length + : 0, + ) + : passed + ? 1 + : 0; // Capture optional structured details from code judge output const details = parsed?.details && typeof parsed.details === 'object' && !Array.isArray(parsed.details) diff --git a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts index 27d863b4c..3bdaaec33 100644 --- a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts +++ b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts @@ -67,6 +67,28 @@ describe('code-grader plain-text fallback', () => { expect(result.assertions[0].text).toBe('ok'); }); + it('assertions without score → derived as passing/total', async () => { + const result = await grader( + `echo '{"assertions":[{"text":"a","passed":true},{"text":"b","passed":false},{"text":"c","passed":true}]}'`, + ).evaluate(ctx); + expect(result.score).toBeCloseTo(2 / 3); + expect(result.assertions).toHaveLength(3); + }); + + it('assertions all passing without score → score 1', async () => { + const result = await grader( + `echo '{"assertions":[{"text":"a","passed":true},{"text":"b","passed":true}]}'`, + ).evaluate(ctx); + expect(result.score).toBe(1); + }); + + it('assertions all failing without score → score 0', async () => { + const result = await grader( + `echo '{"assertions":[{"text":"a","passed":false}]}'`, + ).evaluate(ctx); + expect(result.score).toBe(0); + }); + it('script with stderr on non-zero exit → surfaces as error assertion', async () => { const result = await grader('echo "bad" >&2; exit 1').evaluate(ctx); expect(result.score).toBe(0); From 
bc8af5d7dca6c8325c674dd80297db61bd58450a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 4 May 2026 05:34:39 +0200 Subject: [PATCH 2/3] style: fix biome formatting in code-grader-plain-text test Co-Authored-By: Claude Sonnet 4.6 --- .../test/evaluation/graders/code-grader-plain-text.test.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts index 3bdaaec33..a793b8667 100644 --- a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts +++ b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts @@ -83,9 +83,9 @@ describe('code-grader plain-text fallback', () => { }); it('assertions all failing without score → score 0', async () => { - const result = await grader( - `echo '{"assertions":[{"text":"a","passed":false}]}'`, - ).evaluate(ctx); + const result = await grader(`echo '{"assertions":[{"text":"a","passed":false}]}'`).evaluate( + ctx, + ); expect(result.score).toBe(0); }); From 87c61fa22fb29923e646df67c59281ed3c08dc99 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Mon, 4 May 2026 05:46:00 +0200 Subject: [PATCH 3/3] fix(examples): simplify 4 more example scripts + add empty-assertions test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses code review feedback: - Drop redundant passing/total score computation from functional-check.ts, validate-sync.ts, keyword-check.ts, and length-check.ts — same pattern as the 6 scripts updated in the first commit of this series - Add test for `{"assertions":[]}` without score → score 0 (empty guard) Co-Authored-By: Claude Sonnet 4.6 --- .../.agentv/graders/keyword-check.ts | 7 +------ .../eval-assert-demo/.agentv/graders/length-check.ts | 7 +------ .../functional-grading/scripts/functional-check.ts | 12 +----------- .../cross-repo-sync/scripts/validate-sync.ts | 10 +--------- 
.../graders/code-grader-plain-text.test.ts | 6 ++++++ 5 files changed, 10 insertions(+), 32 deletions(-) diff --git a/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts b/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts index 5004381de..58f67833e 100644 --- a/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts +++ b/examples/features/eval-assert-demo/.agentv/graders/keyword-check.ts @@ -37,10 +37,5 @@ export default defineCodeGrader(({ output }) => { assertions.push({ text: 'Answer does not mention France', passed: false }); } - const passed = assertions.filter((a) => a.passed).length; - const total = assertions.length; - return { - score: total > 0 ? passed / total : 0, - assertions, - }; + return { assertions }; }); diff --git a/examples/features/eval-assert-demo/.agentv/graders/length-check.ts b/examples/features/eval-assert-demo/.agentv/graders/length-check.ts index da054ff5d..6f939ba44 100644 --- a/examples/features/eval-assert-demo/.agentv/graders/length-check.ts +++ b/examples/features/eval-assert-demo/.agentv/graders/length-check.ts @@ -37,10 +37,5 @@ export default defineCodeGrader(({ output }) => { assertions.push({ text: `Answer has ${wordCount} words (> 50, too verbose)`, passed: false }); } - const passed = assertions.filter((a) => a.passed).length; - const total = assertions.length; - return { - score: total > 0 ? 
passed / total : 0, - assertions, - }; + return { assertions }; }); diff --git a/examples/features/functional-grading/scripts/functional-check.ts b/examples/features/functional-grading/scripts/functional-check.ts index 8952f9e9c..b4c933b1e 100644 --- a/examples/features/functional-grading/scripts/functional-check.ts +++ b/examples/features/functional-grading/scripts/functional-check.ts @@ -19,7 +19,6 @@ const workspacePath: string | null = input.workspace_path; if (!workspacePath) { console.log( JSON.stringify({ - score: 0, assertions: [ { text: 'workspace_path not provided — cannot run functional checks', @@ -63,13 +62,4 @@ if (compiled) { runStage('tests', 'npm', ['test']); } -const passed = assertions.filter((a) => a.passed).length; -const total = assertions.length; -const score = total > 0 ? passed / total : 0; - -console.log( - JSON.stringify({ - score, - assertions, - }), -); +console.log(JSON.stringify({ assertions })); diff --git a/examples/showcase/cross-repo-sync/scripts/validate-sync.ts b/examples/showcase/cross-repo-sync/scripts/validate-sync.ts index 59bb8bcbc..4512bfa19 100644 --- a/examples/showcase/cross-repo-sync/scripts/validate-sync.ts +++ b/examples/showcase/cross-repo-sync/scripts/validate-sync.ts @@ -26,10 +26,7 @@ defineCodeGrader(({ fileChanges, config }) => { if (!fileChanges) { assertions.push({ text: 'No file changes captured', passed: false }); - return { - score: 0, - assertions, - }; + return { assertions }; } // Parse diff blocks @@ -57,12 +54,7 @@ defineCodeGrader(({ fileChanges, config }) => { } } - const passed = assertions.filter((a) => a.passed).length; - const total = assertions.length; - const score = total > 0 ? 
passed / total : 0; - return { - score, assertions, details: { files_checked: expectedFiles.length, diff --git a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts index a793b8667..86f9b8709 100644 --- a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts +++ b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts @@ -89,6 +89,12 @@ describe('code-grader plain-text fallback', () => { expect(result.score).toBe(0); }); + it('empty assertions array without score → score 0', async () => { + const result = await grader(`echo '{"assertions":[]}'`).evaluate(ctx); + expect(result.score).toBe(0); + expect(result.assertions).toHaveLength(0); + }); + it('script with stderr on non-zero exit → surfaces as error assertion', async () => { const result = await grader('echo "bad" >&2; exit 1').evaluate(ctx); expect(result.score).toBe(0);