From 729b85c549163232203b18735d565346aa0d16da Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 13 Apr 2026 20:58:54 +0000 Subject: [PATCH] fix(pipeline): grade built-in deterministic assertions in subagent mode pipeline grade now evaluates contains, regex, equals, starts-with, ends-with, is-json, and other built-in assertion types against response.md. Previously these were silently ignored, producing score: 0 for tests with only deterministic assertions. Closes #1075 Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/pipeline/bench.ts | 2 +- apps/cli/src/commands/pipeline/grade.ts | 187 ++++++++++++++++-- apps/cli/src/commands/pipeline/input.ts | 34 +++- .../pipeline/fixtures/builtin-test.eval.yaml | 14 ++ .../test/commands/eval/pipeline/grade.test.ts | 132 +++++++++++++ .../test/commands/eval/pipeline/input.test.ts | 26 +++ packages/core/src/index.ts | 14 ++ 7 files changed, 387 insertions(+), 22 deletions(-) create mode 100644 apps/cli/test/commands/eval/pipeline/fixtures/builtin-test.eval.yaml diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts index 691e506d8..1a57a2db1 100644 --- a/apps/cli/src/commands/pipeline/bench.ts +++ b/apps/cli/src/commands/pipeline/bench.ts @@ -62,7 +62,7 @@ export const evalBenchCommand = command({ const result = JSON.parse(await readFile(join(codeResultsDir, file), 'utf8')); evaluators.push({ name: result.name, - type: 'code-grader', + type: result.type ?? 'code-grader', score: result.score, weight: result.weight ?? 1.0, assertions: result.assertions ?? [], diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts index b9263c399..24bda1d06 100644 --- a/apps/cli/src/commands/pipeline/grade.ts +++ b/apps/cli/src/commands/pipeline/grade.ts @@ -1,13 +1,16 @@ /** - * `agentv pipeline grade` — Run code-grader assertions against response.md files - * in an export directory produced by `pipeline input`. + * `agentv pipeline grade` — Run code-grader and built-in deterministic assertions + * against response.md files in an export directory produced by `pipeline input`. * - * For each test, reads code_graders/.json configs, executes each grader - * with the response text on stdin (matching CodeEvaluator payload format), - * and writes results to code_grader_results/.json. + * For each test: + * - Reads code_graders/.json configs, executes each grader script, + * and writes results to code_grader_results/.json. + * - Reads builtin_graders/.json configs, evaluates deterministic assertions + * (contains, regex, equals, etc.) in-process, and writes results to + * code_grader_results/.json (same directory, so pipeline bench merges them). * - * Graders run concurrently (default: 4 workers) for performance. - * Progress is printed to stderr so users see real-time feedback. + * Code graders run concurrently (default: 10 workers) for performance. + * Built-in graders are synchronous and evaluate instantly after code graders finish. * * Export directory additions: * ///code_grader_results/.json @@ -15,7 +18,21 @@ import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; -import { executeScript } from '@agentv/core'; +import { + type AssertionResult, + executeScript, + runContainsAllAssertion, + runContainsAnyAssertion, + runContainsAssertion, + runEndsWithAssertion, + runEqualsAssertion, + runIcontainsAllAssertion, + runIcontainsAnyAssertion, + runIcontainsAssertion, + runIsJsonAssertion, + runRegexAssertion, + runStartsWithAssertion, +} from '@agentv/core'; import { command, number, option, optional, positional, string } from 'cmd-ts'; const DEFAULT_CONCURRENCY = 10; @@ -175,9 +192,130 @@ export async function runCodeGraders( return { totalGraders, totalPassed }; } +/** + * Evaluate a single built-in deterministic assertion against the response text. + * + * Dispatches to the appropriate assertion function based on the config type. + * Returns the assertion result with score and descriptive assertions array. + * + * To add a new built-in assertion type: + * 1. Import the runner from @agentv/core + * 2. Add a case to the switch below + * 3. Add the type to BUILTIN_ASSERTION_TYPES in pipeline/input.ts + */ +function evaluateBuiltinAssertion( + config: { type: string; value?: unknown; flags?: string }, + responseText: string, +): AssertionResult { + const value = config.value; + switch (config.type) { + case 'contains': + return runContainsAssertion(responseText, value as string); + case 'contains-any': + return runContainsAnyAssertion(responseText, value as string[]); + case 'contains-all': + return runContainsAllAssertion(responseText, value as string[]); + case 'icontains': + return runIcontainsAssertion(responseText, value as string); + case 'icontains-any': + return runIcontainsAnyAssertion(responseText, value as string[]); + case 'icontains-all': + return runIcontainsAllAssertion(responseText, value as string[]); + case 'starts-with': + return runStartsWithAssertion(responseText, value as string); + case 'ends-with': + return runEndsWithAssertion(responseText, value as string); + case 'regex': + return runRegexAssertion(responseText, value as string, config.flags); + case 'is-json': + return runIsJsonAssertion(responseText); + case 'equals': + return runEqualsAssertion(responseText, value as string); + default: + return { + score: 0, + assertions: [{ text: `Unknown assertion type: ${config.type}`, passed: false }], + }; + } +} + +/** + * Run built-in deterministic assertions for all tests in the export directory. + * Reads configs from builtin_graders/.json, evaluates in-process, + * and writes results to code_grader_results/.json. + */ +async function runBuiltinGraders( + exportDir: string, + testIds: string[], + safeSuiteName: string, +): Promise<{ total: number; passed: number }> { + let total = 0; + let passed = 0; + + for (const testId of testIds) { + const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId]; + const testDir = join(exportDir, ...subpath); + const builtinGradersDir = join(testDir, 'builtin_graders'); + + let graderFiles: string[]; + try { + graderFiles = (await readdir(builtinGradersDir)).filter((f) => f.endsWith('.json')); + } catch { + continue; // No builtin graders for this test + } + + if (graderFiles.length === 0) continue; + + const resultsDir = join(testDir, 'code_grader_results'); + await mkdir(resultsDir, { recursive: true }); + + let responseText: string; + try { + responseText = await readFile(join(testDir, 'response.md'), 'utf8'); + } catch { + continue; // No response yet — skip + } + + for (const file of graderFiles) { + const config = JSON.parse(await readFile(join(builtinGradersDir, file), 'utf8')); + const raw = evaluateBuiltinAssertion(config, responseText); + + // Apply negate if configured + const negate = config.negate === true; + const score = negate ? 1 - raw.score : raw.score; + const assertions = negate + ? raw.assertions.map((a: { text: string; passed: boolean }) => ({ + text: a.text, + passed: !a.passed, + })) + : raw.assertions; + + const result = { + name: config.name, + type: config.type, + score, + weight: config.weight ?? 1.0, + assertions, + details: {}, + }; + + await writeFile( + join(resultsDir, `${config.name}.json`), + `${JSON.stringify(result, null, 2)}\n`, + 'utf8', + ); + + total++; + if (score >= 0.5) passed++; + } + } + + return { total, passed }; +} + export const evalGradeCommand = command({ name: 'grade', - description: 'Run code-grader assertions on responses in an export directory', + description: 'Run code-grader and built-in assertions on responses in an export directory', args: { exportDir: positional({ type: string, @@ -199,7 +337,7 @@ export const evalGradeCommand = command({ const suiteName: string = manifest.suite ?? ''; const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : ''; - // Collect all grader tasks upfront so we know the total count + // Collect all code-grader tasks upfront so we know the total count const tasks: GraderTask[] = []; for (const testId of testIds) { @@ -212,22 +350,31 @@ export const evalGradeCommand = command({ try { graderFiles = (await readdir(codeGradersDir)).filter((f) => f.endsWith('.json')); } catch { - continue; // No code graders for this test + graderFiles = []; } - if (graderFiles.length === 0) continue; - await mkdir(resultsDir, { recursive: true }); - - // Read response and input once per test (shared by all graders for this test) - const responseText = await readFile(join(testDir, 'response.md'), 'utf8'); - const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8')); + if (graderFiles.length > 0) { + await mkdir(resultsDir, { recursive: true }); + const responseText = await readFile(join(testDir, 'response.md'), 'utf8'); + const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8')); - for (const graderFile of graderFiles) { - tasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData }); + for (const graderFile of graderFiles) { + tasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData }); + } } } const { totalGraders, totalPassed } = await runCodeGraders(tasks, maxWorkers); - console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`); + + // Run built-in deterministic assertions (contains, regex, equals, etc.) + const builtin = await runBuiltinGraders(exportDir, testIds, safeSuiteName); + + const totalAll = totalGraders + builtin.total; + const passedAll = totalPassed + builtin.passed; + const parts: string[] = []; + if (totalGraders > 0) parts.push(`${totalGraders} code-grader(s)`); + if (builtin.total > 0) parts.push(`${builtin.total} built-in assertion(s)`); + if (parts.length === 0) parts.push('0 grader(s)'); + console.log(`Graded ${parts.join(' + ')}: ${passedAll}/${totalAll} passed`); }, }); diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index c3a54e20d..486795658 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -16,13 +16,29 @@ * ├── criteria.md * ├── expected_output.json (if present) * ├── llm_graders/.json - * └── code_graders/.json + * ├── code_graders/.json + * └── builtin_graders/.json */ import { readFile } from 'node:fs/promises'; import { mkdir, writeFile } from 'node:fs/promises'; import { dirname, join, relative, resolve } from 'node:path'; import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core'; + +/** Assertion types that can be graded deterministically without external scripts or LLMs. */ +const BUILTIN_ASSERTION_TYPES = new Set([ + 'contains', + 'contains-any', + 'contains-all', + 'icontains', + 'icontains-any', + 'icontains-all', + 'starts-with', + 'ends-with', + 'regex', + 'is-json', + 'equals', +]); import { deriveCategory, loadTestSuite } from '@agentv/core'; import { command, option, optional, positional, string } from 'cmd-ts'; @@ -190,9 +206,11 @@ async function writeGraderConfigs( ): Promise { const codeGradersDir = join(testDir, 'code_graders'); const llmGradersDir = join(testDir, 'llm_graders'); + const builtinGradersDir = join(testDir, 'builtin_graders'); let hasCodeGraders = false; let hasLlmGraders = false; + let hasBuiltinGraders = false; for (const assertion of assertions) { if (assertion.type === 'code-grader') { @@ -233,6 +251,20 @@ async function writeGraderConfigs( threshold: 0.5, config: {}, }); + } else if (BUILTIN_ASSERTION_TYPES.has(assertion.type)) { + if (!hasBuiltinGraders) { + await mkdir(builtinGradersDir, { recursive: true }); + hasBuiltinGraders = true; + } + const config = assertion as EvaluatorConfig & { value?: unknown; flags?: string }; + await writeJson(join(builtinGradersDir, `${config.name}.json`), { + name: config.name, + type: config.type, + value: config.value, + flags: (config as { flags?: string }).flags, + weight: config.weight ?? 1.0, + negate: config.negate ?? false, + }); } } } diff --git a/apps/cli/test/commands/eval/pipeline/fixtures/builtin-test.eval.yaml b/apps/cli/test/commands/eval/pipeline/fixtures/builtin-test.eval.yaml new file mode 100644 index 000000000..7a6984d73 --- /dev/null +++ b/apps/cli/test/commands/eval/pipeline/fixtures/builtin-test.eval.yaml @@ -0,0 +1,14 @@ +name: builtin-test +tests: + - id: test-01 + input: hello world + criteria: Response echoes the input + assertions: + - name: has_hello + type: contains + value: hello + - name: matches_pattern + type: regex + value: "h[aeiou]llo" + - name: is_valid_json + type: is-json diff --git a/apps/cli/test/commands/eval/pipeline/grade.test.ts b/apps/cli/test/commands/eval/pipeline/grade.test.ts index 66aa45b12..5a5676c92 100644 --- a/apps/cli/test/commands/eval/pipeline/grade.test.ts +++ b/apps/cli/test/commands/eval/pipeline/grade.test.ts @@ -68,3 +68,135 @@ describe('pipeline grade', () => { expect(result.assertions[0].passed).toBe(true); }); }); + +describe('pipeline grade — builtin assertions', () => { + const BUILTIN_OUT = join(import.meta.dirname, '__tmp_grade_builtin_test__'); + + beforeEach(async () => { + const testDir = join(BUILTIN_OUT, 'test-01'); + const builtinGradersDir = join(testDir, 'builtin_graders'); + await mkdir(builtinGradersDir, { recursive: true }); + + await writeFile(join(testDir, 'response.md'), 'hello world'); + await writeFile( + join(testDir, 'input.json'), + JSON.stringify({ input: [{ role: 'user', content: 'say hello' }] }), + ); + + // contains assertion — should pass + await writeFile( + join(builtinGradersDir, 'has_hello.json'), + JSON.stringify({ + name: 'has_hello', + type: 'contains', + value: 'hello', + weight: 1.0, + negate: false, + }), + ); + + // regex assertion — should pass + await writeFile( + join(builtinGradersDir, 'matches_pattern.json'), + JSON.stringify({ + name: 'matches_pattern', + type: 'regex', + value: 'h[aeiou]llo', + weight: 1.0, + negate: false, + }), + ); + + // contains assertion — should fail + await writeFile( + join(builtinGradersDir, 'has_goodbye.json'), + JSON.stringify({ + name: 'has_goodbye', + type: 'contains', + value: 'goodbye', + weight: 1.0, + negate: false, + }), + ); + + await writeFile( + join(BUILTIN_OUT, 'manifest.json'), + JSON.stringify({ + eval_file: 'test.eval.yaml', + timestamp: new Date().toISOString(), + target: { name: 'test', kind: 'cli' }, + test_ids: ['test-01'], + }), + ); + }); + + afterEach(async () => { + await rm(BUILTIN_OUT, { recursive: true, force: true }); + }); + + it('evaluates contains assertion and writes result', async () => { + const { execa } = await import('execa'); + await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]); + + const result = JSON.parse( + await readFile(join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'has_hello.json'), 'utf8'), + ); + expect(result.score).toBe(1); + expect(result.type).toBe('contains'); + expect(result.assertions[0].passed).toBe(true); + }); + + it('evaluates regex assertion and writes result', async () => { + const { execa } = await import('execa'); + await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]); + + const result = JSON.parse( + await readFile( + join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'matches_pattern.json'), + 'utf8', + ), + ); + expect(result.score).toBe(1); + expect(result.type).toBe('regex'); + }); + + it('scores 0 when contains assertion does not match', async () => { + const { execa } = await import('execa'); + await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]); + + const result = JSON.parse( + await readFile( + join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'has_goodbye.json'), + 'utf8', + ), + ); + expect(result.score).toBe(0); + expect(result.assertions[0].passed).toBe(false); + }); + + it('applies negate to invert score', async () => { + // Overwrite has_goodbye with negate: true — "not contains goodbye" should pass + await writeFile( + join(BUILTIN_OUT, 'test-01', 'builtin_graders', 'has_goodbye.json'), + JSON.stringify({ + name: 'has_goodbye', + type: 'contains', + value: 'goodbye', + weight: 1.0, + negate: true, + }), + ); + + const { execa } = await import('execa'); + await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]); + + const result = JSON.parse( + await readFile( + join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'has_goodbye.json'), + 'utf8', + ), + ); + expect(result.score).toBe(1); + expect(result.assertions[0].passed).toBe(true); + }); +}); diff --git a/apps/cli/test/commands/eval/pipeline/input.test.ts b/apps/cli/test/commands/eval/pipeline/input.test.ts index 445c05a0d..8a525a11d 100644 --- a/apps/cli/test/commands/eval/pipeline/input.test.ts +++ b/apps/cli/test/commands/eval/pipeline/input.test.ts @@ -102,4 +102,30 @@ describe('pipeline input', () => { const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8')); expect(manifest.experiment).toBeUndefined(); }); + + it('writes builtin_graders/.json for deterministic assertions', async () => { + const { execa } = await import('execa'); + const builtinEvalPath = join(FIXTURE_DIR, 'builtin-test.eval.yaml'); + await execa('bun', [CLI_ENTRY, 'pipeline', 'input', builtinEvalPath, '--out', OUT_DIR]); + + const containsGrader = JSON.parse( + await readFile( + join(OUT_DIR, 'builtin-test', 'test-01', 'builtin_graders', 'has_hello.json'), + 'utf8', + ), + ); + expect(containsGrader.name).toBe('has_hello'); + expect(containsGrader.type).toBe('contains'); + expect(containsGrader.value).toBe('hello'); + + const regexGrader = JSON.parse( + await readFile( + join(OUT_DIR, 'builtin-test', 'test-01', 'builtin_graders', 'matches_pattern.json'), + 'utf8', + ), + ); + expect(regexGrader.name).toBe('matches_pattern'); + expect(regexGrader.type).toBe('regex'); + expect(regexGrader.value).toBe('h[aeiou]llo'); + }); }); diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index a44e7d6e7..79bf7066e 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -107,6 +107,20 @@ export type { } from './evaluation/registry/evaluator-registry.js'; export { createBuiltinRegistry } from './evaluation/registry/builtin-evaluators.js'; export { discoverAssertions } from './evaluation/registry/assertion-discovery.js'; +export { + runContainsAssertion, + runContainsAnyAssertion, + runContainsAllAssertion, + runIcontainsAssertion, + runIcontainsAnyAssertion, + runIcontainsAllAssertion, + runStartsWithAssertion, + runEndsWithAssertion, + runRegexAssertion, + runIsJsonAssertion, + runEqualsAssertion, + type AssertionResult, +} from './evaluation/evaluators/assertions.js'; export { discoverGraders, discoverGraders as discoverJudges,