From 729b85c549163232203b18735d565346aa0d16da Mon Sep 17 00:00:00 2001
From: Christopher <christso@gmail.com>
Date: Mon, 13 Apr 2026 20:58:54 +0000
Subject: [PATCH] fix(pipeline): grade built-in deterministic assertions in
 subagent mode

pipeline grade now evaluates contains, regex, equals, starts-with,
ends-with, is-json, and other built-in assertion types against response.md.
Previously these were silently ignored, producing score: 0 for tests
with only deterministic assertions.

Closes #1075

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 apps/cli/src/commands/pipeline/bench.ts       |   2 +-
 apps/cli/src/commands/pipeline/grade.ts       | 187 ++++++++++++++++--
 apps/cli/src/commands/pipeline/input.ts       |  34 +++-
 .../pipeline/fixtures/builtin-test.eval.yaml  |  14 ++
 .../test/commands/eval/pipeline/grade.test.ts | 132 +++++++++++++
 .../test/commands/eval/pipeline/input.test.ts |  26 +++
 packages/core/src/index.ts                    |  14 ++
 7 files changed, 387 insertions(+), 22 deletions(-)
 create mode 100644 apps/cli/test/commands/eval/pipeline/fixtures/builtin-test.eval.yaml
diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts
index 691e506d8..1a57a2db1 100644
--- a/apps/cli/src/commands/pipeline/bench.ts
+++ b/apps/cli/src/commands/pipeline/bench.ts
@@ -62,7 +62,7 @@ export const evalBenchCommand = command({
           const result = JSON.parse(await readFile(join(codeResultsDir, file), 'utf8'));
           evaluators.push({
             name: result.name,
-            type: 'code-grader',
+            type: result.type ?? 'code-grader',
             score: result.score,
             weight: result.weight ?? 1.0,
             assertions: result.assertions ?? [],
diff --git a/apps/cli/src/commands/pipeline/grade.ts b/apps/cli/src/commands/pipeline/grade.ts
index b9263c399..24bda1d06 100644
--- a/apps/cli/src/commands/pipeline/grade.ts
+++ b/apps/cli/src/commands/pipeline/grade.ts
@@ -1,13 +1,16 @@
 /**
- * `agentv pipeline grade` — Run code-grader assertions against response.md files
- * in an export directory produced by `pipeline input`.
+ * `agentv pipeline grade` — Run code-grader and built-in deterministic assertions
+ * against response.md files in an export directory produced by `pipeline input`.
  *
- * For each test, reads code_graders/<name>.json configs, executes each grader
- * with the response text on stdin (matching CodeEvaluator payload format),
- * and writes results to code_grader_results/<name>.json.
+ * For each test:
+ * - Reads code_graders/<name>.json configs, executes each grader script,
+ *   and writes results to code_grader_results/<name>.json.
+ * - Reads builtin_graders/<name>.json configs, evaluates deterministic assertions
+ *   (contains, regex, equals, etc.) in-process, and writes results to
+ *   code_grader_results/<name>.json (same directory, so pipeline bench merges them).
  *
- * Graders run concurrently (default: 4 workers) for performance.
- * Progress is printed to stderr so users see real-time feedback.
+ * Code graders run concurrently (default: 10 workers) for performance.
+ * Built-in assertions are evaluated in-process, one at a time, after all code graders finish.
  *
  * Export directory additions:
  *   <out-dir>/<suite>/<test-id>/code_grader_results/<name>.json
@@ -15,7 +18,21 @@
 import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
 import { join } from 'node:path';
 
-import { executeScript } from '@agentv/core';
+import {
+  type AssertionResult,
+  executeScript,
+  runContainsAllAssertion,
+  runContainsAnyAssertion,
+  runContainsAssertion,
+  runEndsWithAssertion,
+  runEqualsAssertion,
+  runIcontainsAllAssertion,
+  runIcontainsAnyAssertion,
+  runIcontainsAssertion,
+  runIsJsonAssertion,
+  runRegexAssertion,
+  runStartsWithAssertion,
+} from '@agentv/core';
 import { command, number, option, optional, positional, string } from 'cmd-ts';
 
 const DEFAULT_CONCURRENCY = 10;
@@ -175,9 +192,130 @@ export async function runCodeGraders(
   return { totalGraders, totalPassed };
 }
 
+/**
+ * Grade one built-in deterministic assertion against a response.
+ *
+ * `config.type` selects the @agentv/core runner. `config.value` is passed as-is
+ * (casts are type-level only) as a string or string list; `config.flags` goes
+ * only to `regex`. An unrecognised type yields score 0 and one failed assertion.
+ *
+ * Supporting another type: import its runner from @agentv/core, add a case
+ * below, and list the type in BUILTIN_ASSERTION_TYPES in pipeline/input.ts.
+ */
+function evaluateBuiltinAssertion(
+  config: { type: string; value?: unknown; flags?: string },
+  responseText: string,
+): AssertionResult {
+  const { type, value, flags } = config;
+  const needle = value as string;
+  const needles = value as string[];
+  switch (type) {
+    case 'is-json':
+      return runIsJsonAssertion(responseText);
+    case 'equals':
+      return runEqualsAssertion(responseText, needle);
+    case 'regex':
+      return runRegexAssertion(responseText, needle, flags);
+    case 'starts-with':
+      return runStartsWithAssertion(responseText, needle);
+    case 'ends-with':
+      return runEndsWithAssertion(responseText, needle);
+    case 'contains':
+      return runContainsAssertion(responseText, needle);
+    case 'icontains':
+      return runIcontainsAssertion(responseText, needle);
+    case 'contains-any':
+      return runContainsAnyAssertion(responseText, needles);
+    case 'contains-all':
+      return runContainsAllAssertion(responseText, needles);
+    case 'icontains-any':
+      return runIcontainsAnyAssertion(responseText, needles);
+    case 'icontains-all':
+      return runIcontainsAllAssertion(responseText, needles);
+    default: {
+      const text = `Unknown assertion type: ${type}`;
+      return { score: 0, assertions: [{ text, passed: false }] };
+    }
+  }
+}
+
+/**
+ * Grade built-in deterministic assertions for every test in the export directory.
+ * Reads each builtin_graders/<name>.json, evaluates it in-process against
+ * response.md, applies `negate`, and writes code_grader_results/<name>.json.
+ * A test whose response.md cannot be read is skipped with a warning on stderr.
+ * Returns the number of assertions graded and how many scored at least 0.5.
+ */
+async function runBuiltinGraders(
+  exportDir: string,
+  testIds: string[],
+  safeSuiteName: string,
+): Promise<{ total: number; passed: number }> {
+  let total = 0;
+  let passed = 0;
+
+  for (const testId of testIds) {
+    const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
+    const testDir = join(exportDir, ...subpath);
+    const builtinGradersDir = join(testDir, 'builtin_graders');
+
+    let graderFiles: string[];
+    try {
+      graderFiles = (await readdir(builtinGradersDir)).filter((f) => f.endsWith('.json'));
+    } catch {
+      continue; // No builtin graders for this test
+    }
+    if (graderFiles.length === 0) continue;
+
+    let responseText: string;
+    try {
+      responseText = await readFile(join(testDir, 'response.md'), 'utf8');
+    } catch {
+      console.error(`Skipping built-in assertions for ${testId}: cannot read response.md`);
+      continue;
+    }
+
+    // Created only after the response is read, so skipped tests get no empty results dir.
+    const resultsDir = join(testDir, 'code_grader_results');
+    await mkdir(resultsDir, { recursive: true });
+
+    for (const file of graderFiles) {
+      const config = JSON.parse(await readFile(join(builtinGradersDir, file), 'utf8'));
+      const raw = evaluateBuiltinAssertion(config, responseText);
+      // Negation flips the score and every assertion's pass/fail flag.
+      const negate = config.negate === true;
+      const score = negate ? 1 - raw.score : raw.score;
+      const assertions = negate
+        ? raw.assertions.map((a: { text: string; passed: boolean }) => ({
+            text: a.text,
+            passed: !a.passed,
+          }))
+        : raw.assertions;
+
+      const result = {
+        name: config.name,
+        type: config.type,
+        score,
+        weight: config.weight ?? 1.0,
+        assertions,
+        details: {},
+      };
+      await writeFile(
+        join(resultsDir, `${config.name}.json`),
+        `${JSON.stringify(result, null, 2)}\n`,
+        'utf8',
+      );
+      total++;
+      if (score >= 0.5) passed++;
+    }
+  }
+
+  return { total, passed };
+}
+
 export const evalGradeCommand = command({
   name: 'grade',
-  description: 'Run code-grader assertions on responses in an export directory',
+  description: 'Run code-grader and built-in assertions on responses in an export directory',
   args: {
     exportDir: positional({
       type: string,
@@ -199,7 +337,7 @@ export const evalGradeCommand = command({
     const suiteName: string = manifest.suite ?? '';
     const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';
 
-    // Collect all grader tasks upfront so we know the total count
+    // Collect all code-grader tasks upfront so we know the total count
     const tasks: GraderTask[] = [];
 
     for (const testId of testIds) {
@@ -212,22 +350,31 @@ export const evalGradeCommand = command({
       try {
         graderFiles = (await readdir(codeGradersDir)).filter((f) => f.endsWith('.json'));
       } catch {
-        continue; // No code graders for this test
+        graderFiles = [];
       }
 
-      if (graderFiles.length === 0) continue;
-      await mkdir(resultsDir, { recursive: true });
-
-      // Read response and input once per test (shared by all graders for this test)
-      const responseText = await readFile(join(testDir, 'response.md'), 'utf8');
-      const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8'));
+      if (graderFiles.length > 0) {
+        await mkdir(resultsDir, { recursive: true });
+        const responseText = await readFile(join(testDir, 'response.md'), 'utf8');
+        const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8'));
 
-      for (const graderFile of graderFiles) {
-        tasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
+        for (const graderFile of graderFiles) {
+          tasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
+        }
       }
     }
 
     const { totalGraders, totalPassed } = await runCodeGraders(tasks, maxWorkers);
-    console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
+
+    // Run built-in deterministic assertions (contains, regex, equals, etc.)
+    const builtin = await runBuiltinGraders(exportDir, testIds, safeSuiteName);
+
+    const totalAll = totalGraders + builtin.total;
+    const passedAll = totalPassed + builtin.passed;
+    const parts: string[] = [];
+    if (totalGraders > 0) parts.push(`${totalGraders} code-grader(s)`);
+    if (builtin.total > 0) parts.push(`${builtin.total} built-in assertion(s)`);
+    if (parts.length === 0) parts.push('0 grader(s)');
+    console.log(`Graded ${parts.join(' + ')}: ${passedAll}/${totalAll} passed`);
   },
 });
diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts
index c3a54e20d..486795658 100644
--- a/apps/cli/src/commands/pipeline/input.ts
+++ b/apps/cli/src/commands/pipeline/input.ts
@@ -16,13 +16,29 @@
  *           ├── criteria.md
  *           ├── expected_output.json    (if present)
  *           ├── llm_graders/<name>.json
- *           └── code_graders/<name>.json
+ *           ├── code_graders/<name>.json
+ *           └── builtin_graders/<name>.json
  */
 import { readFile } from 'node:fs/promises';
 import { mkdir, writeFile } from 'node:fs/promises';
 import { dirname, join, relative, resolve } from 'node:path';
 
 import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
+
+/** Assertion types that can be graded deterministically without external scripts or LLMs. */
+const BUILTIN_ASSERTION_TYPES = new Set([
+  'contains',
+  'contains-any',
+  'contains-all',
+  'icontains',
+  'icontains-any',
+  'icontains-all',
+  'starts-with',
+  'ends-with',
+  'regex',
+  'is-json',
+  'equals',
+]);
 import { deriveCategory, loadTestSuite } from '@agentv/core';
 import { command, option, optional, positional, string } from 'cmd-ts';
 
@@ -190,9 +206,11 @@ async function writeGraderConfigs(
 ): Promise<void> {
   const codeGradersDir = join(testDir, 'code_graders');
   const llmGradersDir = join(testDir, 'llm_graders');
+  const builtinGradersDir = join(testDir, 'builtin_graders');
 
   let hasCodeGraders = false;
   let hasLlmGraders = false;
+  let hasBuiltinGraders = false;
 
   for (const assertion of assertions) {
     if (assertion.type === 'code-grader') {
@@ -233,6 +251,20 @@ async function writeGraderConfigs(
         threshold: 0.5,
         config: {},
       });
+    } else if (BUILTIN_ASSERTION_TYPES.has(assertion.type)) {
+      if (!hasBuiltinGraders) {
+        await mkdir(builtinGradersDir, { recursive: true });
+        hasBuiltinGraders = true;
+      }
+      const config = assertion as EvaluatorConfig & { value?: unknown; flags?: string };
+      await writeJson(join(builtinGradersDir, `${config.name}.json`), {
+        name: config.name,
+        type: config.type,
+        value: config.value,
+        flags: (config as { flags?: string }).flags,
+        weight: config.weight ?? 1.0,
+        negate: config.negate ?? false,
+      });
     }
   }
 }
diff --git a/apps/cli/test/commands/eval/pipeline/fixtures/builtin-test.eval.yaml b/apps/cli/test/commands/eval/pipeline/fixtures/builtin-test.eval.yaml
new file mode 100644
index 000000000..7a6984d73
--- /dev/null
+++ b/apps/cli/test/commands/eval/pipeline/fixtures/builtin-test.eval.yaml
@@ -0,0 +1,14 @@
+name: builtin-test
+tests:
+  - id: test-01
+    input: hello world
+    criteria: Response echoes the input
+    assertions:
+      - name: has_hello
+        type: contains
+        value: hello
+      - name: matches_pattern
+        type: regex
+        value: "h[aeiou]llo"
+      - name: is_valid_json
+        type: is-json
diff --git a/apps/cli/test/commands/eval/pipeline/grade.test.ts b/apps/cli/test/commands/eval/pipeline/grade.test.ts
index 66aa45b12..5a5676c92 100644
--- a/apps/cli/test/commands/eval/pipeline/grade.test.ts
+++ b/apps/cli/test/commands/eval/pipeline/grade.test.ts
@@ -68,3 +68,135 @@ describe('pipeline grade', () => {
     expect(result.assertions[0].passed).toBe(true);
   });
 });
+
+describe('pipeline grade — builtin assertions', () => {
+  const BUILTIN_OUT = join(import.meta.dirname, '__tmp_grade_builtin_test__');
+
+  beforeEach(async () => {
+    const testDir = join(BUILTIN_OUT, 'test-01');
+    const builtinGradersDir = join(testDir, 'builtin_graders');
+    await mkdir(builtinGradersDir, { recursive: true });
+
+    await writeFile(join(testDir, 'response.md'), 'hello world');
+    await writeFile(
+      join(testDir, 'input.json'),
+      JSON.stringify({ input: [{ role: 'user', content: 'say hello' }] }),
+    );
+
+    // `contains` whose value occurs in response.md ('hello world') — expected to score 1
+    await writeFile(
+      join(builtinGradersDir, 'has_hello.json'),
+      JSON.stringify({
+        name: 'has_hello',
+        type: 'contains',
+        value: 'hello',
+        weight: 1.0,
+        negate: false,
+      }),
+    );
+
+    // `regex` whose pattern matches response.md — expected to score 1
+    await writeFile(
+      join(builtinGradersDir, 'matches_pattern.json'),
+      JSON.stringify({
+        name: 'matches_pattern',
+        type: 'regex',
+        value: 'h[aeiou]llo',
+        weight: 1.0,
+        negate: false,
+      }),
+    );
+
+    // `contains` whose value is absent from response.md — expected to score 0
+    await writeFile(
+      join(builtinGradersDir, 'has_goodbye.json'),
+      JSON.stringify({
+        name: 'has_goodbye',
+        type: 'contains',
+        value: 'goodbye',
+        weight: 1.0,
+        negate: false,
+      }),
+    );
+
+    await writeFile(
+      join(BUILTIN_OUT, 'manifest.json'),
+      JSON.stringify({
+        eval_file: 'test.eval.yaml',
+        timestamp: new Date().toISOString(),
+        target: { name: 'test', kind: 'cli' },
+        test_ids: ['test-01'],
+      }),
+    );
+  });
+
+  afterEach(async () => {
+    await rm(BUILTIN_OUT, { recursive: true, force: true });
+  });
+
+  it('evaluates contains assertion and writes result', async () => {
+    const { execa } = await import('execa');
+    await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]);
+
+    const result = JSON.parse(
+      await readFile(join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'has_hello.json'), 'utf8'),
+    );
+    expect(result.score).toBe(1);
+    expect(result.type).toBe('contains');
+    expect(result.assertions[0].passed).toBe(true);
+  });
+
+  it('evaluates regex assertion and writes result', async () => {
+    const { execa } = await import('execa');
+    await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]);
+
+    const result = JSON.parse(
+      await readFile(
+        join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'matches_pattern.json'),
+        'utf8',
+      ),
+    );
+    expect(result.score).toBe(1);
+    expect(result.type).toBe('regex');
+  });
+
+  it('scores 0 when contains assertion does not match', async () => {
+    const { execa } = await import('execa');
+    await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]);
+
+    const result = JSON.parse(
+      await readFile(
+        join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'has_goodbye.json'),
+        'utf8',
+      ),
+    );
+    expect(result.score).toBe(0);
+    expect(result.assertions[0].passed).toBe(false);
+  });
+
+  it('applies negate to invert score', async () => {
+    // Rewrite has_goodbye with negate: true — the failing `contains` is inverted and should pass
+    await writeFile(
+      join(BUILTIN_OUT, 'test-01', 'builtin_graders', 'has_goodbye.json'),
+      JSON.stringify({
+        name: 'has_goodbye',
+        type: 'contains',
+        value: 'goodbye',
+        weight: 1.0,
+        negate: true,
+      }),
+    );
+
+    const { execa } = await import('execa');
+    await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]);
+
+    const result = JSON.parse(
+      await readFile(
+        join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'has_goodbye.json'),
+        'utf8',
+      ),
+    );
+    expect(result.score).toBe(1);
+    expect(result.assertions[0].passed).toBe(true);
+  });
+});
diff --git a/apps/cli/test/commands/eval/pipeline/input.test.ts b/apps/cli/test/commands/eval/pipeline/input.test.ts
index 445c05a0d..8a525a11d 100644
--- a/apps/cli/test/commands/eval/pipeline/input.test.ts
+++ b/apps/cli/test/commands/eval/pipeline/input.test.ts
@@ -102,4 +102,30 @@ describe('pipeline input', () => {
     const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8'));
     expect(manifest.experiment).toBeUndefined();
   });
+
+  it('writes builtin_graders/<name>.json for deterministic assertions', async () => {
+    const { execa } = await import('execa');
+    const builtinEvalPath = join(FIXTURE_DIR, 'builtin-test.eval.yaml');
+    await execa('bun', [CLI_ENTRY, 'pipeline', 'input', builtinEvalPath, '--out', OUT_DIR]);
+
+    // Every assertion in the fixture is a built-in type, so each should get a config file.
+    const gradersDir = join(OUT_DIR, 'builtin-test', 'test-01', 'builtin_graders');
+    const loadGrader = async (name: string) =>
+      JSON.parse(await readFile(join(gradersDir, `${name}.json`), 'utf8'));
+
+    const containsGrader = await loadGrader('has_hello');
+    expect(containsGrader.name).toBe('has_hello');
+    expect(containsGrader.type).toBe('contains');
+    expect(containsGrader.value).toBe('hello');
+
+    const regexGrader = await loadGrader('matches_pattern');
+    expect(regexGrader.name).toBe('matches_pattern');
+    expect(regexGrader.type).toBe('regex');
+    expect(regexGrader.value).toBe('h[aeiou]llo');
+
+    // is-json takes no value in the fixture, so only its name and type are checked.
+    const isJsonGrader = await loadGrader('is_valid_json');
+    expect(isJsonGrader.name).toBe('is_valid_json');
+    expect(isJsonGrader.type).toBe('is-json');
+  });
 });
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index a44e7d6e7..79bf7066e 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -107,6 +107,20 @@ export type {
 } from './evaluation/registry/evaluator-registry.js';
 export { createBuiltinRegistry } from './evaluation/registry/builtin-evaluators.js';
 export { discoverAssertions } from './evaluation/registry/assertion-discovery.js';
+export {
+  runContainsAssertion,
+  runContainsAnyAssertion,
+  runContainsAllAssertion,
+  runIcontainsAssertion,
+  runIcontainsAnyAssertion,
+  runIcontainsAllAssertion,
+  runStartsWithAssertion,
+  runEndsWithAssertion,
+  runRegexAssertion,
+  runIsJsonAssertion,
+  runEqualsAssertion,
+  type AssertionResult,
+} from './evaluation/evaluators/assertions.js';
 export {
   discoverGraders,
   discoverGraders as discoverJudges,