Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/cli/src/commands/pipeline/bench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ export const evalBenchCommand = command({
const result = JSON.parse(await readFile(join(codeResultsDir, file), 'utf8'));
evaluators.push({
name: result.name,
type: 'code-grader',
type: result.type ?? 'code-grader',
score: result.score,
weight: result.weight ?? 1.0,
assertions: result.assertions ?? [],
Expand Down
187 changes: 167 additions & 20 deletions apps/cli/src/commands/pipeline/grade.ts
Original file line number Diff line number Diff line change
@@ -1,21 +1,38 @@
/**
* `agentv pipeline grade` — Run code-grader assertions against response.md files
* in an export directory produced by `pipeline input`.
* `agentv pipeline grade` — Run code-grader and built-in deterministic assertions
* against response.md files in an export directory produced by `pipeline input`.
*
* For each test, reads code_graders/<name>.json configs, executes each grader
* with the response text on stdin (matching CodeEvaluator payload format),
* and writes results to code_grader_results/<name>.json.
* For each test:
* - Reads code_graders/<name>.json configs, executes each grader script,
* and writes results to code_grader_results/<name>.json.
* - Reads builtin_graders/<name>.json configs, evaluates deterministic assertions
* (contains, regex, equals, etc.) in-process, and writes results to
* code_grader_results/<name>.json (same directory, so pipeline bench merges them).
*
* Graders run concurrently (default: 4 workers) for performance.
* Progress is printed to stderr so users see real-time feedback.
* Code graders run concurrently (default: 10 workers) for performance.
* Built-in graders are synchronous and evaluate instantly after code graders finish.
*
* Export directory additions:
* <out-dir>/<suite>/<test-id>/code_grader_results/<name>.json
*/
import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
import { join } from 'node:path';

import { executeScript } from '@agentv/core';
import {
type AssertionResult,
executeScript,
runContainsAllAssertion,
runContainsAnyAssertion,
runContainsAssertion,
runEndsWithAssertion,
runEqualsAssertion,
runIcontainsAllAssertion,
runIcontainsAnyAssertion,
runIcontainsAssertion,
runIsJsonAssertion,
runRegexAssertion,
runStartsWithAssertion,
} from '@agentv/core';
import { command, number, option, optional, positional, string } from 'cmd-ts';

const DEFAULT_CONCURRENCY = 10;
Expand Down Expand Up @@ -175,9 +192,130 @@ export async function runCodeGraders(
return { totalGraders, totalPassed };
}

/**
 * Evaluate a single built-in deterministic assertion against the response text.
 *
 * Dispatches to the appropriate assertion function based on the config type.
 * Returns the assertion result with score and descriptive assertions array.
 *
 * Configs are produced by `JSON.parse` of on-disk files, so `value` may be
 * absent or malformed. Every type except 'is-json' requires a comparison
 * value; a missing one yields an explicit failing result instead of passing
 * `undefined` into an assertion runner.
 *
 * To add a new built-in assertion type:
 * 1. Import the runner from @agentv/core
 * 2. Add a case to the switch below
 * 3. Add the type to BUILTIN_ASSERTION_TYPES in pipeline/input.ts
 */
function evaluateBuiltinAssertion(
  config: { type: string; value?: unknown; flags?: string },
  responseText: string,
): AssertionResult {
  const value = config.value;

  // Guard: all types except 'is-json' need a config value. `== null` catches
  // both null and undefined. (An unknown type with a missing value reports
  // the missing value rather than the unknown type; both are score-0.)
  if (value == null && config.type !== 'is-json') {
    return {
      score: 0,
      assertions: [
        { text: `Missing 'value' for assertion type: ${config.type}`, passed: false },
      ],
    };
  }

  switch (config.type) {
    case 'contains':
      return runContainsAssertion(responseText, value as string);
    case 'contains-any':
      return runContainsAnyAssertion(responseText, value as string[]);
    case 'contains-all':
      return runContainsAllAssertion(responseText, value as string[]);
    case 'icontains':
      return runIcontainsAssertion(responseText, value as string);
    case 'icontains-any':
      return runIcontainsAnyAssertion(responseText, value as string[]);
    case 'icontains-all':
      return runIcontainsAllAssertion(responseText, value as string[]);
    case 'starts-with':
      return runStartsWithAssertion(responseText, value as string);
    case 'ends-with':
      return runEndsWithAssertion(responseText, value as string);
    case 'regex':
      return runRegexAssertion(responseText, value as string, config.flags);
    case 'is-json':
      return runIsJsonAssertion(responseText);
    case 'equals':
      return runEqualsAssertion(responseText, value as string);
    default:
      return {
        score: 0,
        assertions: [{ text: `Unknown assertion type: ${config.type}`, passed: false }],
      };
  }
}

/**
 * Run built-in deterministic assertions for all tests in the export directory.
 *
 * Reads configs from builtin_graders/<name>.json, evaluates them in-process
 * via evaluateBuiltinAssertion, and writes results to
 * code_grader_results/<name>.json — the same directory code graders write to,
 * so `pipeline bench` merges both kinds.
 *
 * @param exportDir - Root of the export tree produced by `pipeline input`.
 * @param testIds - Test ids to look for under the (optional) suite directory.
 * @param safeSuiteName - Sanitized suite directory name, or '' for no prefix.
 * @returns Aggregate counts: assertions evaluated and assertions that passed.
 */
async function runBuiltinGraders(
  exportDir: string,
  testIds: string[],
  safeSuiteName: string,
): Promise<{ total: number; passed: number }> {
  let total = 0;
  let passed = 0;

  for (const testId of testIds) {
    const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
    const testDir = join(exportDir, ...subpath);
    const builtinGradersDir = join(testDir, 'builtin_graders');

    let graderFiles: string[];
    try {
      graderFiles = (await readdir(builtinGradersDir)).filter((f) => f.endsWith('.json'));
    } catch {
      continue; // No builtin graders for this test
    }

    if (graderFiles.length === 0) continue;

    // Read the response before touching the results directory so a test with
    // no response yet doesn't leave an empty code_grader_results/ behind.
    let responseText: string;
    try {
      responseText = await readFile(join(testDir, 'response.md'), 'utf8');
    } catch {
      continue; // No response yet — skip
    }

    const resultsDir = join(testDir, 'code_grader_results');
    await mkdir(resultsDir, { recursive: true });

    for (const file of graderFiles) {
      const config = JSON.parse(await readFile(join(builtinGradersDir, file), 'utf8'));
      const raw = evaluateBuiltinAssertion(config, responseText);

      // A negated assertion passes exactly when the underlying check fails:
      // invert the score and each per-assertion passed flag.
      const negate = config.negate === true;
      const score = negate ? 1 - raw.score : raw.score;
      const assertions = negate
        ? raw.assertions.map((a: { text: string; passed: boolean }) => ({
            text: a.text,
            passed: !a.passed,
          }))
        : raw.assertions;

      const result = {
        name: config.name,
        type: config.type,
        score,
        weight: config.weight ?? 1.0,
        assertions,
        details: {},
      };

      await writeFile(
        join(resultsDir, `${config.name}.json`),
        `${JSON.stringify(result, null, 2)}\n`,
        'utf8',
      );

      total++;
      // Same pass threshold the code graders use.
      if (score >= 0.5) passed++;
    }
  }

  return { total, passed };
}

export const evalGradeCommand = command({
name: 'grade',
description: 'Run code-grader assertions on responses in an export directory',
description: 'Run code-grader and built-in assertions on responses in an export directory',
args: {
exportDir: positional({
type: string,
Expand All @@ -199,7 +337,7 @@ export const evalGradeCommand = command({
const suiteName: string = manifest.suite ?? '';
const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';

// Collect all grader tasks upfront so we know the total count
// Collect all code-grader tasks upfront so we know the total count
const tasks: GraderTask[] = [];

for (const testId of testIds) {
Expand All @@ -212,22 +350,31 @@ export const evalGradeCommand = command({
try {
graderFiles = (await readdir(codeGradersDir)).filter((f) => f.endsWith('.json'));
} catch {
continue; // No code graders for this test
graderFiles = [];
}

if (graderFiles.length === 0) continue;
await mkdir(resultsDir, { recursive: true });

// Read response and input once per test (shared by all graders for this test)
const responseText = await readFile(join(testDir, 'response.md'), 'utf8');
const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8'));
if (graderFiles.length > 0) {
await mkdir(resultsDir, { recursive: true });
const responseText = await readFile(join(testDir, 'response.md'), 'utf8');
const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8'));

for (const graderFile of graderFiles) {
tasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
for (const graderFile of graderFiles) {
tasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
}
}
}

const { totalGraders, totalPassed } = await runCodeGraders(tasks, maxWorkers);
console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);

// Run built-in deterministic assertions (contains, regex, equals, etc.)
const builtin = await runBuiltinGraders(exportDir, testIds, safeSuiteName);

const totalAll = totalGraders + builtin.total;
const passedAll = totalPassed + builtin.passed;
const parts: string[] = [];
if (totalGraders > 0) parts.push(`${totalGraders} code-grader(s)`);
if (builtin.total > 0) parts.push(`${builtin.total} built-in assertion(s)`);
if (parts.length === 0) parts.push('0 grader(s)');
console.log(`Graded ${parts.join(' + ')}: ${passedAll}/${totalAll} passed`);
},
});
34 changes: 33 additions & 1 deletion apps/cli/src/commands/pipeline/input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,29 @@
* ├── criteria.md
* ├── expected_output.json (if present)
* ├── llm_graders/<name>.json
* └── code_graders/<name>.json
* ├── code_graders/<name>.json
* └── builtin_graders/<name>.json
*/
import { readFile } from 'node:fs/promises';
import { mkdir, writeFile } from 'node:fs/promises';
import { dirname, join, relative, resolve } from 'node:path';

import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';

/**
 * Assertion types that can be graded deterministically in-process,
 * without external scripts or LLM calls. Keep in sync with the switch
 * in evaluateBuiltinAssertion (pipeline/grade.ts).
 */
const BUILTIN_ASSERTION_TYPES = new Set<string>([
  // Substring checks (case-sensitive, then case-insensitive variants)
  'contains',
  'contains-any',
  'contains-all',
  'icontains',
  'icontains-any',
  'icontains-all',
  // Position checks
  'starts-with',
  'ends-with',
  // Pattern and structural checks
  'regex',
  'is-json',
  'equals',
]);
import { deriveCategory, loadTestSuite } from '@agentv/core';
import { command, option, optional, positional, string } from 'cmd-ts';

Expand Down Expand Up @@ -190,9 +206,11 @@ async function writeGraderConfigs(
): Promise<void> {
const codeGradersDir = join(testDir, 'code_graders');
const llmGradersDir = join(testDir, 'llm_graders');
const builtinGradersDir = join(testDir, 'builtin_graders');

let hasCodeGraders = false;
let hasLlmGraders = false;
let hasBuiltinGraders = false;

for (const assertion of assertions) {
if (assertion.type === 'code-grader') {
Expand Down Expand Up @@ -233,6 +251,20 @@ async function writeGraderConfigs(
threshold: 0.5,
config: {},
});
} else if (BUILTIN_ASSERTION_TYPES.has(assertion.type)) {
if (!hasBuiltinGraders) {
await mkdir(builtinGradersDir, { recursive: true });
hasBuiltinGraders = true;
}
const config = assertion as EvaluatorConfig & { value?: unknown; flags?: string };
await writeJson(join(builtinGradersDir, `${config.name}.json`), {
name: config.name,
type: config.type,
value: config.value,
flags: (config as { flags?: string }).flags,
weight: config.weight ?? 1.0,
negate: config.negate ?? false,
});
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Minimal suite exercising built-in deterministic assertions
# (contains, regex, is-json) end-to-end through the pipeline.
name: builtin-test
tests:
  - id: test-01
    input: hello world
    criteria: Response echoes the input
    assertions:
      - name: has_hello
        type: contains
        value: hello
      - name: matches_pattern
        type: regex
        value: "h[aeiou]llo"
      - name: is_valid_json
        type: is-json
Loading
Loading