Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 76 additions & 137 deletions apps/cli/src/commands/pipeline/grade.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
/**
* `agentv pipeline grade` — Run code-grader and built-in deterministic assertions
* against response.md files in an export directory produced by `pipeline input`.
* `agentv pipeline grade` — Run grader assertions against response.md files
* in an export directory produced by `pipeline input`.
*
* For each test:
* - Reads code_graders/<name>.json configs, executes each grader script,
* and writes results to code_grader_results/<name>.json.
* - Reads builtin_graders/<name>.json configs, evaluates deterministic assertions
* (contains, regex, equals, etc.) in-process, and writes results to
* code_grader_results/<name>.json (same directory, so pipeline bench merges them).
* All grader configs live in code_graders/<name>.json. Each config has a `type`
* field that determines how it's evaluated:
* - `code-grader` (or configs with a `command` field): executed as external scripts
* - Built-in types (contains, regex, equals, etc.): evaluated in-process
*
* Code graders run concurrently (default: 10 workers) for performance.
* Built-in graders are synchronous and evaluate instantly after code graders finish.
* Results are written to code_grader_results/<name>.json for pipeline bench.
*
* Export directory additions:
* <out-dir>/<suite>/<test-id>/code_grader_results/<name>.json
Expand Down Expand Up @@ -63,7 +60,9 @@ export interface GraderTask {
}

/**
* Run code-grader tasks with concurrency and progress feedback.
* Run grader tasks with concurrency and progress feedback.
* Dispatches each task based on its config: code-graders are executed as
* external scripts, built-in types (contains, regex, etc.) are evaluated in-process.
* Shared by `pipeline grade` and `pipeline run`.
*/
export async function runCodeGraders(
Expand All @@ -84,12 +83,29 @@ export async function runCodeGraders(
writeProgress();

/**
 * Load one grader config from `code_graders/<graderFile>` and evaluate it.
 *
 * Configs with a truthy `command` field are run as external scripts; every
 * other config is evaluated in-process as a built-in deterministic assertion.
 * Both executors store the resulting score on `graderConfig._lastScore`, which
 * is read back here to update the enclosing pass/total counters and to refresh
 * the progress output.
 */
const executeGrader = async (task: GraderTask) => {
  // Only the fields needed for loading and dispatch are pulled out here; the
  // external-script path receives the whole task.
  const { testDir, resultsDir, graderFile, responseText } = task;
  const graderConfig = JSON.parse(
    await readFile(join(testDir, 'code_graders', graderFile), 'utf8'),
  );

  // Dispatch: configs with a `command` field are external scripts;
  // all others are built-in deterministic assertions evaluated in-process.
  if (graderConfig.command) {
    await executeCodeGrader(graderConfig, task);
  } else {
    await executeBuiltinGrader(graderConfig, responseText, resultsDir);
  }

  totalGraders++;
  // A missing `_lastScore` compares as false, so it is counted but not passed.
  if (graderConfig._lastScore >= 0.5) totalPassed++;
  completed++;
  writeProgress();
};

/** Run an external code-grader script. */
const executeCodeGrader = async (graderConfig: Record<string, unknown>, task: GraderTask) => {
const { testId, resultsDir, responseText, inputData } = task;
const graderName = graderConfig.name as string;
const inputText = extractInputText(inputData.input);
const payload = JSON.stringify({
output: [{ role: 'assistant', content: responseText }],
Expand All @@ -114,10 +130,10 @@ export async function runCodeGraders(

try {
const stdout = await executeScript(
graderConfig.command,
graderConfig.command as string | string[],
payload,
undefined,
graderConfig.cwd,
graderConfig.cwd as string | undefined,
);
const parsed = JSON.parse(stdout);
const score = typeof parsed.score === 'number' ? parsed.score : 0;
Expand All @@ -131,48 +147,55 @@ export async function runCodeGraders(
...(parsed.misses ?? []).map((m: string) => ({ text: m, passed: false })),
];

const result = {
name: graderName,
type: 'code-grader',
score,
weight: graderConfig.weight ?? 1.0,
assertions,
details: parsed.details ?? {},
};
graderConfig._lastScore = score;

await writeFile(
join(resultsDir, `${graderName}.json`),
`${JSON.stringify(result, null, 2)}\n`,
`${JSON.stringify({ name: graderName, type: 'code-grader', score, weight: graderConfig.weight ?? 1.0, assertions, details: parsed.details ?? {} }, null, 2)}\n`,
'utf8',
);

totalGraders++;
if (score >= 0.5) totalPassed++;
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
process.stderr.write(`\n ${testId}/${graderName}: ERROR — ${message}\n`);

const errorResult = {
name: graderName,
type: 'code-grader',
score: 0,
weight: graderConfig.weight ?? 1.0,
assertions: [{ text: `Error: ${message}`, passed: false }],
details: { error: message },
};
graderConfig._lastScore = 0;

await writeFile(
join(resultsDir, `${graderName}.json`),
`${JSON.stringify(errorResult, null, 2)}\n`,
`${JSON.stringify({ name: graderName, type: 'code-grader', score: 0, weight: graderConfig.weight ?? 1.0, assertions: [{ text: `Error: ${message}`, passed: false }], details: { error: message } }, null, 2)}\n`,
'utf8',
);
totalGraders++;
} finally {
completed++;
writeProgress();
}
};

/**
 * Score a built-in deterministic assertion against the response text in-process.
 *
 * When the config sets `negate: true`, the score becomes `1 - score` and each
 * assertion's `passed` flag is flipped. The final score is stored on
 * `graderConfig._lastScore`, and the result is written to
 * `<resultsDir>/<name>.json`.
 */
const executeBuiltinGrader = async (
  graderConfig: Record<string, unknown>,
  responseText: string,
  resultsDir: string,
) => {
  const evaluated = evaluateBuiltinAssertion(
    graderConfig as { type: string; value?: unknown; flags?: string },
    responseText,
  );

  const keepAsIs = graderConfig.negate !== true;
  const score = keepAsIs ? evaluated.score : 1 - evaluated.score;
  const assertions = keepAsIs
    ? evaluated.assertions
    : evaluated.assertions.map((entry: { text: string; passed: boolean }) => ({
        text: entry.text,
        passed: !entry.passed,
      }));

  graderConfig._lastScore = score;

  const outputPath = join(resultsDir, `${graderConfig.name}.json`);
  const report = {
    name: graderConfig.name,
    type: graderConfig.type,
    score,
    weight: (graderConfig.weight as number) ?? 1.0,
    assertions,
    details: {},
  };
  await writeFile(outputPath, `${JSON.stringify(report, null, 2)}\n`, 'utf8');
};

// Run with concurrency limit
const pending = new Set<Promise<void>>();
for (const task of tasks) {
Expand Down Expand Up @@ -239,83 +262,9 @@ function evaluateBuiltinAssertion(
}
}

/**
 * Evaluate the built-in deterministic assertions of every listed test.
 *
 * For each test id, configs are read from `builtin_graders/<name>.json`, scored
 * in-process against `response.md`, and the outcome is written to
 * `code_grader_results/<name>.json`. A test is skipped when its
 * `builtin_graders` directory cannot be read, holds no `.json` files, or when
 * `response.md` cannot be read.
 *
 * @param exportDir root of the export directory
 * @param testIds ids of the tests to process
 * @param safeSuiteName suite sub-directory name, or an empty string for none
 * @returns number of assertions evaluated and number that scored at least 0.5
 */
async function runBuiltinGraders(
  exportDir: string,
  testIds: string[],
  safeSuiteName: string,
): Promise<{ total: number; passed: number }> {
  const tally = { total: 0, passed: 0 };

  for (const testId of testIds) {
    const testDir = safeSuiteName
      ? join(exportDir, safeSuiteName, testId)
      : join(exportDir, testId);
    const configDir = join(testDir, 'builtin_graders');

    let configFiles: string[];
    try {
      const entries = await readdir(configDir);
      configFiles = entries.filter((entry) => entry.endsWith('.json'));
    } catch {
      // No builtin graders for this test
      continue;
    }
    if (configFiles.length === 0) {
      continue;
    }

    // The results directory is created before the response is read.
    const resultsDir = join(testDir, 'code_grader_results');
    await mkdir(resultsDir, { recursive: true });

    let responseText: string;
    try {
      responseText = await readFile(join(testDir, 'response.md'), 'utf8');
    } catch {
      // No response yet — skip
      continue;
    }

    for (const configFile of configFiles) {
      const config = JSON.parse(await readFile(join(configDir, configFile), 'utf8'));
      const outcome = evaluateBuiltinAssertion(config, responseText);

      // Apply negate if configured
      const inverted = config.negate === true;
      const score = inverted ? 1 - outcome.score : outcome.score;
      const assertions = inverted
        ? outcome.assertions.map((entry: { text: string; passed: boolean }) => ({
            text: entry.text,
            passed: !entry.passed,
          }))
        : outcome.assertions;

      const report = {
        name: config.name,
        type: config.type,
        score,
        weight: config.weight ?? 1.0,
        assertions,
        details: {},
      };
      const serialized = `${JSON.stringify(report, null, 2)}\n`;
      await writeFile(join(resultsDir, `${config.name}.json`), serialized, 'utf8');

      tally.total += 1;
      if (score >= 0.5) {
        tally.passed += 1;
      }
    }
  }

  return tally;
}

export const evalGradeCommand = command({
name: 'grade',
description: 'Run code-grader and built-in assertions on responses in an export directory',
description: 'Run grader assertions on responses in an export directory',
args: {
exportDir: positional({
type: string,
Expand All @@ -337,7 +286,7 @@ export const evalGradeCommand = command({
const suiteName: string = manifest.suite ?? '';
const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';

// Collect all code-grader tasks upfront so we know the total count
// Collect all grader tasks upfront so we know the total count
const tasks: GraderTask[] = [];

for (const testId of testIds) {
Expand All @@ -348,33 +297,23 @@ export const evalGradeCommand = command({

let graderFiles: string[];
try {
graderFiles = (await readdir(codeGradersDir)).filter((f) => f.endsWith('.json'));
graderFiles = (await readdir(codeGradersDir)).filter((f: string) => f.endsWith('.json'));
} catch {
graderFiles = [];
continue; // No graders for this test
}

if (graderFiles.length > 0) {
await mkdir(resultsDir, { recursive: true });
const responseText = await readFile(join(testDir, 'response.md'), 'utf8');
const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8'));
if (graderFiles.length === 0) continue;
await mkdir(resultsDir, { recursive: true });

const responseText = await readFile(join(testDir, 'response.md'), 'utf8');
const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8'));

for (const graderFile of graderFiles) {
tasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
}
for (const graderFile of graderFiles) {
tasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
}
}

const { totalGraders, totalPassed } = await runCodeGraders(tasks, maxWorkers);

// Run built-in deterministic assertions (contains, regex, equals, etc.)
const builtin = await runBuiltinGraders(exportDir, testIds, safeSuiteName);

const totalAll = totalGraders + builtin.total;
const passedAll = totalPassed + builtin.passed;
const parts: string[] = [];
if (totalGraders > 0) parts.push(`${totalGraders} code-grader(s)`);
if (builtin.total > 0) parts.push(`${builtin.total} built-in assertion(s)`);
if (parts.length === 0) parts.push('0 grader(s)');
console.log(`Graded ${parts.join(' + ')}: ${passedAll}/${totalAll} passed`);
console.log(`Graded ${totalGraders} grader(s): ${totalPassed} passed`);
},
});
14 changes: 6 additions & 8 deletions apps/cli/src/commands/pipeline/input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@
* ├── criteria.md
* ├── expected_output.json (if present)
* ├── llm_graders/<name>.json
* ├── code_graders/<name>.json
* └── builtin_graders/<name>.json
* └── code_graders/<name>.json
*/
import { readFile } from 'node:fs/promises';
import { mkdir, writeFile } from 'node:fs/promises';
Expand Down Expand Up @@ -206,11 +205,9 @@ async function writeGraderConfigs(
): Promise<void> {
const codeGradersDir = join(testDir, 'code_graders');
const llmGradersDir = join(testDir, 'llm_graders');
const builtinGradersDir = join(testDir, 'builtin_graders');

let hasCodeGraders = false;
let hasLlmGraders = false;
let hasBuiltinGraders = false;

for (const assertion of assertions) {
if (assertion.type === 'code-grader') {
Expand All @@ -221,6 +218,7 @@ async function writeGraderConfigs(
const config = assertion as CodeEvaluatorConfig;
await writeJson(join(codeGradersDir, `${config.name}.json`), {
name: config.name,
type: 'code-grader',
command: config.command,
cwd: config.resolvedCwd ?? config.cwd ?? evalDir,
weight: config.weight ?? 1.0,
Expand Down Expand Up @@ -252,12 +250,12 @@ async function writeGraderConfigs(
config: {},
});
} else if (BUILTIN_ASSERTION_TYPES.has(assertion.type)) {
if (!hasBuiltinGraders) {
await mkdir(builtinGradersDir, { recursive: true });
hasBuiltinGraders = true;
if (!hasCodeGraders) {
await mkdir(codeGradersDir, { recursive: true });
hasCodeGraders = true;
}
const config = assertion as EvaluatorConfig & { value?: unknown; flags?: string };
await writeJson(join(builtinGradersDir, `${config.name}.json`), {
await writeJson(join(codeGradersDir, `${config.name}.json`), {
name: config.name,
type: config.type,
value: config.value,
Expand Down
4 changes: 2 additions & 2 deletions apps/cli/test/commands/eval/pipeline/grade.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ describe('pipeline grade — builtin assertions', () => {

beforeEach(async () => {
const testDir = join(BUILTIN_OUT, 'test-01');
const builtinGradersDir = join(testDir, 'builtin_graders');
const builtinGradersDir = join(testDir, 'code_graders');
await mkdir(builtinGradersDir, { recursive: true });

await writeFile(join(testDir, 'response.md'), 'hello world');
Expand Down Expand Up @@ -177,7 +177,7 @@ describe('pipeline grade — builtin assertions', () => {
it('applies negate to invert score', async () => {
// Overwrite has_goodbye with negate: true — "not contains goodbye" should pass
await writeFile(
join(BUILTIN_OUT, 'test-01', 'builtin_graders', 'has_goodbye.json'),
join(BUILTIN_OUT, 'test-01', 'code_graders', 'has_goodbye.json'),
JSON.stringify({
name: 'has_goodbye',
type: 'contains',
Expand Down
6 changes: 3 additions & 3 deletions apps/cli/test/commands/eval/pipeline/input.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,14 @@ describe('pipeline input', () => {
expect(manifest.experiment).toBeUndefined();
});

it('writes builtin_graders/<name>.json for deterministic assertions', async () => {
it('writes code_graders/<name>.json for deterministic assertions', async () => {
const { execa } = await import('execa');
const builtinEvalPath = join(FIXTURE_DIR, 'builtin-test.eval.yaml');
await execa('bun', [CLI_ENTRY, 'pipeline', 'input', builtinEvalPath, '--out', OUT_DIR]);

const containsGrader = JSON.parse(
await readFile(
join(OUT_DIR, 'builtin-test', 'test-01', 'builtin_graders', 'has_hello.json'),
join(OUT_DIR, 'builtin-test', 'test-01', 'code_graders', 'has_hello.json'),
'utf8',
),
);
Expand All @@ -120,7 +120,7 @@ describe('pipeline input', () => {

const regexGrader = JSON.parse(
await readFile(
join(OUT_DIR, 'builtin-test', 'test-01', 'builtin_graders', 'matches_pattern.json'),
join(OUT_DIR, 'builtin-test', 'test-01', 'code_graders', 'matches_pattern.json'),
'utf8',
),
);
Expand Down
Loading