Merged
apps/cli/src/commands/eval/artifact-writer.ts (4 additions, 4 deletions)

@@ -26,7 +26,7 @@ export interface GradingArtifact {
     readonly total_tool_calls: number;
     readonly errors_encountered: number;
   };
-  readonly evaluators?: readonly {
+  readonly graders?: readonly {
     readonly name: string;
     readonly type: string;
     readonly score: number;
@@ -219,12 +219,12 @@ function buildAssertions(result: EvaluationResult): GradingArtifact['assertions']
 }

 // ---------------------------------------------------------------------------
-// Build evaluators list
+// Build graders list
 // ---------------------------------------------------------------------------

 function buildEvaluators(
   scores: readonly EvaluatorResult[] | undefined,
-): GradingArtifact['evaluators'] {
+): GradingArtifact['graders'] {
   if (!scores || scores.length === 0) {
     return undefined;
   }
@@ -267,7 +267,7 @@ export function buildGradingArtifact(result: EvaluationResult): GradingArtifact
       total_tool_calls: totalToolCalls,
       errors_encountered: errorsEncountered,
     },
-    evaluators: buildEvaluators(result.scores),
+    graders: buildEvaluators(result.scores),
     workspace_changes: parseWorkspaceChanges(result.fileChanges),
     conversation: result.conversationId
       ? {
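For context, a minimal sketch of what the renamed field looks like in a written artifact, assuming the `GradingArtifact` shape above; values are illustrative, and the grader entries mirror the updated tests further down:

```ts
// A sketch only, not code added by this PR. Field names come from the
// GradingArtifact interface above; values are made up for illustration.
const exampleGrading = {
  summary: { passed: 2, failed: 0, total: 2, pass_rate: 1 },
  execution_metrics: { tool_calls: {}, total_tool_calls: 0, errors_encountered: 0 },
  // Renamed from `evaluators` to `graders` in this PR.
  graders: [
    { name: 'format-check', type: 'code-grader', score: 1 },
    { name: 'accuracy', type: 'llm-grader', score: 0.7 },
  ],
};
```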
apps/cli/src/commands/pipeline/bench.ts (1 addition, 1 deletion)

@@ -133,7 +133,7 @@ export const evalBenchCommand = command({
     assertions: allAssertions,
     summary: { passed, failed, total: allAssertions.length, pass_rate: passRate },
     execution_metrics: { tool_calls: {}, total_tool_calls: 0, errors_encountered: 0 },
-    evaluators: evaluators.map((e) => ({
+    graders: evaluators.map((e) => ({
       name: e.name,
       type: e.type,
       score: e.score,
apps/cli/src/commands/results/export.ts (1 addition, 1 deletion)

@@ -7,7 +7,7 @@
  *   benchmark.json — aggregate scores, pass/fail counts, timing
  *   index.jsonl — per-test manifest with artifact pointers
  *   <test-id>/
- *     grading.json — per-test grading artifact (assertions, evaluators)
+ *     grading.json — per-test grading artifact (assertions, graders)
  *     timing.json — per-test timing artifact
  *     outputs/
  *       response.md — human-readable agent response for this test
apps/cli/src/commands/results/manifest.ts (7 additions, 1 deletion)

@@ -135,7 +135,13 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E
       evidence: assertion.evidence,
     })),
     scores:
-      grading?.evaluators?.map((evaluator) => ({
+      // `evaluators` was renamed to `graders` in v4.13 — read both for backwards compat with old artifacts.
+      // TODO: remove `evaluators` fallback once old run directories are no longer in use.
+      (
+        grading?.graders ??
+        (grading as (GradingArtifact & { evaluators?: GradingArtifact['graders'] }) | undefined)
+          ?.evaluators
+      )?.map((evaluator) => ({
        name: evaluator.name,
        type: evaluator.type,
        score: evaluator.score,
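The fallback in this hunk reads the new key and falls back to the old one. A minimal sketch of the same logic as a standalone helper, assuming the import path follows the directory layout above; `readGraders` is a hypothetical name, not code added by this PR:

```ts
import type { GradingArtifact } from '../eval/artifact-writer';

// Pre-v4.13 artifacts carry `evaluators`; newer ones carry `graders`.
type LegacyGradingArtifact = GradingArtifact & {
  evaluators?: GradingArtifact['graders'];
};

// Hypothetical helper mirroring the inline fallback in hydrateManifestRecord.
function readGraders(
  grading: GradingArtifact | undefined,
): GradingArtifact['graders'] {
  return (
    grading?.graders ??
    (grading as LegacyGradingArtifact | undefined)?.evaluators
  );
}
```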
apps/cli/test/commands/eval/artifact-writer.test.ts (5 additions, 5 deletions)

@@ -127,10 +127,10 @@ describe('buildGradingArtifact', () => {

     const grading = buildGradingArtifact(result);

-    expect(grading.evaluators).toHaveLength(2);
-    expect(grading.evaluators?.[0].name).toBe('format-check');
-    expect(grading.evaluators?.[0].type).toBe('code-grader');
-    expect(grading.evaluators?.[1].score).toBe(0.7);
+    expect(grading.graders).toHaveLength(2);
+    expect(grading.graders?.[0].name).toBe('format-check');
+    expect(grading.graders?.[0].type).toBe('code-grader');
+    expect(grading.graders?.[1].score).toBe(0.7);
   });

   it('records error as errors_encountered', () => {
@@ -150,7 +150,7 @@ describe('buildGradingArtifact', () => {
       total: 0,
       pass_rate: 0,
     });
-    expect(grading.evaluators).toBeUndefined();
+    expect(grading.graders).toBeUndefined();
   });

   it('includes workspace_changes when fileChanges present', () => {
apps/cli/test/commands/eval/pipeline/bench.test.ts (1 addition, 1 deletion)

@@ -75,7 +75,7 @@ describe('pipeline bench', () => {
     const grading = JSON.parse(await readFile(join(OUT_DIR, 'test-01', 'grading.json'), 'utf8'));
     expect(grading.summary.pass_rate).toBeGreaterThan(0);
     expect(grading.assertions.length).toBeGreaterThan(0);
-    expect(grading.evaluators).toHaveLength(2);
+    expect(grading.graders).toHaveLength(2);
   });

   it('writes index.jsonl with one entry per test', async () => {
apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts (1 addition, 1 deletion)

@@ -49,7 +49,7 @@ describe('eval pipeline e2e', () => {
     const grading = JSON.parse(
       await readFile(join(OUT_DIR, 'input-test', 'test-01', 'grading.json'), 'utf8'),
     );
-    expect(grading.evaluators).toHaveLength(2);
+    expect(grading.graders).toHaveLength(2);
     expect(grading.summary.pass_rate).toBeGreaterThan(0);

     const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
apps/cli/test/commands/results/export-e2e-providers.test.ts (2 additions, 2 deletions)

@@ -472,8 +472,8 @@ describe('export e2e — multi-provider metrics verification', () => {
     expect(grading.execution_metrics.tool_calls.Write).toBe(1);

     // Evaluators
-    expect(grading.evaluators).toHaveLength(1);
-    expect(grading.evaluators?.[0].name).toBe('accuracy');
+    expect(grading.graders).toHaveLength(1);
+    expect(grading.graders?.[0].name).toBe('accuracy');
   });

   it('should produce correct grading for Copilot CLI result with mixed assertions', async () => {
apps/cli/test/commands/results/export.test.ts (4 additions, 4 deletions)

@@ -261,10 +261,10 @@ describe('results export', () => {
     expect(grading.execution_metrics).toBeDefined();

     // Has evaluators
-    expect(grading.evaluators).toBeDefined();
-    expect(grading.evaluators).toHaveLength(1);
-    expect(grading.evaluators?.[0].name).toBe('greeting_quality');
-    expect(grading.evaluators?.[0].type).toBe('llm-grader');
+    expect(grading.graders).toBeDefined();
+    expect(grading.graders).toHaveLength(1);
+    expect(grading.graders?.[0].name).toBe('greeting_quality');
+    expect(grading.graders?.[0].type).toBe('llm-grader');

     const perTestTimingPath = path.join(artifactDir(outputDir, RESULT_FULL), 'timing.json');
     expect(existsSync(perTestTimingPath)).toBe(true);
@@ -323,7 +323,7 @@ LLM grader results are read from disk at `<test-id>/llm_grader_results/<name>.js
 ```

 **Output:**
-- `<test-id>/grading.json` — merged grading with `evaluators`, `assertions`, `summary.pass_rate`
+- `<test-id>/grading.json` — merged grading with `graders`, `assertions`, `summary.pass_rate`
 - `index.jsonl` — one JSON line per test: `{test_id, score, pass, evaluators: [...]}`
 - `benchmark.json` — aggregate stats: `{metadata: {targets}, run_summary: {<target>: {mean, stddev, n}}}`

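Note that only `grading.json` changes key here; per the unchanged doc line above, the `index.jsonl` manifest keeps its `evaluators` field. A sketch of one manifest line with illustrative values; the exact shape of the `evaluators` entries is not shown in this diff:

```ts
// One index.jsonl record as documented above. Values are illustrative and the
// entry shape under `evaluators` is assumed, not confirmed by this diff.
const indexRecord = {
  test_id: 'test-01',
  score: 0.7,
  pass: true,
  evaluators: [{ name: 'accuracy', type: 'llm-grader', score: 0.7 }],
};
```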