Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 8 additions & 26 deletions packages/core/src/evaluation/loaders/evaluator-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ async function parseEvaluatorList(

const customTypeName = isCustomType ? typeValue : undefined;

// Auto-generate name for assertion types if not provided
// Auto-generate name from type if not provided
const name =
rawName ??
(isCustomType ? typeValue : generateAssertionName(typeValue as EvaluatorKind, rawEvaluator));
Expand Down Expand Up @@ -1539,32 +1539,12 @@ export async function parsePreprocessors(
return preprocessors;
}

/** Assertion evaluator types that support auto-generated names. */
const ASSERTION_TYPES = new Set([
'skill-trigger',
'contains',
'contains-any',
'contains-all',
'icontains',
'icontains-any',
'icontains-all',
'starts-with',
'ends-with',
'regex',
'is-json',
'equals',
'rubrics',
]);

/**
* Generate a descriptive name for assertion-type evaluators when no explicit name is given.
* Returns undefined for non-assertion types (those still require an explicit name).
* Generate a descriptive name for evaluators when no explicit name is given.
* Returns the type name as a fallback so evaluators are never skipped just
* because the author omitted `name`.
*/
function generateAssertionName(typeValue: string, rawEvaluator: JsonObject): string | undefined {
if (!ASSERTION_TYPES.has(typeValue)) {
return undefined;
}

function generateAssertionName(typeValue: string, rawEvaluator: JsonObject): string {
const value = asString(rawEvaluator.value);
const arrayValue = Array.isArray(rawEvaluator.value) ? rawEvaluator.value : undefined;

Expand Down Expand Up @@ -1598,7 +1578,9 @@ function generateAssertionName(typeValue: string, rawEvaluator: JsonObject): str
case 'rubrics':
return 'rubrics';
default:
return undefined;
// For all other evaluator types (llm-grader, code-grader, latency, etc.),
// use the type name itself as the auto-derived name.
return typeValue;
}
}

Expand Down
23 changes: 19 additions & 4 deletions packages/core/test/evaluation/loaders/evaluator-parser.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { afterAll, beforeAll, describe, expect, it } from 'bun:test';
import { afterAll, beforeAll, describe, expect, it, spyOn } from 'bun:test';
import { mkdir, rm, writeFile } from 'node:fs/promises';
import os from 'node:os';
import path from 'node:path';
Expand Down Expand Up @@ -665,7 +665,7 @@ describe('parseEvaluators - kebab-case type normalization', () => {
});

describe('parseEvaluators - score_ranges rubrics', () => {
it('parses valid score_ranges with required_min_score', async () => {
it('parses valid score_ranges with min_score', async () => {
const rawEvalCase = {
evaluators: [
{
Expand All @@ -675,7 +675,7 @@ describe('parseEvaluators - score_ranges rubrics', () => {
{
id: 'accuracy',
weight: 2.0,
required_min_score: 7,
min_score: 0.7,
score_ranges: [
{ score_range: [0, 3], outcome: 'Incorrect' },
{ score_range: [4, 6], outcome: 'Partially correct' },
Expand All @@ -698,6 +698,7 @@ describe('parseEvaluators - score_ranges rubrics', () => {
const rubric = config.rubrics?.[0];
expect(rubric?.id).toBe('accuracy');
expect(rubric?.weight).toBe(2.0);
expect(rubric?.min_score).toBe(0.7);
expect(rubric?.required_min_score).toBe(7);
expect(rubric?.score_ranges).toHaveLength(4);
}
Expand Down Expand Up @@ -752,6 +753,7 @@ describe('parseEvaluators - score_ranges rubrics', () => {
});

it('skips rubric items that use legacy description field without outcome', async () => {
const warnSpy = spyOn(console, 'warn').mockImplementation(() => {});
const rawEvalCase = {
evaluators: [
{
Expand All @@ -777,6 +779,8 @@ describe('parseEvaluators - score_ranges rubrics', () => {
// Rubric should be skipped since it has no 'outcome' field
expect(config.rubrics ?? []).toHaveLength(0);
}
expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining('missing outcome'));
warnSpy.mockRestore();
});
});

Expand All @@ -791,7 +795,7 @@ describe('parseEvaluators - score_ranges shorthand map', () => {
{
id: 'accuracy',
weight: 2.0,
required_min_score: 7,
min_score: 0.7,
score_ranges: {
0: 'Completely wrong',
3: 'Partially correct',
Expand All @@ -813,6 +817,7 @@ describe('parseEvaluators - score_ranges shorthand map', () => {
expect(config.rubrics).toHaveLength(1);
const rubric = config.rubrics?.[0];
expect(rubric?.id).toBe('accuracy');
expect(rubric?.min_score).toBe(0.7);
expect(rubric?.required_min_score).toBe(7);
expect(rubric?.score_ranges).toHaveLength(4);
expect(rubric?.score_ranges?.[0]).toEqual({
Expand Down Expand Up @@ -1430,6 +1435,7 @@ describe('parseEvaluators - type: rubrics with criteria', () => {
});

it('skips rubrics with empty criteria array', async () => {
const warnSpy = spyOn(console, 'warn').mockImplementation(() => {});
const evaluators = await parseEvaluators(
{
assertions: [
Expand All @@ -1444,9 +1450,14 @@ describe('parseEvaluators - type: rubrics with criteria', () => {
'test-1',
);
expect(evaluators).toBeUndefined();
expect(warnSpy).toHaveBeenCalledWith(
expect.stringContaining('criteria must be a non-empty array'),
);
warnSpy.mockRestore();
});

it('skips rubrics with missing criteria', async () => {
const warnSpy = spyOn(console, 'warn').mockImplementation(() => {});
const evaluators = await parseEvaluators(
{
assertions: [
Expand All @@ -1460,6 +1471,10 @@ describe('parseEvaluators - type: rubrics with criteria', () => {
'test-1',
);
expect(evaluators).toBeUndefined();
expect(warnSpy).toHaveBeenCalledWith(
expect.stringContaining('criteria must be a non-empty array'),
);
warnSpy.mockRestore();
});

it('supports string shorthand in criteria', async () => {
Expand Down
Loading