25 changes: 25 additions & 0 deletions packages/core/src/evaluation/evaluators/prompt-resolution.ts
@@ -2,13 +2,21 @@
* Prompt resolution utilities for LLM judge evaluators.
*
* Extracted from orchestrator.ts to enable reuse by the evaluator registry.
*
* Key behavior: When a user writes `prompt: "some text"` in an assertion,
* `resolveCustomPrompt()` returns that text. The caller must then decide
* whether the text is a **full template** (contains `{{output}}` etc.) or
* **bare criteria** (no template variables). Use `containsTemplateVariables()`
* to distinguish: full templates become `evaluatorTemplateOverride`, while
* bare criteria are injected into the default template's `{{criteria}}` slot.
*/

import path from 'node:path';

import { toSnakeCaseDeep } from '../case-conversion.js';
import { readTextFile } from '../file-utils.js';
import type { Message } from '../providers/types.js';
import { VALID_TEMPLATE_VARIABLES } from '../template-variables.js';
import type { TraceSummary } from '../trace.js';
import type { EvalTest, PromptScriptConfig } from '../types.js';
import { executeScript } from './code-evaluator.js';
@@ -66,6 +74,23 @@ export async function resolveCustomPrompt(
return undefined;
}

/**
* Checks whether a prompt string contains any known `{{ variable }}` template
* placeholders (e.g. `{{output}}`, `{{input}}`). If it does, the string is a
* full evaluator template and should replace the default template. If not,
* it's bare criteria text and should be injected into the `{{criteria}}` slot
* of the default template.
*/
export function containsTemplateVariables(text: string): boolean {
const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
for (const match of text.matchAll(variablePattern)) {
if (VALID_TEMPLATE_VARIABLES.has(match[1])) {
return true;
}
}
return false;
}

async function executePromptTemplate(
script: readonly string[],
context: ResolveCustomPromptContext,
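For context, a minimal usage sketch of `containsTemplateVariables()` (not part of the diff; the inputs and expected results are lifted from the new unit tests further down):

```ts
import { containsTemplateVariables } from './prompt-resolution.js';

// Known variables ({{output}}, {{criteria}}, ...) mark a full template override.
containsTemplateVariables('Grade {{output}} against {{criteria}}'); // true

// No known variables: bare criteria, injected into the default template's
// {{criteria}} slot instead of replacing the template.
containsTemplateVariables('Check if the response shows step-by-step work'); // false

// Unknown variable names are not treated as template placeholders.
containsTemplateVariables('Evaluate {{answer}} against {{rubric}}'); // false
```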
41 changes: 40 additions & 1 deletion packages/core/src/evaluation/formatting/prompt-builder.ts
@@ -85,9 +85,14 @@ export async function buildPromptInputs(
})
: undefined;

// Extract system message from leading system-role messages in the input.
// This is used by the orchestrator to pass the system prompt as a direct field
// on ProviderRequest and by evaluators that need the system context separately.
const systemMessage = extractSystemMessage(testCase.input, segmentsByMessage, mode);

// Both question (flat string) and chatPrompt (structured messages) are returned:
// chatPrompt is used for the API call, question is retained for logging/debugging.
return { question, chatPrompt };
return { question, chatPrompt, systemMessage };
}

/**
@@ -118,6 +123,40 @@ function needsRoleMarkers(
return messagesWithContent > 1;
}

/**
* Extract the system message text from leading system-role messages in the input.
* Returns undefined if no system messages are present.
*/
function extractSystemMessage(
messages: readonly TestMessage[],
segmentsByMessage: readonly JsonObject[][],
mode: FormattingMode,
): string | undefined {
const systemParts: string[] = [];

for (let i = 0; i < messages.length; i++) {
if (messages[i].role !== 'system') {
break;
}

const segments = segmentsByMessage[i];
const contentParts: string[] = [];

for (const segment of segments) {
const formatted = formatSegment(segment, mode);
if (formatted) {
contentParts.push(formatted);
}
}

if (contentParts.length > 0) {
systemParts.push(contentParts.join('\n'));
}
}

return systemParts.length > 0 ? systemParts.join('\n\n') : undefined;
}

function buildChatPromptFromSegments(options: {
readonly messages: readonly TestMessage[];
readonly segmentsByMessage: readonly JsonObject[][];
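A simplified model of the extraction rule above, with segment formatting collapsed to plain strings (the real code formats each segment via `formatSegment`, joining segments with `\n` and messages with `\n\n`); a sketch for illustration only:

```ts
type SimpleMessage = {
  readonly role: 'system' | 'user' | 'assistant';
  readonly content: string;
};

function extractLeadingSystemText(messages: readonly SimpleMessage[]): string | undefined {
  const parts: string[] = [];
  for (const message of messages) {
    if (message.role !== 'system') break; // only *leading* system messages count
    if (message.content) parts.push(message.content);
  }
  return parts.length > 0 ? parts.join('\n\n') : undefined;
}

extractLeadingSystemText([
  { role: 'system', content: 'You are a strict grader.' },
  { role: 'system', content: 'Respond in JSON.' },
  { role: 'user', content: 'Grade this answer.' },
]); // => 'You are a strict grader.\n\nRespond in JSON.'

extractLeadingSystemText([
  { role: 'user', content: 'Hi' },
  { role: 'system', content: 'Not leading, so ignored.' },
]); // => undefined
```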
8 changes: 2 additions & 6 deletions packages/core/src/evaluation/orchestrator.ts
@@ -1215,11 +1215,9 @@ async function runBatchEvaluation(options: {
const promptInputs = promptInputsList[index];
return {
question: promptInputs.question,
systemPrompt: promptInputs.systemMessage,
inputFiles: evalCase.file_paths,
evalCaseId: evalCase.id,
metadata: {
systemPrompt: promptInputs.systemMessage ?? '',
},
};
});

@@ -2665,13 +2663,11 @@ async function invokeProvider(

return await provider.invoke({
question: promptInputs.question,
systemPrompt: promptInputs.systemMessage,
chatPrompt: promptInputs.chatPrompt,
inputFiles: evalCase.file_paths,
evalCaseId: evalCase.id,
attempt,
metadata: {
systemPrompt: promptInputs.systemMessage ?? '',
},
signal: controller.signal,
cwd,
workspaceFile,
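Since the rendered diff drops the +/- markers, here is the net effect of the orchestrator change as a hedged sketch (types simplified): `systemPrompt` moves from a stringly-typed `metadata` entry to a first-class field on the provider request.

```ts
declare const promptInputs: { question: string; systemMessage?: string };

// Before: system prompt tunneled through metadata, defaulting to ''.
const requestBefore = {
  question: promptInputs.question,
  metadata: { systemPrompt: promptInputs.systemMessage ?? '' },
};

// After: a direct, optional field; no empty-string fallback needed.
const requestAfter = {
  question: promptInputs.question,
  systemPrompt: promptInputs.systemMessage,
};
```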
34 changes: 32 additions & 2 deletions packages/core/src/evaluation/registry/builtin-evaluators.ts
@@ -31,7 +31,7 @@ import {
runStartsWithAssertion,
} from '../evaluators.js';
import { InlineAssertEvaluator } from '../evaluators/inline-assert.js';
import { resolveCustomPrompt } from '../evaluators/prompt-resolution.js';
import { containsTemplateVariables, resolveCustomPrompt } from '../evaluators/prompt-resolution.js';
import { isAgentProvider } from '../providers/types.js';
import type { Provider } from '../providers/types.js';
import type { ToolTrajectoryEvaluatorConfig } from '../trace.js';
@@ -126,9 +126,39 @@ export const llmGraderFactory: EvaluatorFactoryFn = (config, context) => {
},
agentTimeoutMs,
);

// Determine whether the resolved prompt should replace the entire
// evaluator template or be injected as the {{criteria}} in the default
// template.
//
// Script-based prompts (resolvedPromptScript) and file-based prompts
// (resolvedPromptPath/promptPath) are always treated as full template
// overrides — they're expected to produce the complete grader prompt.
//
// Inline `prompt:` strings are checked for template variables like
// {{output}}, {{input}}, etc. If present, the string is a full custom
// template. If absent, it's bare criteria text (e.g. "Check if the
// response shows step-by-step work") and gets injected into the default
// template's {{criteria}} slot so the grader still receives the
// candidate output, input, and reference answer. (#982)
const isFromInlinePrompt =
!c.resolvedPromptScript?.length && !c.resolvedPromptPath && !c.promptPath;

let evaluatorTemplateOverride: string | undefined;
let evalCase = evalContext.evalCase;
if (customPrompt) {
if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) {
evaluatorTemplateOverride = customPrompt;
} else {
// Bare inline text — use as criteria in the default template
evalCase = { ...evalCase, criteria: customPrompt };
}
}

return evaluator.evaluate({
...evalContext,
evaluatorTemplateOverride: customPrompt,
evalCase,
evaluatorTemplateOverride,
evaluator: c,
});
},
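The two assertion shapes this logic distinguishes, lifted from the tests below, showing how each `prompt:` string is routed:

```ts
// Bare criteria: no known {{variables}}, so the default template is kept and
// this text fills its {{criteria}} slot (the grader still sees output/input).
const bareCriteria = {
  name: 'step-check',
  type: 'llm-grader',
  prompt: 'Check if the response shows step-by-step work',
};

// Full template: contains known variables, so it replaces the default
// template entirely (backward compatible with intentional overrides).
const fullTemplate = {
  name: 'custom-template',
  type: 'llm-grader',
  prompt: 'Custom grader: evaluate {{output}} against {{criteria}}',
};
```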
105 changes: 105 additions & 0 deletions packages/core/test/evaluation/evaluators.test.ts
@@ -788,6 +788,111 @@ describe('LlmGraderEvaluator (llm-grader)', () => {
expect(warnSpy.mock.calls[0][0]).toContain('skipped');
warnSpy.mockRestore();
});

it('treats bare prompt string as criteria, not full template override (#982)', async () => {
// When a user writes `prompt: "Check step-by-step work"` in an assertion,
// the grader should receive the DEFAULT_EVALUATOR_TEMPLATE (which contains
// {{output}}, {{input}}, etc.) with the prompt text injected as {{criteria}},
// NOT use the bare text as the entire template replacement.
const graderProvider = new CapturingProvider({
output: [
{
role: 'assistant',
content: JSON.stringify({
score: 0.9,
assertions: [{ text: 'Shows step-by-step work', passed: true }],
}),
},
],
});

const evaluator = llmGraderFactory(
{
name: 'step-check',
type: 'llm-grader',
prompt: 'Check if the response shows step-by-step work',
},
{
graderProvider,
llmGrader: new LlmGraderEvaluator({
resolveGraderProvider: async () => graderProvider,
}),
registry: {} as never,
},
);

await evaluator.evaluate({
evalCase: {
...baseTestCase,
criteria: 'Original criteria from test case',
},
candidate: 'Step 1: Read the code\nStep 2: Write tests\nStep 3: Refactor',
target: baseTarget,
provider: graderProvider,
attempt: 0,
promptInputs: { question: '' },
now: new Date(),
});

// The user prompt should contain the full default template structure
const userPrompt = graderProvider.lastRequest?.question ?? '';
expect(userPrompt).toContain('[[ ## criteria ## ]]');
expect(userPrompt).toContain('[[ ## answer ## ]]');
expect(userPrompt).toContain('[[ ## question ## ]]');
// The bare prompt text should appear as the criteria
expect(userPrompt).toContain('Check if the response shows step-by-step work');
// The candidate answer should be present in the template
expect(userPrompt).toContain('Step 1: Read the code');
});

it('uses prompt with {{output}} as full template override', async () => {
// When a user provides a template with known variables, it SHOULD replace
// the default template (backward compatible with intentional overrides).
const graderProvider = new CapturingProvider({
output: [
{
role: 'assistant',
content: JSON.stringify({
score: 0.8,
assertions: [{ text: 'Custom template used', passed: true }],
}),
},
],
});

const customTemplate = 'Custom grader: evaluate {{output}} against {{criteria}}';

const evaluator = llmGraderFactory(
{
name: 'custom-template',
type: 'llm-grader',
prompt: customTemplate,
},
{
graderProvider,
llmGrader: new LlmGraderEvaluator({
resolveGraderProvider: async () => graderProvider,
}),
registry: {} as never,
},
);

await evaluator.evaluate({
evalCase: baseTestCase,
candidate: 'Some answer',
target: baseTarget,
provider: graderProvider,
attempt: 0,
promptInputs: { question: '' },
now: new Date(),
});

// The custom template should be used as-is (with substitutions)
const userPrompt = graderProvider.lastRequest?.question ?? '';
expect(userPrompt).toContain('Custom grader: evaluate');
// Should NOT contain the default template's structure
expect(userPrompt).not.toContain('[[ ## answer ## ]]');
});
});

describe('CodeEvaluator', () => {
77 changes: 77 additions & 0 deletions packages/core/test/evaluation/evaluators/prompt-resolution.test.ts
@@ -0,0 +1,77 @@
import { describe, expect, it } from 'bun:test';

import {
containsTemplateVariables,
resolveCustomPrompt,
} from '../../../src/evaluation/evaluators/prompt-resolution.js';

describe('containsTemplateVariables', () => {
it('returns true for template with {{output}}', () => {
expect(containsTemplateVariables('Grade the {{output}} against {{criteria}}')).toBe(true);
});

it('returns true for template with {{input}}', () => {
expect(containsTemplateVariables('Evaluate {{input}} and {{output}}')).toBe(true);
});

it('returns true for template with {{expected_output}}', () => {
expect(containsTemplateVariables('Compare {{output}} to {{expected_output}}')).toBe(true);
});

it('returns true for template with {{criteria}}', () => {
expect(containsTemplateVariables('Check {{criteria}} for {{output}}')).toBe(true);
});

it('returns true for template with {{file_changes}}', () => {
expect(containsTemplateVariables('Review {{file_changes}}')).toBe(true);
});

it('returns true for deprecated {{output_text}} variable', () => {
expect(containsTemplateVariables('Grade the {{output_text}}')).toBe(true);
});

it('returns true for deprecated {{input_text}} variable', () => {
expect(containsTemplateVariables('Evaluate {{input_text}}')).toBe(true);
});

it('returns true with whitespace in braces', () => {
expect(containsTemplateVariables('Grade the {{ output }} carefully')).toBe(true);
});

it('returns false for bare criteria text without variables', () => {
expect(containsTemplateVariables('Check if the response shows step-by-step work')).toBe(false);
});

it('returns false for text with unknown variable names', () => {
expect(containsTemplateVariables('Evaluate {{answer}} against {{rubric}}')).toBe(false);
});

it('returns false for empty string', () => {
expect(containsTemplateVariables('')).toBe(false);
});

it('returns false for text with single braces', () => {
expect(containsTemplateVariables('Check {output} carefully')).toBe(false);
});
});

describe('resolveCustomPrompt', () => {
it('returns inline prompt string as-is', async () => {
const result = await resolveCustomPrompt({
prompt: 'Check if the response is correct',
});
expect(result).toBe('Check if the response is correct');
});

it('returns undefined when no prompt is configured', async () => {
const result = await resolveCustomPrompt({});
expect(result).toBeUndefined();
});

it('returns undefined when prompt is not a string', async () => {
const result = await resolveCustomPrompt({
prompt: { command: ['node', 'script.js'] },
});
expect(result).toBeUndefined();
});
});