diff --git a/packages/core/src/evaluation/evaluators/prompt-resolution.ts b/packages/core/src/evaluation/evaluators/prompt-resolution.ts index 5429e62ab..04e9df5dc 100644 --- a/packages/core/src/evaluation/evaluators/prompt-resolution.ts +++ b/packages/core/src/evaluation/evaluators/prompt-resolution.ts @@ -2,6 +2,13 @@ * Prompt resolution utilities for LLM judge evaluators. * * Extracted from orchestrator.ts to enable reuse by the evaluator registry. + * + * Key behavior: When a user writes `prompt: "some text"` in an assertion, + * `resolveCustomPrompt()` returns that text. The caller must then decide + * whether the text is a **full template** (contains `{{output}}` etc.) or + * **bare criteria** (no template variables). Use `containsTemplateVariables()` + * to distinguish: full templates become `evaluatorTemplateOverride`, while + * bare criteria are injected into the default template's `{{criteria}}` slot. */ import path from 'node:path'; @@ -9,6 +16,7 @@ import path from 'node:path'; import { toSnakeCaseDeep } from '../case-conversion.js'; import { readTextFile } from '../file-utils.js'; import type { Message } from '../providers/types.js'; +import { VALID_TEMPLATE_VARIABLES } from '../template-variables.js'; import type { TraceSummary } from '../trace.js'; import type { EvalTest, PromptScriptConfig } from '../types.js'; import { executeScript } from './code-evaluator.js'; @@ -66,6 +74,23 @@ export async function resolveCustomPrompt( return undefined; } +/** + * Checks whether a prompt string contains any known `{{ variable }}` template + * placeholders (e.g. `{{output}}`, `{{input}}`). If it does, the string is a + * full evaluator template and should replace the default template. If not, + * it's bare criteria text and should be injected into the `{{criteria}}` slot + * of the default template. + */ +export function containsTemplateVariables(text: string): boolean { + const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g; + for (const match of text.matchAll(variablePattern)) { + if (VALID_TEMPLATE_VARIABLES.has(match[1])) { + return true; + } + } + return false; +} + async function executePromptTemplate( script: readonly string[], context: ResolveCustomPromptContext, diff --git a/packages/core/src/evaluation/formatting/prompt-builder.ts b/packages/core/src/evaluation/formatting/prompt-builder.ts index 53290599c..f30ee9fca 100644 --- a/packages/core/src/evaluation/formatting/prompt-builder.ts +++ b/packages/core/src/evaluation/formatting/prompt-builder.ts @@ -85,9 +85,14 @@ export async function buildPromptInputs( }) : undefined; + // Extract system message from leading system-role messages in the input. + // This is used by the orchestrator to pass the system prompt as a direct field + // on ProviderRequest and by evaluators that need the system context separately. + const systemMessage = extractSystemMessage(testCase.input, segmentsByMessage, mode); + // Both question (flat string) and chatPrompt (structured messages) are returned: // chatPrompt is used for the API call, question is retained for logging/debugging. - return { question, chatPrompt }; + return { question, chatPrompt, systemMessage }; } /** @@ -118,6 +123,40 @@ function needsRoleMarkers( return messagesWithContent > 1; } +/** + * Extract the system message text from leading system-role messages in the input. + * Returns undefined if no system messages are present. 
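+ *
+ * @example
+ * // Illustrative only (hypothetical values): for an input of
+ * //   [{ role: 'system', content: 'Be terse.' }, { role: 'user', content: 'Hi' }]
+ * // this returns 'Be terse.'. If the input does not begin with a
+ * // system-role message, it returns undefined.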
+ */
+function extractSystemMessage(
+  messages: readonly TestMessage[],
+  segmentsByMessage: readonly JsonObject[][],
+  mode: FormattingMode,
+): string | undefined {
+  const systemParts: string[] = [];
+
+  for (let i = 0; i < messages.length; i++) {
+    if (messages[i].role !== 'system') {
+      break;
+    }
+
+    const segments = segmentsByMessage[i];
+    const contentParts: string[] = [];
+
+    for (const segment of segments) {
+      const formatted = formatSegment(segment, mode);
+      if (formatted) {
+        contentParts.push(formatted);
+      }
+    }
+
+    if (contentParts.length > 0) {
+      systemParts.push(contentParts.join('\n'));
+    }
+  }
+
+  return systemParts.length > 0 ? systemParts.join('\n\n') : undefined;
+}
+
 function buildChatPromptFromSegments(options: {
   readonly messages: readonly TestMessage[];
   readonly segmentsByMessage: readonly JsonObject[][];
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index 4e6306fc7..54032ed58 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -1215,11 +1215,9 @@ async function runBatchEvaluation(options: {
     const promptInputs = promptInputsList[index];
     return {
       question: promptInputs.question,
+      systemPrompt: promptInputs.systemMessage,
       inputFiles: evalCase.file_paths,
       evalCaseId: evalCase.id,
-      metadata: {
-        systemPrompt: promptInputs.systemMessage ?? '',
-      },
     };
   });

@@ -2665,13 +2663,11 @@ async function invokeProvider(

   return await provider.invoke({
     question: promptInputs.question,
+    systemPrompt: promptInputs.systemMessage,
     chatPrompt: promptInputs.chatPrompt,
     inputFiles: evalCase.file_paths,
     evalCaseId: evalCase.id,
     attempt,
-    metadata: {
-      systemPrompt: promptInputs.systemMessage ?? '',
-    },
     signal: controller.signal,
     cwd,
     workspaceFile,
diff --git a/packages/core/src/evaluation/registry/builtin-evaluators.ts b/packages/core/src/evaluation/registry/builtin-evaluators.ts
index caec4d609..37b17b77c 100644
--- a/packages/core/src/evaluation/registry/builtin-evaluators.ts
+++ b/packages/core/src/evaluation/registry/builtin-evaluators.ts
@@ -31,7 +31,7 @@ import {
   runStartsWithAssertion,
 } from '../evaluators.js';
 import { InlineAssertEvaluator } from '../evaluators/inline-assert.js';
-import { resolveCustomPrompt } from '../evaluators/prompt-resolution.js';
+import { containsTemplateVariables, resolveCustomPrompt } from '../evaluators/prompt-resolution.js';
 import { isAgentProvider } from '../providers/types.js';
 import type { Provider } from '../providers/types.js';
 import type { ToolTrajectoryEvaluatorConfig } from '../trace.js';
@@ -126,9 +126,39 @@ export const llmGraderFactory: EvaluatorFactoryFn = (config, context) => {
       },
       agentTimeoutMs,
     );
+
+    // Determine whether the resolved prompt should replace the entire
+    // evaluator template or be injected into the {{criteria}} slot of the
+    // default template.
+    //
+    // Script-based prompts (resolvedPromptScript) and file-based prompts
+    // (resolvedPromptPath/promptPath) are always treated as full template
+    // overrides — they're expected to produce the complete grader prompt.
+    //
+    // Inline `prompt:` strings are checked for template variables like
+    // {{output}}, {{input}}, etc. If present, the string is a full custom
+    // template. If absent, it's bare criteria text (e.g. "Check if the
+    // response shows step-by-step work") and gets injected into the default
+    // template's {{criteria}} slot so the grader still receives the
+    // candidate output, input, and reference answer. (#982)
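+    //
+    // Two illustrative inline values (hypothetical, for orientation):
+    //   prompt: "Check that the answer cites its sources"   -> bare criteria
+    //   prompt: "Grade {{output}} against {{criteria}}"     -> full template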
+    const isFromInlinePrompt =
+      !c.resolvedPromptScript?.length && !c.resolvedPromptPath && !c.promptPath;
+
+    let evaluatorTemplateOverride: string | undefined;
+    let evalCase = evalContext.evalCase;
+    if (customPrompt) {
+      if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) {
+        evaluatorTemplateOverride = customPrompt;
+      } else {
+        // Bare inline text — use as criteria in the default template
+        evalCase = { ...evalCase, criteria: customPrompt };
+      }
+    }
+
     return evaluator.evaluate({
       ...evalContext,
-      evaluatorTemplateOverride: customPrompt,
+      evalCase,
+      evaluatorTemplateOverride,
       evaluator: c,
     });
   },
diff --git a/packages/core/test/evaluation/evaluators.test.ts b/packages/core/test/evaluation/evaluators.test.ts
index 42dabec24..eca0bce73 100644
--- a/packages/core/test/evaluation/evaluators.test.ts
+++ b/packages/core/test/evaluation/evaluators.test.ts
@@ -788,6 +788,111 @@ describe('LlmGraderEvaluator (llm-grader)', () => {
     expect(warnSpy.mock.calls[0][0]).toContain('skipped');
     warnSpy.mockRestore();
   });
+
+  it('treats bare prompt string as criteria, not full template override (#982)', async () => {
+    // When a user writes `prompt: "Check step-by-step work"` in an assertion,
+    // the grader should receive the DEFAULT_EVALUATOR_TEMPLATE (which contains
+    // {{output}}, {{input}}, etc.) with the prompt text injected as {{criteria}},
+    // NOT use the bare text as the entire template replacement.
+    const graderProvider = new CapturingProvider({
+      output: [
+        {
+          role: 'assistant',
+          content: JSON.stringify({
+            score: 0.9,
+            assertions: [{ text: 'Shows step-by-step work', passed: true }],
+          }),
+        },
+      ],
+    });
+
+    const evaluator = llmGraderFactory(
+      {
+        name: 'step-check',
+        type: 'llm-grader',
+        prompt: 'Check if the response shows step-by-step work',
+      },
+      {
+        graderProvider,
+        llmGrader: new LlmGraderEvaluator({
+          resolveGraderProvider: async () => graderProvider,
+        }),
+        registry: {} as never,
+      },
+    );
+
+    await evaluator.evaluate({
+      evalCase: {
+        ...baseTestCase,
+        criteria: 'Original criteria from test case',
+      },
+      candidate: 'Step 1: Read the code\nStep 2: Write tests\nStep 3: Refactor',
+      target: baseTarget,
+      provider: graderProvider,
+      attempt: 0,
+      promptInputs: { question: '' },
+      now: new Date(),
+    });
+
+    // The user prompt should contain the full default template structure
+    const userPrompt = graderProvider.lastRequest?.question ?? '';
+    expect(userPrompt).toContain('[[ ## criteria ## ]]');
+    expect(userPrompt).toContain('[[ ## answer ## ]]');
+    expect(userPrompt).toContain('[[ ## question ## ]]');
+    // The bare prompt text should appear as the criteria
+    expect(userPrompt).toContain('Check if the response shows step-by-step work');
+    // The candidate answer should be present in the template
+    expect(userPrompt).toContain('Step 1: Read the code');
+  });
+
+  it('uses prompt with {{output}} as full template override', async () => {
+    // When a user provides a template with known variables, it SHOULD replace
+    // the default template (backward compatible with intentional overrides).
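+    // The assertion config this models is assumed to look roughly like:
+    //   - type: llm-grader
+    //     prompt: "Custom grader: evaluate {{output}} against {{criteria}}"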
+    const graderProvider = new CapturingProvider({
+      output: [
+        {
+          role: 'assistant',
+          content: JSON.stringify({
+            score: 0.8,
+            assertions: [{ text: 'Custom template used', passed: true }],
+          }),
+        },
+      ],
+    });
+
+    const customTemplate = 'Custom grader: evaluate {{output}} against {{criteria}}';
+
+    const evaluator = llmGraderFactory(
+      {
+        name: 'custom-template',
+        type: 'llm-grader',
+        prompt: customTemplate,
+      },
+      {
+        graderProvider,
+        llmGrader: new LlmGraderEvaluator({
+          resolveGraderProvider: async () => graderProvider,
+        }),
+        registry: {} as never,
+      },
+    );
+
+    await evaluator.evaluate({
+      evalCase: baseTestCase,
+      candidate: 'Some answer',
+      target: baseTarget,
+      provider: graderProvider,
+      attempt: 0,
+      promptInputs: { question: '' },
+      now: new Date(),
+    });
+
+    // The custom template should be used as-is (with substitutions)
+    const userPrompt = graderProvider.lastRequest?.question ?? '';
+    expect(userPrompt).toContain('Custom grader: evaluate');
+    // Should NOT contain the default template's structure
+    expect(userPrompt).not.toContain('[[ ## answer ## ]]');
+  });
 });

 describe('CodeEvaluator', () => {
diff --git a/packages/core/test/evaluation/evaluators/prompt-resolution.test.ts b/packages/core/test/evaluation/evaluators/prompt-resolution.test.ts
new file mode 100644
index 000000000..0d94ffe98
--- /dev/null
+++ b/packages/core/test/evaluation/evaluators/prompt-resolution.test.ts
@@ -0,0 +1,77 @@
+import { describe, expect, it } from 'bun:test';
+
+import {
+  containsTemplateVariables,
+  resolveCustomPrompt,
+} from '../../../src/evaluation/evaluators/prompt-resolution.js';
+
+describe('containsTemplateVariables', () => {
+  it('returns true for template with {{output}}', () => {
+    expect(containsTemplateVariables('Grade the {{output}} against {{criteria}}')).toBe(true);
+  });
+
+  it('returns true for template with {{input}}', () => {
+    expect(containsTemplateVariables('Evaluate {{input}} and {{output}}')).toBe(true);
+  });
+
+  it('returns true for template with {{expected_output}}', () => {
+    expect(containsTemplateVariables('Compare {{output}} to {{expected_output}}')).toBe(true);
+  });
+
+  it('returns true for template with {{criteria}}', () => {
+    expect(containsTemplateVariables('Check {{criteria}} for {{output}}')).toBe(true);
+  });
+
+  it('returns true for template with {{file_changes}}', () => {
+    expect(containsTemplateVariables('Review {{file_changes}}')).toBe(true);
+  });
+
+  it('returns true for deprecated {{output_text}} variable', () => {
+    expect(containsTemplateVariables('Grade the {{output_text}}')).toBe(true);
+  });
+
+  it('returns true for deprecated {{input_text}} variable', () => {
+    expect(containsTemplateVariables('Evaluate {{input_text}}')).toBe(true);
+  });
+
+  it('returns true with whitespace in braces', () => {
+    expect(containsTemplateVariables('Grade the {{ output }} carefully')).toBe(true);
+  });
+
+  it('returns false for bare criteria text without variables', () => {
+    expect(containsTemplateVariables('Check if the response shows step-by-step work')).toBe(false);
+  });
+
+  it('returns false for text with unknown variable names', () => {
+    expect(containsTemplateVariables('Evaluate {{answer}} against {{rubric}}')).toBe(false);
+  });
+
+  it('returns false for empty string', () => {
+    expect(containsTemplateVariables('')).toBe(false);
+  });
+
+  it('returns false for text with single braces', () => {
+    expect(containsTemplateVariables('Check {output} carefully')).toBe(false);
+  });
+});
+
+describe('resolveCustomPrompt', () => {
+  it('returns inline prompt string as-is', async () => {
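+    // Pass-through is deliberate: classifying the string as a full template
+    // vs. bare criteria is the caller's job (see the factory logic in
+    // builtin-evaluators.ts above).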
+    const result = await resolveCustomPrompt({
+      prompt: 'Check if the response is correct',
+    });
+    expect(result).toBe('Check if the response is correct');
+  });
+
+  it('returns undefined when no prompt is configured', async () => {
+    const result = await resolveCustomPrompt({});
+    expect(result).toBeUndefined();
+  });
+
+  it('returns undefined when prompt is not a string', async () => {
+    const result = await resolveCustomPrompt({
+      prompt: { command: ['node', 'script.js'] },
+    });
+    expect(result).toBeUndefined();
+  });
+});
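
Review sketch (not part of the patch): a condensed view of how the new pieces compose in the grader factory. It assumes the single-argument `resolveCustomPrompt` call shape used in the tests above, omits the script/file-path prompts that `isFromInlinePrompt` short-circuits, and uses a hypothetical import path:

    import {
      containsTemplateVariables,
      resolveCustomPrompt,
    } from './evaluators/prompt-resolution.js';

    // Hypothetical inputs for illustration.
    const prompt = 'Check if the response shows step-by-step work';
    let evalCase: { criteria?: string } = { criteria: 'original criteria' };
    let evaluatorTemplateOverride: string | undefined;

    const customPrompt = await resolveCustomPrompt({ prompt });
    if (customPrompt !== undefined) {
      if (containsTemplateVariables(customPrompt)) {
        // Known {{variables}} present: the string replaces the default template.
        evaluatorTemplateOverride = customPrompt;
      } else {
        // Bare text: keep the default template and swap in the criteria.
        evalCase = { ...evalCase, criteria: customPrompt };
      }
    }

In the bare-text branch the default template still supplies the candidate output, input, and reference answer to the grader, which is the behavior #982 asks for.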