From 4a5394d330dedc6c8b646a7e734e4c607ad45f41 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 8 Apr 2026 22:21:18 +0000 Subject: [PATCH 1/3] fix(core): extract system messages in prompt builder for LLM grader The buildPromptInputs function now correctly extracts system messages and returns them in the systemMessage field. The orchestrator passes the system prompt directly instead of burying it in metadata. Closes #982 Co-Authored-By: Claude Opus 4.6 (1M context) --- .../evaluation/formatting/prompt-builder.ts | 41 ++++++++++++++++++- packages/core/src/evaluation/orchestrator.ts | 8 +--- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/packages/core/src/evaluation/formatting/prompt-builder.ts b/packages/core/src/evaluation/formatting/prompt-builder.ts index 53290599c..f30ee9fca 100644 --- a/packages/core/src/evaluation/formatting/prompt-builder.ts +++ b/packages/core/src/evaluation/formatting/prompt-builder.ts @@ -85,9 +85,14 @@ export async function buildPromptInputs( }) : undefined; + // Extract system message from leading system-role messages in the input. + // This is used by the orchestrator to pass the system prompt as a direct field + // on ProviderRequest and by evaluators that need the system context separately. + const systemMessage = extractSystemMessage(testCase.input, segmentsByMessage, mode); + // Both question (flat string) and chatPrompt (structured messages) are returned: // chatPrompt is used for the API call, question is retained for logging/debugging. - return { question, chatPrompt }; + return { question, chatPrompt, systemMessage }; } /** @@ -118,6 +123,40 @@ function needsRoleMarkers( return messagesWithContent > 1; } +/** + * Extract the system message text from leading system-role messages in the input. + * Returns undefined if no system messages are present. + */ +function extractSystemMessage( + messages: readonly TestMessage[], + segmentsByMessage: readonly JsonObject[][], + mode: FormattingMode, +): string | undefined { + const systemParts: string[] = []; + + for (let i = 0; i < messages.length; i++) { + if (messages[i].role !== 'system') { + break; + } + + const segments = segmentsByMessage[i]; + const contentParts: string[] = []; + + for (const segment of segments) { + const formatted = formatSegment(segment, mode); + if (formatted) { + contentParts.push(formatted); + } + } + + if (contentParts.length > 0) { + systemParts.push(contentParts.join('\n')); + } + } + + return systemParts.length > 0 ? systemParts.join('\n\n') : undefined; +} + function buildChatPromptFromSegments(options: { readonly messages: readonly TestMessage[]; readonly segmentsByMessage: readonly JsonObject[][]; diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 4e6306fc7..54032ed58 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -1215,11 +1215,9 @@ async function runBatchEvaluation(options: { const promptInputs = promptInputsList[index]; return { question: promptInputs.question, + systemPrompt: promptInputs.systemMessage, inputFiles: evalCase.file_paths, evalCaseId: evalCase.id, - metadata: { - systemPrompt: promptInputs.systemMessage ?? '', - }, }; }); @@ -2665,13 +2663,11 @@ async function invokeProvider( return await provider.invoke({ question: promptInputs.question, + systemPrompt: promptInputs.systemMessage, chatPrompt: promptInputs.chatPrompt, inputFiles: evalCase.file_paths, evalCaseId: evalCase.id, attempt, - metadata: { - systemPrompt: promptInputs.systemMessage ?? '', - }, signal: controller.signal, cwd, workspaceFile, From 056f8f4a67b2c5930e1a0b729a970724c1798645 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 8 Apr 2026 22:49:41 +0000 Subject: [PATCH 2/3] fix(core): treat bare assertion prompt as criteria, not template override (#982) When a user writes `prompt: "Check step-by-step work"` in an llm-grader assertion, the text was being used as the entire evaluator template, replacing the DEFAULT_EVALUATOR_TEMPLATE which contains {{output}}, {{input}}, {{criteria}} variables. This meant the grader never saw the actual candidate response, always scoring 0. Now bare inline prompt strings (without template variables like {{output}}) are injected into the default template's {{criteria}} slot instead. Prompts that contain recognized template variables, and prompts from scripts/files, continue to work as full template overrides. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../evaluators/prompt-resolution.ts | 26 +++++ .../evaluation/registry/builtin-evaluators.ts | 37 +++++- .../core/test/evaluation/evaluators.test.ts | 105 ++++++++++++++++++ .../evaluators/prompt-resolution.test.ts | 77 +++++++++++++ 4 files changed, 243 insertions(+), 2 deletions(-) create mode 100644 packages/core/test/evaluation/evaluators/prompt-resolution.test.ts diff --git a/packages/core/src/evaluation/evaluators/prompt-resolution.ts b/packages/core/src/evaluation/evaluators/prompt-resolution.ts index 5429e62ab..3d271923f 100644 --- a/packages/core/src/evaluation/evaluators/prompt-resolution.ts +++ b/packages/core/src/evaluation/evaluators/prompt-resolution.ts @@ -2,6 +2,13 @@ * Prompt resolution utilities for LLM judge evaluators. * * Extracted from orchestrator.ts to enable reuse by the evaluator registry. + * + * Key behavior: When a user writes `prompt: "some text"` in an assertion, + * `resolveCustomPrompt()` returns that text. The caller must then decide + * whether the text is a **full template** (contains `{{output}}` etc.) or + * **bare criteria** (no template variables). Use `containsTemplateVariables()` + * to distinguish: full templates become `evaluatorTemplateOverride`, while + * bare criteria are injected into the default template's `{{criteria}}` slot. */ import path from 'node:path'; @@ -9,6 +16,7 @@ import path from 'node:path'; import { toSnakeCaseDeep } from '../case-conversion.js'; import { readTextFile } from '../file-utils.js'; import type { Message } from '../providers/types.js'; +import { VALID_TEMPLATE_VARIABLES } from '../template-variables.js'; import type { TraceSummary } from '../trace.js'; import type { EvalTest, PromptScriptConfig } from '../types.js'; import { executeScript } from './code-evaluator.js'; @@ -66,6 +74,24 @@ export async function resolveCustomPrompt( return undefined; } +/** + * Checks whether a prompt string contains any known `{{ variable }}` template + * placeholders (e.g. `{{output}}`, `{{input}}`). If it does, the string is a + * full evaluator template and should replace the default template. If not, + * it's bare criteria text and should be injected into the `{{criteria}}` slot + * of the default template. + */ +export function containsTemplateVariables(text: string): boolean { + const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g; + let match: RegExpExecArray | null; + while ((match = variablePattern.exec(text)) !== null) { + if (VALID_TEMPLATE_VARIABLES.has(match[1])) { + return true; + } + } + return false; +} + async function executePromptTemplate( script: readonly string[], context: ResolveCustomPromptContext, diff --git a/packages/core/src/evaluation/registry/builtin-evaluators.ts b/packages/core/src/evaluation/registry/builtin-evaluators.ts index caec4d609..b115fa00d 100644 --- a/packages/core/src/evaluation/registry/builtin-evaluators.ts +++ b/packages/core/src/evaluation/registry/builtin-evaluators.ts @@ -31,7 +31,10 @@ import { runStartsWithAssertion, } from '../evaluators.js'; import { InlineAssertEvaluator } from '../evaluators/inline-assert.js'; -import { resolveCustomPrompt } from '../evaluators/prompt-resolution.js'; +import { + containsTemplateVariables, + resolveCustomPrompt, +} from '../evaluators/prompt-resolution.js'; import { isAgentProvider } from '../providers/types.js'; import type { Provider } from '../providers/types.js'; import type { ToolTrajectoryEvaluatorConfig } from '../trace.js'; @@ -126,9 +129,39 @@ export const llmGraderFactory: EvaluatorFactoryFn = (config, context) => { }, agentTimeoutMs, ); + + // Determine whether the resolved prompt should replace the entire + // evaluator template or be injected as the {{criteria}} in the default + // template. + // + // Script-based prompts (resolvedPromptScript) and file-based prompts + // (resolvedPromptPath/promptPath) are always treated as full template + // overrides — they're expected to produce the complete grader prompt. + // + // Inline `prompt:` strings are checked for template variables like + // {{output}}, {{input}}, etc. If present, the string is a full custom + // template. If absent, it's bare criteria text (e.g. "Check if the + // response shows step-by-step work") and gets injected into the default + // template's {{criteria}} slot so the grader still receives the + // candidate output, input, and reference answer. (#982) + const isFromInlinePrompt = + !c.resolvedPromptScript?.length && !c.resolvedPromptPath && !c.promptPath; + + let evaluatorTemplateOverride: string | undefined; + let evalCase = evalContext.evalCase; + if (customPrompt) { + if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) { + evaluatorTemplateOverride = customPrompt; + } else { + // Bare inline text — use as criteria in the default template + evalCase = { ...evalCase, criteria: customPrompt }; + } + } + return evaluator.evaluate({ ...evalContext, - evaluatorTemplateOverride: customPrompt, + evalCase, + evaluatorTemplateOverride, evaluator: c, }); }, diff --git a/packages/core/test/evaluation/evaluators.test.ts b/packages/core/test/evaluation/evaluators.test.ts index 42dabec24..eca0bce73 100644 --- a/packages/core/test/evaluation/evaluators.test.ts +++ b/packages/core/test/evaluation/evaluators.test.ts @@ -788,6 +788,111 @@ describe('LlmGraderEvaluator (llm-grader)', () => { expect(warnSpy.mock.calls[0][0]).toContain('skipped'); warnSpy.mockRestore(); }); + + it('treats bare prompt string as criteria, not full template override (#982)', async () => { + // When a user writes `prompt: "Check step-by-step work"` in an assertion, + // the grader should receive the DEFAULT_EVALUATOR_TEMPLATE (which contains + // {{output}}, {{input}}, etc.) with the prompt text injected as {{criteria}}, + // NOT use the bare text as the entire template replacement. + const graderProvider = new CapturingProvider({ + output: [ + { + role: 'assistant', + content: JSON.stringify({ + score: 0.9, + assertions: [{ text: 'Shows step-by-step work', passed: true }], + }), + }, + ], + }); + + const evaluator = llmGraderFactory( + { + name: 'step-check', + type: 'llm-grader', + prompt: 'Check if the response shows step-by-step work', + }, + { + graderProvider, + llmGrader: new LlmGraderEvaluator({ + resolveGraderProvider: async () => graderProvider, + }), + registry: {} as never, + }, + ); + + await evaluator.evaluate({ + evalCase: { + ...baseTestCase, + criteria: 'Original criteria from test case', + }, + candidate: 'Step 1: Read the code\nStep 2: Write tests\nStep 3: Refactor', + target: baseTarget, + provider: graderProvider, + attempt: 0, + promptInputs: { question: '' }, + now: new Date(), + }); + + // The user prompt should contain the full default template structure + const userPrompt = graderProvider.lastRequest?.question ?? ''; + expect(userPrompt).toContain('[[ ## criteria ## ]]'); + expect(userPrompt).toContain('[[ ## answer ## ]]'); + expect(userPrompt).toContain('[[ ## question ## ]]'); + // The bare prompt text should appear as the criteria + expect(userPrompt).toContain('Check if the response shows step-by-step work'); + // The candidate answer should be present in the template + expect(userPrompt).toContain('Step 1: Read the code'); + }); + + it('uses prompt with {{output}} as full template override', async () => { + // When a user provides a template with known variables, it SHOULD replace + // the default template (backward compatible with intentional overrides). + const graderProvider = new CapturingProvider({ + output: [ + { + role: 'assistant', + content: JSON.stringify({ + score: 0.8, + assertions: [{ text: 'Custom template used', passed: true }], + }), + }, + ], + }); + + const customTemplate = 'Custom grader: evaluate {{output}} against {{criteria}}'; + + const evaluator = llmGraderFactory( + { + name: 'custom-template', + type: 'llm-grader', + prompt: customTemplate, + }, + { + graderProvider, + llmGrader: new LlmGraderEvaluator({ + resolveGraderProvider: async () => graderProvider, + }), + registry: {} as never, + }, + ); + + await evaluator.evaluate({ + evalCase: baseTestCase, + candidate: 'Some answer', + target: baseTarget, + provider: graderProvider, + attempt: 0, + promptInputs: { question: '' }, + now: new Date(), + }); + + // The custom template should be used as-is (with substitutions) + const userPrompt = graderProvider.lastRequest?.question ?? ''; + expect(userPrompt).toContain('Custom grader: evaluate'); + // Should NOT contain the default template's structure + expect(userPrompt).not.toContain('[[ ## answer ## ]]'); + }); }); describe('CodeEvaluator', () => { diff --git a/packages/core/test/evaluation/evaluators/prompt-resolution.test.ts b/packages/core/test/evaluation/evaluators/prompt-resolution.test.ts new file mode 100644 index 000000000..0d94ffe98 --- /dev/null +++ b/packages/core/test/evaluation/evaluators/prompt-resolution.test.ts @@ -0,0 +1,77 @@ +import { describe, expect, it } from 'bun:test'; + +import { + containsTemplateVariables, + resolveCustomPrompt, +} from '../../../src/evaluation/evaluators/prompt-resolution.js'; + +describe('containsTemplateVariables', () => { + it('returns true for template with {{output}}', () => { + expect(containsTemplateVariables('Grade the {{output}} against {{criteria}}')).toBe(true); + }); + + it('returns true for template with {{input}}', () => { + expect(containsTemplateVariables('Evaluate {{input}} and {{output}}')).toBe(true); + }); + + it('returns true for template with {{expected_output}}', () => { + expect(containsTemplateVariables('Compare {{output}} to {{expected_output}}')).toBe(true); + }); + + it('returns true for template with {{criteria}}', () => { + expect(containsTemplateVariables('Check {{criteria}} for {{output}}')).toBe(true); + }); + + it('returns true for template with {{file_changes}}', () => { + expect(containsTemplateVariables('Review {{file_changes}}')).toBe(true); + }); + + it('returns true for deprecated {{output_text}} variable', () => { + expect(containsTemplateVariables('Grade the {{output_text}}')).toBe(true); + }); + + it('returns true for deprecated {{input_text}} variable', () => { + expect(containsTemplateVariables('Evaluate {{input_text}}')).toBe(true); + }); + + it('returns true with whitespace in braces', () => { + expect(containsTemplateVariables('Grade the {{ output }} carefully')).toBe(true); + }); + + it('returns false for bare criteria text without variables', () => { + expect(containsTemplateVariables('Check if the response shows step-by-step work')).toBe(false); + }); + + it('returns false for text with unknown variable names', () => { + expect(containsTemplateVariables('Evaluate {{answer}} against {{rubric}}')).toBe(false); + }); + + it('returns false for empty string', () => { + expect(containsTemplateVariables('')).toBe(false); + }); + + it('returns false for text with single braces', () => { + expect(containsTemplateVariables('Check {output} carefully')).toBe(false); + }); +}); + +describe('resolveCustomPrompt', () => { + it('returns inline prompt string as-is', async () => { + const result = await resolveCustomPrompt({ + prompt: 'Check if the response is correct', + }); + expect(result).toBe('Check if the response is correct'); + }); + + it('returns undefined when no prompt is configured', async () => { + const result = await resolveCustomPrompt({}); + expect(result).toBeUndefined(); + }); + + it('returns undefined when prompt is not a string', async () => { + const result = await resolveCustomPrompt({ + prompt: { command: ['node', 'script.js'] }, + }); + expect(result).toBeUndefined(); + }); +}); From 2f58cfa9d1059aacbdacb9a47bc173af4092775a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 8 Apr 2026 22:51:21 +0000 Subject: [PATCH 3/3] style: fix lint issues in prompt-resolution and builtin-evaluators Replace while-loop assignment with matchAll iterator (biome noAssignInExpressions) and collapse short import to single line (biome formatter). Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/core/src/evaluation/evaluators/prompt-resolution.ts | 3 +-- packages/core/src/evaluation/registry/builtin-evaluators.ts | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/packages/core/src/evaluation/evaluators/prompt-resolution.ts b/packages/core/src/evaluation/evaluators/prompt-resolution.ts index 3d271923f..04e9df5dc 100644 --- a/packages/core/src/evaluation/evaluators/prompt-resolution.ts +++ b/packages/core/src/evaluation/evaluators/prompt-resolution.ts @@ -83,8 +83,7 @@ export async function resolveCustomPrompt( */ export function containsTemplateVariables(text: string): boolean { const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g; - let match: RegExpExecArray | null; - while ((match = variablePattern.exec(text)) !== null) { + for (const match of text.matchAll(variablePattern)) { if (VALID_TEMPLATE_VARIABLES.has(match[1])) { return true; } diff --git a/packages/core/src/evaluation/registry/builtin-evaluators.ts b/packages/core/src/evaluation/registry/builtin-evaluators.ts index b115fa00d..37b17b77c 100644 --- a/packages/core/src/evaluation/registry/builtin-evaluators.ts +++ b/packages/core/src/evaluation/registry/builtin-evaluators.ts @@ -31,10 +31,7 @@ import { runStartsWithAssertion, } from '../evaluators.js'; import { InlineAssertEvaluator } from '../evaluators/inline-assert.js'; -import { - containsTemplateVariables, - resolveCustomPrompt, -} from '../evaluators/prompt-resolution.js'; +import { containsTemplateVariables, resolveCustomPrompt } from '../evaluators/prompt-resolution.js'; import { isAgentProvider } from '../providers/types.js'; import type { Provider } from '../providers/types.js'; import type { ToolTrajectoryEvaluatorConfig } from '../trace.js';