25 changes: 25 additions & 0 deletions packages/core/src/evaluation/evaluators/prompt-resolution.ts
@@ -2,13 +2,21 @@
* Prompt resolution utilities for LLM judge evaluators.
*
* Extracted from orchestrator.ts to enable reuse by the evaluator registry.
*
* Key behavior: When a user writes `prompt: "some text"` in an assertion,
* `resolveCustomPrompt()` returns that text. The caller must then decide
* whether the text is a **full template** (contains `{{output}}` etc.) or
* **bare criteria** (no template variables). Use `containsTemplateVariables()`
* to distinguish: full templates become `evaluatorTemplateOverride`, while
* bare criteria are injected into the default template's `{{criteria}}` slot.
*/

import path from 'node:path';

import { toSnakeCaseDeep } from '../case-conversion.js';
import { readTextFile } from '../file-utils.js';
import type { Message } from '../providers/types.js';
import { VALID_TEMPLATE_VARIABLES } from '../template-variables.js';
import type { TraceSummary } from '../trace.js';
import type { EvalTest, PromptScriptConfig } from '../types.js';
import { executeScript } from './code-evaluator.js';
@@ -66,6 +74,23 @@ export async function resolveCustomPrompt(
return undefined;
}

/**
* Checks whether a prompt string contains any known `{{ variable }}` template
* placeholders (e.g. `{{output}}`, `{{input}}`). If it does, the string is a
* full evaluator template and should replace the default template. If not,
* it's bare criteria text and should be injected into the `{{criteria}}` slot
* of the default template.
*/
export function containsTemplateVariables(text: string): boolean {
const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
for (const match of text.matchAll(variablePattern)) {
if (VALID_TEMPLATE_VARIABLES.has(match[1])) {
return true;
}
}
return false;
}

async function executePromptTemplate(
script: readonly string[],
context: ResolveCustomPromptContext,
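For context, a minimal usage sketch of `containsTemplateVariables()` (not part of the diff; the inputs and expected results are lifted from the new unit tests further down):

```ts
import { containsTemplateVariables } from './prompt-resolution.js';

// Known variables ({{output}}, {{criteria}}, ...) mark a full template override.
containsTemplateVariables('Grade {{output}} against {{criteria}}'); // true

// No known variables: bare criteria, injected into the default template's
// {{criteria}} slot instead of replacing the template.
containsTemplateVariables('Check if the response shows step-by-step work'); // false

// Unknown variable names are not treated as template placeholders.
containsTemplateVariables('Evaluate {{answer}} against {{rubric}}'); // false
```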
41 changes: 40 additions & 1 deletion packages/core/src/evaluation/formatting/prompt-builder.ts
@@ -85,9 +85,14 @@ export async function buildPromptInputs(
})
: undefined;

// Extract system message from leading system-role messages in the input.
// This is used by the orchestrator to pass the system prompt as a direct field
// on ProviderRequest and by evaluators that need the system context separately.
const systemMessage = extractSystemMessage(testCase.input, segmentsByMessage, mode);

// Both question (flat string) and chatPrompt (structured messages) are returned:
// chatPrompt is used for the API call, question is retained for logging/debugging.
return { question, chatPrompt };
return { question, chatPrompt, systemMessage };
}

/**
@@ -118,6 +123,40 @@ function needsRoleMarkers(
return messagesWithContent > 1;
}

/**
* Extract the system message text from leading system-role messages in the input.
* Returns undefined if no system messages are present.
*/
function extractSystemMessage(
messages: readonly TestMessage[],
segmentsByMessage: readonly JsonObject[][],
mode: FormattingMode,
): string | undefined {
const systemParts: string[] = [];

for (let i = 0; i < messages.length; i++) {
if (messages[i].role !== 'system') {
break;
}

const segments = segmentsByMessage[i];
const contentParts: string[] = [];

for (const segment of segments) {
const formatted = formatSegment(segment, mode);
if (formatted) {
contentParts.push(formatted);
}
}

if (contentParts.length > 0) {
systemParts.push(contentParts.join('\n'));
}
}

return systemParts.length > 0 ? systemParts.join('\n\n') : undefined;
}

function buildChatPromptFromSegments(options: {
readonly messages: readonly TestMessage[];
readonly segmentsByMessage: readonly JsonObject[][];
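A simplified model of the extraction rule above, with segment formatting collapsed to plain strings (the real code formats each segment via `formatSegment`, joining segments with `\n` and messages with `\n\n`); a sketch for illustration only:

```ts
type SimpleMessage = {
  readonly role: 'system' | 'user' | 'assistant';
  readonly content: string;
};

function extractLeadingSystemText(messages: readonly SimpleMessage[]): string | undefined {
  const parts: string[] = [];
  for (const message of messages) {
    if (message.role !== 'system') break; // only *leading* system messages count
    if (message.content) parts.push(message.content);
  }
  return parts.length > 0 ? parts.join('\n\n') : undefined;
}

extractLeadingSystemText([
  { role: 'system', content: 'You are a strict grader.' },
  { role: 'system', content: 'Respond in JSON.' },
  { role: 'user', content: 'Grade this answer.' },
]); // => 'You are a strict grader.\n\nRespond in JSON.'

extractLeadingSystemText([
  { role: 'user', content: 'Hi' },
  { role: 'system', content: 'Not leading, so ignored.' },
]); // => undefined
```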
8 changes: 2 additions & 6 deletions packages/core/src/evaluation/orchestrator.ts
@@ -1215,11 +1215,9 @@ async function runBatchEvaluation(options: {
const promptInputs = promptInputsList[index];
return {
question: promptInputs.question,
systemPrompt: promptInputs.systemMessage,
inputFiles: evalCase.file_paths,
evalCaseId: evalCase.id,
metadata: {
systemPrompt: promptInputs.systemMessage ?? '',
},
};
});

@@ -2665,13 +2663,11 @@ async function invokeProvider(

return await provider.invoke({
question: promptInputs.question,
systemPrompt: promptInputs.systemMessage,
chatPrompt: promptInputs.chatPrompt,
inputFiles: evalCase.file_paths,
evalCaseId: evalCase.id,
attempt,
metadata: {
systemPrompt: promptInputs.systemMessage ?? '',
},
signal: controller.signal,
cwd,
workspaceFile,
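Since the rendered diff drops the +/- markers, here is the net effect of the orchestrator change as a hedged sketch (types simplified): `systemPrompt` moves from a stringly-typed `metadata` entry to a first-class field on the provider request.

```ts
declare const promptInputs: { question: string; systemMessage?: string };

// Before: system prompt tunneled through metadata, defaulting to ''.
const requestBefore = {
  question: promptInputs.question,
  metadata: { systemPrompt: promptInputs.systemMessage ?? '' },
};

// After: a direct, optional field; no empty-string fallback needed.
const requestAfter = {
  question: promptInputs.question,
  systemPrompt: promptInputs.systemMessage,
};
```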
34 changes: 32 additions & 2 deletions packages/core/src/evaluation/registry/builtin-evaluators.ts
@@ -31,7 +31,7 @@ import {
runStartsWithAssertion,
} from '../evaluators.js';
import { InlineAssertEvaluator } from '../evaluators/inline-assert.js';
import { resolveCustomPrompt } from '../evaluators/prompt-resolution.js';
import { containsTemplateVariables, resolveCustomPrompt } from '../evaluators/prompt-resolution.js';
import { isAgentProvider } from '../providers/types.js';
import type { Provider } from '../providers/types.js';
import type { ToolTrajectoryEvaluatorConfig } from '../trace.js';
@@ -126,9 +126,39 @@ export const llmGraderFactory: EvaluatorFactoryFn = (config, context) => {
},
agentTimeoutMs,
);

// Determine whether the resolved prompt should replace the entire
// evaluator template or be injected as the {{criteria}} in the default
// template.
//
// Script-based prompts (resolvedPromptScript) and file-based prompts
// (resolvedPromptPath/promptPath) are always treated as full template
// overrides — they're expected to produce the complete grader prompt.
//
// Inline `prompt:` strings are checked for template variables like
// {{output}}, {{input}}, etc. If present, the string is a full custom
// template. If absent, it's bare criteria text (e.g. "Check if the
// response shows step-by-step work") and gets injected into the default
// template's {{criteria}} slot so the grader still receives the
// candidate output, input, and reference answer. (#982)
const isFromInlinePrompt =
!c.resolvedPromptScript?.length && !c.resolvedPromptPath && !c.promptPath;

let evaluatorTemplateOverride: string | undefined;
let evalCase = evalContext.evalCase;
if (customPrompt) {
if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) {
evaluatorTemplateOverride = customPrompt;
} else {
// Bare inline text — use as criteria in the default template
evalCase = { ...evalCase, criteria: customPrompt };
}
}

return evaluator.evaluate({
...evalContext,
evaluatorTemplateOverride: customPrompt,
evalCase,
evaluatorTemplateOverride,
evaluator: c,
});
},
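The two assertion shapes this logic distinguishes, lifted from the tests below, showing how each `prompt:` string is routed:

```ts
// Bare criteria: no known {{variables}}, so the default template is kept and
// this text fills its {{criteria}} slot (the grader still sees output/input).
const bareCriteria = {
  name: 'step-check',
  type: 'llm-grader',
  prompt: 'Check if the response shows step-by-step work',
};

// Full template: contains known variables, so it replaces the default
// template entirely (backward compatible with intentional overrides).
const fullTemplate = {
  name: 'custom-template',
  type: 'llm-grader',
  prompt: 'Custom grader: evaluate {{output}} against {{criteria}}',
};
```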
105 changes: 105 additions & 0 deletions packages/core/test/evaluation/evaluators.test.ts
@@ -788,6 +788,111 @@ describe('LlmGraderEvaluator (llm-grader)', () => {
expect(warnSpy.mock.calls[0][0]).toContain('skipped');
warnSpy.mockRestore();
});

it('treats bare prompt string as criteria, not full template override (#982)', async () => {
// When a user writes `prompt: "Check step-by-step work"` in an assertion,
// the grader should receive the DEFAULT_EVALUATOR_TEMPLATE (which contains
// {{output}}, {{input}}, etc.) with the prompt text injected as {{criteria}},
// NOT use the bare text as the entire template replacement.
const graderProvider = new CapturingProvider({
output: [
{
role: 'assistant',
content: JSON.stringify({
score: 0.9,
assertions: [{ text: 'Shows step-by-step work', passed: true }],
}),
},
],
});

const evaluator = llmGraderFactory(
{
name: 'step-check',
type: 'llm-grader',
prompt: 'Check if the response shows step-by-step work',
},
{
graderProvider,
llmGrader: new LlmGraderEvaluator({
resolveGraderProvider: async () => graderProvider,
}),
registry: {} as never,
},
);

await evaluator.evaluate({
evalCase: {
...baseTestCase,
criteria: 'Original criteria from test case',
},
candidate: 'Step 1: Read the code\nStep 2: Write tests\nStep 3: Refactor',
target: baseTarget,
provider: graderProvider,
attempt: 0,
promptInputs: { question: '' },
now: new Date(),
});

// The user prompt should contain the full default template structure
const userPrompt = graderProvider.lastRequest?.question ?? '';
expect(userPrompt).toContain('[[ ## criteria ## ]]');
expect(userPrompt).toContain('[[ ## answer ## ]]');
expect(userPrompt).toContain('[[ ## question ## ]]');
// The bare prompt text should appear as the criteria
expect(userPrompt).toContain('Check if the response shows step-by-step work');
// The candidate answer should be present in the template
expect(userPrompt).toContain('Step 1: Read the code');
});

it('uses prompt with {{output}} as full template override', async () => {
// When a user provides a template with known variables, it SHOULD replace
// the default template (backward compatible with intentional overrides).
const graderProvider = new CapturingProvider({
output: [
{
role: 'assistant',
content: JSON.stringify({
score: 0.8,
assertions: [{ text: 'Custom template used', passed: true }],
}),
},
],
});

const customTemplate = 'Custom grader: evaluate {{output}} against {{criteria}}';

const evaluator = llmGraderFactory(
{
name: 'custom-template',
type: 'llm-grader',
prompt: customTemplate,
},
{
graderProvider,
llmGrader: new LlmGraderEvaluator({
resolveGraderProvider: async () => graderProvider,
}),
registry: {} as never,
},
);

await evaluator.evaluate({
evalCase: baseTestCase,
candidate: 'Some answer',
target: baseTarget,
provider: graderProvider,
attempt: 0,
promptInputs: { question: '' },
now: new Date(),
});

// The custom template should be used as-is (with substitutions)
const userPrompt = graderProvider.lastRequest?.question ?? '';
expect(userPrompt).toContain('Custom grader: evaluate');
// Should NOT contain the default template's structure
expect(userPrompt).not.toContain('[[ ## answer ## ]]');
});
});

describe('CodeEvaluator', () => {
77 changes: 77 additions & 0 deletions packages/core/test/evaluation/evaluators/prompt-resolution.test.ts
@@ -0,0 +1,77 @@
import { describe, expect, it } from 'bun:test';

import {
containsTemplateVariables,
resolveCustomPrompt,
} from '../../../src/evaluation/evaluators/prompt-resolution.js';

describe('containsTemplateVariables', () => {
it('returns true for template with {{output}}', () => {
expect(containsTemplateVariables('Grade the {{output}} against {{criteria}}')).toBe(true);
});

it('returns true for template with {{input}}', () => {
expect(containsTemplateVariables('Evaluate {{input}} and {{output}}')).toBe(true);
});

it('returns true for template with {{expected_output}}', () => {
expect(containsTemplateVariables('Compare {{output}} to {{expected_output}}')).toBe(true);
});

it('returns true for template with {{criteria}}', () => {
expect(containsTemplateVariables('Check {{criteria}} for {{output}}')).toBe(true);
});

it('returns true for template with {{file_changes}}', () => {
expect(containsTemplateVariables('Review {{file_changes}}')).toBe(true);
});

it('returns true for deprecated {{output_text}} variable', () => {
expect(containsTemplateVariables('Grade the {{output_text}}')).toBe(true);
});

it('returns true for deprecated {{input_text}} variable', () => {
expect(containsTemplateVariables('Evaluate {{input_text}}')).toBe(true);
});

it('returns true with whitespace in braces', () => {
expect(containsTemplateVariables('Grade the {{ output }} carefully')).toBe(true);
});

it('returns false for bare criteria text without variables', () => {
expect(containsTemplateVariables('Check if the response shows step-by-step work')).toBe(false);
});

it('returns false for text with unknown variable names', () => {
expect(containsTemplateVariables('Evaluate {{answer}} against {{rubric}}')).toBe(false);
});

it('returns false for empty string', () => {
expect(containsTemplateVariables('')).toBe(false);
});

it('returns false for text with single braces', () => {
expect(containsTemplateVariables('Check {output} carefully')).toBe(false);
});
});

describe('resolveCustomPrompt', () => {
it('returns inline prompt string as-is', async () => {
const result = await resolveCustomPrompt({
prompt: 'Check if the response is correct',
});
expect(result).toBe('Check if the response is correct');
});

it('returns undefined when no prompt is configured', async () => {
const result = await resolveCustomPrompt({});
expect(result).toBeUndefined();
});

it('returns undefined when prompt is not a string', async () => {
const result = await resolveCustomPrompt({
prompt: { command: ['node', 'script.js'] },
});
expect(result).toBeUndefined();
});
});