From c197a6c4fdb3055c4eed3275a7f820e7f63f9c34 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 17:17:59 +0200 Subject: [PATCH 01/19] docs: add prompt adherence design spec Co-Authored-By: Claude Opus 4.6 (1M context) --- .../2026-04-09-prompt-adherence-design.md | 133 ++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 docs/superpowers/specs/2026-04-09-prompt-adherence-design.md diff --git a/docs/superpowers/specs/2026-04-09-prompt-adherence-design.md b/docs/superpowers/specs/2026-04-09-prompt-adherence-design.md new file mode 100644 index 00000000..ce60e58f --- /dev/null +++ b/docs/superpowers/specs/2026-04-09-prompt-adherence-design.md @@ -0,0 +1,133 @@ +# Prompt Adherence Check for PlanExe + +## Problem + +PlanExe's pipeline has a "normalization bias." Each of the ~70 nodes nudges the plan toward what a reasonable project *should* look like, and the cumulative drift over the full pipeline is significant. The user's stated reality gets overridden by the LLM's priors about what's plausible. + +This manifests as: +- **Stated facts ignored.** The user says "the East Wing has already been demolished" but the plan includes demolition permitting steps. +- **Requirements softened.** The user says "100% renewable energy" and the plan targets 60-80%. +- **Intent diluted.** The user's tone is "this is happening, execute it" but the plan spends 40% on feasibility studies. +- **Unsolicited caveats.** The plan adds qualifications, risk disclaimers, and scope reductions the user didn't ask for. +- **Generic PM filler.** The plan relies on boilerplate project management language instead of addressing the specific problem. + +Existing pipeline steps (Premise Attack, Premortem, Expert Criticism, Self Audit) assess plan *quality* — whether the plan is internally consistent, well-structured, and risk-aware. None of them check whether the plan actually does what the user asked. 
+ +## Goal + +A pipeline step that checks the final plan against the original user prompt and produces a scored report showing which user directives were honored, softened, or ignored. The user can scan the report and immediately see the degree of prompt drift. + +## Architecture + +Two-phase LLM approach: extract directives from the prompt, then score each one against the final plan. + +### Phase 1 — Extract Directives + +Read `plan.txt` (the original user prompt) and extract a structured list of directives. Each directive is one thing the user stated or implied that the plan must respect. + +```python +class DirectiveType(str, Enum): + CONSTRAINT = "constraint" # "Budget: DKK 500M", "Timeline: 12 months" + STATED_FACT = "stated_fact" # "The East Wing has already been demolished" + REQUIREMENT = "requirement" # "Build a casino", "Reeducate teachers" + BANNED = "banned" # "Banned words: blockchain/NFT" + INTENT = "intent" # "I'm not targeting revenue", tone/posture signals +``` + +Each directive has: +- `directive_id`: "D1", "D2", etc. +- `directive_type`: one of the types above +- `text`: the user's words (short quote or paraphrase) +- `importance_5`: 1 (minor detail) to 5 (core requirement) + +The LLM is instructed to extract 5-15 directives, prioritizing things that are easy to dilute: stated facts about the world, hard numbers, explicit scope boundaries, banned words, and the user's posture (execute vs. study). + +### Phase 2 — Score Against Final Plan + +Read the extracted directives plus the final plan artifacts (executive summary, project plan, consolidated assumptions). For each directive, score adherence. 
+ +```python +class AdherenceCategory(str, Enum): + FULLY_HONORED = "fully_honored" + PARTIALLY_HONORED = "partially_honored" + SOFTENED = "softened" # requirement weakened + IGNORED = "ignored" # not addressed at all + CONTRADICTED = "contradicted" # plan says the opposite + UNSOLICITED_CAVEAT = "unsolicited_caveat" # plan adds qualifications user didn't ask for +``` + +Each scoring result has: +- `directive_id`: references a Phase 1 directive +- `adherence_5`: 1 (ignored/contradicted) to 5 (fully honored) +- `category`: one of the categories above +- `evidence`: direct quote from the plan (under 200 chars) +- `explanation`: how the plan handled this directive and why the score was given + +### Output Files + +- `prompt_adherence_raw.json` — full structured data (directives + scores + metadata) +- `prompt_adherence.md` — human-readable report + +### Markdown Report Structure + +1. **Summary table** — all directives sorted by severity (importance_5 x (6 - adherence_5), worst offenders first): + +``` +| ID | Directive | Type | Importance | Adherence | Category | +|----|-----------|------|------------|-----------|----------| +| D3 | "East Wing already demolished" | stated_fact | 5/5 | 1/5 | contradicted | +| D1 | "Budget: DKK 500M" | constraint | 5/5 | 3/5 | softened | +| D7 | "No feasibility studies" | intent | 4/5 | 2/5 | ignored | +``` + +2. **Overall adherence score** — weighted average: `sum(adherence_5 * importance_5) / sum(5 * importance_5)` as a percentage. A plan that fully honors everything scores 100%. + +3. **Detail section** — for each directive scoring adherence_5 ≤ 3, the full explanation and evidence quotes from both the prompt and the plan. + +### Pipeline Placement + +After `self_audit`, before `report`. 
The task reads: +- `setup` — plan.txt (the original user prompt) +- `executive_summary` — the final plan summary +- `project_plan` — the detailed plan +- `consolidate_assumptions_markdown` — accumulated assumptions that may have drifted + +The report task includes `prompt_adherence.md` in the final HTML output. + +### FilenameEnum Entries + +```python +PROMPT_ADHERENCE_RAW = "prompt_adherence_raw.json" +PROMPT_ADHERENCE_MARKDOWN = "prompt_adherence.md" +``` + +### Code Structure + +``` +worker_plan/worker_plan_internal/ + diagnostics/ + prompt_adherence.py — Phase 1 + Phase 2 logic, Pydantic models, markdown generation + plan/nodes/ + prompt_adherence.py — Luigi task (PromptAdherenceTask) +``` + +Follows the same pattern as `premortem.py` / `nodes/premortem.py`: +- Business logic in `diagnostics/prompt_adherence.py` +- Luigi wiring in `plan/nodes/prompt_adherence.py` +- Pydantic structured output via `llm.as_structured_llm()` +- `LLMExecutor` for model fallback and retry + +### Scope Boundaries + +**In scope:** +- Extract directives from plan.txt +- Score each directive against the final plan +- Produce JSON + markdown report +- Integrate as a Luigi pipeline step +- Include in the final HTML report + +**Out of scope:** +- Fixing the drift (this step surfaces it, doesn't correct it) +- Tracing where in the pipeline drift was introduced (that's RCA's job) +- Judging plan quality (that's self_audit's job) +- Comparing multiple plans against each other From 818686d8473b6745be6cc75a198ae9bf13c7bafd Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 17:21:00 +0200 Subject: [PATCH 02/19] docs: add prompt adherence implementation plan Co-Authored-By: Claude Opus 4.6 (1M context) --- .../plans/2026-04-09-prompt-adherence.md | 683 ++++++++++++++++++ 1 file changed, 683 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-09-prompt-adherence.md diff --git a/docs/superpowers/plans/2026-04-09-prompt-adherence.md 
b/docs/superpowers/plans/2026-04-09-prompt-adherence.md new file mode 100644 index 00000000..7e53db97 --- /dev/null +++ b/docs/superpowers/plans/2026-04-09-prompt-adherence.md @@ -0,0 +1,683 @@ +# Prompt Adherence Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build a pipeline step that checks the final plan against the original user prompt and produces a scored report showing which user directives were honored, softened, or ignored. + +**Architecture:** Two-phase LLM approach (extract directives, then score each against the plan). Follows the same pattern as `premortem.py`: Pydantic structured output, `LLMExecutor` for model fallback, dataclass for results with `save_raw`/`save_markdown` methods. Luigi task wired after `self_audit`, before `report`. + +**Tech Stack:** Python 3.13, llama-index (structured LLM output), Pydantic v2, Luigi + +--- + +## File Structure + +``` +worker_plan/worker_plan_internal/ + diagnostics/ + prompt_adherence.py — Phase 1 + Phase 2 logic, Pydantic models, markdown generation + tests/ + test_prompt_adherence.py — Unit tests for Pydantic models and markdown generation + plan/nodes/ + prompt_adherence.py — Luigi task (PromptAdherenceTask) +worker_plan/worker_plan_api/ + filenames.py — Add PROMPT_ADHERENCE_RAW, PROMPT_ADHERENCE_MARKDOWN +``` + +--- + +### Task 1: FilenameEnum entries + +**Files:** +- Modify: `worker_plan/worker_plan_api/filenames.py` + +- [ ] **Step 1: Add filename entries** + +Add after the `SELF_AUDIT_MARKDOWN` line: + +```python + PROMPT_ADHERENCE_RAW = "prompt_adherence_raw.json" + PROMPT_ADHERENCE_MARKDOWN = "prompt_adherence.md" +``` + +- [ ] **Step 2: Verify import works** + +Run: `cd worker_plan && .venv/bin/python -c "from worker_plan_api.filenames import FilenameEnum; 
print(FilenameEnum.PROMPT_ADHERENCE_RAW.value)"` +Expected: `prompt_adherence_raw.json` + +- [ ] **Step 3: Commit** + +```bash +git add worker_plan/worker_plan_api/filenames.py +git commit -m "feat: add FilenameEnum entries for prompt adherence" +``` + +--- + +### Task 2: Pydantic models and prompt logic + +**Files:** +- Create: `worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py` +- Create: `worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py` + +- [ ] **Step 1: Write the failing tests** + +```python +# worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py +import unittest +from worker_plan_internal.diagnostics.prompt_adherence import ( + DirectiveType, + Directive, + DirectiveExtractionResult, + AdherenceCategory, + AdherenceResult, + AdherenceScoreResult, + PromptAdherence, +) + + +class TestDirectiveModel(unittest.TestCase): + def test_directive_valid(self): + d = Directive( + directive_id="D1", + directive_type=DirectiveType.CONSTRAINT, + text="Budget: DKK 500M", + importance_5=5, + ) + self.assertEqual(d.directive_id, "D1") + self.assertEqual(d.directive_type, DirectiveType.CONSTRAINT) + self.assertEqual(d.importance_5, 5) + + def test_directive_extraction_result(self): + result = DirectiveExtractionResult( + directives=[ + Directive(directive_id="D1", directive_type=DirectiveType.CONSTRAINT, text="Budget: DKK 500M", importance_5=5), + Directive(directive_id="D2", directive_type=DirectiveType.STATED_FACT, text="East Wing demolished", importance_5=5), + ] + ) + self.assertEqual(len(result.directives), 2) + + +class TestAdherenceResultModel(unittest.TestCase): + def test_adherence_result_valid(self): + r = AdherenceResult( + directive_id="D1", + adherence_5=3, + category=AdherenceCategory.SOFTENED, + evidence="Budget adjusted to DKK 800M", + explanation="The plan increased the budget beyond the stated constraint.", + ) + self.assertEqual(r.adherence_5, 3) + self.assertEqual(r.category, 
AdherenceCategory.SOFTENED) + + def test_adherence_score_result(self): + result = AdherenceScoreResult( + results=[ + AdherenceResult( + directive_id="D1", adherence_5=5, + category=AdherenceCategory.FULLY_HONORED, + evidence="Budget: DKK 500M", explanation="Honored exactly.", + ), + AdherenceResult( + directive_id="D2", adherence_5=1, + category=AdherenceCategory.CONTRADICTED, + evidence="Demolition permit required", explanation="Plan ignores stated fact.", + ), + ] + ) + self.assertEqual(len(result.results), 2) + + +class TestPromptAdherenceMarkdown(unittest.TestCase): + def test_convert_to_markdown_produces_report(self): + directives = DirectiveExtractionResult( + directives=[ + Directive(directive_id="D1", directive_type=DirectiveType.CONSTRAINT, text="Budget: DKK 500M", importance_5=5), + Directive(directive_id="D2", directive_type=DirectiveType.STATED_FACT, text="East Wing demolished", importance_5=5), + ] + ) + scores = AdherenceScoreResult( + results=[ + AdherenceResult( + directive_id="D1", adherence_5=5, + category=AdherenceCategory.FULLY_HONORED, + evidence="Budget: DKK 500M", explanation="Honored.", + ), + AdherenceResult( + directive_id="D2", adherence_5=1, + category=AdherenceCategory.CONTRADICTED, + evidence="Demolition permit required", + explanation="Plan contradicts stated fact.", + ), + ] + ) + markdown = PromptAdherence.convert_to_markdown(directives, scores) + self.assertIn("# Prompt Adherence Report", markdown) + self.assertIn("Budget: DKK 500M", markdown) + self.assertIn("contradicted", markdown) + self.assertIn("Overall Adherence", markdown) + + def test_overall_score_calculation(self): + # D1: importance=5, adherence=5 -> weighted=25 + # D2: importance=5, adherence=1 -> weighted=5 + # total weighted = 30, max = 50, score = 60% + directives = DirectiveExtractionResult( + directives=[ + Directive(directive_id="D1", directive_type=DirectiveType.CONSTRAINT, text="A", importance_5=5), + Directive(directive_id="D2", 
directive_type=DirectiveType.STATED_FACT, text="B", importance_5=5), + ] + ) + scores = AdherenceScoreResult( + results=[ + AdherenceResult(directive_id="D1", adherence_5=5, category=AdherenceCategory.FULLY_HONORED, evidence="", explanation=""), + AdherenceResult(directive_id="D2", adherence_5=1, category=AdherenceCategory.CONTRADICTED, evidence="", explanation=""), + ] + ) + score = PromptAdherence.calculate_overall_score(directives, scores) + self.assertEqual(score, 60) + + def test_overall_score_empty(self): + directives = DirectiveExtractionResult(directives=[]) + scores = AdherenceScoreResult(results=[]) + score = PromptAdherence.calculate_overall_score(directives, scores) + self.assertEqual(score, 100) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd worker_plan && .venv/bin/python -m pytest worker_plan_internal/diagnostics/tests/test_prompt_adherence.py -v` +Expected: FAIL with `ModuleNotFoundError` + +- [ ] **Step 3: Implement prompt_adherence.py** + +```python +# worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py +""" +Prompt Adherence: check how faithfully the final plan follows the original user prompt. + +Phase 1: Extract directives (constraints, stated facts, requirements, banned words, intent) from plan.txt. +Phase 2: Score each directive against the final plan artifacts. 
+ +PROMPT> python -m worker_plan_internal.diagnostics.prompt_adherence +""" +import json +import logging +from enum import Enum +from dataclasses import dataclass +from typing import List +from pydantic import BaseModel, Field +from llama_index.core.llms import ChatMessage, MessageRole +from llama_index.core.llms.llm import LLM +from worker_plan_internal.llm_util.llm_executor import LLMExecutor, PipelineStopRequested +from worker_plan_internal.llm_util.llm_errors import LLMChatError + +logger = logging.getLogger(__name__) + + +# -- Pydantic models for Phase 1: Directive Extraction ------------------------- + +class DirectiveType(str, Enum): + CONSTRAINT = "constraint" + STATED_FACT = "stated_fact" + REQUIREMENT = "requirement" + BANNED = "banned" + INTENT = "intent" + + +class Directive(BaseModel): + directive_id: str = Field(description="Enumerate as 'D1', 'D2', 'D3', etc.") + directive_type: DirectiveType = Field(description=( + "constraint: explicit numeric or scope limits (budget, timeline, capacity). " + "stated_fact: things the user says are already true about the world. " + "requirement: what must be built or done. " + "banned: words, approaches, or technologies the user explicitly prohibits. " + "intent: the user's posture, tone, or implied expectations about execution vs. study." + )) + text: str = Field(description="The user's words — short quote or close paraphrase (under 100 chars).") + importance_5: int = Field(description="1 (minor detail) to 5 (core requirement). 
Rate how central this is to the user's request.") + + +class DirectiveExtractionResult(BaseModel): + directives: List[Directive] = Field(description="5-15 directives extracted from the user's prompt.") + + +# -- Pydantic models for Phase 2: Adherence Scoring --------------------------- + +class AdherenceCategory(str, Enum): + FULLY_HONORED = "fully_honored" + PARTIALLY_HONORED = "partially_honored" + SOFTENED = "softened" + IGNORED = "ignored" + CONTRADICTED = "contradicted" + UNSOLICITED_CAVEAT = "unsolicited_caveat" + + +class AdherenceResult(BaseModel): + directive_id: str = Field(description="References a directive from Phase 1.") + adherence_5: int = Field(description="1 (ignored/contradicted) to 5 (fully honored).") + category: AdherenceCategory = Field(description=( + "fully_honored: plan respects this exactly. " + "partially_honored: plan addresses it but incompletely. " + "softened: plan weakens the requirement. " + "ignored: plan doesn't address it at all. " + "contradicted: plan says the opposite. " + "unsolicited_caveat: plan adds qualifications the user didn't ask for." + )) + evidence: str = Field(description="Direct quote from the plan (under 200 chars).") + explanation: str = Field(description="How the plan handled this directive and why this score was given.") + + +class AdherenceScoreResult(BaseModel): + results: List[AdherenceResult] = Field(description="One scoring result per directive from Phase 1.") + + +# -- System prompts ------------------------------------------------------------ + +EXTRACT_DIRECTIVES_SYSTEM_PROMPT = """\ +You are analyzing the original user prompt for a project planning pipeline. + +Your job is to extract the user's directives — the things the plan MUST respect. \ +These are the user's stated constraints, facts about the world, requirements, \ +banned items, and implied intent. 
+ +Focus on things that are easy for a planning pipeline to dilute: +- Stated facts about the current state of the world (e.g., "the building is already demolished") +- Hard numeric constraints (budget, timeline, capacity) +- Explicit scope boundaries (what to build, what NOT to build) +- Banned words or approaches +- The user's posture: are they saying "execute this" or "study whether to do this"? + +Extract 5-15 directives. Prioritize specificity over quantity. \ +Rate importance from 1 (minor detail) to 5 (core requirement). + +Do NOT extract generic project management advice. \ +Only extract what the USER specifically stated or clearly implied. +""" + +SCORE_ADHERENCE_SYSTEM_PROMPT = """\ +You are checking whether a project plan faithfully follows the user's original directives. + +You will receive: +1. The user's original prompt +2. A list of extracted directives (what the user asked for) +3. The final plan artifacts + +For each directive, score how well the plan honored it: +- adherence_5: 1 (ignored or contradicted) to 5 (fully honored) +- category: what happened to this directive in the plan +- evidence: quote from the plan (under 200 chars) showing how it was handled +- explanation: why you gave this score + +Be strict. The user wrote their prompt for a reason. If the plan softens \ +"100% renewable" to "aim for 60-80%", that is SOFTENED, not PARTIALLY_HONORED. \ +If the user says "the East Wing is already demolished" and the plan includes \ +demolition permitting, that is CONTRADICTED. + +Plans that add feasibility studies, risk disclaimers, or scope reductions that \ +the user didn't ask for should be flagged as UNSOLICITED_CAVEAT. + +Plans that use generic project management boilerplate instead of addressing \ +the specific problem should score low on adherence. 
+""" + + +# -- Business logic ------------------------------------------------------------ + +@dataclass +class PromptAdherence: + system_prompt_phase1: str + system_prompt_phase2: str + user_prompt: str + directives: dict + scores: dict + metadata: dict + markdown: str + + @classmethod + def execute(cls, llm_executor: LLMExecutor, plan_prompt: str, plan_context: str) -> 'PromptAdherence': + if not isinstance(llm_executor, LLMExecutor): + raise ValueError("Invalid LLMExecutor instance.") + if not isinstance(plan_prompt, str): + raise ValueError("Invalid plan_prompt.") + if not isinstance(plan_context, str): + raise ValueError("Invalid plan_context.") + + system_prompt_phase1 = EXTRACT_DIRECTIVES_SYSTEM_PROMPT.strip() + system_prompt_phase2 = SCORE_ADHERENCE_SYSTEM_PROMPT.strip() + + # Phase 1: Extract directives from the original prompt + logger.info("Prompt Adherence Phase 1: Extracting directives from plan prompt...") + phase1_messages = [ + ChatMessage(role=MessageRole.SYSTEM, content=system_prompt_phase1), + ChatMessage(role=MessageRole.USER, content=f"User's original prompt:\n{plan_prompt}"), + ] + + def execute_phase1(llm: LLM) -> dict: + sllm = llm.as_structured_llm(DirectiveExtractionResult) + chat_response = sllm.chat(phase1_messages) + metadata = dict(llm.metadata) + metadata["llm_classname"] = llm.class_name() + return {"pydantic_response": chat_response.raw, "metadata": metadata} + + try: + phase1_result = llm_executor.run(execute_phase1) + except PipelineStopRequested: + raise + except Exception as e: + llm_error = LLMChatError(cause=e) + logger.error(f"Phase 1 failed [{llm_error.error_id}]", exc_info=True) + raise llm_error from e + + extraction: DirectiveExtractionResult = phase1_result["pydantic_response"] + logger.info(f"Phase 1 complete: extracted {len(extraction.directives)} directives.") + + # Phase 2: Score each directive against the plan + logger.info("Prompt Adherence Phase 2: Scoring directives against final plan...") + directives_json = 
json.dumps(extraction.model_dump(), indent=2) + phase2_messages = [ + ChatMessage(role=MessageRole.SYSTEM, content=system_prompt_phase2), + ChatMessage(role=MessageRole.USER, content=( + f"User's original prompt:\n{plan_prompt}\n\n" + f"Extracted directives:\n{directives_json}\n\n" + f"Final plan artifacts:\n{plan_context}" + )), + ] + + def execute_phase2(llm: LLM) -> dict: + sllm = llm.as_structured_llm(AdherenceScoreResult) + chat_response = sllm.chat(phase2_messages) + metadata = dict(llm.metadata) + metadata["llm_classname"] = llm.class_name() + return {"pydantic_response": chat_response.raw, "metadata": metadata} + + try: + phase2_result = llm_executor.run(execute_phase2) + except PipelineStopRequested: + raise + except Exception as e: + llm_error = LLMChatError(cause=e) + logger.error(f"Phase 2 failed [{llm_error.error_id}]", exc_info=True) + raise llm_error from e + + scoring: AdherenceScoreResult = phase2_result["pydantic_response"] + logger.info(f"Phase 2 complete: scored {len(scoring.results)} directives.") + + metadata = { + "phase1": phase1_result["metadata"], + "phase2": phase2_result["metadata"], + } + markdown = cls.convert_to_markdown(extraction, scoring) + + return PromptAdherence( + system_prompt_phase1=system_prompt_phase1, + system_prompt_phase2=system_prompt_phase2, + user_prompt=plan_prompt, + directives=extraction.model_dump(), + scores=scoring.model_dump(), + metadata=metadata, + markdown=markdown, + ) + + def to_dict(self, include_metadata=True, include_system_prompt=True, include_user_prompt=True, include_markdown=True) -> dict: + d = { + "directives": self.directives, + "scores": self.scores, + } + if include_metadata: + d["metadata"] = self.metadata + if include_system_prompt: + d["system_prompt_phase1"] = self.system_prompt_phase1 + d["system_prompt_phase2"] = self.system_prompt_phase2 + if include_user_prompt: + d["user_prompt"] = self.user_prompt + if include_markdown: + d["markdown"] = self.markdown + return d + + def save_raw(self, 
file_path: str) -> None: + with open(file_path, 'w') as f: + f.write(json.dumps(self.to_dict(), indent=2)) + + def save_markdown(self, output_file_path: str) -> None: + with open(output_file_path, 'w', encoding='utf-8') as f: + f.write(self.markdown) + + @staticmethod + def calculate_overall_score(directives: DirectiveExtractionResult, scores: AdherenceScoreResult) -> int: + """Weighted average: sum(adherence_5 * importance_5) / sum(5 * importance_5) as integer percentage.""" + if not directives.directives: + return 100 + importance_map = {d.directive_id: d.importance_5 for d in directives.directives} + weighted_sum = 0 + max_sum = 0 + for r in scores.results: + importance = importance_map.get(r.directive_id, 3) + weighted_sum += r.adherence_5 * importance + max_sum += 5 * importance + if max_sum == 0: + return 100 + return round(weighted_sum * 100 / max_sum) + + @staticmethod + def convert_to_markdown(directives: DirectiveExtractionResult, scores: AdherenceScoreResult) -> str: + lines: list[str] = [] + lines.append("# Prompt Adherence Report") + lines.append("") + + # Build lookup + importance_map = {d.directive_id: d for d in directives.directives} + + # Calculate overall score + overall = PromptAdherence.calculate_overall_score(directives, scores) + lines.append(f"**Overall Adherence: {overall}%**") + lines.append("") + + # Sort by severity: importance * (6 - adherence), worst first + scored_items = [] + for r in scores.results: + d = importance_map.get(r.directive_id) + importance = d.importance_5 if d else 3 + severity = importance * (6 - r.adherence_5) + scored_items.append((severity, d, r)) + scored_items.sort(key=lambda x: x[0], reverse=True) + + # Summary table + lines.append("## Summary") + lines.append("") + lines.append("| ID | Directive | Type | Importance | Adherence | Category |") + lines.append("|----|-----------|------|------------|-----------|----------|") + for _, d, r in scored_items: + directive_text = d.text if d else "Unknown" + 
directive_type = d.directive_type.value if d else "unknown" + lines.append( + f"| {r.directive_id} | {_escape_table_cell(directive_text)} " + f"| {directive_type} | {d.importance_5 if d else '?'}/5 " + f"| {r.adherence_5}/5 | {r.category.value} |" + ) + lines.append("") + + # Detail section for poorly-scored directives + poor_items = [(sev, d, r) for sev, d, r in scored_items if r.adherence_5 <= 3] + if poor_items: + lines.append("## Issues") + lines.append("") + for _, d, r in poor_items: + directive_text = d.text if d else "Unknown" + lines.append(f"### {r.directive_id}: {directive_text}") + lines.append("") + lines.append(f"- **Category:** {r.category.value}") + lines.append(f"- **Adherence:** {r.adherence_5}/5") + lines.append(f"- **Importance:** {d.importance_5 if d else '?'}/5") + lines.append(f"- **Evidence:** {r.evidence}") + lines.append(f"- **Explanation:** {r.explanation}") + lines.append("") + + return "\n".join(lines) + + +def _escape_table_cell(text: str) -> str: + return text.replace("|", "\\|").replace("\n", " ") +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd worker_plan && .venv/bin/python -m pytest worker_plan_internal/diagnostics/tests/test_prompt_adherence.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py +git commit -m "feat: add prompt adherence Pydantic models, prompts, and markdown generation" +``` + +--- + +### Task 3: Luigi task + +**Files:** +- Create: `worker_plan/worker_plan_internal/plan/nodes/prompt_adherence.py` + +- [ ] **Step 1: Implement the Luigi task** + +```python +# worker_plan/worker_plan_internal/plan/nodes/prompt_adherence.py +"""PromptAdherenceTask - Check how faithfully the plan follows the original prompt.""" +from worker_plan_internal.plan.run_plan_pipeline import PlanTask +from worker_plan_internal.diagnostics.prompt_adherence 
import PromptAdherence +from worker_plan_internal.llm_util.llm_executor import LLMExecutor +from worker_plan_api.filenames import FilenameEnum +from worker_plan_internal.plan.nodes.setup import SetupTask +from worker_plan_internal.plan.nodes.project_plan import ProjectPlanTask +from worker_plan_internal.plan.nodes.executive_summary import ExecutiveSummaryTask +from worker_plan_internal.plan.nodes.consolidate_assumptions_markdown import ConsolidateAssumptionsMarkdownTask + + +class PromptAdherenceTask(PlanTask): + """Score how faithfully the final plan follows the user's original prompt.""" + + def output(self): + return { + 'raw': self.local_target(FilenameEnum.PROMPT_ADHERENCE_RAW), + 'markdown': self.local_target(FilenameEnum.PROMPT_ADHERENCE_MARKDOWN), + } + + def requires(self): + return { + 'setup': self.clone(SetupTask), + 'project_plan': self.clone(ProjectPlanTask), + 'executive_summary': self.clone(ExecutiveSummaryTask), + 'consolidate_assumptions_markdown': self.clone(ConsolidateAssumptionsMarkdownTask), + } + + def run_inner(self): + llm_executor: LLMExecutor = self.create_llm_executor() + + with self.input()['setup'].open("r") as f: + plan_prompt = f.read() + with self.input()['project_plan']['markdown'].open("r") as f: + project_plan_markdown = f.read() + with self.input()['executive_summary']['markdown'].open("r") as f: + executive_summary_markdown = f.read() + with self.input()['consolidate_assumptions_markdown']['full'].open("r") as f: + assumptions_markdown = f.read() + + plan_context = ( + f"File 'executive_summary.md':\n{executive_summary_markdown}\n\n" + f"File 'project_plan.md':\n{project_plan_markdown}\n\n" + f"File 'consolidate_assumptions_full.md':\n{assumptions_markdown}" + ) + + result = PromptAdherence.execute( + llm_executor=llm_executor, + plan_prompt=plan_prompt, + plan_context=plan_context, + ) + + result.save_raw(self.output()['raw'].path) + result.save_markdown(self.output()['markdown'].path) +``` + +- [ ] **Step 2: Verify import 
works** + +Run: `cd worker_plan && .venv/bin/python -c "from worker_plan_internal.plan.nodes.prompt_adherence import PromptAdherenceTask; print('OK')"` +Expected: `OK` + +- [ ] **Step 3: Commit** + +```bash +git add worker_plan/worker_plan_internal/plan/nodes/prompt_adherence.py +git commit -m "feat: add PromptAdherenceTask Luigi node" +``` + +--- + +### Task 4: Wire into pipeline and report + +**Files:** +- Modify: `worker_plan/worker_plan_internal/plan/nodes/full_plan_pipeline.py` +- Modify: `worker_plan/worker_plan_internal/plan/nodes/report.py` + +- [ ] **Step 1: Add to full_plan_pipeline.py** + +Add the import at the top with the other node imports: + +```python +from worker_plan_internal.plan.nodes.prompt_adherence import PromptAdherenceTask +``` + +Add to the `requires()` dict, after `'self_audit'` and before `'report'`: + +```python + 'prompt_adherence': self.clone(PromptAdherenceTask), +``` + +- [ ] **Step 2: Add to report.py** + +Add the import at the top: + +```python +from worker_plan_internal.plan.nodes.prompt_adherence import PromptAdherenceTask +``` + +Add to `requires()` dict: + +```python + 'prompt_adherence': self.clone(PromptAdherenceTask), +``` + +In `run_inner()`, find where `self_audit` is appended and add after it: + +```python + rg.append_markdown_with_tables('Prompt Adherence', self.input()['prompt_adherence']['markdown'].path) +``` + +- [ ] **Step 3: Run full test suite** + +Run: `cd worker_plan && .venv/bin/python -m pytest -q` +Expected: All tests pass + +- [ ] **Step 4: Commit** + +```bash +git add worker_plan/worker_plan_internal/plan/nodes/full_plan_pipeline.py worker_plan/worker_plan_internal/plan/nodes/report.py +git commit -m "feat: wire PromptAdherenceTask into pipeline and report" +``` + +--- + +### Task 5: Integration verification + +- [ ] **Step 1: Verify extract_dag picks up the new node** + +Run: `cd worker_plan && .venv/bin/python -c "from worker_plan_internal.extract_dag import extract_dag; dag = extract_dag(); nodes = 
{n['id'] for n in dag['nodes']}; assert 'prompt_adherence' in nodes; print(f'OK: {len(nodes)} nodes')"` +Expected: `OK: <count> nodes` (one more than before) + +- [ ] **Step 2: Run full test suite** + +Run: `cd worker_plan && .venv/bin/python -m pytest -q` +Expected: All tests pass, no regressions + +- [ ] **Step 3: Commit any fixes** + +Only if step 2 revealed issues. Otherwise skip. From 4d8bea076c4fefce966bdfdd41d5cc85af5f0a4f Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 17:23:12 +0200 Subject: [PATCH 03/19] feat: add FilenameEnum entries for prompt adherence Co-Authored-By: Claude Opus 4.6 (1M context) --- worker_plan/worker_plan_api/filenames.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/worker_plan/worker_plan_api/filenames.py b/worker_plan/worker_plan_api/filenames.py index fc7ba624..92e1c6bc 100644 --- a/worker_plan/worker_plan_api/filenames.py +++ b/worker_plan/worker_plan_api/filenames.py @@ -128,6 +128,8 @@ class FilenameEnum(str, Enum): PREMORTEM_MARKDOWN = "premortem.md" SELF_AUDIT_RAW = "self_audit_raw.json" SELF_AUDIT_MARKDOWN = "self_audit.md" + PROMPT_ADHERENCE_RAW = "prompt_adherence_raw.json" + PROMPT_ADHERENCE_MARKDOWN = "prompt_adherence.md" REPORT = "report.html" PIPELINE_COMPLETE = "pipeline_complete.txt" From 06bfad3ede63ecb6ef5bf794350f58a63d243cdf Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 17:26:19 +0200 Subject: [PATCH 04/19] feat: add prompt adherence Pydantic models, prompts, and markdown generation Co-Authored-By: Claude Opus 4.6 (1M context) --- .../diagnostics/prompt_adherence.py | 326 ++++++++++++++++++ .../tests/test_prompt_adherence.py | 115 ++++++ 2 files changed, 441 insertions(+) create mode 100644 worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py create mode 100644 worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py diff --git a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py 
b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py new file mode 100644 index 00000000..a1763699 --- /dev/null +++ b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py @@ -0,0 +1,326 @@ +# worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py +""" +Prompt Adherence: check how faithfully the final plan follows the original user prompt. + +Phase 1: Extract directives (constraints, stated facts, requirements, banned words, intent) from plan.txt. +Phase 2: Score each directive against the final plan artifacts. + +PROMPT> python -m worker_plan_internal.diagnostics.prompt_adherence +""" +import json +import logging +from enum import Enum +from dataclasses import dataclass +from typing import List +from pydantic import BaseModel, Field +from llama_index.core.llms import ChatMessage, MessageRole +from llama_index.core.llms.llm import LLM +from worker_plan_internal.llm_util.llm_executor import LLMExecutor, PipelineStopRequested +from worker_plan_internal.llm_util.llm_errors import LLMChatError + +logger = logging.getLogger(__name__) + + +# -- Pydantic models for Phase 1: Directive Extraction ------------------------- + +class DirectiveType(str, Enum): + CONSTRAINT = "constraint" + STATED_FACT = "stated_fact" + REQUIREMENT = "requirement" + BANNED = "banned" + INTENT = "intent" + + +class Directive(BaseModel): + directive_id: str = Field(description="Enumerate as 'D1', 'D2', 'D3', etc.") + directive_type: DirectiveType = Field(description=( + "constraint: explicit numeric or scope limits (budget, timeline, capacity). " + "stated_fact: things the user says are already true about the world. " + "requirement: what must be built or done. " + "banned: words, approaches, or technologies the user explicitly prohibits. " + "intent: the user's posture, tone, or implied expectations about execution vs. study." 
+ )) + text: str = Field(description="The user's words — short quote or close paraphrase (under 100 chars).") + importance_5: int = Field(description="1 (minor detail) to 5 (core requirement). Rate how central this is to the user's request.") + + +class DirectiveExtractionResult(BaseModel): + directives: List[Directive] = Field(description="5-15 directives extracted from the user's prompt.") + + +# -- Pydantic models for Phase 2: Adherence Scoring --------------------------- + +class AdherenceCategory(str, Enum): + FULLY_HONORED = "fully_honored" + PARTIALLY_HONORED = "partially_honored" + SOFTENED = "softened" + IGNORED = "ignored" + CONTRADICTED = "contradicted" + UNSOLICITED_CAVEAT = "unsolicited_caveat" + + +class AdherenceResult(BaseModel): + directive_id: str = Field(description="References a directive from Phase 1.") + adherence_5: int = Field(description="1 (ignored/contradicted) to 5 (fully honored).") + category: AdherenceCategory = Field(description=( + "fully_honored: plan respects this exactly. " + "partially_honored: plan addresses it but incompletely. " + "softened: plan weakens the requirement. " + "ignored: plan doesn't address it at all. " + "contradicted: plan says the opposite. " + "unsolicited_caveat: plan adds qualifications the user didn't ask for." + )) + evidence: str = Field(description="Direct quote from the plan (under 200 chars).") + explanation: str = Field(description="How the plan handled this directive and why this score was given.") + + +class AdherenceScoreResult(BaseModel): + results: List[AdherenceResult] = Field(description="One scoring result per directive from Phase 1.") + + +# -- System prompts ------------------------------------------------------------ + +EXTRACT_DIRECTIVES_SYSTEM_PROMPT = """\ +You are analyzing the original user prompt for a project planning pipeline. + +Your job is to extract the user's directives — the things the plan MUST respect. 
\ +These are the user's stated constraints, facts about the world, requirements, \ +banned items, and implied intent. + +Focus on things that are easy for a planning pipeline to dilute: +- Stated facts about the current state of the world (e.g., "the building is already demolished") +- Hard numeric constraints (budget, timeline, capacity) +- Explicit scope boundaries (what to build, what NOT to build) +- Banned words or approaches +- The user's posture: are they saying "execute this" or "study whether to do this"? + +Extract 5-15 directives. Prioritize specificity over quantity. \ +Rate importance from 1 (minor detail) to 5 (core requirement). + +Do NOT extract generic project management advice. \ +Only extract what the USER specifically stated or clearly implied. +""" + +SCORE_ADHERENCE_SYSTEM_PROMPT = """\ +You are checking whether a project plan faithfully follows the user's original directives. + +You will receive: +1. The user's original prompt +2. A list of extracted directives (what the user asked for) +3. The final plan artifacts + +For each directive, score how well the plan honored it: +- adherence_5: 1 (ignored or contradicted) to 5 (fully honored) +- category: what happened to this directive in the plan +- evidence: quote from the plan (under 200 chars) showing how it was handled +- explanation: why you gave this score + +Be strict. The user wrote their prompt for a reason. If the plan softens \ +"100% renewable" to "aim for 60-80%", that is SOFTENED, not PARTIALLY_HONORED. \ +If the user says "the East Wing is already demolished" and the plan includes \ +demolition permitting, that is CONTRADICTED. + +Plans that add feasibility studies, risk disclaimers, or scope reductions that \ +the user didn't ask for should be flagged as UNSOLICITED_CAVEAT. + +Plans that use generic project management boilerplate instead of addressing \ +the specific problem should score low on adherence. 
+""" + + +# -- Business logic ------------------------------------------------------------ + +@dataclass +class PromptAdherence: + system_prompt_phase1: str + system_prompt_phase2: str + user_prompt: str + directives: dict + scores: dict + metadata: dict + markdown: str + + @classmethod + def execute(cls, llm_executor: LLMExecutor, plan_prompt: str, plan_context: str) -> 'PromptAdherence': + if not isinstance(llm_executor, LLMExecutor): + raise ValueError("Invalid LLMExecutor instance.") + if not isinstance(plan_prompt, str): + raise ValueError("Invalid plan_prompt.") + if not isinstance(plan_context, str): + raise ValueError("Invalid plan_context.") + + system_prompt_phase1 = EXTRACT_DIRECTIVES_SYSTEM_PROMPT.strip() + system_prompt_phase2 = SCORE_ADHERENCE_SYSTEM_PROMPT.strip() + + # Phase 1: Extract directives from the original prompt + logger.info("Prompt Adherence Phase 1: Extracting directives from plan prompt...") + phase1_messages = [ + ChatMessage(role=MessageRole.SYSTEM, content=system_prompt_phase1), + ChatMessage(role=MessageRole.USER, content=f"User's original prompt:\n{plan_prompt}"), + ] + + def execute_phase1(llm: LLM) -> dict: + sllm = llm.as_structured_llm(DirectiveExtractionResult) + chat_response = sllm.chat(phase1_messages) + metadata = dict(llm.metadata) + metadata["llm_classname"] = llm.class_name() + return {"pydantic_response": chat_response.raw, "metadata": metadata} + + try: + phase1_result = llm_executor.run(execute_phase1) + except PipelineStopRequested: + raise + except Exception as e: + llm_error = LLMChatError(cause=e) + logger.error(f"Phase 1 failed [{llm_error.error_id}]", exc_info=True) + raise llm_error from e + + extraction: DirectiveExtractionResult = phase1_result["pydantic_response"] + logger.info(f"Phase 1 complete: extracted {len(extraction.directives)} directives.") + + # Phase 2: Score each directive against the plan + logger.info("Prompt Adherence Phase 2: Scoring directives against final plan...") + directives_json = 
json.dumps(extraction.model_dump(), indent=2) + phase2_messages = [ + ChatMessage(role=MessageRole.SYSTEM, content=system_prompt_phase2), + ChatMessage(role=MessageRole.USER, content=( + f"User's original prompt:\n{plan_prompt}\n\n" + f"Extracted directives:\n{directives_json}\n\n" + f"Final plan artifacts:\n{plan_context}" + )), + ] + + def execute_phase2(llm: LLM) -> dict: + sllm = llm.as_structured_llm(AdherenceScoreResult) + chat_response = sllm.chat(phase2_messages) + metadata = dict(llm.metadata) + metadata["llm_classname"] = llm.class_name() + return {"pydantic_response": chat_response.raw, "metadata": metadata} + + try: + phase2_result = llm_executor.run(execute_phase2) + except PipelineStopRequested: + raise + except Exception as e: + llm_error = LLMChatError(cause=e) + logger.error(f"Phase 2 failed [{llm_error.error_id}]", exc_info=True) + raise llm_error from e + + scoring: AdherenceScoreResult = phase2_result["pydantic_response"] + logger.info(f"Phase 2 complete: scored {len(scoring.results)} directives.") + + metadata = { + "phase1": phase1_result["metadata"], + "phase2": phase2_result["metadata"], + } + markdown = cls.convert_to_markdown(extraction, scoring) + + return PromptAdherence( + system_prompt_phase1=system_prompt_phase1, + system_prompt_phase2=system_prompt_phase2, + user_prompt=plan_prompt, + directives=extraction.model_dump(), + scores=scoring.model_dump(), + metadata=metadata, + markdown=markdown, + ) + + def to_dict(self, include_metadata=True, include_system_prompt=True, include_user_prompt=True, include_markdown=True) -> dict: + d = { + "directives": self.directives, + "scores": self.scores, + } + if include_metadata: + d["metadata"] = self.metadata + if include_system_prompt: + d["system_prompt_phase1"] = self.system_prompt_phase1 + d["system_prompt_phase2"] = self.system_prompt_phase2 + if include_user_prompt: + d["user_prompt"] = self.user_prompt + if include_markdown: + d["markdown"] = self.markdown + return d + + def save_raw(self, 
file_path: str) -> None: + with open(file_path, 'w') as f: + f.write(json.dumps(self.to_dict(), indent=2)) + + def save_markdown(self, output_file_path: str) -> None: + with open(output_file_path, 'w', encoding='utf-8') as f: + f.write(self.markdown) + + @staticmethod + def calculate_overall_score(directives: DirectiveExtractionResult, scores: AdherenceScoreResult) -> int: + """Weighted average: sum(adherence_5 * importance_5) / sum(5 * importance_5) as integer percentage.""" + if not directives.directives: + return 100 + importance_map = {d.directive_id: d.importance_5 for d in directives.directives} + weighted_sum = 0 + max_sum = 0 + for r in scores.results: + importance = importance_map.get(r.directive_id, 3) + weighted_sum += r.adherence_5 * importance + max_sum += 5 * importance + if max_sum == 0: + return 100 + return round(weighted_sum * 100 / max_sum) + + @staticmethod + def convert_to_markdown(directives: DirectiveExtractionResult, scores: AdherenceScoreResult) -> str: + lines: list[str] = [] + lines.append("# Prompt Adherence Report") + lines.append("") + + # Build lookup + importance_map = {d.directive_id: d for d in directives.directives} + + # Calculate overall score + overall = PromptAdherence.calculate_overall_score(directives, scores) + lines.append(f"**Overall Adherence: {overall}%**") + lines.append("") + + # Sort by severity: importance * (6 - adherence), worst first + scored_items = [] + for r in scores.results: + d = importance_map.get(r.directive_id) + importance = d.importance_5 if d else 3 + severity = importance * (6 - r.adherence_5) + scored_items.append((severity, d, r)) + scored_items.sort(key=lambda x: x[0], reverse=True) + + # Summary table + lines.append("## Summary") + lines.append("") + lines.append("| ID | Directive | Type | Importance | Adherence | Category |") + lines.append("|----|-----------|------|------------|-----------|----------|") + for _, d, r in scored_items: + directive_text = d.text if d else "Unknown" + 
directive_type = d.directive_type.value if d else "unknown" + lines.append( + f"| {r.directive_id} | {_escape_table_cell(directive_text)} " + f"| {directive_type} | {d.importance_5 if d else '?'}/5 " + f"| {r.adherence_5}/5 | {r.category.value} |" + ) + lines.append("") + + # Detail section for poorly-scored directives + poor_items = [(sev, d, r) for sev, d, r in scored_items if r.adherence_5 <= 3] + if poor_items: + lines.append("## Issues") + lines.append("") + for _, d, r in poor_items: + directive_text = d.text if d else "Unknown" + lines.append(f"### {r.directive_id}: {directive_text}") + lines.append("") + lines.append(f"- **Category:** {r.category.value}") + lines.append(f"- **Adherence:** {r.adherence_5}/5") + lines.append(f"- **Importance:** {d.importance_5 if d else '?'}/5") + lines.append(f"- **Evidence:** {r.evidence}") + lines.append(f"- **Explanation:** {r.explanation}") + lines.append("") + + return "\n".join(lines) + + +def _escape_table_cell(text: str) -> str: + return text.replace("|", "\\|").replace("\n", " ") diff --git a/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py new file mode 100644 index 00000000..dc4e6e41 --- /dev/null +++ b/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py @@ -0,0 +1,115 @@ +# worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py +import unittest +from worker_plan_internal.diagnostics.prompt_adherence import ( + DirectiveType, + Directive, + DirectiveExtractionResult, + AdherenceCategory, + AdherenceResult, + AdherenceScoreResult, + PromptAdherence, +) + + +class TestDirectiveModel(unittest.TestCase): + def test_directive_valid(self): + d = Directive( + directive_id="D1", + directive_type=DirectiveType.CONSTRAINT, + text="Budget: DKK 500M", + importance_5=5, + ) + self.assertEqual(d.directive_id, "D1") + self.assertEqual(d.directive_type, DirectiveType.CONSTRAINT) + 
self.assertEqual(d.importance_5, 5) + + def test_directive_extraction_result(self): + result = DirectiveExtractionResult( + directives=[ + Directive(directive_id="D1", directive_type=DirectiveType.CONSTRAINT, text="Budget: DKK 500M", importance_5=5), + Directive(directive_id="D2", directive_type=DirectiveType.STATED_FACT, text="East Wing demolished", importance_5=5), + ] + ) + self.assertEqual(len(result.directives), 2) + + +class TestAdherenceResultModel(unittest.TestCase): + def test_adherence_result_valid(self): + r = AdherenceResult( + directive_id="D1", + adherence_5=3, + category=AdherenceCategory.SOFTENED, + evidence="Budget adjusted to DKK 800M", + explanation="The plan increased the budget beyond the stated constraint.", + ) + self.assertEqual(r.adherence_5, 3) + self.assertEqual(r.category, AdherenceCategory.SOFTENED) + + def test_adherence_score_result(self): + result = AdherenceScoreResult( + results=[ + AdherenceResult( + directive_id="D1", adherence_5=5, + category=AdherenceCategory.FULLY_HONORED, + evidence="Budget: DKK 500M", explanation="Honored exactly.", + ), + AdherenceResult( + directive_id="D2", adherence_5=1, + category=AdherenceCategory.CONTRADICTED, + evidence="Demolition permit required", explanation="Plan ignores stated fact.", + ), + ] + ) + self.assertEqual(len(result.results), 2) + + +class TestPromptAdherenceMarkdown(unittest.TestCase): + def test_convert_to_markdown_produces_report(self): + directives = DirectiveExtractionResult( + directives=[ + Directive(directive_id="D1", directive_type=DirectiveType.CONSTRAINT, text="Budget: DKK 500M", importance_5=5), + Directive(directive_id="D2", directive_type=DirectiveType.STATED_FACT, text="East Wing demolished", importance_5=5), + ] + ) + scores = AdherenceScoreResult( + results=[ + AdherenceResult( + directive_id="D1", adherence_5=5, + category=AdherenceCategory.FULLY_HONORED, + evidence="Budget: DKK 500M", explanation="Honored.", + ), + AdherenceResult( + directive_id="D2", 
adherence_5=1, + category=AdherenceCategory.CONTRADICTED, + evidence="Demolition permit required", + explanation="Plan contradicts stated fact.", + ), + ] + ) + markdown = PromptAdherence.convert_to_markdown(directives, scores) + self.assertIn("# Prompt Adherence Report", markdown) + self.assertIn("Budget: DKK 500M", markdown) + self.assertIn("contradicted", markdown) + self.assertIn("Overall Adherence", markdown) + + def test_overall_score_calculation(self): + directives = DirectiveExtractionResult( + directives=[ + Directive(directive_id="D1", directive_type=DirectiveType.CONSTRAINT, text="A", importance_5=5), + Directive(directive_id="D2", directive_type=DirectiveType.STATED_FACT, text="B", importance_5=5), + ] + ) + scores = AdherenceScoreResult( + results=[ + AdherenceResult(directive_id="D1", adherence_5=5, category=AdherenceCategory.FULLY_HONORED, evidence="", explanation=""), + AdherenceResult(directive_id="D2", adherence_5=1, category=AdherenceCategory.CONTRADICTED, evidence="", explanation=""), + ] + ) + score = PromptAdherence.calculate_overall_score(directives, scores) + self.assertEqual(score, 60) + + def test_overall_score_empty(self): + directives = DirectiveExtractionResult(directives=[]) + scores = AdherenceScoreResult(results=[]) + score = PromptAdherence.calculate_overall_score(directives, scores) + self.assertEqual(score, 100) From 2ad532fe7c814e0166aac73c7324a5ae40cafb12 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 17:27:18 +0200 Subject: [PATCH 05/19] feat: add PromptAdherenceTask Luigi node Co-Authored-By: Claude Opus 4.6 (1M context) --- .../plan/nodes/prompt_adherence.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 worker_plan/worker_plan_internal/plan/nodes/prompt_adherence.py diff --git a/worker_plan/worker_plan_internal/plan/nodes/prompt_adherence.py b/worker_plan/worker_plan_internal/plan/nodes/prompt_adherence.py new file mode 100644 index 00000000..25287c17 --- /dev/null +++ 
b/worker_plan/worker_plan_internal/plan/nodes/prompt_adherence.py @@ -0,0 +1,54 @@ +"""PromptAdherenceTask - Check how faithfully the plan follows the original prompt.""" +from worker_plan_internal.plan.run_plan_pipeline import PlanTask +from worker_plan_internal.diagnostics.prompt_adherence import PromptAdherence +from worker_plan_internal.llm_util.llm_executor import LLMExecutor +from worker_plan_api.filenames import FilenameEnum +from worker_plan_internal.plan.nodes.setup import SetupTask +from worker_plan_internal.plan.nodes.project_plan import ProjectPlanTask +from worker_plan_internal.plan.nodes.executive_summary import ExecutiveSummaryTask +from worker_plan_internal.plan.nodes.consolidate_assumptions_markdown import ConsolidateAssumptionsMarkdownTask + + +class PromptAdherenceTask(PlanTask): + """Score how faithfully the final plan follows the user's original prompt.""" + + def output(self): + return { + 'raw': self.local_target(FilenameEnum.PROMPT_ADHERENCE_RAW), + 'markdown': self.local_target(FilenameEnum.PROMPT_ADHERENCE_MARKDOWN), + } + + def requires(self): + return { + 'setup': self.clone(SetupTask), + 'project_plan': self.clone(ProjectPlanTask), + 'executive_summary': self.clone(ExecutiveSummaryTask), + 'consolidate_assumptions_markdown': self.clone(ConsolidateAssumptionsMarkdownTask), + } + + def run_inner(self): + llm_executor: LLMExecutor = self.create_llm_executor() + + with self.input()['setup'].open("r") as f: + plan_prompt = f.read() + with self.input()['project_plan']['markdown'].open("r") as f: + project_plan_markdown = f.read() + with self.input()['executive_summary']['markdown'].open("r") as f: + executive_summary_markdown = f.read() + with self.input()['consolidate_assumptions_markdown']['full'].open("r") as f: + assumptions_markdown = f.read() + + plan_context = ( + f"File 'executive_summary.md':\n{executive_summary_markdown}\n\n" + f"File 'project_plan.md':\n{project_plan_markdown}\n\n" + f"File 
'consolidate_assumptions_full.md':\n{assumptions_markdown}" + ) + + result = PromptAdherence.execute( + llm_executor=llm_executor, + plan_prompt=plan_prompt, + plan_context=plan_context, + ) + + result.save_raw(self.output()['raw'].path) + result.save_markdown(self.output()['markdown'].path) From 59ad512768ffb7a8ce0e407ad18b9938584070ad Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 17:29:29 +0200 Subject: [PATCH 06/19] feat: wire PromptAdherenceTask into pipeline and report Add PromptAdherenceTask to full_plan_pipeline.py requires() dict and report.py requires() dict and run_inner(). Also fix bare Enum types in prompt_adherence.py Pydantic models to use Literal[...] as required by the codebase lint rules. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../diagnostics/prompt_adherence.py | 12 ++++++------ .../plan/nodes/full_plan_pipeline.py | 2 ++ .../worker_plan_internal/plan/nodes/report.py | 5 ++++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py index a1763699..ba8e0f72 100644 --- a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py +++ b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py @@ -11,7 +11,7 @@ import logging from enum import Enum from dataclasses import dataclass -from typing import List +from typing import List, Literal from pydantic import BaseModel, Field from llama_index.core.llms import ChatMessage, MessageRole from llama_index.core.llms.llm import LLM @@ -33,7 +33,7 @@ class DirectiveType(str, Enum): class Directive(BaseModel): directive_id: str = Field(description="Enumerate as 'D1', 'D2', 'D3', etc.") - directive_type: DirectiveType = Field(description=( + directive_type: Literal["constraint", "stated_fact", "requirement", "banned", "intent"] = Field(description=( "constraint: explicit numeric or scope limits (budget, timeline, capacity). 
" "stated_fact: things the user says are already true about the world. " "requirement: what must be built or done. " @@ -62,7 +62,7 @@ class AdherenceCategory(str, Enum): class AdherenceResult(BaseModel): directive_id: str = Field(description="References a directive from Phase 1.") adherence_5: int = Field(description="1 (ignored/contradicted) to 5 (fully honored).") - category: AdherenceCategory = Field(description=( + category: Literal["fully_honored", "partially_honored", "softened", "ignored", "contradicted", "unsolicited_caveat"] = Field(description=( "fully_honored: plan respects this exactly. " "partially_honored: plan addresses it but incompletely. " "softened: plan weakens the requirement. " @@ -295,11 +295,11 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence lines.append("|----|-----------|------|------------|-----------|----------|") for _, d, r in scored_items: directive_text = d.text if d else "Unknown" - directive_type = d.directive_type.value if d else "unknown" + directive_type = d.directive_type if d else "unknown" lines.append( f"| {r.directive_id} | {_escape_table_cell(directive_text)} " f"| {directive_type} | {d.importance_5 if d else '?'}/5 " - f"| {r.adherence_5}/5 | {r.category.value} |" + f"| {r.adherence_5}/5 | {r.category} |" ) lines.append("") @@ -312,7 +312,7 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence directive_text = d.text if d else "Unknown" lines.append(f"### {r.directive_id}: {directive_text}") lines.append("") - lines.append(f"- **Category:** {r.category.value}") + lines.append(f"- **Category:** {r.category}") lines.append(f"- **Adherence:** {r.adherence_5}/5") lines.append(f"- **Importance:** {d.importance_5 if d else '?'}/5") lines.append(f"- **Evidence:** {r.evidence}") diff --git a/worker_plan/worker_plan_internal/plan/nodes/full_plan_pipeline.py b/worker_plan/worker_plan_internal/plan/nodes/full_plan_pipeline.py index ed51e3bd..970df417 100644 --- 
a/worker_plan/worker_plan_internal/plan/nodes/full_plan_pipeline.py +++ b/worker_plan/worker_plan_internal/plan/nodes/full_plan_pipeline.py @@ -88,6 +88,7 @@ from worker_plan_internal.plan.nodes.questions_and_answers import QuestionsAndAnswersTask from worker_plan_internal.plan.nodes.premortem import PremortemTask from worker_plan_internal.plan.nodes.self_audit import SelfAuditTask +from worker_plan_internal.plan.nodes.prompt_adherence import PromptAdherenceTask from worker_plan_internal.plan.nodes.report import ReportTask @@ -163,6 +164,7 @@ def requires(self): 'questions_and_answers': self.clone(QuestionsAndAnswersTask), 'premortem': self.clone(PremortemTask), 'self_audit': self.clone(SelfAuditTask), + 'prompt_adherence': self.clone(PromptAdherenceTask), 'report': self.clone(ReportTask), } diff --git a/worker_plan/worker_plan_internal/plan/nodes/report.py b/worker_plan/worker_plan_internal/plan/nodes/report.py index cc123fee..c1ac4530 100644 --- a/worker_plan/worker_plan_internal/plan/nodes/report.py +++ b/worker_plan/worker_plan_internal/plan/nodes/report.py @@ -25,6 +25,7 @@ from worker_plan_internal.plan.nodes.questions_and_answers import QuestionsAndAnswersTask from worker_plan_internal.plan.nodes.premortem import PremortemTask from worker_plan_internal.plan.nodes.self_audit import SelfAuditTask +from worker_plan_internal.plan.nodes.prompt_adherence import PromptAdherenceTask from worker_plan_internal.plan.nodes.screen_planning_prompt import ScreenPlanningPromptTask @@ -58,7 +59,8 @@ def requires(self): 'create_schedule': self.clone(CreateScheduleTask), 'questions_and_answers': self.clone(QuestionsAndAnswersTask), 'premortem': self.clone(PremortemTask), - 'self_audit': self.clone(SelfAuditTask) + 'self_audit': self.clone(SelfAuditTask), + 'prompt_adherence': self.clone(PromptAdherenceTask), } def run_inner(self): @@ -86,6 +88,7 @@ def run_inner(self): rg.append_html('Questions & Answers', self.input()['questions_and_answers']['html'].path) 
rg.append_markdown_with_tables('Premortem', self.input()['premortem']['markdown'].path) rg.append_markdown_with_tables('Self Audit', self.input()['self_audit']['markdown'].path) + rg.append_markdown_with_tables('Prompt Adherence', self.input()['prompt_adherence']['markdown'].path) rg.append_initial_prompt_vetted( document_title='Initial Prompt Vetted', initial_prompt_file_path=self.input()['setup'].path, From de9f0b24ed23fadd540b3beb8b50eb60a81e6cdb Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 19:39:30 +0200 Subject: [PATCH 07/19] refactor: use directive_index (int) instead of directive_id (str) Incrementing integer prevents random ordering from the LLM. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../diagnostics/prompt_adherence.py | 16 +++++----- .../tests/test_prompt_adherence.py | 30 +++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py index ba8e0f72..45984ba3 100644 --- a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py +++ b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py @@ -32,7 +32,7 @@ class DirectiveType(str, Enum): class Directive(BaseModel): - directive_id: str = Field(description="Enumerate as 'D1', 'D2', 'D3', etc.") + directive_index: int = Field(description="Index of this directive, starting from 1.") directive_type: Literal["constraint", "stated_fact", "requirement", "banned", "intent"] = Field(description=( "constraint: explicit numeric or scope limits (budget, timeline, capacity). " "stated_fact: things the user says are already true about the world. 
" @@ -60,7 +60,7 @@ class AdherenceCategory(str, Enum): class AdherenceResult(BaseModel): - directive_id: str = Field(description="References a directive from Phase 1.") + directive_index: int = Field(description="References a directive_index from Phase 1.") adherence_5: int = Field(description="1 (ignored/contradicted) to 5 (fully honored).") category: Literal["fully_honored", "partially_honored", "softened", "ignored", "contradicted", "unsolicited_caveat"] = Field(description=( "fully_honored: plan respects this exactly. " @@ -254,11 +254,11 @@ def calculate_overall_score(directives: DirectiveExtractionResult, scores: Adher """Weighted average: sum(adherence_5 * importance_5) / sum(5 * importance_5) as integer percentage.""" if not directives.directives: return 100 - importance_map = {d.directive_id: d.importance_5 for d in directives.directives} + importance_map = {d.directive_index: d.importance_5 for d in directives.directives} weighted_sum = 0 max_sum = 0 for r in scores.results: - importance = importance_map.get(r.directive_id, 3) + importance = importance_map.get(r.directive_index, 3) weighted_sum += r.adherence_5 * importance max_sum += 5 * importance if max_sum == 0: @@ -272,7 +272,7 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence lines.append("") # Build lookup - importance_map = {d.directive_id: d for d in directives.directives} + importance_map = {d.directive_index: d for d in directives.directives} # Calculate overall score overall = PromptAdherence.calculate_overall_score(directives, scores) @@ -282,7 +282,7 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence # Sort by severity: importance * (6 - adherence), worst first scored_items = [] for r in scores.results: - d = importance_map.get(r.directive_id) + d = importance_map.get(r.directive_index) importance = d.importance_5 if d else 3 severity = importance * (6 - r.adherence_5) scored_items.append((severity, d, r)) @@ -297,7 +297,7 
@@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence directive_text = d.text if d else "Unknown" directive_type = d.directive_type if d else "unknown" lines.append( - f"| {r.directive_id} | {_escape_table_cell(directive_text)} " + f"| {r.directive_index} | {_escape_table_cell(directive_text)} " f"| {directive_type} | {d.importance_5 if d else '?'}/5 " f"| {r.adherence_5}/5 | {r.category} |" ) @@ -310,7 +310,7 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence lines.append("") for _, d, r in poor_items: directive_text = d.text if d else "Unknown" - lines.append(f"### {r.directive_id}: {directive_text}") + lines.append(f"### {r.directive_index}: {directive_text}") lines.append("") lines.append(f"- **Category:** {r.category}") lines.append(f"- **Adherence:** {r.adherence_5}/5") diff --git a/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py index dc4e6e41..53a02a2d 100644 --- a/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py +++ b/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py @@ -14,20 +14,20 @@ class TestDirectiveModel(unittest.TestCase): def test_directive_valid(self): d = Directive( - directive_id="D1", + directive_index=1, directive_type=DirectiveType.CONSTRAINT, text="Budget: DKK 500M", importance_5=5, ) - self.assertEqual(d.directive_id, "D1") + self.assertEqual(d.directive_index, 1) self.assertEqual(d.directive_type, DirectiveType.CONSTRAINT) self.assertEqual(d.importance_5, 5) def test_directive_extraction_result(self): result = DirectiveExtractionResult( directives=[ - Directive(directive_id="D1", directive_type=DirectiveType.CONSTRAINT, text="Budget: DKK 500M", importance_5=5), - Directive(directive_id="D2", directive_type=DirectiveType.STATED_FACT, text="East Wing demolished", importance_5=5), + Directive(directive_index=1, 
directive_type=DirectiveType.CONSTRAINT, text="Budget: DKK 500M", importance_5=5), + Directive(directive_index=2, directive_type=DirectiveType.STATED_FACT, text="East Wing demolished", importance_5=5), ] ) self.assertEqual(len(result.directives), 2) @@ -36,7 +36,7 @@ def test_directive_extraction_result(self): class TestAdherenceResultModel(unittest.TestCase): def test_adherence_result_valid(self): r = AdherenceResult( - directive_id="D1", + directive_index=1, adherence_5=3, category=AdherenceCategory.SOFTENED, evidence="Budget adjusted to DKK 800M", @@ -49,12 +49,12 @@ def test_adherence_score_result(self): result = AdherenceScoreResult( results=[ AdherenceResult( - directive_id="D1", adherence_5=5, + directive_index=1, adherence_5=5, category=AdherenceCategory.FULLY_HONORED, evidence="Budget: DKK 500M", explanation="Honored exactly.", ), AdherenceResult( - directive_id="D2", adherence_5=1, + directive_index=2, adherence_5=1, category=AdherenceCategory.CONTRADICTED, evidence="Demolition permit required", explanation="Plan ignores stated fact.", ), @@ -67,19 +67,19 @@ class TestPromptAdherenceMarkdown(unittest.TestCase): def test_convert_to_markdown_produces_report(self): directives = DirectiveExtractionResult( directives=[ - Directive(directive_id="D1", directive_type=DirectiveType.CONSTRAINT, text="Budget: DKK 500M", importance_5=5), - Directive(directive_id="D2", directive_type=DirectiveType.STATED_FACT, text="East Wing demolished", importance_5=5), + Directive(directive_index=1, directive_type=DirectiveType.CONSTRAINT, text="Budget: DKK 500M", importance_5=5), + Directive(directive_index=2, directive_type=DirectiveType.STATED_FACT, text="East Wing demolished", importance_5=5), ] ) scores = AdherenceScoreResult( results=[ AdherenceResult( - directive_id="D1", adherence_5=5, + directive_index=1, adherence_5=5, category=AdherenceCategory.FULLY_HONORED, evidence="Budget: DKK 500M", explanation="Honored.", ), AdherenceResult( - directive_id="D2", adherence_5=1, + 
directive_index=2, adherence_5=1, category=AdherenceCategory.CONTRADICTED, evidence="Demolition permit required", explanation="Plan contradicts stated fact.", @@ -95,14 +95,14 @@ def test_convert_to_markdown_produces_report(self): def test_overall_score_calculation(self): directives = DirectiveExtractionResult( directives=[ - Directive(directive_id="D1", directive_type=DirectiveType.CONSTRAINT, text="A", importance_5=5), - Directive(directive_id="D2", directive_type=DirectiveType.STATED_FACT, text="B", importance_5=5), + Directive(directive_index=1, directive_type=DirectiveType.CONSTRAINT, text="A", importance_5=5), + Directive(directive_index=2, directive_type=DirectiveType.STATED_FACT, text="B", importance_5=5), ] ) scores = AdherenceScoreResult( results=[ - AdherenceResult(directive_id="D1", adherence_5=5, category=AdherenceCategory.FULLY_HONORED, evidence="", explanation=""), - AdherenceResult(directive_id="D2", adherence_5=1, category=AdherenceCategory.CONTRADICTED, evidence="", explanation=""), + AdherenceResult(directive_index=1, adherence_5=5, category=AdherenceCategory.FULLY_HONORED, evidence="", explanation=""), + AdherenceResult(directive_index=2, adherence_5=1, category=AdherenceCategory.CONTRADICTED, evidence="", explanation=""), ] ) score = PromptAdherence.calculate_overall_score(directives, scores) From ee655a9f5ef9010e1747b5ad6512a4f6afded74c Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 19:42:21 +0200 Subject: [PATCH 08/19] fix: use human-readable category labels in markdown output Co-Authored-By: Claude Opus 4.6 (1M context) --- .../diagnostics/prompt_adherence.py | 18 ++++++++++++++++-- .../diagnostics/tests/test_prompt_adherence.py | 2 +- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py index 45984ba3..5b470d96 100644 --- 
a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py +++ b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py @@ -299,7 +299,7 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence lines.append( f"| {r.directive_index} | {_escape_table_cell(directive_text)} " f"| {directive_type} | {d.importance_5 if d else '?'}/5 " - f"| {r.adherence_5}/5 | {r.category} |" + f"| {r.adherence_5}/5 | {_format_category(r.category)} |" ) lines.append("") @@ -312,7 +312,7 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence directive_text = d.text if d else "Unknown" lines.append(f"### {r.directive_index}: {directive_text}") lines.append("") - lines.append(f"- **Category:** {r.category}") + lines.append(f"- **Category:** {_format_category(r.category)}") lines.append(f"- **Adherence:** {r.adherence_5}/5") lines.append(f"- **Importance:** {d.importance_5 if d else '?'}/5") lines.append(f"- **Evidence:** {r.evidence}") @@ -322,5 +322,19 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence return "\n".join(lines) +_CATEGORY_LABELS = { + "fully_honored": "Fully honored", + "partially_honored": "Partially honored", + "softened": "Softened", + "ignored": "Ignored", + "contradicted": "Contradicted", + "unsolicited_caveat": "Unsolicited caveat", +} + + +def _format_category(category: str) -> str: + return _CATEGORY_LABELS.get(category, category) + + def _escape_table_cell(text: str) -> str: return text.replace("|", "\\|").replace("\n", " ") diff --git a/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py index 53a02a2d..15c00136 100644 --- a/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py +++ b/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py @@ -89,7 +89,7 @@ def test_convert_to_markdown_produces_report(self): markdown = 
PromptAdherence.convert_to_markdown(directives, scores) self.assertIn("# Prompt Adherence Report", markdown) self.assertIn("Budget: DKK 500M", markdown) - self.assertIn("contradicted", markdown) + self.assertIn("Contradicted", markdown) self.assertIn("Overall Adherence", markdown) def test_overall_score_calculation(self): From 4ceff7791ea0cfac32469ff3764268ce1b08a6aa Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 19:43:41 +0200 Subject: [PATCH 09/19] fix: use human-readable directive type labels in markdown output Co-Authored-By: Claude Opus 4.6 (1M context) --- .../diagnostics/prompt_adherence.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py index 5b470d96..2a34da0c 100644 --- a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py +++ b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py @@ -295,7 +295,7 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence lines.append("|----|-----------|------|------------|-----------|----------|") for _, d, r in scored_items: directive_text = d.text if d else "Unknown" - directive_type = d.directive_type if d else "unknown" + directive_type = _DIRECTIVE_TYPE_LABELS.get(d.directive_type, d.directive_type) if d else "Unknown" lines.append( f"| {r.directive_index} | {_escape_table_cell(directive_text)} " f"| {directive_type} | {d.importance_5 if d else '?'}/5 " @@ -322,6 +322,15 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence return "\n".join(lines) +_DIRECTIVE_TYPE_LABELS = { + "constraint": "Constraint", + "stated_fact": "Stated fact", + "requirement": "Requirement", + "banned": "Banned", + "intent": "Intent", +} + + _CATEGORY_LABELS = { "fully_honored": "Fully honored", "partially_honored": "Partially honored", From 
3876d99004deeff1808b64d50ff5dd18bfa34988 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 19:51:52 +0200 Subject: [PATCH 10/19] fix: move Prompt Adherence to last section in report Co-Authored-By: Claude Opus 4.6 (1M context) --- worker_plan/worker_plan_internal/plan/nodes/report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker_plan/worker_plan_internal/plan/nodes/report.py b/worker_plan/worker_plan_internal/plan/nodes/report.py index c1ac4530..339c1ae2 100644 --- a/worker_plan/worker_plan_internal/plan/nodes/report.py +++ b/worker_plan/worker_plan_internal/plan/nodes/report.py @@ -88,7 +88,6 @@ def run_inner(self): rg.append_html('Questions & Answers', self.input()['questions_and_answers']['html'].path) rg.append_markdown_with_tables('Premortem', self.input()['premortem']['markdown'].path) rg.append_markdown_with_tables('Self Audit', self.input()['self_audit']['markdown'].path) - rg.append_markdown_with_tables('Prompt Adherence', self.input()['prompt_adherence']['markdown'].path) rg.append_initial_prompt_vetted( document_title='Initial Prompt Vetted', initial_prompt_file_path=self.input()['setup'].path, @@ -97,4 +96,5 @@ def run_inner(self): redline_gate_markdown_file_path=self.input()['redline_gate']['markdown'].path, premise_attack_markdown_file_path=self.input()['premise_attack']['markdown'].path ) + rg.append_markdown_with_tables('Prompt Adherence', self.input()['prompt_adherence']['markdown'].path) rg.save_report(self.output().path, title=title, execute_plan_section_hidden=REPORT_EXECUTE_PLAN_SECTION_HIDDEN) From ca653ebe79fdee83a439f0eaf29b9188bdec0832 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 20:35:48 +0200 Subject: [PATCH 11/19] fix: sort summary table by ID, use "Issue N - title" format in issues Co-Authored-By: Claude Opus 4.6 (1M context) --- .../worker_plan_internal/diagnostics/prompt_adherence.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py index 2a34da0c..7e5a0789 100644 --- a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py +++ b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py @@ -286,7 +286,7 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence importance = d.importance_5 if d else 3 severity = importance * (6 - r.adherence_5) scored_items.append((severity, d, r)) - scored_items.sort(key=lambda x: x[0], reverse=True) + scored_items.sort(key=lambda x: x[2].directive_index) # Summary table lines.append("## Summary") @@ -305,12 +305,13 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence # Detail section for poorly-scored directives poor_items = [(sev, d, r) for sev, d, r in scored_items if r.adherence_5 <= 3] + poor_items.sort(key=lambda x: x[0], reverse=True) if poor_items: lines.append("## Issues") lines.append("") for _, d, r in poor_items: directive_text = d.text if d else "Unknown" - lines.append(f"### {r.directive_index}: {directive_text}") + lines.append(f"### Issue {r.directive_index} - {directive_text}") lines.append("") lines.append(f"- **Category:** {_format_category(r.category)}") lines.append(f"- **Adherence:** {r.adherence_5}/5") From 57af3c675a976c9bfbe70269ea5715d281807671 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 20:49:31 +0200 Subject: [PATCH 12/19] fix: remove h1 header from prompt adherence markdown The report generator adds its own section header. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../worker_plan_internal/diagnostics/prompt_adherence.py | 3 --- .../diagnostics/tests/test_prompt_adherence.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py index 7e5a0789..32cc8708 100644 --- a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py +++ b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py @@ -268,9 +268,6 @@ def calculate_overall_score(directives: DirectiveExtractionResult, scores: Adher @staticmethod def convert_to_markdown(directives: DirectiveExtractionResult, scores: AdherenceScoreResult) -> str: lines: list[str] = [] - lines.append("# Prompt Adherence Report") - lines.append("") - # Build lookup importance_map = {d.directive_index: d for d in directives.directives} diff --git a/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py index 15c00136..a82c3001 100644 --- a/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py +++ b/worker_plan/worker_plan_internal/diagnostics/tests/test_prompt_adherence.py @@ -87,7 +87,7 @@ def test_convert_to_markdown_produces_report(self): ] ) markdown = PromptAdherence.convert_to_markdown(directives, scores) - self.assertIn("# Prompt Adherence Report", markdown) + self.assertNotIn("# Prompt Adherence Report", markdown) self.assertIn("Budget: DKK 500M", markdown) self.assertIn("Contradicted", markdown) self.assertIn("Overall Adherence", markdown) From 95c30c32c45c0b531af4e280f91f15341e2202d7 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 23:50:15 +0200 Subject: [PATCH 13/19] refactor: split plan.txt into plan_raw.json + SetupTask template The app now saves plan_raw.json (user prompt + date as JSON). 
SetupTask reads plan_raw.json and produces plan.txt from a template. This separates the raw user input from the formatted pipeline input. Co-Authored-By: Claude Opus 4.6 (1M context) --- worker_plan/app.py | 2 +- worker_plan/worker_plan_api/filenames.py | 1 + worker_plan/worker_plan_api/plan_file.py | 40 ++++++++++++++----- .../worker_plan_internal/plan/nodes/setup.py | 18 ++++++--- .../worker_plan_internal/plan/ping_llm.py | 4 +- .../plan/run_plan_pipeline.py | 4 +- .../plan/tests/test_ping_llm.py | 2 +- 7 files changed, 51 insertions(+), 20 deletions(-) diff --git a/worker_plan/app.py b/worker_plan/app.py index 35051f97..88ffb872 100644 --- a/worker_plan/app.py +++ b/worker_plan/app.py @@ -223,7 +223,7 @@ def create_run_directory(request: StartRunRequest) -> tuple[str, Path]: start_time_file.save(run_dir / FilenameEnum.START_TIME.value) plan_file = PlanFile.create(vague_plan_description=request.plan_prompt, start_time=start_time) - plan_file.save(run_dir / FilenameEnum.INITIAL_PLAN.value) + plan_file.save(run_dir / FilenameEnum.INITIAL_PLAN_RAW.value) return run_id, run_dir.resolve() diff --git a/worker_plan/worker_plan_api/filenames.py b/worker_plan/worker_plan_api/filenames.py index 92e1c6bc..783ad375 100644 --- a/worker_plan/worker_plan_api/filenames.py +++ b/worker_plan/worker_plan_api/filenames.py @@ -2,6 +2,7 @@ class FilenameEnum(str, Enum): START_TIME = "start_time.json" + INITIAL_PLAN_RAW = "plan_raw.json" INITIAL_PLAN = "plan.txt" PLANEXE_METADATA = "planexe_metadata.json" SCREEN_PLANNING_PROMPT_RAW = "screen_planning_prompt.json" diff --git a/worker_plan/worker_plan_api/plan_file.py b/worker_plan/worker_plan_api/plan_file.py index 83dc1e47..3720d1e0 100644 --- a/worker_plan/worker_plan_api/plan_file.py +++ b/worker_plan/worker_plan_api/plan_file.py @@ -1,28 +1,50 @@ """ PROMPT> python -m worker_plan_api.plan_file """ +import json from datetime import datetime from dataclasses import dataclass + +PLAN_TEMPLATE = "Plan:\n{plan_prompt}\n\nToday's 
date:\n{pretty_date}\n\nProject start ASAP" + + @dataclass class PlanFile: - content: str + plan_prompt: str + pretty_date: str @classmethod def create(cls, vague_plan_description: str, start_time: datetime) -> "PlanFile": pretty_date = start_time.strftime("%Y-%b-%d") - plan_prompt = ( - f"Plan:\n{vague_plan_description}\n\n" - f"Today's date:\n{pretty_date}\n\n" - "Project start ASAP" - ) - return cls(plan_prompt) + return cls(plan_prompt=vague_plan_description, pretty_date=pretty_date) + + def to_dict(self) -> dict: + return { + "plan_prompt": self.plan_prompt, + "pretty_date": self.pretty_date, + } + + @classmethod + def from_dict(cls, data: dict) -> "PlanFile": + return cls(plan_prompt=data["plan_prompt"], pretty_date=data["pretty_date"]) + + @classmethod + def load(cls, file_path: str) -> "PlanFile": + with open(file_path, "r", encoding="utf-8") as f: + return cls.from_dict(json.load(f)) def save(self, file_path: str) -> None: with open(file_path, "w", encoding="utf-8") as f: - f.write(self.content) + json.dump(self.to_dict(), f, indent=2) + + def to_plan_text(self) -> str: + return PLAN_TEMPLATE.format(plan_prompt=self.plan_prompt, pretty_date=self.pretty_date) + if __name__ == "__main__": start_time: datetime = datetime.now().astimezone() plan = PlanFile.create(vague_plan_description="My plan is here!", start_time=start_time) - print(plan.content) + print(json.dumps(plan.to_dict(), indent=2)) + print("---") + print(plan.to_plan_text()) diff --git a/worker_plan/worker_plan_internal/plan/nodes/setup.py b/worker_plan/worker_plan_internal/plan/nodes/setup.py index af60ff95..56ce171e 100644 --- a/worker_plan/worker_plan_internal/plan/nodes/setup.py +++ b/worker_plan/worker_plan_internal/plan/nodes/setup.py @@ -1,14 +1,22 @@ -"""SetupTask - The plan prompt text provided by the user.""" +"""SetupTask - Convert plan_raw.json into the plan.txt used by the pipeline.""" from worker_plan_internal.plan.run_plan_pipeline import PlanTask from worker_plan_api.filenames 
import FilenameEnum +from worker_plan_api.plan_file import PlanFile class SetupTask(PlanTask): - """Load the user's plan prompt as the pipeline input.""" + """Read plan_raw.json and produce plan.txt from the template.""" def output(self): return self.local_target(FilenameEnum.INITIAL_PLAN) def run(self): - # The Gradio/Flask app that starts the luigi pipeline, must first create the `INITIAL_PLAN` file inside the `run_id_dir`. - # This code will ONLY run if the Gradio/Flask app *failed* to create the file. - raise AssertionError(f"This code is not supposed to be run. Before starting the pipeline the '{FilenameEnum.INITIAL_PLAN.value}' file must be present in the `run_id_dir`: {self.run_id_dir!r}") + raw_path = self.run_id_dir / FilenameEnum.INITIAL_PLAN_RAW.value + if not raw_path.exists(): + raise FileNotFoundError( + f"Before starting the pipeline the '{FilenameEnum.INITIAL_PLAN_RAW.value}' file " + f"must be present in the run_id_dir: {self.run_id_dir!r}" + ) + plan_file = PlanFile.load(str(raw_path)) + plan_text = plan_file.to_plan_text() + with open(self.output().path, "w", encoding="utf-8") as f: + f.write(plan_text) diff --git a/worker_plan/worker_plan_internal/plan/ping_llm.py b/worker_plan/worker_plan_internal/plan/ping_llm.py index a8dd2303..b8717669 100644 --- a/worker_plan/worker_plan_internal/plan/ping_llm.py +++ b/worker_plan/worker_plan_internal/plan/ping_llm.py @@ -37,9 +37,9 @@ def _validate_run_dir(run_id_dir: Path) -> None: raise FileNotFoundError( f"The '{FilenameEnum.START_TIME.value}' file does not exist in the run_id_dir: {run_id_dir!r}" ) - if not (run_id_dir / FilenameEnum.INITIAL_PLAN.value).exists(): + if not (run_id_dir / FilenameEnum.INITIAL_PLAN_RAW.value).exists(): raise FileNotFoundError( - f"The '{FilenameEnum.INITIAL_PLAN.value}' file does not exist in the run_id_dir: {run_id_dir!r}" + f"The '{FilenameEnum.INITIAL_PLAN_RAW.value}' file does not exist in the run_id_dir: {run_id_dir!r}" ) diff --git 
a/worker_plan/worker_plan_internal/plan/run_plan_pipeline.py b/worker_plan/worker_plan_internal/plan/run_plan_pipeline.py index 00999753..30ca4b0d 100644 --- a/worker_plan/worker_plan_internal/plan/run_plan_pipeline.py +++ b/worker_plan/worker_plan_internal/plan/run_plan_pipeline.py @@ -213,8 +213,8 @@ def setup(self) -> None: raise NotADirectoryError(f"The run_id_dir is not a directory: {self.run_id_dir!r}") if not (self.run_id_dir / FilenameEnum.START_TIME.value).exists(): raise FileNotFoundError(f"The '{FilenameEnum.START_TIME.value}' file does not exist in the run_id_dir: {self.run_id_dir!r}") - if not (self.run_id_dir / FilenameEnum.INITIAL_PLAN.value).exists(): - raise FileNotFoundError(f"The '{FilenameEnum.INITIAL_PLAN.value}' file does not exist in the run_id_dir: {self.run_id_dir!r}") + if not (self.run_id_dir / FilenameEnum.INITIAL_PLAN_RAW.value).exists(): + raise FileNotFoundError(f"The '{FilenameEnum.INITIAL_PLAN_RAW.value}' file does not exist in the run_id_dir: {self.run_id_dir!r}") from worker_plan_internal.plan.nodes.full_plan_pipeline import FullPlanPipeline full_plan_pipeline_task = FullPlanPipeline( diff --git a/worker_plan/worker_plan_internal/plan/tests/test_ping_llm.py b/worker_plan/worker_plan_internal/plan/tests/test_ping_llm.py index a9cffe84..a09604ab 100644 --- a/worker_plan/worker_plan_internal/plan/tests/test_ping_llm.py +++ b/worker_plan/worker_plan_internal/plan/tests/test_ping_llm.py @@ -13,7 +13,7 @@ def test_ping_llm_report_fallback(self): with TemporaryDirectory() as temp_dir: run_id_dir = Path(temp_dir) (run_id_dir / FilenameEnum.START_TIME.value).write_text("{}", encoding="utf-8") - (run_id_dir / FilenameEnum.INITIAL_PLAN.value).write_text("Ping test", encoding="utf-8") + (run_id_dir / FilenameEnum.INITIAL_PLAN_RAW.value).write_text('{"plan_prompt": "Ping test", "pretty_date": "1984-Apr-09"}', encoding="utf-8") bad_llm = ResponseMockLLM(responses=["raise:BAD"]) good_llm = ResponseMockLLM(responses=["PONG ok"]) From 
b691e1a87f93e7492c137c804df0067ad0780cd8 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Thu, 9 Apr 2026 23:54:16 +0200 Subject: [PATCH 14/19] fix: read plan_prompt from plan_raw.json in PromptAdherenceTask Uses the raw user prompt directly, not the templated plan.txt. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../worker_plan_internal/plan/nodes/prompt_adherence.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/worker_plan/worker_plan_internal/plan/nodes/prompt_adherence.py b/worker_plan/worker_plan_internal/plan/nodes/prompt_adherence.py index 25287c17..95c83b0c 100644 --- a/worker_plan/worker_plan_internal/plan/nodes/prompt_adherence.py +++ b/worker_plan/worker_plan_internal/plan/nodes/prompt_adherence.py @@ -3,6 +3,7 @@ from worker_plan_internal.diagnostics.prompt_adherence import PromptAdherence from worker_plan_internal.llm_util.llm_executor import LLMExecutor from worker_plan_api.filenames import FilenameEnum +from worker_plan_api.plan_file import PlanFile from worker_plan_internal.plan.nodes.setup import SetupTask from worker_plan_internal.plan.nodes.project_plan import ProjectPlanTask from worker_plan_internal.plan.nodes.executive_summary import ExecutiveSummaryTask @@ -29,8 +30,9 @@ def requires(self): def run_inner(self): llm_executor: LLMExecutor = self.create_llm_executor() - with self.input()['setup'].open("r") as f: - plan_prompt = f.read() + plan_raw_path = self.run_id_dir / FilenameEnum.INITIAL_PLAN_RAW.value + plan_file = PlanFile.load(str(plan_raw_path)) + plan_prompt = plan_file.plan_prompt with self.input()['project_plan']['markdown'].open("r") as f: project_plan_markdown = f.read() with self.input()['executive_summary']['markdown'].open("r") as f: From ee32bd369bd1519e70383514fe8a14858202b8ba Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Fri, 10 Apr 2026 00:10:17 +0200 Subject: [PATCH 15/19] fix: show all non-perfect directives in Issues section Changed threshold from adherence <= 3 to adherence < 
5 so partially_honored items appear in the Issues list. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../worker_plan_internal/diagnostics/prompt_adherence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py index 32cc8708..c5260f99 100644 --- a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py +++ b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py @@ -301,7 +301,7 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence lines.append("") # Detail section for poorly-scored directives - poor_items = [(sev, d, r) for sev, d, r in scored_items if r.adherence_5 <= 3] + poor_items = [(sev, d, r) for sev, d, r in scored_items if r.adherence_5 < 5] poor_items.sort(key=lambda x: x[0], reverse=True) if poor_items: lines.append("## Issues") From 12cca36c6c3d6231ab53f1e70877aaecb2ccbe62 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Fri, 10 Apr 2026 00:23:16 +0200 Subject: [PATCH 16/19] feat: show adherence score math in markdown report MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Displays the formula below the overall score, e.g.: (5×5 + 4×4 + 5×3 + ...) 
/ 250 = 94% Co-Authored-By: Claude Opus 4.6 (1M context) --- .../diagnostics/prompt_adherence.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py index c5260f99..de5b1092 100644 --- a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py +++ b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py @@ -271,12 +271,25 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence # Build lookup importance_map = {d.directive_index: d for d in directives.directives} - # Calculate overall score - overall = PromptAdherence.calculate_overall_score(directives, scores) + # Calculate overall score with math breakdown + weighted_sum = 0 + max_sum = 0 + math_parts = [] + for r in scores.results: + d = importance_map.get(r.directive_index) + importance = d.importance_5 if d else 3 + weighted_sum += r.adherence_5 * importance + max_sum += 5 * importance + math_parts.append(f"{r.adherence_5}×{importance}") + overall = round(weighted_sum * 100 / max_sum) if max_sum > 0 else 100 lines.append(f"**Overall Adherence: {overall}%**") lines.append("") + if math_parts: + math_str = " + ".join(math_parts) + lines.append(f"({math_str}) / {max_sum} = {overall}%") + lines.append("") - # Sort by severity: importance * (6 - adherence), worst first + # Sort by directive index scored_items = [] for r in scores.results: d = importance_map.get(r.directive_index) From 3c8fc52f3a8adf03b5fb2772a14097fda0c3bc6a Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Fri, 10 Apr 2026 00:49:17 +0200 Subject: [PATCH 17/19] fix: spell out adherence score formula in markdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shows three lines: IMPORTANCE_ADHERENCE_SUM = (5×5 + 3×4 + ...) = 205 IMPORTANCE_SUM = 5 + 3 + ... 
= 41 OVERALL_ADHERENCE = IMPORTANCE_ADHERENCE_SUM / (IMPORTANCE_SUM × 5) = 205 / 205 = 94% Co-Authored-By: Claude Opus 4.6 (1M context) --- .../diagnostics/prompt_adherence.py | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py index de5b1092..b13475cc 100644 --- a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py +++ b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py @@ -272,21 +272,27 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence importance_map = {d.directive_index: d for d in directives.directives} # Calculate overall score with math breakdown - weighted_sum = 0 - max_sum = 0 - math_parts = [] + weighted_parts = [] + importance_parts = [] + importances = [] for r in scores.results: d = importance_map.get(r.directive_index) importance = d.importance_5 if d else 3 - weighted_sum += r.adherence_5 * importance - max_sum += 5 * importance - math_parts.append(f"{r.adherence_5}×{importance}") - overall = round(weighted_sum * 100 / max_sum) if max_sum > 0 else 100 + importances.append(importance) + weighted_parts.append(f"{importance}×{r.adherence_5}") + importance_parts.append(str(importance)) + weighted_sum = sum( + r.adherence_5 * (importance_map.get(r.directive_index).importance_5 if importance_map.get(r.directive_index) else 3) + for r in scores.results + ) + importance_sum = sum(importances) + overall = round(weighted_sum * 100 / (importance_sum * 5)) if importance_sum > 0 else 100 lines.append(f"**Overall Adherence: {overall}%**") lines.append("") - if math_parts: - math_str = " + ".join(math_parts) - lines.append(f"({math_str}) / {max_sum} = {overall}%") + if weighted_parts: + lines.append(f"IMPORTANCE_ADHERENCE_SUM = ({' + '.join(weighted_parts)}) = {weighted_sum}") + lines.append(f"IMPORTANCE_SUM = {' + 
'.join(importance_parts)} = {importance_sum}") + lines.append(f"OVERALL_ADHERENCE = IMPORTANCE_ADHERENCE_SUM / (IMPORTANCE_SUM × 5) = {weighted_sum} / {importance_sum * 5} = {overall}%") lines.append("") # Sort by directive index From b28445cde3cc67a152902ea0d05e24ebf3bbfed5 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Fri, 10 Apr 2026 00:50:32 +0200 Subject: [PATCH 18/19] fix: wrap adherence formula in code block Co-Authored-By: Claude Opus 4.6 (1M context) --- .../worker_plan_internal/diagnostics/prompt_adherence.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py index b13475cc..dced2e93 100644 --- a/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py +++ b/worker_plan/worker_plan_internal/diagnostics/prompt_adherence.py @@ -290,9 +290,11 @@ def convert_to_markdown(directives: DirectiveExtractionResult, scores: Adherence lines.append(f"**Overall Adherence: {overall}%**") lines.append("") if weighted_parts: + lines.append("```") lines.append(f"IMPORTANCE_ADHERENCE_SUM = ({' + '.join(weighted_parts)}) = {weighted_sum}") lines.append(f"IMPORTANCE_SUM = {' + '.join(importance_parts)} = {importance_sum}") lines.append(f"OVERALL_ADHERENCE = IMPORTANCE_ADHERENCE_SUM / (IMPORTANCE_SUM × 5) = {weighted_sum} / {importance_sum * 5} = {overall}%") + lines.append("```") lines.append("") # Sort by directive index From 0cbb55d15c155d5cbe9a521170bc2ff2e15e3068 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Fri, 10 Apr 2026 00:59:26 +0200 Subject: [PATCH 19/19] fix: enable fenced_code in markdown_with_tables rendering Code blocks in sections like Prompt Adherence now render as
 <pre><code> instead of inline <code>, preserving line breaks.

Co-Authored-By: Claude Opus 4.6 (1M context) 
---
 worker_plan/worker_plan_internal/report/report_generator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/worker_plan/worker_plan_internal/report/report_generator.py b/worker_plan/worker_plan_internal/report/report_generator.py
index cb8a8fe5..0b24caef 100644
--- a/worker_plan/worker_plan_internal/report/report_generator.py
+++ b/worker_plan/worker_plan_internal/report/report_generator.py
@@ -125,7 +125,7 @@ def append_markdown_with_tables(self, document_title: str, file_path: Path, css_
         if md_data is None:
             logging.warning(f"Document: '{document_title}'. Could not read markdown file: {file_path}")
             return
-        html = markdown.markdown(md_data, extensions=['tables'])
+        html = markdown.markdown(md_data, extensions=['tables', 'fenced_code'])
         self.report_item_list.append(ReportDocumentItem(document_title, html, css_classes=css_classes))
     
     def append_csv(self, document_title: str, file_path: Path, css_classes: list[str] = []):