Skip to content

Commit 56838e7

Browse files
author
EgonBot
committed
fix: per-archetype PremortemTask decomposition for small-model compatibility
PremortemAnalysis required the LLM to emit a deeply nested schema: 3 AssumptionItem + 3 FailureModeItem, with 11+ required fields per item, including linked cross-reference IDs. Qwen 3.5-35B and other small local models consistently echoed the schema structure back instead of producing values, causing validation errors after exhausting all retries.

Fix (applies the 'LLMs handle narrative; code handles structure' principle):

- New ArchetypeNarrative schema: 5 plain-text fields only (no IDs, no cross-references, no counts). The LLM writes narrative content only.
- Per-archetype decomposition: one independent LLM call per archetype, with up to 5 retries. Failed archetypes are skipped gracefully.
- IDs, indices, and cross-references (assumption_id, failure_mode_index, root_cause_assumption_id) are assigned by code, not the LLM.
- falsifier is derived from the test_now field to avoid hardcoded tautologies.
- _calculate_risk_level_verbose returns 'Not Scored' when likelihood or impact is None (it previously rendered 'Likelihood None/5, Impact None/5').
- Dead code removed: ArchetypeAnalysis class, PREMORTEM_SYSTEM_PROMPT_TEMPLATE.

Validated: PremortemTask PASSED on GLM 4.7 Flash (HVT_minimal run).
1 parent 70918f9 commit 56838e7

1 file changed

Lines changed: 55 additions & 64 deletions

File tree

worker_plan/worker_plan_internal/diagnostics/premortem.py

Lines changed: 55 additions & 64 deletions
Original file line number · Diff line number · Diff line change
@@ -71,6 +71,14 @@ class PremortemAnalysis(BaseModel):
7171
assumptions_to_kill: List[AssumptionItem] = Field(description="A list of 3 new, critical, underlying assumptions to test immediately.")
7272
failure_modes: List[FailureModeItem] = Field(description="A list containing exactly 3 distinct failure failure_modes, one for each archetype.")
7373

74+
class ArchetypeNarrative(BaseModel):
    """Narrative-only payload the LLM produces for a single failure archetype.

    Deliberately flat: five free-text fields and nothing else. All IDs,
    indices, and cross-references are assigned afterwards by the calling
    code, never requested from the model.
    """

    # The critical assumption behind this archetype's failure scenario.
    assumption: str = Field(
        description="One critical assumption the project is making that, if false, would cause this failure."
    )
    # A single immediate, concrete check of that assumption.
    test_now: str = Field(
        description="One concrete action to immediately test if this assumption holds."
    )
    # Headline for the failure scenario.
    failure_title: str = Field(
        description="A short, compelling title for this failure scenario (e.g. 'The Gridlock Gamble')."
    )
    # Long-form narrative of the failure.
    failure_story: str = Field(
        description="A detailed narrative of how this failure unfolds. Explain causes, chain of events, and impact."
    )
    # Early indicators that the scenario is materializing.
    warning_signs: List[str] = Field(
        description="2-4 observable signals that this failure is beginning to occur."
    )
81+
7482
PREMORTEM_SYSTEM_PROMPT = """
7583
Persona: You are a senior project analyst. Your primary goal is to write compelling, detailed, and distinct failure stories that are also operationally actionable.
7684
@@ -119,92 +127,75 @@ def execute(cls, llm_executor: LLMExecutor, speed_vs_detail: SpeedVsDetailEnum,
119127
logger.debug(f"User Prompt:\n{user_prompt}")
120128
system_prompt = PREMORTEM_SYSTEM_PROMPT.strip()
121129

122-
accumulated_chat_message_list = [
123-
ChatMessage(
124-
role=MessageRole.SYSTEM,
125-
content=system_prompt,
126-
)
127-
]
128-
129-
user_prompt_list = [
130-
user_prompt,
131-
"Generate 3 new assumptions that are thematically different from the previous ones. Start assumption_id at A4.",
132-
"Generate 3 new assumptions that are thematically different from the previous ones and covers different archetypes. Start assumption_id at A7.",
133-
]
130+
archetypes = ["Process/Financial", "Technical/Logistical", "Market/Human"]
134131
if speed_vs_detail == SpeedVsDetailEnum.FAST_BUT_SKIP_DETAILS:
135-
user_prompt_list = user_prompt_list[:1]
136-
logger.info("Running in FAST_BUT_SKIP_DETAILS mode. Omitting some assumptions.")
132+
archetypes = archetypes[:1]
133+
logger.info("Running in FAST_BUT_SKIP_DETAILS mode. Processing 1 archetype only.")
137134
else:
138-
logger.info("Running in ALL_DETAILS_BUT_SLOW mode. Processing all assumptions.")
135+
logger.info("Running in ALL_DETAILS_BUT_SLOW mode. Processing all 3 archetypes.")
139136

140-
responses: list[PremortemAnalysis] = []
137+
assumptions_to_kill: list[AssumptionItem] = []
138+
failure_modes: list[FailureModeItem] = []
141139
metadata_list: list[dict] = []
142-
for user_prompt_index, user_prompt_item in enumerate(user_prompt_list):
143-
logger.info(f"Processing user_prompt_index: {user_prompt_index+1} of {len(user_prompt_list)}")
144-
chat_message_list = accumulated_chat_message_list.copy()
145-
chat_message_list.append(
146-
ChatMessage(
147-
role=MessageRole.USER,
148-
content=user_prompt_item,
149-
)
140+
141+
for archetype_index, archetype in enumerate(archetypes):
142+
assumption_id = f"A{archetype_index + 1}"
143+
failure_mode_index = archetype_index + 1
144+
logger.info(f"Processing archetype {archetype_index+1} of {len(archetypes)}: {archetype!r}")
145+
146+
archetype_user_prompt = (
147+
f"{user_prompt}\n\n"
148+
f"Archetype: {archetype}\n"
149+
f"Write one assumption and one failure scenario for this archetype only."
150150
)
151+
chat_message_list = [
152+
ChatMessage(role=MessageRole.SYSTEM, content=system_prompt),
153+
ChatMessage(role=MessageRole.USER, content=archetype_user_prompt),
154+
]
151155

152156
def execute_function(llm: LLM) -> dict:
153-
sllm = llm.as_structured_llm(PremortemAnalysis)
157+
sllm = llm.as_structured_llm(ArchetypeNarrative)
154158
start_time = time.perf_counter()
155-
156159
chat_response = sllm.chat(chat_message_list)
157-
pydantic_response = chat_response.raw
158-
160+
narrative = require_raw(chat_response, ArchetypeNarrative)
159161
end_time = time.perf_counter()
160162
duration = int(ceil(end_time - start_time))
161-
162163
metadata = dict(llm.metadata)
163164
metadata["llm_classname"] = llm.class_name()
164165
metadata["duration"] = duration
165-
166-
return {
167-
"pydantic_response": pydantic_response,
168-
"metadata": metadata,
169-
"duration": duration
170-
}
166+
return {"narrative": narrative, "metadata": metadata}
171167

172168
try:
173169
result = llm_executor.run(execute_function)
174170
except PipelineStopRequested:
175-
# Re-raise PipelineStopRequested without wrapping it
176171
raise
177172
except Exception as e:
178-
logger.debug(f"LLM chat interaction failed: {e}")
179-
logger.error("LLM chat interaction failed.", exc_info=True)
180-
if user_prompt_index == 0:
181-
logger.error("The first user prompt failed. This is a critical error. Please check the system prompt and user prompt.")
182-
raise ValueError("LLM chat interaction failed.") from e
183-
else:
184-
logger.error(f"User prompt {user_prompt_index+1} failed. Continuing with next user prompt.")
185-
continue
186-
187-
assistant_content_raw: dict = result["pydantic_response"].model_dump()
188-
# Compact JSON without newlines and spaces, since it's going to be parsed by the LLM. Pretty printing wastes input tokens for the LLM.
189-
assistant_content: str = json.dumps(assistant_content_raw, separators=(',', ':'))
190-
191-
chat_message_list.append(
192-
ChatMessage(
193-
role=MessageRole.ASSISTANT,
194-
content=assistant_content,
195-
)
196-
)
173+
logger.error(f"Archetype {archetype!r} failed: {e}", exc_info=True)
174+
if archetype_index == 0:
175+
raise ValueError(f"First archetype failed: {e}") from e
176+
logger.warning(f"Skipping archetype {archetype!r} due to failure.")
177+
continue
197178

198-
responses.append(result["pydantic_response"])
179+
narrative: ArchetypeNarrative = result["narrative"]
199180
metadata_list.append(result["metadata"])
200-
accumulated_chat_message_list = chat_message_list.copy()
201181

202-
# Use the last response as the primary result
203-
assumptions_to_kill: list[AssumptionItem] = []
204-
failure_modes: list[FailureModeItem] = []
205-
for response in responses:
206-
assumptions_to_kill.extend(response.assumptions_to_kill)
207-
failure_modes.extend(response.failure_modes)
182+
# Code assigns IDs and cross-references — the LLM only provides narrative text.
183+
assumption = AssumptionItem(
184+
assumption_id=assumption_id,
185+
statement=narrative.assumption,
186+
test_now=narrative.test_now,
187+
falsifier=f"Result of: {narrative.test_now} — reveals the assumption does not hold.",
188+
)
189+
failure_mode = FailureModeItem(
190+
failure_mode_index=failure_mode_index,
191+
root_cause_assumption_id=assumption_id,
192+
failure_mode_archetype=archetype,
193+
failure_mode_title=narrative.failure_title,
194+
risk_analysis=narrative.failure_story,
195+
early_warning_signs=narrative.warning_signs,
196+
)
197+
assumptions_to_kill.append(assumption)
198+
failure_modes.append(failure_mode)
208199

209200
final_response = PremortemAnalysis(
210201
assumptions_to_kill=assumptions_to_kill,
@@ -286,7 +277,7 @@ def _calculate_risk_level_brief(likelihood: Optional[int], impact: Optional[int]
286277
def _calculate_risk_level_verbose(likelihood: Optional[int], impact: Optional[int]) -> str:
287278
"""Calculates a qualitative risk level from likelihood and impact scores."""
288279
if likelihood is None or impact is None:
289-
return f"Likelihood {likelihood}/5, Impact {impact}/5"
280+
return "Not Scored"
290281

291282
score = likelihood * impact
292283
if score >= 15:

0 commit comments

Comments
 (0)