From 47cabe8b6d8cdf24688979038e7ee39d71943e11 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 21:56:09 -0600
Subject: [PATCH 1/5] refactor: extract build_run_inputs to
 evolution.core.run_inputs

Five sites built the same `run_inputs` dict by hand (3 skill, 2 tool).
Two latent inconsistencies fall out of normalizing through one helper:

- evolve_tool's deploy-path literal was missing `eval_source`, which
  the skill side has always included. The helper restores it.
- evolve_tool's cost-ceiling fallback was missing
  `enable_confusable_bucket`, which the deploy-path schema test
  asserts is present. Whenever the cost ceiling tripped on the
  deploy path, the fallback fired and silently dropped the field;
  routing through the helper closes that gap.

The helper's `quality_gate_preset`/`fitness_profile`/`enable_confusable_bucket`
kwargs make both the shape contract and the optional-on-skill / required-on-tool
asymmetry explicit at every call site.
---
 evolution/core/run_inputs.py     |  49 +++++++++++++
 evolution/skills/evolve_skill.py |  70 ++++++------------
 evolution/tools/evolve_tool.py   |  53 +++++---------
 tests/core/test_run_inputs.py    | 117 +++++++++++++++++++++++++++++++
 4 files changed, 207 insertions(+), 82 deletions(-)
 create mode 100644 evolution/core/run_inputs.py
 create mode 100644 tests/core/test_run_inputs.py

diff --git a/evolution/core/run_inputs.py b/evolution/core/run_inputs.py
new file mode 100644
index 0000000..c47bb0d
--- /dev/null
+++ b/evolution/core/run_inputs.py
@@ -0,0 +1,49 @@
+"""Build the `run_inputs` block written into every gate_decision.json.
+
+The block records every input that produced a given run so a third party
+holding only the gate_decision.json artifact can reproduce the result.
+Centralizing construction here keeps the skill and tool sides from
+drifting — the cost-ceiling fallback in `evolve_tool` historically built
+the block by hand and dropped `enable_confusable_bucket`, breaking the
+deploy-gate schema contract.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from evolution.core.config import EvolutionConfig
+from evolution.core.hermes_provider import resolved_lms_dump
+
+
+def build_run_inputs(
+    *,
+    config: EvolutionConfig,
+    iterations: int,
+    optimizer_model: str,
+    quality_gate_preset: str,
+    eval_source: str,
+    fitness_profile: Optional[str] = None,
+    enable_confusable_bucket: Optional[bool] = None,
+) -> dict[str, Any]:
+    run_inputs: dict[str, Any] = {
+        "seed": config.seed,
+        "iterations": iterations,
+        "optimizer_model": optimizer_model,
+        "reflection_model": config.reflection_model,
+        "eval_model": config.eval_model,
+        "resolved_lms": resolved_lms_dump(
+            optimizer=optimizer_model,
+            reflection=config.reflection_model,
+            eval_=config.eval_model,
+        ),
+        "eval_dataset_size": config.eval_dataset_size,
+        "holdout_ratio": config.holdout_ratio,
+        "quality_gate_preset": quality_gate_preset,
+        "eval_source": eval_source,
+    }
+    if fitness_profile is not None:
+        run_inputs["fitness_profile"] = fitness_profile
+    if enable_confusable_bucket is not None:
+        run_inputs["enable_confusable_bucket"] = enable_confusable_bucket
+    return run_inputs
diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
index ee4fb0c..b0161aa 100644
--- a/evolution/skills/evolve_skill.py
+++ b/evolution/skills/evolve_skill.py
@@ -52,6 +52,7 @@
     write_cost_ceiling_abort,
     write_gate_decision,
 )
+from evolution.core.run_inputs import build_run_inputs
 from evolution.core.skill_sources import discover_skill_sources
 
 # Without this, the BudgetAwareProposer + LMTimingCallback logs stay
@@ -1053,22 +1054,13 @@ def evolve(
                     "messages": [c.message for c in static_constraints if not c.passed],
                     "knee_point": _knee_point_payload(knee_pick),
                     "dataset": _dataset_payload(dataset),
-                    "run_inputs": {
-                        "seed": config.seed,
-                        "iterations": iterations,
-                        "optimizer_model": optimizer_model,
-                        "reflection_model": config.reflection_model,
-                        "eval_model": config.eval_model,
-                        "resolved_lms": resolved_lms_dump(
-                            optimizer=optimizer_model,
-                            reflection=config.reflection_model,
-                            eval_=config.eval_model,
-                        ),
-                        "eval_dataset_size": config.eval_dataset_size,
-                        "holdout_ratio": config.holdout_ratio,
-                        "quality_gate_preset": quality_gate,
-                        "eval_source": eval_source,
-                    },
+                    "run_inputs": build_run_inputs(
+                        config=config,
+                        iterations=iterations,
+                        optimizer_model=optimizer_model,
+                        quality_gate_preset=quality_gate,
+                        eval_source=eval_source,
+                    ),
                 })
                 console.print(f"  Saved failed variant to {failed_path}")
                 return
@@ -1104,22 +1096,13 @@ def evolve(
 
             # Hoist run_inputs to a local — referenced from 3 sites (the
             # two CL-primary abort paths + the main decision_payload).
-            run_inputs = {
-                "seed": config.seed,
-                "iterations": iterations,
-                "optimizer_model": optimizer_model,
-                "reflection_model": config.reflection_model,
-                "eval_model": config.eval_model,
-                "resolved_lms": resolved_lms_dump(
-                    optimizer=optimizer_model,
-                    reflection=config.reflection_model,
-                    eval_=config.eval_model,
-                ),
-                "eval_dataset_size": config.eval_dataset_size,
-                "holdout_ratio": config.holdout_ratio,
-                "quality_gate_preset": quality_gate,
-                "eval_source": eval_source,
-            }
+            run_inputs = build_run_inputs(
+                config=config,
+                iterations=iterations,
+                optimizer_model=optimizer_model,
+                quality_gate_preset=quality_gate,
+                eval_source=eval_source,
+            )
 
             use_cl_primary = (
                 preflight_band == "weak_signal"
@@ -1488,22 +1471,13 @@ def evolve(
             write_cost_ceiling_abort(
                 exc,
                 output_dir=output_dir,
-                run_inputs={
-                    "seed": config.seed,
-                    "iterations": iterations,
-                    "optimizer_model": optimizer_model,
-                    "reflection_model": config.reflection_model,
-                    "eval_model": config.eval_model,
-                    "resolved_lms": resolved_lms_dump(
-                        optimizer=optimizer_model,
-                        reflection=config.reflection_model,
-                        eval_=config.eval_model,
-                    ),
-                    "eval_dataset_size": config.eval_dataset_size,
-                    "holdout_ratio": config.holdout_ratio,
-                    "quality_gate_preset": quality_gate,
-                    "eval_source": eval_source,
-                },
+                run_inputs=build_run_inputs(
+                    config=config,
+                    iterations=iterations,
+                    optimizer_model=optimizer_model,
+                    quality_gate_preset=quality_gate,
+                    eval_source=eval_source,
+                ),
                 schema_version="5",
             )
             return
diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index 7721af8..402ee80 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -70,6 +70,7 @@
     write_cost_ceiling_abort,
     write_gate_decision,
 )
+from evolution.core.run_inputs import build_run_inputs
 from evolution.core.stats import paired_bootstrap
 from evolution.skills.knee_point import CandidatePick, select_knee_point
 from evolution.tools.session_mining import (
@@ -791,23 +792,15 @@ def evolve(
                 if not c.passed:
                     static_pass = False
 
-            run_inputs = {
-                "seed": config.seed,
-                "iterations": iterations,
-                "optimizer_model": optimizer_model,
-                "reflection_model": config.reflection_model,
-                "eval_model": config.eval_model,
-                "resolved_lms": resolved_lms_dump(
-                    optimizer=optimizer_model,
-                    reflection=config.reflection_model,
-                    eval_=config.eval_model,
-                ),
-                "eval_dataset_size": config.eval_dataset_size,
-                "holdout_ratio": config.holdout_ratio,
-                "quality_gate_preset": quality_gate,
-                "fitness_profile": fitness_profile,
-                "enable_confusable_bucket": config.enable_confusable_bucket,
-            }
+            run_inputs = build_run_inputs(
+                config=config,
+                iterations=iterations,
+                optimizer_model=optimizer_model,
+                quality_gate_preset=quality_gate,
+                eval_source=eval_source,
+                fitness_profile=fitness_profile,
+                enable_confusable_bucket=config.enable_confusable_bucket,
+            )
             tool_payload_fields = {
                 "artifact_type": "tool_description",
                 "target_tool": tool_name,
@@ -1226,23 +1219,15 @@ def evolve(
         except CostCeilingExceeded as exc:
             # Abort may fire before `run_inputs` is built in the deploy path;
             # fall back to a minimal equivalent so gate_decision.json is still useful.
-            run_inputs_for_abort = locals().get("run_inputs") or {
-                "seed": config.seed,
-                "iterations": iterations,
-                "optimizer_model": optimizer_model,
-                "reflection_model": config.reflection_model,
-                "eval_model": config.eval_model,
-                "resolved_lms": resolved_lms_dump(
-                    optimizer=optimizer_model,
-                    reflection=config.reflection_model,
-                    eval_=config.eval_model,
-                ),
-                "eval_dataset_size": config.eval_dataset_size,
-                "holdout_ratio": config.holdout_ratio,
-                "quality_gate_preset": quality_gate,
-                "fitness_profile": fitness_profile,
-                "eval_source": eval_source,
-            }
+            run_inputs_for_abort = locals().get("run_inputs") or build_run_inputs(
+                config=config,
+                iterations=iterations,
+                optimizer_model=optimizer_model,
+                quality_gate_preset=quality_gate,
+                eval_source=eval_source,
+                fitness_profile=fitness_profile,
+                enable_confusable_bucket=config.enable_confusable_bucket,
+            )
             write_cost_ceiling_abort(
                 exc,
                 output_dir=output_dir,
diff --git a/tests/core/test_run_inputs.py b/tests/core/test_run_inputs.py
new file mode 100644
index 0000000..e839a39
--- /dev/null
+++ b/tests/core/test_run_inputs.py
@@ -0,0 +1,117 @@
+"""Unit tests for `build_run_inputs`.
+
+The helper centralizes the construction of the `run_inputs` block written
+into every gate_decision.json. Both `evolve_skill` and `evolve_tool` build
+the same nine-key core; the tool side adds two extra keys
+(`fitness_profile`, `enable_confusable_bucket`). Lock the shape so future
+refactors can't silently drop a key — the cost-ceiling fallback in
+`evolve_tool` historically dropped `enable_confusable_bucket`, breaking
+the deploy-gate contract asserted by `TestGateDecisionSchemaOnDeploy`.
+"""
+
+from __future__ import annotations
+
+from evolution.core.config import EvolutionConfig
+from evolution.core.hermes_provider import resolved_lms_dump
+from evolution.core.run_inputs import build_run_inputs
+
+
+def _fake_config() -> EvolutionConfig:
+    return EvolutionConfig(
+        seed=42,
+        reflection_model="openai/gpt-4.1",
+        eval_model="openai/gpt-4.1-mini",
+        eval_dataset_size=150,
+        holdout_ratio=0.5,
+        enable_confusable_bucket=False,
+    )
+
+
+class TestBuildRunInputs:
+    def test_skill_side_has_nine_keys(self):
+        config = _fake_config()
+        result = build_run_inputs(
+            config=config,
+            iterations=10,
+            optimizer_model="openai/gpt-4.1",
+            quality_gate_preset="default",
+            eval_source="synthetic",
+        )
+        assert set(result.keys()) == {
+            "seed",
+            "iterations",
+            "optimizer_model",
+            "reflection_model",
+            "eval_model",
+            "resolved_lms",
+            "eval_dataset_size",
+            "holdout_ratio",
+            "quality_gate_preset",
+            "eval_source",
+        }
+        assert len(result) == 10
+
+    def test_tool_side_adds_fitness_profile_and_confusable_bucket(self):
+        config = _fake_config()
+        config.enable_confusable_bucket = True
+        result = build_run_inputs(
+            config=config,
+            iterations=10,
+            optimizer_model="openai/gpt-4.1",
+            quality_gate_preset="default",
+            eval_source="synthetic",
+            fitness_profile="balanced",
+            enable_confusable_bucket=True,
+        )
+        assert set(result.keys()) == {
+            "seed",
+            "iterations",
+            "optimizer_model",
+            "reflection_model",
+            "eval_model",
+            "resolved_lms",
+            "eval_dataset_size",
+            "holdout_ratio",
+            "quality_gate_preset",
+            "eval_source",
+            "fitness_profile",
+            "enable_confusable_bucket",
+        }
+        assert len(result) == 12
+        assert result["fitness_profile"] == "balanced"
+        assert result["enable_confusable_bucket"] is True
+
+    def test_resolved_lms_matches_helper_output(self):
+        config = _fake_config()
+        result = build_run_inputs(
+            config=config,
+            iterations=10,
+            optimizer_model="openai/gpt-4.1",
+            quality_gate_preset="default",
+            eval_source="synthetic",
+        )
+        expected = resolved_lms_dump(
+            optimizer="openai/gpt-4.1",
+            reflection=config.reflection_model,
+            eval_=config.eval_model,
+        )
+        assert result["resolved_lms"] == expected
+
+    def test_enable_confusable_bucket_round_trips_when_passed(self):
+        # Regression: the cost-ceiling fallback in evolve_tool historically
+        # built run_inputs without `enable_confusable_bucket`, which broke
+        # `TestGateDecisionSchemaOnDeploy::test_gate_decision_schema_on_deploy`
+        # whenever the cost ceiling tripped on a deploy path.
+        config = _fake_config()
+        config.enable_confusable_bucket = True
+        result = build_run_inputs(
+            config=config,
+            iterations=10,
+            optimizer_model="openai/gpt-4.1",
+            quality_gate_preset="default",
+            eval_source="synthetic",
+            fitness_profile="balanced",
+            enable_confusable_bucket=config.enable_confusable_bucket,
+        )
+        assert "enable_confusable_bucket" in result
+        assert result["enable_confusable_bucket"] is True

From c4610c1458a3bd4438ddf001f04f2b24c7abc4f1 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 22:04:45 -0600
Subject: [PATCH 2/5] polish: trim build_run_inputs docstrings and tighten
 tests

Address review feedback on 47cabe8b:
- Drop historical narrative from helper and test module docstrings;
  keep the load-bearing "what does this produce" sentence.
- Reword the comment in
  test_enable_confusable_bucket_round_trips_when_passed to honestly
  describe what the test covers (helper round-trip only, not the
  cost-ceiling call site).
- Remove redundant len() assertions in shape tests (the set equality
  already pins the count) and rename test_skill_side_has_nine_keys to
  test_skill_side_shape (the skill-side block has ten keys, not nine).
- Remove stale "hoist run_inputs to a local" comment; the helper
  call makes the intent obvious.
---
 evolution/core/run_inputs.py     |  4 ----
 evolution/skills/evolve_skill.py |  2 --
 tests/core/test_run_inputs.py    | 22 +++++-----------------
 3 files changed, 5 insertions(+), 23 deletions(-)

diff --git a/evolution/core/run_inputs.py b/evolution/core/run_inputs.py
index c47bb0d..7862b1c 100644
--- a/evolution/core/run_inputs.py
+++ b/evolution/core/run_inputs.py
@@ -2,10 +2,6 @@
 
 The block records every input that produced a given run so a third party
 holding only the gate_decision.json artifact can reproduce the result.
-Centralizing construction here keeps the skill and tool sides from
-drifting — the cost-ceiling fallback in `evolve_tool` historically built
-the block by hand and dropped `enable_confusable_bucket`, breaking the
-deploy-gate schema contract.
 """
 
 from __future__ import annotations
diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
index b0161aa..fc7aca3 100644
--- a/evolution/skills/evolve_skill.py
+++ b/evolution/skills/evolve_skill.py
@@ -1094,8 +1094,6 @@ def evolve(
             evolved_chars = len(evolved_full)
             growth_pct = (evolved_chars - baseline_chars) / max(1, baseline_chars)
 
-            # Hoist run_inputs to a local — referenced from 3 sites (the
-            # two CL-primary abort paths + the main decision_payload).
             run_inputs = build_run_inputs(
                 config=config,
                 iterations=iterations,
diff --git a/tests/core/test_run_inputs.py b/tests/core/test_run_inputs.py
index e839a39..e525d55 100644
--- a/tests/core/test_run_inputs.py
+++ b/tests/core/test_run_inputs.py
@@ -1,13 +1,4 @@
-"""Unit tests for `build_run_inputs`.
-
-The helper centralizes the construction of the `run_inputs` block written
-into every gate_decision.json. Both `evolve_skill` and `evolve_tool` build
-the same nine-key core; the tool side adds two extra keys
-(`fitness_profile`, `enable_confusable_bucket`). Lock the shape so future
-refactors can't silently drop a key — the cost-ceiling fallback in
-`evolve_tool` historically dropped `enable_confusable_bucket`, breaking
-the deploy-gate contract asserted by `TestGateDecisionSchemaOnDeploy`.
-"""
+"""Unit tests for `build_run_inputs`."""
 
 from __future__ import annotations
 
@@ -28,7 +19,7 @@ def _fake_config() -> EvolutionConfig:
 
 
 class TestBuildRunInputs:
-    def test_skill_side_has_nine_keys(self):
+    def test_skill_side_shape(self):
         config = _fake_config()
         result = build_run_inputs(
             config=config,
@@ -49,7 +40,6 @@ def test_skill_side_has_nine_keys(self):
             "quality_gate_preset",
             "eval_source",
         }
-        assert len(result) == 10
 
     def test_tool_side_adds_fitness_profile_and_confusable_bucket(self):
         config = _fake_config()
@@ -77,7 +67,6 @@ def test_tool_side_adds_fitness_profile_and_confusable_bucket(self):
             "fitness_profile",
             "enable_confusable_bucket",
         }
-        assert len(result) == 12
         assert result["fitness_profile"] == "balanced"
         assert result["enable_confusable_bucket"] is True
 
@@ -98,10 +87,9 @@ def test_resolved_lms_matches_helper_output(self):
         assert result["resolved_lms"] == expected
 
     def test_enable_confusable_bucket_round_trips_when_passed(self):
-        # Regression: the cost-ceiling fallback in evolve_tool historically
-        # built run_inputs without `enable_confusable_bucket`, which broke
-        # `TestGateDecisionSchemaOnDeploy::test_gate_decision_schema_on_deploy`
-        # whenever the cost ceiling tripped on a deploy path.
+        # Helper round-trip only. Call-site coverage that the deploy-gate
+        # paths actually pass this kwarg lives in
+        # `TestGateDecisionSchemaOnDeploy::test_gate_decision_schema_on_deploy`.
         config = _fake_config()
         config.enable_confusable_bucket = True
         result = build_run_inputs(

From 5edd73dbc7d1876941771a061448164934622796 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 22:11:58 -0600
Subject: [PATCH 3/5] refactor: extract append_cl_decision_fields to
 evolution.core.quality_gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The same 9-field CL-decision mutation block was duplicated in
evolve_tool.py and evolve_skill.py, differing only in comments. Move it
to a shared helper so the deploy-path call sites collapse to one call
each and the CL_PRIMARY_* / math.ceil details live in one place.

Scope is the success-path block only. The cl_eval_failed / cl_eval_incomplete
abort callsites also write CL-related fields, but with a different shape
(flat dict literal, no synthetic_sanity_check, no cl_tasks_gained /
cl_required_gain) — forcing them through this helper would either inflate
the abort payload or require ad-hoc kwargs that obscure the contract. Left
as a follow-up.

evolved_cl_errored_task_ids defaults to () so the deploy-path callers,
which have no errored tasks by construction, can omit it; a later
abort-path adoption can pass the populated list without changing the
signature.
---
 evolution/core/quality_gate.py               | 48 ++++++++++-
 evolution/skills/evolve_skill.py             | 39 +++------
 evolution/tools/evolve_tool.py               | 41 +++-------
 tests/core/test_append_cl_decision_fields.py | 85 ++++++++++++++++++++
 4 files changed, 156 insertions(+), 57 deletions(-)
 create mode 100644 tests/core/test_append_cl_decision_fields.py

diff --git a/evolution/core/quality_gate.py b/evolution/core/quality_gate.py
index 426af86..a87e992 100644
--- a/evolution/core/quality_gate.py
+++ b/evolution/core/quality_gate.py
@@ -12,7 +12,7 @@
 import subprocess
 import time
 from pathlib import Path
-from typing import Any
+from typing import Any, Optional, Sequence
 
 from rich.console import Console
 
@@ -97,6 +97,52 @@ def _check_cl_primary_gate(
     )
 
 
+def append_cl_decision_fields(
+    decision_payload: dict,
+    *,
+    cached_baseline_cl_per_example: list[float],
+    evolved_cl_per_example: list[float],
+    avg_baseline: float,
+    avg_evolved: float,
+    growth_pct: float,
+    cl_eval_cost_usd: float,
+    preflight_holdout_score: Optional[float],
+    preflight_cl_score: Optional[float],
+    closed_loop_agent_model: str,
+    evolved_cl_errored_task_ids: Sequence = (),
+) -> None:
+    """In-place mutation: adds the 9 closed-loop fields to ``decision_payload``.
+
+    ``evolved_cl_errored_task_ids`` defaults to ``()`` so the deploy-path
+    caller (no errors by construction) can omit the kwarg; future abort-path
+    callers can pass the populated list without a separate code path.
+    """
+    decision_payload["baseline_closed_loop_per_example"] = cached_baseline_cl_per_example
+    decision_payload["evolved_closed_loop_per_example"] = evolved_cl_per_example
+    decision_payload["evolved_closed_loop_errored_tasks"] = list(evolved_cl_errored_task_ids)
+    decision_payload["cl_tasks_gained"] = (
+        int(sum(evolved_cl_per_example)) - int(sum(cached_baseline_cl_per_example))
+    )
+    decision_payload["cl_required_gain"] = max(
+        1,
+        math.ceil(
+            max(0.0, CL_PRIMARY_GROWTH_SLOPE * (growth_pct - CL_PRIMARY_GROWTH_FREE_THRESHOLD))
+        ),
+    )
+    decision_payload["synthetic_sanity_check"] = {
+        "tolerance": CL_PRIMARY_SYNTH_TOLERANCE,
+        "baseline_mean": avg_baseline,
+        "evolved_mean": avg_evolved,
+        "passed": (avg_evolved - avg_baseline) >= -CL_PRIMARY_SYNTH_TOLERANCE,
+    }
+    decision_payload["evolved_cl_eval_cost_usd"] = cl_eval_cost_usd
+    decision_payload["band_trigger_score"] = {
+        "holdout": preflight_holdout_score,
+        "closed_loop": preflight_cl_score,
+    }
+    decision_payload["validator_agent_model"] = closed_loop_agent_model
+
+
 # `default` is calibrated against the obsidian deploy (+24.2% growth,
 # ~+0.07 expected improvement). `off` disables the slope/ceiling checks
 # but still enforces bootstrap.mean ≥ 0 — see deprecation warning when
diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
index fc7aca3..b81c3b6 100644
--- a/evolution/skills/evolve_skill.py
+++ b/evolution/skills/evolve_skill.py
@@ -8,7 +8,6 @@
 import difflib
 import json
 import logging
-import math
 import random
 import sys
 import time
@@ -42,11 +41,9 @@
     resolved_lms_dump,
 )
 from evolution.core.quality_gate import (
-    CL_PRIMARY_GROWTH_FREE_THRESHOLD,
-    CL_PRIMARY_GROWTH_SLOPE,
-    CL_PRIMARY_SYNTH_TOLERANCE,
     QUALITY_GATE_PRESETS,
     _check_cl_primary_gate,
+    append_cl_decision_fields,
     resolve_proposer_mode,
     run_benchmark_hook,
     write_cost_ceiling_abort,
@@ -1344,30 +1341,18 @@ def evolve(
                 decision_payload["benchmark"] = benchmark_block
 
             if use_cl_primary:
-                decision_payload["baseline_closed_loop_per_example"] = cached_baseline_cl_per_example
-                decision_payload["evolved_closed_loop_per_example"] = evolved_cl_per_example
-                decision_payload["evolved_closed_loop_errored_tasks"] = []  # populated only on abort path
-                decision_payload["cl_tasks_gained"] = (
-                    int(sum(evolved_cl_per_example)) - int(sum(cached_baseline_cl_per_example))
-                )
-                decision_payload["cl_required_gain"] = max(
-                    1,
-                    math.ceil(
-                        max(0.0, CL_PRIMARY_GROWTH_SLOPE * (growth_pct - CL_PRIMARY_GROWTH_FREE_THRESHOLD))
-                    ),
+                append_cl_decision_fields(
+                    decision_payload,
+                    cached_baseline_cl_per_example=cached_baseline_cl_per_example,
+                    evolved_cl_per_example=evolved_cl_per_example,
+                    avg_baseline=avg_baseline,
+                    avg_evolved=avg_evolved,
+                    growth_pct=growth_pct,
+                    cl_eval_cost_usd=cl_eval_cost_usd,
+                    preflight_holdout_score=preflight_holdout_score,
+                    preflight_cl_score=preflight_cl_score,
+                    closed_loop_agent_model=closed_loop_agent_model,
                 )
-                decision_payload["synthetic_sanity_check"] = {
-                    "tolerance": CL_PRIMARY_SYNTH_TOLERANCE,
-                    "baseline_mean": avg_baseline,
-                    "evolved_mean": avg_evolved,
-                    "passed": (avg_evolved - avg_baseline) >= -CL_PRIMARY_SYNTH_TOLERANCE,
-                }
-                decision_payload["evolved_cl_eval_cost_usd"] = cl_eval_cost_usd
-                decision_payload["band_trigger_score"] = {
-                    "holdout": preflight_holdout_score,
-                    "closed_loop": preflight_cl_score,
-                }
-                decision_payload["validator_agent_model"] = closed_loop_agent_model
 
             if not use_cl_primary and preflight_band is None:
                 # User passed --no-saturation-check; record why CL-primary
diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index 402ee80..af7cff1 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -10,7 +10,6 @@
 import difflib
 import json
 import logging
-import math
 import sys
 import time
 from datetime import datetime
@@ -60,11 +59,9 @@
     register_litellm_failure_callback,
 )
 from evolution.core.quality_gate import (
-    CL_PRIMARY_GROWTH_FREE_THRESHOLD,
-    CL_PRIMARY_GROWTH_SLOPE,
-    CL_PRIMARY_SYNTH_TOLERANCE,
     QUALITY_GATE_PRESETS,
     _check_cl_primary_gate,
+    append_cl_decision_fields,
     resolve_proposer_mode,
     run_benchmark_hook,
     write_cost_ceiling_abort,
@@ -1078,32 +1075,18 @@ def evolve(
             if benchmark_block is not None:
                 decision_payload["benchmark"] = benchmark_block
             if use_cl_primary:
-                decision_payload["baseline_closed_loop_per_example"] = cached_baseline_cl_per_example
-                decision_payload["evolved_closed_loop_per_example"] = evolved_cl_per_example
-                # Populated only on the abort path (cl_eval_incomplete); empty
-                # here because we reach this block only when no task errored.
-                decision_payload["evolved_closed_loop_errored_tasks"] = []
-                decision_payload["cl_tasks_gained"] = (
-                    int(sum(evolved_cl_per_example)) - int(sum(cached_baseline_cl_per_example))
-                )
-                decision_payload["cl_required_gain"] = max(
-                    1,
-                    math.ceil(
-                        max(0.0, CL_PRIMARY_GROWTH_SLOPE * (growth_pct - CL_PRIMARY_GROWTH_FREE_THRESHOLD))
-                    ),
+                append_cl_decision_fields(
+                    decision_payload,
+                    cached_baseline_cl_per_example=cached_baseline_cl_per_example,
+                    evolved_cl_per_example=evolved_cl_per_example,
+                    avg_baseline=avg_baseline,
+                    avg_evolved=avg_evolved,
+                    growth_pct=growth_pct,
+                    cl_eval_cost_usd=cl_eval_cost_usd,
+                    preflight_holdout_score=preflight_holdout_score,
+                    preflight_cl_score=preflight_cl_score,
+                    closed_loop_agent_model=closed_loop_agent_model,
                 )
-                decision_payload["synthetic_sanity_check"] = {
-                    "tolerance": CL_PRIMARY_SYNTH_TOLERANCE,
-                    "baseline_mean": avg_baseline,
-                    "evolved_mean": avg_evolved,
-                    "passed": (avg_evolved - avg_baseline) >= -CL_PRIMARY_SYNTH_TOLERANCE,
-                }
-                decision_payload["evolved_cl_eval_cost_usd"] = cl_eval_cost_usd
-                decision_payload["band_trigger_score"] = {
-                    "holdout": preflight_holdout_score,
-                    "closed_loop": preflight_cl_score,
-                }
-                decision_payload["validator_agent_model"] = closed_loop_agent_model
 
             if not use_cl_primary and preflight_band is None:
                 # User passed --no-saturation-check; record why CL-primary
diff --git a/tests/core/test_append_cl_decision_fields.py b/tests/core/test_append_cl_decision_fields.py
new file mode 100644
index 0000000..597ba28
--- /dev/null
+++ b/tests/core/test_append_cl_decision_fields.py
@@ -0,0 +1,85 @@
+"""Unit tests for `append_cl_decision_fields`."""
+
+from __future__ import annotations
+
+import math
+
+from evolution.core.quality_gate import (
+    CL_PRIMARY_GROWTH_FREE_THRESHOLD,
+    CL_PRIMARY_GROWTH_SLOPE,
+    CL_PRIMARY_SYNTH_TOLERANCE,
+    append_cl_decision_fields,
+)
+
+
+def _call(payload: dict, **overrides) -> None:
+    kwargs = dict(
+        cached_baseline_cl_per_example=[1.0, 0.0, 1.0],
+        evolved_cl_per_example=[1.0, 1.0, 1.0],
+        avg_baseline=0.60,
+        avg_evolved=0.65,
+        growth_pct=0.25,
+        cl_eval_cost_usd=0.0123,
+        preflight_holdout_score=0.7,
+        preflight_cl_score=0.4,
+        closed_loop_agent_model="openai/gpt-4.1-mini",
+    )
+    kwargs.update(overrides)
+    append_cl_decision_fields(payload, **kwargs)
+
+
+class TestAppendClDecisionFields:
+    def test_all_nine_fields_added(self):
+        payload: dict = {}
+        _call(payload)
+        assert set(payload.keys()) == {
+            "baseline_closed_loop_per_example",
+            "evolved_closed_loop_per_example",
+            "evolved_closed_loop_errored_tasks",
+            "cl_tasks_gained",
+            "cl_required_gain",
+            "synthetic_sanity_check",
+            "evolved_cl_eval_cost_usd",
+            "band_trigger_score",
+            "validator_agent_model",
+        }
+        assert payload["cl_tasks_gained"] == 3 - 2
+        assert payload["evolved_cl_eval_cost_usd"] == 0.0123
+        assert payload["band_trigger_score"] == {"holdout": 0.7, "closed_loop": 0.4}
+        assert payload["validator_agent_model"] == "openai/gpt-4.1-mini"
+        sanity = payload["synthetic_sanity_check"]
+        assert sanity["tolerance"] == CL_PRIMARY_SYNTH_TOLERANCE
+        assert sanity["baseline_mean"] == 0.60
+        assert sanity["evolved_mean"] == 0.65
+        assert sanity["passed"] is True
+
+    def test_errored_task_ids_defaults_to_empty_list(self):
+        payload: dict = {}
+        _call(payload)
+        assert payload["evolved_closed_loop_errored_tasks"] == []
+
+    def test_cl_required_gain_uses_constants_not_magic_numbers(self):
+        # Pin the formula so silent constant→literal substitutions break this test.
+        payload: dict = {}
+        _call(payload, growth_pct=CL_PRIMARY_GROWTH_FREE_THRESHOLD + 0.5)
+        expected = max(1, math.ceil(CL_PRIMARY_GROWTH_SLOPE * 0.5))
+        assert payload["cl_required_gain"] == expected
+
+    def test_synthetic_sanity_check_passed_reflects_tolerance(self):
+        # Exactly at the boundary: passes. NOTE(review): float-sensitive — confirm for the TOL in use.
+        boundary_payload: dict = {}
+        _call(
+            boundary_payload,
+            avg_baseline=0.50,
+            avg_evolved=0.50 - CL_PRIMARY_SYNTH_TOLERANCE,
+        )
+        assert boundary_payload["synthetic_sanity_check"]["passed"] is True
+
+        # Just past the boundary: fails.
+        over_payload: dict = {}
+        _call(
+            over_payload,
+            avg_baseline=0.50,
+            avg_evolved=0.50 - CL_PRIMARY_SYNTH_TOLERANCE - 0.001,
+        )
+        assert over_payload["synthetic_sanity_check"]["passed"] is False

From 63889479e1b39193735f2c4e82d2dc3afb315a95 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 22:19:50 -0600
Subject: [PATCH 4/5] polish: drop speculative evolved_cl_errored_task_ids
 kwarg

Address review feedback on 5edd73db:
- Drop evolved_cl_errored_task_ids kwarg and its Sequence import.
  The deferred abort paths have a different scaffold and won't adopt
  this helper; the kwarg + its forward-looking docstring were YAGNI.
- Hard-code evolved_closed_loop_errored_tasks = [] in the helper body
  (matches deploy-path semantics: success path has no errors by
  construction).
- Trim docstring to one line.
- Rename test_errored_task_ids_defaults_to_empty_list to
  test_errored_tasks_is_empty_list (no longer about a default).
---
 evolution/core/quality_gate.py               | 12 +++---------
 tests/core/test_append_cl_decision_fields.py |  2 +-
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/evolution/core/quality_gate.py b/evolution/core/quality_gate.py
index a87e992..6800050 100644
--- a/evolution/core/quality_gate.py
+++ b/evolution/core/quality_gate.py
@@ -12,7 +12,7 @@
 import subprocess
 import time
 from pathlib import Path
-from typing import Any, Optional, Sequence
+from typing import Any, Optional
 
 from rich.console import Console
 
@@ -109,17 +109,11 @@ def append_cl_decision_fields(
     preflight_holdout_score: Optional[float],
     preflight_cl_score: Optional[float],
     closed_loop_agent_model: str,
-    evolved_cl_errored_task_ids: Sequence = (),
 ) -> None:
-    """In-place mutation: adds the 9 closed-loop fields to ``decision_payload``.
-
-    ``evolved_cl_errored_task_ids`` defaults to ``()`` so the deploy-path
-    caller (no errors by construction) can omit the kwarg; future abort-path
-    callers can pass the populated list without a separate code path.
-    """
+    """Append the closed-loop deploy-gate decision fields to ``decision_payload``."""
     decision_payload["baseline_closed_loop_per_example"] = cached_baseline_cl_per_example
     decision_payload["evolved_closed_loop_per_example"] = evolved_cl_per_example
-    decision_payload["evolved_closed_loop_errored_tasks"] = list(evolved_cl_errored_task_ids)
+    decision_payload["evolved_closed_loop_errored_tasks"] = []
     decision_payload["cl_tasks_gained"] = (
         int(sum(evolved_cl_per_example)) - int(sum(cached_baseline_cl_per_example))
     )
diff --git a/tests/core/test_append_cl_decision_fields.py b/tests/core/test_append_cl_decision_fields.py
index 597ba28..6219c0e 100644
--- a/tests/core/test_append_cl_decision_fields.py
+++ b/tests/core/test_append_cl_decision_fields.py
@@ -53,7 +53,7 @@ def test_all_nine_fields_added(self):
         assert sanity["evolved_mean"] == 0.65
         assert sanity["passed"] is True
 
-    def test_errored_task_ids_defaults_to_empty_list(self):
+    def test_errored_tasks_is_empty_list(self):
         payload: dict = {}
         _call(payload)
         assert payload["evolved_closed_loop_errored_tasks"] == []

From e2d0ab9e62e1b5260bbf26b1977087e30cb886c9 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 22:30:13 -0600
Subject: [PATCH 5/5] fix: surface CL gains in summary panel when CL-primary
 deployed

The panel rendered "did not improve" on a CL-primary deploy because the
synthetic delta is often near zero (or negative) when the closed-loop
gate is the one driving the decision. Now under use_cl_primary the
deploy line announces the CL task gain and the reject line names the
shortfall vs the required gain; the Evolution Results table picks up a
"Closed-loop (behavioral)" row and colors all rows by the gate verdict
rather than the irrelevant synthetic delta.
---
 evolution/skills/evolve_skill.py              | 33 ++++++-
 evolution/tools/evolve_tool.py                | 32 ++++++-
 .../skills/test_evolve_skill_cl_aware_gate.py | 96 +++++++++++++++++++
 tests/tools/test_evolve_tool_cl_aware_gate.py | 87 +++++++++++++++++
 4 files changed, 242 insertions(+), 6 deletions(-)

diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
index b81c3b6..4a2dff8 100644
--- a/evolution/skills/evolve_skill.py
+++ b/evolution/skills/evolve_skill.py
@@ -1364,6 +1364,12 @@ def evolve(
 
             if not growth_pass:
                 console.print("[red]✗ Evolved skill REJECTED by quality gate — not deploying[/red]")
+                if use_cl_primary:
+                    console.print(
+                        f"[yellow]⚠ Evolution rejected: "
+                        f"CL gain {decision_payload['cl_tasks_gained']} < "
+                        f"required {decision_payload['cl_required_gain']}[/yellow]"
+                    )
                 failed_path = output_dir / "evolved_FAILED.md"
                 failed_path.write_text(evolved_full)
                 console.print(f"  Saved failed variant to {failed_path}")
@@ -1386,13 +1392,28 @@ def evolve(
             table.add_column("Evolved", justify="right")
             table.add_column("Change", justify="right")
 
-            change_color = "green" if improvement > 0 else "red"
+            # Under CL-primary, the gate verdict — not the synthetic delta —
+            # decides the row color; the synthetic delta is informational.
+            row_color = (
+                ("green" if growth_pass else "yellow")
+                if use_cl_primary
+                else ("green" if improvement > 0 else "red")
+            )
             table.add_row(
                 "Holdout Score",
                 f"{avg_baseline:.3f}",
                 f"{avg_evolved:.3f}",
-                f"[{change_color}]{improvement:+.3f}[/{change_color}]",
+                f"[{row_color}]{improvement:+.3f}[/{row_color}]",
             )
+            if use_cl_primary:
+                baseline_cl = int(sum(cached_baseline_cl_per_example))
+                evolved_cl = int(sum(evolved_cl_per_example))
+                table.add_row(
+                    "Closed-loop (behavioral)",
+                    f"{baseline_cl} tasks",
+                    f"{evolved_cl} tasks",
+                    f"[{row_color}]{evolved_cl - baseline_cl:+d} tasks[/{row_color}]",
+                )
             table.add_row(
                 "Skill Size",
                 f"{len(skill['body']):,} chars",
@@ -1444,7 +1465,13 @@ def evolve(
                     if applied:
                         console.print(f"  --apply: wrote evolved skill to {skill_path}")
 
-            if improvement > 0:
+            if use_cl_primary:
+                console.print(
+                    f"\n[bold green]✓ Evolution improved skill "
+                    f"(CL gained +{decision_payload['cl_tasks_gained']} tasks)[/bold green]"
+                )
+                console.print(f"  Review the diff: diff {output_dir}/baseline_skill.md {output_dir}/evolved_skill.md")
+            elif improvement > 0:
                 console.print(f"\n[bold green]✓ Evolution improved skill by {improvement:+.3f} ({improvement/max(0.001, avg_baseline)*100:+.1f}%)[/bold green]")
                 console.print(f"  Review the diff: diff {output_dir}/baseline_skill.md {output_dir}/evolved_skill.md")
             else:
diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index af7cff1..40759bd 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -1099,6 +1099,12 @@ def evolve(
 
             if not growth_pass:
                 console.print("[red]✗ Evolved description REJECTED by quality gate — not deploying[/red]")
+                if use_cl_primary:
+                    console.print(
+                        f"[yellow]⚠ Evolution rejected: "
+                        f"CL gain {decision_payload['cl_tasks_gained']} < "
+                        f"required {decision_payload['cl_required_gain']}[/yellow]"
+                    )
                 evolved_manifest = manifest.replace_description(tool_name, evolved_description)
                 failed_path = output_dir / "evolved_FAILED.json"
                 failed_path.write_text(json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n")
@@ -1121,13 +1127,28 @@ def evolve(
             table.add_column("Baseline", justify="right")
             table.add_column("Evolved", justify="right")
             table.add_column("Change", justify="right")
-            change_color = "green" if improvement > 0 else "red"
+            # Under CL-primary, the gate verdict — not the synthetic delta —
+            # decides the row color; the synthetic delta is informational.
+            row_color = (
+                ("green" if growth_pass else "yellow")
+                if use_cl_primary
+                else ("green" if improvement > 0 else "red")
+            )
             table.add_row(
                 "Holdout Score",
                 f"{avg_baseline:.3f}",
                 f"{avg_evolved:.3f}",
-                f"[{change_color}]{improvement:+.3f}[/{change_color}]",
+                f"[{row_color}]{improvement:+.3f}[/{row_color}]",
             )
+            if use_cl_primary:
+                baseline_cl = int(sum(cached_baseline_cl_per_example))
+                evolved_cl = int(sum(evolved_cl_per_example))
+                table.add_row(
+                    "Closed-loop (behavioral)",
+                    f"{baseline_cl} tasks",
+                    f"{evolved_cl} tasks",
+                    f"[{row_color}]{evolved_cl - baseline_cl:+d} tasks[/{row_color}]",
+                )
             table.add_row(
                 "Description Size",
                 f"{baseline_chars:,} chars",
@@ -1188,7 +1209,12 @@ def evolve(
                 )
                 console.print(f"  --apply: wrote evolved description to {manifest_path}")
 
-            if improvement > 0:
+            if use_cl_primary:
+                console.print(
+                    f"\n[bold green]✓ Evolution improved tool description "
+                    f"(CL gained +{decision_payload['cl_tasks_gained']} tasks)[/bold green]"
+                )
+            elif improvement > 0:
                 console.print(
                     f"\n[bold green]✓ Evolution improved tool description by "
                     f"{improvement:+.3f} ({improvement/max(0.001, avg_baseline)*100:+.1f}%)[/bold green]"
diff --git a/tests/skills/test_evolve_skill_cl_aware_gate.py b/tests/skills/test_evolve_skill_cl_aware_gate.py
index 421352f..b285c9d 100644
--- a/tests/skills/test_evolve_skill_cl_aware_gate.py
+++ b/tests/skills/test_evolve_skill_cl_aware_gate.py
@@ -905,3 +905,99 @@ def test_skill_v4_payload_fields_preserved_in_v5_cl_primary(
         f"knee_point keys: {list(knee.keys())}"
     )
     assert isinstance(knee["band_roster"], list)
+
+
+def test_summary_panel_reflects_cl_decision_when_cl_primary_deploys(
+    skill_dir: Path,
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+    capsys: pytest.CaptureFixture,
+):
+    """CL-primary deploy → summary line announces the CL gain instead of
+    the synthetic delta. Without the CL-aware branch the panel says
+    'did not improve' even though gate_decision.json deployed the artifact,
+    so the operator gets a contradictory signal."""
+    monkeypatch.chdir(tmp_path)
+    fake_cache = MagicMock()
+    fake_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=[True, True, True, True, True, False, False],
+        evolved_pass=[True, True, True, True, True, True, True],
+    )
+
+    # _weak_signal_report pins baseline holdout to 0.95; evolved=0.90
+    # forces a negative synthetic improvement so the pre-change panel
+    # would render 'did not improve' even though CL-primary just deployed.
+    with _patch_stack(
+        sat_report=_weak_signal_report(),
+        fake_cache=fake_cache,
+        evolved_body=_LOW_GROWTH_BODY,
+        holdout_evolved_mean=0.90,
+    ):
+        _run_evolve(skill_dir=skill_dir)
+
+    out = capsys.readouterr().out
+    assert "CL gained +2" in out, f"missing CL-gain line in summary: {out!r}"
+    assert "did not improve" not in out, (
+        f"synthetic 'did not improve' line leaked through CL-primary deploy: {out!r}"
+    )
+
+
+def test_summary_panel_reflects_cl_decision_when_cl_primary_rejects(
+    skill_dir: Path,
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+    capsys: pytest.CaptureFixture,
+):
+    """CL-primary reject → summary names the CL shortfall (gain vs required)
+    in addition to the generic quality-gate REJECTED line."""
+    monkeypatch.chdir(tmp_path)
+    fake_cache = MagicMock()
+    # 5/7 → 5/7: zero CL gain, required_gain stays at 1 → reject.
+    fake_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=[True, True, True, True, True, False, False],
+        evolved_pass=[True, True, True, True, True, False, False],
+    )
+
+    with _patch_stack(
+        sat_report=_weak_signal_report(),
+        fake_cache=fake_cache,
+        evolved_body=_LOW_GROWTH_BODY,
+    ):
+        _run_evolve(skill_dir=skill_dir)
+
+    out = capsys.readouterr().out
+    assert "CL gain 0 < required 1" in out, (
+        f"missing CL-reject line in summary: {out!r}"
+    )
+
+
+def test_summary_panel_uses_synthetic_delta_when_not_cl_primary(
+    skill_dir: Path,
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+    capsys: pytest.CaptureFixture,
+):
+    """healthy band → synthetic gate → existing 'improved/did not improve'
+    wording is unchanged. Regression guard for the synthetic path.
+
+    _healthy_report() pins baseline holdout to 0.5 (cached per-example);
+    evolved=0.5 produces a zero synthetic delta that still clears the
+    non-inferiority gate (within tolerance) so the deploy-path 'did not
+    improve' line fires — that's the legacy branch we must preserve.
+    """
+    monkeypatch.chdir(tmp_path)
+    fake_cache = MagicMock()
+
+    with _patch_stack(
+        sat_report=_healthy_report(),
+        fake_cache=fake_cache,
+        holdout_evolved_mean=0.5,
+    ):
+        _run_evolve(skill_dir=skill_dir)
+
+    out = capsys.readouterr().out
+    assert "did not improve" in out, (
+        f"synthetic path must keep 'did not improve' on a zero delta: {out!r}"
+    )
+    assert "CL gained" not in out
+    assert "CL gain" not in out
diff --git a/tests/tools/test_evolve_tool_cl_aware_gate.py b/tests/tools/test_evolve_tool_cl_aware_gate.py
index c5dd86f..20db7b3 100644
--- a/tests/tools/test_evolve_tool_cl_aware_gate.py
+++ b/tests/tools/test_evolve_tool_cl_aware_gate.py
@@ -873,3 +873,90 @@ def test_static_constraint_failure_payload_has_schema_v5_and_decision_signal(
         assert payload["decision_signal"] == "synthetic"
         assert payload["decision"] == "reject"
         assert payload["reason"] == "static_constraint_failure"
+
+
+def test_summary_panel_reflects_cl_decision_when_cl_primary_deploys(
+    temp_manifest: Path, tmp_path: Path, capsys: pytest.CaptureFixture,
+):
+    """CL-primary deploy → summary line announces the CL gain instead of
+    the synthetic delta. Without the CL-aware branch the panel says
+    'did not improve' even though gate_decision.json deployed the artifact,
+    so the operator gets a contradictory signal."""
+    fake_cache = MagicMock()
+    fake_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=[True, True, True, True, True, False, False],
+        evolved_pass=[True, True, True, True, True, True, True],
+    )
+    run_dir = tmp_path / "run"
+
+    # _weak_signal_report pins baseline holdout to 0.95; evolved=0.90
+    # forces a negative synthetic improvement so the pre-change panel
+    # would render 'did not improve' even though CL-primary just deployed.
+    with _patch_stack(
+        sat_report=_weak_signal_report(),
+        fake_cache=fake_cache,
+        evolved_description=_LOW_GROWTH_EVOLVED,
+        holdout_evolved_mean=0.90,
+    ):
+        _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+    out = capsys.readouterr().out
+    assert "CL gained +2" in out, f"missing CL-gain line in summary: {out!r}"
+    assert "did not improve" not in out, (
+        f"synthetic 'did not improve' line leaked through CL-primary deploy: {out!r}"
+    )
+
+
+def test_summary_panel_reflects_cl_decision_when_cl_primary_rejects(
+    temp_manifest: Path, tmp_path: Path, capsys: pytest.CaptureFixture,
+):
+    """CL-primary reject → summary names the CL shortfall (gain vs required)
+    in addition to the generic quality-gate REJECTED line."""
+    fake_cache = MagicMock()
+    # 5/7 → 5/7: zero CL gain, required_gain stays at 1 → reject.
+    fake_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=[True, True, True, True, True, False, False],
+        evolved_pass=[True, True, True, True, True, False, False],
+    )
+    run_dir = tmp_path / "run"
+
+    with _patch_stack(
+        sat_report=_weak_signal_report(),
+        fake_cache=fake_cache,
+        evolved_description=_LOW_GROWTH_EVOLVED,
+    ):
+        _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+    out = capsys.readouterr().out
+    assert "CL gain 0 < required 1" in out, (
+        f"missing CL-reject line in summary: {out!r}"
+    )
+
+
+def test_summary_panel_uses_synthetic_delta_when_not_cl_primary(
+    temp_manifest: Path, tmp_path: Path, capsys: pytest.CaptureFixture,
+):
+    """healthy band → synthetic gate → existing 'improved/did not improve'
+    wording is unchanged. Regression guard for the synthetic path.
+
+    _healthy_report() pins baseline holdout to 0.5 (cached per-example);
+    evolved=0.5 produces a zero synthetic delta that still clears the
+    non-inferiority gate (within tolerance) so the deploy-path 'did not
+    improve' line fires — that's the legacy branch we must preserve.
+    """
+    fake_cache = MagicMock()
+    run_dir = tmp_path / "run"
+
+    with _patch_stack(
+        sat_report=_healthy_report(),
+        fake_cache=fake_cache,
+        holdout_evolved_mean=0.5,
+    ):
+        _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+    out = capsys.readouterr().out
+    assert "did not improve" in out, (
+        f"synthetic path must keep 'did not improve' on a zero delta: {out!r}"
+    )
+    assert "CL gained" not in out
+    assert "CL gain" not in out