From deb952b5784f9e7b1c0464443c99b366e8ef1d4d Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 08:06:19 -0600
Subject: [PATCH 1/9] feat(quality_gate): add _check_cl_primary_gate helper

Pure function returning a ConstraintResult for the closed-loop-primary
deploy decision. Used when saturation pre-flight reports weak_signal
band. Required gain scales with description growth, mirroring the
synthetic gate's free_threshold + slope shape; synthetic regression
tolerance of 0.05 protects against catastrophic judge collapse.

11 unit tests cover the decision-rule math including the PR #68
calibration point (+2 gain on +121% growth -> required 2, just passes)
and wallpaper protection (+1 gain on +400% growth -> required 4, fails).
---
 evolution/core/quality_gate.py           |  74 ++++++++++++
 tests/core/test_check_cl_primary_gate.py | 142 +++++++++++++++++++++++
 2 files changed, 216 insertions(+)
 create mode 100644 tests/core/test_check_cl_primary_gate.py

diff --git a/evolution/core/quality_gate.py b/evolution/core/quality_gate.py
index 8ddd1403..bf137cb3 100644
--- a/evolution/core/quality_gate.py
+++ b/evolution/core/quality_gate.py
@@ -7,6 +7,7 @@
 """
 
 import json
+import math
 import os
 import subprocess
 import time
@@ -15,6 +16,7 @@
 
 from rich.console import Console
 
+from evolution.core.constraints import ConstraintResult
 from evolution.core.lm_timing_callback import COST_LEDGER, CostCeilingExceeded
 from evolution.skills.budget_aware_proposer import ProposerMode
 
@@ -23,6 +25,78 @@
 _BENCHMARK_OUTPUT_TAIL_BYTES = 4096
 
 
+# CL-primary deploy-gate formula constants. Mirrors the synthetic
+# growth_quality_gate's free-threshold-then-slope shape (constraints.py
+# _check_growth_with_quality_gate) but adapted to integer CL task gains.
+#
+# free_threshold matches EvolutionConfig.growth_free_threshold so both
+# gates agree on the "free growth" boundary. slope=1.0 means "one extra
+# task required per +100% growth above the free threshold."
+CL_PRIMARY_GROWTH_FREE_THRESHOLD = 0.20
+CL_PRIMARY_GROWTH_SLOPE = 1.0
+CL_PRIMARY_SYNTH_TOLERANCE = 0.05
+
+
+def _check_cl_primary_gate(
+    *,
+    baseline_cl_passes: int,
+    evolved_cl_passes: int,
+    baseline_synth_mean: float,
+    evolved_synth_mean: float,
+    growth_pct: float,
+    synth_tolerance: float = CL_PRIMARY_SYNTH_TOLERANCE,
+) -> ConstraintResult:
+    """Decide whether the closed-loop (CL) primary deploy gate passes.
+
+    Used when the saturation pre-flight classifies the run as
+    ``weak_signal`` (synthetic judge saturated, closed-loop signal has a
+    gradient). Passes iff ``evolved_cl_passes - baseline_cl_passes`` is at
+    least ``max(1, ceil(slope * (growth_pct - free_threshold)))`` AND the
+    synthetic mean dropped by no more than ``synth_tolerance``.
+
+    ``growth_pct`` is a fraction (1.21 == +121%). Parameters are scalars
+    (not SaturationReport) so this helper stays independent of the
+    preflight subsystem; the result is a standard ``ConstraintResult``
+    named ``cl_primary_gate`` whose message carries the numbers.
+    """
+    cl_gain = evolved_cl_passes - baseline_cl_passes
+    # Floor of 1: even zero growth must win at least one task. Growth past
+    # the free threshold raises the requirement to ceil(slope * excess).
+    scaled_excess = CL_PRIMARY_GROWTH_SLOPE * (growth_pct - CL_PRIMARY_GROWTH_FREE_THRESHOLD)
+    required_gain = max(1, math.ceil(max(0.0, scaled_excess)))
+    synth_delta = evolved_synth_mean - baseline_synth_mean
+    synth_passed = synth_delta >= -synth_tolerance
+
+    if cl_gain < required_gain:
+        return ConstraintResult(
+            passed=False,
+            constraint_name="cl_primary_gate",
+            message=(
+                f"CL gained {cl_gain:+d} tasks but required {required_gain} "
+                f"for {growth_pct:+.2%} growth"
+            ),
+        )
+    if not synth_passed:
+        # synth_delta is negative here; print the drop as a positive
+        # magnitude so the "X > tolerance" comparison reads as true.
+        return ConstraintResult(
+            passed=False,
+            constraint_name="cl_primary_gate",
+            message=(
+                f"CL gained {cl_gain:+d} tasks but synthetic regressed "
+                f"{-synth_delta:.3f} > tolerance {synth_tolerance:.3f}"
+            ),
+        )
+    return ConstraintResult(
+        passed=True,
+        constraint_name="cl_primary_gate",
+        message=(
+            f"CL gained +{cl_gain} tasks (required {required_gain}); "
+            f"synth Δ {synth_delta:+.3f} within ±{synth_tolerance:.3f}"
+        ),
+    )
+
+
 # `default` is calibrated against the obsidian deploy (+24.2% growth,
 # ~+0.07 expected improvement). `off` disables the slope/ceiling checks
 # but still enforces bootstrap.mean ≥ 0 — see deprecation warning when
diff --git a/tests/core/test_check_cl_primary_gate.py b/tests/core/test_check_cl_primary_gate.py
new file mode 100644
index 00000000..26c66a9d
--- /dev/null
+++ b/tests/core/test_check_cl_primary_gate.py
@@ -0,0 +1,142 @@
+"""Unit tests for the CL-primary gate helper.
+
+The helper combines two signals (CL pass counts, synthetic mean) and a
+growth signal into a single accept/reject ConstraintResult. Tests pin
+the decision-rule math; integration with evolve_tool lives in
+tests/tools/test_evolve_tool_cl_aware_gate.py.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from evolution.core.constraints import ConstraintResult
+from evolution.core.quality_gate import (
+    CL_PRIMARY_GROWTH_FREE_THRESHOLD,
+    CL_PRIMARY_GROWTH_SLOPE,
+    _check_cl_primary_gate,
+)
+
+
+class TestCheckClPrimaryGate:
+    def test_accepts_when_required_gain_met_at_free_threshold(self):
+        # +1 gain, +20% growth (exactly at free threshold) → required=1
+        result = _check_cl_primary_gate(
+            baseline_cl_passes=5,
+            evolved_cl_passes=6,
+            baseline_synth_mean=0.97,
+            evolved_synth_mean=0.97,
+            growth_pct=0.20,
+        )
+        assert result.passed is True
+        assert result.constraint_name == "cl_primary_gate"
+
+    def test_accepts_at_pr_68_calibration_point(self):
+        # PR #68: +2 gain on +121% growth → required=ceil(1.0*(1.21-0.20))=2.
+        # This is the exact case that motivated this work.
+        result = _check_cl_primary_gate(
+            baseline_cl_passes=5,
+            evolved_cl_passes=7,
+            baseline_synth_mean=1.000,
+            evolved_synth_mean=1.000,
+            growth_pct=1.21,
+        )
+        assert result.passed is True
+
+    def test_rejects_when_growth_aware_threshold_unsatisfied(self):
+        # +1 gain on +400% growth → required=4, fail.
+        result = _check_cl_primary_gate(
+            baseline_cl_passes=5,
+            evolved_cl_passes=6,
+            baseline_synth_mean=0.97,
+            evolved_synth_mean=0.97,
+            growth_pct=4.00,
+        )
+        assert result.passed is False
+        assert "required" in result.message.lower()
+
+    def test_rejects_when_no_task_gained(self):
+        result = _check_cl_primary_gate(
+            baseline_cl_passes=5,
+            evolved_cl_passes=5,
+            baseline_synth_mean=0.97,
+            evolved_synth_mean=0.97,
+            growth_pct=0.20,
+        )
+        assert result.passed is False
+
+    def test_rejects_when_synthetic_regressed_beyond_tolerance(self):
+        # +1 task gained, but synthetic dropped 0.061 (> 0.05 tolerance)
+        result = _check_cl_primary_gate(
+            baseline_cl_passes=5,
+            evolved_cl_passes=6,
+            baseline_synth_mean=1.000,
+            evolved_synth_mean=0.939,
+            growth_pct=0.20,
+        )
+        assert result.passed is False
+        assert "synthetic" in result.message.lower()
+
+    def test_accepts_when_synthetic_regressed_within_tolerance(self):
+        # +1 task gained, synthetic dropped 0.039 (< 0.05 tolerance)
+        result = _check_cl_primary_gate(
+            baseline_cl_passes=5,
+            evolved_cl_passes=6,
+            baseline_synth_mean=1.000,
+            evolved_synth_mean=0.961,
+            growth_pct=0.20,
+        )
+        assert result.passed is True
+
+    def test_rejects_when_evolved_cl_regressed(self):
+        # Negative gain → reject even with no growth
+        result = _check_cl_primary_gate(
+            baseline_cl_passes=5,
+            evolved_cl_passes=4,
+            baseline_synth_mean=0.97,
+            evolved_synth_mean=0.97,
+            growth_pct=0.0,
+        )
+        assert result.passed is False
+
+    def test_required_gain_floor_is_one_even_at_zero_growth(self):
+        # Even with 0 growth, must gain ≥1 task — no free deploys for null changes
+        result = _check_cl_primary_gate(
+            baseline_cl_passes=5,
+            evolved_cl_passes=5,
+            baseline_synth_mean=0.97,
+            evolved_synth_mean=0.97,
+            growth_pct=0.0,
+        )
+        assert result.passed is False
+
+    def test_growth_within_free_threshold_requires_only_one_task(self):
+        # +1 gain, +15% growth (below 20% free threshold)
+        result = _check_cl_primary_gate(
+            baseline_cl_passes=5,
+            evolved_cl_passes=6,
+            baseline_synth_mean=0.97,
+            evolved_synth_mean=0.97,
+            growth_pct=0.15,
+        )
+        assert result.passed is True
+
+    def test_message_records_required_and_actual_gain(self):
+        # Message must surface the numbers for gate_decision.json + console
+        result = _check_cl_primary_gate(
+            baseline_cl_passes=5,
+            evolved_cl_passes=6,
+            baseline_synth_mean=0.97,
+            evolved_synth_mean=0.97,
+            growth_pct=0.20,
+        )
+        assert "required 1" in result.message  # a bare "1" would also match "+1"
+        assert "+1" in result.message or "gained 1" in result.message.lower()
+
+    def test_constants_match_evolution_config_defaults(self):
+        # The CL gate's free-threshold default must match EvolutionConfig's
+        # synthetic-gate default so they agree on what "free growth" means.
+        from evolution.core.config import EvolutionConfig
+        cfg = EvolutionConfig()
+        assert CL_PRIMARY_GROWTH_FREE_THRESHOLD == cfg.growth_free_threshold
+        assert CL_PRIMARY_GROWTH_SLOPE == 1.0

From 365bd364dbc5ea77228b637f45b323904106bbb4 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 08:12:51 -0600
Subject: [PATCH 2/9] refactor(evolve_tool): preserve SaturationReport fields
 for deploy gate

Today only sat_report.holdout_per_example survives past the preflight
call site; subsequent CL-aware gate work needs the band classification
and baseline CL per-task scores too. Bind four new locals next to the
existing cache: band, cl_per_example, holdout_score, cl_score. All
default to None on the --no-saturation-check path so the deploy gate
can branch safely.

No behavior change; existing tests pass unchanged.
---
 evolution/tools/evolve_tool.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index 9dfec416..d226e109 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -669,7 +669,11 @@ def evolve(
                 if closed_loop_in_valset:
                     valset = valset + behavioral_examples
 
-            cached_baseline_holdout_per_example = None
+            cached_baseline_holdout_per_example: Optional[list[float]] = None
+            preflight_band: Optional[str] = None
+            cached_baseline_cl_per_example: Optional[list[float]] = None
+            preflight_holdout_score: Optional[float] = None
+            preflight_cl_score: Optional[float] = None
             if not skip_saturation_check:
                 holdout_examples_for_preflight = _build_examples(
                     dataset.holdout, for_module=True
@@ -703,6 +707,14 @@ def evolve(
                 else:
                     render_saturation_panel(sat_report, console=console)
                 cached_baseline_holdout_per_example = sat_report.holdout_per_example
+                # Preserve preflight outputs for the deploy gate's CL-primary
+                # path. None when --no-saturation-check was passed (sat_report
+                # itself doesn't exist in that case; handled by initialization
+                # to None above the preflight call).
+                preflight_band: Optional[str] = sat_report.band
+                cached_baseline_cl_per_example: Optional[list[float]] = sat_report.closed_loop_per_example
+                preflight_holdout_score: Optional[float] = sat_report.holdout_score
+                preflight_cl_score: Optional[float] = sat_report.closed_loop_score
 
             console.print(f"\n[bold cyan]Running GEPA optimization (max_full_evals={iterations})[/bold cyan]\n")
             start_time = time.time()

From ae1fea688b6b83095cd5166189edadc03460091f Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 08:22:28 -0600
Subject: [PATCH 3/9] feat(evolve_tool): branch deploy gate on saturation band
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When preflight reports weak_signal AND closed-loop is configured, run
a one-shot force_run on the evolved description and gate the deploy
decision on closed-loop signal via _check_cl_primary_gate.

Three non-deploy outcomes are recorded in gate_decision.json. The first
two are new abort paths that return early with diagnostic payloads
(schema v5); the third flows through the existing reject path:
  - cl_eval_failed: force_run raised an exception
  - cl_eval_incomplete: one or more evolved CL tasks abstained
    (runner errored — distinguished from genuine task failure via
    the existing TaskResult.abstained field)
  - cl_primary_gate reject: returned by the gate helper itself

The absolute-char ceiling check (validator._check_absolute_chars) is
preserved in the CL-primary path — wallpaper protection is orthogonal
to which signal we gate on. All other bands (healthy / no_headroom /
uniform_failure / no preflight) fall through to the existing synthetic
path unchanged.
---
 evolution/tools/evolve_tool.py | 156 ++++++++++++++++++++++++++++++---
 1 file changed, 144 insertions(+), 12 deletions(-)

diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index d226e109..e4eb87a0 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -41,6 +41,7 @@
     resolved_lms_dump,
 )
 from evolution.core.constraints import (
+    ConstraintResult,
     ConstraintValidator,
     effective_absolute_char_ceiling,
     resolve_decision_rule,
@@ -59,6 +60,7 @@
 )
 from evolution.core.quality_gate import (
     QUALITY_GATE_PRESETS,
+    _check_cl_primary_gate,
     resolve_proposer_mode,
     run_benchmark_hook,
     write_cost_ceiling_abort,
@@ -711,10 +713,10 @@ def evolve(
                 # path. None when --no-saturation-check was passed (sat_report
                 # itself doesn't exist in that case; handled by initialization
                 # to None above the preflight call).
-                preflight_band: Optional[str] = sat_report.band
-                cached_baseline_cl_per_example: Optional[list[float]] = sat_report.closed_loop_per_example
-                preflight_holdout_score: Optional[float] = sat_report.holdout_score
-                preflight_cl_score: Optional[float] = sat_report.closed_loop_score
+                preflight_band = sat_report.band
+                cached_baseline_cl_per_example = sat_report.closed_loop_per_example
+                preflight_holdout_score = sat_report.holdout_score
+                preflight_cl_score = sat_report.closed_loop_score
 
             console.print(f"\n[bold cyan]Running GEPA optimization (max_full_evals={iterations})[/bold cyan]\n")
             start_time = time.time()
@@ -844,6 +846,125 @@ def evolve(
             )
             improvement = avg_evolved - avg_baseline
 
+            # Decide which deploy-gate path applies. CL-primary fires when
+            # the preflight saw weak_signal AND CL data is present. All
+            # other cases (no preflight, healthy/no_headroom/uniform_failure
+            # bands, missing CL data) use the synthetic-only path.
+            baseline_chars = len(baseline_description)
+            evolved_chars = len(evolved_description)
+            growth_pct = (evolved_chars - baseline_chars) / max(1, baseline_chars)
+
+            use_cl_primary = (
+                preflight_band == "weak_signal"
+                and cached_baseline_cl_per_example is not None
+                and len(cached_baseline_cl_per_example) > 0
+                and closed_loop_cache is not None
+            )
+
+            evolved_cl_report = None
+            evolved_cl_per_example: Optional[list[float]] = None
+            evolved_cl_errored_task_ids: list[str] = []
+            cl_eval_cost_before: float = 0.0
+            cl_eval_cost_usd: Optional[float] = None
+            cl_constraint: Optional[ConstraintResult] = None
+
+            if use_cl_primary:
+                console.print(
+                    f"\n[bold]Evaluating evolved description on closed-loop suite[/bold] "
+                    "(weak_signal band → CL-primary gate)"
+                )
+                cl_eval_cost_before = COST_LEDGER.summary().get("total_usd", 0.0)
+                try:
+                    evolved_cl_report = closed_loop_cache.force_run(evolved_description)
+                except Exception as exc:  # ValidatorError or downstream
+                    cl_eval_cost_usd = COST_LEDGER.summary().get("total_usd", 0.0) - cl_eval_cost_before
+                    console.print(
+                        f"[red]✗ Evolved closed-loop eval failed: {exc}[/red] — writing aborted decision"
+                    )
+                    failed_path = output_dir / "evolved_FAILED.json"
+                    evolved_manifest = manifest.replace_description(tool_name, evolved_description)
+                    failed_path.write_text(json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n")
+                    write_gate_decision(output_dir, {
+                        "schema_version": "5",
+                        "decision": "aborted",
+                        "reason": "cl_eval_failed",
+                        "decision_signal": "closed_loop",
+                        "cl_eval_exception": str(exc),
+                        "evolved_cl_eval_cost_usd": cl_eval_cost_usd,
+                        "band_trigger_score": {
+                            "holdout": preflight_holdout_score,
+                            "closed_loop": preflight_cl_score,
+                        },
+                        "validator_agent_model": closed_loop_agent_model,
+                        "baseline_chars": baseline_chars,
+                        "evolved_chars": evolved_chars,
+                        "growth_pct": growth_pct,
+                        "knee_point": _knee_point_payload(knee_pick),
+                        "dataset": _dataset_payload(dataset, dropped_tools=manifest.dropped_tools, sessiondb_drops=sessiondb_drops),
+                        "run_inputs": run_inputs,
+                        **tool_payload_fields,
+                    })
+                    return {"decision": "aborted", "reason": "cl_eval_failed"}
+                cl_eval_cost_usd = COST_LEDGER.summary().get("total_usd", 0.0) - cl_eval_cost_before
+
+                # Detect abstained tasks (TaskResult.abstained == True means
+                # the runner errored — see validation/report.py:score_task).
+                # An infrastructure flake on an evolved task is NOT a quality
+                # regression; conflating them would falsely reject good
+                # candidates. Hard-fail with a written diagnostic instead.
+                evolved_cl_errored_task_ids = [
+                    t.task_id for t in evolved_cl_report.evolved.tasks if t.abstained
+                ]
+                evolved_cl_per_example = [
+                    1.0 if t.passed else 0.0 for t in evolved_cl_report.evolved.tasks
+                ]
+                if evolved_cl_errored_task_ids:
+                    console.print(
+                        f"[red]✗ {len(evolved_cl_errored_task_ids)} evolved CL task(s) errored "
+                        f"({', '.join(evolved_cl_errored_task_ids)}) — writing aborted decision[/red]"
+                    )
+                    failed_path = output_dir / "evolved_FAILED.json"
+                    evolved_manifest = manifest.replace_description(tool_name, evolved_description)
+                    failed_path.write_text(json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n")
+                    write_gate_decision(output_dir, {
+                        "schema_version": "5",
+                        "decision": "aborted",
+                        "reason": "cl_eval_incomplete",
+                        "decision_signal": "closed_loop",
+                        "evolved_closed_loop_errored_tasks": evolved_cl_errored_task_ids,
+                        "evolved_closed_loop_per_example": evolved_cl_per_example,
+                        "baseline_closed_loop_per_example": cached_baseline_cl_per_example,
+                        "evolved_cl_eval_cost_usd": cl_eval_cost_usd,
+                        "band_trigger_score": {
+                            "holdout": preflight_holdout_score,
+                            "closed_loop": preflight_cl_score,
+                        },
+                        "validator_agent_model": closed_loop_agent_model,
+                        "baseline_chars": baseline_chars,
+                        "evolved_chars": evolved_chars,
+                        "growth_pct": growth_pct,
+                        "knee_point": _knee_point_payload(knee_pick),
+                        "dataset": _dataset_payload(dataset, dropped_tools=manifest.dropped_tools, sessiondb_drops=sessiondb_drops),
+                        "run_inputs": run_inputs,
+                        **tool_payload_fields,
+                    })
+                    return {"decision": "aborted", "reason": "cl_eval_incomplete"}
+
+                baseline_cl_passes = int(sum(cached_baseline_cl_per_example))
+                evolved_cl_passes = int(sum(evolved_cl_per_example))
+                cl_constraint = _check_cl_primary_gate(
+                    baseline_cl_passes=baseline_cl_passes,
+                    evolved_cl_passes=evolved_cl_passes,
+                    baseline_synth_mean=avg_baseline,
+                    evolved_synth_mean=avg_evolved,
+                    growth_pct=growth_pct,
+                )
+                icon = "✓" if cl_constraint.passed else "✗"
+                color = "green" if cl_constraint.passed else "red"
+                console.print(
+                    f"  [{color}]{icon} cl_primary_gate[/{color}]: {cl_constraint.message}"
+                )
+
             console.print(f"\n[bold]Validating growth against holdout improvement[/bold]")
             bootstrap = paired_bootstrap(
                 baseline_per_example,
@@ -852,11 +973,22 @@ def evolve(
                 n_resamples=config.bootstrap_n_resamples,
                 seed=config.seed,
             )
-            # Growth + ceiling check on the description, not the rendered manifest —
-            # the gate's curve has to apply to the artifact the user actually evolves.
-            growth_constraints = validator.validate_growth_with_quality(
-                evolved_description, baseline_description, bootstrap,
-            )
+            if use_cl_primary:
+                # CL-primary path: skip the synthetic growth_quality_gate
+                # (it would always reject when synth is saturated and growth > 0).
+                # But still enforce the absolute_char_ceiling — that's an
+                # orthogonal wallpaper-protection backstop that must hold
+                # regardless of which signal we're gating on.
+                ceiling_constraint = validator._check_absolute_chars(
+                    evolved_description, baseline_chars,
+                )
+                growth_constraints = [cl_constraint, ceiling_constraint]
+            else:
+                # Synthetic-only path (unchanged): growth_quality_gate runs both
+                # the growth curve and the absolute-char ceiling internally.
+                growth_constraints = validator.validate_growth_with_quality(
+                    evolved_description, baseline_description, bootstrap,
+                )
             growth_pass = True
             for c in growth_constraints:
                 icon = "✓" if c.passed else "✗"
@@ -892,9 +1024,9 @@ def evolve(
                     evolved_manifest_path.unlink(missing_ok=True)
                     baseline_manifest_path.unlink(missing_ok=True)
 
-            baseline_chars = len(baseline_description)
-            evolved_chars = len(evolved_description)
-            growth_pct = (evolved_chars - baseline_chars) / max(1, baseline_chars)
+            # baseline_chars / evolved_chars / growth_pct are bound earlier
+            # (before the use_cl_primary branch) so the CL-primary path can
+            # use them in its abort payloads. Don't recompute here.
             required_improvement = max(
                 0.0,
                 config.growth_quality_slope * (growth_pct - config.growth_free_threshold),

From 779002a7f7ab87396b9d35a2a131cfd480b1dc40 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 08:30:16 -0600
Subject: [PATCH 4/9] fix(evolve_tool): narrow cl_constraint type, surface
 saved-variant path

Code review found two minor issues in the CL-primary branch added by
ae1fea68:

1. cl_constraint: Optional[ConstraintResult] flows into a
   list[ConstraintResult] without type narrowing at the post-branch
   growth_constraints assignment. Added an assert so the type checker
   sees the correlation between the two 'if use_cl_primary:' blocks.

2. Both new abort paths wrote evolved_FAILED.json but skipped the
   'Saved failed variant to {path}' console line that existing abort
   paths print. Operators triaging a flake need to know the file was
   saved and where; added the print to both new paths.

No behavior change for any test.
---
 evolution/tools/evolve_tool.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index e4eb87a0..513ea4dd 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -884,6 +884,7 @@ def evolve(
                     failed_path = output_dir / "evolved_FAILED.json"
                     evolved_manifest = manifest.replace_description(tool_name, evolved_description)
                     failed_path.write_text(json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n")
+                    console.print(f"  Saved failed variant to {failed_path}")
                     write_gate_decision(output_dir, {
                         "schema_version": "5",
                         "decision": "aborted",
@@ -926,6 +927,7 @@ def evolve(
                     failed_path = output_dir / "evolved_FAILED.json"
                     evolved_manifest = manifest.replace_description(tool_name, evolved_description)
                     failed_path.write_text(json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n")
+                    console.print(f"  Saved failed variant to {failed_path}")
                     write_gate_decision(output_dir, {
                         "schema_version": "5",
                         "decision": "aborted",
@@ -979,6 +981,10 @@ def evolve(
                 # But still enforce the absolute_char_ceiling — that's an
                 # orthogonal wallpaper-protection backstop that must hold
                 # regardless of which signal we're gating on.
+                # cl_constraint was bound in the earlier `if use_cl_primary:` block;
+                # the assert narrows Optional[ConstraintResult] so growth_constraints
+                # types as list[ConstraintResult], not list[Optional[ConstraintResult]].
+                assert cl_constraint is not None
                 ceiling_constraint = validator._check_absolute_chars(
                     evolved_description, baseline_chars,
                 )

From 90014c5aeb79ab33a8a15065a1d8208b2faace22 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 08:34:07 -0600
Subject: [PATCH 5/9] feat(evolve_tool): gate_decision.json schema v5 with
 CL-primary fields

Schema bumps from v4 to v5 so that all four gate_decision write sites
(static-fail, cl_eval_failed, cl_eval_incomplete, success/reject) agree;
the two cl_eval_* abort sites already wrote v5 as of ae1fea68. The bump
is additive. decision_signal ('closed_loop' or 'synthetic') is always
written on the success/reject payload; the remaining new fields are
present only when use_cl_primary == True:
  baseline_closed_loop_per_example, evolved_closed_loop_per_example,
  evolved_closed_loop_errored_tasks, cl_tasks_gained, cl_required_gain,
  synthetic_sanity_check, evolved_cl_eval_cost_usd, band_trigger_score,
  validator_agent_model.

When preflight was skipped (--no-saturation-check), records
reason_synthetic: 'preflight_skipped' so downstream consumers can
distinguish 'preflight saw no weak_signal' from 'preflight didn't run.'

cl_required_gain and synthetic_sanity_check reuse the
CL_PRIMARY_GROWTH_SLOPE / CL_PRIMARY_GROWTH_FREE_THRESHOLD /
CL_PRIMARY_SYNTH_TOLERANCE constants from quality_gate.py so the
payload's thresholds can't drift from the gate's. (The formula itself
is recomputed inline here rather than returned by
_check_cl_primary_gate.)

Existing v4 consumers see byte-identical output for synthetic-mode
runs except the new decision_signal: 'synthetic' string.
---
 evolution/tools/evolve_tool.py | 43 ++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index 513ea4dd..97522d3c 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -10,6 +10,7 @@
 import difflib
 import json
 import logging
+import math
 import sys
 import time
 from datetime import datetime
@@ -59,6 +60,9 @@
     register_litellm_failure_callback,
 )
 from evolution.core.quality_gate import (
+    CL_PRIMARY_GROWTH_FREE_THRESHOLD,
+    CL_PRIMARY_GROWTH_SLOPE,
+    CL_PRIMARY_SYNTH_TOLERANCE,
     QUALITY_GATE_PRESETS,
     _check_cl_primary_gate,
     resolve_proposer_mode,
@@ -817,7 +821,7 @@ def evolve(
                 evolved_manifest = manifest.replace_description(tool_name, evolved_description)
                 failed_path.write_text(json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n")
                 write_gate_decision(output_dir, {
-                    "schema_version": "4",
+                    "schema_version": "5",
                     "decision": "reject",
                     "reason": "static_constraint_failure",
                     "failed_constraints": [c.constraint_name for c in static_constraints if not c.passed],
@@ -1045,9 +1049,10 @@ def evolve(
             else:
                 decision_reason = "growth_quality_gate"
             decision_payload = {
-                "schema_version": "4",
+                "schema_version": "5",
                 "decision": "deploy" if growth_pass else "reject",
                 "reason": decision_reason,
+                "decision_signal": "closed_loop" if use_cl_primary else "synthetic",
                 "decision_rule_used": decision_rule_used,
                 "gate_mode": config.gate_mode,
                 "inferiority_tolerance": config.inferiority_tolerance,
@@ -1078,6 +1083,40 @@ def evolve(
             }
             if benchmark_block is not None:
                 decision_payload["benchmark"] = benchmark_block
+            if use_cl_primary:
+                decision_payload["baseline_closed_loop_per_example"] = cached_baseline_cl_per_example
+                decision_payload["evolved_closed_loop_per_example"] = evolved_cl_per_example
+                # Populated only on the abort path (cl_eval_incomplete); empty
+                # here because we reach this block only when no task errored.
+                decision_payload["evolved_closed_loop_errored_tasks"] = []
+                decision_payload["cl_tasks_gained"] = (
+                    int(sum(evolved_cl_per_example)) - int(sum(cached_baseline_cl_per_example))
+                )
+                decision_payload["cl_required_gain"] = max(
+                    1,
+                    math.ceil(
+                        max(0.0, CL_PRIMARY_GROWTH_SLOPE * (growth_pct - CL_PRIMARY_GROWTH_FREE_THRESHOLD))
+                    ),
+                )
+                decision_payload["synthetic_sanity_check"] = {
+                    "tolerance": CL_PRIMARY_SYNTH_TOLERANCE,
+                    "baseline_mean": avg_baseline,
+                    "evolved_mean": avg_evolved,
+                    "passed": (avg_evolved - avg_baseline) >= -CL_PRIMARY_SYNTH_TOLERANCE,
+                }
+                decision_payload["evolved_cl_eval_cost_usd"] = cl_eval_cost_usd
+                decision_payload["band_trigger_score"] = {
+                    "holdout": preflight_holdout_score,
+                    "closed_loop": preflight_cl_score,
+                }
+                decision_payload["validator_agent_model"] = closed_loop_agent_model
+
+            if not use_cl_primary and preflight_band is None:
+                # User passed --no-saturation-check; record why CL-primary
+                # didn't fire even though CL may be configured. Lets downstream
+                # consumers distinguish 'preflight saw no weak_signal' from
+                # 'preflight didn't run.'
+                decision_payload["reason_synthetic"] = "preflight_skipped"
             gate_path = write_gate_decision(output_dir, decision_payload)
             console.print(f"  [dim]Gate decision logged to {gate_path}[/dim]")
 

From 0b1106c5c4ffa7c4986a86487832ed7af37e77f0 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 08:52:57 -0600
Subject: [PATCH 6/9] test(evolve_tool): integration tests for CL-aware deploy
 gate

10 tests covering the deploy-gate branch on saturation band:
  - weak_signal triggers evolved CL eval (force_run called post-GEPA)
  - healthy/no_headroom fall through to synthetic
  - --no-saturation-check records reason_synthetic in JSON
  - all v5 fields present + correct types
  - force_run failure writes aborted decision with diagnostics
  - evolved task abstention writes cl_eval_incomplete (not regression)
  - absolute_char_ceiling still enforced in CL-primary path

Mocks the synthetic dataset builder + closed-loop cache at the same
seams as test_evolve_tool_saturation_preflight.py; calls evolve()
directly (rather than via CliRunner) so each test can inspect
gate_decision.json at a pinned output_dir.
---
 tests/tools/test_evolve_tool_cl_aware_gate.py | 695 ++++++++++++++++++
 1 file changed, 695 insertions(+)
 create mode 100644 tests/tools/test_evolve_tool_cl_aware_gate.py

diff --git a/tests/tools/test_evolve_tool_cl_aware_gate.py b/tests/tools/test_evolve_tool_cl_aware_gate.py
new file mode 100644
index 00000000..83b1b078
--- /dev/null
+++ b/tests/tools/test_evolve_tool_cl_aware_gate.py
@@ -0,0 +1,695 @@
+"""Integration tests for the deploy-gate CL-aware branch.
+
+Mocks the synthetic dataset builder + closed-loop cache so each test
+can pin a saturation band and verify the deploy gate's branch behavior
+plus ``gate_decision.json`` shape. No real LM calls.
+
+Pairs with unit tests at ``tests/core/test_check_cl_primary_gate.py``
+which cover the decision-rule math in isolation. These tests run the
+full ``evolve()`` orchestrator end-to-end with seams stubbed at the
+saturation pre-flight, closed-loop cache, GEPA, knee-point, and holdout
+evaluator, so they exercise the branch logic added in this PR rather
+than the helper math.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import json
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Optional
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from evolution.core.dataset_builder import EvalExample
+from evolution.core.saturation_check import SaturationReport
+from evolution.skills.knee_point import CandidatePick
+from evolution.tools.evolve_tool import evolve
+from evolution.validation.report import (
+    PhaseResult,
+    TaskResult,
+    ValidationReport,
+    WinLoss,
+)
+
+
+FIXTURES = Path(__file__).parent.parent / "fixtures" / "tool_manifests"
+
+
+@pytest.fixture
+def temp_manifest(tmp_path: Path) -> Path:
+    """Copy multiple_tools.json to a tmp location."""
+    src = FIXTURES / "multiple_tools.json"
+    dst = tmp_path / "manifest.json"
+    dst.write_text(src.read_text())
+    return dst
+
+
+def _fake_tool_examples(n: int = 30) -> list[EvalExample]:
+    """Build n fake EvalExamples without calling an LM."""
+    return [
+        EvalExample(task_input=f"task {i}", expected_behavior=f"rubric {i}")
+        for i in range(n)
+    ]
+
+
+def _fake_validation_report(
+    *,
+    baseline_pass: list[bool],
+    evolved_pass: list[bool],
+    evolved_abstain: Optional[list[bool]] = None,
+) -> ValidationReport:
+    """Build a ValidationReport with the given per-task verdicts.
+
+    Mirrors what ClosedLoopFeedbackCache.force_run returns; ``evolved``
+    is the only phase the deploy-gate branch actually reads (it pulls
+    baseline pass-counts from the cached preflight data).
+    """
+    n = len(baseline_pass)
+    evolved_abstain = evolved_abstain or [False] * n
+    assert len(evolved_pass) == n
+    assert len(evolved_abstain) == n
+
+    baseline_tasks = [
+        TaskResult(
+            task_id=f"task_{i}",
+            passed=p,
+            abstained=False,
+            tool_calls_seq=[],
+            duration_seconds=0.1,
+        )
+        for i, p in enumerate(baseline_pass)
+    ]
+    evolved_tasks = [
+        TaskResult(
+            task_id=f"task_{i}",
+            passed=p,
+            abstained=a,
+            tool_calls_seq=[],
+            duration_seconds=0.1,
+            error="runner timeout" if a else None,
+        )
+        for i, (p, a) in enumerate(zip(evolved_pass, evolved_abstain))
+    ]
+
+    def _phase(tasks: list[TaskResult]) -> PhaseResult:
+        n_p = sum(1 for t in tasks if t.passed and not t.abstained)
+        n_f = sum(1 for t in tasks if not t.passed and not t.abstained)
+        n_a = sum(1 for t in tasks if t.abstained)
+        scored = n_p + n_f
+        return PhaseResult(
+            pass_rate=(n_p / scored) if scored else 0.0,
+            n_passed=n_p,
+            n_failed=n_f,
+            n_abstained=n_a,
+            tasks=tasks,
+        )
+
+    return ValidationReport(
+        schema_version="1",
+        tool="search_files",
+        task_suite_path="fake_suite.jsonl",
+        task_suite_sha256="0" * 64,
+        baseline=_phase(baseline_tasks),
+        evolved=_phase(evolved_tasks),
+        delta=WinLoss(
+            n_wins=0, n_losses=0, n_ties=n, pass_rate_change=0.0,
+        ),
+        decision="pass",
+        decision_reasons=[],
+    )
+
+
+def _make_knee_pick(evolved_description: str) -> CandidatePick:
+    """Build a CandidatePick that select_knee_point would return."""
+    fake_module = MagicMock()
+    return CandidatePick(
+        module=fake_module,
+        skill_text=evolved_description,
+        body_chars=len(evolved_description),
+        val_score=0.8,
+        val_rank_in_band=1,
+        band_size=1,
+        epsilon=0.1,
+        fallback="knee",
+        picked_idx=0,
+        gepa_default_idx=0,
+        gepa_default_body_chars=len(evolved_description),
+        band_roster=[],
+    )
+
+
+def _make_fake_gepa(evolved_description: str):
+    """Build a fake dspy.GEPA whose ``compile()`` returns a module with
+    the detailed_results shape the knee-point path expects."""
+
+    class _FakeGEPA:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+        def compile(self, baseline_module, *, trainset, valset):
+            fake_module = MagicMock()
+            fake_module.detailed_results = SimpleNamespace(
+                candidates=[fake_module],
+                val_aggregate_scores=[1.0],
+                best_idx=0,
+            )
+            fake_module.description_text = evolved_description
+            return fake_module
+
+    return _FakeGEPA
+
+
+# Baseline description for search_files in multiple_tools.json is
+# "Find things." (12 chars). With static_ceiling=5000 (default preset),
+# effective_absolute_char_ceiling = max(5000, 1.5*12) = 5000 — so a
+# plausible-length evolved description passes by default.
+_EVOLVED_DESCRIPTION = (
+    "Find files in the repository by name or glob pattern. "
+    "Returns matching file paths."
+)
+
+# CL-primary path tests that want the gate to ACCEPT need growth_pct
+# below CL_PRIMARY_GROWTH_FREE_THRESHOLD (0.20) so required_gain stays
+# at 1 task and a +2 win clears it. 12-char baseline × 1.20 = 14.4, so
+# evolved must be ≤ 14 chars. "Locate files." is 13 chars (8.3% growth)
+# which lands required_gain=1.
+_LOW_GROWTH_EVOLVED = "Locate files."
+
+
+@contextlib.contextmanager
+def _patch_stack(
+    *,
+    sat_report: SaturationReport,
+    fake_cache: Optional[MagicMock],
+    holdout_baseline_mean: float = 0.95,
+    holdout_evolved_mean: float = 0.96,
+    holdout_n: int = 10,
+    evolved_description: str = _EVOLVED_DESCRIPTION,
+):
+    """Single context manager wrapping every seam patch each test needs.
+
+    Tests stay focused on the band/cache/assertion they're verifying.
+    """
+    fake_builder = MagicMock()
+    fake_builder.generate_tool_selection.return_value = _fake_tool_examples()
+    knee_pick = _make_knee_pick(evolved_description)
+    evolved_per = [holdout_evolved_mean] * holdout_n
+
+    def _maybe_build(**kwargs):
+        # Honour the real "no suite path → no cache" contract; if a test
+        # forgets to pass a suite path the use_cl_primary branch can't fire
+        # (None cache) instead of getting a confusingly-active mock.
+        if kwargs.get("suite_path") is None:
+            return None
+        return fake_cache
+
+    with contextlib.ExitStack() as stack:
+        stack.enter_context(patch(
+            "evolution.tools.evolve_tool.SyntheticDatasetBuilder",
+            return_value=fake_builder,
+        ))
+        stack.enter_context(patch(
+            "evolution.tools.evolve_tool.saturation_preflight",
+            return_value=sat_report,
+        ))
+        stack.enter_context(patch(
+            "evolution.tools.evolve_tool._preflight_lm_credentials",
+        ))
+        stack.enter_context(patch(
+            "evolution.tools.evolve_tool._maybe_build_closed_loop_cache",
+            side_effect=_maybe_build,
+        ))
+        stack.enter_context(patch(
+            "evolution.tools.evolve_tool.dspy.GEPA",
+            new=_make_fake_gepa(evolved_description),
+        ))
+        stack.enter_context(patch(
+            "evolution.tools.evolve_tool.select_knee_point",
+            return_value=knee_pick,
+        ))
+        stack.enter_context(patch(
+            "evolution.tools.evolve_tool._candidate_description",
+            return_value=evolved_description,
+        ))
+        stack.enter_context(patch(
+            "evolution.tools.evolve_tool._holdout_evaluate_with_metric",
+            return_value=(holdout_evolved_mean, evolved_per),
+        ))
+        # In headless test envs stdin is non-TTY. For non-healthy bands
+        # the orchestrator otherwise sys.exit(3)s before the deploy gate.
+        stack.enter_context(patch(
+            "evolution.tools.evolve_tool.is_non_interactive",
+            return_value=False,
+        ))
+        stack.enter_context(patch(
+            "evolution.tools.evolve_tool.interactive_confirm",
+            return_value=True,
+        ))
+        yield
+
+
+def _run_evolve(
+    *,
+    manifest_path: Path,
+    output_dir: Path,
+    extra_kwargs: Optional[dict] = None,
+):
+    """Invoke evolve() with the minimum kwargs every test in this module
+    shares. Wraps the long, repetitive call so each test stays focused
+    on the band/cache/assertion that's actually being exercised."""
+    kwargs = dict(
+        tool_name="search_files",
+        manifest_path=manifest_path,
+        iterations=1,
+        eval_dataset_size=30,
+        holdout_ratio=0.5,
+        quality_gate="non-inferiority",
+        closed_loop_suite_path=Path("/fake/suite.jsonl"),
+        closed_loop_hermes_repo=Path("/fake/hermes"),
+        # mode="feedback" avoids _load_behavioral_examples_from_suite,
+        # which would read the suite file on disk. The deploy-gate
+        # CL-primary branch is mode-agnostic; it pulls verdicts via
+        # closed_loop_cache.force_run regardless.
+        closed_loop_mode="feedback",
+        closed_loop_in_valset=False,
+        closed_loop_agent_model="openai/gpt-5-mini",
+        max_total_cost_usd=5.0,
+        skip_preflight=True,
+        output_dir=output_dir,
+    )
+    if extra_kwargs:
+        kwargs.update(extra_kwargs)
+    return evolve(**kwargs)
+
+
+def _weak_signal_report() -> SaturationReport:
+    """The one band that triggers the CL-aware deploy gate."""
+    return SaturationReport(
+        band="weak_signal",
+        holdout_score=0.95,
+        holdout_n=10,
+        holdout_per_example=[0.95] * 10,
+        closed_loop_score=5 / 7,
+        closed_loop_n=7,
+        # 5/7 baseline pass-rate — the deploy gate reads this list
+        # verbatim to compute baseline_cl_passes.
+        closed_loop_per_example=[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0],
+        suggestions=[],
+        thresholds={},
+    )
+
+
+def _healthy_report() -> SaturationReport:
+    """No CL data needed; the band routes through the synthetic gate."""
+    return SaturationReport(
+        band="healthy",
+        holdout_score=0.5,
+        holdout_n=10,
+        holdout_per_example=[0.5] * 10,
+        closed_loop_score=None,
+        closed_loop_n=None,
+        closed_loop_per_example=None,
+        suggestions=[],
+        thresholds={},
+    )
+
+
+def _no_headroom_report(*, with_cl_data: bool) -> SaturationReport:
+    """no_headroom band with optional CL data. CL-primary must NOT fire
+    on no_headroom regardless of data presence."""
+    cl_per = [1.0] * 7 if with_cl_data else None
+    return SaturationReport(
+        band="no_headroom",
+        holdout_score=0.99,
+        # holdout_n must match the _patch_stack holdout_n (10) so the
+        # cached baseline list and the post-GEPA evolved list line up
+        # for paired_bootstrap.
+        holdout_n=10,
+        holdout_per_example=[1.0] * 10,
+        closed_loop_score=1.0 if with_cl_data else None,
+        closed_loop_n=7 if with_cl_data else None,
+        closed_loop_per_example=cl_per,
+        suggestions=["Try a harder suite"],
+        thresholds={},
+    )
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_weak_signal_band_triggers_evolved_cl_eval(
+    temp_manifest: Path, tmp_path: Path,
+):
+    """weak_signal + +2 task win → force_run is called post-GEPA,
+    decision == deploy, decision_signal == closed_loop, cl_tasks_gained == 2."""
+    fake_cache = MagicMock()
+    # Baseline preflight per-example is [1]*5 + [0]*2 = 5/7.
+    # Evolved 7/7 — a +2 task gain that beats required_gain at small
+    # growth_pct.
+    fake_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=[True, True, True, True, True, False, False],
+        evolved_pass=[True, True, True, True, True, True, True],
+    )
+    run_dir = tmp_path / "run"
+
+    # _LOW_GROWTH_EVOLVED keeps required_gain at 1 task so the +2 CL win
+    # clears the cl_primary_gate.
+    with _patch_stack(
+        sat_report=_weak_signal_report(),
+        fake_cache=fake_cache,
+        evolved_description=_LOW_GROWTH_EVOLVED,
+    ):
+        result = _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+    fake_cache.force_run.assert_called()
+    call_args = fake_cache.force_run.call_args_list
+    # The CL-primary post-GEPA call passes the evolved description text.
+    assert any(_LOW_GROWTH_EVOLVED in str(call) for call in call_args), (
+        f"Expected force_run to be called with evolved description, got: {call_args}"
+    )
+
+    payload = json.loads((run_dir / "gate_decision.json").read_text())
+    assert payload["decision"] == "deploy", (
+        f"weak_signal + 5→7 should deploy, got {payload['decision']} "
+        f"(reason: {payload.get('reason')})"
+    )
+    assert payload["decision_signal"] == "closed_loop"
+    assert payload["cl_tasks_gained"] == 2
+    # The deploy result echoes the metrics dict, not the gate decision.
+    assert isinstance(result, dict)
+
+
+def test_healthy_band_does_not_trigger_cl_aware_gate(
+    temp_manifest: Path, tmp_path: Path,
+):
+    """healthy band → CL-primary never fires; gate falls through to
+    synthetic, force_run is NOT called post-GEPA, no CL fields written."""
+    fake_cache = MagicMock()
+    run_dir = tmp_path / "run"
+
+    with _patch_stack(sat_report=_healthy_report(), fake_cache=fake_cache):
+        _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+    fake_cache.force_run.assert_not_called()
+    payload = json.loads((run_dir / "gate_decision.json").read_text())
+    assert payload["decision_signal"] == "synthetic"
+    for cl_field in (
+        "cl_tasks_gained",
+        "cl_required_gain",
+        "synthetic_sanity_check",
+        "baseline_closed_loop_per_example",
+        "evolved_closed_loop_per_example",
+    ):
+        assert cl_field not in payload, (
+            f"CL field {cl_field!r} should not be in synthetic-gate payload"
+        )
+
+
+def test_no_headroom_without_cl_data_falls_through_to_synthetic_gate(
+    temp_manifest: Path, tmp_path: Path,
+):
+    """no_headroom + no CL data + --force-saturation-check → synthetic gate
+    runs without KeyError. CL was never measured, so no CL fields."""
+    fake_cache = MagicMock()
+    run_dir = tmp_path / "run"
+
+    with _patch_stack(
+        sat_report=_no_headroom_report(with_cl_data=False),
+        fake_cache=fake_cache,
+    ):
+        _run_evolve(
+            manifest_path=temp_manifest,
+            output_dir=run_dir,
+            extra_kwargs={"force_saturation_check": True},
+        )
+
+    payload = json.loads((run_dir / "gate_decision.json").read_text())
+    assert payload["decision_signal"] == "synthetic"
+
+
+def test_no_headroom_with_cl_data_falls_through_to_synthetic_gate(
+    temp_manifest: Path, tmp_path: Path,
+):
+    """no_headroom + non-empty CL data → CL-primary STILL must NOT fire.
+    The spec triggers CL-primary only on weak_signal."""
+    fake_cache = MagicMock()
+    run_dir = tmp_path / "run"
+
+    with _patch_stack(
+        sat_report=_no_headroom_report(with_cl_data=True),
+        fake_cache=fake_cache,
+    ):
+        _run_evolve(
+            manifest_path=temp_manifest,
+            output_dir=run_dir,
+            extra_kwargs={"force_saturation_check": True},
+        )
+
+    fake_cache.force_run.assert_not_called()
+    payload = json.loads((run_dir / "gate_decision.json").read_text())
+    assert payload["decision_signal"] == "synthetic"
+    for cl_field in (
+        "cl_tasks_gained",
+        "cl_required_gain",
+        "synthetic_sanity_check",
+    ):
+        assert cl_field not in payload
+
+
+def test_no_saturation_check_falls_through_to_synthetic_with_reason_recorded(
+    temp_manifest: Path, tmp_path: Path,
+):
+    """--no-saturation-check → no preflight, falls through to synthetic.
+    decision_signal == synthetic AND reason_synthetic == preflight_skipped
+    so downstream consumers can distinguish 'preflight saw nothing weak'
+    from 'preflight didn't run'."""
+    fake_cache = MagicMock()
+    run_dir = tmp_path / "run"
+
+    # sat_report is unused (skip_saturation_check=True bypasses preflight)
+    # but _patch_stack requires one.
+    with _patch_stack(sat_report=_healthy_report(), fake_cache=fake_cache):
+        _run_evolve(
+            manifest_path=temp_manifest,
+            output_dir=run_dir,
+            extra_kwargs={"skip_saturation_check": True},
+        )
+
+    payload = json.loads((run_dir / "gate_decision.json").read_text())
+    assert payload["decision_signal"] == "synthetic"
+    assert payload["reason_synthetic"] == "preflight_skipped"
+
+
+def test_cl_primary_decision_persists_to_gate_decision_json(
+    temp_manifest: Path, tmp_path: Path,
+):
+    """weak_signal → all v5 CL fields present in gate_decision.json with
+    correct types. Pins the JSON contract downstream consumers depend on."""
+    fake_cache = MagicMock()
+    fake_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=[True, True, True, True, True, False, False],
+        evolved_pass=[True, True, True, True, True, True, True],
+    )
+    run_dir = tmp_path / "run"
+
+    # _LOW_GROWTH_EVOLVED → required_gain=1 → +2 win clears the gate so
+    # the deploy path populates every v5 CL field we're pinning here.
+    with _patch_stack(
+        sat_report=_weak_signal_report(),
+        fake_cache=fake_cache,
+        evolved_description=_LOW_GROWTH_EVOLVED,
+    ):
+        _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+    payload = json.loads((run_dir / "gate_decision.json").read_text())
+
+    assert payload["schema_version"] == "5"
+    assert payload["decision_signal"] == "closed_loop"
+
+    assert isinstance(payload["baseline_closed_loop_per_example"], list)
+    assert all(
+        isinstance(x, (int, float))
+        for x in payload["baseline_closed_loop_per_example"]
+    )
+    assert isinstance(payload["evolved_closed_loop_per_example"], list)
+    assert all(
+        isinstance(x, (int, float))
+        for x in payload["evolved_closed_loop_per_example"]
+    )
+
+    assert isinstance(payload["cl_tasks_gained"], int)
+    assert isinstance(payload["cl_required_gain"], int)
+
+    sanity = payload["synthetic_sanity_check"]
+    assert isinstance(sanity, dict)
+    for key in ("tolerance", "baseline_mean", "evolved_mean", "passed"):
+        assert key in sanity, f"synthetic_sanity_check missing {key!r}"
+    assert isinstance(sanity["tolerance"], (int, float))
+    assert isinstance(sanity["baseline_mean"], (int, float))
+    assert isinstance(sanity["evolved_mean"], (int, float))
+    assert isinstance(sanity["passed"], bool)
+
+    # cost_usd may be None (tests don't exercise the cost ledger), float,
+    # or int — accept any of those; we pin presence and None-or-numeric type.
+    assert "evolved_cl_eval_cost_usd" in payload
+    cost = payload["evolved_cl_eval_cost_usd"]
+    assert cost is None or isinstance(cost, (int, float))
+
+    band_score = payload["band_trigger_score"]
+    assert isinstance(band_score, dict)
+    assert "holdout" in band_score
+    assert "closed_loop" in band_score
+
+    assert isinstance(payload["validator_agent_model"], str)
+
+
+def test_synthetic_only_decision_unchanged_in_gate_decision_json(
+    temp_manifest: Path, tmp_path: Path,
+):
+    """healthy → synthetic path. All v4 fields present, schema_version=5,
+    decision_signal=synthetic, no CL fields."""
+    fake_cache = MagicMock()
+    run_dir = tmp_path / "run"
+
+    with _patch_stack(sat_report=_healthy_report(), fake_cache=fake_cache):
+        _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+    payload = json.loads((run_dir / "gate_decision.json").read_text())
+
+    assert payload["schema_version"] == "5"
+    assert payload["decision_signal"] == "synthetic"
+
+    # v4-and-earlier fields the synthetic path has always written.
+    for required in (
+        "baseline_per_example",
+        "evolved_per_example",
+        "bootstrap",
+        "growth_pct",
+        "required_improvement",
+        "baseline_chars",
+        "evolved_chars",
+        "absolute_char_ceiling",
+        "knee_point",
+        "dataset",
+        "run_inputs",
+    ):
+        assert required in payload, f"missing v4 field {required!r}"
+
+    for cl_field in (
+        "cl_tasks_gained",
+        "cl_required_gain",
+        "synthetic_sanity_check",
+        "baseline_closed_loop_per_example",
+        "evolved_closed_loop_per_example",
+        "band_trigger_score",
+        "validator_agent_model",
+    ):
+        assert cl_field not in payload, (
+            f"CL-only field {cl_field!r} leaked into synthetic-gate payload"
+        )
+
+
+def test_force_run_failure_writes_aborted_decision_with_diagnostic_payload(
+    temp_manifest: Path, tmp_path: Path,
+):
+    """weak_signal + force_run raises → aborted decision,
+    reason=cl_eval_failed, exception text recorded, evolved_FAILED.json
+    written for forensic inspection of the rejected candidate."""
+    fake_cache = MagicMock()
+    fake_cache.force_run.side_effect = RuntimeError("validator crashed")
+    run_dir = tmp_path / "run"
+
+    with _patch_stack(sat_report=_weak_signal_report(), fake_cache=fake_cache):
+        result = _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+    assert result == {"decision": "aborted", "reason": "cl_eval_failed"}
+
+    payload = json.loads((run_dir / "gate_decision.json").read_text())
+    assert payload["decision"] == "aborted"
+    assert payload["reason"] == "cl_eval_failed"
+    assert "validator crashed" in payload["cl_eval_exception"]
+
+    assert (run_dir / "evolved_FAILED.json").exists(), (
+        "evolved_FAILED.json must be written so the rejected variant "
+        "is inspectable"
+    )
+
+
+def test_evolved_task_error_writes_cl_eval_incomplete_decision(
+    temp_manifest: Path, tmp_path: Path,
+):
+    """weak_signal + one evolved task abstained → cl_eval_incomplete
+    (NOT a regression). An infrastructure flake on the evolved phase
+    isn't evidence of quality loss; conflating them would silently
+    reject good candidates."""
+    fake_cache = MagicMock()
+    # task_2 abstains; others pass. Without the incomplete-detection
+    # branch this would score as 6/7 (+1 vs 5/7 baseline) and deploy.
+    fake_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=[True, True, True, True, True, False, False],
+        evolved_pass=[True, True, False, True, True, True, True],
+        evolved_abstain=[False, False, True, False, False, False, False],
+    )
+    run_dir = tmp_path / "run"
+
+    with _patch_stack(sat_report=_weak_signal_report(), fake_cache=fake_cache):
+        result = _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+    assert result == {"decision": "aborted", "reason": "cl_eval_incomplete"}
+
+    payload = json.loads((run_dir / "gate_decision.json").read_text())
+    assert payload["decision"] == "aborted"
+    assert payload["reason"] == "cl_eval_incomplete"
+    assert payload["evolved_closed_loop_errored_tasks"] == ["task_2"]
+
+
+def test_absolute_char_ceiling_still_enforced_in_cl_primary_path(
+    temp_manifest: Path, tmp_path: Path,
+):
+    """weak_signal + +2 CL win + evolved description exceeding the
+    absolute char ceiling → reject. CL-primary mustn't bypass the
+    wallpaper-protection backstop."""
+    fake_cache = MagicMock()
+    fake_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=[True, True, True, True, True, False, False],
+        evolved_pass=[True, True, True, True, True, True, True],
+    )
+    # Baseline = 12 chars. ceiling = max(50, 1.5*12) = 50.
+    # Evolved ~480 chars; trips the absolute_char_ceiling backstop.
+    # Stays under max_tool_desc_size=500 so static checks still pass.
+    long_evolved = (
+        "Find files in the repository by name pattern or glob; "
+        "returns matching file paths from anywhere under the project root. "
+    ) * 4
+    assert 50 < len(long_evolved) <= 500, (
+        f"Test pre-condition: expected 50 < len(long_evolved)={len(long_evolved)} <= 500"
+    )
+
+    run_dir = tmp_path / "run"
+
+    with _patch_stack(
+        sat_report=_weak_signal_report(),
+        fake_cache=fake_cache,
+        evolved_description=long_evolved,
+    ):
+        result = _run_evolve(
+            manifest_path=temp_manifest,
+            output_dir=run_dir,
+            extra_kwargs={"max_absolute_chars": 50},
+        )
+
+    payload = json.loads((run_dir / "gate_decision.json").read_text())
+    assert payload["decision"] == "reject", (
+        f"absolute_char_ceiling must reject even on a winning CL gate; "
+        f"got decision={payload['decision']} (reason={payload.get('reason')})"
+    )
+    assert "absolute_char_ceiling" in payload.get("failed_constraints", []), (
+        f"failed_constraints={payload.get('failed_constraints')}"
+    )
+    # The deploy-gate reject path returns the reject reason from the dict.
+    assert result["decision"] == "reject"

From 70c0e370af77d34c7517f0abd561718d18fe3db1 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 09:05:05 -0600
Subject: [PATCH 7/9] test(evolve_tool): tighten Test 1 assertion, add
 uniform_failure test, pin evolved_FAILED.json

Code-review feedback on the CL-aware gate test suite:

1. Test 1's force_run assertion was substring-based on str(call_args),
   which silently misses regressions where force_run is called twice
   or with extra kwargs. Tightened to assert_called_once_with.

2. Added test_uniform_failure_band_falls_through_to_synthetic_gate
   pinning the spec edge-case (uniform_failure -> synthetic path).
   Without it, expanding use_cl_primary to include uniform_failure
   would silently change behavior without a test failing.

3. Test 9 (cl_eval_incomplete) now asserts evolved_FAILED.json is
   written, mirroring Test 8's assertion on the cl_eval_failed
   abort path. Production writes the file on both abort paths.
---
 tests/tools/test_evolve_tool_cl_aware_gate.py | 45 ++++++++++++++++---
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/tests/tools/test_evolve_tool_cl_aware_gate.py b/tests/tools/test_evolve_tool_cl_aware_gate.py
index 83b1b078..1ab6a3e3 100644
--- a/tests/tools/test_evolve_tool_cl_aware_gate.py
+++ b/tests/tools/test_evolve_tool_cl_aware_gate.py
@@ -366,12 +366,7 @@ def test_weak_signal_band_triggers_evolved_cl_eval(
     ):
         result = _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
 
-    fake_cache.force_run.assert_called()
-    call_args = fake_cache.force_run.call_args_list
-    # The CL-primary post-GEPA call passes the evolved description text.
-    assert any(_LOW_GROWTH_EVOLVED in str(call) for call in call_args), (
-        f"Expected force_run to be called with evolved description, got: {call_args}"
-    )
+    fake_cache.force_run.assert_called_once_with(_LOW_GROWTH_EVOLVED)
 
     payload = json.loads((run_dir / "gate_decision.json").read_text())
     assert payload["decision"] == "deploy", (
@@ -461,6 +456,43 @@ def test_no_headroom_with_cl_data_falls_through_to_synthetic_gate(
         assert cl_field not in payload
 
 
+def test_uniform_failure_band_falls_through_to_synthetic_gate(
+    temp_manifest: Path, tmp_path: Path,
+):
+    """uniform_failure band (CL all-zero, e.g. validator broken) is NOT
+    covered by use_cl_primary — only weak_signal triggers CL-primary.
+    Verifies the gate falls through to the synthetic path with no
+    KeyError and no CL eval. If someone later expands use_cl_primary
+    to include uniform_failure, this test catches the change so it
+    must be accompanied by a deliberate spec update."""
+    fake_cache = MagicMock()
+    sat_report = SaturationReport(
+        band="uniform_failure",
+        holdout_score=0.99,
+        holdout_n=10,
+        holdout_per_example=[1.0] * 10,
+        closed_loop_score=0.0,
+        closed_loop_n=7,
+        closed_loop_per_example=[0.0] * 7,
+        suggestions=[],
+        thresholds={},
+    )
+    run_dir = tmp_path / "run"
+
+    with _patch_stack(sat_report=sat_report, fake_cache=fake_cache):
+        _run_evolve(
+            manifest_path=temp_manifest,
+            output_dir=run_dir,
+            extra_kwargs={"force_saturation_check": True},
+        )
+
+    fake_cache.force_run.assert_not_called()
+    payload = json.loads((run_dir / "gate_decision.json").read_text())
+    assert payload["decision_signal"] == "synthetic"
+    assert "baseline_closed_loop_per_example" not in payload
+    assert "cl_tasks_gained" not in payload
+
+
 def test_no_saturation_check_falls_through_to_synthetic_with_reason_recorded(
     temp_manifest: Path, tmp_path: Path,
 ):
@@ -646,6 +678,7 @@ def test_evolved_task_error_writes_cl_eval_incomplete_decision(
     assert payload["decision"] == "aborted"
     assert payload["reason"] == "cl_eval_incomplete"
     assert payload["evolved_closed_loop_errored_tasks"] == ["task_2"]
+    assert (run_dir / "evolved_FAILED.json").exists()
 
 
 def test_absolute_char_ceiling_still_enforced_in_cl_primary_path(

From c7e8714f2b07bdc9c308f9adb93a06b97a905982 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 09:09:29 -0600
Subject: [PATCH 8/9] test(evolve_tool): schema v5 regression tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pin the v4 → v5 additivity contract: every v4 field must still exist
in v5 output, plus decision_signal (always) and the CL-specific fields
(when use_cl_primary fired). Future schema bumps should add a
TestSchemaV{N}Regression class following this pattern.
---
 tests/tools/test_evolve_tool_cl_aware_gate.py | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/tests/tools/test_evolve_tool_cl_aware_gate.py b/tests/tools/test_evolve_tool_cl_aware_gate.py
index 1ab6a3e3..830c8f21 100644
--- a/tests/tools/test_evolve_tool_cl_aware_gate.py
+++ b/tests/tools/test_evolve_tool_cl_aware_gate.py
@@ -726,3 +726,79 @@ def test_absolute_char_ceiling_still_enforced_in_cl_primary_path(
     )
     # The deploy-gate reject path returns the reject reason from the dict.
     assert result["decision"] == "reject"
+
+
+class TestSchemaV5Regression:
+    """V5 must be additive over v4. Old consumers should see all v4 fields
+    plus the new decision_signal field (and the CL-specific fields when
+    use_cl_primary fired). Future schema bumps should add a parallel
+    TestSchemaV{N}Regression class following the same pattern."""
+
+    # V4 fields that MUST persist in v5 output regardless of code path.
+    # Verified against the decision_payload literal in
+    # evolution/tools/evolve_tool.py.
+    V4_REQUIRED_FIELDS = frozenset({
+        "schema_version", "decision", "reason", "decision_rule_used",
+        "gate_mode", "inferiority_tolerance", "growth_pct",
+        "required_improvement", "baseline_chars", "evolved_chars",
+        "absolute_char_ceiling", "effective_absolute_char_ceiling",
+        "growth_free_threshold", "fitness_profile", "proposer_mode",
+        "growth_quality_slope", "baseline_per_example",
+        "evolved_per_example",
+    })
+
+    def test_synthetic_path_writes_all_v4_fields(
+        self, temp_manifest: Path, tmp_path: Path,
+    ):
+        """healthy band → synthetic gate. Every v4 field must still be
+        present alongside the new decision_signal marker."""
+        fake_cache = MagicMock()
+        run_dir = tmp_path / "run"
+
+        with _patch_stack(sat_report=_healthy_report(), fake_cache=fake_cache):
+            _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+        payload = json.loads((run_dir / "gate_decision.json").read_text())
+
+        missing = self.V4_REQUIRED_FIELDS - payload.keys()
+        assert not missing, f"v4 fields missing in v5 synthetic payload: {sorted(missing)}"
+        assert payload["schema_version"] == "5"
+        assert payload["decision_signal"] == "synthetic"
+
+    def test_cl_primary_path_writes_all_v4_fields_plus_cl_fields(
+        self, temp_manifest: Path, tmp_path: Path,
+    ):
+        """weak_signal + +2 CL win → CL-primary gate. Every v4 field must
+        still be present AND every new v5 CL-specific field must be
+        populated."""
+        cl_fields = frozenset({
+            "decision_signal", "baseline_closed_loop_per_example",
+            "evolved_closed_loop_per_example",
+            "evolved_closed_loop_errored_tasks", "cl_tasks_gained",
+            "cl_required_gain", "synthetic_sanity_check",
+            "evolved_cl_eval_cost_usd", "band_trigger_score",
+            "validator_agent_model",
+        })
+        fake_cache = MagicMock()
+        # 5/7 baseline → 7/7 evolved with _LOW_GROWTH_EVOLVED keeps
+        # required_gain=1 so the +2 win clears the gate and the deploy
+        # branch writes every CL-specific field.
+        fake_cache.force_run.return_value = _fake_validation_report(
+            baseline_pass=[True, True, True, True, True, False, False],
+            evolved_pass=[True, True, True, True, True, True, True],
+        )
+        run_dir = tmp_path / "run"
+
+        with _patch_stack(
+            sat_report=_weak_signal_report(),
+            fake_cache=fake_cache,
+            evolved_description=_LOW_GROWTH_EVOLVED,
+        ):
+            _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+        payload = json.loads((run_dir / "gate_decision.json").read_text())
+
+        missing = (self.V4_REQUIRED_FIELDS | cl_fields) - payload.keys()
+        assert not missing, f"v5 fields missing in CL-primary payload: {sorted(missing)}"
+        assert payload["schema_version"] == "5"
+        assert payload["decision_signal"] == "closed_loop"

From b265b3dfe0fd284fbf064bef33c0ce90862b45ba Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sat, 23 May 2026 09:33:14 -0600
Subject: [PATCH 9/9] fix(evolve_tool): schema v5 consistency across all
 gate_decision write sites

Final-review feedback caught two seam leaks (items 1-2); items 3-4 are
the accompanying test coverage and a naming cleanup:

1. write_cost_ceiling_abort was hard-coding schema_version="4". If the
   cost ceiling tripped during the CL-primary force_run call, the
   resulting gate_decision.json carried v4 in a v5 directory. Made
   schema_version a parameter (default "4" for skill-side callers
   that haven't bumped yet); the tool-side caller passes "5".

2. The static_constraint_failure payload was bumped to v5 in Task 4
   but never had decision_signal added. Every other v5 path has it.
   Set to 'synthetic' since static-fail fires before any CL eval.

3. Extended TestSchemaV5Regression with abort-path coverage so the
   above issues couldn't have slipped through. Three new tests pin
   schema_version and decision_signal on cl_eval_failed,
   cl_eval_incomplete, and static_constraint_failure payloads.

4. Renamed test_accepts_at_pr_68_calibration_point to
   test_accepts_at_24char_baseline_calibration_point per the project
   convention against exposing internal PR numbers in code.
---
 evolution/core/quality_gate.py                |  9 ++-
 evolution/tools/evolve_tool.py                |  2 +
 tests/core/test_check_cl_primary_gate.py      |  6 +-
 tests/tools/test_evolve_tool_cl_aware_gate.py | 71 +++++++++++++++++++
 4 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/evolution/core/quality_gate.py b/evolution/core/quality_gate.py
index bf137cb3..426af86f 100644
--- a/evolution/core/quality_gate.py
+++ b/evolution/core/quality_gate.py
@@ -157,10 +157,15 @@ def write_cost_ceiling_abort(
     output_dir: Path,
     run_inputs: dict[str, Any],
     extra_fields: dict[str, Any] | None = None,
+    schema_version: str = "4",
 ) -> Path:
     """Write a ``decision="aborted"`` gate_decision for a cost-ceiling trip.
+
     ``extra_fields`` lets callers add path-specific keys (e.g.,
-    ``artifact_type``, ``target_tool``).
+    ``artifact_type``, ``target_tool``). ``schema_version`` defaults to
+    ``"4"`` so skill-side callers (which haven't bumped past v4 yet) keep
+    working unchanged; tool-side callers pass ``"5"`` to stay consistent
+    with the rest of the gate_decision write sites in that ``output_dir``.
     """
     cost_summary = COST_LEDGER.summary()
     _console.print(
@@ -168,7 +173,7 @@ def write_cost_ceiling_abort(
         f"ceiling ${exc.ceiling_usd:.4f}[/bold red]"
     )
     payload: dict[str, Any] = {
-        "schema_version": "4",
+        "schema_version": schema_version,
         "decision": "aborted",
         "reason": "cost_ceiling_exceeded",
         "cost_ceiling_usd": exc.ceiling_usd,
diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index 97522d3c..7721af8c 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -824,6 +824,7 @@ def evolve(
                     "schema_version": "5",
                     "decision": "reject",
                     "reason": "static_constraint_failure",
+                    "decision_signal": "synthetic",
                     "failed_constraints": [c.constraint_name for c in static_constraints if not c.passed],
                     "messages": [c.message for c in static_constraints if not c.passed],
                     "knee_point": _knee_point_payload(knee_pick),
@@ -1250,6 +1251,7 @@ def evolve(
                     "artifact_type": "tool_description",
                     "target_tool": tool_name,
                 },
+                schema_version="5",
             )
             return {"decision": "aborted", "reason": "cost_ceiling_exceeded"}
     finally:
diff --git a/tests/core/test_check_cl_primary_gate.py b/tests/core/test_check_cl_primary_gate.py
index 26c66a9d..3c43e296 100644
--- a/tests/core/test_check_cl_primary_gate.py
+++ b/tests/core/test_check_cl_primary_gate.py
@@ -31,9 +31,9 @@ def test_accepts_when_required_gain_met_at_free_threshold(self):
         assert result.passed is True
         assert result.constraint_name == "cl_primary_gate"
 
-    def test_accepts_at_pr_68_calibration_point(self):
-        # PR #68: +2 gain on +121% growth → required=ceil(1.0*(1.21-0.20))=2.
-        # This is the exact case that motivated this work.
+    def test_accepts_at_24char_baseline_calibration_point(self):
+        # +2 task gain on +121% growth → required=ceil(1.0*(1.21-0.20))=2 → just barely passes.
+        # 24-char baseline calibration point from the prior retro-validation.
         result = _check_cl_primary_gate(
             baseline_cl_passes=5,
             evolved_cl_passes=7,
diff --git a/tests/tools/test_evolve_tool_cl_aware_gate.py b/tests/tools/test_evolve_tool_cl_aware_gate.py
index 830c8f21..c5dd86f5 100644
--- a/tests/tools/test_evolve_tool_cl_aware_gate.py
+++ b/tests/tools/test_evolve_tool_cl_aware_gate.py
@@ -802,3 +802,74 @@ def test_cl_primary_path_writes_all_v4_fields_plus_cl_fields(
         assert not missing, f"v5 fields missing in CL-primary payload: {sorted(missing)}"
         assert payload["schema_version"] == "5"
         assert payload["decision_signal"] == "closed_loop"
+
+    def test_cl_eval_failed_payload_has_schema_v5_and_decision_signal(
+        self, temp_manifest: Path, tmp_path: Path,
+    ):
+        """Abort payloads are diagnostic-only (no full v4 field set), but
+        must still pin schema_version="5" and a decision_signal so abort
+        rows route the same way as deploy/reject rows in downstream jq."""
+        fake_cache = MagicMock()
+        fake_cache.force_run.side_effect = RuntimeError("validator crashed")
+        run_dir = tmp_path / "run"
+
+        with _patch_stack(sat_report=_weak_signal_report(), fake_cache=fake_cache):
+            _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+        payload = json.loads((run_dir / "gate_decision.json").read_text())
+        assert payload["schema_version"] == "5"
+        assert payload["decision_signal"] == "closed_loop"
+        assert payload["decision"] == "aborted"
+        assert payload["reason"] == "cl_eval_failed"
+
+    def test_cl_eval_incomplete_payload_has_schema_v5_and_decision_signal(
+        self, temp_manifest: Path, tmp_path: Path,
+    ):
+        """Abort payloads from the incomplete-eval branch must also pin
+        schema_version="5" and decision_signal so abort rows participate
+        in v5 cohort queries alongside deploy/reject rows."""
+        fake_cache = MagicMock()
+        # task_2 abstains; mirrors the incomplete-detection scenario.
+        fake_cache.force_run.return_value = _fake_validation_report(
+            baseline_pass=[True, True, True, True, True, False, False],
+            evolved_pass=[True, True, False, True, True, True, True],
+            evolved_abstain=[False, False, True, False, False, False, False],
+        )
+        run_dir = tmp_path / "run"
+
+        with _patch_stack(sat_report=_weak_signal_report(), fake_cache=fake_cache):
+            _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+        payload = json.loads((run_dir / "gate_decision.json").read_text())
+        assert payload["schema_version"] == "5"
+        assert payload["decision_signal"] == "closed_loop"
+        assert payload["decision"] == "aborted"
+        assert payload["reason"] == "cl_eval_incomplete"
+
+    def test_static_constraint_failure_payload_has_schema_v5_and_decision_signal(
+        self, temp_manifest: Path, tmp_path: Path,
+    ):
+        """Static-fail fires before any CL evaluation could run, so the
+        user never got into the CL-primary path → decision_signal must be
+        "synthetic". Triggered by patching _candidate_description to
+        return an empty string, which fails the non_empty constraint."""
+        fake_cache = MagicMock()
+        run_dir = tmp_path / "run"
+
+        # Use the healthy band so we route through the synthetic-only
+        # path conceptually, then make _candidate_description return ""
+        # to trip the non_empty static constraint. The _patch_stack
+        # context manager already patches _candidate_description; we
+        # override it here with an empty string.
+        with _patch_stack(
+            sat_report=_healthy_report(),
+            fake_cache=fake_cache,
+            evolved_description="",
+        ):
+            _run_evolve(manifest_path=temp_manifest, output_dir=run_dir)
+
+        payload = json.loads((run_dir / "gate_decision.json").read_text())
+        assert payload["schema_version"] == "5"
+        assert payload["decision_signal"] == "synthetic"
+        assert payload["decision"] == "reject"
+        assert payload["reason"] == "static_constraint_failure"