diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
index 1e7994b9..ee4fb0c9 100644
--- a/evolution/skills/evolve_skill.py
+++ b/evolution/skills/evolve_skill.py
@@ -8,6 +8,7 @@
 import difflib
 import json
 import logging
+import math
 import random
 import sys
 import time
@@ -41,7 +42,11 @@
     resolved_lms_dump,
 )
 from evolution.core.quality_gate import (
+    CL_PRIMARY_GROWTH_FREE_THRESHOLD,
+    CL_PRIMARY_GROWTH_SLOPE,
+    CL_PRIMARY_SYNTH_TOLERANCE,
     QUALITY_GATE_PRESETS,
+    _check_cl_primary_gate,
     resolve_proposer_mode,
     run_benchmark_hook,
     write_cost_ceiling_abort,
@@ -61,6 +66,7 @@
 from evolution.core.stats import paired_bootstrap
 from evolution.core.fitness import LLMJudge, make_skill_fitness_metric
 from evolution.core.constraints import (
+    ConstraintResult,
     ConstraintValidator,
     effective_absolute_char_ceiling,
     resolve_decision_rule,
@@ -905,7 +911,11 @@ def evolve(
                 if closed_loop_in_valset:
                     valset = valset + behavioral_examples
 
-            cached_baseline_holdout_per_example = None
+            cached_baseline_holdout_per_example: Optional[list[float]] = None
+            preflight_band: Optional[str] = None
+            cached_baseline_cl_per_example: Optional[list[float]] = None
+            preflight_holdout_score: Optional[float] = None
+            preflight_cl_score: Optional[float] = None
             if not skip_saturation_check:
                 holdout_examples_for_preflight = dataset.to_dspy_examples("holdout")
                 sat_report = saturation_preflight(
@@ -937,6 +947,13 @@ def evolve(
                 else:
                     render_saturation_panel(sat_report, console=console)
                 cached_baseline_holdout_per_example = sat_report.holdout_per_example
+                # Preserve preflight outputs for the deploy gate's CL-primary
+                # path. All None on the --no-saturation-check path (initialized
+                # above the preflight branch).
+                preflight_band = sat_report.band
+                cached_baseline_cl_per_example = sat_report.closed_loop_per_example
+                preflight_holdout_score = sat_report.holdout_score
+                preflight_cl_score = sat_report.closed_loop_score
 
             console.print(f"\n[bold cyan]Running GEPA optimization (budget={gepa_budget})...[/bold cyan]\n")
 
@@ -1028,9 +1045,10 @@ def evolve(
                 failed_path = output_dir / "evolved_FAILED.md"
                 failed_path.write_text(evolved_full)
                 write_gate_decision(output_dir, {
-                    "schema_version": "4",
+                    "schema_version": "5",
                     "decision": "reject",
                     "reason": "static_constraint_failure",
+                    "decision_signal": "synthetic",
                     "failed_constraints": [c.constraint_name for c in static_constraints if not c.passed],
                     "messages": [c.message for c in static_constraints if not c.passed],
                     "knee_point": _knee_point_payload(knee_pick),
@@ -1076,6 +1094,146 @@ def evolve(
             )
             improvement = avg_evolved - avg_baseline
 
+            # Decide which deploy-gate path applies. CL-primary fires when
+            # the preflight saw weak_signal AND CL data is present. All
+            # other cases (no preflight, healthy/no_headroom/uniform_failure
+            # bands, missing CL data) use the synthetic-only path.
+            baseline_chars = len(skill["raw"])
+            evolved_chars = len(evolved_full)
+            growth_pct = (evolved_chars - baseline_chars) / max(1, baseline_chars)
+
+            # Hoist run_inputs to a local — referenced from 3 sites (the
+            # two CL-primary abort paths + the main decision_payload).
+            run_inputs = {
+                "seed": config.seed,
+                "iterations": iterations,
+                "optimizer_model": optimizer_model,
+                "reflection_model": config.reflection_model,
+                "eval_model": config.eval_model,
+                "resolved_lms": resolved_lms_dump(
+                    optimizer=optimizer_model,
+                    reflection=config.reflection_model,
+                    eval_=config.eval_model,
+                ),
+                "eval_dataset_size": config.eval_dataset_size,
+                "holdout_ratio": config.holdout_ratio,
+                "quality_gate_preset": quality_gate,
+                "eval_source": eval_source,
+            }
+
+            use_cl_primary = (
+                preflight_band == "weak_signal"
+                and cached_baseline_cl_per_example is not None
+                and len(cached_baseline_cl_per_example) > 0
+                and closed_loop_cache is not None
+            )
+
+            evolved_cl_report = None
+            evolved_cl_per_example: Optional[list[float]] = None
+            evolved_cl_errored_task_ids: list[str] = []
+            cl_eval_cost_before: float = 0.0
+            cl_eval_cost_usd: Optional[float] = None
+            cl_constraint: Optional[ConstraintResult] = None
+
+            if use_cl_primary:
+                console.print(
+                    f"\n[bold]Evaluating evolved skill body on closed-loop suite[/bold] "
+                    "(weak_signal band → CL-primary gate)"
+                )
+                cl_eval_cost_before = COST_LEDGER.summary().get("total_usd", 0.0)
+                try:
+                    # force_run takes the BODY (no YAML frontmatter); the cache
+                    # key was set up with skill["body"] during preflight, so we
+                    # must match that to avoid silently double-spending on the
+                    # evolved eval.
+                    evolved_cl_report = closed_loop_cache.force_run(evolved_body)
+                except Exception as exc:  # ValidatorError or downstream
+                    cl_eval_cost_usd = COST_LEDGER.summary().get("total_usd", 0.0) - cl_eval_cost_before
+                    console.print(
+                        f"[red]✗ Evolved closed-loop eval failed: {exc}[/red] — writing aborted decision"
+                    )
+                    failed_path = output_dir / "evolved_FAILED.md"
+                    failed_path.write_text(evolved_full)
+                    console.print(f"  Saved failed variant to {failed_path}")
+                    write_gate_decision(output_dir, {
+                        "schema_version": "5",
+                        "decision": "aborted",
+                        "reason": "cl_eval_failed",
+                        "decision_signal": "closed_loop",
+                        "cl_eval_exception": str(exc),
+                        "evolved_cl_eval_cost_usd": cl_eval_cost_usd,
+                        "band_trigger_score": {
+                            "holdout": preflight_holdout_score,
+                            "closed_loop": preflight_cl_score,
+                        },
+                        "validator_agent_model": closed_loop_agent_model,
+                        "baseline_chars": baseline_chars,
+                        "evolved_chars": evolved_chars,
+                        "growth_pct": growth_pct,
+                        "knee_point": _knee_point_payload(knee_pick),
+                        "dataset": _dataset_payload(dataset),
+                        "run_inputs": run_inputs,
+                    })
+                    return
+                cl_eval_cost_usd = COST_LEDGER.summary().get("total_usd", 0.0) - cl_eval_cost_before
+
+                # Detect abstained tasks (TaskResult.abstained == True means
+                # the runner errored — see validation/report.py:score_task).
+                # An infrastructure flake on an evolved task is NOT a quality
+                # regression; conflating them would falsely reject good
+                # candidates. Hard-fail with a written diagnostic instead.
+                evolved_cl_errored_task_ids = [
+                    t.task_id for t in evolved_cl_report.evolved.tasks if t.abstained
+                ]
+                evolved_cl_per_example = [
+                    1.0 if t.passed else 0.0 for t in evolved_cl_report.evolved.tasks
+                ]
+                if evolved_cl_errored_task_ids:
+                    console.print(
+                        f"[red]✗ {len(evolved_cl_errored_task_ids)} evolved CL task(s) errored "
+                        f"({', '.join(evolved_cl_errored_task_ids)}) — writing aborted decision[/red]"
+                    )
+                    failed_path = output_dir / "evolved_FAILED.md"
+                    failed_path.write_text(evolved_full)
+                    console.print(f"  Saved failed variant to {failed_path}")
+                    write_gate_decision(output_dir, {
+                        "schema_version": "5",
+                        "decision": "aborted",
+                        "reason": "cl_eval_incomplete",
+                        "decision_signal": "closed_loop",
+                        "evolved_closed_loop_errored_tasks": evolved_cl_errored_task_ids,
+                        "evolved_closed_loop_per_example": evolved_cl_per_example,
+                        "baseline_closed_loop_per_example": cached_baseline_cl_per_example,
+                        "evolved_cl_eval_cost_usd": cl_eval_cost_usd,
+                        "band_trigger_score": {
+                            "holdout": preflight_holdout_score,
+                            "closed_loop": preflight_cl_score,
+                        },
+                        "validator_agent_model": closed_loop_agent_model,
+                        "baseline_chars": baseline_chars,
+                        "evolved_chars": evolved_chars,
+                        "growth_pct": growth_pct,
+                        "knee_point": _knee_point_payload(knee_pick),
+                        "dataset": _dataset_payload(dataset),
+                        "run_inputs": run_inputs,
+                    })
+                    return
+
+                baseline_cl_passes = int(sum(cached_baseline_cl_per_example))
+                evolved_cl_passes = int(sum(evolved_cl_per_example))
+                cl_constraint = _check_cl_primary_gate(
+                    baseline_cl_passes=baseline_cl_passes,
+                    evolved_cl_passes=evolved_cl_passes,
+                    baseline_synth_mean=avg_baseline,
+                    evolved_synth_mean=avg_evolved,
+                    growth_pct=growth_pct,
+                )
+                icon = "✓" if cl_constraint.passed else "✗"
+                color = "green" if cl_constraint.passed else "red"
+                console.print(
+                    f"  [{color}]{icon} cl_primary_gate[/{color}]: {cl_constraint.message}"
+                )
+
             if evaluate_band_on_holdout and knee_pick is not None:
                 console.print(
                     f"\n[bold]Re-evaluating {knee_pick.band_size} band candidate(s) on holdout[/bold] "
@@ -1100,9 +1258,26 @@ def evolve(
                 n_resamples=config.bootstrap_n_resamples,
                 seed=config.seed,
             )
-            growth_constraints = validator.validate_growth_with_quality(
-                evolved_full, skill["raw"], bootstrap,
-            )
+            if use_cl_primary:
+                # CL-primary path: skip the synthetic growth_quality_gate
+                # (it would always reject when synth is saturated and growth > 0).
+                # But still enforce the absolute_char_ceiling — that's an
+                # orthogonal wallpaper-protection backstop that must hold
+                # regardless of which signal we're gating on.
+                # cl_constraint was bound in the earlier `if use_cl_primary:` block;
+                # the assert narrows Optional[ConstraintResult] so growth_constraints
+                # types as list[ConstraintResult], not list[Optional[ConstraintResult]].
+                assert cl_constraint is not None
+                ceiling_constraint = validator._check_absolute_chars(
+                    evolved_full, baseline_chars,
+                )
+                growth_constraints = [cl_constraint, ceiling_constraint]
+            else:
+                # Synthetic-only path (unchanged): growth_quality_gate runs both
+                # the growth curve and the absolute-char ceiling internally.
+                growth_constraints = validator.validate_growth_with_quality(
+                    evolved_full, skill["raw"], bootstrap,
+                )
             growth_pass = True
             for c in growth_constraints:
                 icon = "✓" if c.passed else "✗"
@@ -1135,7 +1310,9 @@ def evolve(
                     evolved_path.unlink(missing_ok=True)
                     baseline_path.unlink(missing_ok=True)
 
-            growth_pct = (len(evolved_full) - len(skill["raw"])) / max(1, len(skill["raw"]))
+            # baseline_chars / evolved_chars / growth_pct are bound earlier
+            # (before the use_cl_primary branch) so the CL-primary path can
+            # use them in its abort payloads. Don't recompute here.
             required_improvement = max(
                 0.0,
                 config.growth_quality_slope * (growth_pct - config.growth_free_threshold),
@@ -1149,19 +1326,20 @@ def evolve(
             else:
                 decision_reason = "growth_quality_gate"
             decision_payload = {
-                "schema_version": "4",
+                "schema_version": "5",
                 "decision": "deploy" if growth_pass else "reject",
                 "reason": decision_reason,
+                "decision_signal": "closed_loop" if use_cl_primary else "synthetic",
                 "decision_rule_used": decision_rule_used,
                 "gate_mode": config.gate_mode,
                 "inferiority_tolerance": config.inferiority_tolerance,
                 "growth_pct": growth_pct,
                 "required_improvement": required_improvement,
-                "baseline_chars": len(skill["raw"]),
-                "evolved_chars": len(evolved_full),
+                "baseline_chars": baseline_chars,
+                "evolved_chars": evolved_chars,
                 "absolute_char_ceiling": config.max_absolute_chars,
                 "effective_absolute_char_ceiling": effective_absolute_char_ceiling(
-                    config.max_absolute_chars, len(skill["raw"]),
+                    config.max_absolute_chars, baseline_chars,
                 ),
                 "growth_free_threshold": config.growth_free_threshold,
                 "fitness_profile": config.fitness_profile,
@@ -1179,25 +1357,42 @@ def evolve(
                 "messages": [c.message for c in growth_constraints if not c.passed],
                 "knee_point": _knee_point_payload(knee_pick),
                 "dataset": _dataset_payload(dataset),
-                "run_inputs": {
-                    "seed": config.seed,
-                    "iterations": iterations,
-                    "optimizer_model": optimizer_model,
-                    "reflection_model": config.reflection_model,
-                    "eval_model": config.eval_model,
-                    "resolved_lms": resolved_lms_dump(
-                        optimizer=optimizer_model,
-                        reflection=config.reflection_model,
-                        eval_=config.eval_model,
-                    ),
-                    "eval_dataset_size": config.eval_dataset_size,
-                    "holdout_ratio": config.holdout_ratio,
-                    "quality_gate_preset": quality_gate,
-                    "eval_source": eval_source,
-                },
+                "run_inputs": run_inputs,
             }
             if benchmark_block is not None:
                 decision_payload["benchmark"] = benchmark_block
+
+            if use_cl_primary:
+                decision_payload["baseline_closed_loop_per_example"] = cached_baseline_cl_per_example
+                decision_payload["evolved_closed_loop_per_example"] = evolved_cl_per_example
+                decision_payload["evolved_closed_loop_errored_tasks"] = []  # populated only on abort path
+                decision_payload["cl_tasks_gained"] = (
+                    int(sum(evolved_cl_per_example)) - int(sum(cached_baseline_cl_per_example))
+                )
+                decision_payload["cl_required_gain"] = max(
+                    1,
+                    math.ceil(
+                        max(0.0, CL_PRIMARY_GROWTH_SLOPE * (growth_pct - CL_PRIMARY_GROWTH_FREE_THRESHOLD))
+                    ),
+                )
+                decision_payload["synthetic_sanity_check"] = {
+                    "tolerance": CL_PRIMARY_SYNTH_TOLERANCE,
+                    "baseline_mean": avg_baseline,
+                    "evolved_mean": avg_evolved,
+                    "passed": (avg_evolved - avg_baseline) >= -CL_PRIMARY_SYNTH_TOLERANCE,
+                }
+                decision_payload["evolved_cl_eval_cost_usd"] = cl_eval_cost_usd
+                decision_payload["band_trigger_score"] = {
+                    "holdout": preflight_holdout_score,
+                    "closed_loop": preflight_cl_score,
+                }
+                decision_payload["validator_agent_model"] = closed_loop_agent_model
+
+            if not use_cl_primary and preflight_band is None:
+                # User passed --no-saturation-check; record why CL-primary
+                # didn't fire even though CL may be configured.
+                decision_payload["reason_synthetic"] = "preflight_skipped"
+
             gate_path = write_gate_decision(output_dir, decision_payload)
             console.print(f"  [dim]Gate decision logged to {gate_path}[/dim]")
 
@@ -1309,6 +1504,7 @@ def evolve(
                     "quality_gate_preset": quality_gate,
                     "eval_source": eval_source,
                 },
+                schema_version="5",
             )
             return
     finally:
diff --git a/tests/fixtures/skills/weakened-systematic-debugging/SKILL.md b/tests/fixtures/skills/weakened-systematic-debugging/SKILL.md
new file mode 100644
index 00000000..ba6ba45a
--- /dev/null
+++ b/tests/fixtures/skills/weakened-systematic-debugging/SKILL.md
@@ -0,0 +1,69 @@
+---
+name: weakened-systematic-debugging
+description: "Diagnose Python bugs and produce a written diagnostic report."
+version: 0.1.0
+metadata:
+  hermes:
+    tags: [debugging, diagnosis, analysis, code-review]
+---
+
+# Python Bug Diagnostician
+
+## Overview
+
+This skill produces a **diagnostic report** for a buggy Python file.
+Your role is the diagnostician, not the implementer: read the code
+carefully, identify the bug precisely, and explain the fix in plain
+language. The user (or a downstream agent) applies the change.
+
+## When to Use
+
+Use when given a buggy Python file or failing test where the user wants
+to understand *what is wrong* before any code is changed.
+
+## Diagnostic Process
+
+### 1. Read the source
+
+Use `read_file` on the source file(s) involved. Read the full contents,
+not just the named function. Read any associated test files.
+
+### 2. Identify the bug
+
+State precisely:
+- Which line contains the bug
+- What the current behavior is
+- What the intended behavior is (per the docstring or test)
+- Why the current code produces the wrong behavior
+
+### 3. Specify the fix
+
+Describe the change in words: "Replace `x * 2` on line 4 with `x * x`."
+Include the literal old and new code in the report.
+
+### 4. Stop
+
+Do **not** call `write_file` or edit the source. Do **not** run the test
+to verify. Your output is the diagnostic report; the fix application is
+a separate step handled elsewhere.
+
+## Why diagnosis-only
+
+Separating diagnosis from implementation produces clearer bug reports,
+keeps the audit trail clean, and prevents premature edits that mask the
+real root cause. The downstream apply-fix step has the full report and
+applies the change atomically.
+
+## Report Format
+
+```
+BUG LOCATION: <file>:<line>
+CURRENT BEHAVIOR: <what the code does now>
+INTENDED BEHAVIOR: <what it should do>
+ROOT CAUSE: <one-sentence explanation>
+PROPOSED FIX:
+  - Old: <literal code>
+  - New: <literal code>
+```
+
+End your response with the report. Do not edit files. Do not run tests.
diff --git a/tests/skills/test_evolve_skill_cl_aware_gate.py b/tests/skills/test_evolve_skill_cl_aware_gate.py
new file mode 100644
index 00000000..421352fc
--- /dev/null
+++ b/tests/skills/test_evolve_skill_cl_aware_gate.py
@@ -0,0 +1,907 @@
+"""Integration tests for the skill-side deploy-gate CL-aware branch.
+
+Symmetric to tests/tools/test_evolve_tool_cl_aware_gate.py — mocks the
+synthetic dataset builder + closed-loop cache so each test can pin a
+saturation band and verify the deploy gate's branch behaviour plus
+``gate_decision.json`` shape. No real LM calls.
+
+Tests 1-10 mirror the tool-side suite. Tests 11-13 cover skill-specific
+invariants:
+
+  11. force_run is called with the skill BODY (not the full
+      frontmatter+body file). Guards against the cache-key-mismatch
+      silent failure where the evolved variant would be re-validated
+      under a different key, double-spending ~$1-3 per run.
+  12. abort paths produce ``evolved_FAILED.md`` (not ``.json``). The
+      skill-side convention matches how baseline/evolved are written
+      so post-run diff tooling continues to work.
+  13. v4 skill-specific payload fields (``bap_max_growth``,
+      ``bap_safety_margin``, ``eval_source``, ``fitness_profile``,
+      ``proposer_mode``, ``knee_point.band_roster``) survive the v5
+      bump in the CL-primary path.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import json
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Optional
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from evolution.core.dataset_builder import EvalDataset, EvalExample
+from evolution.core.saturation_check import SaturationReport
+from evolution.skills.evolve_skill import evolve
+from evolution.skills.knee_point import CandidatePick
+from evolution.validation.report import (
+    PhaseResult,
+    TaskResult,
+    ValidationReport,
+    WinLoss,
+)
+
+
+# Demo SKILL.md used as the baseline. Kept tiny + stable so the
+# growth-pct math in every test is predictable. Lengths:
+#   frontmatter (between ---s) = "name: demo-skill\ndescription: a test skill" (42 chars)
+#   body (after the second ---) = "Do X." (5 chars)
+#   raw (full file content)    = 58 chars
+# After reassemble_skill: "---\n{frontmatter}\n---\n\n{body}\n" = 53 + len(body) chars.
+_SKILL_FRONTMATTER = "name: demo-skill\ndescription: a test skill"
+_BASELINE_BODY = "Do X."
+_BASELINE_RAW = f"---\n{_SKILL_FRONTMATTER}\n---\n\n{_BASELINE_BODY}\n"
+assert len(_BASELINE_RAW) == 58, (
+    f"Test pre-condition: baseline raw must be 58 chars, got {len(_BASELINE_RAW)}"
+)
+
+
+@pytest.fixture
+def skill_dir(tmp_path: Path) -> Path:
+    """Write a minimal SKILL.md so skill discovery succeeds."""
+    skills_root = tmp_path / "skills"
+    skill_path = skills_root / "demo-skill"
+    skill_path.mkdir(parents=True)
+    (skill_path / "SKILL.md").write_text(_BASELINE_RAW)
+    return skills_root
+
+
+def _fake_skill_dataset(n: int = 50) -> EvalDataset:
+    """Build a real-shaped EvalDataset from n fake examples (no LM calls).
+
+    Mirrors the helper in test_evolve_skill_saturation_preflight.py.
+    Slices are hard-coded to 30/10/10 (n < 50 leaves them short) — holdout
+    must be ≥ EvolutionConfig.min_holdout_size (default 10) or evolve()
+    aborts before the deploy-gate branch even runs.
+    """
+    examples = [
+        EvalExample(task_input=f"task {i}", expected_behavior=f"rubric {i}")
+        for i in range(n)
+    ]
+    return EvalDataset(
+        train=examples[:30], val=examples[30:40], holdout=examples[40:50],
+    )
+
+
+def _fake_validation_report(
+    *,
+    baseline_pass: list[bool],
+    evolved_pass: list[bool],
+    evolved_abstain: Optional[list[bool]] = None,
+) -> ValidationReport:
+    """Build a ValidationReport with the given per-task verdicts.
+
+    Mirrors what ClosedLoopFeedbackCache.force_run returns; ``evolved``
+    is the only phase the deploy-gate branch actually reads (it pulls
+    baseline pass-counts from the cached preflight data). Skill-side
+    suites score via test_command rather than tool_calls_seq, so we
+    leave tool_calls_seq empty.
+    """
+    n = len(baseline_pass)
+    evolved_abstain = evolved_abstain or [False] * n
+    assert len(evolved_pass) == n
+    assert len(evolved_abstain) == n
+
+    baseline_tasks = [
+        TaskResult(
+            task_id=f"task_{i}",
+            passed=p,
+            abstained=False,
+            tool_calls_seq=[],
+            duration_seconds=0.1,
+        )
+        for i, p in enumerate(baseline_pass)
+    ]
+    evolved_tasks = [
+        TaskResult(
+            task_id=f"task_{i}",
+            passed=p,
+            abstained=a,
+            tool_calls_seq=[],
+            duration_seconds=0.1,
+            error="runner timeout" if a else None,
+        )
+        for i, (p, a) in enumerate(zip(evolved_pass, evolved_abstain))
+    ]
+
+    def _phase(tasks: list[TaskResult]) -> PhaseResult:
+        n_p = sum(1 for t in tasks if t.passed and not t.abstained)
+        n_f = sum(1 for t in tasks if not t.passed and not t.abstained)
+        n_a = sum(1 for t in tasks if t.abstained)
+        scored = n_p + n_f
+        return PhaseResult(
+            pass_rate=(n_p / scored) if scored else 0.0,
+            n_passed=n_p,
+            n_failed=n_f,
+            n_abstained=n_a,
+            tasks=tasks,
+        )
+
+    return ValidationReport(
+        schema_version="1",
+        tool="demo-skill",
+        task_suite_path="fake_suite.jsonl",
+        task_suite_sha256="0" * 64,
+        baseline=_phase(baseline_tasks),
+        evolved=_phase(evolved_tasks),
+        delta=WinLoss(
+            n_wins=0, n_losses=0, n_ties=n, pass_rate_change=0.0,
+        ),
+        decision="pass",
+        decision_reasons=[],
+    )
+
+
+def _make_knee_pick(evolved_body: str) -> CandidatePick:
+    """Build a CandidatePick that select_knee_point would return.
+
+    ``skill_text`` IS the evolved body (no frontmatter). evolve_skill.py
+    then reassembles the full file via reassemble_skill(frontmatter, body)
+    for the static checks, but force_run() is called with the body alone.
+    """
+    fake_module = MagicMock()
+    fake_module.skill_text = evolved_body
+    return CandidatePick(
+        module=fake_module,
+        skill_text=evolved_body,
+        body_chars=len(evolved_body),
+        val_score=0.8,
+        val_rank_in_band=1,
+        band_size=1,
+        epsilon=0.1,
+        fallback="knee",
+        picked_idx=0,
+        gepa_default_idx=0,
+        gepa_default_body_chars=len(evolved_body),
+        band_roster=[],
+    )
+
+
+def _make_fake_gepa(evolved_body: str):
+    """Build a fake dspy.GEPA whose ``compile()`` returns a module with
+    the detailed_results shape the knee-point path expects."""
+
+    class _FakeGEPA:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+        def compile(self, baseline_module, *, trainset, valset):
+            fake_module = MagicMock()
+            fake_module.detailed_results = SimpleNamespace(
+                candidates=[fake_module],
+                val_aggregate_scores=[1.0],
+                best_idx=0,
+            )
+            fake_module.skill_text = evolved_body
+            return fake_module
+
+    return _FakeGEPA
+
+
+# A few body strings hand-picked to keep growth_pct in the zones the
+# tests need. baseline raw = 58. evolved_full = 53 + len(body).
+#
+# _LOW_GROWTH_BODY: growth_pct ≈ 3.4% → required_gain=1 → a +2 CL win
+# clears CL-primary. evolved_full = 53 + 7 = 60, growth = (60-58)/58 = 3.45%.
+_LOW_GROWTH_BODY = "Find X."  # 7 chars; under the 0.20 growth-free threshold.
+
+# Default body for tests that don't care about growth: stays under the
+# default non-inferiority static_ceiling and keeps the structure intact.
+_EVOLVED_BODY = "Do X better."  # 12 chars.
+
+
+@contextlib.contextmanager
+def _patch_stack(
+    *,
+    sat_report: SaturationReport,
+    fake_cache: Optional[MagicMock],
+    holdout_baseline_mean: float = 0.95,
+    holdout_evolved_mean: float = 0.96,
+    holdout_n: int = 10,
+    evolved_body: str = _EVOLVED_BODY,
+):
+    """Single context manager wrapping every seam patch each test needs.
+
+    Tests stay focused on the band/cache/assertion they're verifying.
+    """
+    fake_builder = MagicMock()
+    fake_builder.generate.return_value = _fake_skill_dataset()
+    knee_pick = _make_knee_pick(evolved_body)
+    evolved_per = [holdout_evolved_mean] * holdout_n
+
+    def _maybe_build(**kwargs):
+        # Honour the real "no suite path → no cache" contract; if a test
+        # forgets to pass a suite path the use_cl_primary branch can't
+        # fire (None cache) instead of getting a confusingly-active mock.
+        if kwargs.get("suite_path") is None:
+            return None
+        return fake_cache
+
+    with contextlib.ExitStack() as stack:
+        stack.enter_context(patch(
+            "evolution.skills.evolve_skill.SyntheticDatasetBuilder",
+            return_value=fake_builder,
+        ))
+        stack.enter_context(patch(
+            "evolution.skills.evolve_skill.saturation_preflight",
+            return_value=sat_report,
+        ))
+        stack.enter_context(patch(
+            "evolution.skills.evolve_skill._preflight_lm_credentials",
+        ))
+        stack.enter_context(patch(
+            "evolution.skills.evolve_skill._maybe_build_closed_loop_cache_skill",
+            side_effect=_maybe_build,
+        ))
+        stack.enter_context(patch(
+            "evolution.skills.evolve_skill.dspy.GEPA",
+            new=_make_fake_gepa(evolved_body),
+        ))
+        stack.enter_context(patch(
+            "evolution.skills.evolve_skill.select_knee_point",
+            return_value=knee_pick,
+        ))
+        stack.enter_context(patch(
+            "evolution.skills.evolve_skill._holdout_evaluate_with_metric",
+            return_value=(holdout_evolved_mean, evolved_per),
+        ))
+        # In headless test envs stdin is non-TTY. For non-healthy bands
+        # the orchestrator otherwise sys.exit(3)s before the deploy gate.
+        stack.enter_context(patch(
+            "evolution.skills.evolve_skill.is_non_interactive",
+            return_value=False,
+        ))
+        stack.enter_context(patch(
+            "evolution.skills.evolve_skill.interactive_confirm",
+            return_value=True,
+        ))
+        yield
+
+
+def _run_evolve(
+    *,
+    skill_dir: Path,
+    extra_kwargs: Optional[dict] = None,
+):
+    """Call evolve() with the baseline kwargs shared by this module's tests.
+
+    Anything in ``extra_kwargs`` is layered over the defaults, so a test
+    only spells out the knobs it is actually exercising.
+
+    No output_dir is passed: skill-side evolve() is described as writing to
+    ``Path("output") / skill_name / timestamp``, so tests chdir into
+    tmp_path first and read ``tmp_path/output/demo-skill/<timestamp>/``.
+    """
+    call_kwargs = {
+        "skill_name": "demo-skill",
+        "skill_source_dirs": [str(skill_dir)],
+        "iterations": 1,
+        "eval_dataset_size": 50,
+        "holdout_ratio": 0.2,
+        "quality_gate": "non-inferiority",
+        "closed_loop_suite_path": Path("/fake/suite.jsonl"),
+        "closed_loop_mode": "feedback",
+        "closed_loop_in_valset": False,
+        "closed_loop_agent_model": "openai/gpt-5-mini",
+        "max_total_cost_usd": 5.0,
+        "skip_preflight": True,
+    }
+    # A missing/empty extra_kwargs leaves the defaults untouched.
+    call_kwargs.update(extra_kwargs or {})
+    return evolve(**call_kwargs)
+
+
+def _latest_gate_decision(tmp_path: Path) -> dict:
+    """Load gate_decision.json from the newest run directory.
+
+    Run dirs live at ``tmp_path/output/demo-skill/<timestamp>/``; the
+    timestamp isn't known up front, so the name-sorted last entry is
+    taken as the latest. Each missing piece fails with its own message.
+    """
+    demo_root = tmp_path / "output" / "demo-skill"
+    assert demo_root.exists(), f"No run output under {demo_root}"
+    run_dirs = sorted(demo_root.iterdir())
+    assert run_dirs, f"No timestamped run dirs under {demo_root}"
+    decision_file = run_dirs[-1] / "gate_decision.json"
+    assert decision_file.exists(), f"No gate_decision.json at {decision_file}"
+    return json.loads(decision_file.read_text())
+
+
+def _latest_run_dir(tmp_path: Path) -> Path:
+    # Name-sorted last entry of tmp_path/output/demo-skill (no existence checks).
+    run_dirs = sorted((tmp_path / "output" / "demo-skill").iterdir())
+    return run_dirs[-1]
+
+
+def _weak_signal_report() -> SaturationReport:
+    """Preflight report in the weak_signal band with a 5/7 CL baseline."""
+    # Five passes then two failures: the 5/7 baseline the deploy gate reads.
+    cl_scores = [1.0] * 5 + [0.0] * 2
+    return SaturationReport(
+        band="weak_signal",
+        holdout_score=0.95,
+        holdout_n=10,
+        holdout_per_example=[0.95] * 10,
+        closed_loop_score=5 / 7,
+        closed_loop_n=7,
+        closed_loop_per_example=cl_scores,
+        suggestions=[],
+        thresholds={},
+    )
+
+
+def _healthy_report() -> SaturationReport:
+    """Healthy-band report; every closed-loop field is left as None."""
+    unmeasured = dict.fromkeys(
+        ("closed_loop_score", "closed_loop_n", "closed_loop_per_example"))
+    return SaturationReport(
+        band="healthy",
+        holdout_score=0.5,
+        holdout_n=10,
+        holdout_per_example=[0.5] * 10,
+        suggestions=[],
+        thresholds={},
+        **unmeasured,
+    )
+
+
+def _no_headroom_report(*, with_cl_data: bool) -> SaturationReport:
+    """Build a no_headroom-band report, with or without closed-loop data.
+    When ``with_cl_data`` is false all three closed_loop_* fields are None."""
+    measured = (1.0, 7, [1.0] * 7)
+    cl_score, cl_n, cl_scores = measured if with_cl_data else (None, None, None)
+    # holdout_n stays at 10 to match the _patch_stack holdout_n, so cached
+    # baseline and post-GEPA evolved lists line up for paired_bootstrap.
+    return SaturationReport(
+        band="no_headroom",
+        holdout_score=0.99,
+        holdout_n=10,
+        holdout_per_example=[1.0] * 10,
+        closed_loop_score=cl_score,
+        closed_loop_n=cl_n,
+        closed_loop_per_example=cl_scores,
+        suggestions=["Try a harder suite"],
+        thresholds={},
+    )
+
+
+# ---------------------------------------------------------------------------
+# Tests 1-10: mirror the tool-side suite
+# ---------------------------------------------------------------------------
+
+
+def test_weak_signal_band_triggers_evolved_cl_eval(
+    skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
+):
+    """A weak_signal preflight plus a two-task CL improvement must run
+    force_run on the evolved body and deploy on the closed_loop signal."""
+    monkeypatch.chdir(tmp_path)
+    # Baseline passes 5 of 7 tasks (matching the preflight report's
+    # per-example list); the evolved run passes all 7, i.e. +2 tasks.
+    base_flags = [True] * 5 + [False] * 2
+    cl_cache = MagicMock()
+    cl_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=base_flags,
+        evolved_pass=[True] * 7,
+    )
+
+    # With _LOW_GROWTH_BODY the required gain is a single task, so the
+    # +2 win is enough for the cl_primary_gate.
+    with _patch_stack(
+        sat_report=_weak_signal_report(),
+        fake_cache=cl_cache,
+        evolved_body=_LOW_GROWTH_BODY,
+    ):
+        _run_evolve(skill_dir=skill_dir)
+
+    cl_cache.force_run.assert_called_once_with(_LOW_GROWTH_BODY)
+
+    decision = _latest_gate_decision(tmp_path)
+    assert decision["decision"] == "deploy", (
+        f"weak_signal + 5→7 should deploy, got {decision['decision']} "
+        f"(reason: {decision.get('reason')})"
+    )
+    assert decision["decision_signal"] == "closed_loop"
+    assert decision["cl_tasks_gained"] == 2
+
+
+def test_healthy_band_does_not_trigger_cl_aware_gate(
+    skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
+):
+    """On a healthy band the decision comes from the synthetic gate:
+    force_run is never invoked and no CL-only keys reach the payload."""
+    monkeypatch.chdir(tmp_path)
+    cl_cache = MagicMock()
+    with _patch_stack(sat_report=_healthy_report(), fake_cache=cl_cache):
+        _run_evolve(skill_dir=skill_dir)
+
+    cl_cache.force_run.assert_not_called()
+    decision = _latest_gate_decision(tmp_path)
+    assert decision["decision_signal"] == "synthetic"
+    cl_only_keys = (
+        "cl_tasks_gained",
+        "cl_required_gain",
+        "synthetic_sanity_check",
+        "baseline_closed_loop_per_example",
+        "evolved_closed_loop_per_example",
+    )
+    for cl_field in cl_only_keys:
+        assert cl_field not in decision, (
+            f"CL field {cl_field!r} should not be in synthetic-gate payload"
+        )
+
+
+def test_no_headroom_falls_through_to_synthetic_gate(
+    skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
+):
+    """Even when a no_headroom report carries CL data, the CL-primary gate
+    must stay off (it is reserved for weak_signal) and synthetic decides."""
+    monkeypatch.chdir(tmp_path)
+    cl_cache = MagicMock()
+    report = _no_headroom_report(with_cl_data=True)
+
+    # presumably force_saturation_check keeps a no_headroom run going — confirm
+    with _patch_stack(sat_report=report, fake_cache=cl_cache):
+        _run_evolve(
+            skill_dir=skill_dir,
+            extra_kwargs={"force_saturation_check": True},
+        )
+
+    cl_cache.force_run.assert_not_called()
+    decision = _latest_gate_decision(tmp_path)
+    assert decision["decision_signal"] == "synthetic"
+    absent_keys = (
+        "cl_tasks_gained",
+        "cl_required_gain",
+        "synthetic_sanity_check",
+    )
+    for cl_field in absent_keys:
+        assert cl_field not in decision
+
+
+def test_no_headroom_without_cl_data_falls_through_to_synthetic_gate(
+    skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
+):
+    """no_headroom with no CL data, run under force_saturation_check, must
+    reach the synthetic gate without a KeyError; CL was never measured,
+    so there are no CL fields to read."""
+    monkeypatch.chdir(tmp_path)
+    cl_cache = MagicMock()
+    # with_cl_data=False leaves every closed_loop_* report field as None.
+    report = _no_headroom_report(with_cl_data=False)
+
+    with _patch_stack(sat_report=report, fake_cache=cl_cache):
+        _run_evolve(
+            skill_dir=skill_dir,
+            extra_kwargs={"force_saturation_check": True},
+        )
+
+    decision = _latest_gate_decision(tmp_path)
+    assert decision["decision_signal"] == "synthetic"
+
+
+def test_no_saturation_check_falls_through_to_synthetic_with_reason_recorded(
+    skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
+):
+    """With --no-saturation-check there is no preflight and the synthetic
+    gate decides. The payload must record reason_synthetic ==
+    preflight_skipped so consumers can tell 'preflight didn't run' apart
+    from 'preflight saw nothing weak'."""
+    monkeypatch.chdir(tmp_path)
+    cl_cache = MagicMock()
+    # _patch_stack insists on a report even though skip_saturation_check
+    # bypasses the preflight, so this one is only a placeholder.
+    unused_report = _healthy_report()
+    with _patch_stack(sat_report=unused_report, fake_cache=cl_cache):
+        _run_evolve(
+            skill_dir=skill_dir,
+            extra_kwargs={"skip_saturation_check": True},
+        )
+
+    decision = _latest_gate_decision(tmp_path)
+    assert decision["decision_signal"] == "synthetic"
+    assert decision["reason_synthetic"] == "preflight_skipped"
+
+
+def test_cl_primary_decision_persists_to_gate_decision_json(
+    skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
+):
+    """Pin the gate_decision.json contract for a weak_signal deploy:
+    every v5 closed-loop field must be present with the expected type."""
+    monkeypatch.chdir(tmp_path)
+    numeric = (int, float)
+    cl_cache = MagicMock()
+    cl_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=[True] * 5 + [False] * 2,
+        evolved_pass=[True] * 7,
+    )
+
+    # The low-growth body keeps required_gain at 1, so the +2 win passes
+    # the gate and the deploy path writes all the fields checked below.
+    with _patch_stack(
+        sat_report=_weak_signal_report(),
+        fake_cache=cl_cache,
+        evolved_body=_LOW_GROWTH_BODY,
+    ):
+        _run_evolve(skill_dir=skill_dir)
+
+    decision = _latest_gate_decision(tmp_path)
+
+    assert decision["schema_version"] == "5"
+    assert decision["decision_signal"] == "closed_loop"
+
+    # Both per-example CL entries: a list whose items are all numbers.
+    for list_key in (
+        "baseline_closed_loop_per_example",
+        "evolved_closed_loop_per_example",
+    ):
+        scores = decision[list_key]
+        assert isinstance(scores, list)
+        assert all(isinstance(x, numeric) for x in scores)
+
+    # Task counts are serialised as integers.
+    assert isinstance(decision["cl_tasks_gained"], int)
+    assert isinstance(decision["cl_required_gain"], int)
+
+    sanity = decision["synthetic_sanity_check"]
+    assert isinstance(sanity, dict)
+    for key in ("tolerance", "baseline_mean", "evolved_mean", "passed"):
+        assert key in sanity, f"synthetic_sanity_check missing {key!r}"
+    for num_key in ("tolerance", "baseline_mean", "evolved_mean"):
+        assert isinstance(sanity[num_key], numeric)
+    assert isinstance(sanity["passed"], bool)
+
+    # Only presence is pinned for the cost field; its value may be None
+    # (these tests don't exercise the cost ledger) or any number.
+    assert "evolved_cl_eval_cost_usd" in decision
+    cost = decision["evolved_cl_eval_cost_usd"]
+    assert cost is None or isinstance(cost, numeric)
+
+    # band_trigger_score must expose both score keys.
+    band_score = decision["band_trigger_score"]
+    assert isinstance(band_score, dict)
+    assert "holdout" in band_score
+    assert "closed_loop" in band_score
+
+    assert isinstance(decision["validator_agent_model"], str)
+
+
+def test_synthetic_only_decision_unchanged_in_gate_decision_json(
+    skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
+):
+    """Healthy band, synthetic path: the payload keeps every pre-v5 field,
+    gains the decision_signal marker, and carries no CL-only keys.
+
+    The skill-specific v4 keys (``bap_max_growth``, ``bap_safety_margin``,
+    ``fitness_profile``, ``proposer_mode``) plus ``run_inputs.eval_source``
+    MUST survive the bump to schema version 5.
+    """
+    monkeypatch.chdir(tmp_path)
+    cl_cache = MagicMock()
+
+    with _patch_stack(sat_report=_healthy_report(), fake_cache=cl_cache):
+        _run_evolve(skill_dir=skill_dir)
+
+    decision = _latest_gate_decision(tmp_path)
+    assert decision["schema_version"] == "5"
+    assert decision["decision_signal"] == "synthetic"
+
+    # Fields the synthetic path wrote in v4 and earlier.
+    legacy_keys = (
+        "baseline_per_example",
+        "evolved_per_example",
+        "bootstrap",
+        "growth_pct",
+        "required_improvement",
+        "baseline_chars",
+        "evolved_chars",
+        "absolute_char_ceiling",
+        "knee_point",
+        "dataset",
+        "run_inputs",
+        # Skill-specific v4 fields that need explicit preservation.
+        "bap_max_growth",
+        "bap_safety_margin",
+        "fitness_profile",
+        "proposer_mode",
+    )
+    for required in legacy_keys:
+        assert required in decision, f"missing v4 field {required!r}"
+
+    assert decision["run_inputs"]["eval_source"] == "synthetic"
+
+    cl_only_keys = (
+        "cl_tasks_gained",
+        "cl_required_gain",
+        "synthetic_sanity_check",
+        "baseline_closed_loop_per_example",
+        "evolved_closed_loop_per_example",
+        "band_trigger_score",
+        "validator_agent_model",
+    )
+    for cl_field in cl_only_keys:
+        assert cl_field not in decision, (
+            f"CL-only field {cl_field!r} leaked into synthetic-gate payload"
+        )
+
+
+def test_force_run_failure_writes_aborted_decision_with_diagnostic_payload(
+    skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
+):
+    """If force_run raises on a weak_signal run, the decision is aborted
+    with reason cl_eval_failed, the exception text is recorded, and
+    evolved_FAILED.md preserves the rejected candidate for inspection."""
+    monkeypatch.chdir(tmp_path)
+    cl_cache = MagicMock()
+    cl_cache.force_run.side_effect = RuntimeError("validator crashed")
+
+    with _patch_stack(sat_report=_weak_signal_report(), fake_cache=cl_cache):
+        _run_evolve(skill_dir=skill_dir)
+
+    decision = _latest_gate_decision(tmp_path)
+    assert decision["decision"] == "aborted"
+    assert decision["reason"] == "cl_eval_failed"
+    assert "validator crashed" in decision["cl_eval_exception"]
+
+    failed_copy = _latest_run_dir(tmp_path) / "evolved_FAILED.md"
+    assert failed_copy.exists(), (
+        "evolved_FAILED.md must be written so the rejected variant "
+        "is inspectable"
+    )
+
+
+def test_evolved_task_error_writes_cl_eval_incomplete_decision(
+    skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
+):
+    """A single abstaining evolved task on a weak_signal run must yield
+    cl_eval_incomplete rather than a regression verdict: an infrastructure
+    flake in the evolved phase is not evidence of quality loss, and
+    treating it as such would silently reject good candidates."""
+    monkeypatch.chdir(tmp_path)
+    # Only index 2 abstains; every other evolved task passes. Were the
+    # abstention counted as a fail, 6/7 vs the 5/7 baseline would deploy.
+    abstain_idx = 2
+    cl_cache = MagicMock()
+    cl_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=[True] * 5 + [False] * 2,
+        evolved_pass=[i != abstain_idx for i in range(7)],
+        evolved_abstain=[i == abstain_idx for i in range(7)],
+    )
+
+    with _patch_stack(sat_report=_weak_signal_report(), fake_cache=cl_cache):
+        _run_evolve(skill_dir=skill_dir)
+
+    decision = _latest_gate_decision(tmp_path)
+    assert decision["decision"] == "aborted"
+    assert decision["reason"] == "cl_eval_incomplete"
+    assert decision["evolved_closed_loop_errored_tasks"] == ["task_2"]
+
+    assert (_latest_run_dir(tmp_path) / "evolved_FAILED.md").exists()
+
+
+def test_absolute_char_ceiling_still_enforced_in_cl_primary_path(
+    skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
+):
+    """The absolute char ceiling must reject an oversized body even when
+    the closed-loop gate is won (weak_signal, +2 tasks); CL-primary is
+    not allowed to bypass that wallpaper-protection backstop.
+
+    Flow being pinned, as described for production: the early
+    ``validate_static`` pass only covers ``size_limit``/``non_empty``/
+    ``skill_structure``, while ``absolute_char_ceiling`` is checked by
+    ``validator._check_absolute_chars`` inside the CL-primary branch,
+    AFTER ``force_run``. The rejection therefore still carries
+    ``decision_signal: "closed_loop"`` and the CL cache has already been
+    asked to evaluate the evolved body.
+
+    Sizing: baseline raw is 58 chars, so with max_absolute_chars=50 the
+    effective ceiling is max(50, 1.5*58) = 87. evolved_full adds 53
+    chars on top of the body, so the doubled two-sentence body below is
+    far past the ceiling while staying under config.max_skill_size, so
+    the other static checks still pass.
+    """
+    monkeypatch.chdir(tmp_path)
+    cl_cache = MagicMock()
+    cl_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=[True] * 5 + [False] * 2,
+        evolved_pass=[True] * 7,
+    )
+    sentence = (
+        "Find files in the repository by name pattern or glob; returns "
+        "matching file paths from anywhere under the project root."
+    )
+    long_body = sentence * 2
+    assert len(long_body) > 87 - 53, (
+        f"Test pre-condition: long_body={len(long_body)} must trip the ceiling"
+    )
+
+    with _patch_stack(
+        sat_report=_weak_signal_report(),
+        fake_cache=cl_cache,
+        evolved_body=long_body,
+    ):
+        _run_evolve(
+            skill_dir=skill_dir,
+            extra_kwargs={"max_absolute_chars": 50},
+        )
+
+    decision = _latest_gate_decision(tmp_path)
+    assert decision["decision"] == "reject", (
+        f"absolute_char_ceiling must reject even on a winning CL gate; "
+        f"got decision={decision['decision']} (reason={decision.get('reason')})"
+    )
+    assert "absolute_char_ceiling" in decision.get("failed_constraints", []), (
+        f"failed_constraints={decision.get('failed_constraints')}"
+    )
+    # Pin the code path: the reject has to come from the in-branch ceiling
+    # check (the early validate_static pass doesn't include the ceiling),
+    # so CL eval has already run and the signal is "closed_loop".
+    assert decision["decision_signal"] == "closed_loop", (
+        f"CL-primary ceiling reject must emit decision_signal='closed_loop'; "
+        f"got {decision.get('decision_signal')!r} — did the ceiling check move "
+        f"out of the CL-primary branch?"
+    )
+    cl_cache.force_run.assert_called_once_with(long_body)
+
+
+# ---------------------------------------------------------------------------
+# Tests 11-13: skill-specific guards
+# ---------------------------------------------------------------------------
+
+
+def test_force_run_called_with_skill_body_not_full(
+    skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
+):
+    """force_run must receive the skill BODY only, never the reassembled
+    file with its YAML frontmatter.
+
+    Rationale recorded with this guard: the closed-loop cache memoises on
+    the artifact text and preflight populated it using ``skill["body"]``.
+    Handing ``evolved_full`` to the post-GEPA eval would miss that key and
+    quietly double-spend roughly $1-3 per run in the validator.
+
+    The failure would be silent, which is why this guard matters most in
+    the file: a decision is still produced and nothing errors, but the
+    cost ledger doubles and the CL "cache hit" telemetry stops making sense.
+    """
+    monkeypatch.chdir(tmp_path)
+    cl_cache = MagicMock()
+    cl_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=[True] * 5 + [False] * 2,
+        evolved_pass=[True] * 7,
+    )
+
+    with _patch_stack(
+        sat_report=_weak_signal_report(),
+        fake_cache=cl_cache,
+        evolved_body=_LOW_GROWTH_BODY,
+    ):
+        _run_evolve(skill_dir=skill_dir)
+
+    # The point of the test: exactly one force_run call, whose argument is
+    # the bare body string with no frontmatter markers in it.
+    cl_cache.force_run.assert_called_once_with(_LOW_GROWTH_BODY)
+    call_arg = cl_cache.force_run.call_args.args[0]
+    assert "---" not in call_arg, (
+        f"force_run received the full frontmatter+body file (cache-key "
+        f"mismatch bug): {call_arg!r}"
+    )
+    assert "name:" not in call_arg, (
+        f"force_run received YAML frontmatter, not body alone: {call_arg!r}"
+    )
+
+
+def test_evolved_failed_md_written_not_json(
+    skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
+):
+    """Skill-side abort paths must leave ``evolved_FAILED.md``, not ``.json``.
+
+    Tool-side runs write ``evolved_FAILED.json`` since tool manifests are
+    JSON, but skills are markdown and the post-run diff tooling compares
+    ``evolved_FAILED.md`` with ``baseline_skill.md``; a silent switch to
+    ``.json`` would break that workflow.
+    """
+    monkeypatch.chdir(tmp_path)
+    cl_cache = MagicMock()
+    cl_cache.force_run.side_effect = RuntimeError("validator crashed")
+
+    with _patch_stack(sat_report=_weak_signal_report(), fake_cache=cl_cache):
+        _run_evolve(skill_dir=skill_dir)
+
+    run_dir = _latest_run_dir(tmp_path)
+    failed_md = run_dir / "evolved_FAILED.md"
+    assert failed_md.exists(), (
+        f"evolved_FAILED.md missing — got {list(run_dir.iterdir())}"
+    )
+    assert not (run_dir / "evolved_FAILED.json").exists(), (
+        "evolved_FAILED.json must NOT exist on skill-side aborts; "
+        "skill convention is .md (matches baseline_skill.md). If someone "
+        "intentionally added .json support, update this test deliberately."
+    )
+
+    # The .md has to be the whole reassembled file (frontmatter + body),
+    # not just the body, so it diffs directly against baseline_skill.md.
+    failed_text = failed_md.read_text()
+    assert failed_text.startswith("---"), (
+        f"evolved_FAILED.md should include frontmatter for diff parity, "
+        f"got {failed_text[:80]!r}"
+    )
+    assert "name: demo-skill" in failed_text
+
+
+def test_skill_v4_payload_fields_preserved_in_v5_cl_primary(
+    skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
+):
+    """Schema regression guard for the v5 CL-primary payload: it must still
+    carry ``bap_max_growth``, ``bap_safety_margin``, ``fitness_profile``,
+    ``proposer_mode``, ``run_inputs.eval_source`` and ``knee_point.band_roster``.
+
+    Downstream calibration scripts read these v4 skill-specific fields, so
+    later schema bumps have to keep populating them.
+    """
+    monkeypatch.chdir(tmp_path)
+    cl_cache = MagicMock()
+    cl_cache.force_run.return_value = _fake_validation_report(
+        baseline_pass=[True] * 5 + [False] * 2,
+        evolved_pass=[True] * 7,
+    )
+
+    with _patch_stack(
+        sat_report=_weak_signal_report(),
+        fake_cache=cl_cache,
+        evolved_body=_LOW_GROWTH_BODY,
+    ):
+        _run_evolve(skill_dir=skill_dir)
+
+    decision = _latest_gate_decision(tmp_path)
+    assert decision["schema_version"] == "5"
+    assert decision["decision_signal"] == "closed_loop"
+
+    # Top-level v4 skill fields: each must exist and be non-null.
+    for v4_key in (
+        "bap_max_growth",
+        "bap_safety_margin",
+        "fitness_profile",
+        "proposer_mode",
+    ):
+        assert v4_key in decision, (
+            f"v4 skill field {v4_key!r} missing in v5 CL-primary payload"
+        )
+        assert decision[v4_key] is not None, (
+            f"v4 skill field {v4_key!r} present but null in v5 CL-primary payload"
+        )
+
+    # eval_source sits inside run_inputs rather than at the top level.
+    assert decision["run_inputs"]["eval_source"] == "synthetic"
+
+    # knee_point.band_roster has to serialise as a list; the calibration
+    # script is said to read it via .get('band_roster', []).
+    knee = decision["knee_point"]
+    assert isinstance(knee, dict)
+    assert "band_roster" in knee, (
+        f"knee_point.band_roster missing in v5 CL-primary payload; "
+        f"knee_point keys: {list(knee.keys())}"
+    )
+    assert isinstance(knee["band_roster"], list)
diff --git a/tests/skills/test_evolve_skill_validation_flow.py b/tests/skills/test_evolve_skill_validation_flow.py
index b97c4f2a..656c9705 100644
--- a/tests/skills/test_evolve_skill_validation_flow.py
+++ b/tests/skills/test_evolve_skill_validation_flow.py
@@ -131,7 +131,7 @@ def test_static_failure_reason_in_decision(self, tmp_path: Path):
         # Manual reproduction of the static-failure branch's payload —
         # locks the schema so a future refactor can't silently drop fields.
         payload = {
-            "schema_version": "4",
+            "schema_version": "5",
             "decision": "reject",
             "reason": "static_constraint_failure",
             "failed_constraints": ["non_empty"],
@@ -140,7 +140,7 @@ def test_static_failure_reason_in_decision(self, tmp_path: Path):
         }
         path = _write_gate_decision(tmp_path, payload)
         loaded = json.loads(path.read_text())
-        assert loaded["schema_version"] == "4"
+        assert loaded["schema_version"] == "5"
         assert loaded["reason"] == "static_constraint_failure"
         assert "non_empty" in loaded["failed_constraints"]
         assert "knee_point" in loaded
@@ -154,7 +154,7 @@ class TestGrowthGateDecisionSchema:
 
     def test_required_fields_present(self, tmp_path: Path):
         payload = {
-            "schema_version": "4",
+            "schema_version": "5",
             "decision": "reject",
             "reason": "growth_quality_gate",
             "decision_rule_used": "dual_check",
@@ -241,7 +241,7 @@ def test_required_fields_present(self, tmp_path: Path):
             "bootstrap", "knee_point", "dataset",
         ):
             assert required in loaded, f"missing {required}"
-        assert loaded["schema_version"] == "4"
+        assert loaded["schema_version"] == "5"
         for required_in_bootstrap in (
             "mean", "lower_bound", "upper_bound", "n_examples",
             "n_resamples", "confidence",
@@ -274,7 +274,7 @@ class TestRunInputsBlock:
 
     def test_run_inputs_present_in_decision(self, tmp_path: Path):
         payload = {
-            "schema_version": "4",
+            "schema_version": "5",
             "decision": "deploy",
             "run_inputs": {
                 "seed": 42,