From deb952b5784f9e7b1c0464443c99b366e8ef1d4d Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 08:06:19 -0600 Subject: [PATCH 1/9] feat(quality_gate): add _check_cl_primary_gate helper Pure function returning a ConstraintResult for the closed-loop-primary deploy decision. Used when saturation pre-flight reports weak_signal band. Required gain scales with description growth, mirroring the synthetic gate's free_threshold + slope shape; synthetic regression tolerance of 0.05 protects against catastrophic judge collapse. 11 unit tests cover the decision-rule math including the PR #68 calibration point (+2 gain on +121% growth -> required 2, just passes) and wallpaper protection (+1 gain on +400% growth -> required 4, fails). --- evolution/core/quality_gate.py | 74 ++++++++++++ tests/core/test_check_cl_primary_gate.py | 142 +++++++++++++++++++++++ 2 files changed, 216 insertions(+) create mode 100644 tests/core/test_check_cl_primary_gate.py diff --git a/evolution/core/quality_gate.py b/evolution/core/quality_gate.py index 8ddd1403..bf137cb3 100644 --- a/evolution/core/quality_gate.py +++ b/evolution/core/quality_gate.py @@ -7,6 +7,7 @@ """ import json +import math import os import subprocess import time @@ -15,6 +16,7 @@ from rich.console import Console +from evolution.core.constraints import ConstraintResult from evolution.core.lm_timing_callback import COST_LEDGER, CostCeilingExceeded from evolution.skills.budget_aware_proposer import ProposerMode @@ -23,6 +25,78 @@ _BENCHMARK_OUTPUT_TAIL_BYTES = 4096 +# CL-primary deploy-gate formula constants. Mirrors the synthetic +# growth_quality_gate's free-threshold-then-slope shape (constraints.py +# _check_growth_with_quality_gate) but adapted to integer CL task gains. +# +# free_threshold matches EvolutionConfig.growth_free_threshold so both +# gates agree on the "free growth" boundary. slope=1.0 means "one extra +# task required per +100% growth above the free threshold." 
+CL_PRIMARY_GROWTH_FREE_THRESHOLD = 0.20 +CL_PRIMARY_GROWTH_SLOPE = 1.0 +CL_PRIMARY_SYNTH_TOLERANCE = 0.05 + + +def _check_cl_primary_gate( + *, + baseline_cl_passes: int, + evolved_cl_passes: int, + baseline_synth_mean: float, + evolved_synth_mean: float, + growth_pct: float, + synth_tolerance: float = CL_PRIMARY_SYNTH_TOLERANCE, +) -> ConstraintResult: + """Deploy-gate decision rule used when the saturation pre-flight + classifies the run as ``weak_signal`` (synthetic judge saturated, + closed-loop signal has a gradient). + + ACCEPT iff (gain >= required_gain) AND (synthetic not catastrophically + collapsed). ``required_gain`` scales with description growth so a + +1 task win can't deploy +400% wallpaper. + + Parameters are scalars (not SaturationReport) so this helper is + independent of the preflight subsystem and trivially unit-testable. + Returns the standard ``ConstraintResult`` so the deploy gate's + existing aggregation code works without changes. + """ + cl_gain = evolved_cl_passes - baseline_cl_passes + required_gain = max( + 1, + math.ceil( + max(0.0, CL_PRIMARY_GROWTH_SLOPE * (growth_pct - CL_PRIMARY_GROWTH_FREE_THRESHOLD)) + ), + ) + synth_delta = evolved_synth_mean - baseline_synth_mean + synth_passed = synth_delta >= -synth_tolerance + + if cl_gain < required_gain: + return ConstraintResult( + passed=False, + constraint_name="cl_primary_gate", + message=( + f"CL gained {cl_gain:+d} tasks but required {required_gain} " + f"for {growth_pct:+.2%} growth" + ), + ) + if not synth_passed: + return ConstraintResult( + passed=False, + constraint_name="cl_primary_gate", + message=( + f"CL gained {cl_gain:+d} tasks but synthetic regressed " + f"{synth_delta:+.3f} > tolerance {synth_tolerance:.3f}" + ), + ) + return ConstraintResult( + passed=True, + constraint_name="cl_primary_gate", + message=( + f"CL gained +{cl_gain} tasks (required {required_gain}); " + f"synth Δ {synth_delta:+.3f} within ±{synth_tolerance:.3f}" + ), + ) + + # `default` is calibrated 
against the obsidian deploy (+24.2% growth, # ~+0.07 expected improvement). `off` disables the slope/ceiling checks # but still enforces bootstrap.mean ≥ 0 — see deprecation warning when diff --git a/tests/core/test_check_cl_primary_gate.py b/tests/core/test_check_cl_primary_gate.py new file mode 100644 index 00000000..26c66a9d --- /dev/null +++ b/tests/core/test_check_cl_primary_gate.py @@ -0,0 +1,142 @@ +"""Unit tests for the CL-primary gate helper. + +The helper combines two signals (CL pass counts, synthetic mean) and a +growth signal into a single accept/reject ConstraintResult. Tests pin +the decision-rule math; integration with evolve_tool lives in +tests/tools/test_evolve_tool_cl_aware_gate.py. +""" + +from __future__ import annotations + +import pytest + +from evolution.core.constraints import ConstraintResult +from evolution.core.quality_gate import ( + CL_PRIMARY_GROWTH_FREE_THRESHOLD, + CL_PRIMARY_GROWTH_SLOPE, + _check_cl_primary_gate, +) + + +class TestCheckClPrimaryGate: + def test_accepts_when_required_gain_met_at_free_threshold(self): + # +1 gain, +20% growth (exactly at free threshold) → required=1 + result = _check_cl_primary_gate( + baseline_cl_passes=5, + evolved_cl_passes=6, + baseline_synth_mean=0.97, + evolved_synth_mean=0.97, + growth_pct=0.20, + ) + assert result.passed is True + assert result.constraint_name == "cl_primary_gate" + + def test_accepts_at_pr_68_calibration_point(self): + # PR #68: +2 gain on +121% growth → required=ceil(1.0*(1.21-0.20))=2. + # This is the exact case that motivated this work. + result = _check_cl_primary_gate( + baseline_cl_passes=5, + evolved_cl_passes=7, + baseline_synth_mean=1.000, + evolved_synth_mean=1.000, + growth_pct=1.21, + ) + assert result.passed is True + + def test_rejects_when_growth_aware_threshold_unsatisfied(self): + # +1 gain on +400% growth → required=4, fail. 
+ result = _check_cl_primary_gate( + baseline_cl_passes=5, + evolved_cl_passes=6, + baseline_synth_mean=0.97, + evolved_synth_mean=0.97, + growth_pct=4.00, + ) + assert result.passed is False + assert "required" in result.message.lower() + + def test_rejects_when_no_task_gained(self): + result = _check_cl_primary_gate( + baseline_cl_passes=5, + evolved_cl_passes=5, + baseline_synth_mean=0.97, + evolved_synth_mean=0.97, + growth_pct=0.20, + ) + assert result.passed is False + + def test_rejects_when_synthetic_regressed_beyond_tolerance(self): + # +1 task gained, but synthetic dropped 0.06 (> 0.05 tolerance) + result = _check_cl_primary_gate( + baseline_cl_passes=5, + evolved_cl_passes=6, + baseline_synth_mean=1.000, + evolved_synth_mean=0.939, + growth_pct=0.20, + ) + assert result.passed is False + assert "synthetic" in result.message.lower() + + def test_accepts_when_synthetic_regressed_within_tolerance(self): + # +1 task gained, synthetic dropped 0.04 (< 0.05 tolerance) + result = _check_cl_primary_gate( + baseline_cl_passes=5, + evolved_cl_passes=6, + baseline_synth_mean=1.000, + evolved_synth_mean=0.961, + growth_pct=0.20, + ) + assert result.passed is True + + def test_rejects_when_evolved_cl_regressed(self): + # Negative gain → reject even with no growth + result = _check_cl_primary_gate( + baseline_cl_passes=5, + evolved_cl_passes=4, + baseline_synth_mean=0.97, + evolved_synth_mean=0.97, + growth_pct=0.0, + ) + assert result.passed is False + + def test_required_gain_floor_is_one_even_at_zero_growth(self): + # Even with 0 growth, must gain ≥1 task — no free deploys for null changes + result = _check_cl_primary_gate( + baseline_cl_passes=5, + evolved_cl_passes=5, + baseline_synth_mean=0.97, + evolved_synth_mean=0.97, + growth_pct=0.0, + ) + assert result.passed is False + + def test_growth_within_free_threshold_requires_only_one_task(self): + # +1 gain, +15% growth (below 20% free threshold) + result = _check_cl_primary_gate( + baseline_cl_passes=5, + 
evolved_cl_passes=6, + baseline_synth_mean=0.97, + evolved_synth_mean=0.97, + growth_pct=0.15, + ) + assert result.passed is True + + def test_message_records_required_and_actual_gain(self): + # Message must surface the numbers for gate_decision.json + console + result = _check_cl_primary_gate( + baseline_cl_passes=5, + evolved_cl_passes=6, + baseline_synth_mean=0.97, + evolved_synth_mean=0.97, + growth_pct=0.20, + ) + assert "1" in result.message # required_gain == 1 + assert "+1" in result.message or "gained 1" in result.message.lower() + + def test_constants_match_evolution_config_defaults(self): + # The CL gate's free-threshold default must match EvolutionConfig's + # synthetic-gate default so they agree on what "free growth" means. + from evolution.core.config import EvolutionConfig + cfg = EvolutionConfig() + assert CL_PRIMARY_GROWTH_FREE_THRESHOLD == cfg.growth_free_threshold + assert CL_PRIMARY_GROWTH_SLOPE == 1.0 From 365bd364dbc5ea77228b637f45b323904106bbb4 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 08:12:51 -0600 Subject: [PATCH 2/9] refactor(evolve_tool): preserve SaturationReport fields for deploy gate Today only sat_report.holdout_per_example survives past the preflight call site; subsequent CL-aware gate work needs the band classification and baseline CL per-task scores too. Bind four new locals next to the existing cache: band, cl_per_example, holdout_score, cl_score. All default to None on the --no-saturation-check path so the deploy gate can branch safely. No behavior change; existing tests pass unchanged. 
--- evolution/tools/evolve_tool.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index 9dfec416..d226e109 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -669,7 +669,11 @@ def evolve( if closed_loop_in_valset: valset = valset + behavioral_examples - cached_baseline_holdout_per_example = None + cached_baseline_holdout_per_example: Optional[list[float]] = None + preflight_band: Optional[str] = None + cached_baseline_cl_per_example: Optional[list[float]] = None + preflight_holdout_score: Optional[float] = None + preflight_cl_score: Optional[float] = None if not skip_saturation_check: holdout_examples_for_preflight = _build_examples( dataset.holdout, for_module=True @@ -703,6 +707,14 @@ def evolve( else: render_saturation_panel(sat_report, console=console) cached_baseline_holdout_per_example = sat_report.holdout_per_example + # Preserve preflight outputs for the deploy gate's CL-primary + # path. None when --no-saturation-check was passed (sat_report + # itself doesn't exist in that case; handled by initialization + # to None above the preflight call). 
+ preflight_band: Optional[str] = sat_report.band + cached_baseline_cl_per_example: Optional[list[float]] = sat_report.closed_loop_per_example + preflight_holdout_score: Optional[float] = sat_report.holdout_score + preflight_cl_score: Optional[float] = sat_report.closed_loop_score console.print(f"\n[bold cyan]Running GEPA optimization (max_full_evals={iterations})[/bold cyan]\n") start_time = time.time() From ae1fea688b6b83095cd5166189edadc03460091f Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 08:22:28 -0600 Subject: [PATCH 3/9] feat(evolve_tool): branch deploy gate on saturation band MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When preflight reports weak_signal AND closed-loop is configured, run a one-shot force_run on the evolved description and gate the deploy decision on closed-loop signal via _check_cl_primary_gate. Three abort paths are written to gate_decision.json with diagnostic payloads (schema v5): - cl_eval_failed: force_run raised an exception - cl_eval_incomplete: one or more evolved CL tasks abstained (runner errored — distinguished from genuine task failure via the existing TaskResult.abstained field) - cl_primary_gate reject: returned by the gate helper itself The absolute_char_ceiling check (validator._check_absolute_chars) is preserved in the CL-primary path — wallpaper protection is orthogonal to which signal we gate on. All other bands (healthy / no_headroom / uniform_failure / no preflight) fall through to the existing synthetic path unchanged. 
--- evolution/tools/evolve_tool.py | 156 ++++++++++++++++++++++++++++++--- 1 file changed, 144 insertions(+), 12 deletions(-) diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index d226e109..e4eb87a0 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -41,6 +41,7 @@ resolved_lms_dump, ) from evolution.core.constraints import ( + ConstraintResult, ConstraintValidator, effective_absolute_char_ceiling, resolve_decision_rule, @@ -59,6 +60,7 @@ ) from evolution.core.quality_gate import ( QUALITY_GATE_PRESETS, + _check_cl_primary_gate, resolve_proposer_mode, run_benchmark_hook, write_cost_ceiling_abort, @@ -711,10 +713,10 @@ def evolve( # path. None when --no-saturation-check was passed (sat_report # itself doesn't exist in that case; handled by initialization # to None above the preflight call). - preflight_band: Optional[str] = sat_report.band - cached_baseline_cl_per_example: Optional[list[float]] = sat_report.closed_loop_per_example - preflight_holdout_score: Optional[float] = sat_report.holdout_score - preflight_cl_score: Optional[float] = sat_report.closed_loop_score + preflight_band = sat_report.band + cached_baseline_cl_per_example = sat_report.closed_loop_per_example + preflight_holdout_score = sat_report.holdout_score + preflight_cl_score = sat_report.closed_loop_score console.print(f"\n[bold cyan]Running GEPA optimization (max_full_evals={iterations})[/bold cyan]\n") start_time = time.time() @@ -844,6 +846,125 @@ def evolve( ) improvement = avg_evolved - avg_baseline + # Decide which deploy-gate path applies. CL-primary fires when + # the preflight saw weak_signal AND CL data is present. All + # other cases (no preflight, healthy/no_headroom/uniform_failure + # bands, missing CL data) use the synthetic-only path. 
+ baseline_chars = len(baseline_description) + evolved_chars = len(evolved_description) + growth_pct = (evolved_chars - baseline_chars) / max(1, baseline_chars) + + use_cl_primary = ( + preflight_band == "weak_signal" + and cached_baseline_cl_per_example is not None + and len(cached_baseline_cl_per_example) > 0 + and closed_loop_cache is not None + ) + + evolved_cl_report = None + evolved_cl_per_example: Optional[list[float]] = None + evolved_cl_errored_task_ids: list[str] = [] + cl_eval_cost_before: float = 0.0 + cl_eval_cost_usd: Optional[float] = None + cl_constraint: Optional[ConstraintResult] = None + + if use_cl_primary: + console.print( + f"\n[bold]Evaluating evolved description on closed-loop suite[/bold] " + "(weak_signal band → CL-primary gate)" + ) + cl_eval_cost_before = COST_LEDGER.summary().get("total_usd", 0.0) + try: + evolved_cl_report = closed_loop_cache.force_run(evolved_description) + except Exception as exc: # ValidatorError or downstream + cl_eval_cost_usd = COST_LEDGER.summary().get("total_usd", 0.0) - cl_eval_cost_before + console.print( + f"[red]✗ Evolved closed-loop eval failed: {exc}[/red] — writing aborted decision" + ) + failed_path = output_dir / "evolved_FAILED.json" + evolved_manifest = manifest.replace_description(tool_name, evolved_description) + failed_path.write_text(json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n") + write_gate_decision(output_dir, { + "schema_version": "5", + "decision": "aborted", + "reason": "cl_eval_failed", + "decision_signal": "closed_loop", + "cl_eval_exception": str(exc), + "evolved_cl_eval_cost_usd": cl_eval_cost_usd, + "band_trigger_score": { + "holdout": preflight_holdout_score, + "closed_loop": preflight_cl_score, + }, + "validator_agent_model": closed_loop_agent_model, + "baseline_chars": baseline_chars, + "evolved_chars": evolved_chars, + "growth_pct": growth_pct, + "knee_point": _knee_point_payload(knee_pick), + "dataset": _dataset_payload(dataset, 
dropped_tools=manifest.dropped_tools, sessiondb_drops=sessiondb_drops), + "run_inputs": run_inputs, + **tool_payload_fields, + }) + return {"decision": "aborted", "reason": "cl_eval_failed"} + cl_eval_cost_usd = COST_LEDGER.summary().get("total_usd", 0.0) - cl_eval_cost_before + + # Detect abstained tasks (TaskResult.abstained == True means + # the runner errored — see validation/report.py:score_task). + # An infrastructure flake on an evolved task is NOT a quality + # regression; conflating them would falsely reject good + # candidates. Hard-fail with a written diagnostic instead. + evolved_cl_errored_task_ids = [ + t.task_id for t in evolved_cl_report.evolved.tasks if t.abstained + ] + evolved_cl_per_example = [ + 1.0 if t.passed else 0.0 for t in evolved_cl_report.evolved.tasks + ] + if evolved_cl_errored_task_ids: + console.print( + f"[red]✗ {len(evolved_cl_errored_task_ids)} evolved CL task(s) errored " + f"({', '.join(evolved_cl_errored_task_ids)}) — writing aborted decision[/red]" + ) + failed_path = output_dir / "evolved_FAILED.json" + evolved_manifest = manifest.replace_description(tool_name, evolved_description) + failed_path.write_text(json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n") + write_gate_decision(output_dir, { + "schema_version": "5", + "decision": "aborted", + "reason": "cl_eval_incomplete", + "decision_signal": "closed_loop", + "evolved_closed_loop_errored_tasks": evolved_cl_errored_task_ids, + "evolved_closed_loop_per_example": evolved_cl_per_example, + "baseline_closed_loop_per_example": cached_baseline_cl_per_example, + "evolved_cl_eval_cost_usd": cl_eval_cost_usd, + "band_trigger_score": { + "holdout": preflight_holdout_score, + "closed_loop": preflight_cl_score, + }, + "validator_agent_model": closed_loop_agent_model, + "baseline_chars": baseline_chars, + "evolved_chars": evolved_chars, + "growth_pct": growth_pct, + "knee_point": _knee_point_payload(knee_pick), + "dataset": _dataset_payload(dataset, 
dropped_tools=manifest.dropped_tools, sessiondb_drops=sessiondb_drops), + "run_inputs": run_inputs, + **tool_payload_fields, + }) + return {"decision": "aborted", "reason": "cl_eval_incomplete"} + + baseline_cl_passes = int(sum(cached_baseline_cl_per_example)) + evolved_cl_passes = int(sum(evolved_cl_per_example)) + cl_constraint = _check_cl_primary_gate( + baseline_cl_passes=baseline_cl_passes, + evolved_cl_passes=evolved_cl_passes, + baseline_synth_mean=avg_baseline, + evolved_synth_mean=avg_evolved, + growth_pct=growth_pct, + ) + icon = "✓" if cl_constraint.passed else "✗" + color = "green" if cl_constraint.passed else "red" + console.print( + f" [{color}]{icon} cl_primary_gate[/{color}]: {cl_constraint.message}" + ) + console.print(f"\n[bold]Validating growth against holdout improvement[/bold]") bootstrap = paired_bootstrap( baseline_per_example, @@ -852,11 +973,22 @@ def evolve( n_resamples=config.bootstrap_n_resamples, seed=config.seed, ) - # Growth + ceiling check on the description, not the rendered manifest — - # the gate's curve has to apply to the artifact the user actually evolves. - growth_constraints = validator.validate_growth_with_quality( - evolved_description, baseline_description, bootstrap, - ) + if use_cl_primary: + # CL-primary path: skip the synthetic growth_quality_gate + # (it would always reject when synth is saturated and growth > 0). + # But still enforce the absolute_char_ceiling — that's an + # orthogonal wallpaper-protection backstop that must hold + # regardless of which signal we're gating on. + ceiling_constraint = validator._check_absolute_chars( + evolved_description, baseline_chars, + ) + growth_constraints = [cl_constraint, ceiling_constraint] + else: + # Synthetic-only path (unchanged): growth_quality_gate runs both + # the growth curve and the absolute-char ceiling internally. 
+ growth_constraints = validator.validate_growth_with_quality( + evolved_description, baseline_description, bootstrap, + ) growth_pass = True for c in growth_constraints: icon = "✓" if c.passed else "✗" @@ -892,9 +1024,9 @@ def evolve( evolved_manifest_path.unlink(missing_ok=True) baseline_manifest_path.unlink(missing_ok=True) - baseline_chars = len(baseline_description) - evolved_chars = len(evolved_description) - growth_pct = (evolved_chars - baseline_chars) / max(1, baseline_chars) + # baseline_chars / evolved_chars / growth_pct are bound earlier + # (before the use_cl_primary branch) so the CL-primary path can + # use them in its abort payloads. Don't recompute here. required_improvement = max( 0.0, config.growth_quality_slope * (growth_pct - config.growth_free_threshold), From 779002a7f7ab87396b9d35a2a131cfd480b1dc40 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 08:30:16 -0600 Subject: [PATCH 4/9] fix(evolve_tool): narrow cl_constraint type, surface saved-variant path Code review found two minor issues in the CL-primary branch added by ae1fea68: 1. cl_constraint: Optional[ConstraintResult] flows into a list[ConstraintResult] without type narrowing at the post-branch growth_constraints assignment. Added an assert so the type checker sees the correlation between the two 'if use_cl_primary:' blocks. 2. Both new abort paths wrote evolved_FAILED.json but skipped the 'Saved failed variant to {path}' console line that existing abort paths print. Operators triaging a flake need to know the file was saved and where; added the print to both new paths. No behavior change for any test. 
--- evolution/tools/evolve_tool.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index e4eb87a0..513ea4dd 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -884,6 +884,7 @@ def evolve( failed_path = output_dir / "evolved_FAILED.json" evolved_manifest = manifest.replace_description(tool_name, evolved_description) failed_path.write_text(json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n") + console.print(f" Saved failed variant to {failed_path}") write_gate_decision(output_dir, { "schema_version": "5", "decision": "aborted", @@ -926,6 +927,7 @@ def evolve( failed_path = output_dir / "evolved_FAILED.json" evolved_manifest = manifest.replace_description(tool_name, evolved_description) failed_path.write_text(json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n") + console.print(f" Saved failed variant to {failed_path}") write_gate_decision(output_dir, { "schema_version": "5", "decision": "aborted", @@ -979,6 +981,10 @@ def evolve( # But still enforce the absolute_char_ceiling — that's an # orthogonal wallpaper-protection backstop that must hold # regardless of which signal we're gating on. + # cl_constraint was bound in the earlier `if use_cl_primary:` block; + # the assert narrows Optional[ConstraintResult] so growth_constraints + # types as list[ConstraintResult], not list[Optional[ConstraintResult]]. + assert cl_constraint is not None ceiling_constraint = validator._check_absolute_chars( evolved_description, baseline_chars, ) From 90014c5aeb79ab33a8a15065a1d8208b2faace22 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 08:34:07 -0600 Subject: [PATCH 5/9] feat(evolve_tool): gate_decision.json schema v5 with CL-primary fields Schema bumps from v4 to v5 across all four gate_decision write sites (static-fail, cl_eval_failed, cl_eval_incomplete, success/reject). The bump is additive. 
decision_signal is always present ('closed_loop' or 'synthetic'). The remaining new fields are present only when use_cl_primary == True: baseline_closed_loop_per_example, evolved_closed_loop_per_example, evolved_closed_loop_errored_tasks, cl_tasks_gained, cl_required_gain, synthetic_sanity_check, evolved_cl_eval_cost_usd, band_trigger_score, validator_agent_model. When preflight was skipped (--no-saturation-check), records reason_synthetic: 'preflight_skipped' so downstream consumers can distinguish 'preflight saw no weak_signal' from 'preflight didn't run.' cl_required_gain and synthetic_sanity_check reuse the CL_PRIMARY_GROWTH_SLOPE / CL_PRIMARY_GROWTH_FREE_THRESHOLD / CL_PRIMARY_SYNTH_TOLERANCE constants from quality_gate.py so the gate-decision payload can't drift from the actual gate logic. Existing v4 consumers see unchanged output for synthetic-mode runs apart from schema_version: '5', the new decision_signal: 'synthetic' string, and reason_synthetic: 'preflight_skipped' when preflight was skipped. --- evolution/tools/evolve_tool.py | 43 ++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index 513ea4dd..97522d3c 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -10,6 +10,7 @@ import difflib import json import logging +import math import sys import time from datetime import datetime @@ -59,6 +60,9 @@ register_litellm_failure_callback, ) from evolution.core.quality_gate import ( + CL_PRIMARY_GROWTH_FREE_THRESHOLD, + CL_PRIMARY_GROWTH_SLOPE, + CL_PRIMARY_SYNTH_TOLERANCE, QUALITY_GATE_PRESETS, _check_cl_primary_gate, resolve_proposer_mode, @@ -817,7 +821,7 @@ def evolve( evolved_manifest = manifest.replace_description(tool_name, evolved_description) failed_path.write_text(json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n") write_gate_decision(output_dir, { - "schema_version": "4", + "schema_version": "5", "decision": "reject", "reason": "static_constraint_failure", "failed_constraints": [c.constraint_name for c in static_constraints if not c.passed], @@ 
-1045,9 +1049,10 @@ def evolve( else: decision_reason = "growth_quality_gate" decision_payload = { - "schema_version": "4", + "schema_version": "5", "decision": "deploy" if growth_pass else "reject", "reason": decision_reason, + "decision_signal": "closed_loop" if use_cl_primary else "synthetic", "decision_rule_used": decision_rule_used, "gate_mode": config.gate_mode, "inferiority_tolerance": config.inferiority_tolerance, @@ -1078,6 +1083,40 @@ def evolve( } if benchmark_block is not None: decision_payload["benchmark"] = benchmark_block + if use_cl_primary: + decision_payload["baseline_closed_loop_per_example"] = cached_baseline_cl_per_example + decision_payload["evolved_closed_loop_per_example"] = evolved_cl_per_example + # Populated only on the abort path (cl_eval_incomplete); empty + # here because we reach this block only when no task errored. + decision_payload["evolved_closed_loop_errored_tasks"] = [] + decision_payload["cl_tasks_gained"] = ( + int(sum(evolved_cl_per_example)) - int(sum(cached_baseline_cl_per_example)) + ) + decision_payload["cl_required_gain"] = max( + 1, + math.ceil( + max(0.0, CL_PRIMARY_GROWTH_SLOPE * (growth_pct - CL_PRIMARY_GROWTH_FREE_THRESHOLD)) + ), + ) + decision_payload["synthetic_sanity_check"] = { + "tolerance": CL_PRIMARY_SYNTH_TOLERANCE, + "baseline_mean": avg_baseline, + "evolved_mean": avg_evolved, + "passed": (avg_evolved - avg_baseline) >= -CL_PRIMARY_SYNTH_TOLERANCE, + } + decision_payload["evolved_cl_eval_cost_usd"] = cl_eval_cost_usd + decision_payload["band_trigger_score"] = { + "holdout": preflight_holdout_score, + "closed_loop": preflight_cl_score, + } + decision_payload["validator_agent_model"] = closed_loop_agent_model + + if not use_cl_primary and preflight_band is None: + # User passed --no-saturation-check; record why CL-primary + # didn't fire even though CL may be configured. Lets downstream + # consumers distinguish 'preflight saw no weak_signal' from + # 'preflight didn't run.' 
+ decision_payload["reason_synthetic"] = "preflight_skipped" gate_path = write_gate_decision(output_dir, decision_payload) console.print(f" [dim]Gate decision logged to {gate_path}[/dim]") From 0b1106c5c4ffa7c4986a86487832ed7af37e77f0 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 08:52:57 -0600 Subject: [PATCH 6/9] test(evolve_tool): integration tests for CL-aware deploy gate 10 tests covering the deploy-gate branch on saturation band: - weak_signal triggers evolved CL eval (force_run called post-GEPA) - healthy/no_headroom fall through to synthetic - --no-saturation-check records reason_synthetic in JSON - all v5 fields present + correct types - force_run failure writes aborted decision with diagnostics - evolved task abstention writes cl_eval_incomplete (not regression) - absolute_char_ceiling still enforced in CL-primary path Mocks the synthetic dataset builder + closed-loop cache at the same seams as test_evolve_tool_saturation_preflight.py; calls evolve() directly (rather than via CliRunner) so each test can inspect gate_decision.json at a pinned output_dir. --- tests/tools/test_evolve_tool_cl_aware_gate.py | 695 ++++++++++++++++++ 1 file changed, 695 insertions(+) create mode 100644 tests/tools/test_evolve_tool_cl_aware_gate.py diff --git a/tests/tools/test_evolve_tool_cl_aware_gate.py b/tests/tools/test_evolve_tool_cl_aware_gate.py new file mode 100644 index 00000000..83b1b078 --- /dev/null +++ b/tests/tools/test_evolve_tool_cl_aware_gate.py @@ -0,0 +1,695 @@ +"""Integration tests for the deploy-gate CL-aware branch. + +Mocks the synthetic dataset builder + closed-loop cache so each test +can pin a saturation band and verify the deploy gate's branch behavior +plus ``gate_decision.json`` shape. No real LM calls. + +Pairs with unit tests at ``tests/core/test_check_cl_primary_gate.py`` +which cover the decision-rule math in isolation. 
These tests run the +full ``evolve()`` orchestrator end-to-end with seams stubbed at the +saturation pre-flight, closed-loop cache, GEPA, knee-point, and holdout +evaluator, so they exercise the branch logic added in this PR rather +than the helper math. +""" + +from __future__ import annotations + +import contextlib +import json +from pathlib import Path +from types import SimpleNamespace +from typing import Optional +from unittest.mock import MagicMock, patch + +import pytest + +from evolution.core.dataset_builder import EvalExample +from evolution.core.saturation_check import SaturationReport +from evolution.skills.knee_point import CandidatePick +from evolution.tools.evolve_tool import evolve +from evolution.validation.report import ( + PhaseResult, + TaskResult, + ValidationReport, + WinLoss, +) + + +FIXTURES = Path(__file__).parent.parent / "fixtures" / "tool_manifests" + + +@pytest.fixture +def temp_manifest(tmp_path: Path) -> Path: + """Copy multiple_tools.json to a tmp location.""" + src = FIXTURES / "multiple_tools.json" + dst = tmp_path / "manifest.json" + dst.write_text(src.read_text()) + return dst + + +def _fake_tool_examples(n: int = 30) -> list[EvalExample]: + """Build n fake EvalExamples without calling an LM.""" + return [ + EvalExample(task_input=f"task {i}", expected_behavior=f"rubric {i}") + for i in range(n) + ] + + +def _fake_validation_report( + *, + baseline_pass: list[bool], + evolved_pass: list[bool], + evolved_abstain: Optional[list[bool]] = None, +) -> ValidationReport: + """Build a ValidationReport with the given per-task verdicts. + + Mirrors what ClosedLoopFeedbackCache.force_run returns; ``evolved`` + is the only phase the deploy-gate branch actually reads (it pulls + baseline pass-counts from the cached preflight data). 
+ """ + n = len(baseline_pass) + evolved_abstain = evolved_abstain or [False] * n + assert len(evolved_pass) == n + assert len(evolved_abstain) == n + + baseline_tasks = [ + TaskResult( + task_id=f"task_{i}", + passed=p, + abstained=False, + tool_calls_seq=[], + duration_seconds=0.1, + ) + for i, p in enumerate(baseline_pass) + ] + evolved_tasks = [ + TaskResult( + task_id=f"task_{i}", + passed=p, + abstained=a, + tool_calls_seq=[], + duration_seconds=0.1, + error="runner timeout" if a else None, + ) + for i, (p, a) in enumerate(zip(evolved_pass, evolved_abstain)) + ] + + def _phase(tasks: list[TaskResult]) -> PhaseResult: + n_p = sum(1 for t in tasks if t.passed and not t.abstained) + n_f = sum(1 for t in tasks if not t.passed and not t.abstained) + n_a = sum(1 for t in tasks if t.abstained) + scored = n_p + n_f + return PhaseResult( + pass_rate=(n_p / scored) if scored else 0.0, + n_passed=n_p, + n_failed=n_f, + n_abstained=n_a, + tasks=tasks, + ) + + return ValidationReport( + schema_version="1", + tool="search_files", + task_suite_path="fake_suite.jsonl", + task_suite_sha256="0" * 64, + baseline=_phase(baseline_tasks), + evolved=_phase(evolved_tasks), + delta=WinLoss( + n_wins=0, n_losses=0, n_ties=n, pass_rate_change=0.0, + ), + decision="pass", + decision_reasons=[], + ) + + +def _make_knee_pick(evolved_description: str) -> CandidatePick: + """Build a CandidatePick that select_knee_point would return.""" + fake_module = MagicMock() + return CandidatePick( + module=fake_module, + skill_text=evolved_description, + body_chars=len(evolved_description), + val_score=0.8, + val_rank_in_band=1, + band_size=1, + epsilon=0.1, + fallback="knee", + picked_idx=0, + gepa_default_idx=0, + gepa_default_body_chars=len(evolved_description), + band_roster=[], + ) + + +def _make_fake_gepa(evolved_description: str): + """Build a fake dspy.GEPA whose ``compile()`` returns a module with + the detailed_results shape the knee-point path expects.""" + + class _FakeGEPA: + def 
__init__(self, **kwargs): + self.kwargs = kwargs + + def compile(self, baseline_module, *, trainset, valset): + fake_module = MagicMock() + fake_module.detailed_results = SimpleNamespace( + candidates=[fake_module], + val_aggregate_scores=[1.0], + best_idx=0, + ) + fake_module.description_text = evolved_description + return fake_module + + return _FakeGEPA + + +# Baseline description for search_files in multiple_tools.json is +# "Find things." (12 chars). With static_ceiling=5000 (default preset), +# effective_absolute_char_ceiling = max(5000, 1.5*12) = 5000 — so a +# plausible-length evolved description passes by default. +_EVOLVED_DESCRIPTION = ( + "Find files in the repository by name or glob pattern. " + "Returns matching file paths." +) + +# CL-primary path tests that want the gate to ACCEPT need growth_pct +# below CL_PRIMARY_GROWTH_FREE_THRESHOLD (0.20) so required_gain stays +# at 1 task and a +2 win clears it. 12-char baseline × 1.20 = 14.4, so +# evolved must be ≤ 14 chars. "Locate files." is 13 chars (8.3% growth) +# which lands required_gain=1. +_LOW_GROWTH_EVOLVED = "Locate files." + + +@contextlib.contextmanager +def _patch_stack( + *, + sat_report: SaturationReport, + fake_cache: Optional[MagicMock], + holdout_baseline_mean: float = 0.95, + holdout_evolved_mean: float = 0.96, + holdout_n: int = 10, + evolved_description: str = _EVOLVED_DESCRIPTION, +): + """Single context manager wrapping every seam patch each test needs. + + Tests stay focused on the band/cache/assertion they're verifying. + """ + fake_builder = MagicMock() + fake_builder.generate_tool_selection.return_value = _fake_tool_examples() + knee_pick = _make_knee_pick(evolved_description) + evolved_per = [holdout_evolved_mean] * holdout_n + + def _maybe_build(**kwargs): + # Honour the real "no suite path → no cache" contract; if a test + # forgets to pass a suite path the use_cl_primary branch can't fire + # (None cache) instead of getting a confusingly-active mock. 
+ if kwargs.get("suite_path") is None: + return None + return fake_cache + + with contextlib.ExitStack() as stack: + stack.enter_context(patch( + "evolution.tools.evolve_tool.SyntheticDatasetBuilder", + return_value=fake_builder, + )) + stack.enter_context(patch( + "evolution.tools.evolve_tool.saturation_preflight", + return_value=sat_report, + )) + stack.enter_context(patch( + "evolution.tools.evolve_tool._preflight_lm_credentials", + )) + stack.enter_context(patch( + "evolution.tools.evolve_tool._maybe_build_closed_loop_cache", + side_effect=_maybe_build, + )) + stack.enter_context(patch( + "evolution.tools.evolve_tool.dspy.GEPA", + new=_make_fake_gepa(evolved_description), + )) + stack.enter_context(patch( + "evolution.tools.evolve_tool.select_knee_point", + return_value=knee_pick, + )) + stack.enter_context(patch( + "evolution.tools.evolve_tool._candidate_description", + return_value=evolved_description, + )) + stack.enter_context(patch( + "evolution.tools.evolve_tool._holdout_evaluate_with_metric", + return_value=(holdout_evolved_mean, evolved_per), + )) + # In headless test envs stdin is non-TTY. For non-healthy bands + # the orchestrator otherwise sys.exit(3)s before the deploy gate. + stack.enter_context(patch( + "evolution.tools.evolve_tool.is_non_interactive", + return_value=False, + )) + stack.enter_context(patch( + "evolution.tools.evolve_tool.interactive_confirm", + return_value=True, + )) + yield + + +def _run_evolve( + *, + manifest_path: Path, + output_dir: Path, + extra_kwargs: Optional[dict] = None, +): + """Invoke evolve() with the minimum kwargs every test in this module + shares. 
Wraps the long, repetitive call so each test stays focused + on the band/cache/assertion that's actually being exercised.""" + kwargs = dict( + tool_name="search_files", + manifest_path=manifest_path, + iterations=1, + eval_dataset_size=30, + holdout_ratio=0.5, + quality_gate="non-inferiority", + closed_loop_suite_path=Path("/fake/suite.jsonl"), + closed_loop_hermes_repo=Path("/fake/hermes"), + # mode="feedback" avoids _load_behavioral_examples_from_suite, + # which would read the suite file on disk. The deploy-gate + # CL-primary branch is mode-agnostic; it pulls verdicts via + # closed_loop_cache.force_run regardless. + closed_loop_mode="feedback", + closed_loop_in_valset=False, + closed_loop_agent_model="openai/gpt-5-mini", + max_total_cost_usd=5.0, + skip_preflight=True, + output_dir=output_dir, + ) + if extra_kwargs: + kwargs.update(extra_kwargs) + return evolve(**kwargs) + + +def _weak_signal_report() -> SaturationReport: + """The one band that triggers the CL-aware deploy gate.""" + return SaturationReport( + band="weak_signal", + holdout_score=0.95, + holdout_n=10, + holdout_per_example=[0.95] * 10, + closed_loop_score=5 / 7, + closed_loop_n=7, + # 5/7 baseline pass-rate — the deploy gate reads this list + # verbatim to compute baseline_cl_passes. + closed_loop_per_example=[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0], + suggestions=[], + thresholds={}, + ) + + +def _healthy_report() -> SaturationReport: + """No CL data needed; the band routes through the synthetic gate.""" + return SaturationReport( + band="healthy", + holdout_score=0.5, + holdout_n=10, + holdout_per_example=[0.5] * 10, + closed_loop_score=None, + closed_loop_n=None, + closed_loop_per_example=None, + suggestions=[], + thresholds={}, + ) + + +def _no_headroom_report(*, with_cl_data: bool) -> SaturationReport: + """no_headroom band with optional CL data. 
CL-primary must NOT fire + on no_headroom regardless of data presence.""" + cl_per = [1.0] * 7 if with_cl_data else None + return SaturationReport( + band="no_headroom", + holdout_score=0.99, + # holdout_n must match the _patch_stack holdout_n (10) so the + # cached baseline list and the post-GEPA evolved list line up + # for paired_bootstrap. + holdout_n=10, + holdout_per_example=[1.0] * 10, + closed_loop_score=1.0 if with_cl_data else None, + closed_loop_n=7 if with_cl_data else None, + closed_loop_per_example=cl_per, + suggestions=["Try a harder suite"], + thresholds={}, + ) + + +# --------------------------------------------------------------------------- +# The 10 tests +# --------------------------------------------------------------------------- + + +def test_weak_signal_band_triggers_evolved_cl_eval( + temp_manifest: Path, tmp_path: Path, +): + """weak_signal + +2 task win → force_run is called post-GEPA, + decision == deploy, decision_signal == closed_loop, cl_tasks_gained == 2.""" + fake_cache = MagicMock() + # Baseline preflight per-example is [1]*5 + [0]*2 = 5/7. + # Evolved 7/7 — a +2 task gain that beats required_gain at small + # growth_pct. + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, True, True, True, True, True], + ) + run_dir = tmp_path / "run" + + # _LOW_GROWTH_EVOLVED keeps required_gain at 1 task so the +2 CL win + # clears the cl_primary_gate. + with _patch_stack( + sat_report=_weak_signal_report(), + fake_cache=fake_cache, + evolved_description=_LOW_GROWTH_EVOLVED, + ): + result = _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + fake_cache.force_run.assert_called() + call_args = fake_cache.force_run.call_args_list + # The CL-primary post-GEPA call passes the evolved description text. 
+ assert any(_LOW_GROWTH_EVOLVED in str(call) for call in call_args), ( + f"Expected force_run to be called with evolved description, got: {call_args}" + ) + + payload = json.loads((run_dir / "gate_decision.json").read_text()) + assert payload["decision"] == "deploy", ( + f"weak_signal + 5→7 should deploy, got {payload['decision']} " + f"(reason: {payload.get('reason')})" + ) + assert payload["decision_signal"] == "closed_loop" + assert payload["cl_tasks_gained"] == 2 + # The deploy result echoes the metrics dict, not the gate decision. + assert isinstance(result, dict) + + +def test_healthy_band_does_not_trigger_cl_aware_gate( + temp_manifest: Path, tmp_path: Path, +): + """healthy band → CL-primary never fires; gate falls through to + synthetic, force_run is NOT called post-GEPA, no CL fields written.""" + fake_cache = MagicMock() + run_dir = tmp_path / "run" + + with _patch_stack(sat_report=_healthy_report(), fake_cache=fake_cache): + _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + fake_cache.force_run.assert_not_called() + payload = json.loads((run_dir / "gate_decision.json").read_text()) + assert payload["decision_signal"] == "synthetic" + for cl_field in ( + "cl_tasks_gained", + "cl_required_gain", + "synthetic_sanity_check", + "baseline_closed_loop_per_example", + "evolved_closed_loop_per_example", + ): + assert cl_field not in payload, ( + f"CL field {cl_field!r} should not be in synthetic-gate payload" + ) + + +def test_no_headroom_without_cl_data_falls_through_to_synthetic_gate( + temp_manifest: Path, tmp_path: Path, +): + """no_headroom + no CL data + --force-saturation-check → synthetic gate + runs without KeyError. 
CL was never measured, so no CL fields.""" + fake_cache = MagicMock() + run_dir = tmp_path / "run" + + with _patch_stack( + sat_report=_no_headroom_report(with_cl_data=False), + fake_cache=fake_cache, + ): + _run_evolve( + manifest_path=temp_manifest, + output_dir=run_dir, + extra_kwargs={"force_saturation_check": True}, + ) + + payload = json.loads((run_dir / "gate_decision.json").read_text()) + assert payload["decision_signal"] == "synthetic" + + +def test_no_headroom_with_cl_data_falls_through_to_synthetic_gate( + temp_manifest: Path, tmp_path: Path, +): + """no_headroom + non-empty CL data → CL-primary STILL must NOT fire. + The spec triggers CL-primary only on weak_signal.""" + fake_cache = MagicMock() + run_dir = tmp_path / "run" + + with _patch_stack( + sat_report=_no_headroom_report(with_cl_data=True), + fake_cache=fake_cache, + ): + _run_evolve( + manifest_path=temp_manifest, + output_dir=run_dir, + extra_kwargs={"force_saturation_check": True}, + ) + + fake_cache.force_run.assert_not_called() + payload = json.loads((run_dir / "gate_decision.json").read_text()) + assert payload["decision_signal"] == "synthetic" + for cl_field in ( + "cl_tasks_gained", + "cl_required_gain", + "synthetic_sanity_check", + ): + assert cl_field not in payload + + +def test_no_saturation_check_falls_through_to_synthetic_with_reason_recorded( + temp_manifest: Path, tmp_path: Path, +): + """--no-saturation-check → no preflight, falls through to synthetic. + decision_signal == synthetic AND reason_synthetic == preflight_skipped + so downstream consumers can distinguish 'preflight saw nothing weak' + from 'preflight didn't run'.""" + fake_cache = MagicMock() + run_dir = tmp_path / "run" + + # sat_report is unused (skip_saturation_check=True bypasses preflight) + # but _patch_stack requires one. 
+ with _patch_stack(sat_report=_healthy_report(), fake_cache=fake_cache): + _run_evolve( + manifest_path=temp_manifest, + output_dir=run_dir, + extra_kwargs={"skip_saturation_check": True}, + ) + + payload = json.loads((run_dir / "gate_decision.json").read_text()) + assert payload["decision_signal"] == "synthetic" + assert payload["reason_synthetic"] == "preflight_skipped" + + +def test_cl_primary_decision_persists_to_gate_decision_json( + temp_manifest: Path, tmp_path: Path, +): + """weak_signal → all v5 CL fields present in gate_decision.json with + correct types. Pins the JSON contract downstream consumers depend on.""" + fake_cache = MagicMock() + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, True, True, True, True, True], + ) + run_dir = tmp_path / "run" + + # _LOW_GROWTH_EVOLVED → required_gain=1 → +2 win clears the gate so + # the deploy path populates every v5 CL field we're pinning here. 
+ with _patch_stack( + sat_report=_weak_signal_report(), + fake_cache=fake_cache, + evolved_description=_LOW_GROWTH_EVOLVED, + ): + _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + payload = json.loads((run_dir / "gate_decision.json").read_text()) + + assert payload["schema_version"] == "5" + assert payload["decision_signal"] == "closed_loop" + + assert isinstance(payload["baseline_closed_loop_per_example"], list) + assert all( + isinstance(x, (int, float)) + for x in payload["baseline_closed_loop_per_example"] + ) + assert isinstance(payload["evolved_closed_loop_per_example"], list) + assert all( + isinstance(x, (int, float)) + for x in payload["evolved_closed_loop_per_example"] + ) + + assert isinstance(payload["cl_tasks_gained"], int) + assert isinstance(payload["cl_required_gain"], int) + + sanity = payload["synthetic_sanity_check"] + assert isinstance(sanity, dict) + for key in ("tolerance", "baseline_mean", "evolved_mean", "passed"): + assert key in sanity, f"synthetic_sanity_check missing {key!r}" + assert isinstance(sanity["tolerance"], (int, float)) + assert isinstance(sanity["baseline_mean"], (int, float)) + assert isinstance(sanity["evolved_mean"], (int, float)) + assert isinstance(sanity["passed"], bool) + + # cost_usd may be None (tests don't exercise the cost ledger), float, + # or int — accept any; we only pin field presence here. + assert "evolved_cl_eval_cost_usd" in payload + cost = payload["evolved_cl_eval_cost_usd"] + assert cost is None or isinstance(cost, (int, float)) + + band_score = payload["band_trigger_score"] + assert isinstance(band_score, dict) + assert "holdout" in band_score + assert "closed_loop" in band_score + + assert isinstance(payload["validator_agent_model"], str) + + +def test_synthetic_only_decision_unchanged_in_gate_decision_json( + temp_manifest: Path, tmp_path: Path, +): + """healthy → synthetic path. 
All v4 fields present, schema_version=5, + decision_signal=synthetic, no CL fields.""" + fake_cache = MagicMock() + run_dir = tmp_path / "run" + + with _patch_stack(sat_report=_healthy_report(), fake_cache=fake_cache): + _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + payload = json.loads((run_dir / "gate_decision.json").read_text()) + + assert payload["schema_version"] == "5" + assert payload["decision_signal"] == "synthetic" + + # v4-and-earlier fields the synthetic path has always written. + for required in ( + "baseline_per_example", + "evolved_per_example", + "bootstrap", + "growth_pct", + "required_improvement", + "baseline_chars", + "evolved_chars", + "absolute_char_ceiling", + "knee_point", + "dataset", + "run_inputs", + ): + assert required in payload, f"missing v4 field {required!r}" + + for cl_field in ( + "cl_tasks_gained", + "cl_required_gain", + "synthetic_sanity_check", + "baseline_closed_loop_per_example", + "evolved_closed_loop_per_example", + "band_trigger_score", + "validator_agent_model", + ): + assert cl_field not in payload, ( + f"CL-only field {cl_field!r} leaked into synthetic-gate payload" + ) + + +def test_force_run_failure_writes_aborted_decision_with_diagnostic_payload( + temp_manifest: Path, tmp_path: Path, +): + """weak_signal + force_run raises → aborted decision, + reason=cl_eval_failed, exception text recorded, evolved_FAILED.json + written for forensic inspection of the rejected candidate.""" + fake_cache = MagicMock() + fake_cache.force_run.side_effect = RuntimeError("validator crashed") + run_dir = tmp_path / "run" + + with _patch_stack(sat_report=_weak_signal_report(), fake_cache=fake_cache): + result = _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + assert result == {"decision": "aborted", "reason": "cl_eval_failed"} + + payload = json.loads((run_dir / "gate_decision.json").read_text()) + assert payload["decision"] == "aborted" + assert payload["reason"] == "cl_eval_failed" + assert "validator 
crashed" in payload["cl_eval_exception"] + + assert (run_dir / "evolved_FAILED.json").exists(), ( + "evolved_FAILED.json must be written so the rejected variant " + "is inspectable" + ) + + +def test_evolved_task_error_writes_cl_eval_incomplete_decision( + temp_manifest: Path, tmp_path: Path, +): + """weak_signal + one evolved task abstained → cl_eval_incomplete + (NOT a regression). An infrastructure flake on the evolved phase + isn't evidence of quality loss; conflating them would silently + reject good candidates.""" + fake_cache = MagicMock() + # task_2 abstains; others pass. Without the incomplete-detection + # branch this would score as 6/7 (+1 vs 5/7 baseline) and deploy. + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, False, True, True, True, True], + evolved_abstain=[False, False, True, False, False, False, False], + ) + run_dir = tmp_path / "run" + + with _patch_stack(sat_report=_weak_signal_report(), fake_cache=fake_cache): + result = _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + assert result == {"decision": "aborted", "reason": "cl_eval_incomplete"} + + payload = json.loads((run_dir / "gate_decision.json").read_text()) + assert payload["decision"] == "aborted" + assert payload["reason"] == "cl_eval_incomplete" + assert payload["evolved_closed_loop_errored_tasks"] == ["task_2"] + + +def test_absolute_char_ceiling_still_enforced_in_cl_primary_path( + temp_manifest: Path, tmp_path: Path, +): + """weak_signal + +2 CL win + evolved description exceeding the + absolute char ceiling → reject. CL-primary mustn't bypass the + wallpaper-protection backstop.""" + fake_cache = MagicMock() + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, True, True, True, True, True], + ) + # Baseline = 12 chars. ceiling = max(50, 1.5*12) = 50. 
+ # Evolved ~480 chars; trips the absolute_char_ceiling backstop. + # Stays under max_tool_desc_size=500 so static checks still pass. + long_evolved = ( + "Find files in the repository by name pattern or glob; " + "returns matching file paths from anywhere under the project root. " + ) * 4 + assert 50 < len(long_evolved) <= 500, ( + f"Test pre-condition: expected 50 < len(long_evolved)={len(long_evolved)} <= 500" + ) + + run_dir = tmp_path / "run" + + with _patch_stack( + sat_report=_weak_signal_report(), + fake_cache=fake_cache, + evolved_description=long_evolved, + ): + result = _run_evolve( + manifest_path=temp_manifest, + output_dir=run_dir, + extra_kwargs={"max_absolute_chars": 50}, + ) + + payload = json.loads((run_dir / "gate_decision.json").read_text()) + assert payload["decision"] == "reject", ( + f"absolute_char_ceiling must reject even on a winning CL gate; " + f"got decision={payload['decision']} (reason={payload.get('reason')})" + ) + assert "absolute_char_ceiling" in payload.get("failed_constraints", []), ( + f"failed_constraints={payload.get('failed_constraints')}" + ) + # The deploy-gate reject path returns the reject reason from the dict. + assert result["decision"] == "reject" From 70c0e370af77d34c7517f0abd561718d18fe3db1 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 09:05:05 -0600 Subject: [PATCH 7/9] test(evolve_tool): tighten Test 1 assertion, add uniform_failure test, pin evolved_FAILED.json Code-review feedback on the CL-aware gate test suite: 1. Test 1's force_run assertion was substring-based on str(call_args), which silently misses regressions where force_run is called twice or with extra kwargs. Tightened to assert_called_once_with. 2. Added test_uniform_failure_band_falls_through_to_synthetic_gate pinning the spec edge-case (uniform_failure -> synthetic path). Without it, expanding use_cl_primary to include uniform_failure would silently change behavior without a test failing. 3. 
Test 9 (cl_eval_incomplete) now asserts evolved_FAILED.json is written, mirroring Test 8's assertion on the cl_eval_failed abort path. Production writes the file on both abort paths. --- tests/tools/test_evolve_tool_cl_aware_gate.py | 45 ++++++++++++++++--- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/tests/tools/test_evolve_tool_cl_aware_gate.py b/tests/tools/test_evolve_tool_cl_aware_gate.py index 83b1b078..1ab6a3e3 100644 --- a/tests/tools/test_evolve_tool_cl_aware_gate.py +++ b/tests/tools/test_evolve_tool_cl_aware_gate.py @@ -366,12 +366,7 @@ def test_weak_signal_band_triggers_evolved_cl_eval( ): result = _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) - fake_cache.force_run.assert_called() - call_args = fake_cache.force_run.call_args_list - # The CL-primary post-GEPA call passes the evolved description text. - assert any(_LOW_GROWTH_EVOLVED in str(call) for call in call_args), ( - f"Expected force_run to be called with evolved description, got: {call_args}" - ) + fake_cache.force_run.assert_called_once_with(_LOW_GROWTH_EVOLVED) payload = json.loads((run_dir / "gate_decision.json").read_text()) assert payload["decision"] == "deploy", ( @@ -461,6 +456,43 @@ def test_no_headroom_with_cl_data_falls_through_to_synthetic_gate( assert cl_field not in payload +def test_uniform_failure_band_falls_through_to_synthetic_gate( + temp_manifest: Path, tmp_path: Path, +): + """uniform_failure band (CL all-zero, e.g. validator broken) is NOT + covered by use_cl_primary — only weak_signal triggers CL-primary. + Verifies the gate falls through to the synthetic path with no + KeyError and no CL eval. 
If someone later expands use_cl_primary + to include uniform_failure, this test catches the change so it + must be accompanied by a deliberate spec update.""" + fake_cache = MagicMock() + sat_report = SaturationReport( + band="uniform_failure", + holdout_score=0.99, + holdout_n=10, + holdout_per_example=[1.0] * 10, + closed_loop_score=0.0, + closed_loop_n=7, + closed_loop_per_example=[0.0] * 7, + suggestions=[], + thresholds={}, + ) + run_dir = tmp_path / "run" + + with _patch_stack(sat_report=sat_report, fake_cache=fake_cache): + _run_evolve( + manifest_path=temp_manifest, + output_dir=run_dir, + extra_kwargs={"force_saturation_check": True}, + ) + + fake_cache.force_run.assert_not_called() + payload = json.loads((run_dir / "gate_decision.json").read_text()) + assert payload["decision_signal"] == "synthetic" + assert "baseline_closed_loop_per_example" not in payload + assert "cl_tasks_gained" not in payload + + def test_no_saturation_check_falls_through_to_synthetic_with_reason_recorded( temp_manifest: Path, tmp_path: Path, ): @@ -646,6 +678,7 @@ def test_evolved_task_error_writes_cl_eval_incomplete_decision( assert payload["decision"] == "aborted" assert payload["reason"] == "cl_eval_incomplete" assert payload["evolved_closed_loop_errored_tasks"] == ["task_2"] + assert (run_dir / "evolved_FAILED.json").exists() def test_absolute_char_ceiling_still_enforced_in_cl_primary_path( From c7e8714f2b07bdc9c308f9adb93a06b97a905982 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 09:09:29 -0600 Subject: [PATCH 8/9] test(evolve_tool): schema v5 regression tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pin the v4 → v5 additivity contract: every v4 field must still exist in v5 output, plus decision_signal (always) and the CL-specific fields (when use_cl_primary fired). Future schema bumps should add a TestSchemaV{N}Regression class following this pattern. 
--- tests/tools/test_evolve_tool_cl_aware_gate.py | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/tests/tools/test_evolve_tool_cl_aware_gate.py b/tests/tools/test_evolve_tool_cl_aware_gate.py index 1ab6a3e3..830c8f21 100644 --- a/tests/tools/test_evolve_tool_cl_aware_gate.py +++ b/tests/tools/test_evolve_tool_cl_aware_gate.py @@ -726,3 +726,79 @@ def test_absolute_char_ceiling_still_enforced_in_cl_primary_path( ) # The deploy-gate reject path returns the reject reason from the dict. assert result["decision"] == "reject" + + +class TestSchemaV5Regression: + """V5 must be additive over v4. Old consumers should see all v4 fields + plus the new decision_signal field (and the CL-specific fields when + use_cl_primary fired). Future schema bumps should add a parallel + TestSchemaV{N}Regression class following the same pattern.""" + + # V4 fields that MUST persist in v5 output regardless of code path. + # Verified against the decision_payload literal in + # evolution/tools/evolve_tool.py. + V4_REQUIRED_FIELDS = frozenset({ + "schema_version", "decision", "reason", "decision_rule_used", + "gate_mode", "inferiority_tolerance", "growth_pct", + "required_improvement", "baseline_chars", "evolved_chars", + "absolute_char_ceiling", "effective_absolute_char_ceiling", + "growth_free_threshold", "fitness_profile", "proposer_mode", + "growth_quality_slope", "baseline_per_example", + "evolved_per_example", + }) + + def test_synthetic_path_writes_all_v4_fields( + self, temp_manifest: Path, tmp_path: Path, + ): + """healthy band → synthetic gate. 
Every v4 field must still be + present alongside the new decision_signal marker.""" + fake_cache = MagicMock() + run_dir = tmp_path / "run" + + with _patch_stack(sat_report=_healthy_report(), fake_cache=fake_cache): + _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + payload = json.loads((run_dir / "gate_decision.json").read_text()) + + missing = self.V4_REQUIRED_FIELDS - payload.keys() + assert not missing, f"v4 fields missing in v5 synthetic payload: {sorted(missing)}" + assert payload["schema_version"] == "5" + assert payload["decision_signal"] == "synthetic" + + def test_cl_primary_path_writes_all_v4_fields_plus_cl_fields( + self, temp_manifest: Path, tmp_path: Path, + ): + """weak_signal + +2 CL win → CL-primary gate. Every v4 field must + still be present AND every new v5 CL-specific field must be + populated.""" + cl_fields = frozenset({ + "decision_signal", "baseline_closed_loop_per_example", + "evolved_closed_loop_per_example", + "evolved_closed_loop_errored_tasks", "cl_tasks_gained", + "cl_required_gain", "synthetic_sanity_check", + "evolved_cl_eval_cost_usd", "band_trigger_score", + "validator_agent_model", + }) + fake_cache = MagicMock() + # 5/7 baseline → 7/7 evolved with _LOW_GROWTH_EVOLVED keeps + # required_gain=1 so the +2 win clears the gate and the deploy + # branch writes every CL-specific field. 
+ fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, True, True, True, True, True], + ) + run_dir = tmp_path / "run" + + with _patch_stack( + sat_report=_weak_signal_report(), + fake_cache=fake_cache, + evolved_description=_LOW_GROWTH_EVOLVED, + ): + _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + payload = json.loads((run_dir / "gate_decision.json").read_text()) + + missing = (self.V4_REQUIRED_FIELDS | cl_fields) - payload.keys() + assert not missing, f"v5 fields missing in CL-primary payload: {sorted(missing)}" + assert payload["schema_version"] == "5" + assert payload["decision_signal"] == "closed_loop" From b265b3dfe0fd284fbf064bef33c0ce90862b45ba Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 09:33:14 -0600 Subject: [PATCH 9/9] fix(evolve_tool): schema v5 consistency across all gate_decision write sites Final-review feedback caught two seam leaks (items 1 and 2 below); items 3 and 4 are the accompanying regression coverage and a test rename: 1. write_cost_ceiling_abort was hard-coding schema_version=4. If the cost ceiling trips during the CL-primary force_run call, the resulting gate_decision.json had v4 in a v5 directory. Made the schema_version a keyword arg (default 4 for skill-side callers that haven't bumped yet); tool-side passes 5. 2. The static_constraint_failure payload was bumped to v5 in Task 4 but never had decision_signal added. Every other v5 path has it. Set to 'synthetic' since static-fail fires before any CL eval. 3. Extended TestSchemaV5Regression with abort-path coverage so the above issues couldn't have slipped through. Three new tests pin schema_version and decision_signal on cl_eval_failed, cl_eval_incomplete, and static_constraint_failure payloads. 4. Renamed test_accepts_at_pr_68_calibration_point to test_accepts_at_24char_baseline_calibration_point per the project convention against exposing internal PR numbers in code.
--- evolution/core/quality_gate.py | 9 ++- evolution/tools/evolve_tool.py | 2 + tests/core/test_check_cl_primary_gate.py | 6 +- tests/tools/test_evolve_tool_cl_aware_gate.py | 71 +++++++++++++++++++ 4 files changed, 83 insertions(+), 5 deletions(-) diff --git a/evolution/core/quality_gate.py b/evolution/core/quality_gate.py index bf137cb3..426af86f 100644 --- a/evolution/core/quality_gate.py +++ b/evolution/core/quality_gate.py @@ -157,10 +157,15 @@ def write_cost_ceiling_abort( output_dir: Path, run_inputs: dict[str, Any], extra_fields: dict[str, Any] | None = None, + schema_version: str = "4", ) -> Path: """Write a ``decision="aborted"`` gate_decision for a cost-ceiling trip. + ``extra_fields`` lets callers add path-specific keys (e.g., - ``artifact_type``, ``target_tool``). + ``artifact_type``, ``target_tool``). ``schema_version`` defaults to + ``"4"`` so skill-side callers (which haven't bumped past v4 yet) keep + working unchanged; tool-side callers pass ``"5"`` to stay consistent + with the rest of the gate_decision write sites in that ``output_dir``. 
""" cost_summary = COST_LEDGER.summary() _console.print( @@ -168,7 +173,7 @@ def write_cost_ceiling_abort( f"ceiling ${exc.ceiling_usd:.4f}[/bold red]" ) payload: dict[str, Any] = { - "schema_version": "4", + "schema_version": schema_version, "decision": "aborted", "reason": "cost_ceiling_exceeded", "cost_ceiling_usd": exc.ceiling_usd, diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index 97522d3c..7721af8c 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -824,6 +824,7 @@ def evolve( "schema_version": "5", "decision": "reject", "reason": "static_constraint_failure", + "decision_signal": "synthetic", "failed_constraints": [c.constraint_name for c in static_constraints if not c.passed], "messages": [c.message for c in static_constraints if not c.passed], "knee_point": _knee_point_payload(knee_pick), @@ -1250,6 +1251,7 @@ def evolve( "artifact_type": "tool_description", "target_tool": tool_name, }, + schema_version="5", ) return {"decision": "aborted", "reason": "cost_ceiling_exceeded"} finally: diff --git a/tests/core/test_check_cl_primary_gate.py b/tests/core/test_check_cl_primary_gate.py index 26c66a9d..3c43e296 100644 --- a/tests/core/test_check_cl_primary_gate.py +++ b/tests/core/test_check_cl_primary_gate.py @@ -31,9 +31,9 @@ def test_accepts_when_required_gain_met_at_free_threshold(self): assert result.passed is True assert result.constraint_name == "cl_primary_gate" - def test_accepts_at_pr_68_calibration_point(self): - # PR #68: +2 gain on +121% growth → required=ceil(1.0*(1.21-0.20))=2. - # This is the exact case that motivated this work. + def test_accepts_at_24char_baseline_calibration_point(self): + # +2 task gain on +121% growth → required=ceil(1.0*(1.21-0.20))=2 → just barely passes. + # 24-char baseline calibration point from the prior retro-validation. 
result = _check_cl_primary_gate( baseline_cl_passes=5, evolved_cl_passes=7, diff --git a/tests/tools/test_evolve_tool_cl_aware_gate.py b/tests/tools/test_evolve_tool_cl_aware_gate.py index 830c8f21..c5dd86f5 100644 --- a/tests/tools/test_evolve_tool_cl_aware_gate.py +++ b/tests/tools/test_evolve_tool_cl_aware_gate.py @@ -802,3 +802,74 @@ def test_cl_primary_path_writes_all_v4_fields_plus_cl_fields( assert not missing, f"v5 fields missing in CL-primary payload: {sorted(missing)}" assert payload["schema_version"] == "5" assert payload["decision_signal"] == "closed_loop" + + def test_cl_eval_failed_payload_has_schema_v5_and_decision_signal( + self, temp_manifest: Path, tmp_path: Path, + ): + """Abort payloads are diagnostic-only (no full v4 field set), but + must still pin schema_version="5" and a decision_signal so abort + rows route the same way as deploy/reject rows in downstream jq.""" + fake_cache = MagicMock() + fake_cache.force_run.side_effect = RuntimeError("validator crashed") + run_dir = tmp_path / "run" + + with _patch_stack(sat_report=_weak_signal_report(), fake_cache=fake_cache): + _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + payload = json.loads((run_dir / "gate_decision.json").read_text()) + assert payload["schema_version"] == "5" + assert payload["decision_signal"] == "closed_loop" + assert payload["decision"] == "aborted" + assert payload["reason"] == "cl_eval_failed" + + def test_cl_eval_incomplete_payload_has_schema_v5_and_decision_signal( + self, temp_manifest: Path, tmp_path: Path, + ): + """Abort payloads from the incomplete-eval branch must also pin + schema_version="5" and decision_signal so abort rows participate + in v5 cohort queries alongside deploy/reject rows.""" + fake_cache = MagicMock() + # task_2 abstains; mirrors the incomplete-detection scenario. 
+ fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, False, True, True, True, True], + evolved_abstain=[False, False, True, False, False, False, False], + ) + run_dir = tmp_path / "run" + + with _patch_stack(sat_report=_weak_signal_report(), fake_cache=fake_cache): + _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + payload = json.loads((run_dir / "gate_decision.json").read_text()) + assert payload["schema_version"] == "5" + assert payload["decision_signal"] == "closed_loop" + assert payload["decision"] == "aborted" + assert payload["reason"] == "cl_eval_incomplete" + + def test_static_constraint_failure_payload_has_schema_v5_and_decision_signal( + self, temp_manifest: Path, tmp_path: Path, + ): + """Static-fail fires before any CL evaluation could run, so the + user never got into the CL-primary path → decision_signal must be + "synthetic". Triggered by patching _candidate_description to + return an empty string, which fails the non_empty constraint.""" + fake_cache = MagicMock() + run_dir = tmp_path / "run" + + # Use the healthy band so we route through the synthetic-only + # path conceptually, then make _candidate_description return "" + # to trip the non_empty static constraint. The _patch_stack + # context manager already patches _candidate_description; we + # override it here with an empty string. + with _patch_stack( + sat_report=_healthy_report(), + fake_cache=fake_cache, + evolved_description="", + ): + _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + payload = json.loads((run_dir / "gate_decision.json").read_text()) + assert payload["schema_version"] == "5" + assert payload["decision_signal"] == "synthetic" + assert payload["decision"] == "reject" + assert payload["reason"] == "static_constraint_failure"