From 47cabe8b6d8cdf24688979038e7ee39d71943e11 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 21:56:09 -0600 Subject: [PATCH 1/5] refactor: extract build_run_inputs to evolution.core.run_inputs Five sites built the same `run_inputs` dict by hand (3 skill, 2 tool). Two latent inconsistencies fall out of normalizing through one helper: - evolve_tool's deploy-path literal was missing `eval_source`, which the skill side has always included. The helper restores it. - evolve_tool's cost-ceiling fallback was missing `enable_confusable_bucket`, which the deploy-path schema test asserts is present. Whenever the cost ceiling tripped on the deploy path, the fallback fired and silently dropped the field; routing through the helper closes that gap. The helper's `quality_gate_preset`/`fitness_profile`/`enable_confusable_bucket` kwargs make both the shape contract and the optional-on-skill / required-on-tool asymmetry explicit at every call site. --- evolution/core/run_inputs.py | 49 +++++++++++++ evolution/skills/evolve_skill.py | 70 ++++++------------ evolution/tools/evolve_tool.py | 53 +++++--------- tests/core/test_run_inputs.py | 117 +++++++++++++++++++++++++++++++ 4 files changed, 207 insertions(+), 82 deletions(-) create mode 100644 evolution/core/run_inputs.py create mode 100644 tests/core/test_run_inputs.py diff --git a/evolution/core/run_inputs.py b/evolution/core/run_inputs.py new file mode 100644 index 0000000..c47bb0d --- /dev/null +++ b/evolution/core/run_inputs.py @@ -0,0 +1,49 @@ +"""Build the `run_inputs` block written into every gate_decision.json. + +The block records every input that produced a given run so a third party +holding only the gate_decision.json artifact can reproduce the result. +Centralizing construction here keeps the skill and tool sides from +drifting — the cost-ceiling fallback in `evolve_tool` historically built +the block by hand and dropped `enable_confusable_bucket`, breaking the +deploy-gate schema contract. +""" + +from __future__ import annotations + +from typing import Any, Optional + +from evolution.core.config import EvolutionConfig +from evolution.core.hermes_provider import resolved_lms_dump + + +def build_run_inputs( + *, + config: EvolutionConfig, + iterations: int, + optimizer_model: str, + quality_gate_preset: str, + eval_source: str, + fitness_profile: Optional[str] = None, + enable_confusable_bucket: Optional[bool] = None, +) -> dict[str, Any]: + run_inputs: dict[str, Any] = { + "seed": config.seed, + "iterations": iterations, + "optimizer_model": optimizer_model, + "reflection_model": config.reflection_model, + "eval_model": config.eval_model, + "resolved_lms": resolved_lms_dump( + optimizer=optimizer_model, + reflection=config.reflection_model, + eval_=config.eval_model, + ), + "eval_dataset_size": config.eval_dataset_size, + "holdout_ratio": config.holdout_ratio, + "quality_gate_preset": quality_gate_preset, + "eval_source": eval_source, + } + if fitness_profile is not None: + run_inputs["fitness_profile"] = fitness_profile + if enable_confusable_bucket is not None: + run_inputs["enable_confusable_bucket"] = enable_confusable_bucket + return run_inputs diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index ee4fb0c..b0161aa 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -52,6 +52,7 @@ write_cost_ceiling_abort, write_gate_decision, ) +from evolution.core.run_inputs import build_run_inputs from evolution.core.skill_sources import discover_skill_sources # Without this, the BudgetAwareProposer + LMTimingCallback logs stay @@ -1053,22 +1054,13 @@ def evolve( "messages": [c.message for c in static_constraints if not c.passed], "knee_point": _knee_point_payload(knee_pick), "dataset": _dataset_payload(dataset), - "run_inputs": { - "seed": config.seed, - "iterations": iterations, - "optimizer_model": optimizer_model, - "reflection_model": config.reflection_model, - "eval_model": config.eval_model, - "resolved_lms": resolved_lms_dump( - optimizer=optimizer_model, - reflection=config.reflection_model, - eval_=config.eval_model, - ), - "eval_dataset_size": config.eval_dataset_size, - "holdout_ratio": config.holdout_ratio, - "quality_gate_preset": quality_gate, - "eval_source": eval_source, - }, + "run_inputs": build_run_inputs( + config=config, + iterations=iterations, + optimizer_model=optimizer_model, + quality_gate_preset=quality_gate, + eval_source=eval_source, + ), }) console.print(f" Saved failed variant to {failed_path}") return @@ -1104,22 +1096,13 @@ def evolve( # Hoist run_inputs to a local — referenced from 3 sites (the # two CL-primary abort paths + the main decision_payload). - run_inputs = { - "seed": config.seed, - "iterations": iterations, - "optimizer_model": optimizer_model, - "reflection_model": config.reflection_model, - "eval_model": config.eval_model, - "resolved_lms": resolved_lms_dump( - optimizer=optimizer_model, - reflection=config.reflection_model, - eval_=config.eval_model, - ), - "eval_dataset_size": config.eval_dataset_size, - "holdout_ratio": config.holdout_ratio, - "quality_gate_preset": quality_gate, - "eval_source": eval_source, - } + run_inputs = build_run_inputs( + config=config, + iterations=iterations, + optimizer_model=optimizer_model, + quality_gate_preset=quality_gate, + eval_source=eval_source, + ) use_cl_primary = ( preflight_band == "weak_signal" @@ -1488,22 +1471,13 @@ def evolve( write_cost_ceiling_abort( exc, output_dir=output_dir, - run_inputs={ - "seed": config.seed, - "iterations": iterations, - "optimizer_model": optimizer_model, - "reflection_model": config.reflection_model, - "eval_model": config.eval_model, - "resolved_lms": resolved_lms_dump( - optimizer=optimizer_model, - reflection=config.reflection_model, - eval_=config.eval_model, - ), - "eval_dataset_size": config.eval_dataset_size, - "holdout_ratio": config.holdout_ratio, - "quality_gate_preset": quality_gate, - "eval_source": eval_source, - }, + run_inputs=build_run_inputs( + config=config, + iterations=iterations, + optimizer_model=optimizer_model, + quality_gate_preset=quality_gate, + eval_source=eval_source, + ), schema_version="5", ) return diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index 7721af8..402ee80 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -70,6 +70,7 @@ write_cost_ceiling_abort, write_gate_decision, ) +from evolution.core.run_inputs import build_run_inputs from evolution.core.stats import paired_bootstrap from evolution.skills.knee_point import CandidatePick, select_knee_point from evolution.tools.session_mining import ( @@ -791,23 +792,15 @@ def evolve( if not c.passed: static_pass = False - run_inputs = { - "seed": config.seed, - "iterations": iterations, - "optimizer_model": optimizer_model, - "reflection_model": config.reflection_model, - "eval_model": config.eval_model, - "resolved_lms": resolved_lms_dump( - optimizer=optimizer_model, - reflection=config.reflection_model, - eval_=config.eval_model, - ), - "eval_dataset_size": config.eval_dataset_size, - "holdout_ratio": config.holdout_ratio, - "quality_gate_preset": quality_gate, - "fitness_profile": fitness_profile, - "enable_confusable_bucket": config.enable_confusable_bucket, - } + run_inputs = build_run_inputs( + config=config, + iterations=iterations, + optimizer_model=optimizer_model, + quality_gate_preset=quality_gate, + eval_source=eval_source, + fitness_profile=fitness_profile, + enable_confusable_bucket=config.enable_confusable_bucket, + ) tool_payload_fields = { "artifact_type": "tool_description", "target_tool": tool_name, @@ -1226,23 +1219,15 @@ def evolve( except CostCeilingExceeded as exc: # Abort may fire before `run_inputs` is built in the deploy path; # fall back to a minimal equivalent so gate_decision.json is still useful. - run_inputs_for_abort = locals().get("run_inputs") or { - "seed": config.seed, - "iterations": iterations, - "optimizer_model": optimizer_model, - "reflection_model": config.reflection_model, - "eval_model": config.eval_model, - "resolved_lms": resolved_lms_dump( - optimizer=optimizer_model, - reflection=config.reflection_model, - eval_=config.eval_model, - ), - "eval_dataset_size": config.eval_dataset_size, - "holdout_ratio": config.holdout_ratio, - "quality_gate_preset": quality_gate, - "fitness_profile": fitness_profile, - "eval_source": eval_source, - } + run_inputs_for_abort = locals().get("run_inputs") or build_run_inputs( + config=config, + iterations=iterations, + optimizer_model=optimizer_model, + quality_gate_preset=quality_gate, + eval_source=eval_source, + fitness_profile=fitness_profile, + enable_confusable_bucket=config.enable_confusable_bucket, + ) write_cost_ceiling_abort( exc, output_dir=output_dir, diff --git a/tests/core/test_run_inputs.py b/tests/core/test_run_inputs.py new file mode 100644 index 0000000..e839a39 --- /dev/null +++ b/tests/core/test_run_inputs.py @@ -0,0 +1,117 @@ +"""Unit tests for `build_run_inputs`. + +The helper centralizes the construction of the `run_inputs` block written +into every gate_decision.json. Both `evolve_skill` and `evolve_tool` build +the same nine-key core; the tool side adds two extra keys +(`fitness_profile`, `enable_confusable_bucket`). Lock the shape so future +refactors can't silently drop a key — the cost-ceiling fallback in +`evolve_tool` historically dropped `enable_confusable_bucket`, breaking +the deploy-gate contract asserted by `TestGateDecisionSchemaOnDeploy`. +""" + +from __future__ import annotations + +from evolution.core.config import EvolutionConfig +from evolution.core.hermes_provider import resolved_lms_dump +from evolution.core.run_inputs import build_run_inputs + + +def _fake_config() -> EvolutionConfig: + return EvolutionConfig( + seed=42, + reflection_model="openai/gpt-4.1", + eval_model="openai/gpt-4.1-mini", + eval_dataset_size=150, + holdout_ratio=0.5, + enable_confusable_bucket=False, + ) + + +class TestBuildRunInputs: + def test_skill_side_has_nine_keys(self): + config = _fake_config() + result = build_run_inputs( + config=config, + iterations=10, + optimizer_model="openai/gpt-4.1", + quality_gate_preset="default", + eval_source="synthetic", + ) + assert set(result.keys()) == { + "seed", + "iterations", + "optimizer_model", + "reflection_model", + "eval_model", + "resolved_lms", + "eval_dataset_size", + "holdout_ratio", + "quality_gate_preset", + "eval_source", + } + assert len(result) == 10 + + def test_tool_side_adds_fitness_profile_and_confusable_bucket(self): + config = _fake_config() + config.enable_confusable_bucket = True + result = build_run_inputs( + config=config, + iterations=10, + optimizer_model="openai/gpt-4.1", + quality_gate_preset="default", + eval_source="synthetic", + fitness_profile="balanced", + enable_confusable_bucket=True, + ) + assert set(result.keys()) == { + "seed", + "iterations", + "optimizer_model", + "reflection_model", + "eval_model", + "resolved_lms", + "eval_dataset_size", + "holdout_ratio", + "quality_gate_preset", + "eval_source", + "fitness_profile", + "enable_confusable_bucket", + } + assert len(result) == 12 + assert result["fitness_profile"] == "balanced" + assert result["enable_confusable_bucket"] is True + + def test_resolved_lms_matches_helper_output(self): + config = _fake_config() + result = build_run_inputs( + config=config, + iterations=10, + optimizer_model="openai/gpt-4.1", + quality_gate_preset="default", + eval_source="synthetic", + ) + expected = resolved_lms_dump( + optimizer="openai/gpt-4.1", + reflection=config.reflection_model, + eval_=config.eval_model, + ) + assert result["resolved_lms"] == expected + + def test_enable_confusable_bucket_round_trips_when_passed(self): + # Regression: the cost-ceiling fallback in evolve_tool historically + # built run_inputs without `enable_confusable_bucket`, which broke + # `TestGateDecisionSchemaOnDeploy::test_gate_decision_schema_on_deploy` + # whenever the cost ceiling tripped on a deploy path. + config = _fake_config() + config.enable_confusable_bucket = True + result = build_run_inputs( + config=config, + iterations=10, + optimizer_model="openai/gpt-4.1", + quality_gate_preset="default", + eval_source="synthetic", + fitness_profile="balanced", + enable_confusable_bucket=config.enable_confusable_bucket, + ) + assert "enable_confusable_bucket" in result + assert result["enable_confusable_bucket"] is True From c4610c1458a3bd4438ddf001f04f2b24c7abc4f1 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 22:04:45 -0600 Subject: [PATCH 2/5] polish: trim build_run_inputs docstrings and tighten tests Address review feedback on 47cabe8b: - Drop historical narrative from helper and test module docstrings; keep the load-bearing "what does this produce" sentence. - Downgrade test_enable_confusable_bucket_round_trips docstring to honestly describe what the test covers (helper round-trip only, not the cost-ceiling call site). - Remove redundant len() assertions in shape tests (the set equality already pins the count) and rename test_skill_side_has_nine_keys. - Remove stale "hoist run_inputs to a local" comment; the helper call makes the intent obvious. --- evolution/core/run_inputs.py | 4 ---- evolution/skills/evolve_skill.py | 2 -- tests/core/test_run_inputs.py | 22 +++++----------------- 3 files changed, 5 insertions(+), 23 deletions(-) diff --git a/evolution/core/run_inputs.py b/evolution/core/run_inputs.py index c47bb0d..7862b1c 100644 --- a/evolution/core/run_inputs.py +++ b/evolution/core/run_inputs.py @@ -2,10 +2,6 @@ The block records every input that produced a given run so a third party holding only the gate_decision.json artifact can reproduce the result. -Centralizing construction here keeps the skill and tool sides from -drifting — the cost-ceiling fallback in `evolve_tool` historically built -the block by hand and dropped `enable_confusable_bucket`, breaking the -deploy-gate schema contract. """ from __future__ import annotations diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index b0161aa..fc7aca3 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -1094,8 +1094,6 @@ def evolve( evolved_chars = len(evolved_full) growth_pct = (evolved_chars - baseline_chars) / max(1, baseline_chars) - # Hoist run_inputs to a local — referenced from 3 sites (the - # two CL-primary abort paths + the main decision_payload). run_inputs = build_run_inputs( config=config, iterations=iterations, diff --git a/tests/core/test_run_inputs.py b/tests/core/test_run_inputs.py index e839a39..e525d55 100644 --- a/tests/core/test_run_inputs.py +++ b/tests/core/test_run_inputs.py @@ -1,13 +1,4 @@ -"""Unit tests for `build_run_inputs`. - -The helper centralizes the construction of the `run_inputs` block written -into every gate_decision.json. Both `evolve_skill` and `evolve_tool` build -the same nine-key core; the tool side adds two extra keys -(`fitness_profile`, `enable_confusable_bucket`). Lock the shape so future -refactors can't silently drop a key — the cost-ceiling fallback in -`evolve_tool` historically dropped `enable_confusable_bucket`, breaking -the deploy-gate contract asserted by `TestGateDecisionSchemaOnDeploy`. -""" +"""Unit tests for `build_run_inputs`.""" from __future__ import annotations @@ -28,7 +19,7 @@ def _fake_config() -> EvolutionConfig: class TestBuildRunInputs: - def test_skill_side_has_nine_keys(self): + def test_skill_side_shape(self): config = _fake_config() result = build_run_inputs( config=config, @@ -49,7 +40,6 @@ def test_skill_side_has_nine_keys(self): "quality_gate_preset", "eval_source", } - assert len(result) == 10 def test_tool_side_adds_fitness_profile_and_confusable_bucket(self): config = _fake_config() @@ -77,7 +67,6 @@ def test_tool_side_adds_fitness_profile_and_confusable_bucket(self): "fitness_profile", "enable_confusable_bucket", } - assert len(result) == 12 assert result["fitness_profile"] == "balanced" assert result["enable_confusable_bucket"] is True @@ -98,10 +87,9 @@ def test_resolved_lms_matches_helper_output(self): assert result["resolved_lms"] == expected def test_enable_confusable_bucket_round_trips_when_passed(self): - # Regression: the cost-ceiling fallback in evolve_tool historically - # built run_inputs without `enable_confusable_bucket`, which broke - # `TestGateDecisionSchemaOnDeploy::test_gate_decision_schema_on_deploy` - # whenever the cost ceiling tripped on a deploy path. + # Helper round-trip only. Call-site coverage that the deploy-gate + # paths actually pass this kwarg lives in + # `TestGateDecisionSchemaOnDeploy::test_gate_decision_schema_on_deploy`. config = _fake_config() config.enable_confusable_bucket = True result = build_run_inputs( From 5edd73dbc7d1876941771a061448164934622796 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 22:11:58 -0600 Subject: [PATCH 3/5] refactor: extract append_cl_decision_fields to evolution.core.quality_gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The same 9-field CL-decision mutation block was duplicated byte-for-byte in evolve_tool.py and evolve_skill.py. Move it to a shared helper so the deploy-path callsites collapse to one call each and the CL_PRIMARY_* / math.ceil details live in one place. Scope is the success-path block only. The cl_eval_failed / cl_eval_incomplete abort callsites also write CL-related fields, but with a different shape (flat dict literal, no synthetic_sanity_check, no cl_tasks_gained / cl_required_gain) — forcing them through this helper would either inflate the abort payload or require ad-hoc kwargs that obscure the contract. Left as a follow-up. evolved_cl_errored_task_ids defaults to () so the deploy-path caller stays empty by construction; abort-path adoption can pass the populated list without diverging the signature. --- evolution/core/quality_gate.py | 48 ++++++++++- evolution/skills/evolve_skill.py | 39 +++------ evolution/tools/evolve_tool.py | 41 +++------- tests/core/test_append_cl_decision_fields.py | 85 ++++++++++++++++++++ 4 files changed, 156 insertions(+), 57 deletions(-) create mode 100644 tests/core/test_append_cl_decision_fields.py diff --git a/evolution/core/quality_gate.py b/evolution/core/quality_gate.py index 426af86..a87e992 100644 --- a/evolution/core/quality_gate.py +++ b/evolution/core/quality_gate.py @@ -12,7 +12,7 @@ import subprocess import time from pathlib import Path -from typing import Any +from typing import Any, Optional, Sequence from rich.console import Console @@ -97,6 +97,52 @@ def _check_cl_primary_gate( ) +def append_cl_decision_fields( + decision_payload: dict, + *, + cached_baseline_cl_per_example: list[float], + evolved_cl_per_example: list[float], + avg_baseline: float, + avg_evolved: float, + growth_pct: float, + cl_eval_cost_usd: float, + preflight_holdout_score: Optional[float], + preflight_cl_score: Optional[float], + closed_loop_agent_model: str, + evolved_cl_errored_task_ids: Sequence = (), +) -> None: + """In-place mutation: adds the 9 closed-loop fields to ``decision_payload``. + + ``evolved_cl_errored_task_ids`` defaults to ``()`` so the deploy-path + caller (no errors by construction) can omit the kwarg; future abort-path + callers can pass the populated list without a separate code path. + """ + decision_payload["baseline_closed_loop_per_example"] = cached_baseline_cl_per_example + decision_payload["evolved_closed_loop_per_example"] = evolved_cl_per_example + decision_payload["evolved_closed_loop_errored_tasks"] = list(evolved_cl_errored_task_ids) + decision_payload["cl_tasks_gained"] = ( + int(sum(evolved_cl_per_example)) - int(sum(cached_baseline_cl_per_example)) + ) + decision_payload["cl_required_gain"] = max( + 1, + math.ceil( + max(0.0, CL_PRIMARY_GROWTH_SLOPE * (growth_pct - CL_PRIMARY_GROWTH_FREE_THRESHOLD)) + ), + ) + decision_payload["synthetic_sanity_check"] = { + "tolerance": CL_PRIMARY_SYNTH_TOLERANCE, + "baseline_mean": avg_baseline, + "evolved_mean": avg_evolved, + "passed": (avg_evolved - avg_baseline) >= -CL_PRIMARY_SYNTH_TOLERANCE, + } + decision_payload["evolved_cl_eval_cost_usd"] = cl_eval_cost_usd + decision_payload["band_trigger_score"] = { + "holdout": preflight_holdout_score, + "closed_loop": preflight_cl_score, + } + decision_payload["validator_agent_model"] = closed_loop_agent_model + + # `default` is calibrated against the obsidian deploy (+24.2% growth, # ~+0.07 expected improvement). `off` disables the slope/ceiling checks # but still enforces bootstrap.mean ≥ 0 — see deprecation warning when diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index fc7aca3..b81c3b6 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -8,7 +8,6 @@ import difflib import json import logging -import math import random import sys import time @@ -42,11 +41,9 @@ resolved_lms_dump, ) from evolution.core.quality_gate import ( - CL_PRIMARY_GROWTH_FREE_THRESHOLD, - CL_PRIMARY_GROWTH_SLOPE, - CL_PRIMARY_SYNTH_TOLERANCE, QUALITY_GATE_PRESETS, _check_cl_primary_gate, + append_cl_decision_fields, resolve_proposer_mode, run_benchmark_hook, write_cost_ceiling_abort, @@ -1344,30 +1341,18 @@ def evolve( decision_payload["benchmark"] = benchmark_block if use_cl_primary: - decision_payload["baseline_closed_loop_per_example"] = cached_baseline_cl_per_example - decision_payload["evolved_closed_loop_per_example"] = evolved_cl_per_example - decision_payload["evolved_closed_loop_errored_tasks"] = [] # populated only on abort path - decision_payload["cl_tasks_gained"] = ( - int(sum(evolved_cl_per_example)) - int(sum(cached_baseline_cl_per_example)) - ) - decision_payload["cl_required_gain"] = max( - 1, - math.ceil( - max(0.0, CL_PRIMARY_GROWTH_SLOPE * (growth_pct - CL_PRIMARY_GROWTH_FREE_THRESHOLD)) - ), + append_cl_decision_fields( + decision_payload, + cached_baseline_cl_per_example=cached_baseline_cl_per_example, + evolved_cl_per_example=evolved_cl_per_example, + avg_baseline=avg_baseline, + avg_evolved=avg_evolved, + growth_pct=growth_pct, + cl_eval_cost_usd=cl_eval_cost_usd, + preflight_holdout_score=preflight_holdout_score, + preflight_cl_score=preflight_cl_score, + closed_loop_agent_model=closed_loop_agent_model, ) - decision_payload["synthetic_sanity_check"] = { - "tolerance": CL_PRIMARY_SYNTH_TOLERANCE, - "baseline_mean": avg_baseline, - "evolved_mean": avg_evolved, - "passed": (avg_evolved - avg_baseline) >= -CL_PRIMARY_SYNTH_TOLERANCE, - } - decision_payload["evolved_cl_eval_cost_usd"] = cl_eval_cost_usd - decision_payload["band_trigger_score"] = { - "holdout": preflight_holdout_score, - "closed_loop": preflight_cl_score, - } - decision_payload["validator_agent_model"] = closed_loop_agent_model if not use_cl_primary and preflight_band is None: # User passed --no-saturation-check; record why CL-primary diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index 402ee80..af7cff1 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -10,7 +10,6 @@ import difflib import json import logging -import math import sys import time from datetime import datetime @@ -60,11 +59,9 @@ register_litellm_failure_callback, ) from evolution.core.quality_gate import ( - CL_PRIMARY_GROWTH_FREE_THRESHOLD, - CL_PRIMARY_GROWTH_SLOPE, - CL_PRIMARY_SYNTH_TOLERANCE, QUALITY_GATE_PRESETS, _check_cl_primary_gate, + append_cl_decision_fields, resolve_proposer_mode, run_benchmark_hook, write_cost_ceiling_abort, @@ -1078,32 +1075,18 @@ def evolve( if benchmark_block is not None: decision_payload["benchmark"] = benchmark_block if use_cl_primary: - decision_payload["baseline_closed_loop_per_example"] = cached_baseline_cl_per_example - decision_payload["evolved_closed_loop_per_example"] = evolved_cl_per_example - # Populated only on the abort path (cl_eval_incomplete); empty - # here because we reach this block only when no task errored. - decision_payload["evolved_closed_loop_errored_tasks"] = [] - decision_payload["cl_tasks_gained"] = ( - int(sum(evolved_cl_per_example)) - int(sum(cached_baseline_cl_per_example)) - ) - decision_payload["cl_required_gain"] = max( - 1, - math.ceil( - max(0.0, CL_PRIMARY_GROWTH_SLOPE * (growth_pct - CL_PRIMARY_GROWTH_FREE_THRESHOLD)) - ), + append_cl_decision_fields( + decision_payload, + cached_baseline_cl_per_example=cached_baseline_cl_per_example, + evolved_cl_per_example=evolved_cl_per_example, + avg_baseline=avg_baseline, + avg_evolved=avg_evolved, + growth_pct=growth_pct, + cl_eval_cost_usd=cl_eval_cost_usd, + preflight_holdout_score=preflight_holdout_score, + preflight_cl_score=preflight_cl_score, + closed_loop_agent_model=closed_loop_agent_model, ) - decision_payload["synthetic_sanity_check"] = { - "tolerance": CL_PRIMARY_SYNTH_TOLERANCE, - "baseline_mean": avg_baseline, - "evolved_mean": avg_evolved, - "passed": (avg_evolved - avg_baseline) >= -CL_PRIMARY_SYNTH_TOLERANCE, - } - decision_payload["evolved_cl_eval_cost_usd"] = cl_eval_cost_usd - decision_payload["band_trigger_score"] = { - "holdout": preflight_holdout_score, - "closed_loop": preflight_cl_score, - } - decision_payload["validator_agent_model"] = closed_loop_agent_model if not use_cl_primary and preflight_band is None: # User passed --no-saturation-check; record why CL-primary diff --git a/tests/core/test_append_cl_decision_fields.py b/tests/core/test_append_cl_decision_fields.py new file mode 100644 index 0000000..597ba28 --- /dev/null +++ b/tests/core/test_append_cl_decision_fields.py @@ -0,0 +1,85 @@ +"""Unit tests for `append_cl_decision_fields`.""" + +from __future__ import annotations + +import math + +from evolution.core.quality_gate import ( + CL_PRIMARY_GROWTH_FREE_THRESHOLD, + CL_PRIMARY_GROWTH_SLOPE, + CL_PRIMARY_SYNTH_TOLERANCE, + append_cl_decision_fields, +) + + +def _call(payload: dict, **overrides) -> None: + kwargs = dict( + cached_baseline_cl_per_example=[1.0, 0.0, 1.0], + evolved_cl_per_example=[1.0, 1.0, 1.0], + avg_baseline=0.60, + avg_evolved=0.65, + growth_pct=0.25, + cl_eval_cost_usd=0.0123, + preflight_holdout_score=0.7, + preflight_cl_score=0.4, + closed_loop_agent_model="openai/gpt-4.1-mini", + ) + kwargs.update(overrides) + append_cl_decision_fields(payload, **kwargs) + + +class TestAppendClDecisionFields: + def test_all_nine_fields_added(self): + payload: dict = {} + _call(payload) + assert set(payload.keys()) == { + "baseline_closed_loop_per_example", + "evolved_closed_loop_per_example", + "evolved_closed_loop_errored_tasks", + "cl_tasks_gained", + "cl_required_gain", + "synthetic_sanity_check", + "evolved_cl_eval_cost_usd", + "band_trigger_score", + "validator_agent_model", + } + assert payload["cl_tasks_gained"] == 3 - 2 + assert payload["evolved_cl_eval_cost_usd"] == 0.0123 + assert payload["band_trigger_score"] == {"holdout": 0.7, "closed_loop": 0.4} + assert payload["validator_agent_model"] == "openai/gpt-4.1-mini" + sanity = payload["synthetic_sanity_check"] + assert sanity["tolerance"] == CL_PRIMARY_SYNTH_TOLERANCE + assert sanity["baseline_mean"] == 0.60 + assert sanity["evolved_mean"] == 0.65 + assert sanity["passed"] is True + + def test_errored_task_ids_defaults_to_empty_list(self): + payload: dict = {} + _call(payload) + assert payload["evolved_closed_loop_errored_tasks"] == [] + + def test_cl_required_gain_uses_constants_not_magic_numbers(self): + # Pin the formula so silent constant→literal substitutions break this test. + payload: dict = {} + _call(payload, growth_pct=CL_PRIMARY_GROWTH_FREE_THRESHOLD + 0.5) + expected = max(1, math.ceil(CL_PRIMARY_GROWTH_SLOPE * 0.5)) + assert payload["cl_required_gain"] == expected + + def test_synthetic_sanity_check_passed_reflects_tolerance(self): + # Exactly at the boundary: passes. + boundary_payload: dict = {} + _call( + boundary_payload, + avg_baseline=0.50, + avg_evolved=0.50 - CL_PRIMARY_SYNTH_TOLERANCE, + ) + assert boundary_payload["synthetic_sanity_check"]["passed"] is True + + # Just past the boundary: fails. + over_payload: dict = {} + _call( + over_payload, + avg_baseline=0.50, + avg_evolved=0.50 - CL_PRIMARY_SYNTH_TOLERANCE - 0.001, + ) + assert over_payload["synthetic_sanity_check"]["passed"] is False From 63889479e1b39193735f2c4e82d2dc3afb315a95 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 22:19:50 -0600 Subject: [PATCH 4/5] polish: drop speculative evolved_cl_errored_task_ids kwarg Address review feedback on 5edd73db: - Drop evolved_cl_errored_task_ids kwarg and its Sequence import. The deferred abort paths have a different scaffold and won't adopt this helper; the kwarg + its forward-looking docstring were YAGNI. - Hard-code evolved_closed_loop_errored_tasks = [] in the helper body (matches deploy-path semantics: success path has no errors by construction). - Trim docstring to one line. - Rename test_errored_task_ids_defaults_to_empty_list to test_errored_tasks_is_empty_list (no longer about a default). --- evolution/core/quality_gate.py | 12 +++--------- tests/core/test_append_cl_decision_fields.py | 2 +- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/evolution/core/quality_gate.py b/evolution/core/quality_gate.py index a87e992..6800050 100644 --- a/evolution/core/quality_gate.py +++ b/evolution/core/quality_gate.py @@ -12,7 +12,7 @@ import subprocess import time from pathlib import Path -from typing import Any, Optional, Sequence +from typing import Any, Optional from rich.console import Console @@ -109,17 +109,11 @@ def append_cl_decision_fields( preflight_holdout_score: Optional[float], preflight_cl_score: Optional[float], closed_loop_agent_model: str, - evolved_cl_errored_task_ids: Sequence = (), ) -> None: - """In-place mutation: adds the 9 closed-loop fields to ``decision_payload``. - - ``evolved_cl_errored_task_ids`` defaults to ``()`` so the deploy-path - caller (no errors by construction) can omit the kwarg; future abort-path - callers can pass the populated list without a separate code path. - """ + """Append the closed-loop deploy-gate decision fields to ``decision_payload``.""" decision_payload["baseline_closed_loop_per_example"] = cached_baseline_cl_per_example decision_payload["evolved_closed_loop_per_example"] = evolved_cl_per_example - decision_payload["evolved_closed_loop_errored_tasks"] = list(evolved_cl_errored_task_ids) + decision_payload["evolved_closed_loop_errored_tasks"] = [] decision_payload["cl_tasks_gained"] = ( int(sum(evolved_cl_per_example)) - int(sum(cached_baseline_cl_per_example)) ) diff --git a/tests/core/test_append_cl_decision_fields.py b/tests/core/test_append_cl_decision_fields.py index 597ba28..6219c0e 100644 --- a/tests/core/test_append_cl_decision_fields.py +++ b/tests/core/test_append_cl_decision_fields.py @@ -53,7 +53,7 @@ def test_all_nine_fields_added(self): assert sanity["evolved_mean"] == 0.65 assert sanity["passed"] is True - def test_errored_task_ids_defaults_to_empty_list(self): + def test_errored_tasks_is_empty_list(self): payload: dict = {} _call(payload) assert payload["evolved_closed_loop_errored_tasks"] == [] From e2d0ab9e62e1b5260bbf26b1977087e30cb886c9 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sat, 23 May 2026 22:30:13 -0600 Subject: [PATCH 5/5] fix: surface CL gains in summary panel when CL-primary deployed The panel rendered "did not improve" on a CL-primary deploy because the synthetic delta is often near zero (or negative) when the closed-loop gate is the one driving the decision. Now under use_cl_primary the deploy line announces the CL task gain and the reject line names the shortfall vs the required gain; the Evolution Results table picks up a "Closed-loop (behavioral)" row and colors all rows by the gate verdict rather than the irrelevant synthetic delta. --- evolution/skills/evolve_skill.py | 33 ++++++- evolution/tools/evolve_tool.py | 32 ++++++- .../skills/test_evolve_skill_cl_aware_gate.py | 96 +++++++++++++++++++ tests/tools/test_evolve_tool_cl_aware_gate.py | 87 +++++++++++++++++ 4 files changed, 242 insertions(+), 6 deletions(-) diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index b81c3b6..4a2dff8 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -1364,6 +1364,12 @@ def evolve( if not growth_pass: console.print("[red]✗ Evolved skill REJECTED by quality gate — not deploying[/red]") + if use_cl_primary: + console.print( + f"[yellow]⚠ Evolution rejected: " + f"CL gain {decision_payload['cl_tasks_gained']} < " + f"required {decision_payload['cl_required_gain']}[/yellow]" + ) failed_path = output_dir / "evolved_FAILED.md" failed_path.write_text(evolved_full) console.print(f" Saved failed variant to {failed_path}") @@ -1386,13 +1392,28 @@ def evolve( table.add_column("Evolved", justify="right") table.add_column("Change", justify="right") - change_color = "green" if improvement > 0 else "red" + # Under CL-primary, the gate verdict — not the synthetic delta — + # decides the row color; the synthetic delta is informational. + row_color = ( + ("green" if growth_pass else "yellow") + if use_cl_primary + else ("green" if improvement > 0 else "red") + ) table.add_row( "Holdout Score", f"{avg_baseline:.3f}", f"{avg_evolved:.3f}", - f"[{change_color}]{improvement:+.3f}[/{change_color}]", + f"[{row_color}]{improvement:+.3f}[/{row_color}]", ) + if use_cl_primary: + baseline_cl = int(sum(cached_baseline_cl_per_example)) + evolved_cl = int(sum(evolved_cl_per_example)) + table.add_row( + "Closed-loop (behavioral)", + f"{baseline_cl} tasks", + f"{evolved_cl} tasks", + f"[{row_color}]{evolved_cl - baseline_cl:+d} tasks[/{row_color}]", + ) table.add_row( "Skill Size", f"{len(skill['body']):,} chars", @@ -1444,7 +1465,13 @@ def evolve( if applied: console.print(f" --apply: wrote evolved skill to {skill_path}") - if improvement > 0: + if use_cl_primary: + console.print( + f"\n[bold green]✓ Evolution improved skill " + f"(CL gained +{decision_payload['cl_tasks_gained']} tasks)[/bold green]" + ) + console.print(f" Review the diff: diff {output_dir}/baseline_skill.md {output_dir}/evolved_skill.md") + elif improvement > 0: console.print(f"\n[bold green]✓ Evolution improved skill by {improvement:+.3f} ({improvement/max(0.001, avg_baseline)*100:+.1f}%)[/bold green]") console.print(f" Review the diff: diff {output_dir}/baseline_skill.md {output_dir}/evolved_skill.md") else: diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index af7cff1..40759bd 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -1099,6 +1099,12 @@ def evolve( if not growth_pass: console.print("[red]✗ Evolved description REJECTED by quality gate — not deploying[/red]") + if use_cl_primary: + console.print( + f"[yellow]⚠ Evolution rejected: " + f"CL gain {decision_payload['cl_tasks_gained']} < " + f"required {decision_payload['cl_required_gain']}[/yellow]" + ) evolved_manifest = manifest.replace_description(tool_name, evolved_description) failed_path = output_dir / "evolved_FAILED.json" failed_path.write_text(json.dumps(_manifest_to_dict(evolved_manifest), indent=2) + "\n") @@ -1121,13 +1127,28 @@ def evolve( table.add_column("Baseline", justify="right") table.add_column("Evolved", justify="right") table.add_column("Change", justify="right") - change_color = "green" if improvement > 0 else "red" + # Under CL-primary, the gate verdict — not the synthetic delta — + # decides the row color; the synthetic delta is informational. + row_color = ( + ("green" if growth_pass else "yellow") + if use_cl_primary + else ("green" if improvement > 0 else "red") + ) table.add_row( "Holdout Score", f"{avg_baseline:.3f}", f"{avg_evolved:.3f}", - f"[{change_color}]{improvement:+.3f}[/{change_color}]", + f"[{row_color}]{improvement:+.3f}[/{row_color}]", ) + if use_cl_primary: + baseline_cl = int(sum(cached_baseline_cl_per_example)) + evolved_cl = int(sum(evolved_cl_per_example)) + table.add_row( + "Closed-loop (behavioral)", + f"{baseline_cl} tasks", + f"{evolved_cl} tasks", + f"[{row_color}]{evolved_cl - baseline_cl:+d} tasks[/{row_color}]", + ) table.add_row( "Description Size", f"{baseline_chars:,} chars", @@ -1188,7 +1209,12 @@ def evolve( ) console.print(f" --apply: wrote evolved description to {manifest_path}") - if improvement > 0: + if use_cl_primary: + console.print( + f"\n[bold green]✓ Evolution improved tool description " + f"(CL gained +{decision_payload['cl_tasks_gained']} tasks)[/bold green]" + ) + elif improvement > 0: console.print( f"\n[bold green]✓ Evolution improved tool description by " f"{improvement:+.3f} ({improvement/max(0.001, avg_baseline)*100:+.1f}%)[/bold green]" diff --git a/tests/skills/test_evolve_skill_cl_aware_gate.py b/tests/skills/test_evolve_skill_cl_aware_gate.py index 421352f..b285c9d 100644 --- a/tests/skills/test_evolve_skill_cl_aware_gate.py +++ b/tests/skills/test_evolve_skill_cl_aware_gate.py @@ -905,3 +905,99 @@ def test_skill_v4_payload_fields_preserved_in_v5_cl_primary( f"knee_point keys: {list(knee.keys())}" ) assert isinstance(knee["band_roster"], list) + + +def test_summary_panel_reflects_cl_decision_when_cl_primary_deploys( + skill_dir: Path, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +): + """CL-primary deploy → summary line announces the CL gain instead of + the synthetic delta. Without the CL-aware branch the panel says + 'did not improve' even though gate_decision.json deployed the artifact, + so the operator gets a contradictory signal.""" + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, True, True, True, True, True], + ) + + # _weak_signal_report pins baseline holdout to 0.95; evolved=0.90 + # forces a negative synthetic improvement so the pre-change panel + # would render 'did not improve' even though CL-primary just deployed. + with _patch_stack( + sat_report=_weak_signal_report(), + fake_cache=fake_cache, + evolved_body=_LOW_GROWTH_BODY, + holdout_evolved_mean=0.90, + ): + _run_evolve(skill_dir=skill_dir) + + out = capsys.readouterr().out + assert "CL gained +2" in out, f"missing CL-gain line in summary: {out!r}" + assert "did not improve" not in out, ( + f"synthetic 'did not improve' line leaked through CL-primary deploy: {out!r}" + ) + + +def test_summary_panel_reflects_cl_decision_when_cl_primary_rejects( + skill_dir: Path, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +): + """CL-primary reject → summary line explains the CL shortfall instead + of falling back to the generic synthetic-rejected line.""" + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + # 5/7 → 5/7: zero CL gain, required_gain stays at 1 → reject. + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, True, True, True, False, False], + ) + + with _patch_stack( + sat_report=_weak_signal_report(), + fake_cache=fake_cache, + evolved_body=_LOW_GROWTH_BODY, + ): + _run_evolve(skill_dir=skill_dir) + + out = capsys.readouterr().out + assert "CL gain 0 < required 1" in out, ( + f"missing CL-reject line in summary: {out!r}" + ) + + +def test_summary_panel_uses_synthetic_delta_when_not_cl_primary( + skill_dir: Path, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +): + """healthy band → synthetic gate → existing 'improved/did not improve' + wording is unchanged. Regression guard for the synthetic path. + + _healthy_report() pins baseline holdout to 0.5 (cached per-example); + evolved=0.5 produces a zero synthetic delta that still clears the + non-inferiority gate (within tolerance) so the deploy-path 'did not + improve' line fires — that's the legacy branch we must preserve. + """ + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + + with _patch_stack( + sat_report=_healthy_report(), + fake_cache=fake_cache, + holdout_evolved_mean=0.5, + ): + _run_evolve(skill_dir=skill_dir) + + out = capsys.readouterr().out + assert "did not improve" in out, ( + f"synthetic path must keep 'did not improve' on a zero delta: {out!r}" + ) + assert "CL gained" not in out + assert "CL gain" not in out diff --git a/tests/tools/test_evolve_tool_cl_aware_gate.py b/tests/tools/test_evolve_tool_cl_aware_gate.py index c5dd86f..20db7b3 100644 --- a/tests/tools/test_evolve_tool_cl_aware_gate.py +++ b/tests/tools/test_evolve_tool_cl_aware_gate.py @@ -873,3 +873,90 @@ def test_static_constraint_failure_payload_has_schema_v5_and_decision_signal( assert payload["decision_signal"] == "synthetic" assert payload["decision"] == "reject" assert payload["reason"] == "static_constraint_failure" + + +def test_summary_panel_reflects_cl_decision_when_cl_primary_deploys( + temp_manifest: Path, tmp_path: Path, capsys: pytest.CaptureFixture, +): + """CL-primary deploy → summary line announces the CL gain instead of + the synthetic delta. Without the CL-aware branch the panel says + 'did not improve' even though gate_decision.json deployed the artifact, + so the operator gets a contradictory signal.""" + fake_cache = MagicMock() + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, True, True, True, True, True], + ) + run_dir = tmp_path / "run" + + # _weak_signal_report pins baseline holdout to 0.95; evolved=0.90 + # forces a negative synthetic improvement so the pre-change panel + # would render 'did not improve' even though CL-primary just deployed. + with _patch_stack( + sat_report=_weak_signal_report(), + fake_cache=fake_cache, + evolved_description=_LOW_GROWTH_EVOLVED, + holdout_evolved_mean=0.90, + ): + _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + out = capsys.readouterr().out + assert "CL gained +2" in out, f"missing CL-gain line in summary: {out!r}" + assert "did not improve" not in out, ( + f"synthetic 'did not improve' line leaked through CL-primary deploy: {out!r}" + ) + + +def test_summary_panel_reflects_cl_decision_when_cl_primary_rejects( + temp_manifest: Path, tmp_path: Path, capsys: pytest.CaptureFixture, +): + """CL-primary reject → summary line explains the CL shortfall instead + of falling back to the generic synthetic-rejected line.""" + fake_cache = MagicMock() + # 5/7 → 5/7: zero CL gain, required_gain stays at 1 → reject. + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, True, True, True, False, False], + ) + run_dir = tmp_path / "run" + + with _patch_stack( + sat_report=_weak_signal_report(), + fake_cache=fake_cache, + evolved_description=_LOW_GROWTH_EVOLVED, + ): + _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + out = capsys.readouterr().out + assert "CL gain 0 < required 1" in out, ( + f"missing CL-reject line in summary: {out!r}" + ) + + +def test_summary_panel_uses_synthetic_delta_when_not_cl_primary( + temp_manifest: Path, tmp_path: Path, capsys: pytest.CaptureFixture, +): + """healthy band → synthetic gate → existing 'improved/did not improve' + wording is unchanged. Regression guard for the synthetic path. + + _healthy_report() pins baseline holdout to 0.5 (cached per-example); + evolved=0.5 produces a zero synthetic delta that still clears the + non-inferiority gate (within tolerance) so the deploy-path 'did not + improve' line fires — that's the legacy branch we must preserve. + """ + fake_cache = MagicMock() + run_dir = tmp_path / "run" + + with _patch_stack( + sat_report=_healthy_report(), + fake_cache=fake_cache, + holdout_evolved_mean=0.5, + ): + _run_evolve(manifest_path=temp_manifest, output_dir=run_dir) + + out = capsys.readouterr().out + assert "did not improve" in out, ( + f"synthetic path must keep 'did not improve' on a zero delta: {out!r}" + ) + assert "CL gained" not in out + assert "CL gain" not in out