diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index 1e7994b9..ee4fb0c9 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -8,6 +8,7 @@ import difflib import json import logging +import math import random import sys import time @@ -41,7 +42,11 @@ resolved_lms_dump, ) from evolution.core.quality_gate import ( + CL_PRIMARY_GROWTH_FREE_THRESHOLD, + CL_PRIMARY_GROWTH_SLOPE, + CL_PRIMARY_SYNTH_TOLERANCE, QUALITY_GATE_PRESETS, + _check_cl_primary_gate, resolve_proposer_mode, run_benchmark_hook, write_cost_ceiling_abort, @@ -61,6 +66,7 @@ from evolution.core.stats import paired_bootstrap from evolution.core.fitness import LLMJudge, make_skill_fitness_metric from evolution.core.constraints import ( + ConstraintResult, ConstraintValidator, effective_absolute_char_ceiling, resolve_decision_rule, @@ -905,7 +911,11 @@ def evolve( if closed_loop_in_valset: valset = valset + behavioral_examples - cached_baseline_holdout_per_example = None + cached_baseline_holdout_per_example: Optional[list[float]] = None + preflight_band: Optional[str] = None + cached_baseline_cl_per_example: Optional[list[float]] = None + preflight_holdout_score: Optional[float] = None + preflight_cl_score: Optional[float] = None if not skip_saturation_check: holdout_examples_for_preflight = dataset.to_dspy_examples("holdout") sat_report = saturation_preflight( @@ -937,6 +947,13 @@ def evolve( else: render_saturation_panel(sat_report, console=console) cached_baseline_holdout_per_example = sat_report.holdout_per_example + # Preserve preflight outputs for the deploy gate's CL-primary + # path. All None on the --no-saturation-check path (initialized + # above the preflight branch). 
+ preflight_band = sat_report.band + cached_baseline_cl_per_example = sat_report.closed_loop_per_example + preflight_holdout_score = sat_report.holdout_score + preflight_cl_score = sat_report.closed_loop_score console.print(f"\n[bold cyan]Running GEPA optimization (budget={gepa_budget})...[/bold cyan]\n") @@ -1028,9 +1045,10 @@ def evolve( failed_path = output_dir / "evolved_FAILED.md" failed_path.write_text(evolved_full) write_gate_decision(output_dir, { - "schema_version": "4", + "schema_version": "5", "decision": "reject", "reason": "static_constraint_failure", + "decision_signal": "synthetic", "failed_constraints": [c.constraint_name for c in static_constraints if not c.passed], "messages": [c.message for c in static_constraints if not c.passed], "knee_point": _knee_point_payload(knee_pick), @@ -1076,6 +1094,146 @@ def evolve( ) improvement = avg_evolved - avg_baseline + # Decide which deploy-gate path applies. CL-primary fires when + # the preflight saw weak_signal AND CL data is present. All + # other cases (no preflight, healthy/no_headroom/uniform_failure + # bands, missing CL data) use the synthetic-only path. + baseline_chars = len(skill["raw"]) + evolved_chars = len(evolved_full) + growth_pct = (evolved_chars - baseline_chars) / max(1, baseline_chars) + + # Hoist run_inputs to a local — referenced from 3 sites (the + # two CL-primary abort paths + the main decision_payload). 
+ run_inputs = { + "seed": config.seed, + "iterations": iterations, + "optimizer_model": optimizer_model, + "reflection_model": config.reflection_model, + "eval_model": config.eval_model, + "resolved_lms": resolved_lms_dump( + optimizer=optimizer_model, + reflection=config.reflection_model, + eval_=config.eval_model, + ), + "eval_dataset_size": config.eval_dataset_size, + "holdout_ratio": config.holdout_ratio, + "quality_gate_preset": quality_gate, + "eval_source": eval_source, + } + + use_cl_primary = ( + preflight_band == "weak_signal" + and cached_baseline_cl_per_example is not None + and len(cached_baseline_cl_per_example) > 0 + and closed_loop_cache is not None + ) + + evolved_cl_report = None + evolved_cl_per_example: Optional[list[float]] = None + evolved_cl_errored_task_ids: list[str] = [] + cl_eval_cost_before: float = 0.0 + cl_eval_cost_usd: Optional[float] = None + cl_constraint: Optional[ConstraintResult] = None + + if use_cl_primary: + console.print( + f"\n[bold]Evaluating evolved skill body on closed-loop suite[/bold] " + "(weak_signal band → CL-primary gate)" + ) + cl_eval_cost_before = COST_LEDGER.summary().get("total_usd", 0.0) + try: + # force_run takes the BODY (no YAML frontmatter); the cache + # key was set up with skill["body"] during preflight, so we + # must match that to avoid silently double-spending on the + # evolved eval. 
+ evolved_cl_report = closed_loop_cache.force_run(evolved_body) + except Exception as exc: # ValidatorError or downstream + cl_eval_cost_usd = COST_LEDGER.summary().get("total_usd", 0.0) - cl_eval_cost_before + console.print( + f"[red]✗ Evolved closed-loop eval failed: {exc}[/red] — writing aborted decision" + ) + failed_path = output_dir / "evolved_FAILED.md" + failed_path.write_text(evolved_full) + console.print(f" Saved failed variant to {failed_path}") + write_gate_decision(output_dir, { + "schema_version": "5", + "decision": "aborted", + "reason": "cl_eval_failed", + "decision_signal": "closed_loop", + "cl_eval_exception": str(exc), + "evolved_cl_eval_cost_usd": cl_eval_cost_usd, + "band_trigger_score": { + "holdout": preflight_holdout_score, + "closed_loop": preflight_cl_score, + }, + "validator_agent_model": closed_loop_agent_model, + "baseline_chars": baseline_chars, + "evolved_chars": evolved_chars, + "growth_pct": growth_pct, + "knee_point": _knee_point_payload(knee_pick), + "dataset": _dataset_payload(dataset), + "run_inputs": run_inputs, + }) + return + cl_eval_cost_usd = COST_LEDGER.summary().get("total_usd", 0.0) - cl_eval_cost_before + + # Detect abstained tasks (TaskResult.abstained == True means + # the runner errored — see validation/report.py:score_task). + # An infrastructure flake on an evolved task is NOT a quality + # regression; conflating them would falsely reject good + # candidates. Hard-fail with a written diagnostic instead. 
+ evolved_cl_errored_task_ids = [ + t.task_id for t in evolved_cl_report.evolved.tasks if t.abstained + ] + evolved_cl_per_example = [ + 1.0 if t.passed else 0.0 for t in evolved_cl_report.evolved.tasks + ] + if evolved_cl_errored_task_ids: + console.print( + f"[red]✗ {len(evolved_cl_errored_task_ids)} evolved CL task(s) errored " + f"({', '.join(evolved_cl_errored_task_ids)}) — writing aborted decision[/red]" + ) + failed_path = output_dir / "evolved_FAILED.md" + failed_path.write_text(evolved_full) + console.print(f" Saved failed variant to {failed_path}") + write_gate_decision(output_dir, { + "schema_version": "5", + "decision": "aborted", + "reason": "cl_eval_incomplete", + "decision_signal": "closed_loop", + "evolved_closed_loop_errored_tasks": evolved_cl_errored_task_ids, + "evolved_closed_loop_per_example": evolved_cl_per_example, + "baseline_closed_loop_per_example": cached_baseline_cl_per_example, + "evolved_cl_eval_cost_usd": cl_eval_cost_usd, + "band_trigger_score": { + "holdout": preflight_holdout_score, + "closed_loop": preflight_cl_score, + }, + "validator_agent_model": closed_loop_agent_model, + "baseline_chars": baseline_chars, + "evolved_chars": evolved_chars, + "growth_pct": growth_pct, + "knee_point": _knee_point_payload(knee_pick), + "dataset": _dataset_payload(dataset), + "run_inputs": run_inputs, + }) + return + + baseline_cl_passes = int(sum(cached_baseline_cl_per_example)) + evolved_cl_passes = int(sum(evolved_cl_per_example)) + cl_constraint = _check_cl_primary_gate( + baseline_cl_passes=baseline_cl_passes, + evolved_cl_passes=evolved_cl_passes, + baseline_synth_mean=avg_baseline, + evolved_synth_mean=avg_evolved, + growth_pct=growth_pct, + ) + icon = "✓" if cl_constraint.passed else "✗" + color = "green" if cl_constraint.passed else "red" + console.print( + f" [{color}]{icon} cl_primary_gate[/{color}]: {cl_constraint.message}" + ) + if evaluate_band_on_holdout and knee_pick is not None: console.print( f"\n[bold]Re-evaluating 
{knee_pick.band_size} band candidate(s) on holdout[/bold] " @@ -1100,9 +1258,26 @@ def evolve( n_resamples=config.bootstrap_n_resamples, seed=config.seed, ) - growth_constraints = validator.validate_growth_with_quality( - evolved_full, skill["raw"], bootstrap, - ) + if use_cl_primary: + # CL-primary path: skip the synthetic growth_quality_gate + # (it would always reject when synth is saturated and growth > 0). + # But still enforce the absolute_char_ceiling — that's an + # orthogonal wallpaper-protection backstop that must hold + # regardless of which signal we're gating on. + # cl_constraint was bound in the earlier `if use_cl_primary:` block; + # the assert narrows Optional[ConstraintResult] so growth_constraints + # types as list[ConstraintResult], not list[Optional[ConstraintResult]]. + assert cl_constraint is not None + ceiling_constraint = validator._check_absolute_chars( + evolved_full, baseline_chars, + ) + growth_constraints = [cl_constraint, ceiling_constraint] + else: + # Synthetic-only path (unchanged): growth_quality_gate runs both + # the growth curve and the absolute-char ceiling internally. + growth_constraints = validator.validate_growth_with_quality( + evolved_full, skill["raw"], bootstrap, + ) growth_pass = True for c in growth_constraints: icon = "✓" if c.passed else "✗" @@ -1135,7 +1310,9 @@ def evolve( evolved_path.unlink(missing_ok=True) baseline_path.unlink(missing_ok=True) - growth_pct = (len(evolved_full) - len(skill["raw"])) / max(1, len(skill["raw"])) + # baseline_chars / evolved_chars / growth_pct are bound earlier + # (before the use_cl_primary branch) so the CL-primary path can + # use them in its abort payloads. Don't recompute here. 
required_improvement = max( 0.0, config.growth_quality_slope * (growth_pct - config.growth_free_threshold), @@ -1149,19 +1326,20 @@ def evolve( else: decision_reason = "growth_quality_gate" decision_payload = { - "schema_version": "4", + "schema_version": "5", "decision": "deploy" if growth_pass else "reject", "reason": decision_reason, + "decision_signal": "closed_loop" if use_cl_primary else "synthetic", "decision_rule_used": decision_rule_used, "gate_mode": config.gate_mode, "inferiority_tolerance": config.inferiority_tolerance, "growth_pct": growth_pct, "required_improvement": required_improvement, - "baseline_chars": len(skill["raw"]), - "evolved_chars": len(evolved_full), + "baseline_chars": baseline_chars, + "evolved_chars": evolved_chars, "absolute_char_ceiling": config.max_absolute_chars, "effective_absolute_char_ceiling": effective_absolute_char_ceiling( - config.max_absolute_chars, len(skill["raw"]), + config.max_absolute_chars, baseline_chars, ), "growth_free_threshold": config.growth_free_threshold, "fitness_profile": config.fitness_profile, @@ -1179,25 +1357,42 @@ def evolve( "messages": [c.message for c in growth_constraints if not c.passed], "knee_point": _knee_point_payload(knee_pick), "dataset": _dataset_payload(dataset), - "run_inputs": { - "seed": config.seed, - "iterations": iterations, - "optimizer_model": optimizer_model, - "reflection_model": config.reflection_model, - "eval_model": config.eval_model, - "resolved_lms": resolved_lms_dump( - optimizer=optimizer_model, - reflection=config.reflection_model, - eval_=config.eval_model, - ), - "eval_dataset_size": config.eval_dataset_size, - "holdout_ratio": config.holdout_ratio, - "quality_gate_preset": quality_gate, - "eval_source": eval_source, - }, + "run_inputs": run_inputs, } if benchmark_block is not None: decision_payload["benchmark"] = benchmark_block + + if use_cl_primary: + decision_payload["baseline_closed_loop_per_example"] = cached_baseline_cl_per_example + 
decision_payload["evolved_closed_loop_per_example"] = evolved_cl_per_example + decision_payload["evolved_closed_loop_errored_tasks"] = [] # populated only on abort path + decision_payload["cl_tasks_gained"] = ( + int(sum(evolved_cl_per_example)) - int(sum(cached_baseline_cl_per_example)) + ) + decision_payload["cl_required_gain"] = max( + 1, + math.ceil( + max(0.0, CL_PRIMARY_GROWTH_SLOPE * (growth_pct - CL_PRIMARY_GROWTH_FREE_THRESHOLD)) + ), + ) + decision_payload["synthetic_sanity_check"] = { + "tolerance": CL_PRIMARY_SYNTH_TOLERANCE, + "baseline_mean": avg_baseline, + "evolved_mean": avg_evolved, + "passed": (avg_evolved - avg_baseline) >= -CL_PRIMARY_SYNTH_TOLERANCE, + } + decision_payload["evolved_cl_eval_cost_usd"] = cl_eval_cost_usd + decision_payload["band_trigger_score"] = { + "holdout": preflight_holdout_score, + "closed_loop": preflight_cl_score, + } + decision_payload["validator_agent_model"] = closed_loop_agent_model + + if not use_cl_primary and preflight_band is None: + # User passed --no-saturation-check; record why CL-primary + # didn't fire even though CL may be configured. + decision_payload["reason_synthetic"] = "preflight_skipped" + gate_path = write_gate_decision(output_dir, decision_payload) console.print(f" [dim]Gate decision logged to {gate_path}[/dim]") @@ -1309,6 +1504,7 @@ def evolve( "quality_gate_preset": quality_gate, "eval_source": eval_source, }, + schema_version="5", ) return finally: diff --git a/tests/fixtures/skills/weakened-systematic-debugging/SKILL.md b/tests/fixtures/skills/weakened-systematic-debugging/SKILL.md new file mode 100644 index 00000000..ba6ba45a --- /dev/null +++ b/tests/fixtures/skills/weakened-systematic-debugging/SKILL.md @@ -0,0 +1,69 @@ +--- +name: weakened-systematic-debugging +description: "Diagnose Python bugs and produce a written diagnostic report." 
+version: 0.1.0 +metadata: + hermes: + tags: [debugging, diagnosis, analysis, code-review] +--- + +# Python Bug Diagnostician + +## Overview + +This skill produces a **diagnostic report** for a buggy Python file. +Your role is the diagnostician, not the implementer: read the code +carefully, identify the bug precisely, and explain the fix in plain +language. The user (or a downstream agent) applies the change. + +## When to Use + +Use when given a buggy Python file or failing test where the user wants +to understand *what is wrong* before any code is changed. + +## Diagnostic Process + +### 1. Read the source + +Use `read_file` on the source file(s) involved. Read the full contents, +not just the named function. Read any associated test files. + +### 2. Identify the bug + +State precisely: +- Which line contains the bug +- What the current behavior is +- What the intended behavior is (per the docstring or test) +- Why the current code produces the wrong behavior + +### 3. Specify the fix + +Describe the change in words: "Replace `x * 2` on line 4 with `x * x`." +Include the literal old and new code in the report. + +### 4. Stop + +Do **not** call `write_file` or edit the source. Do **not** run the test +to verify. Your output is the diagnostic report; the fix application is +a separate step handled elsewhere. + +## Why diagnosis-only + +Separating diagnosis from implementation produces clearer bug reports, +keeps the audit trail clean, and prevents premature edits that mask the +real root cause. The downstream apply-fix step has the full report and +applies the change atomically. + +## Report Format + +``` +BUG LOCATION: <file>:<line> +CURRENT BEHAVIOR: <what the code does now> +INTENDED BEHAVIOR: <what the docstring or test requires> +ROOT CAUSE: <why the current code produces the wrong behavior> +PROPOSED FIX: + - Old: <literal old code> + - New: <literal new code> +``` + +End your response with the report. Do not edit files. Do not run tests. 
diff --git a/tests/skills/test_evolve_skill_cl_aware_gate.py b/tests/skills/test_evolve_skill_cl_aware_gate.py new file mode 100644 index 00000000..421352fc --- /dev/null +++ b/tests/skills/test_evolve_skill_cl_aware_gate.py @@ -0,0 +1,907 @@ +"""Integration tests for the skill-side deploy-gate CL-aware branch. + +Symmetric to tests/tools/test_evolve_tool_cl_aware_gate.py — mocks the +synthetic dataset builder + closed-loop cache so each test can pin a +saturation band and verify the deploy gate's branch behaviour plus +``gate_decision.json`` shape. No real LM calls. + +Tests 1-10 mirror the tool-side suite. Tests 11-13 cover skill-specific +invariants: + + 11. force_run is called with the skill BODY (not the full + frontmatter+body file). Guards against the cache-key-mismatch + silent failure where the evolved variant would be re-validated + under a different key, double-spending ~$1-3 per run. + 12. abort paths produce ``evolved_FAILED.md`` (not ``.json``). The + skill-side convention matches how baseline/evolved are written + so post-run diff tooling continues to work. + 13. v4 skill-specific payload fields (``bap_max_growth``, + ``bap_safety_margin``, ``eval_source``, ``fitness_profile``, + ``proposer_mode``, ``knee_point.band_roster``) survive the v5 + bump in the CL-primary path. +""" + +from __future__ import annotations + +import contextlib +import json +from pathlib import Path +from types import SimpleNamespace +from typing import Optional +from unittest.mock import MagicMock, patch + +import pytest + +from evolution.core.dataset_builder import EvalDataset, EvalExample +from evolution.core.saturation_check import SaturationReport +from evolution.skills.evolve_skill import evolve +from evolution.skills.knee_point import CandidatePick +from evolution.validation.report import ( + PhaseResult, + TaskResult, + ValidationReport, + WinLoss, +) + + +# Demo SKILL.md used as the baseline. Kept tiny + stable so the +# growth-pct math in every test is predictable. 
Lengths: +# frontmatter (between ---s) = "name: demo-skill\ndescription: a test skill" (42 chars) +# body (after the second ---) = "Do X." (5 chars) +# raw (full file content) = 58 chars +# After reassemble_skill: "---\n{frontmatter}\n---\n\n{body}\n" = 53 + len(body) chars. +_SKILL_FRONTMATTER = "name: demo-skill\ndescription: a test skill" +_BASELINE_BODY = "Do X." +_BASELINE_RAW = f"---\n{_SKILL_FRONTMATTER}\n---\n\n{_BASELINE_BODY}\n" +assert len(_BASELINE_RAW) == 58, ( + f"Test pre-condition: baseline raw must be 58 chars, got {len(_BASELINE_RAW)}" +) + + +@pytest.fixture +def skill_dir(tmp_path: Path) -> Path: + """Write a minimal SKILL.md so skill discovery succeeds.""" + skills_root = tmp_path / "skills" + skill_path = skills_root / "demo-skill" + skill_path.mkdir(parents=True) + (skill_path / "SKILL.md").write_text(_BASELINE_RAW) + return skills_root + + +def _fake_skill_dataset(n: int = 50) -> EvalDataset: + """Build a real-shaped EvalDataset with n fake examples (no LM calls). + + Mirrors the helper in test_evolve_skill_saturation_preflight.py. + Default n=50 yields 30/10/10 splits — holdout must be ≥ + EvolutionConfig.min_holdout_size (default 10) or evolve() aborts + before the deploy-gate branch even runs. + """ + examples = [ + EvalExample(task_input=f"task {i}", expected_behavior=f"rubric {i}") + for i in range(n) + ] + return EvalDataset( + train=examples[:30], val=examples[30:40], holdout=examples[40:50], + ) + + +def _fake_validation_report( + *, + baseline_pass: list[bool], + evolved_pass: list[bool], + evolved_abstain: Optional[list[bool]] = None, +) -> ValidationReport: + """Build a ValidationReport with the given per-task verdicts. + + Mirrors what ClosedLoopFeedbackCache.force_run returns; ``evolved`` + is the only phase the deploy-gate branch actually reads (it pulls + baseline pass-counts from the cached preflight data). Skill-side + suites score via test_command rather than tool_calls_seq, so we + leave tool_calls_seq empty. 
+ """ + n = len(baseline_pass) + evolved_abstain = evolved_abstain or [False] * n + assert len(evolved_pass) == n + assert len(evolved_abstain) == n + + baseline_tasks = [ + TaskResult( + task_id=f"task_{i}", + passed=p, + abstained=False, + tool_calls_seq=[], + duration_seconds=0.1, + ) + for i, p in enumerate(baseline_pass) + ] + evolved_tasks = [ + TaskResult( + task_id=f"task_{i}", + passed=p, + abstained=a, + tool_calls_seq=[], + duration_seconds=0.1, + error="runner timeout" if a else None, + ) + for i, (p, a) in enumerate(zip(evolved_pass, evolved_abstain)) + ] + + def _phase(tasks: list[TaskResult]) -> PhaseResult: + n_p = sum(1 for t in tasks if t.passed and not t.abstained) + n_f = sum(1 for t in tasks if not t.passed and not t.abstained) + n_a = sum(1 for t in tasks if t.abstained) + scored = n_p + n_f + return PhaseResult( + pass_rate=(n_p / scored) if scored else 0.0, + n_passed=n_p, + n_failed=n_f, + n_abstained=n_a, + tasks=tasks, + ) + + return ValidationReport( + schema_version="1", + tool="demo-skill", + task_suite_path="fake_suite.jsonl", + task_suite_sha256="0" * 64, + baseline=_phase(baseline_tasks), + evolved=_phase(evolved_tasks), + delta=WinLoss( + n_wins=0, n_losses=0, n_ties=n, pass_rate_change=0.0, + ), + decision="pass", + decision_reasons=[], + ) + + +def _make_knee_pick(evolved_body: str) -> CandidatePick: + """Build a CandidatePick that select_knee_point would return. + + ``skill_text`` IS the evolved body (no frontmatter). evolve_skill.py + then reassembles the full file via reassemble_skill(frontmatter, body) + for the static checks, but force_run() is called with the body alone. 
+ """ + fake_module = MagicMock() + fake_module.skill_text = evolved_body + return CandidatePick( + module=fake_module, + skill_text=evolved_body, + body_chars=len(evolved_body), + val_score=0.8, + val_rank_in_band=1, + band_size=1, + epsilon=0.1, + fallback="knee", + picked_idx=0, + gepa_default_idx=0, + gepa_default_body_chars=len(evolved_body), + band_roster=[], + ) + + +def _make_fake_gepa(evolved_body: str): + """Build a fake dspy.GEPA whose ``compile()`` returns a module with + the detailed_results shape the knee-point path expects.""" + + class _FakeGEPA: + def __init__(self, **kwargs): + self.kwargs = kwargs + + def compile(self, baseline_module, *, trainset, valset): + fake_module = MagicMock() + fake_module.detailed_results = SimpleNamespace( + candidates=[fake_module], + val_aggregate_scores=[1.0], + best_idx=0, + ) + fake_module.skill_text = evolved_body + return fake_module + + return _FakeGEPA + + +# A few body strings hand-picked to keep growth_pct in the zones the +# tests need. baseline raw = 58. evolved_full = 53 + len(body). +# +# _LOW_GROWTH_BODY: growth_pct ≈ 3.4% → required_gain=1 → a +2 CL win +# clears CL-primary. evolved_full = 53 + 7 = 60, growth = (60-58)/58 = 3.45%. +_LOW_GROWTH_BODY = "Find X." # 7 chars; under the 0.20 growth-free threshold. + +# Default body for tests that don't care about growth: stays under the +# default non-inferiority static_ceiling and keeps the structure intact. +_EVOLVED_BODY = "Do X better." # 12 chars. + + +@contextlib.contextmanager +def _patch_stack( + *, + sat_report: SaturationReport, + fake_cache: Optional[MagicMock], + holdout_baseline_mean: float = 0.95, + holdout_evolved_mean: float = 0.96, + holdout_n: int = 10, + evolved_body: str = _EVOLVED_BODY, +): + """Single context manager wrapping every seam patch each test needs. + + Tests stay focused on the band/cache/assertion they're verifying. 
+ """ + fake_builder = MagicMock() + fake_builder.generate.return_value = _fake_skill_dataset() + knee_pick = _make_knee_pick(evolved_body) + evolved_per = [holdout_evolved_mean] * holdout_n + + def _maybe_build(**kwargs): + # Honour the real "no suite path → no cache" contract; if a test + # forgets to pass a suite path the use_cl_primary branch can't + # fire (None cache) instead of getting a confusingly-active mock. + if kwargs.get("suite_path") is None: + return None + return fake_cache + + with contextlib.ExitStack() as stack: + stack.enter_context(patch( + "evolution.skills.evolve_skill.SyntheticDatasetBuilder", + return_value=fake_builder, + )) + stack.enter_context(patch( + "evolution.skills.evolve_skill.saturation_preflight", + return_value=sat_report, + )) + stack.enter_context(patch( + "evolution.skills.evolve_skill._preflight_lm_credentials", + )) + stack.enter_context(patch( + "evolution.skills.evolve_skill._maybe_build_closed_loop_cache_skill", + side_effect=_maybe_build, + )) + stack.enter_context(patch( + "evolution.skills.evolve_skill.dspy.GEPA", + new=_make_fake_gepa(evolved_body), + )) + stack.enter_context(patch( + "evolution.skills.evolve_skill.select_knee_point", + return_value=knee_pick, + )) + stack.enter_context(patch( + "evolution.skills.evolve_skill._holdout_evaluate_with_metric", + return_value=(holdout_evolved_mean, evolved_per), + )) + # In headless test envs stdin is non-TTY. For non-healthy bands + # the orchestrator otherwise sys.exit(3)s before the deploy gate. + stack.enter_context(patch( + "evolution.skills.evolve_skill.is_non_interactive", + return_value=False, + )) + stack.enter_context(patch( + "evolution.skills.evolve_skill.interactive_confirm", + return_value=True, + )) + yield + + +def _run_evolve( + *, + skill_dir: Path, + extra_kwargs: Optional[dict] = None, +): + """Invoke evolve() with the minimum kwargs every test in this module + shares. 
Wraps the long, repetitive call so each test stays focused + on the band/cache/assertion that's actually being exercised. + + output_dir is NOT a kwarg on the skill-side evolve(); the function + hardcodes ``Path("output") / skill_name / timestamp``. Tests + monkeypatch.chdir(tmp_path) before calling, so the output lands + under ``tmp_path/output/demo-skill/<timestamp>/``. + """ + kwargs = dict( + skill_name="demo-skill", + skill_source_dirs=[str(skill_dir)], + iterations=1, + eval_dataset_size=50, + holdout_ratio=0.2, + quality_gate="non-inferiority", + closed_loop_suite_path=Path("/fake/suite.jsonl"), + closed_loop_mode="feedback", + closed_loop_in_valset=False, + closed_loop_agent_model="openai/gpt-5-mini", + max_total_cost_usd=5.0, + skip_preflight=True, + ) + if extra_kwargs: + kwargs.update(extra_kwargs) + return evolve(**kwargs) + + +def _latest_gate_decision(tmp_path: Path) -> dict: + """Find the most-recently-written gate_decision.json under + ``tmp_path/output/demo-skill/<timestamp>/`` and return its payload. + + The skill-side evolve() hardcodes its output path, so tests can't + pin a known location and must enumerate timestamp-named subdirs. 
+ """ + runs_root = tmp_path / "output" / "demo-skill" + assert runs_root.exists(), f"No run output under {runs_root}" + runs = sorted(runs_root.iterdir()) + assert runs, f"No timestamped run dirs under {runs_root}" + payload_path = runs[-1] / "gate_decision.json" + assert payload_path.exists(), f"No gate_decision.json at {payload_path}" + return json.loads(payload_path.read_text()) + + +def _latest_run_dir(tmp_path: Path) -> Path: + runs_root = tmp_path / "output" / "demo-skill" + runs = sorted(runs_root.iterdir()) + return runs[-1] + + +def _weak_signal_report() -> SaturationReport: + """The one band that triggers the CL-aware deploy gate.""" + return SaturationReport( + band="weak_signal", + holdout_score=0.95, + holdout_n=10, + holdout_per_example=[0.95] * 10, + closed_loop_score=5 / 7, + closed_loop_n=7, + # 5/7 baseline pass-rate — the deploy gate reads this list + # verbatim to compute baseline_cl_passes. + closed_loop_per_example=[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0], + suggestions=[], + thresholds={}, + ) + + +def _healthy_report() -> SaturationReport: + """No CL data needed; the band routes through the synthetic gate.""" + return SaturationReport( + band="healthy", + holdout_score=0.5, + holdout_n=10, + holdout_per_example=[0.5] * 10, + closed_loop_score=None, + closed_loop_n=None, + closed_loop_per_example=None, + suggestions=[], + thresholds={}, + ) + + +def _no_headroom_report(*, with_cl_data: bool) -> SaturationReport: + """no_headroom band with optional CL data. CL-primary must NOT fire + on no_headroom regardless of data presence.""" + cl_per = [1.0] * 7 if with_cl_data else None + return SaturationReport( + band="no_headroom", + holdout_score=0.99, + # holdout_n must match the _patch_stack holdout_n (10) so the + # cached baseline list and the post-GEPA evolved list line up + # for paired_bootstrap. 
+ holdout_n=10, + holdout_per_example=[1.0] * 10, + closed_loop_score=1.0 if with_cl_data else None, + closed_loop_n=7 if with_cl_data else None, + closed_loop_per_example=cl_per, + suggestions=["Try a harder suite"], + thresholds={}, + ) + + +# --------------------------------------------------------------------------- +# Tests 1-10: mirror the tool-side suite +# --------------------------------------------------------------------------- + + +def test_weak_signal_band_triggers_evolved_cl_eval( + skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +): + """weak_signal + +2 task win → force_run is called post-GEPA, + decision == deploy, decision_signal == closed_loop, cl_tasks_gained == 2.""" + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + # Baseline preflight per-example is [1]*5 + [0]*2 = 5/7. + # Evolved 7/7 — a +2 task gain that beats required_gain at small + # growth_pct. + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, True, True, True, True, True], + ) + + # _LOW_GROWTH_BODY keeps required_gain at 1 task so the +2 CL win + # clears the cl_primary_gate. 
+ with _patch_stack( + sat_report=_weak_signal_report(), + fake_cache=fake_cache, + evolved_body=_LOW_GROWTH_BODY, + ): + _run_evolve(skill_dir=skill_dir) + + fake_cache.force_run.assert_called_once_with(_LOW_GROWTH_BODY) + + payload = _latest_gate_decision(tmp_path) + assert payload["decision"] == "deploy", ( + f"weak_signal + 5→7 should deploy, got {payload['decision']} " + f"(reason: {payload.get('reason')})" + ) + assert payload["decision_signal"] == "closed_loop" + assert payload["cl_tasks_gained"] == 2 + + +def test_healthy_band_does_not_trigger_cl_aware_gate( + skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +): + """healthy band → CL-primary never fires; gate falls through to + synthetic, force_run is NOT called post-GEPA, no CL fields written.""" + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + + with _patch_stack(sat_report=_healthy_report(), fake_cache=fake_cache): + _run_evolve(skill_dir=skill_dir) + + fake_cache.force_run.assert_not_called() + payload = _latest_gate_decision(tmp_path) + assert payload["decision_signal"] == "synthetic" + for cl_field in ( + "cl_tasks_gained", + "cl_required_gain", + "synthetic_sanity_check", + "baseline_closed_loop_per_example", + "evolved_closed_loop_per_example", + ): + assert cl_field not in payload, ( + f"CL field {cl_field!r} should not be in synthetic-gate payload" + ) + + +def test_no_headroom_falls_through_to_synthetic_gate( + skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +): + """no_headroom + non-empty CL data → CL-primary STILL must NOT fire. 
+ The spec triggers CL-primary only on weak_signal.""" + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + + with _patch_stack( + sat_report=_no_headroom_report(with_cl_data=True), + fake_cache=fake_cache, + ): + _run_evolve( + skill_dir=skill_dir, + extra_kwargs={"force_saturation_check": True}, + ) + + fake_cache.force_run.assert_not_called() + payload = _latest_gate_decision(tmp_path) + assert payload["decision_signal"] == "synthetic" + for cl_field in ( + "cl_tasks_gained", + "cl_required_gain", + "synthetic_sanity_check", + ): + assert cl_field not in payload + + +def test_no_headroom_without_cl_data_falls_through_to_synthetic_gate( + skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +): + """no_headroom + no CL data + --force-saturation-check → synthetic gate + runs without KeyError. CL was never measured, so no CL fields.""" + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + + with _patch_stack( + sat_report=_no_headroom_report(with_cl_data=False), + fake_cache=fake_cache, + ): + _run_evolve( + skill_dir=skill_dir, + extra_kwargs={"force_saturation_check": True}, + ) + + payload = _latest_gate_decision(tmp_path) + assert payload["decision_signal"] == "synthetic" + + +def test_no_saturation_check_falls_through_to_synthetic_with_reason_recorded( + skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +): + """--no-saturation-check → no preflight, falls through to synthetic. + decision_signal == synthetic AND reason_synthetic == preflight_skipped + so downstream consumers can distinguish 'preflight saw nothing weak' + from 'preflight didn't run'.""" + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + + # sat_report is unused (skip_saturation_check=True bypasses preflight) + # but _patch_stack requires one. 
+ with _patch_stack(sat_report=_healthy_report(), fake_cache=fake_cache): + _run_evolve( + skill_dir=skill_dir, + extra_kwargs={"skip_saturation_check": True}, + ) + + payload = _latest_gate_decision(tmp_path) + assert payload["decision_signal"] == "synthetic" + assert payload["reason_synthetic"] == "preflight_skipped" + + +def test_cl_primary_decision_persists_to_gate_decision_json( + skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +): + """weak_signal → all v5 CL fields present in gate_decision.json with + correct types. Pins the JSON contract downstream consumers depend on.""" + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, True, True, True, True, True], + ) + + # _LOW_GROWTH_BODY → required_gain=1 → +2 win clears the gate so + # the deploy path populates every v5 CL field we're pinning here. + with _patch_stack( + sat_report=_weak_signal_report(), + fake_cache=fake_cache, + evolved_body=_LOW_GROWTH_BODY, + ): + _run_evolve(skill_dir=skill_dir) + + payload = _latest_gate_decision(tmp_path) + + assert payload["schema_version"] == "5" + assert payload["decision_signal"] == "closed_loop" + + assert isinstance(payload["baseline_closed_loop_per_example"], list) + assert all( + isinstance(x, (int, float)) + for x in payload["baseline_closed_loop_per_example"] + ) + assert isinstance(payload["evolved_closed_loop_per_example"], list) + assert all( + isinstance(x, (int, float)) + for x in payload["evolved_closed_loop_per_example"] + ) + + assert isinstance(payload["cl_tasks_gained"], int) + assert isinstance(payload["cl_required_gain"], int) + + sanity = payload["synthetic_sanity_check"] + assert isinstance(sanity, dict) + for key in ("tolerance", "baseline_mean", "evolved_mean", "passed"): + assert key in sanity, f"synthetic_sanity_check missing {key!r}" + assert 
isinstance(sanity["tolerance"], (int, float)) + assert isinstance(sanity["baseline_mean"], (int, float)) + assert isinstance(sanity["evolved_mean"], (int, float)) + assert isinstance(sanity["passed"], bool) + + # cost_usd may be None (tests don't exercise the cost ledger), float, + # or int — accept any; we only pin field presence here. + assert "evolved_cl_eval_cost_usd" in payload + cost = payload["evolved_cl_eval_cost_usd"] + assert cost is None or isinstance(cost, (int, float)) + + band_score = payload["band_trigger_score"] + assert isinstance(band_score, dict) + assert "holdout" in band_score + assert "closed_loop" in band_score + + assert isinstance(payload["validator_agent_model"], str) + + +def test_synthetic_only_decision_unchanged_in_gate_decision_json( + skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +): + """healthy → synthetic path. All v4 skill fields present alongside + the new decision_signal marker, no CL fields leak in. + + The v4 skill-specific fields (``bap_max_growth``, ``bap_safety_margin``, + ``eval_source``, ``fitness_profile``, ``proposer_mode``) MUST be + preserved post-v5 bump. + """ + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + + with _patch_stack(sat_report=_healthy_report(), fake_cache=fake_cache): + _run_evolve(skill_dir=skill_dir) + + payload = _latest_gate_decision(tmp_path) + + assert payload["schema_version"] == "5" + assert payload["decision_signal"] == "synthetic" + + # v4-and-earlier fields the synthetic path has always written. + for required in ( + "baseline_per_example", + "evolved_per_example", + "bootstrap", + "growth_pct", + "required_improvement", + "baseline_chars", + "evolved_chars", + "absolute_char_ceiling", + "knee_point", + "dataset", + "run_inputs", + # v4 skill-specific fields (the plan calls these out as needing + # explicit preservation in test 7's assertion). 
+ "bap_max_growth", + "bap_safety_margin", + "fitness_profile", + "proposer_mode", + ): + assert required in payload, f"missing v4 field {required!r}" + + assert payload["run_inputs"]["eval_source"] == "synthetic" + + for cl_field in ( + "cl_tasks_gained", + "cl_required_gain", + "synthetic_sanity_check", + "baseline_closed_loop_per_example", + "evolved_closed_loop_per_example", + "band_trigger_score", + "validator_agent_model", + ): + assert cl_field not in payload, ( + f"CL-only field {cl_field!r} leaked into synthetic-gate payload" + ) + + +def test_force_run_failure_writes_aborted_decision_with_diagnostic_payload( + skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +): + """weak_signal + force_run raises → aborted decision, + reason=cl_eval_failed, exception text recorded, evolved_FAILED.md + written for forensic inspection of the rejected candidate.""" + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + fake_cache.force_run.side_effect = RuntimeError("validator crashed") + + with _patch_stack(sat_report=_weak_signal_report(), fake_cache=fake_cache): + _run_evolve(skill_dir=skill_dir) + + payload = _latest_gate_decision(tmp_path) + assert payload["decision"] == "aborted" + assert payload["reason"] == "cl_eval_failed" + assert "validator crashed" in payload["cl_eval_exception"] + + run_dir = _latest_run_dir(tmp_path) + assert (run_dir / "evolved_FAILED.md").exists(), ( + "evolved_FAILED.md must be written so the rejected variant " + "is inspectable" + ) + + +def test_evolved_task_error_writes_cl_eval_incomplete_decision( + skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +): + """weak_signal + one evolved task abstained → cl_eval_incomplete + (NOT a regression). An infrastructure flake on the evolved phase + isn't evidence of quality loss; conflating them would silently + reject good candidates.""" + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + # task_2 abstains; others pass. 
Without the incomplete-detection + # branch this would score as 6/7 (+1 vs 5/7 baseline) and deploy. + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, False, True, True, True, True], + evolved_abstain=[False, False, True, False, False, False, False], + ) + + with _patch_stack(sat_report=_weak_signal_report(), fake_cache=fake_cache): + _run_evolve(skill_dir=skill_dir) + + payload = _latest_gate_decision(tmp_path) + assert payload["decision"] == "aborted" + assert payload["reason"] == "cl_eval_incomplete" + assert payload["evolved_closed_loop_errored_tasks"] == ["task_2"] + + run_dir = _latest_run_dir(tmp_path) + assert (run_dir / "evolved_FAILED.md").exists() + + +def test_absolute_char_ceiling_still_enforced_in_cl_primary_path( + skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +): + """weak_signal + +2 CL win + evolved body exceeding the absolute + char ceiling → reject. CL-primary mustn't bypass the wallpaper- + protection backstop. + + Pinned production flow: ``validate_static`` at ``evolve_skill.py:1034`` + only runs ``size_limit``/``non_empty``/``skill_structure``; the + ``absolute_char_ceiling`` check lives inside the CL-primary branch at + line 1271 (``validator._check_absolute_chars``) and runs AFTER + ``force_run``. So this test exercises the in-branch ceiling check — + the rejection carries ``decision_signal: "closed_loop"`` and the CL + cache must have been consulted for the evolved body. + + Baseline raw = 58 chars. With max_absolute_chars=50, the effective + ceiling = max(50, 1.5*58) = 87. evolved_full = 53 + len(body) chars, + so a 200-char body produces a 253-char evolved_full — trips the + ceiling. Body stays under config.max_skill_size so non-ceiling + static checks still pass. 
+ """ + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, True, True, True, True, True], + ) + long_body = ( + "Find files in the repository by name pattern or glob; returns " + "matching file paths from anywhere under the project root." + ) * 2 + assert len(long_body) > 87 - 53, ( + f"Test pre-condition: long_body={len(long_body)} must trip the ceiling" + ) + + with _patch_stack( + sat_report=_weak_signal_report(), + fake_cache=fake_cache, + evolved_body=long_body, + ): + _run_evolve( + skill_dir=skill_dir, + extra_kwargs={"max_absolute_chars": 50}, + ) + + payload = _latest_gate_decision(tmp_path) + assert payload["decision"] == "reject", ( + f"absolute_char_ceiling must reject even on a winning CL gate; " + f"got decision={payload['decision']} (reason={payload.get('reason')})" + ) + assert "absolute_char_ceiling" in payload.get("failed_constraints", []), ( + f"failed_constraints={payload.get('failed_constraints')}" + ) + # Pin the actual code path: rejection comes from the in-branch + # _check_absolute_chars at evolve_skill.py:1271 (NOT the early + # validate_static at line 1034, which doesn't include the ceiling). + # That means CL eval already ran and the signal is "closed_loop". + assert payload["decision_signal"] == "closed_loop", ( + f"CL-primary ceiling reject must emit decision_signal='closed_loop'; " + f"got {payload.get('decision_signal')!r} — did the ceiling check move " + f"out of the CL-primary branch?" 
+ ) + fake_cache.force_run.assert_called_once_with(long_body) + + +# --------------------------------------------------------------------------- +# Tests 11-13: skill-specific guards +# --------------------------------------------------------------------------- + + +def test_force_run_called_with_skill_body_not_full( + skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +): + """force_run must be called with the BODY (no YAML frontmatter), not + with the full evolved file (frontmatter + body). + + The closed-loop cache keys its memoisation on the artifact text. The + preflight populated the cache with ``skill["body"]``; if the post-GEPA + eval site passes ``evolved_full`` instead, the cache key won't match + and the validator silently double-spends ~$1-3 per run. + + This is the highest-value guard in the file — the failure mode is + silent: the run still produces a decision, no error surfaces, but + cost ledger 2x's and the CL "cache hit" telemetry goes haywire. + """ + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, True, True, True, True, True], + ) + + with _patch_stack( + sat_report=_weak_signal_report(), + fake_cache=fake_cache, + evolved_body=_LOW_GROWTH_BODY, + ): + _run_evolve(skill_dir=skill_dir) + + # The single, exact assertion this test exists to make: force_run + # receives the body string only, never the full frontmatter+body file. 
+ fake_cache.force_run.assert_called_once_with(_LOW_GROWTH_BODY) + call_arg = fake_cache.force_run.call_args.args[0] + assert "---" not in call_arg, ( + f"force_run received the full frontmatter+body file (cache-key " + f"mismatch bug): {call_arg!r}" + ) + assert "name:" not in call_arg, ( + f"force_run received YAML frontmatter, not body alone: {call_arg!r}" + ) + + +def test_evolved_failed_md_written_not_json( + skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +): + """Abort paths produce ``evolved_FAILED.md`` (not ``.json``). + + The tool-side equivalent writes ``evolved_FAILED.json`` because tool + manifests are JSON. Skills are markdown files, and post-run diff + tooling reads ``evolved_FAILED.md`` to compare against ``baseline_skill.md``. + A silent rename to ``.json`` would break that workflow. + """ + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + fake_cache.force_run.side_effect = RuntimeError("validator crashed") + + with _patch_stack(sat_report=_weak_signal_report(), fake_cache=fake_cache): + _run_evolve(skill_dir=skill_dir) + + run_dir = _latest_run_dir(tmp_path) + assert (run_dir / "evolved_FAILED.md").exists(), ( + f"evolved_FAILED.md missing — got {list(run_dir.iterdir())}" + ) + assert not (run_dir / "evolved_FAILED.json").exists(), ( + "evolved_FAILED.json must NOT exist on skill-side aborts; " + "skill convention is .md (matches baseline_skill.md). If someone " + "intentionally added .json support, update this test deliberately." + ) + + # Verify the .md is the full reassembled file (frontmatter + body), + # not the body alone — diff tooling expects evolved_FAILED.md to be + # directly diffable against baseline_skill.md. 
+ failed_text = (run_dir / "evolved_FAILED.md").read_text() + assert failed_text.startswith("---"), ( + f"evolved_FAILED.md should include frontmatter for diff parity, " + f"got {failed_text[:80]!r}" + ) + assert "name: demo-skill" in failed_text + + +def test_skill_v4_payload_fields_preserved_in_v5_cl_primary( + skill_dir: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +): + """Schema regression: ``bap_max_growth``, ``bap_safety_margin``, + ``eval_source``, ``fitness_profile``, ``proposer_mode``, and + ``knee_point.band_roster`` all present in v5 CL-primary output. + + These are the v4 skill-specific fields downstream calibration scripts + read. Future schema bumps must keep them populated. + """ + monkeypatch.chdir(tmp_path) + fake_cache = MagicMock() + fake_cache.force_run.return_value = _fake_validation_report( + baseline_pass=[True, True, True, True, True, False, False], + evolved_pass=[True, True, True, True, True, True, True], + ) + + with _patch_stack( + sat_report=_weak_signal_report(), + fake_cache=fake_cache, + evolved_body=_LOW_GROWTH_BODY, + ): + _run_evolve(skill_dir=skill_dir) + + payload = _latest_gate_decision(tmp_path) + assert payload["schema_version"] == "5" + assert payload["decision_signal"] == "closed_loop" + + # Skill-specific v4 payload fields — must persist across v5. + for field in ( + "bap_max_growth", + "bap_safety_margin", + "fitness_profile", + "proposer_mode", + ): + assert field in payload, ( + f"v4 skill field {field!r} missing in v5 CL-primary payload" + ) + assert payload[field] is not None, ( + f"v4 skill field {field!r} present but null in v5 CL-primary payload" + ) + + # eval_source is nested under run_inputs (not at top level). + assert payload["run_inputs"]["eval_source"] == "synthetic" + + # knee_point.band_roster must serialise as a list (empty here; the + # downstream calibration script accesses it via .get('band_roster', [])). 
+ knee = payload["knee_point"] + assert isinstance(knee, dict) + assert "band_roster" in knee, ( + f"knee_point.band_roster missing in v5 CL-primary payload; " + f"knee_point keys: {list(knee.keys())}" + ) + assert isinstance(knee["band_roster"], list) diff --git a/tests/skills/test_evolve_skill_validation_flow.py b/tests/skills/test_evolve_skill_validation_flow.py index b97c4f2a..656c9705 100644 --- a/tests/skills/test_evolve_skill_validation_flow.py +++ b/tests/skills/test_evolve_skill_validation_flow.py @@ -131,7 +131,7 @@ def test_static_failure_reason_in_decision(self, tmp_path: Path): # Manual reproduction of the static-failure branch's payload — # locks the schema so a future refactor can't silently drop fields. payload = { - "schema_version": "4", + "schema_version": "5", "decision": "reject", "reason": "static_constraint_failure", "failed_constraints": ["non_empty"], @@ -140,7 +140,7 @@ def test_static_failure_reason_in_decision(self, tmp_path: Path): } path = _write_gate_decision(tmp_path, payload) loaded = json.loads(path.read_text()) - assert loaded["schema_version"] == "4" + assert loaded["schema_version"] == "5" assert loaded["reason"] == "static_constraint_failure" assert "non_empty" in loaded["failed_constraints"] assert "knee_point" in loaded @@ -154,7 +154,7 @@ class TestGrowthGateDecisionSchema: def test_required_fields_present(self, tmp_path: Path): payload = { - "schema_version": "4", + "schema_version": "5", "decision": "reject", "reason": "growth_quality_gate", "decision_rule_used": "dual_check", @@ -241,7 +241,7 @@ def test_required_fields_present(self, tmp_path: Path): "bootstrap", "knee_point", "dataset", ): assert required in loaded, f"missing {required}" - assert loaded["schema_version"] == "4" + assert loaded["schema_version"] == "5" for required_in_bootstrap in ( "mean", "lower_bound", "upper_bound", "n_examples", "n_resamples", "confidence", @@ -274,7 +274,7 @@ class TestRunInputsBlock: def test_run_inputs_present_in_decision(self, 
tmp_path: Path): payload = { - "schema_version": "4", + "schema_version": "5", "decision": "deploy", "run_inputs": { "seed": 42,