diff --git a/evolution/core/config.py b/evolution/core/config.py index 02726b5..39b1090 100644 --- a/evolution/core/config.py +++ b/evolution/core/config.py @@ -26,6 +26,17 @@ class EvolutionConfig: iterations: int = 10 population_size: int = 5 + # GEPA's reflective minibatch size — the number of training examples + # sampled per reflective step for the sum() acceptance gate at + # gepa/core/engine.py:491-493. Default 3 matches GEPA's own default + # (no behavior change). Users hitting the weak_signal saturation + # band can bump this to ~8 to widen the sampling window so + # discriminating examples appear more often per minibatch — see + # reports/pareto_frontier_feasibility.md spike #2 for the + # motivating case and saturation_check.py's weak_signal suggestions + # for the actionable hint surfaced to users. + reflection_minibatch_size: int = 3 + # Per-role model overrides. When set, treated as explicit LiteLLM model # strings that bypass Hermes resolution. When None, get_lm() falls back # to resolve_default_lm() against ~/.hermes/config.yaml + auth.json + diff --git a/evolution/core/saturation_check.py b/evolution/core/saturation_check.py index 16b65a5..90acdb5 100644 --- a/evolution/core/saturation_check.py +++ b/evolution/core/saturation_check.py @@ -97,8 +97,8 @@ def _classify_band( ): return "weak_signal", [ "Judge saturating but closed-loop has signal; GEPA's small-minibatch acceptance will struggle.", - "Expect many proposals rejected — bump --iterations above 5.", - "Larger minibatch (Path E follow-up) would help once landed.", + "Try --gepa-minibatch-size 8 (default 3) — widens the sampling window so discriminating examples appear in ~68% of minibatches vs ~34% at default.", + "Larger minibatch means fewer proposals per budget: on evolve_tool bump --iterations to ~10, on evolve_skill use --budget heavy.", ] return "healthy", [] diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index bfeb6cc..1e7994b 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -270,6 +270,7 @@ def _default_gepa_runner( seed: int, instruction_proposer=None, reflection_model: Optional[str] = None, + reflection_minibatch_size: int = 3, ): # max_tokens=32000 satisfies DSPy's reasoning-model floor of 16000 # (DSPy raises ValueError below that). @@ -298,6 +299,7 @@ def _default_gepa_runner( # (.candidates, .val_aggregate_scores) on the returned module. track_stats=True, instruction_proposer=instruction_proposer, + reflection_minibatch_size=reflection_minibatch_size, ) return optimizer.compile(baseline_module, trainset=trainset, valset=valset) @@ -355,6 +357,7 @@ def _build_optimizer_and_compile( failure_log_path: Optional[Path] = None, instruction_proposer=None, reflection_model: Optional[str] = None, + reflection_minibatch_size: int = 3, _gepa_runner=_default_gepa_runner, _mipro_runner=_default_mipro_runner, ): @@ -376,6 +379,7 @@ def _build_optimizer_and_compile( seed=seed, instruction_proposer=instruction_proposer, reflection_model=reflection_model, + reflection_minibatch_size=reflection_minibatch_size, ) return optimized, "GEPA" except CostCeilingExceeded: @@ -606,6 +610,7 @@ def evolve( skip_cost_suggest: bool = False, skip_saturation_check: bool = False, force_saturation_check: bool = False, + gepa_minibatch_size: int = 3, closed_loop_suite_path: Optional[Path] = None, closed_loop_saturation_threshold: float = 0.95, closed_loop_min_iters: int = 3, @@ -656,6 +661,7 @@ def evolve( config_kwargs["eval_dataset_size"] = eval_dataset_size if holdout_ratio is not None: config_kwargs["holdout_ratio"] = holdout_ratio + config_kwargs["reflection_minibatch_size"] = gepa_minibatch_size config = EvolutionConfig(**config_kwargs) explicit_dirs = [Path(d) for d in (skill_source_dirs or [])] if explicit_dirs: @@ -793,6 +799,18 @@ def evolve( ) sys.exit(1) + # Guard: GEPA's reflective batch sampler asserts + # len(trainset) >= reflection_minibatch_size mid-optimization + # (gepa/strategies/batch_sampler.py). Catch the misconfiguration + # at startup with an actionable message instead. + if config.reflection_minibatch_size > len(dataset.train): + console.print( + f"[red]✗ --gepa-minibatch-size={config.reflection_minibatch_size} " + f"exceeds trainset size {len(dataset.train)}. Pick a value ≤ " + f"{len(dataset.train)} or increase --eval-dataset-size.[/red]" + ) + sys.exit(1) + # Static checks only — the growth-with-quality gate runs later on # the evolved artifact once there's a holdout improvement signal. console.print(f"\n[bold]Validating baseline constraints[/bold]") @@ -953,6 +971,7 @@ def evolve( failure_log_path=failure_log_path, instruction_proposer=proposer, reflection_model=config.reflection_model, + reflection_minibatch_size=config.reflection_minibatch_size, ) elapsed = time.time() - start_time @@ -1566,6 +1585,23 @@ def evolve( "regardless of band. Required to override a non-healthy verdict " "in non-interactive contexts (no TTY).", ) +@click.option( + "--gepa-minibatch-size", + "gepa_minibatch_size", + default=3, + type=click.IntRange(min=1), + help="GEPA's reflective minibatch size — number of training examples " + "sampled per reflective step for the sum() acceptance gate. " + "Default 3 matches GEPA's own default. Bump to ~8 when the " + "saturation pre-flight flags the weak_signal band: the wider " + "sampling window makes discriminating examples appear in " + "~68% of minibatches vs ~34% at default. Trade-off: larger " + "minibatch means each accepted proposal consumes more of the " + "metric-call budget. The skill pipeline uses --budget (not " + "--iterations) for its budget knob, so consider --budget heavy " + "to preserve the proposal count. Aborts at startup if the " + "value exceeds the trainset size.", +) @click.option( "--closed-loop-during-evolution", "closed_loop_suite_path", @@ -1659,6 +1695,7 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti skip_cost_suggest, skip_saturation_check, force_saturation_check, + gepa_minibatch_size, closed_loop_suite_path, closed_loop_saturation_threshold, closed_loop_min_iters, @@ -1706,6 +1743,7 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti skip_cost_suggest=skip_cost_suggest, skip_saturation_check=skip_saturation_check, force_saturation_check=force_saturation_check, + gepa_minibatch_size=gepa_minibatch_size, closed_loop_suite_path=closed_loop_suite_path, closed_loop_saturation_threshold=closed_loop_saturation_threshold, closed_loop_min_iters=closed_loop_min_iters, diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index cd9b502..9dfec41 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -376,6 +376,7 @@ def evolve( skip_cost_suggest: bool = False, skip_saturation_check: bool = False, force_saturation_check: bool = False, + gepa_minibatch_size: int = 3, ) -> dict[str, Any]: """Evolve one tool description inside a manifest. @@ -421,6 +422,7 @@ def evolve( eval_dataset_size=eval_dataset_size, holdout_ratio=holdout_ratio, enable_confusable_bucket=enable_confusable_bucket, + reflection_minibatch_size=gepa_minibatch_size, ) console.print( @@ -573,6 +575,18 @@ def evolve( ) sys.exit(1) + # Guard: GEPA's reflective batch sampler asserts + # len(trainset) >= reflection_minibatch_size mid-optimization + # (gepa/strategies/batch_sampler.py). Catch the misconfiguration + # at startup with an actionable message instead. + if config.reflection_minibatch_size > len(dataset.train): + console.print( + f"[red]✗ --gepa-minibatch-size={config.reflection_minibatch_size} " + f"exceeds trainset size {len(dataset.train)}. Pick a value ≤ " + f"{len(dataset.train)} or increase --eval-dataset-size.[/red]" + ) + sys.exit(1) + console.print(f"\n[bold]Validating baseline description[/bold]") validator = ConstraintValidator(config) baseline_constraints = validator.validate_static(baseline_description, "tool_description") @@ -712,6 +726,7 @@ def evolve( seed=config.seed, track_stats=True, instruction_proposer=proposer, + reflection_minibatch_size=config.reflection_minibatch_size, ) optimized_module = optimizer.compile( baseline_module, trainset=trainset, valset=valset, @@ -1255,6 +1270,22 @@ def evolve( "in non-interactive contexts (no TTY). Without this in such a " "context, the framework exits cleanly without spending GEPA budget.", ) +@click.option( + "--gepa-minibatch-size", + "gepa_minibatch_size", + default=3, + type=click.IntRange(min=1), + help="GEPA's reflective minibatch size — number of training examples " + "sampled per reflective step for the sum() acceptance gate. " + "Default 3 matches GEPA's own default. Bump to ~8 when the " + "saturation pre-flight flags the weak_signal band: the wider " + "sampling window makes discriminating examples appear in " + "~68% of minibatches vs ~34% at default. Trade-off: larger " + "minibatch means each accepted proposal consumes more of the " + "metric-call budget, so consider also bumping --iterations to " + "~10 to preserve the proposal count. Aborts at startup if the " + "value exceeds the trainset size.", +) @click.option( "--closed-loop-in-valset/--no-closed-loop-in-valset", "closed_loop_in_valset", @@ -1308,6 +1339,7 @@ def main( skip_cost_suggest: bool, skip_saturation_check: bool, force_saturation_check: bool, + gepa_minibatch_size: int, closed_loop_suite_path: Optional[Path], closed_loop_hermes_repo: Optional[Path], closed_loop_saturation_threshold: float, @@ -1360,6 +1392,7 @@ def main( skip_cost_suggest=skip_cost_suggest, skip_saturation_check=skip_saturation_check, force_saturation_check=force_saturation_check, + gepa_minibatch_size=gepa_minibatch_size, ) except HermesProviderError as exc: # Render a clean error panel instead of dumping a Python traceback — diff --git a/tests/skills/test_evolve_skill_saturation_preflight.py b/tests/skills/test_evolve_skill_saturation_preflight.py index 252b2d8..6713135 100644 --- a/tests/skills/test_evolve_skill_saturation_preflight.py +++ b/tests/skills/test_evolve_skill_saturation_preflight.py @@ -280,3 +280,97 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, skill_dir): f"Expected baseline holdout to be reused from preflight cache " f"(1 call for evolved only), got {mock_holdout_eval.call_count}" ) + + +class TestGepaMinibatchSizeFlag: + """--gepa-minibatch-size threads through to dspy.GEPA's + reflection_minibatch_size kwarg, and the post-dataset-build guard + rejects values that exceed the trainset size with an actionable + message instead of an opaque assertion deep inside GEPA.""" + + def test_flag_passes_through_to_dspy_gepa(self, skill_dir): + """Patch dspy.GEPA's __init__ to record the value, then invoke the + CLI with --gepa-minibatch-size 7. Assert the constructed instance + carries the value on the documented attribute.""" + from evolution.core.saturation_check import SaturationReport + from evolution.skills.knee_point import CandidatePick + captured: dict = {} + original_init = __import__("dspy").GEPA.__init__ + + def recording_init(self, *args, **kwargs): + original_init(self, *args, **kwargs) + captured["reflection_minibatch_size"] = self.reflection_minibatch_size + + healthy = SaturationReport( + band="healthy", holdout_score=0.6, holdout_n=10, + holdout_per_example=[0.6] * 10, suggestions=[], thresholds={}, + ) + fake_module = MagicMock() + fake_module.skill_text = "evolved skill text" + knee_pick = CandidatePick( + module=fake_module, skill_text="evolved skill text", body_chars=18, + val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, + fallback="knee", picked_idx=0, gepa_default_idx=0, + gepa_default_body_chars=18, band_roster=[], + ) + fake_builder = MagicMock() + fake_builder.generate.return_value = _fake_skill_dataset() + with patch( + "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( + "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy + ), patch( + "evolution.skills.evolve_skill._preflight_lm_credentials" + ), patch("evolution.skills.evolve_skill.dspy.GEPA.__init__", recording_init), patch( + "evolution.skills.evolve_skill.dspy.GEPA.compile", return_value=fake_module + ), patch( + "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick + ), patch( + "evolution.skills.evolve_skill._holdout_evaluate_with_metric", + return_value=(0.6, [0.6] * 10), + ): + runner = CliRunner() + result = runner.invoke( + evolve_skill_main, + ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir), + "--iterations", "1", "--no-preflight", + "--gepa-minibatch-size", "7"], + ) + assert captured.get("reflection_minibatch_size") == 7, ( + f"Expected dspy.GEPA.reflection_minibatch_size=7; got " + f"{captured!r}. CLI output: {result.output}" + ) + + def test_minibatch_exceeding_trainset_aborts_at_startup(self, skill_dir): + """--gepa-minibatch-size larger than the trainset triggers the + post-dataset guard (sys.exit(1) with an actionable message), + not a mid-optimization assertion inside EpochShuffledBatchSampler.""" + from evolution.core.saturation_check import SaturationReport + healthy = SaturationReport( + band="healthy", holdout_score=0.6, holdout_n=10, + holdout_per_example=[0.6] * 10, suggestions=[], thresholds={}, + ) + # _fake_skill_dataset() returns train=30 — so 1000 exceeds it. + fake_builder = MagicMock() + fake_builder.generate.return_value = _fake_skill_dataset() + gepa_mock = MagicMock() + with patch( + "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( + "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy + ), patch( + "evolution.skills.evolve_skill._preflight_lm_credentials" + ), patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock): + runner = CliRunner() + result = runner.invoke( + evolve_skill_main, + ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir), + "--iterations", "1", "--no-preflight", + "--gepa-minibatch-size", "1000"], + ) + assert result.exit_code == 1, ( + f"Expected exit 1 from trainset-ceiling guard, got " + f"{result.exit_code}. Output: {result.output}" + ) + assert "exceeds trainset size" in result.output + gepa_mock.assert_not_called() diff --git a/tests/tools/test_evolve_tool_saturation_preflight.py b/tests/tools/test_evolve_tool_saturation_preflight.py index 62a6842..fce76cd 100644 --- a/tests/tools/test_evolve_tool_saturation_preflight.py +++ b/tests/tools/test_evolve_tool_saturation_preflight.py @@ -295,3 +295,99 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, manifest_dir): f"Expected baseline holdout to be reused from preflight cache " f"(1 call for evolved only), got {mock_holdout_eval.call_count}" ) + + +class TestGepaMinibatchSizeFlag: + """--gepa-minibatch-size threads through to dspy.GEPA's + reflection_minibatch_size kwarg, and the post-dataset-build guard + rejects values that exceed the trainset size with an actionable + message instead of an opaque assertion deep inside GEPA.""" + + def test_flag_passes_through_to_dspy_gepa(self, manifest_dir): + """Patch dspy.GEPA's __init__ to record the value, then invoke the + CLI with --gepa-minibatch-size 7. Assert the constructed instance + carries the value on the documented attribute. Catches future + DSPy refactors that rename reflection_minibatch_size.""" + from evolution.core.saturation_check import SaturationReport + from evolution.skills.knee_point import CandidatePick + captured: dict = {} + original_init = __import__("dspy").GEPA.__init__ + + def recording_init(self, *args, **kwargs): + original_init(self, *args, **kwargs) + captured["reflection_minibatch_size"] = self.reflection_minibatch_size + + healthy = SaturationReport( + band="healthy", holdout_score=0.6, holdout_n=10, + holdout_per_example=[0.6] * 10, suggestions=[], thresholds={}, + ) + fake_module = MagicMock() + knee_pick = CandidatePick( + module=fake_module, skill_text="evolved desc", body_chars=12, + val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1, + fallback="knee", picked_idx=0, gepa_default_idx=0, + gepa_default_body_chars=12, band_roster=[], + ) + fake_builder = MagicMock() + fake_builder.generate_tool_selection.return_value = _fake_tool_examples() + with patch( + "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( + "evolution.tools.evolve_tool.saturation_preflight", return_value=healthy + ), patch( + "evolution.tools.evolve_tool._preflight_lm_credentials" + ), patch("evolution.tools.evolve_tool.dspy.GEPA.__init__", recording_init), patch( + "evolution.tools.evolve_tool.dspy.GEPA.compile", return_value=fake_module + ), patch( + "evolution.tools.evolve_tool.select_knee_point", return_value=knee_pick + ), patch( + "evolution.tools.evolve_tool._candidate_description", return_value="evolved desc" + ), patch( + "evolution.tools.evolve_tool._holdout_evaluate_with_metric", + return_value=(0.6, [0.6] * 10), + ): + runner = CliRunner() + result = runner.invoke( + evolve_tool_main, + ["--tool", "write_file", "--manifest", str(manifest_dir), + "--iterations", "1", "--no-preflight", + "--gepa-minibatch-size", "7"], + ) + assert captured.get("reflection_minibatch_size") == 7, ( + f"Expected dspy.GEPA.reflection_minibatch_size=7; got " + f"{captured!r}. CLI output: {result.output}" + ) + + def test_minibatch_exceeding_trainset_aborts_at_startup(self, manifest_dir): + """--gepa-minibatch-size larger than the trainset triggers the + post-dataset guard (sys.exit(1) with an actionable message), + not a mid-optimization assertion inside EpochShuffledBatchSampler.""" + from evolution.core.saturation_check import SaturationReport + healthy = SaturationReport( + band="healthy", holdout_score=0.6, holdout_n=10, + holdout_per_example=[0.6] * 10, suggestions=[], thresholds={}, + ) + # _fake_tool_examples() returns 30 — so 1000 exceeds it. + fake_builder = MagicMock() + fake_builder.generate_tool_selection.return_value = _fake_tool_examples() + gepa_mock = MagicMock() + with patch( + "evolution.tools.evolve_tool.SyntheticDatasetBuilder", return_value=fake_builder + ), patch( + "evolution.tools.evolve_tool.saturation_preflight", return_value=healthy + ), patch( + "evolution.tools.evolve_tool._preflight_lm_credentials" + ), patch("evolution.tools.evolve_tool.dspy.GEPA", gepa_mock): + runner = CliRunner() + result = runner.invoke( + evolve_tool_main, + ["--tool", "write_file", "--manifest", str(manifest_dir), + "--iterations", "1", "--no-preflight", + "--gepa-minibatch-size", "1000"], + ) + assert result.exit_code == 1, ( + f"Expected exit 1 from trainset-ceiling guard, got " + f"{result.exit_code}. Output: {result.output}" + ) + assert "exceeds trainset size" in result.output + gepa_mock.assert_not_called()