diff --git a/docs/workflows.md b/docs/workflows.md index 01706618..0ddd4451 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -488,7 +488,19 @@ Per-task scores are deterministic over candidate text (cache is keyed by `sha256 Default `--closed-loop-mode` is `feedback` (not `trainset`) on the skill side. Skill bodies mutate heavily, so the `gate_mode="always"` that trainset needs would fire the validator on every novel candidate — N tasks × 2 phases per fire. Opt into `trainset` / `both` explicitly when the cost is acceptable. -Reference suite: `evolution/validation/suites/systematic_debugging.jsonl` (5 planted-bug tasks). Manual smoke harness: `tests/manual/skill_closed_loop_smoke.py`. +Reference suites: +- `evolution/validation/suites/systematic_debugging.jsonl` — 5 textbook bugs; good for verifying the wiring works. +- `evolution/validation/suites/systematic_debugging_advanced.jsonl` — 5 harder bugs (generator exhaustion, shared mutable return, float-precision equality, leftmost-insert boundary, class-vs-instance attribute) designed to discriminate skill-text variants on capable agent models that saturate the basic suite at 5/5. + +When your daily-driver Hermes model is capable enough to solve every textbook bug regardless of skill text, the planted-bug verdict adds no signal. Three knobs to recover discrimination: + +- `--closed-loop-during-evolution .../systematic_debugging_advanced.jsonl` — use the harder bugs (different cognitive failure modes). +- `--closed-loop-agent-model MODEL` — run the validator's agent against a different model than your `~/.hermes/config.yaml` default. Hermes sends `include: ['reasoning.encrypted_content']` so the model must be a reasoning model (o1-family, o3-family, o4-mini, gpt-5.x-family); non-reasoning models reject the request. +- `--closed-loop-task-timeout-seconds N` — bump the per-task wall-clock budget. The default is 120s; most reasoning models other than the smallest take 200–300s per debugging task and would otherwise abstain (timeout) without recording a verdict. + +**Empirical caveat from validation.** Both suites saturate at 5/5 against capable reasoning models (`gpt-5.4-mini` saturated both; `o3-mini` was slow enough to abstain most tasks at the default timeout). For a setup where the user's default model handles textbook Python debugging easily, the closed-loop signal on this domain may be uninformative regardless of skill text — the agent's raw capability dominates. Real headroom likely needs evaluation surfaces where methodology matters more than recognition: multi-file refactoring, ambiguous specs with edge cases the agent must enumerate, tasks requiring iterative hypothesis-testing across multiple test runs. + +Manual smoke harness: `tests/manual/skill_closed_loop_smoke.py` (supports `--suite {basic,advanced}`, `--agent-model MODEL`, `--task-timeout-seconds N`). ## Failure-mode summary diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index 80498848..2dbdfd05 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -471,6 +471,8 @@ def _maybe_build_closed_loop_cache_skill( min_iters: int, window_size: int, gate_mode: str = "sampled", + agent_model: Optional[str] = None, + agent_timeout_seconds: Optional[int] = None, ): """Build a ClosedLoopFeedbackCache for the skill path; return None when disabled. @@ -506,7 +508,10 @@ def _maybe_build_closed_loop_cache_skill( skill_name=skill_name, workdir=workdir, ) - runner = HermesAgentRunner() + runner_kwargs: dict = {"model": agent_model} + if agent_timeout_seconds is not None: + runner_kwargs["timeout_seconds"] = agent_timeout_seconds + runner = HermesAgentRunner(**runner_kwargs) validator = ClosedLoopValidator(installer=installer, runner=runner) suite = TaskSuite.from_jsonl(suite_path) return ClosedLoopFeedbackCache( @@ -599,6 +604,8 @@ def evolve( closed_loop_window_size: int = 8, closed_loop_mode: str = "feedback", closed_loop_in_valset: bool = False, + closed_loop_agent_model: Optional[str] = None, + closed_loop_task_timeout_seconds: Optional[int] = None, ): """Main evolution function — orchestrates the full optimization loop.""" @@ -835,6 +842,8 @@ def evolve( min_iters=closed_loop_min_iters, window_size=closed_loop_window_size, gate_mode=_cache_gate_mode, + agent_model=closed_loop_agent_model, + agent_timeout_seconds=closed_loop_task_timeout_seconds, ) # Build the metric once: DSPy's LM cache lines up across GEPA's @@ -1547,6 +1556,31 @@ def evolve( "scoring). Costs more — each accepted candidate triggers another full " "eval pass over the behavioral examples. Default off.", ) +@click.option( + "--closed-loop-agent-model", + "closed_loop_agent_model", + default=None, + type=str, + help="Override the agent model the closed-loop validator runs `hermes -z` " + "with (passed as `hermes -m MODEL -z ...`). When unset, the validator " + "uses whatever's in your ~/.hermes/config.yaml. Useful when your " + "daily-driver Hermes model is so capable it saturates the planted-bug " + "suite at 100%, hiding the behavioral signal closed-loop is supposed " + "to surface — run validation against a weaker model without touching " + "your config.", +) +@click.option( + "--closed-loop-task-timeout-seconds", + "closed_loop_task_timeout_seconds", + default=None, + type=click.IntRange(min=1), + help="Per-task wall-clock budget for the closed-loop validator's `hermes -z` " + "subprocess (default 120s). Bump when --closed-loop-agent-model selects " + "a slow reasoning model that doesn't finish within the default — most " + "OpenAI reasoning models (o1-family, o3-family) take 60-180s per " + "debugging task. Hitting the timeout abstains the task verdict rather " + "than failing it, so over-tight values silently produce no-signal runs.", +) def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflection_model, eval_model, skill_source_dir, dry_run, seed, budget, no_fallback, quality_gate, growth_free_threshold, @@ -1563,7 +1597,9 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti closed_loop_min_iters, closed_loop_window_size, closed_loop_mode, - closed_loop_in_valset): + closed_loop_in_valset, + closed_loop_agent_model, + closed_loop_task_timeout_seconds): """Evolve an agent skill using DSPy + GEPA optimization.""" try: evolve( @@ -1607,6 +1643,8 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti closed_loop_window_size=closed_loop_window_size, closed_loop_mode=closed_loop_mode, closed_loop_in_valset=closed_loop_in_valset, + closed_loop_agent_model=closed_loop_agent_model, + closed_loop_task_timeout_seconds=closed_loop_task_timeout_seconds, ) except HermesProviderError as exc: # Render a clean error panel instead of dumping a Python traceback diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index 31c39cc6..04b8fd90 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -260,6 +260,8 @@ def _maybe_build_closed_loop_cache( min_iters: int, window_size: int, gate_mode: str = "sampled", + agent_model: Optional[str] = None, + agent_timeout_seconds: Optional[int] = None, ): """Build a ClosedLoopFeedbackCache when the user opted in, else None. @@ -286,7 +288,10 @@ def _maybe_build_closed_loop_cache( installer = HermesToolDescriptionInstaller( hermes_repo=hermes_repo, tool_name=tool_name ) - runner = HermesAgentRunner() + runner_kwargs: dict = {"model": agent_model} + if agent_timeout_seconds is not None: + runner_kwargs["timeout_seconds"] = agent_timeout_seconds + runner = HermesAgentRunner(**runner_kwargs) validator = ClosedLoopValidator(installer=installer, runner=runner) suite = TaskSuite.from_jsonl(suite_path) return ClosedLoopFeedbackCache( @@ -359,6 +364,8 @@ def evolve( closed_loop_window_size: int = 8, closed_loop_mode: str = "feedback", closed_loop_in_valset: bool = False, + closed_loop_agent_model: Optional[str] = None, + closed_loop_task_timeout_seconds: Optional[int] = None, skip_preflight: bool = False, skip_cost_suggest: bool = False, ) -> dict[str, Any]: @@ -592,6 +599,8 @@ def evolve( min_iters=closed_loop_min_iters, window_size=closed_loop_window_size, gate_mode=cache_gate_mode, + agent_model=closed_loop_agent_model, + agent_timeout_seconds=closed_loop_task_timeout_seconds, ) metric = make_tool_fitness_metric( judge=judge, @@ -1187,6 +1196,30 @@ def evolve( "scoring). Costs more — each accepted candidate triggers another full " "eval pass over the behavioral examples. Default off.", ) +@click.option( + "--closed-loop-agent-model", + "closed_loop_agent_model", + default=None, + type=str, + help="Override the agent model the closed-loop validator runs `hermes -z` " + "with (passed as `hermes -m MODEL -z ...`). When unset, the validator " + "uses whatever's in your ~/.hermes/config.yaml. Useful when your " + "daily-driver Hermes model saturates the planted-bug suite at 100%, " + "hiding the behavioral signal — run validation against a weaker model " + "without touching your config.", +) +@click.option( + "--closed-loop-task-timeout-seconds", + "closed_loop_task_timeout_seconds", + default=None, + type=click.IntRange(min=1), + help="Per-task wall-clock budget for the closed-loop validator's `hermes -z` " + "subprocess (default 120s). Bump when --closed-loop-agent-model selects " + "a slow reasoning model that doesn't finish within the default — most " + "OpenAI reasoning models (o1-family, o3-family) take 60-180s per " + "debugging task. Hitting the timeout abstains the task verdict rather " + "than failing it, so over-tight values silently produce no-signal runs.", +) def main( tool_name: str, manifest_path: Path, @@ -1212,6 +1245,8 @@ def main( closed_loop_window_size: int, closed_loop_mode: str, closed_loop_in_valset: bool, + closed_loop_agent_model: Optional[str], + closed_loop_task_timeout_seconds: Optional[int], ) -> None: """Evolve one tool description in an MCP manifest using DSPy + GEPA.""" if apply_flag and patch_flag: @@ -1249,6 +1284,8 @@ def main( closed_loop_window_size=closed_loop_window_size, closed_loop_mode=closed_loop_mode, closed_loop_in_valset=closed_loop_in_valset, + closed_loop_agent_model=closed_loop_agent_model, + closed_loop_task_timeout_seconds=closed_loop_task_timeout_seconds, skip_preflight=skip_preflight, skip_cost_suggest=skip_cost_suggest, ) diff --git a/evolution/validation/hermes_runner.py b/evolution/validation/hermes_runner.py index 9c8940b5..4a4dacd1 100644 --- a/evolution/validation/hermes_runner.py +++ b/evolution/validation/hermes_runner.py @@ -46,6 +46,7 @@ def __init__( hermes_command: str = "hermes", timeout_seconds: int = DEFAULT_TASK_TIMEOUT_SECONDS, user_config_path: Optional[Path] = None, + model: Optional[str] = None, ) -> None: self.hermes_command = hermes_command self.timeout_seconds = timeout_seconds @@ -56,6 +57,13 @@ def __init__( if user_config_path is not None else Path.home() / ".hermes" / "config.yaml" ) + # Optional per-invocation model override (passed as ``hermes -z -m + # ``). When unset, Hermes uses whatever is configured in + # the sandboxed ``config.yaml``. Useful for closed-loop validation + # against a deliberately weaker agent model than the user's + # daily-driver default — saturation on capable models hides + # behavioral signal that a weaker model would expose. + self.model = model def run(self, ctx: TaskRunContext) -> AgentRunResult: message = ctx.user_message @@ -68,10 +76,15 @@ def run(self, ctx: TaskRunContext) -> AgentRunResult: "HOME": str(sandbox), **ctx.extra_env, } + argv = [self.hermes_command, "-z", message] + if self.model is not None: + # Insert before the -z so hermes parses it as a global flag, + # not as part of the -z prompt value. + argv = [self.hermes_command, "-m", self.model, "-z", message] start = time.time() try: subprocess.run( - [self.hermes_command, "-z", message], + argv, env=env, cwd=str(ctx.fixture_dir), capture_output=True, diff --git a/evolution/validation/suites/systematic_debugging_advanced.jsonl b/evolution/validation/suites/systematic_debugging_advanced.jsonl new file mode 100644 index 00000000..5a1939db --- /dev/null +++ b/evolution/validation/suites/systematic_debugging_advanced.jsonl @@ -0,0 +1,22 @@ +# Advanced planted-bug suite for the `systematic-debugging` skill. +# +# Companion to systematic_debugging.jsonl (5 textbook bugs). This suite +# adds 5 bugs that capable reasoning models often "fix" wrong on the +# first pass — the obvious patch makes the failing test green but +# breaks another, or the bug is in the interaction between two pieces +# of seemingly-correct code rather than in one wrong line. +# +# Each bug exercises a different cognitive failure mode: +# 1. iteration: generator exhaustion (function iterates twice) +# 2. state: shared mutable returned across calls +# 3. precision: float equality after computation introduces drift +# 4. algorithm: binary search boundary case for leftmost insertion +# 5. OOP: class attribute used where instance attribute intended +# +# These survive a "just try edits until the test passes" approach +# because the spec demands edge cases the obvious patch doesn't cover. +{"task_id": "debug_generator_exhaustion", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def count_partitions(items, predicate):\n \"\"\"Return (n_true, n_false): how many items satisfy predicate vs not.\n\n Must work for any iterable input including single-pass generators.\n \"\"\"\n n_true = sum(1 for x in items if predicate(x))\n n_false = sum(1 for x in items if not predicate(x))\n return (n_true, n_false)\n", "test_solution.py": "from solution import count_partitions\n\n# Works on lists — easy case.\nassert count_partitions([1, 2, 3, 4], lambda x: x > 2) == (2, 2)\nassert count_partitions([], lambda x: x > 0) == (0, 0)\n\n# Must also work on single-pass generators — the spec is explicit.\ndef gen():\n yield from [1, 2, 3, 4, 5]\nassert count_partitions(gen(), lambda x: x > 2) == (3, 2), count_partitions(gen(), lambda x: x > 2)\n\n# And on a generator of strings.\ndef words():\n yield from ['apple', 'banana', 'cherry', 'kiwi']\nassert count_partitions(words(), lambda s: len(s) > 5) == (2, 2), count_partitions(words(), lambda s: len(s) > 5)\n\nprint('PASS')\n"}, "test_command": "python test_solution.py"} +{"task_id": "debug_shared_mutable_return", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "_history_cache = {}\n\ndef get_history(user_id):\n \"\"\"Return the user's history list. Each call must return an INDEPENDENT\n list — mutating the returned list must NOT affect subsequent calls.\n \"\"\"\n if user_id not in _history_cache:\n _history_cache[user_id] = []\n return _history_cache[user_id]\n", "test_solution.py": "from solution import get_history\n\n# First call returns empty.\nh1 = get_history('alice')\nassert h1 == [], h1\n\n# Caller mutates the returned list (logging some action).\nh1.append('logged_in')\n\n# Second call for SAME user must return a fresh empty list per the spec —\n# mutation of h1 must not leak into the cached history visible to other callers.\nh2 = get_history('alice')\nassert h2 == [], f'mutation leaked: {h2}'\n\n# Independent users are independent.\nassert get_history('bob') == []\n\nprint('PASS')\n"}, "test_command": "python test_solution.py"} +{"task_id": "debug_float_precision_equality", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def is_target_amount(payments, target):\n \"\"\"Return True iff the sum of payments equals target. Payments come\n from upstream float arithmetic so the sum may have float-precision\n drift on the order of 1e-9; treat amounts within 1e-6 of target as equal.\n \"\"\"\n return sum(payments) == target\n", "test_solution.py": "from solution import is_target_amount\n\n# Exact case: trivially true.\nassert is_target_amount([1.0, 2.0, 3.0], 6.0) is True\n\n# Float-drift cases: each computes a sum that does NOT exactly equal the\n# target due to IEEE 754, but is within the 1e-6 tolerance the spec allows.\nassert is_target_amount([0.1, 0.1, 0.1], 0.3) is True, sum([0.1, 0.1, 0.1]) - 0.3\nassert is_target_amount([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1], 0.7) is True, sum([0.1]*7) - 0.7\nassert is_target_amount([0.2, 0.2, 0.2], 0.6) is True, sum([0.2]*3) - 0.6\n\n# Genuinely different amounts: must still return False.\nassert is_target_amount([1.0, 2.0], 5.0) is False\nassert is_target_amount([0.1, 0.1], 0.3) is False\n\nprint('PASS')\n"}, "test_command": "python test_solution.py"} +{"task_id": "debug_binary_search_boundary", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def leftmost_insert(sorted_arr, target):\n \"\"\"Return the leftmost index where target could be inserted to keep\n sorted_arr sorted. Equivalent to bisect.bisect_left.\n\n For target equal to existing elements, returns the FIRST such index.\n For target smaller than all elements, returns 0.\n For target larger than all elements, returns len(sorted_arr).\n \"\"\"\n lo, hi = 0, len(sorted_arr)\n while lo < hi:\n mid = (lo + hi) // 2\n if sorted_arr[mid] <= target:\n lo = mid + 1\n else:\n hi = mid\n return lo\n", "test_solution.py": "from solution import leftmost_insert\n\n# Middle insertion — buggy code usually returns the right answer here.\nassert leftmost_insert([1, 3, 5, 7], 4) == 2, leftmost_insert([1, 3, 5, 7], 4)\n\n# Target equals an existing element — must return the LEFTMOST occurrence.\n# The buggy code returns the RIGHTMOST position (bisect_right behavior).\nassert leftmost_insert([1, 2, 2, 2, 3], 2) == 1, leftmost_insert([1, 2, 2, 2, 3], 2)\nassert leftmost_insert([2, 2, 2], 2) == 0, leftmost_insert([2, 2, 2], 2)\n\n# Boundary: target smaller than everything → 0.\nassert leftmost_insert([5, 6, 7], 1) == 0, leftmost_insert([5, 6, 7], 1)\n\n# Boundary: target larger than everything → len.\nassert leftmost_insert([5, 6, 7], 99) == 3, leftmost_insert([5, 6, 7], 99)\n\n# Empty array.\nassert leftmost_insert([], 5) == 0, leftmost_insert([], 5)\n\n# Single-element array, target matches → 0 (leftmost).\nassert leftmost_insert([5], 5) == 0, leftmost_insert([5], 5)\n\nprint('PASS')\n"}, "test_command": "python test_solution.py"} +{"task_id": "debug_class_vs_instance_attribute", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "class TokenBucket:\n \"\"\"Per-instance rate-limit bucket. Each instance must track its OWN\n consumed tokens independently — instances must not share state.\n \"\"\"\n\n consumed = []\n\n def __init__(self, capacity):\n self.capacity = capacity\n\n def take(self, n):\n \"\"\"Record consumption of n tokens; raise if it would exceed capacity.\"\"\"\n if sum(self.consumed) + n > self.capacity:\n raise ValueError('over capacity')\n self.consumed.append(n)\n\n def used(self):\n return sum(self.consumed)\n", "test_solution.py": "from solution import TokenBucket\n\n# Single bucket: basic behavior works.\nb1 = TokenBucket(capacity=10)\nb1.take(3)\nb1.take(4)\nassert b1.used() == 7, b1.used()\n\n# Two independent buckets must NOT share state.\nb2 = TokenBucket(capacity=10)\nassert b2.used() == 0, f'fresh bucket should be empty, got {b2.used()}'\nb2.take(2)\nassert b2.used() == 2, b2.used()\n\n# And b1's state must be unaffected by b2's activity.\nassert b1.used() == 7, f'b1 corrupted by b2 activity: {b1.used()}'\n\n# Third bucket, different capacity — definitely independent.\nb3 = TokenBucket(capacity=5)\nassert b3.used() == 0, f'b3 should be empty, got {b3.used()}'\n\nprint('PASS')\n"}, "test_command": "python test_solution.py"} diff --git a/tests/manual/skill_closed_loop_smoke.py b/tests/manual/skill_closed_loop_smoke.py index df398d47..b34b617e 100644 --- a/tests/manual/skill_closed_loop_smoke.py +++ b/tests/manual/skill_closed_loop_smoke.py @@ -6,19 +6,20 @@ the real planted-bug suite, so a regression in (e.g.) how Hermes discovers skills inside the per-task sandbox, or how `python test_solution.py` actually scores in the validator, would slip - through CI. This smoke runs the entire closed-loop layer end-to-end - against: - - the checked-in fake target skill at tests/fixtures/skills/systematic_debugging/ - - the checked-in planted-bug suite at evolution/validation/suites/systematic_debugging.jsonl - - whatever Hermes is configured with (uses the user's real ~/.hermes/config.yaml) + through CI. This smoke runs the entire closed-loop layer end-to-end. Drives one ClosedLoopValidator.validate() call directly — 5 tasks × - 2 phases (baseline + evolved) = 10 hermes -z invocations. With a - cheap eval model (gpt-4o-mini-ish) this is pennies, not dollars. + 2 phases (baseline + evolved) = 10 hermes -z invocations. How to run: + # Wiring sanity (basic textbook bugs, uses your Hermes default model) uv run python tests/manual/skill_closed_loop_smoke.py + # Headroom validation: harder bugs + weaker model so the planted-bug + # verdicts don't all saturate at 5/5 on capable agents + uv run python tests/manual/skill_closed_loop_smoke.py \\ + --suite advanced --agent-model gpt-4o-mini + Exits 0 on success. Not part of CI — heavyweight (drives real hermes -z subprocesses + real LM spend). @@ -29,6 +30,7 @@ from __future__ import annotations +import argparse import shutil import sys import tempfile @@ -43,9 +45,10 @@ FAKE_SKILL_PATH = ( REPO_ROOT / "tests" / "fixtures" / "skills" / "systematic_debugging" / "SKILL.md" ) -SUITE_PATH = ( - REPO_ROOT / "evolution" / "validation" / "suites" / "systematic_debugging.jsonl" -) +SUITE_PATHS = { + "basic": REPO_ROOT / "evolution" / "validation" / "suites" / "systematic_debugging.jsonl", + "advanced": REPO_ROOT / "evolution" / "validation" / "suites" / "systematic_debugging_advanced.jsonl", +} SKILL_NAME = "systematic_debugging" @@ -87,12 +90,42 @@ def _section(title: str) -> None: def main() -> int: + parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + parser.add_argument( + "--suite", + choices=sorted(SUITE_PATHS), + default="basic", + help="Which planted-bug suite to run against. `basic` (default) is " + "the textbook bugs — proves wiring works on any model. `advanced` " + "is harder bugs designed to discriminate skill-text variants on " + "capable agent models.", + ) + parser.add_argument( + "--agent-model", + default=None, + help="Override the model hermes -z runs with (passed as `hermes -m MODEL`). " + "Use when your default Hermes model saturates the suite at 5/5 and " + "you want to see the planted bugs actually fail on baseline.", + ) + parser.add_argument( + "--task-timeout-seconds", + type=int, + default=None, + help="Per-task wall-clock budget (default 120s). Bump when --agent-model " + "selects a slow reasoning model (o1/o3-family typically need 200-300s " + "per debugging task). Timeouts abstain the task verdict rather than " + "failing it, so under-sized values silently produce no-signal runs.", + ) + args = parser.parse_args() + + suite_path = SUITE_PATHS[args.suite] + _section("Pre-flight checks") if not FAKE_SKILL_PATH.is_file(): print(f" ✗ Missing fixture skill at {FAKE_SKILL_PATH}") return 1 - if not SUITE_PATH.is_file(): - print(f" ✗ Missing suite at {SUITE_PATH}") + if not suite_path.is_file(): + print(f" ✗ Missing suite at {suite_path}") return 1 if shutil.which("hermes") is None: print( @@ -104,8 +137,16 @@ def main() -> int: print(" ✗ `python` not on PATH. Suite tasks use `python test_solution.py`.") return 1 print(f" ✓ Fixture skill present: {FAKE_SKILL_PATH}") - print(f" ✓ Suite present: {SUITE_PATH}") + print(f" ✓ Suite ({args.suite}) present: {suite_path}") print(f" ✓ hermes binary present: {shutil.which('hermes')}") + if args.agent_model: + print(f" ✓ Agent model override: {args.agent_model}") + else: + print(f" ✓ Agent model: ") + if args.task_timeout_seconds: + print(f" ✓ Task timeout override: {args.task_timeout_seconds}s") + else: + print(f" ✓ Task timeout: 120s (default)") _section("Constructing closed-loop cache") from evolution.skills.evolve_skill import _maybe_build_closed_loop_cache_skill @@ -125,11 +166,13 @@ def main() -> int: skill_name=SKILL_NAME, skill_path=FAKE_SKILL_PATH, baseline_skill_body=skill["body"], - suite_path=SUITE_PATH, + suite_path=suite_path, saturation_threshold=0.95, min_iters=1, window_size=4, gate_mode="always", # force the validator to fire + agent_model=args.agent_model, + agent_timeout_seconds=args.task_timeout_seconds, ) assert cache is not None, "cache should be constructed when suite_path is set" print(f" ✓ Cache constructed (gate_mode={cache.gate_mode})") diff --git a/tests/skills/test_evolve_skill_closed_loop.py b/tests/skills/test_evolve_skill_closed_loop.py index 91dc4d99..f1f40224 100644 --- a/tests/skills/test_evolve_skill_closed_loop.py +++ b/tests/skills/test_evolve_skill_closed_loop.py @@ -158,6 +158,64 @@ def test_gate_mode_always_propagates(self, fake_skill_path): ) assert cache.gate_mode == "always" + def test_agent_model_plumbed_into_runner(self, fake_skill_path): + # The closed-loop validator's runner picks up the agent_model + # override so `hermes -z -m MODEL` runs instead of using the + # user's ~/.hermes/config.yaml default. + cache = _maybe_build_closed_loop_cache_skill( + skill_name="systematic_debugging", + skill_path=fake_skill_path, + baseline_skill_body="body", + suite_path=_SUITE_FIXTURE, + saturation_threshold=0.95, + min_iters=3, + window_size=8, + agent_model="gpt-4o-mini", + ) + assert cache._validator.runner.model == "gpt-4o-mini" + + def test_agent_model_none_leaves_runner_model_none(self, fake_skill_path): + # No override → runner.model is None and hermes uses config default. + cache = _maybe_build_closed_loop_cache_skill( + skill_name="systematic_debugging", + skill_path=fake_skill_path, + baseline_skill_body="body", + suite_path=_SUITE_FIXTURE, + saturation_threshold=0.95, + min_iters=3, + window_size=8, + ) + assert cache._validator.runner.model is None + + def test_agent_timeout_seconds_plumbed_into_runner(self, fake_skill_path): + # Slow reasoning models (o1/o3-family) need more than the 120s default. + cache = _maybe_build_closed_loop_cache_skill( + skill_name="systematic_debugging", + skill_path=fake_skill_path, + baseline_skill_body="body", + suite_path=_SUITE_FIXTURE, + saturation_threshold=0.95, + min_iters=3, + window_size=8, + agent_timeout_seconds=300, + ) + assert cache._validator.runner.timeout_seconds == 300 + + def test_agent_timeout_none_keeps_runner_default(self, fake_skill_path): + # No override → runner uses its DEFAULT_TASK_TIMEOUT_SECONDS (120s). + from evolution.validation.hermes_runner import DEFAULT_TASK_TIMEOUT_SECONDS + + cache = _maybe_build_closed_loop_cache_skill( + skill_name="systematic_debugging", + skill_path=fake_skill_path, + baseline_skill_body="body", + suite_path=_SUITE_FIXTURE, + saturation_threshold=0.95, + min_iters=3, + window_size=8, + ) + assert cache._validator.runner.timeout_seconds == DEFAULT_TASK_TIMEOUT_SECONDS + def test_installer_skills_src_is_under_workdir(self, fake_skill_path): # The cache's validator holds an installer with skills_src — confirming # the wiring is intact so the runner sees the candidate. diff --git a/tests/tools/test_evolve_tool_closed_loop.py b/tests/tools/test_evolve_tool_closed_loop.py index f0c8c42b..80ae0150 100644 --- a/tests/tools/test_evolve_tool_closed_loop.py +++ b/tests/tools/test_evolve_tool_closed_loop.py @@ -165,6 +165,72 @@ def test_skill_side_closed_loop_flag_passes_to_evolve( # Default mode is feedback for skills (skill bodies mutate heavily # so opting into trainset is explicit). assert kwargs["closed_loop_mode"] == "feedback" + # Default agent_model is None (use Hermes config default). + assert kwargs["closed_loop_agent_model"] is None + + def test_skill_side_agent_model_flag_threads_through( + self, tmp_path, task_suite_file + ): + from evolution.skills.evolve_skill import main + runner = CliRunner() + with patch("evolution.skills.evolve_skill.evolve") as fake_evolve: + fake_evolve.return_value = None + result = runner.invoke(main, [ + "--skill", "some-skill", + "--closed-loop-during-evolution", str(task_suite_file), + "--closed-loop-agent-model", "gpt-4o-mini", + ]) + assert result.exit_code == 0, result.output + assert fake_evolve.call_args.kwargs["closed_loop_agent_model"] == "gpt-4o-mini" + + def test_tool_side_agent_model_flag_threads_through( + self, tmp_path, fake_hermes_repo, task_suite_file + ): + from evolution.tools.evolve_tool import main + runner = CliRunner() + with patch("evolution.tools.evolve_tool.evolve") as fake_evolve: + fake_evolve.return_value = {"ok": True} + result = runner.invoke(main, [ + "--tool", "patch", + "--manifest", str(fake_hermes_repo / "tools"), + "--closed-loop-during-evolution", str(task_suite_file), + "--closed-loop-hermes-repo", str(fake_hermes_repo), + "--closed-loop-agent-model", "gpt-4o-mini", + ]) + assert result.exit_code == 0, result.output + assert fake_evolve.call_args.kwargs["closed_loop_agent_model"] == "gpt-4o-mini" + + def test_skill_side_task_timeout_flag_threads_through( + self, tmp_path, task_suite_file + ): + from evolution.skills.evolve_skill import main + runner = CliRunner() + with patch("evolution.skills.evolve_skill.evolve") as fake_evolve: + fake_evolve.return_value = None + result = runner.invoke(main, [ + "--skill", "some-skill", + "--closed-loop-during-evolution", str(task_suite_file), + "--closed-loop-task-timeout-seconds", "300", + ]) + assert result.exit_code == 0, result.output + assert fake_evolve.call_args.kwargs["closed_loop_task_timeout_seconds"] == 300 + + def test_tool_side_task_timeout_flag_threads_through( + self, tmp_path, fake_hermes_repo, task_suite_file + ): + from evolution.tools.evolve_tool import main + runner = CliRunner() + with patch("evolution.tools.evolve_tool.evolve") as fake_evolve: + fake_evolve.return_value = {"ok": True} + result = runner.invoke(main, [ + "--tool", "patch", + "--manifest", str(fake_hermes_repo / "tools"), + "--closed-loop-during-evolution", str(task_suite_file), + "--closed-loop-hermes-repo", str(fake_hermes_repo), + "--closed-loop-task-timeout-seconds", "300", + ]) + assert result.exit_code == 0, result.output + assert fake_evolve.call_args.kwargs["closed_loop_task_timeout_seconds"] == 300 class TestClosedLoopModeFlag: diff --git a/tests/validation/test_hermes_runner.py b/tests/validation/test_hermes_runner.py index 8bc066f5..c56faa9e 100644 --- a/tests/validation/test_hermes_runner.py +++ b/tests/validation/test_hermes_runner.py @@ -292,6 +292,60 @@ def _fake_run(*args, **kwargs): assert sandbox_skills_state["present"] is True assert "evolved body" in sandbox_skills_state["text"] + def test_model_override_passes_minus_m_flag(self, fixture_dir, tmp_path): + """When model is set, hermes is invoked with `hermes -m MODEL -z ...`.""" + runner = HermesAgentRunner( + user_config_path=tmp_path / "nonexistent", + model="gpt-4o-mini", + ) + captured: dict = {} + + def _fake_run(*args, **kwargs): + captured["args"] = args[0] if args else kwargs.get("args") + sandbox = Path(kwargs["env"]["HERMES_HOME"]) + (sandbox / "sessions").mkdir(exist_ok=True) + _write_session( + sandbox / "sessions" / "session_test.json", + [{"role": "assistant", "content": "ok"}], + ) + return type("CP", (), {"returncode": 0, "stdout": "", "stderr": ""})() + + with patch("evolution.validation.hermes_runner.subprocess.run", side_effect=_fake_run): + runner.run(TaskRunContext( + user_message="debug", + fixture_dir=fixture_dir, + )) + + # -m must appear before -z so hermes parses it as a global flag, + # not as part of the -z message. + assert captured["args"][:4] == ["hermes", "-m", "gpt-4o-mini", "-z"] + assert captured["args"][4] == "debug" + + def test_model_none_omits_minus_m_flag(self, fixture_dir, tmp_path): + """No model override → original argv shape (no -m), preserves the + existing behavior bit-for-bit for callers that don't opt in.""" + runner = HermesAgentRunner(user_config_path=tmp_path / "nonexistent") + captured: dict = {} + + def _fake_run(*args, **kwargs): + captured["args"] = args[0] if args else kwargs.get("args") + sandbox = Path(kwargs["env"]["HERMES_HOME"]) + (sandbox / "sessions").mkdir(exist_ok=True) + _write_session( + sandbox / "sessions" / "session_test.json", + [{"role": "assistant", "content": "ok"}], + ) + return type("CP", (), {"returncode": 0, "stdout": "", "stderr": ""})() + + with patch("evolution.validation.hermes_runner.subprocess.run", side_effect=_fake_run): + runner.run(TaskRunContext( + user_message="hello", + fixture_dir=fixture_dir, + )) + + assert "-m" not in captured["args"] + assert captured["args"] == ["hermes", "-z", "hello"] + def test_skills_src_none_means_no_skills_dir_created(self, fixture_dir, tmp_path): """Tool-side runs (no skills_src) must not create an empty skills/ directory in the sandbox — keeps the legacy code path bit-for-bit.""" diff --git a/tests/validation/test_systematic_debugging_advanced_suite.py b/tests/validation/test_systematic_debugging_advanced_suite.py new file mode 100644 index 00000000..a0b6e874 --- /dev/null +++ b/tests/validation/test_systematic_debugging_advanced_suite.py @@ -0,0 +1,176 @@ +"""Sanity tests for the advanced systematic-debugging planted-bug suite. + +Same shape as test_systematic_debugging_suite.py — proves every planted +bug is real (baseline test exits non-zero) and every known fix passes +(test exits zero after applying it). Without these guards, a too-permissive +assertion would silently invalidate every closed-loop run that uses this +suite. + +The advanced suite exists for closed-loop runs where the textbook suite +saturates against a capable agent model — these bugs require structured +debugging (the spec edge case the obvious patch misses) and a "just try +edits" methodology fails on them. +""" + +from __future__ import annotations + +import shlex +import subprocess +from pathlib import Path + +import pytest + +from evolution.validation.task import TaskSuite + + +SUITE_PATH = ( + Path(__file__).resolve().parents[2] + / "evolution" + / "validation" + / "suites" + / "systematic_debugging_advanced.jsonl" +) + + +# Known-good fix per task_id. Each value replaces solution.py wholesale. +_KNOWN_FIXES: dict[str, str] = { + "debug_generator_exhaustion": ( + "def count_partitions(items, predicate):\n" + " items = list(items)\n" + " n_true = sum(1 for x in items if predicate(x))\n" + " n_false = sum(1 for x in items if not predicate(x))\n" + " return (n_true, n_false)\n" + ), + "debug_shared_mutable_return": ( + "_history_cache = {}\n" + "\n" + "def get_history(user_id):\n" + " if user_id not in _history_cache:\n" + " _history_cache[user_id] = []\n" + " return list(_history_cache[user_id])\n" + ), + "debug_float_precision_equality": ( + "import math\n" + "\n" + "def is_target_amount(payments, target):\n" + " return math.isclose(sum(payments), target, abs_tol=1e-6)\n" + ), + "debug_binary_search_boundary": ( + "def leftmost_insert(sorted_arr, target):\n" + " lo, hi = 0, len(sorted_arr)\n" + " while lo < hi:\n" + " mid = (lo + hi) // 2\n" + " if sorted_arr[mid] < target:\n" + " lo = mid + 1\n" + " else:\n" + " hi = mid\n" + " return lo\n" + ), + "debug_class_vs_instance_attribute": ( + "class TokenBucket:\n" + " def __init__(self, capacity):\n" + " self.capacity = capacity\n" + " self.consumed = []\n" + "\n" + " def take(self, n):\n" + " if sum(self.consumed) + n > self.capacity:\n" + " raise ValueError('over capacity')\n" + " self.consumed.append(n)\n" + "\n" + " def used(self):\n" + " return sum(self.consumed)\n" + ), +} + + +@pytest.fixture(scope="module") +def suite() -> TaskSuite: + return TaskSuite.from_jsonl(SUITE_PATH) + + +class TestSuiteShape: + def test_loads_five_tasks(self, suite): + assert len(suite.tasks) == 5 + + def test_every_task_has_test_command(self, suite): + for task in suite.tasks: + assert task.test_command, f"{task.task_id}: missing test_command" + + def test_every_task_has_solution_and_test_files(self, suite): + for task in suite.tasks: + files = set(task.fixture_setup.keys()) + assert "solution.py" in files, f"{task.task_id}: no solution.py" + assert "test_solution.py" in files, f"{task.task_id}: no test_solution.py" + + def test_every_task_id_has_a_known_fix(self, suite): + suite_ids = {t.task_id for t in suite.tasks} + fix_ids = set(_KNOWN_FIXES.keys()) + assert suite_ids == fix_ids, ( + f"task ids and known-fix ids out of sync: " + f"missing fixes={suite_ids - fix_ids}, " + f"extra fixes={fix_ids - suite_ids}" + ) + + def test_no_task_id_collision_with_basic_suite(self, suite): + # Sanity guard: basic + advanced suites are separate files but + # behavioral examples key by task_id. Collisions would silently + # overwrite verdicts in the cache. Confirm zero overlap. + basic_path = SUITE_PATH.parent / "systematic_debugging.jsonl" + basic = TaskSuite.from_jsonl(basic_path) + basic_ids = {t.task_id for t in basic.tasks} + advanced_ids = {t.task_id for t in suite.tasks} + assert not (basic_ids & advanced_ids), ( + f"task_id collision between suites: {basic_ids & advanced_ids}" + ) + + +class TestPlantedBugsAreReal: + @pytest.mark.parametrize( + "task_id", + list(_KNOWN_FIXES.keys()), + ids=list(_KNOWN_FIXES.keys()), + ) + def test_baseline_buggy_code_fails(self, task_id, tmp_path, suite): + task = next(t for t in suite.tasks if t.task_id == task_id) + _materialize(task.fixture_setup, tmp_path) + result = subprocess.run( + shlex.split(task.test_command), + cwd=tmp_path, + capture_output=True, + text=True, + timeout=15, + ) + assert result.returncode != 0, ( + f"{task_id}: baseline buggy code unexpectedly passed the test.\n" + f"stdout: {result.stdout}\nstderr: {result.stderr}" + ) + + +class TestKnownFixesPass: + @pytest.mark.parametrize( + "task_id,fixed_solution", + list(_KNOWN_FIXES.items()), + ids=list(_KNOWN_FIXES.keys()), + ) + def test_known_fix_passes(self, task_id, fixed_solution, tmp_path, suite): + task = next(t for t in suite.tasks if t.task_id == task_id) + _materialize(task.fixture_setup, tmp_path) + (tmp_path / "solution.py").write_text(fixed_solution) + result = subprocess.run( + shlex.split(task.test_command), + cwd=tmp_path, + capture_output=True, + text=True, + timeout=15, + ) + assert result.returncode == 0, ( + f"{task_id}: known fix did not make the test pass.\n" + f"stdout: {result.stdout}\nstderr: {result.stderr}" + ) + + +def _materialize(setup: dict[str, str], target: Path) -> None: + for rel, content in setup.items(): + dest = target / rel + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_text(content)