diff --git a/docs/workflows.md b/docs/workflows.md
index 01706618..0ddd4451 100644
--- a/docs/workflows.md
+++ b/docs/workflows.md
@@ -488,7 +488,19 @@ Per-task scores are deterministic over candidate text (cache is keyed by `sha256
 
 Default `--closed-loop-mode` is `feedback` (not `trainset`) on the skill side. Skill bodies mutate heavily, so the `gate_mode="always"` that trainset needs would fire the validator on every novel candidate — N tasks × 2 phases per fire. Opt into `trainset` / `both` explicitly when the cost is acceptable.
 
-Reference suite: `evolution/validation/suites/systematic_debugging.jsonl` (5 planted-bug tasks). Manual smoke harness: `tests/manual/skill_closed_loop_smoke.py`.
+Reference suites:
+- `evolution/validation/suites/systematic_debugging.jsonl` — 5 textbook bugs; good for verifying the wiring works.
+- `evolution/validation/suites/systematic_debugging_advanced.jsonl` — 5 harder bugs (generator exhaustion, shared mutable return, float-precision equality, leftmost-insert boundary, class-vs-instance attribute) designed to discriminate skill-text variants on capable agent models that saturate the basic suite at 5/5.
+
+When your daily-driver Hermes model is capable enough to solve every textbook bug regardless of skill text, the planted-bug verdict adds no signal. Three knobs to recover discrimination:
+
+- `--closed-loop-during-evolution .../systematic_debugging_advanced.jsonl` — use the harder bugs (different cognitive failure modes).
+- `--closed-loop-agent-model MODEL` — run the validator's agent against a different model than your `~/.hermes/config.yaml` default. Hermes sends `include: ['reasoning.encrypted_content']` so the model must be a reasoning model (o1-family, o3-family, o4-mini, gpt-5.x-family); non-reasoning models reject the request.
+- `--closed-loop-task-timeout-seconds N` — bump the per-task wall-clock budget. The default is 120s; most reasoning models other than the smallest take 200–300s per debugging task and would otherwise abstain (timeout) without recording a verdict.
+
+**Empirical caveat from validation.** Both suites saturate at 5/5 against capable reasoning models (`gpt-5.4-mini` saturated both; `o3-mini` was slow enough to abstain most tasks at the default timeout). For a setup where the user's default model handles textbook Python debugging easily, the closed-loop signal on this domain may be uninformative regardless of skill text — the agent's raw capability dominates. Real headroom likely needs evaluation surfaces where methodology matters more than recognition: multi-file refactoring, ambiguous specs with edge cases the agent must enumerate, tasks requiring iterative hypothesis-testing across multiple test runs.
+
+Manual smoke harness: `tests/manual/skill_closed_loop_smoke.py` (supports `--suite {basic,advanced}`, `--agent-model MODEL`, `--task-timeout-seconds N`).
 
 ## Failure-mode summary
 
diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
index 80498848..2dbdfd05 100644
--- a/evolution/skills/evolve_skill.py
+++ b/evolution/skills/evolve_skill.py
@@ -471,6 +471,8 @@ def _maybe_build_closed_loop_cache_skill(
     min_iters: int,
     window_size: int,
     gate_mode: str = "sampled",
+    agent_model: Optional[str] = None,
+    agent_timeout_seconds: Optional[int] = None,
 ):
     """Build a ClosedLoopFeedbackCache for the skill path; return None when disabled.
 
@@ -506,7 +508,10 @@ def _maybe_build_closed_loop_cache_skill(
         skill_name=skill_name,
         workdir=workdir,
     )
-    runner = HermesAgentRunner()
+    runner_kwargs: dict = {"model": agent_model}
+    if agent_timeout_seconds is not None:
+        runner_kwargs["timeout_seconds"] = agent_timeout_seconds
+    runner = HermesAgentRunner(**runner_kwargs)
     validator = ClosedLoopValidator(installer=installer, runner=runner)
     suite = TaskSuite.from_jsonl(suite_path)
     return ClosedLoopFeedbackCache(
@@ -599,6 +604,8 @@ def evolve(
     closed_loop_window_size: int = 8,
     closed_loop_mode: str = "feedback",
     closed_loop_in_valset: bool = False,
+    closed_loop_agent_model: Optional[str] = None,
+    closed_loop_task_timeout_seconds: Optional[int] = None,
 ):
     """Main evolution function — orchestrates the full optimization loop."""
 
@@ -835,6 +842,8 @@ def evolve(
                 min_iters=closed_loop_min_iters,
                 window_size=closed_loop_window_size,
                 gate_mode=_cache_gate_mode,
+                agent_model=closed_loop_agent_model,
+                agent_timeout_seconds=closed_loop_task_timeout_seconds,
             )
 
             # Build the metric once: DSPy's LM cache lines up across GEPA's
@@ -1547,6 +1556,31 @@ def evolve(
          "scoring). Costs more — each accepted candidate triggers another full "
          "eval pass over the behavioral examples. Default off.",
 )
+@click.option(
+    "--closed-loop-agent-model",
+    "closed_loop_agent_model",
+    default=None,
+    type=str,
+    help="Override the agent model the closed-loop validator runs `hermes -z` "
+         "with (passed as `hermes -m MODEL -z ...`). When unset, the validator "
+         "uses whatever's in your ~/.hermes/config.yaml. Useful when your "
+         "daily-driver Hermes model is so capable it saturates the planted-bug "
+         "suite at 100%, hiding the behavioral signal closed-loop is supposed "
+         "to surface — run validation against a weaker model without touching "
+         "your config.",
+)
+@click.option(
+    "--closed-loop-task-timeout-seconds",
+    "closed_loop_task_timeout_seconds",
+    default=None,
+    type=click.IntRange(min=1),
+    help="Per-task wall-clock budget for the closed-loop validator's `hermes -z` "
+         "subprocess (default 120s). Bump when --closed-loop-agent-model selects "
+         "a slow reasoning model that doesn't finish within the default — most "
+         "OpenAI reasoning models (o1-family, o3-family) take 60-180s per "
+         "debugging task. Hitting the timeout abstains the task verdict rather "
+         "than failing it, so over-tight values silently produce no-signal runs.",
+)
 def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflection_model,
          eval_model, skill_source_dir, dry_run, seed, budget, no_fallback,
          quality_gate, growth_free_threshold,
@@ -1563,7 +1597,9 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
          closed_loop_min_iters,
          closed_loop_window_size,
          closed_loop_mode,
-         closed_loop_in_valset):
+         closed_loop_in_valset,
+         closed_loop_agent_model,
+         closed_loop_task_timeout_seconds):
     """Evolve an agent skill using DSPy + GEPA optimization."""
     try:
         evolve(
@@ -1607,6 +1643,8 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
             closed_loop_window_size=closed_loop_window_size,
             closed_loop_mode=closed_loop_mode,
             closed_loop_in_valset=closed_loop_in_valset,
+            closed_loop_agent_model=closed_loop_agent_model,
+            closed_loop_task_timeout_seconds=closed_loop_task_timeout_seconds,
         )
     except HermesProviderError as exc:
         # Render a clean error panel instead of dumping a Python traceback
diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index 31c39cc6..04b8fd90 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -260,6 +260,8 @@ def _maybe_build_closed_loop_cache(
     min_iters: int,
     window_size: int,
     gate_mode: str = "sampled",
+    agent_model: Optional[str] = None,
+    agent_timeout_seconds: Optional[int] = None,
 ):
     """Build a ClosedLoopFeedbackCache when the user opted in, else None.
 
@@ -286,7 +288,10 @@ def _maybe_build_closed_loop_cache(
     installer = HermesToolDescriptionInstaller(
         hermes_repo=hermes_repo, tool_name=tool_name
     )
-    runner = HermesAgentRunner()
+    runner_kwargs: dict = {"model": agent_model}
+    if agent_timeout_seconds is not None:
+        runner_kwargs["timeout_seconds"] = agent_timeout_seconds
+    runner = HermesAgentRunner(**runner_kwargs)
     validator = ClosedLoopValidator(installer=installer, runner=runner)
     suite = TaskSuite.from_jsonl(suite_path)
     return ClosedLoopFeedbackCache(
@@ -359,6 +364,8 @@ def evolve(
     closed_loop_window_size: int = 8,
     closed_loop_mode: str = "feedback",
     closed_loop_in_valset: bool = False,
+    closed_loop_agent_model: Optional[str] = None,
+    closed_loop_task_timeout_seconds: Optional[int] = None,
     skip_preflight: bool = False,
     skip_cost_suggest: bool = False,
 ) -> dict[str, Any]:
@@ -592,6 +599,8 @@ def evolve(
                 min_iters=closed_loop_min_iters,
                 window_size=closed_loop_window_size,
                 gate_mode=cache_gate_mode,
+                agent_model=closed_loop_agent_model,
+                agent_timeout_seconds=closed_loop_task_timeout_seconds,
             )
             metric = make_tool_fitness_metric(
                 judge=judge,
@@ -1187,6 +1196,30 @@ def evolve(
          "scoring). Costs more — each accepted candidate triggers another full "
          "eval pass over the behavioral examples. Default off.",
 )
+@click.option(
+    "--closed-loop-agent-model",
+    "closed_loop_agent_model",
+    default=None,
+    type=str,
+    help="Override the agent model the closed-loop validator runs `hermes -z` "
+         "with (passed as `hermes -m MODEL -z ...`). When unset, the validator "
+         "uses whatever's in your ~/.hermes/config.yaml. Useful when your "
+         "daily-driver Hermes model saturates the planted-bug suite at 100%, "
+         "hiding the behavioral signal — run validation against a weaker model "
+         "without touching your config.",
+)
+@click.option(
+    "--closed-loop-task-timeout-seconds",
+    "closed_loop_task_timeout_seconds",
+    default=None,
+    type=click.IntRange(min=1),
+    help="Per-task wall-clock budget for the closed-loop validator's `hermes -z` "
+         "subprocess (default 120s). Bump when --closed-loop-agent-model selects "
+         "a slow reasoning model that doesn't finish within the default — most "
+         "OpenAI reasoning models (o1-family, o3-family) take 60-180s per "
+         "debugging task. Hitting the timeout abstains the task verdict rather "
+         "than failing it, so over-tight values silently produce no-signal runs.",
+)
 def main(
     tool_name: str,
     manifest_path: Path,
@@ -1212,6 +1245,8 @@ def main(
     closed_loop_window_size: int,
     closed_loop_mode: str,
     closed_loop_in_valset: bool,
+    closed_loop_agent_model: Optional[str],
+    closed_loop_task_timeout_seconds: Optional[int],
 ) -> None:
     """Evolve one tool description in an MCP manifest using DSPy + GEPA."""
     if apply_flag and patch_flag:
@@ -1249,6 +1284,8 @@ def main(
             closed_loop_window_size=closed_loop_window_size,
             closed_loop_mode=closed_loop_mode,
             closed_loop_in_valset=closed_loop_in_valset,
+            closed_loop_agent_model=closed_loop_agent_model,
+            closed_loop_task_timeout_seconds=closed_loop_task_timeout_seconds,
             skip_preflight=skip_preflight,
             skip_cost_suggest=skip_cost_suggest,
         )
diff --git a/evolution/validation/hermes_runner.py b/evolution/validation/hermes_runner.py
index 9c8940b5..4a4dacd1 100644
--- a/evolution/validation/hermes_runner.py
+++ b/evolution/validation/hermes_runner.py
@@ -46,6 +46,7 @@ def __init__(
         hermes_command: str = "hermes",
         timeout_seconds: int = DEFAULT_TASK_TIMEOUT_SECONDS,
         user_config_path: Optional[Path] = None,
+        model: Optional[str] = None,
     ) -> None:
         self.hermes_command = hermes_command
         self.timeout_seconds = timeout_seconds
@@ -56,6 +57,13 @@ def __init__(
             if user_config_path is not None
             else Path.home() / ".hermes" / "config.yaml"
         )
+        # Optional per-invocation model override (passed as ``hermes -z -m
+        # <model>``). When unset, Hermes uses whatever is configured in
+        # the sandboxed ``config.yaml``. Useful for closed-loop validation
+        # against a deliberately weaker agent model than the user's
+        # daily-driver default — saturation on capable models hides
+        # behavioral signal that a weaker model would expose.
+        self.model = model
 
     def run(self, ctx: TaskRunContext) -> AgentRunResult:
         message = ctx.user_message
@@ -68,10 +76,15 @@ def run(self, ctx: TaskRunContext) -> AgentRunResult:
                 "HOME": str(sandbox),
                 **ctx.extra_env,
             }
+            argv = [self.hermes_command, "-z", message]
+            if self.model is not None:
+                # Insert before the -z so hermes parses it as a global flag,
+                # not as part of the -z prompt value.
+                argv = [self.hermes_command, "-m", self.model, "-z", message]
             start = time.time()
             try:
                 subprocess.run(
-                    [self.hermes_command, "-z", message],
+                    argv,
                     env=env,
                     cwd=str(ctx.fixture_dir),
                     capture_output=True,
diff --git a/evolution/validation/suites/systematic_debugging_advanced.jsonl b/evolution/validation/suites/systematic_debugging_advanced.jsonl
new file mode 100644
index 00000000..5a1939db
--- /dev/null
+++ b/evolution/validation/suites/systematic_debugging_advanced.jsonl
@@ -0,0 +1,22 @@
+# Advanced planted-bug suite for the `systematic-debugging` skill.
+#
+# Companion to systematic_debugging.jsonl (5 textbook bugs). This suite
+# adds 5 bugs that capable reasoning models often "fix" wrong on the
+# first pass — the obvious patch makes the failing test green but
+# breaks another, or the bug is in the interaction between two pieces
+# of seemingly-correct code rather than in one wrong line.
+#
+# Each bug exercises a different cognitive failure mode:
+#   1. iteration:  generator exhaustion (function iterates twice)
+#   2. state:      shared mutable returned across calls
+#   3. precision:  float equality after computation introduces drift
+#   4. algorithm:  binary search boundary case for leftmost insertion
+#   5. OOP:        class attribute used where instance attribute intended
+#
+# These survive a "just try edits until the test passes" approach
+# because the spec demands edge cases the obvious patch doesn't cover.
+{"task_id": "debug_generator_exhaustion", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def count_partitions(items, predicate):\n    \"\"\"Return (n_true, n_false): how many items satisfy predicate vs not.\n\n    Must work for any iterable input including single-pass generators.\n    \"\"\"\n    n_true = sum(1 for x in items if predicate(x))\n    n_false = sum(1 for x in items if not predicate(x))\n    return (n_true, n_false)\n", "test_solution.py": "from solution import count_partitions\n\n# Works on lists — easy case.\nassert count_partitions([1, 2, 3, 4], lambda x: x > 2) == (2, 2)\nassert count_partitions([], lambda x: x > 0) == (0, 0)\n\n# Must also work on single-pass generators — the spec is explicit.\ndef gen():\n    yield from [1, 2, 3, 4, 5]\nassert count_partitions(gen(), lambda x: x > 2) == (3, 2), count_partitions(gen(), lambda x: x > 2)\n\n# And on a generator of strings.\ndef words():\n    yield from ['apple', 'banana', 'cherry', 'kiwi']\nassert count_partitions(words(), lambda s: len(s) > 5) == (2, 2), count_partitions(words(), lambda s: len(s) > 5)\n\nprint('PASS')\n"}, "test_command": "python test_solution.py"}
+{"task_id": "debug_shared_mutable_return", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "_history_cache = {}\n\ndef get_history(user_id):\n    \"\"\"Return the user's history list. Each call must return an INDEPENDENT\n    list — mutating the returned list must NOT affect subsequent calls.\n    \"\"\"\n    if user_id not in _history_cache:\n        _history_cache[user_id] = []\n    return _history_cache[user_id]\n", "test_solution.py": "from solution import get_history\n\n# First call returns empty.\nh1 = get_history('alice')\nassert h1 == [], h1\n\n# Caller mutates the returned list (logging some action).\nh1.append('logged_in')\n\n# Second call for SAME user must return a fresh empty list per the spec —\n# mutation of h1 must not leak into the cached history visible to other callers.\nh2 = get_history('alice')\nassert h2 == [], f'mutation leaked: {h2}'\n\n# Independent users are independent.\nassert get_history('bob') == []\n\nprint('PASS')\n"}, "test_command": "python test_solution.py"}
+{"task_id": "debug_float_precision_equality", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def is_target_amount(payments, target):\n    \"\"\"Return True iff the sum of payments equals target. Payments come\n    from upstream float arithmetic so the sum may have float-precision\n    drift on the order of 1e-9; treat amounts within 1e-6 of target as equal.\n    \"\"\"\n    return sum(payments) == target\n", "test_solution.py": "from solution import is_target_amount\n\n# Exact case: trivially true.\nassert is_target_amount([1.0, 2.0, 3.0], 6.0) is True\n\n# Float-drift cases: each computes a sum that does NOT exactly equal the\n# target due to IEEE 754, but is within the 1e-6 tolerance the spec allows.\nassert is_target_amount([0.1, 0.1, 0.1], 0.3) is True, sum([0.1, 0.1, 0.1]) - 0.3\nassert is_target_amount([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1], 0.7) is True, sum([0.1]*7) - 0.7\nassert is_target_amount([0.2, 0.2, 0.2], 0.6) is True, sum([0.2]*3) - 0.6\n\n# Genuinely different amounts: must still return False.\nassert is_target_amount([1.0, 2.0], 5.0) is False\nassert is_target_amount([0.1, 0.1], 0.3) is False\n\nprint('PASS')\n"}, "test_command": "python test_solution.py"}
+{"task_id": "debug_binary_search_boundary", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def leftmost_insert(sorted_arr, target):\n    \"\"\"Return the leftmost index where target could be inserted to keep\n    sorted_arr sorted. Equivalent to bisect.bisect_left.\n\n    For target equal to existing elements, returns the FIRST such index.\n    For target smaller than all elements, returns 0.\n    For target larger than all elements, returns len(sorted_arr).\n    \"\"\"\n    lo, hi = 0, len(sorted_arr)\n    while lo < hi:\n        mid = (lo + hi) // 2\n        if sorted_arr[mid] <= target:\n            lo = mid + 1\n        else:\n            hi = mid\n    return lo\n", "test_solution.py": "from solution import leftmost_insert\n\n# Middle insertion — buggy code usually returns the right answer here.\nassert leftmost_insert([1, 3, 5, 7], 4) == 2, leftmost_insert([1, 3, 5, 7], 4)\n\n# Target equals an existing element — must return the LEFTMOST occurrence.\n# The buggy code returns the RIGHTMOST position (bisect_right behavior).\nassert leftmost_insert([1, 2, 2, 2, 3], 2) == 1, leftmost_insert([1, 2, 2, 2, 3], 2)\nassert leftmost_insert([2, 2, 2], 2) == 0, leftmost_insert([2, 2, 2], 2)\n\n# Boundary: target smaller than everything → 0.\nassert leftmost_insert([5, 6, 7], 1) == 0, leftmost_insert([5, 6, 7], 1)\n\n# Boundary: target larger than everything → len.\nassert leftmost_insert([5, 6, 7], 99) == 3, leftmost_insert([5, 6, 7], 99)\n\n# Empty array.\nassert leftmost_insert([], 5) == 0, leftmost_insert([], 5)\n\n# Single-element array, target matches → 0 (leftmost).\nassert leftmost_insert([5], 5) == 0, leftmost_insert([5], 5)\n\nprint('PASS')\n"}, "test_command": "python test_solution.py"}
+{"task_id": "debug_class_vs_instance_attribute", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "class TokenBucket:\n    \"\"\"Per-instance rate-limit bucket. Each instance must track its OWN\n    consumed tokens independently — instances must not share state.\n    \"\"\"\n\n    consumed = []\n\n    def __init__(self, capacity):\n        self.capacity = capacity\n\n    def take(self, n):\n        \"\"\"Record consumption of n tokens; raise if it would exceed capacity.\"\"\"\n        if sum(self.consumed) + n > self.capacity:\n            raise ValueError('over capacity')\n        self.consumed.append(n)\n\n    def used(self):\n        return sum(self.consumed)\n", "test_solution.py": "from solution import TokenBucket\n\n# Single bucket: basic behavior works.\nb1 = TokenBucket(capacity=10)\nb1.take(3)\nb1.take(4)\nassert b1.used() == 7, b1.used()\n\n# Two independent buckets must NOT share state.\nb2 = TokenBucket(capacity=10)\nassert b2.used() == 0, f'fresh bucket should be empty, got {b2.used()}'\nb2.take(2)\nassert b2.used() == 2, b2.used()\n\n# And b1's state must be unaffected by b2's activity.\nassert b1.used() == 7, f'b1 corrupted by b2 activity: {b1.used()}'\n\n# Third bucket, different capacity — definitely independent.\nb3 = TokenBucket(capacity=5)\nassert b3.used() == 0, f'b3 should be empty, got {b3.used()}'\n\nprint('PASS')\n"}, "test_command": "python test_solution.py"}
diff --git a/tests/manual/skill_closed_loop_smoke.py b/tests/manual/skill_closed_loop_smoke.py
index df398d47..b34b617e 100644
--- a/tests/manual/skill_closed_loop_smoke.py
+++ b/tests/manual/skill_closed_loop_smoke.py
@@ -6,19 +6,20 @@
   the real planted-bug suite, so a regression in (e.g.) how Hermes
   discovers skills inside the per-task sandbox, or how `python
   test_solution.py` actually scores in the validator, would slip
-  through CI. This smoke runs the entire closed-loop layer end-to-end
-  against:
-    - the checked-in fake target skill at tests/fixtures/skills/systematic_debugging/
-    - the checked-in planted-bug suite at evolution/validation/suites/systematic_debugging.jsonl
-    - whatever Hermes is configured with (uses the user's real ~/.hermes/config.yaml)
+  through CI. This smoke runs the entire closed-loop layer end-to-end.
 
   Drives one ClosedLoopValidator.validate() call directly — 5 tasks ×
-  2 phases (baseline + evolved) = 10 hermes -z invocations. With a
-  cheap eval model (gpt-4o-mini-ish) this is pennies, not dollars.
+  2 phases (baseline + evolved) = 10 hermes -z invocations.
 
 How to run:
+  # Wiring sanity (basic textbook bugs, uses your Hermes default model)
   uv run python tests/manual/skill_closed_loop_smoke.py
 
+  # Headroom validation: harder bugs + weaker model so the planted-bug
+  # verdicts don't all saturate at 5/5 on capable agents
+  uv run python tests/manual/skill_closed_loop_smoke.py \\
+      --suite advanced --agent-model gpt-4o-mini
+
   Exits 0 on success. Not part of CI — heavyweight (drives real
   hermes -z subprocesses + real LM spend).
 
@@ -29,6 +30,7 @@
 
 from __future__ import annotations
 
+import argparse
 import shutil
 import sys
 import tempfile
@@ -43,9 +45,10 @@
 FAKE_SKILL_PATH = (
     REPO_ROOT / "tests" / "fixtures" / "skills" / "systematic_debugging" / "SKILL.md"
 )
-SUITE_PATH = (
-    REPO_ROOT / "evolution" / "validation" / "suites" / "systematic_debugging.jsonl"
-)
+SUITE_PATHS = {
+    "basic": REPO_ROOT / "evolution" / "validation" / "suites" / "systematic_debugging.jsonl",
+    "advanced": REPO_ROOT / "evolution" / "validation" / "suites" / "systematic_debugging_advanced.jsonl",
+}
 SKILL_NAME = "systematic_debugging"
 
 
@@ -87,12 +90,42 @@ def _section(title: str) -> None:
 
 
 def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
+    parser.add_argument(
+        "--suite",
+        choices=sorted(SUITE_PATHS),
+        default="basic",
+        help="Which planted-bug suite to run against. `basic` (default) is "
+             "the textbook bugs — proves wiring works on any model. `advanced` "
+             "is harder bugs designed to discriminate skill-text variants on "
+             "capable agent models.",
+    )
+    parser.add_argument(
+        "--agent-model",
+        default=None,
+        help="Override the model hermes -z runs with (passed as `hermes -m MODEL`). "
+             "Use when your default Hermes model saturates the suite at 5/5 and "
+             "you want to see the planted bugs actually fail on baseline.",
+    )
+    parser.add_argument(
+        "--task-timeout-seconds",
+        type=int,
+        default=None,
+        help="Per-task wall-clock budget (default 120s). Bump when --agent-model "
+             "selects a slow reasoning model (o1/o3-family typically need 200-300s "
+             "per debugging task). Timeouts abstain the task verdict rather than "
+             "failing it, so under-sized values silently produce no-signal runs.",
+    )
+    args = parser.parse_args()
+
+    suite_path = SUITE_PATHS[args.suite]
+
     _section("Pre-flight checks")
     if not FAKE_SKILL_PATH.is_file():
         print(f"  ✗ Missing fixture skill at {FAKE_SKILL_PATH}")
         return 1
-    if not SUITE_PATH.is_file():
-        print(f"  ✗ Missing suite at {SUITE_PATH}")
+    if not suite_path.is_file():
+        print(f"  ✗ Missing suite at {suite_path}")
         return 1
     if shutil.which("hermes") is None:
         print(
@@ -104,8 +137,16 @@ def main() -> int:
         print("  ✗ `python` not on PATH. Suite tasks use `python test_solution.py`.")
         return 1
     print(f"  ✓ Fixture skill present: {FAKE_SKILL_PATH}")
-    print(f"  ✓ Suite present: {SUITE_PATH}")
+    print(f"  ✓ Suite ({args.suite}) present: {suite_path}")
     print(f"  ✓ hermes binary present: {shutil.which('hermes')}")
+    if args.agent_model:
+        print(f"  ✓ Agent model override: {args.agent_model}")
+    else:
+        print(f"  ✓ Agent model: <Hermes config default>")
+    if args.task_timeout_seconds:
+        print(f"  ✓ Task timeout override: {args.task_timeout_seconds}s")
+    else:
+        print(f"  ✓ Task timeout: 120s (default)")
 
     _section("Constructing closed-loop cache")
     from evolution.skills.evolve_skill import _maybe_build_closed_loop_cache_skill
@@ -125,11 +166,13 @@ def main() -> int:
         skill_name=SKILL_NAME,
         skill_path=FAKE_SKILL_PATH,
         baseline_skill_body=skill["body"],
-        suite_path=SUITE_PATH,
+        suite_path=suite_path,
         saturation_threshold=0.95,
         min_iters=1,
         window_size=4,
         gate_mode="always",  # force the validator to fire
+        agent_model=args.agent_model,
+        agent_timeout_seconds=args.task_timeout_seconds,
     )
     assert cache is not None, "cache should be constructed when suite_path is set"
     print(f"  ✓ Cache constructed (gate_mode={cache.gate_mode})")
diff --git a/tests/skills/test_evolve_skill_closed_loop.py b/tests/skills/test_evolve_skill_closed_loop.py
index 91dc4d99..f1f40224 100644
--- a/tests/skills/test_evolve_skill_closed_loop.py
+++ b/tests/skills/test_evolve_skill_closed_loop.py
@@ -158,6 +158,64 @@ def test_gate_mode_always_propagates(self, fake_skill_path):
         )
         assert cache.gate_mode == "always"
 
+    def test_agent_model_plumbed_into_runner(self, fake_skill_path):
+        # The closed-loop validator's runner picks up the agent_model
+        # override so `hermes -z -m MODEL` runs instead of using the
+        # user's ~/.hermes/config.yaml default.
+        cache = _maybe_build_closed_loop_cache_skill(
+            skill_name="systematic_debugging",
+            skill_path=fake_skill_path,
+            baseline_skill_body="body",
+            suite_path=_SUITE_FIXTURE,
+            saturation_threshold=0.95,
+            min_iters=3,
+            window_size=8,
+            agent_model="gpt-4o-mini",
+        )
+        assert cache._validator.runner.model == "gpt-4o-mini"
+
+    def test_agent_model_none_leaves_runner_model_none(self, fake_skill_path):
+        # No override → runner.model is None and hermes uses config default.
+        cache = _maybe_build_closed_loop_cache_skill(
+            skill_name="systematic_debugging",
+            skill_path=fake_skill_path,
+            baseline_skill_body="body",
+            suite_path=_SUITE_FIXTURE,
+            saturation_threshold=0.95,
+            min_iters=3,
+            window_size=8,
+        )
+        assert cache._validator.runner.model is None
+
+    def test_agent_timeout_seconds_plumbed_into_runner(self, fake_skill_path):
+        # Slow reasoning models (o1/o3-family) need more than the 120s default.
+        cache = _maybe_build_closed_loop_cache_skill(
+            skill_name="systematic_debugging",
+            skill_path=fake_skill_path,
+            baseline_skill_body="body",
+            suite_path=_SUITE_FIXTURE,
+            saturation_threshold=0.95,
+            min_iters=3,
+            window_size=8,
+            agent_timeout_seconds=300,
+        )
+        assert cache._validator.runner.timeout_seconds == 300
+
+    def test_agent_timeout_none_keeps_runner_default(self, fake_skill_path):
+        # No override → runner uses its DEFAULT_TASK_TIMEOUT_SECONDS (120s).
+        from evolution.validation.hermes_runner import DEFAULT_TASK_TIMEOUT_SECONDS
+
+        cache = _maybe_build_closed_loop_cache_skill(
+            skill_name="systematic_debugging",
+            skill_path=fake_skill_path,
+            baseline_skill_body="body",
+            suite_path=_SUITE_FIXTURE,
+            saturation_threshold=0.95,
+            min_iters=3,
+            window_size=8,
+        )
+        assert cache._validator.runner.timeout_seconds == DEFAULT_TASK_TIMEOUT_SECONDS
+
     def test_installer_skills_src_is_under_workdir(self, fake_skill_path):
         # The cache's validator holds an installer with skills_src — confirming
         # the wiring is intact so the runner sees the candidate.
diff --git a/tests/tools/test_evolve_tool_closed_loop.py b/tests/tools/test_evolve_tool_closed_loop.py
index f0c8c42b..80ae0150 100644
--- a/tests/tools/test_evolve_tool_closed_loop.py
+++ b/tests/tools/test_evolve_tool_closed_loop.py
@@ -165,6 +165,72 @@ def test_skill_side_closed_loop_flag_passes_to_evolve(
         # Default mode is feedback for skills (skill bodies mutate heavily
         # so opting into trainset is explicit).
         assert kwargs["closed_loop_mode"] == "feedback"
+        # Default agent_model is None (use Hermes config default).
+        assert kwargs["closed_loop_agent_model"] is None
+
+    def test_skill_side_agent_model_flag_threads_through(
+        self, tmp_path, task_suite_file
+    ):
+        from evolution.skills.evolve_skill import main
+        runner = CliRunner()
+        with patch("evolution.skills.evolve_skill.evolve") as fake_evolve:
+            fake_evolve.return_value = None
+            result = runner.invoke(main, [
+                "--skill", "some-skill",
+                "--closed-loop-during-evolution", str(task_suite_file),
+                "--closed-loop-agent-model", "gpt-4o-mini",
+            ])
+        assert result.exit_code == 0, result.output
+        assert fake_evolve.call_args.kwargs["closed_loop_agent_model"] == "gpt-4o-mini"
+
+    def test_tool_side_agent_model_flag_threads_through(
+        self, tmp_path, fake_hermes_repo, task_suite_file
+    ):
+        from evolution.tools.evolve_tool import main
+        runner = CliRunner()
+        with patch("evolution.tools.evolve_tool.evolve") as fake_evolve:
+            fake_evolve.return_value = {"ok": True}
+            result = runner.invoke(main, [
+                "--tool", "patch",
+                "--manifest", str(fake_hermes_repo / "tools"),
+                "--closed-loop-during-evolution", str(task_suite_file),
+                "--closed-loop-hermes-repo", str(fake_hermes_repo),
+                "--closed-loop-agent-model", "gpt-4o-mini",
+            ])
+        assert result.exit_code == 0, result.output
+        assert fake_evolve.call_args.kwargs["closed_loop_agent_model"] == "gpt-4o-mini"
+
+    def test_skill_side_task_timeout_flag_threads_through(
+        self, tmp_path, task_suite_file
+    ):
+        from evolution.skills.evolve_skill import main
+        runner = CliRunner()
+        with patch("evolution.skills.evolve_skill.evolve") as fake_evolve:
+            fake_evolve.return_value = None
+            result = runner.invoke(main, [
+                "--skill", "some-skill",
+                "--closed-loop-during-evolution", str(task_suite_file),
+                "--closed-loop-task-timeout-seconds", "300",
+            ])
+        assert result.exit_code == 0, result.output
+        assert fake_evolve.call_args.kwargs["closed_loop_task_timeout_seconds"] == 300
+
+    def test_tool_side_task_timeout_flag_threads_through(
+        self, tmp_path, fake_hermes_repo, task_suite_file
+    ):
+        from evolution.tools.evolve_tool import main
+        runner = CliRunner()
+        with patch("evolution.tools.evolve_tool.evolve") as fake_evolve:
+            fake_evolve.return_value = {"ok": True}
+            result = runner.invoke(main, [
+                "--tool", "patch",
+                "--manifest", str(fake_hermes_repo / "tools"),
+                "--closed-loop-during-evolution", str(task_suite_file),
+                "--closed-loop-hermes-repo", str(fake_hermes_repo),
+                "--closed-loop-task-timeout-seconds", "300",
+            ])
+        assert result.exit_code == 0, result.output
+        assert fake_evolve.call_args.kwargs["closed_loop_task_timeout_seconds"] == 300
 
 
 class TestClosedLoopModeFlag:
diff --git a/tests/validation/test_hermes_runner.py b/tests/validation/test_hermes_runner.py
index 8bc066f5..c56faa9e 100644
--- a/tests/validation/test_hermes_runner.py
+++ b/tests/validation/test_hermes_runner.py
@@ -292,6 +292,60 @@ def _fake_run(*args, **kwargs):
         assert sandbox_skills_state["present"] is True
         assert "evolved body" in sandbox_skills_state["text"]
 
+    def test_model_override_passes_minus_m_flag(self, fixture_dir, tmp_path):
+        """When model is set, hermes is invoked with `hermes -m MODEL -z ...`."""
+        runner = HermesAgentRunner(
+            user_config_path=tmp_path / "nonexistent",
+            model="gpt-4o-mini",
+        )
+        captured: dict = {}
+
+        def _fake_run(*args, **kwargs):
+            captured["args"] = args[0] if args else kwargs.get("args")
+            sandbox = Path(kwargs["env"]["HERMES_HOME"])
+            (sandbox / "sessions").mkdir(exist_ok=True)
+            _write_session(
+                sandbox / "sessions" / "session_test.json",
+                [{"role": "assistant", "content": "ok"}],
+            )
+            return type("CP", (), {"returncode": 0, "stdout": "", "stderr": ""})()
+
+        with patch("evolution.validation.hermes_runner.subprocess.run", side_effect=_fake_run):
+            runner.run(TaskRunContext(
+                user_message="debug",
+                fixture_dir=fixture_dir,
+            ))
+
+        # -m must appear before -z so hermes parses it as a global flag,
+        # not as part of the -z message.
+        assert captured["args"][:4] == ["hermes", "-m", "gpt-4o-mini", "-z"]
+        assert captured["args"][4] == "debug"
+
+    def test_model_none_omits_minus_m_flag(self, fixture_dir, tmp_path):
+        """No model override → original argv shape (no -m), preserves the
+        existing behavior bit-for-bit for callers that don't opt in."""
+        runner = HermesAgentRunner(user_config_path=tmp_path / "nonexistent")
+        captured: dict = {}
+
+        def _fake_run(*args, **kwargs):
+            captured["args"] = args[0] if args else kwargs.get("args")
+            sandbox = Path(kwargs["env"]["HERMES_HOME"])
+            (sandbox / "sessions").mkdir(exist_ok=True)
+            _write_session(
+                sandbox / "sessions" / "session_test.json",
+                [{"role": "assistant", "content": "ok"}],
+            )
+            return type("CP", (), {"returncode": 0, "stdout": "", "stderr": ""})()
+
+        with patch("evolution.validation.hermes_runner.subprocess.run", side_effect=_fake_run):
+            runner.run(TaskRunContext(
+                user_message="hello",
+                fixture_dir=fixture_dir,
+            ))
+
+        assert "-m" not in captured["args"]
+        assert captured["args"] == ["hermes", "-z", "hello"]
+
     def test_skills_src_none_means_no_skills_dir_created(self, fixture_dir, tmp_path):
         """Tool-side runs (no skills_src) must not create an empty skills/
         directory in the sandbox — keeps the legacy code path bit-for-bit."""
diff --git a/tests/validation/test_systematic_debugging_advanced_suite.py b/tests/validation/test_systematic_debugging_advanced_suite.py
new file mode 100644
index 00000000..a0b6e874
--- /dev/null
+++ b/tests/validation/test_systematic_debugging_advanced_suite.py
@@ -0,0 +1,176 @@
+"""Sanity tests for the advanced systematic-debugging planted-bug suite.
+
+Same shape as test_systematic_debugging_suite.py — proves every planted
+bug is real (baseline test exits non-zero) and every known fix passes
+(test exits zero after applying it). Without these guards, a too-permissive
+assertion would silently invalidate every closed-loop run that uses this
+suite.
+
+The advanced suite exists for closed-loop runs where the textbook suite
+saturates against a capable agent model — these bugs require structured
+debugging (the spec edge case the obvious patch misses) and a "just try
+edits" methodology fails on them.
+"""
+
+from __future__ import annotations
+
+import shlex
+import subprocess
+from pathlib import Path
+
+import pytest
+
+from evolution.validation.task import TaskSuite
+
+
+SUITE_PATH = (
+    Path(__file__).resolve().parents[2]
+    / "evolution"
+    / "validation"
+    / "suites"
+    / "systematic_debugging_advanced.jsonl"
+)
+
+
+# Known-good fix per task_id. Each value replaces solution.py wholesale.
+_KNOWN_FIXES: dict[str, str] = {
+    "debug_generator_exhaustion": (
+        "def count_partitions(items, predicate):\n"
+        "    items = list(items)\n"
+        "    n_true = sum(1 for x in items if predicate(x))\n"
+        "    n_false = sum(1 for x in items if not predicate(x))\n"
+        "    return (n_true, n_false)\n"
+    ),
+    "debug_shared_mutable_return": (
+        "_history_cache = {}\n"
+        "\n"
+        "def get_history(user_id):\n"
+        "    if user_id not in _history_cache:\n"
+        "        _history_cache[user_id] = []\n"
+        "    return list(_history_cache[user_id])\n"
+    ),
+    "debug_float_precision_equality": (
+        "import math\n"
+        "\n"
+        "def is_target_amount(payments, target):\n"
+        "    return math.isclose(sum(payments), target, abs_tol=1e-6)\n"
+    ),
+    "debug_binary_search_boundary": (
+        "def leftmost_insert(sorted_arr, target):\n"
+        "    lo, hi = 0, len(sorted_arr)\n"
+        "    while lo < hi:\n"
+        "        mid = (lo + hi) // 2\n"
+        "        if sorted_arr[mid] < target:\n"
+        "            lo = mid + 1\n"
+        "        else:\n"
+        "            hi = mid\n"
+        "    return lo\n"
+    ),
+    "debug_class_vs_instance_attribute": (
+        "class TokenBucket:\n"
+        "    def __init__(self, capacity):\n"
+        "        self.capacity = capacity\n"
+        "        self.consumed = []\n"
+        "\n"
+        "    def take(self, n):\n"
+        "        if sum(self.consumed) + n > self.capacity:\n"
+        "            raise ValueError('over capacity')\n"
+        "        self.consumed.append(n)\n"
+        "\n"
+        "    def used(self):\n"
+        "        return sum(self.consumed)\n"
+    ),
+}
+
+
+@pytest.fixture(scope="module")
+def suite() -> TaskSuite:
+    return TaskSuite.from_jsonl(SUITE_PATH)
+
+
+class TestSuiteShape:
+    def test_loads_five_tasks(self, suite):
+        assert len(suite.tasks) == 5
+
+    def test_every_task_has_test_command(self, suite):
+        for task in suite.tasks:
+            assert task.test_command, f"{task.task_id}: missing test_command"
+
+    def test_every_task_has_solution_and_test_files(self, suite):
+        for task in suite.tasks:
+            files = set(task.fixture_setup.keys())
+            assert "solution.py" in files, f"{task.task_id}: no solution.py"
+            assert "test_solution.py" in files, f"{task.task_id}: no test_solution.py"
+
+    def test_every_task_id_has_a_known_fix(self, suite):
+        suite_ids = {t.task_id for t in suite.tasks}
+        fix_ids = set(_KNOWN_FIXES.keys())
+        assert suite_ids == fix_ids, (
+            f"task ids and known-fix ids out of sync: "
+            f"missing fixes={suite_ids - fix_ids}, "
+            f"extra fixes={fix_ids - suite_ids}"
+        )
+
+    def test_no_task_id_collision_with_basic_suite(self, suite):
+        # Sanity guard: basic + advanced suites are separate files but
+        # behavioral examples key by task_id. Collisions would silently
+        # overwrite verdicts in the cache. Confirm zero overlap.
+        basic_path = SUITE_PATH.parent / "systematic_debugging.jsonl"
+        basic = TaskSuite.from_jsonl(basic_path)
+        basic_ids = {t.task_id for t in basic.tasks}
+        advanced_ids = {t.task_id for t in suite.tasks}
+        assert not (basic_ids & advanced_ids), (
+            f"task_id collision between suites: {basic_ids & advanced_ids}"
+        )
+
+
+class TestPlantedBugsAreReal:
+    @pytest.mark.parametrize(
+        "task_id",
+        list(_KNOWN_FIXES.keys()),
+        ids=list(_KNOWN_FIXES.keys()),
+    )
+    def test_baseline_buggy_code_fails(self, task_id, tmp_path, suite):
+        task = next(t for t in suite.tasks if t.task_id == task_id)
+        _materialize(task.fixture_setup, tmp_path)
+        result = subprocess.run(
+            shlex.split(task.test_command),
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+            timeout=15,
+        )
+        assert result.returncode != 0, (
+            f"{task_id}: baseline buggy code unexpectedly passed the test.\n"
+            f"stdout: {result.stdout}\nstderr: {result.stderr}"
+        )
+
+
+class TestKnownFixesPass:
+    @pytest.mark.parametrize(
+        "task_id,fixed_solution",
+        list(_KNOWN_FIXES.items()),
+        ids=list(_KNOWN_FIXES.keys()),
+    )
+    def test_known_fix_passes(self, task_id, fixed_solution, tmp_path, suite):
+        task = next(t for t in suite.tasks if t.task_id == task_id)
+        _materialize(task.fixture_setup, tmp_path)
+        (tmp_path / "solution.py").write_text(fixed_solution)
+        result = subprocess.run(
+            shlex.split(task.test_command),
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+            timeout=15,
+        )
+        assert result.returncode == 0, (
+            f"{task_id}: known fix did not make the test pass.\n"
+            f"stdout: {result.stdout}\nstderr: {result.stderr}"
+        )
+
+
+def _materialize(setup: dict[str, str], target: Path) -> None:
+    for rel, content in setup.items():
+        dest = target / rel
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        dest.write_text(content)