From dc32179c89eb78320349c6281b5d2199b5cad41c Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 17 May 2026 14:56:15 -0600
Subject: [PATCH 1/5] feat(validation): test_command verdict mode for
 skill-shaped tasks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tool-side closed-loop scoring is a tool-call membership test (was an
expected tool invoked, was a forbidden tool avoided). Skill-side suites
need a different verdict: did the agent's edits make a planted test
pass.

Extends Task with an optional test_command field. When set, score_task
runs the command in fixture_dir after the agent finishes and passes iff
exit code is zero — tool-call membership is bypassed. Command failure
modes (nonzero exit, timeout, FileNotFoundError) all map to "not
passed" since the verdict is exit-code-driven. Runner errors still
abstain.

Uses shlex.split (no shell) so suite-controlled commands don't pick up
shell metacharacters. Default timeout 60s.

Tool-side suites (patch, write_file, search_files) are unchanged — they
never set test_command and follow the existing tool-call rule.
---
 evolution/validation/report.py    |  40 ++++++++++
 evolution/validation/task.py      |  26 +++++--
 evolution/validation/validator.py |   2 +
 tests/validation/test_report.py   | 119 ++++++++++++++++++++++++++++++
 tests/validation/test_task.py     |  25 +++++++
 5 files changed, 207 insertions(+), 5 deletions(-)

diff --git a/evolution/validation/report.py b/evolution/validation/report.py
index 6345832f..94de672d 100644
--- a/evolution/validation/report.py
+++ b/evolution/validation/report.py
@@ -7,6 +7,8 @@
 from __future__ import annotations
 
 import json
+import shlex
+import subprocess
 from dataclasses import asdict, dataclass, field
 from pathlib import Path
 from typing import Any, Optional
@@ -50,14 +52,31 @@ def score_task(
     expected_tools: tuple[str, ...],
     forbidden_tools: tuple[str, ...],
     run: AgentRunResult,
+    test_command: Optional[str] = None,
+    fixture_dir: Optional[Path] = None,
+    test_command_timeout_seconds: float = 60.0,
 ) -> tuple[bool, bool]:
     """Return (passed, abstained).
 
     Abstention takes precedence over pass/fail: a task that errored out
     in the runner is not evidence of the artifact's quality either way.
+
+    When ``test_command`` is set (skill-side suites), the verdict is
+    "did the planted test pass after the agent's edits": the command
+    runs in ``fixture_dir`` with the given timeout, and passes iff exit
+    code is zero. ``expected_tools`` / ``forbidden_tools`` are ignored
+    in this mode. Command failure modes (nonzero exit, timeout,
+    FileNotFoundError) all map to ``(False, False)`` — "the test did
+    not pass," which is the meaningful verdict regardless of cause.
     """
     if run.error is not None:
         return False, True
+    if test_command is not None:
+        if fixture_dir is None:
+            raise ValueError(
+                "score_task: fixture_dir is required when test_command is set"
+            )
+        return _run_test_command(test_command, fixture_dir, test_command_timeout_seconds), False
     invoked = set(run.tool_calls_seq)
     if forbidden_tools and (invoked & set(forbidden_tools)):
         return False, False
@@ -66,6 +85,27 @@ def score_task(
     return True, False
 
 
+def _run_test_command(command: str, cwd: Path, timeout_seconds: float) -> bool:
+    """Run ``command`` in ``cwd``. Return True iff exit code is zero.
+
+    Uses ``shlex.split`` (no shell) so suite-controlled commands don't
+    accidentally pick up shell metacharacters. All failure modes
+    (nonzero exit, timeout, FileNotFoundError, OSError) return False —
+    the test did not pass is the verdict the caller needs.
+    """
+    try:
+        result = subprocess.run(
+            shlex.split(command),
+            cwd=cwd,
+            capture_output=True,
+            text=True,
+            timeout=timeout_seconds,
+        )
+        return result.returncode == 0
+    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
+        return False
+
+
 def summarize_phase(task_results: list[TaskResult]) -> PhaseResult:
     n_passed = sum(1 for r in task_results if r.passed and not r.abstained)
     n_abstained = sum(1 for r in task_results if r.abstained)
diff --git a/evolution/validation/task.py b/evolution/validation/task.py
index 72bd6834..209c8b38 100644
--- a/evolution/validation/task.py
+++ b/evolution/validation/task.py
@@ -6,6 +6,7 @@
 import json
 from dataclasses import dataclass, field
 from pathlib import Path
+from typing import Optional
 
 
 @dataclass(frozen=True)
@@ -18,11 +19,19 @@ class Task:
     by that dir's absolute path so the task can reference the fixture
     files unambiguously.
 
-    ``expected_tools`` and ``forbidden_tools`` define the pass rule:
-    pass iff at least one expected tool was invoked AND no forbidden
-    tool was invoked. Both empty is a degenerate "no-tool task" — the
-    pass rule reduces to "agent didn't invoke any forbidden tools,"
-    which trivially passes if forbidden is empty too.
+    Two verdict mechanisms are supported, depending on what the suite
+    is validating:
+
+    - ``expected_tools`` / ``forbidden_tools`` (tool-side): pass iff at
+      least one expected tool was invoked AND no forbidden tool was
+      invoked. Both empty is a degenerate "no-tool task" — the pass
+      rule reduces to "agent didn't invoke any forbidden tools," which
+      trivially passes if forbidden is empty too.
+    - ``test_command`` (skill-side): pass iff the command exits zero
+      when run in ``fixture_dir`` after the agent finishes. Use when
+      the verdict is "did the agent's edits make the planted test
+      pass" rather than "did the agent invoke the right tools." When
+      set, takes precedence over the tool-call rule.
     """
 
     task_id: str
@@ -30,6 +39,7 @@ class Task:
     expected_tools: tuple[str, ...] = ()
     forbidden_tools: tuple[str, ...] = ()
     fixture_setup: dict[str, str] = field(default_factory=dict)
+    test_command: Optional[str] = None
 
     def render_message(self, fixture_dir: Path) -> str:
         """Substitute ``{fixture_dir}`` in the message with the resolved path.
@@ -83,10 +93,16 @@ def _task_from_dict(obj: dict, *, source: str) -> Task:
         raise ValueError(
             f"{source}: fixture_setup must be a dict of relative_path -> content"
         )
+    test_command = obj.get("test_command")
+    if test_command is not None and not isinstance(test_command, str):
+        raise ValueError(
+            f"{source}: test_command must be a string (got {type(test_command).__name__})"
+        )
     return Task(
         task_id=obj["task_id"],
         user_message=obj["user_message"],
         expected_tools=tuple(obj.get("expected_tools") or ()),
         forbidden_tools=tuple(obj.get("forbidden_tools") or ()),
         fixture_setup=dict(fixture_setup),
+        test_command=test_command,
     )
diff --git a/evolution/validation/validator.py b/evolution/validation/validator.py
index 56c078cb..da24959c 100644
--- a/evolution/validation/validator.py
+++ b/evolution/validation/validator.py
@@ -147,6 +147,8 @@ def _run_one_task(self, task: Task) -> TaskResult:
                 expected_tools=task.expected_tools,
                 forbidden_tools=task.forbidden_tools,
                 run=run,
+                test_command=task.test_command,
+                fixture_dir=fixture_dir,
             )
             return TaskResult(
                 task_id=task.task_id,
diff --git a/tests/validation/test_report.py b/tests/validation/test_report.py
index abd02d75..ec8eaa8b 100644
--- a/tests/validation/test_report.py
+++ b/tests/validation/test_report.py
@@ -68,6 +68,125 @@ def test_error_marks_abstention(self):
         assert abstained
 
 
+class TestScoreTaskTestCommandMode:
+    """When ``test_command`` is set on a task, the verdict is exit-code-driven,
+    not tool-call-driven. Used by skill-side suites (e.g., planted-bug:
+    "did the agent's edits make the test pass").
+    """
+
+    @staticmethod
+    def _ok_run() -> AgentRunResult:
+        return AgentRunResult(
+            tool_calls_seq=[], final_text_tail="", duration_seconds=1.0,
+        )
+
+    def test_passes_on_exit_zero(self, tmp_path):
+        (tmp_path / "ok.py").write_text("import sys; sys.exit(0)\n")
+        passed, abstained = score_task(
+            expected_tools=(), forbidden_tools=(), run=self._ok_run(),
+            test_command="python ok.py",
+            fixture_dir=tmp_path,
+        )
+        assert passed
+        assert not abstained
+
+    def test_fails_on_nonzero_exit(self, tmp_path):
+        (tmp_path / "bad.py").write_text("import sys; sys.exit(1)\n")
+        passed, abstained = score_task(
+            expected_tools=(), forbidden_tools=(), run=self._ok_run(),
+            test_command="python bad.py",
+            fixture_dir=tmp_path,
+        )
+        assert not passed
+        assert not abstained
+
+    def test_timeout_marks_failed_not_abstained(self, tmp_path):
+        # Treat hangs as failure rather than abstention — a debugging
+        # task that goes infinite is the agent's failure to debug.
+        (tmp_path / "slow.py").write_text("import time; time.sleep(60)\n")
+        passed, abstained = score_task(
+            expected_tools=(), forbidden_tools=(), run=self._ok_run(),
+            test_command="python slow.py",
+            fixture_dir=tmp_path,
+            test_command_timeout_seconds=0.3,
+        )
+        assert not passed
+        assert not abstained
+
+    def test_cwd_is_fixture_dir(self, tmp_path):
+        # The test script verifies its own cwd matches the fixture dir.
+        (tmp_path / "cwd_check.py").write_text(
+            "import os, sys\n"
+            "sys.exit(0 if os.path.realpath(os.getcwd()) == sys.argv[1] else 1)\n"
+        )
+        passed, _ = score_task(
+            expected_tools=(), forbidden_tools=(), run=self._ok_run(),
+            test_command=f"python cwd_check.py {tmp_path.resolve()}",
+            fixture_dir=tmp_path,
+        )
+        assert passed
+
+    def test_precedence_over_tool_call_rule(self, tmp_path):
+        # When test_command is set, the tool-call rule is ignored
+        # entirely — even if the agent invoked a forbidden tool.
+        (tmp_path / "ok.py").write_text("")  # empty file → python ok.py exits 0
+        run = AgentRunResult(
+            tool_calls_seq=["forbidden_tool"], final_text_tail="", duration_seconds=1.0,
+        )
+        passed, _ = score_task(
+            expected_tools=("expected_tool",),
+            forbidden_tools=("forbidden_tool",),
+            run=run,
+            test_command="python ok.py",
+            fixture_dir=tmp_path,
+        )
+        assert passed
+
+    def test_command_not_found_fails(self, tmp_path):
+        passed, abstained = score_task(
+            expected_tools=(), forbidden_tools=(), run=self._ok_run(),
+            test_command="this-binary-does-not-exist-12345",
+            fixture_dir=tmp_path,
+        )
+        assert not passed
+        assert not abstained
+
+    def test_missing_fixture_dir_raises(self):
+        with pytest.raises(ValueError, match="fixture_dir is required"):
+            score_task(
+                expected_tools=(), forbidden_tools=(), run=self._ok_run(),
+                test_command="python ok.py",
+            )
+
+    def test_runner_error_still_abstains_with_test_command(self, tmp_path):
+        # Runner error takes precedence over test_command — same as it
+        # does over the tool-call rule. A subprocess crash that prevented
+        # the agent from running isn't evidence either way.
+        (tmp_path / "ok.py").write_text("")
+        run = AgentRunResult(
+            tool_calls_seq=[], final_text_tail="", duration_seconds=1.0,
+            error="hermes crashed",
+        )
+        passed, abstained = score_task(
+            expected_tools=(), forbidden_tools=(), run=run,
+            test_command="python ok.py",
+            fixture_dir=tmp_path,
+        )
+        assert not passed
+        assert abstained
+
+    def test_tool_only_path_unchanged_when_test_command_absent(self):
+        # Regression guard for the existing patch/search_files/write_file suites.
+        run = AgentRunResult(
+            tool_calls_seq=["patch"], final_text_tail="", duration_seconds=1.0,
+        )
+        passed, abstained = score_task(
+            expected_tools=("patch",), forbidden_tools=("write_file",), run=run,
+        )
+        assert passed
+        assert not abstained
+
+
 class TestSummarizePhase:
     def test_counts_and_pass_rate(self):
         results = [
diff --git a/tests/validation/test_task.py b/tests/validation/test_task.py
index b531605a..4c0b0908 100644
--- a/tests/validation/test_task.py
+++ b/tests/validation/test_task.py
@@ -130,3 +130,28 @@ def test_non_dict_fixture_setup_raises(self, tmp_path):
         }])
         with pytest.raises(ValueError, match="fixture_setup must be a dict"):
             TaskSuite.from_jsonl(p)
+
+    def test_test_command_parsed_when_present(self, tmp_path):
+        p = tmp_path / "suite.jsonl"
+        self._write_jsonl(p, [{
+            "task_id": "t1",
+            "user_message": "debug it",
+            "test_command": "python test_solution.py",
+        }])
+        suite = TaskSuite.from_jsonl(p)
+        assert suite.tasks[0].test_command == "python test_solution.py"
+
+    def test_test_command_defaults_to_none(self, tmp_path):
+        p = tmp_path / "suite.jsonl"
+        self._write_jsonl(p, [{"task_id": "t1", "user_message": "do X"}])
+        suite = TaskSuite.from_jsonl(p)
+        assert suite.tasks[0].test_command is None
+
+    def test_test_command_non_string_raises(self, tmp_path):
+        p = tmp_path / "suite.jsonl"
+        self._write_jsonl(p, [{
+            "task_id": "t", "user_message": "m",
+            "test_command": ["python", "x.py"],
+        }])
+        with pytest.raises(ValueError, match="test_command must be a string"):
+            TaskSuite.from_jsonl(p)

From 4b83e65d9756fed2181f6d61a8e52a3d2c7bf8ba Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 17 May 2026 15:06:32 -0600
Subject: [PATCH 2/5] feat(validation): SkillFileInstaller + cache
 artifact-writer injection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the plumbing needed to run closed-loop validation on a SKILL.md,
not just a tool description. Three coordinated pieces:

SkillFileInstaller copies the baseline skill directory (SKILL.md + any
siblings) into a caller-owned writable workdir at construction. install()
then atomically overwrites the target with candidate text. The original
on-disk skill — possibly in a read-only plugin cache — is never touched.
The installer exposes `skills_src`, the directory the runner needs to
stage into its per-task sandbox so `hermes -z` discovers the candidate.

ClosedLoopFeedbackCache grows an `artifact_writer` callable kwarg
(default = the existing single-tool MCP manifest JSON, preserving the
tool path bit-for-bit) and an `artifact_suffix` for the temp filenames.
Skill-side callers inject `write_text_artifact` and `.md`. Constructor
kwargs renamed `tool_name`→`artifact_name` and
`baseline_description`→`baseline_artifact_text` for semantic neutrality;
tool-side call site updated.

HermesAgentRunner._prime_sandbox reads `TaskRunContext.skills_src`
(new optional field) and copies its contents into `<sandbox>/skills/`
per task — fresh copy each task so an in-task agent write can't corrupt
the installer's persistent candidate state. Confirmed empirically that
Hermes loads skills from `$HERMES_HOME/skills/`.

ArtifactInstaller Protocol grows `verify_backup(backup_path)` so each
installer validates its own backup format. Tool installer parses Python
(as before); skill installer checks non-empty + UTF-8 decodable.
---
 evolution/core/closed_loop_feedback.py      |  62 +++++++--
 evolution/tools/evolve_tool.py              |   4 +-
 evolution/validation/agent_runner.py        |   7 +
 evolution/validation/artifact_installer.py  |  96 ++++++++++++-
 evolution/validation/hermes_runner.py       |   8 +-
 evolution/validation/validator.py           |  16 ++-
 tests/core/test_closed_loop_feedback.py     | 141 +++++++++++++++-----
 tests/validation/test_artifact_installer.py | 138 +++++++++++++++++++
 tests/validation/test_hermes_runner.py      |  61 +++++++++
 tests/validation/test_safety.py             |   5 +
 tests/validation/test_validator.py          |   5 +
 11 files changed, 480 insertions(+), 63 deletions(-)

diff --git a/evolution/core/closed_loop_feedback.py b/evolution/core/closed_loop_feedback.py
index c9702523..1cd40b44 100644
--- a/evolution/core/closed_loop_feedback.py
+++ b/evolution/core/closed_loop_feedback.py
@@ -27,7 +27,7 @@
 import tempfile
 import threading
 from pathlib import Path
-from typing import Literal, Optional
+from typing import Callable, Literal, Optional
 
 from evolution.validation.report import TaskResult, ValidationReport
 from evolution.validation.task import TaskSuite
@@ -42,6 +42,15 @@
 
 GateMode = Literal["sampled", "always"]
 
+ArtifactWriter = Callable[[str, Path], None]
+"""Write candidate text to a path in the format the installer consumes.
+
+The cache calls this with ``(baseline_or_candidate_text, target_path)``
+before each validator run. The default writes a single-tool MCP manifest
+JSON (the tool-side shape); skill-side passes a writer that drops raw
+text directly into the path.
+"""
+
 logger = logging.getLogger(__name__)
 
 
@@ -67,12 +76,14 @@ def __init__(
         *,
         validator: ClosedLoopValidator,
         suite: TaskSuite,
-        tool_name: str,
-        baseline_description: str,
+        artifact_name: str,
+        baseline_artifact_text: str,
         saturation_threshold: float = 0.95,
         min_iters: int = 3,
         window_size: int = 8,
         gate_mode: GateMode = "sampled",
+        artifact_writer: Optional[ArtifactWriter] = None,
+        artifact_suffix: str = ".json",
     ) -> None:
         if not (0.0 <= saturation_threshold <= 1.0):
             raise ValueError(
@@ -88,19 +99,23 @@ def __init__(
             )
         self._validator = validator
         self._suite = suite
-        self._tool_name = tool_name
+        self._artifact_name = artifact_name
         self.saturation_threshold = saturation_threshold
         self.min_iters = min_iters
         self.window_size = window_size
         self.gate_mode = gate_mode
 
-        self._tmp_dir = Path(tempfile.mkdtemp(prefix="cl_feedback_"))
-        self._baseline_path = self._tmp_dir / "baseline.json"
-        self._evolved_path = self._tmp_dir / "evolved.json"
-        self._baseline_path.write_text(
-            _manifest_json(tool_name, baseline_description)
+        self._artifact_writer: ArtifactWriter = (
+            artifact_writer
+            if artifact_writer is not None
+            else _make_default_tool_writer(artifact_name)
         )
 
+        self._tmp_dir = Path(tempfile.mkdtemp(prefix="cl_feedback_"))
+        self._baseline_path = self._tmp_dir / f"baseline{artifact_suffix}"
+        self._evolved_path = self._tmp_dir / f"evolved{artifact_suffix}"
+        self._artifact_writer(baseline_artifact_text, self._baseline_path)
+
         self._cache: dict[str, ValidationReport] = {}
         self._judge_history: list[float] = []
         self._iters_since_last_run = self.min_iters  # allow first fire
@@ -147,11 +162,9 @@ def get_or_run(self, candidate_text: str) -> Optional[ValidationReport]:
             if not self.should_run():
                 return None
             try:
-                self._evolved_path.write_text(
-                    _manifest_json(self._tool_name, candidate_text)
-                )
+                self._artifact_writer(candidate_text, self._evolved_path)
                 inputs = ValidationInputs(
-                    tool_name=self._tool_name,
+                    tool_name=self._artifact_name,
                     suite=self._suite,
                     baseline_artifact=self._baseline_path,
                     evolved_artifact=self._evolved_path,
@@ -293,3 +306,26 @@ def _manifest_json(tool_name: str, description: str) -> str:
         },
         indent=2,
     )
+
+
+def _make_default_tool_writer(tool_name: str) -> ArtifactWriter:
+    """Default ``artifact_writer`` for tool-side closed-loop.
+
+    Writes a single-tool MCP manifest JSON — the shape
+    ``HermesToolDescriptionInstaller._extract_description`` consumes when
+    ``artifact_source.suffix == ".json"``.
+    """
+
+    def write(candidate_text: str, path: Path) -> None:
+        path.write_text(_manifest_json(tool_name, candidate_text))
+
+    return write
+
+
+def write_text_artifact(candidate_text: str, path: Path) -> None:
+    """``artifact_writer`` for skill-side closed-loop: drop raw text into the path.
+
+    The skill installer reads the whole file as the candidate SKILL.md
+    body, so no envelope is needed.
+    """
+    path.write_text(candidate_text)
diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
index 8e7e083a..31c39cc6 100644
--- a/evolution/tools/evolve_tool.py
+++ b/evolution/tools/evolve_tool.py
@@ -292,8 +292,8 @@ def _maybe_build_closed_loop_cache(
     return ClosedLoopFeedbackCache(
         validator=validator,
         suite=suite,
-        tool_name=tool_name,
-        baseline_description=baseline_description,
+        artifact_name=tool_name,
+        baseline_artifact_text=baseline_description,
         saturation_threshold=saturation_threshold,
         min_iters=min_iters,
         window_size=window_size,
diff --git a/evolution/validation/agent_runner.py b/evolution/validation/agent_runner.py
index 945c3053..dc4500d4 100644
--- a/evolution/validation/agent_runner.py
+++ b/evolution/validation/agent_runner.py
@@ -43,11 +43,18 @@ class TaskRunContext:
 
     ``fixture_dir`` is the per-task ``mkdtemp`` directory the validator
     populated from ``Task.fixture_setup`` before calling the runner.
+
+    ``skills_src`` is the directory whose contents should be staged into
+    the runner's per-task sandbox under ``skills/`` — used for skill-side
+    closed-loop where the installer maintains a persistent writable
+    copy of the candidate skill that needs to be re-staged into each
+    task's ephemeral sandbox.
     """
 
     user_message: str
     fixture_dir: Path
     extra_env: dict[str, str] = field(default_factory=dict)
+    skills_src: Optional[Path] = None
 
 
 class AgentRunner(Protocol):
diff --git a/evolution/validation/artifact_installer.py b/evolution/validation/artifact_installer.py
index d7cebfcf..44780ec2 100644
--- a/evolution/validation/artifact_installer.py
+++ b/evolution/validation/artifact_installer.py
@@ -1,9 +1,15 @@
 """ArtifactInstaller Protocol — how the validator gets an artifact onto disk.
 
-v1 only ships ``HermesToolDescriptionInstaller`` (splice into an
-existing tool's description via ``HermesToolSource``). v2 will add
-skill installers (drop a SKILL.md into the sandboxed HERMES_HOME's
-skills dir) without changing ``ClosedLoopValidator``.
+Two concrete installers ship: ``HermesToolDescriptionInstaller`` (splice
+an evolved description into a Hermes tool-module ``*_SCHEMA`` file in
+place) and ``SkillFileInstaller`` (write an evolved SKILL.md into a
+caller-provided writable workdir, decoupled from the user's actual
+HERMES_HOME / read-only plugin cache).
+
+Installers that need the runner to stage extra state into its per-task
+sandbox (only ``SkillFileInstaller`` today, which needs the candidate
+SKILL.md visible to ``hermes -z``) expose an optional ``skills_src``
+attribute the validator threads through ``TaskRunContext``.
 """
 
 from __future__ import annotations
@@ -11,9 +17,10 @@
 import ast
 import hashlib
 import os
+import shutil
 import tempfile
 from pathlib import Path
-from typing import Protocol
+from typing import Optional, Protocol
 
 from evolution.tools.hermes_source import HermesToolSource
 from evolution.tools.tool_source import ToolManifest
@@ -32,6 +39,15 @@ def install(self, artifact_source: Path) -> str:
         validator can verify the file wasn't mutated between tasks."""
         ...
 
+    def verify_backup(self, backup_path: Path) -> None:
+        """Validate the backup before trusting it for restore.
+
+        Default behavior for Python-source artifacts: raise SyntaxError if
+        the backup doesn't parse. For non-Python artifacts (skills), the
+        installer overrides with a format-appropriate check (e.g., UTF-8
+        decodability + non-empty)."""
+        ...
+
 
 class HermesToolDescriptionInstaller:
     """Splice an evolved tool description into a Hermes ``*_SCHEMA`` file.
@@ -89,6 +105,9 @@ def install(self, artifact_source: Path) -> str:
         )
         return sha256_of(self.target_path)
 
+    def verify_backup(self, backup_path: Path) -> None:
+        verify_python_parses(backup_path)
+
     def _extract_description(self, artifact_source: Path) -> str:
         """Return the description string for ``self.tool_name`` from
         ``artifact_source``. Dispatches on suffix so the installer can
@@ -120,6 +139,73 @@ def _extract_description(self, artifact_source: Path) -> str:
             return manifest.find_tool(self.tool_name).description
 
 
+class SkillFileInstaller:
+    """Write an evolved SKILL.md into a writable workdir for closed-loop validation.
+
+    The user's actual skill may live in a read-only location (the Claude
+    Code plugin cache, a system-installed skill bundle, or a user
+    HERMES_HOME we don't want to mutate). The installer copies the
+    entire baseline skill directory once at construction into a
+    caller-owned ``workdir``, then ``install()`` writes candidate text
+    over the resulting target SKILL.md. The original location is never
+    touched.
+
+    ``skills_src`` is the directory the runner copies into its per-task
+    sandbox so ``hermes -z`` discovers the candidate skill. Exposed
+    here so the validator can thread it through ``TaskRunContext``
+    without a Hermes-specific code path.
+    """
+
+    def __init__(
+        self,
+        *,
+        skill_source_path: Path,
+        skill_name: str,
+        workdir: Path,
+    ) -> None:
+        if not skill_source_path.is_file():
+            raise FileNotFoundError(
+                f"skill_source_path not found: {skill_source_path}"
+            )
+        if not workdir.is_dir():
+            raise NotADirectoryError(
+                f"workdir not found or not a directory: {workdir}"
+            )
+        self.skill_name = skill_name
+        self.workdir = workdir
+        self.skills_src: Path = workdir / "skills"
+        skill_dest_dir = self.skills_src / skill_name
+        skill_dest_dir.parent.mkdir(parents=True, exist_ok=True)
+        source_dir = skill_source_path.parent
+        # Copy the entire skill directory (SKILL.md + any sibling files
+        # the skill references) so the candidate has the same surrounding
+        # context as the baseline.
+        shutil.copytree(source_dir, skill_dest_dir, dirs_exist_ok=False)
+        self.target_path: Path = skill_dest_dir / skill_source_path.name
+
+    def install(self, artifact_source: Path) -> str:
+        """Atomically overwrite ``target_path`` with ``artifact_source`` contents.
+
+        The artifact source is the candidate SKILL.md as written by the
+        cache's ``artifact_writer``. We just copy bytes — no parse, no
+        splice — since for skills the whole file is the artifact.
+        """
+        atomic_write_bytes(self.target_path, artifact_source.read_bytes())
+        return sha256_of(self.target_path)
+
+    def verify_backup(self, backup_path: Path) -> None:
+        """Skills are UTF-8 text; reject empty or non-UTF-8 backups."""
+        data = backup_path.read_bytes()
+        if not data:
+            raise ValueError(f"skill backup at {backup_path} is empty")
+        try:
+            data.decode("utf-8")
+        except UnicodeDecodeError as exc:
+            raise ValueError(
+                f"skill backup at {backup_path} is not valid UTF-8: {exc}"
+            ) from exc
+
+
 def sha256_of(path: Path) -> str:
     return hashlib.sha256(path.read_bytes()).hexdigest()
 
diff --git a/evolution/validation/hermes_runner.py b/evolution/validation/hermes_runner.py
index 6ea75e79..9c8940b5 100644
--- a/evolution/validation/hermes_runner.py
+++ b/evolution/validation/hermes_runner.py
@@ -61,7 +61,7 @@ def run(self, ctx: TaskRunContext) -> AgentRunResult:
         message = ctx.user_message
         sandbox = Path(tempfile.mkdtemp(prefix="cl_hermes_home_"))
         try:
-            self._prime_sandbox(sandbox)
+            self._prime_sandbox(sandbox, ctx)
             env = {
                 **os.environ,
                 "HERMES_HOME": str(sandbox),
@@ -107,10 +107,14 @@ def run(self, ctx: TaskRunContext) -> AgentRunResult:
         finally:
             shutil.rmtree(sandbox, ignore_errors=True)
 
-    def _prime_sandbox(self, sandbox: Path) -> None:
+    def _prime_sandbox(self, sandbox: Path, ctx: TaskRunContext) -> None:
         (sandbox / "sessions").mkdir(parents=True, exist_ok=True)
         if self.user_config_path.exists():
             shutil.copy2(self.user_config_path, sandbox / "config.yaml")
+        if ctx.skills_src is not None and ctx.skills_src.is_dir():
+            # Copy (not symlink) so an in-task write by the agent corrupts
+            # only this sandbox, not the installer's persistent candidate.
+            shutil.copytree(ctx.skills_src, sandbox / "skills")
 
 
 def _find_latest_session(sessions_dir: Path) -> Optional[Path]:
diff --git a/evolution/validation/validator.py b/evolution/validation/validator.py
index da24959c..936692b5 100644
--- a/evolution/validation/validator.py
+++ b/evolution/validation/validator.py
@@ -29,7 +29,6 @@
     ArtifactInstaller,
     atomic_write_bytes,
     sha256_of,
-    verify_python_parses,
 )
 from evolution.validation.report import (
     PhaseResult,
@@ -88,11 +87,11 @@ def __init__(self, installer: ArtifactInstaller, runner: AgentRunner) -> None:
     def validate(self, inputs: ValidationInputs) -> ValidationReport:
         target = self.installer.target_path
         backup_path = target.with_suffix(target.suffix + _BACKUP_SUFFIX)
-        _refuse_if_stale_backup_exists(backup_path)
+        _refuse_if_stale_backup_exists(backup_path, self.installer)
 
         with _exclusive_lock(target.parent):
             atomic_write_bytes(backup_path, target.read_bytes())
-            verify_python_parses(backup_path)
+            self.installer.verify_backup(backup_path)
             try:
                 baseline_results = self._run_phase(
                     inputs.suite,
@@ -141,6 +140,7 @@ def _run_one_task(self, task: Task) -> TaskResult:
             ctx = TaskRunContext(
                 user_message=task.render_message(fixture_dir),
                 fixture_dir=fixture_dir,
+                skills_src=getattr(self.installer, "skills_src", None),
             )
             run = self.runner.run(ctx)
             passed, abstained = score_task(
@@ -161,14 +161,16 @@ def _run_one_task(self, task: Task) -> TaskResult:
             )
 
 
-def _refuse_if_stale_backup_exists(backup_path: Path) -> None:
+def _refuse_if_stale_backup_exists(
+    backup_path: Path, installer: ArtifactInstaller
+) -> None:
     if not backup_path.exists():
         return
     try:
-        verify_python_parses(backup_path)
-    except SyntaxError as exc:
+        installer.verify_backup(backup_path)
+    except Exception as exc:
         raise StaleBackupError(
-            f"Stale backup at {backup_path} is corrupt (cannot parse as Python: "
+            f"Stale backup at {backup_path} is corrupt ({type(exc).__name__}: "
             f"{exc}). Inspect it manually; do not blindly restore."
         ) from exc
     raise StaleBackupError(
diff --git a/tests/core/test_closed_loop_feedback.py b/tests/core/test_closed_loop_feedback.py
index 0de6631c..291a0e14 100644
--- a/tests/core/test_closed_loop_feedback.py
+++ b/tests/core/test_closed_loop_feedback.py
@@ -90,8 +90,8 @@ def test_gate_closed_when_no_history(self, tmp_path):
         cache = ClosedLoopFeedbackCache(
             validator=MagicMock(),
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline desc",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline desc",
         )
         cache._iters_since_last_run = 0  # reset to test purely-history-based gate
         assert cache.should_run() is False
@@ -100,8 +100,8 @@ def test_gate_open_on_saturated_window(self, tmp_path):
         cache = ClosedLoopFeedbackCache(
             validator=MagicMock(),
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline",
             saturation_threshold=0.95,
             window_size=4,
             min_iters=999,  # disable periodic fallback
@@ -115,8 +115,8 @@ def test_gate_closed_on_unsaturated_window(self, tmp_path):
         cache = ClosedLoopFeedbackCache(
             validator=MagicMock(),
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline",
             saturation_threshold=0.95,
             window_size=4,
             min_iters=999,
@@ -130,8 +130,8 @@ def test_periodic_floor_triggers_even_when_unsaturated(self, tmp_path):
         cache = ClosedLoopFeedbackCache(
             validator=MagicMock(),
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline",
             saturation_threshold=0.95,
             min_iters=3,
             window_size=4,
@@ -150,8 +150,8 @@ def test_cache_hit_short_circuits_validator(self, tmp_path):
         cache = ClosedLoopFeedbackCache(
             validator=validator,
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline",
             min_iters=1,
         )
         cache.record_judge_score(0.99)  # open the gate
@@ -167,8 +167,8 @@ def test_gate_closed_returns_none(self, tmp_path):
         cache = ClosedLoopFeedbackCache(
             validator=validator,
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline",
             saturation_threshold=0.95,
             min_iters=999,
         )
@@ -183,8 +183,8 @@ def test_concurrent_run_error_does_not_propagate(self, tmp_path, caplog):
         cache = ClosedLoopFeedbackCache(
             validator=validator,
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline",
             min_iters=1,
         )
         cache.record_judge_score(0.99)
@@ -197,8 +197,8 @@ def test_cache_key_depends_on_candidate_and_suite_sha(self, tmp_path):
         cache = ClosedLoopFeedbackCache(
             validator=MagicMock(),
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline",
         )
         k1 = cache._key("desc A")
         k2 = cache._key("desc B")
@@ -319,8 +319,8 @@ def test_two_threads_invoke_validator_exactly_once(self, tmp_path):
         cache = ClosedLoopFeedbackCache(
             validator=validator,
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline",
             min_iters=1,
         )
         cache.record_judge_score(0.99)
@@ -349,8 +349,8 @@ def test_threshold_out_of_range_raises(self, tmp_path):
             ClosedLoopFeedbackCache(
                 validator=MagicMock(),
                 suite=_build_suite(tmp_path),
-                tool_name="write_file",
-                baseline_description="baseline",
+                artifact_name="write_file",
+                baseline_artifact_text="baseline",
                 saturation_threshold=1.5,
             )
 
@@ -359,8 +359,8 @@ def test_min_iters_zero_raises(self, tmp_path):
             ClosedLoopFeedbackCache(
                 validator=MagicMock(),
                 suite=_build_suite(tmp_path),
-                tool_name="write_file",
-                baseline_description="baseline",
+                artifact_name="write_file",
+                baseline_artifact_text="baseline",
                 min_iters=0,
             )
 
@@ -369,8 +369,8 @@ def test_invalid_gate_mode_raises(self, tmp_path):
             ClosedLoopFeedbackCache(
                 validator=MagicMock(),
                 suite=_build_suite(tmp_path),
-                tool_name="write_file",
-                baseline_description="baseline",
+                artifact_name="write_file",
+                baseline_artifact_text="baseline",
                 gate_mode="bogus",
             )
 
@@ -380,8 +380,8 @@ def test_always_mode_opens_gate_without_history(self, tmp_path):
         cache = ClosedLoopFeedbackCache(
             validator=MagicMock(),
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline",
             gate_mode="always",
         )
         # No record_judge_score calls — sampled mode would return False here.
@@ -393,8 +393,8 @@ def test_always_mode_fires_validator_on_cache_miss(self, tmp_path):
         cache = ClosedLoopFeedbackCache(
             validator=validator,
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline",
             gate_mode="always",
         )
         # No judge scores recorded — sampled mode would return None.
@@ -418,8 +418,8 @@ def test_returns_per_task_result_on_cache_hit(self, tmp_path):
         cache = ClosedLoopFeedbackCache(
             validator=validator,
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline",
             gate_mode="always",
         )
         t1 = cache.get_task_verdict("cand", "t1")
@@ -433,8 +433,8 @@ def test_returns_none_on_cache_miss_with_closed_gate(self, tmp_path):
         cache = ClosedLoopFeedbackCache(
             validator=MagicMock(),
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline",
             saturation_threshold=0.95,
             min_iters=999,
             gate_mode="sampled",  # default; explicit for clarity
@@ -455,8 +455,81 @@ def test_returns_none_when_task_id_not_in_report(self, tmp_path):
         cache = ClosedLoopFeedbackCache(
             validator=validator,
             suite=_build_suite(tmp_path),
-            tool_name="write_file",
-            baseline_description="baseline",
+            artifact_name="write_file",
+            baseline_artifact_text="baseline",
             gate_mode="always",
         )
         assert cache.get_task_verdict("cand", "nonexistent_task") is None
+
+
+class TestArtifactWriterInjection:
+    """The cache's default ``artifact_writer`` produces the single-tool MCP
+    manifest JSON the tool installer consumes. Skill-side callers inject
+    a writer that drops raw text directly. Tool-path behavior must be
+    bit-for-bit preserved when no writer is passed.
+    """
+
+    def test_default_writer_produces_mcp_manifest_json(self, tmp_path):
+        cache = ClosedLoopFeedbackCache(
+            validator=MagicMock(),
+            suite=_build_suite(tmp_path),
+            artifact_name="write_file",
+            baseline_artifact_text="baseline desc",
+        )
+        # Default suffix is .json; default writer is the MCP manifest.
+        baseline_text = cache._baseline_path.read_text()
+        assert cache._baseline_path.suffix == ".json"
+        parsed = json.loads(baseline_text)
+        assert parsed["tools"][0]["name"] == "write_file"
+        assert parsed["tools"][0]["description"] == "baseline desc"
+
+    def test_custom_writer_called_with_candidate_and_path(self, tmp_path):
+        calls: list[tuple[str, Path]] = []
+
+        def fake_writer(text: str, path: Path) -> None:
+            calls.append((text, path))
+            path.write_text(f"WRAPPED::{text}")
+
+        cache = ClosedLoopFeedbackCache(
+            validator=MagicMock(),
+            suite=_build_suite(tmp_path),
+            artifact_name="systematic_debugging",
+            baseline_artifact_text="baseline body",
+            artifact_writer=fake_writer,
+            artifact_suffix=".md",
+        )
+        # Baseline was written once during construction.
+        assert len(calls) == 1
+        assert calls[0][0] == "baseline body"
+        assert calls[0][1].suffix == ".md"
+        assert cache._baseline_path.read_text() == "WRAPPED::baseline body"
+
+    def test_custom_writer_invoked_on_each_validate(self, tmp_path):
+        calls: list[str] = []
+
+        def fake_writer(text: str, path: Path) -> None:
+            calls.append(text)
+            path.write_text(text)
+
+        validator = MagicMock()
+        validator.validate.return_value = _build_report()
+        cache = ClosedLoopFeedbackCache(
+            validator=validator,
+            suite=_build_suite(tmp_path),
+            artifact_name="systematic_debugging",
+            baseline_artifact_text="baseline",
+            artifact_writer=fake_writer,
+            artifact_suffix=".md",
+            gate_mode="always",
+        )
+        cache.get_or_run("candidate-1")
+        cache.get_or_run("candidate-2")
+        # 1 baseline write + 2 evolved writes.
+        assert calls == ["baseline", "candidate-1", "candidate-2"]
+
+    def test_write_text_artifact_helper_writes_plain_text(self, tmp_path):
+        from evolution.core.closed_loop_feedback import write_text_artifact
+
+        path = tmp_path / "out.md"
+        write_text_artifact("hello world\n", path)
+        assert path.read_text() == "hello world\n"
diff --git a/tests/validation/test_artifact_installer.py b/tests/validation/test_artifact_installer.py
index 7309360a..95b1cfbd 100644
--- a/tests/validation/test_artifact_installer.py
+++ b/tests/validation/test_artifact_installer.py
@@ -123,3 +123,141 @@ def test_json_target_tool_missing_raises(self, fake_hermes_repo, tmp_path):
         )
         with pytest.raises(KeyError):
             installer._extract_description(evolved_json)
+
+
+# ---------------------------------------------------------------------------
+# SkillFileInstaller
+# ---------------------------------------------------------------------------
+
+
+from evolution.validation.artifact_installer import SkillFileInstaller
+
+
+@pytest.fixture
+def baseline_skill(tmp_path):
+    """A read-only source skill directory the installer must not mutate."""
+    src_root = tmp_path / "source_skills" / "systematic_debugging"
+    src_root.mkdir(parents=True)
+    skill_md = src_root / "SKILL.md"
+    skill_md.write_text(
+        "---\nname: systematic_debugging\n---\n\n# Baseline body\n"
+    )
+    # Sibling file the skill might reference — must also be copied.
+    (src_root / "examples.md").write_text("baseline examples")
+    return skill_md
+
+
+class TestSkillFileInstallerConstruction:
+    def test_copies_skill_directory_into_workdir(self, baseline_skill, tmp_path):
+        workdir = tmp_path / "workdir"
+        workdir.mkdir()
+        installer = SkillFileInstaller(
+            skill_source_path=baseline_skill,
+            skill_name="systematic_debugging",
+            workdir=workdir,
+        )
+        # The target_path lives under the workdir.
+        assert installer.target_path == (
+            workdir / "skills" / "systematic_debugging" / "SKILL.md"
+        )
+        assert installer.target_path.is_file()
+        # Sibling files are copied too — agents may reference them.
+        sibling = workdir / "skills" / "systematic_debugging" / "examples.md"
+        assert sibling.is_file()
+        assert sibling.read_text() == "baseline examples"
+
+    def test_skills_src_attribute_points_at_skills_root(self, baseline_skill, tmp_path):
+        # The validator threads this into TaskRunContext so the runner
+        # can stage it into its per-task sandbox.
+        workdir = tmp_path / "workdir"
+        workdir.mkdir()
+        installer = SkillFileInstaller(
+            skill_source_path=baseline_skill,
+            skill_name="systematic_debugging",
+            workdir=workdir,
+        )
+        assert installer.skills_src == workdir / "skills"
+
+    def test_missing_source_raises(self, tmp_path):
+        workdir = tmp_path / "workdir"
+        workdir.mkdir()
+        with pytest.raises(FileNotFoundError):
+            SkillFileInstaller(
+                skill_source_path=tmp_path / "does_not_exist.md",
+                skill_name="x",
+                workdir=workdir,
+            )
+
+    def test_missing_workdir_raises(self, baseline_skill, tmp_path):
+        with pytest.raises(NotADirectoryError):
+            SkillFileInstaller(
+                skill_source_path=baseline_skill,
+                skill_name="x",
+                workdir=tmp_path / "does_not_exist",
+            )
+
+
+class TestSkillFileInstallerInstall:
+    def test_install_overwrites_target_with_candidate_text(self, baseline_skill, tmp_path):
+        workdir = tmp_path / "workdir"
+        workdir.mkdir()
+        installer = SkillFileInstaller(
+            skill_source_path=baseline_skill,
+            skill_name="systematic_debugging",
+            workdir=workdir,
+        )
+        candidate = tmp_path / "candidate.md"
+        candidate.write_text("# Evolved body\n")
+        sha = installer.install(candidate)
+        assert installer.target_path.read_text() == "# Evolved body\n"
+        # Returned sha matches the post-install bytes.
+        import hashlib
+        assert sha == hashlib.sha256(b"# Evolved body\n").hexdigest()
+
+    def test_install_does_not_mutate_source(self, baseline_skill, tmp_path):
+        """The whole point of the workdir copy: the user's actual skill
+        on disk is never touched, even when we install a candidate."""
+        workdir = tmp_path / "workdir"
+        workdir.mkdir()
+        original = baseline_skill.read_text()
+        installer = SkillFileInstaller(
+            skill_source_path=baseline_skill,
+            skill_name="systematic_debugging",
+            workdir=workdir,
+        )
+        candidate = tmp_path / "candidate.md"
+        candidate.write_text("# Different body\n")
+        installer.install(candidate)
+        assert baseline_skill.read_text() == original
+
+
+class TestSkillFileInstallerVerifyBackup:
+    def _installer(self, baseline_skill, tmp_path):
+        workdir = tmp_path / "workdir"
+        workdir.mkdir()
+        return SkillFileInstaller(
+            skill_source_path=baseline_skill,
+            skill_name="systematic_debugging",
+            workdir=workdir,
+        )
+
+    def test_accepts_valid_utf8_backup(self, baseline_skill, tmp_path):
+        installer = self._installer(baseline_skill, tmp_path)
+        backup = tmp_path / "backup.md"
+        backup.write_text("# Some skill text\n")
+        # Does not raise.
+        installer.verify_backup(backup)
+
+    def test_rejects_empty_backup(self, baseline_skill, tmp_path):
+        installer = self._installer(baseline_skill, tmp_path)
+        backup = tmp_path / "backup.md"
+        backup.write_bytes(b"")
+        with pytest.raises(ValueError, match="empty"):
+            installer.verify_backup(backup)
+
+    def test_rejects_invalid_utf8(self, baseline_skill, tmp_path):
+        installer = self._installer(baseline_skill, tmp_path)
+        backup = tmp_path / "backup.md"
+        backup.write_bytes(b"\xff\xfe\x00\x00invalid")
+        with pytest.raises(ValueError, match="not valid UTF-8"):
+            installer.verify_backup(backup)
diff --git a/tests/validation/test_hermes_runner.py b/tests/validation/test_hermes_runner.py
index 327c5b11..8bc066f5 100644
--- a/tests/validation/test_hermes_runner.py
+++ b/tests/validation/test_hermes_runner.py
@@ -254,3 +254,64 @@ def _fake_run(*args, **kwargs):
 
         assert sandbox_seen["config_present"] is True
         assert "hunter2" in sandbox_seen["config_text"]
+
+    def test_skills_src_copied_into_sandbox_when_present(self, fixture_dir, tmp_path):
+        """When TaskRunContext.skills_src points at a directory, the runner
+        copies its contents into <sandbox>/skills/ so ``hermes -z``
+        discovers any candidate skills installed there.
+        """
+        skills_src = tmp_path / "candidate_skills"
+        (skills_src / "systematic_debugging").mkdir(parents=True)
+        (skills_src / "systematic_debugging" / "SKILL.md").write_text(
+            "---\nname: systematic_debugging\n---\n\nevolved body\n"
+        )
+        runner = HermesAgentRunner(user_config_path=tmp_path / "nonexistent")
+
+        sandbox_skills_state: dict = {}
+
+        def _fake_run(*args, **kwargs):
+            sandbox = Path(kwargs["env"]["HERMES_HOME"])
+            skill_path = sandbox / "skills" / "systematic_debugging" / "SKILL.md"
+            sandbox_skills_state["present"] = skill_path.is_file()
+            if sandbox_skills_state["present"]:
+                sandbox_skills_state["text"] = skill_path.read_text()
+            (sandbox / "sessions").mkdir(exist_ok=True)
+            _write_session(
+                sandbox / "sessions" / "session_test.json",
+                [{"role": "assistant", "content": "ok"}],
+            )
+            return type("CP", (), {"returncode": 0, "stdout": "", "stderr": ""})()
+
+        with patch("evolution.validation.hermes_runner.subprocess.run", side_effect=_fake_run):
+            runner.run(TaskRunContext(
+                user_message="debug it",
+                fixture_dir=fixture_dir,
+                skills_src=skills_src,
+            ))
+
+        assert sandbox_skills_state["present"] is True
+        assert "evolved body" in sandbox_skills_state["text"]
+
+    def test_skills_src_none_means_no_skills_dir_created(self, fixture_dir, tmp_path):
+        """Tool-side runs (no skills_src) must not create an empty skills/
+        directory in the sandbox — keeps the legacy code path bit-for-bit."""
+        runner = HermesAgentRunner(user_config_path=tmp_path / "nonexistent")
+        sandbox_seen: dict = {}
+
+        def _fake_run(*args, **kwargs):
+            sandbox = Path(kwargs["env"]["HERMES_HOME"])
+            sandbox_seen["has_skills_dir"] = (sandbox / "skills").exists()
+            (sandbox / "sessions").mkdir(exist_ok=True)
+            _write_session(
+                sandbox / "sessions" / "session_test.json",
+                [{"role": "assistant", "content": "ok"}],
+            )
+            return type("CP", (), {"returncode": 0, "stdout": "", "stderr": ""})()
+
+        with patch("evolution.validation.hermes_runner.subprocess.run", side_effect=_fake_run):
+            runner.run(TaskRunContext(
+                user_message="run",
+                fixture_dir=fixture_dir,
+            ))
+
+        assert sandbox_seen["has_skills_dir"] is False
diff --git a/tests/validation/test_safety.py b/tests/validation/test_safety.py
index a74aab8b..721e9443 100644
--- a/tests/validation/test_safety.py
+++ b/tests/validation/test_safety.py
@@ -46,6 +46,11 @@ def install(self, artifact_source: Path) -> str:
         import hashlib
         return hashlib.sha256(self.target_path.read_bytes()).hexdigest()
 
+    def verify_backup(self, backup_path: Path) -> None:
+        # Stub: any non-empty backup is fine for tests.
+        if not backup_path.read_bytes():
+            raise ValueError(f"empty backup at {backup_path}")
+
 
 class _StubRunner:
     def __init__(self, results: list[AgentRunResult]):
diff --git a/tests/validation/test_validator.py b/tests/validation/test_validator.py
index 1de996d3..1e8417eb 100644
--- a/tests/validation/test_validator.py
+++ b/tests/validation/test_validator.py
@@ -21,6 +21,11 @@ def install(self, artifact_source: Path) -> str:
         import hashlib
         return hashlib.sha256(self.target_path.read_bytes()).hexdigest()
 
+    def verify_backup(self, backup_path: Path) -> None:
+        # Stub: any non-empty backup is fine for tests.
+        if not backup_path.read_bytes():
+            raise ValueError(f"empty backup at {backup_path}")
+
 
 class _ScriptedRunner:
     """Runner that returns different tool_calls_seq depending on which

From c9dd6f91f5ec2cfa37d7eb711b9858646039bd7d Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 17 May 2026 15:08:38 -0600
Subject: [PATCH 3/5] feat(validation): systematic-debugging planted-bug suite
 + fake target skill
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Five planted-bug tasks at evolution/validation/suites/systematic_debugging.jsonl
covering bug shapes a debugging methodology should be able to handle:
off-by-one in range, inverted condition, missing recursive base case,
wrong arithmetic operator, mutable-default-argument aliasing.

Each task ships:
  - solution.py with one planted bug and a docstring stating the spec
  - test_solution.py asserting the spec across the bug case plus
    enough edge cases that a too-narrow "fix" still fails
  - test_command="python test_solution.py" — the verdict from
    test_command-mode scoring

Sanity tests in tests/validation/test_systematic_debugging_suite.py:
  - Suite shape: 5 tasks, every task has test_command, solution.py,
    and test_solution.py
  - Planted bugs are real: materializing the buggy fixture and
    running its test exits non-zero
  - Known fixes pass: applying the documented fix and re-running
    the test exits zero
The last two are critical — they would catch a typo'd assertion or
a pathologically-broken test that no candidate could satisfy.

Fake target skill at tests/fixtures/skills/systematic_debugging/SKILL.md
gives the closed-loop machinery a stable substrate for unit tests +
manual smoke. Deliberately weak baseline ("try random changes") so
GEPA has measurable headroom to evolve toward a real methodology.
---
 .../suites/systematic_debugging.jsonl         |  20 +++
 .../skills/systematic_debugging/SKILL.md      |  14 ++
 .../test_systematic_debugging_suite.py        | 164 ++++++++++++++++++
 3 files changed, 198 insertions(+)
 create mode 100644 evolution/validation/suites/systematic_debugging.jsonl
 create mode 100644 tests/fixtures/skills/systematic_debugging/SKILL.md
 create mode 100644 tests/validation/test_systematic_debugging_suite.py

diff --git a/evolution/validation/suites/systematic_debugging.jsonl b/evolution/validation/suites/systematic_debugging.jsonl
new file mode 100644
index 00000000..3e1c390c
--- /dev/null
+++ b/evolution/validation/suites/systematic_debugging.jsonl
@@ -0,0 +1,20 @@
+# Planted-bug suite for the `systematic-debugging` skill.
+#
+# Each task drops a small Python program with one planted bug + a test
+# script that exercises the bug. The agent (driven by the skill under
+# evaluation) is asked to debug and fix; the verdict is whether
+# `python test_solution.py` exits zero after the agent's edits.
+#
+# Conventions for adding tasks:
+#   - solution.py contains exactly ONE bug, with a docstring stating
+#     the intended behavior so the spec is unambiguous.
+#   - test_solution.py asserts the spec across the bug case + at least
+#     one edge case (empty input, zero, negative, etc.) so a candidate
+#     "fix" that only handles the obvious case fails.
+#   - test_solution.py exits non-zero on baseline and exits zero only
+#     when the intended behavior holds.
+{"task_id": "debug_off_by_one_range", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def get_range_inclusive(n):\n    \"\"\"Return the integers from 1 to n inclusive (empty list when n <= 0).\"\"\"\n    return list(range(n))\n", "test_solution.py": "from solution import get_range_inclusive\n\nassert get_range_inclusive(5) == [1, 2, 3, 4, 5], get_range_inclusive(5)\nassert get_range_inclusive(1) == [1], get_range_inclusive(1)\nassert get_range_inclusive(0) == [], get_range_inclusive(0)\nprint('PASS')\n"}, "test_command": "python test_solution.py"}
+{"task_id": "debug_inverted_condition", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def first_positive(values):\n    \"\"\"Return the first strictly-positive number in values, or None if none.\"\"\"\n    for x in values:\n        if x < 0:\n            return x\n    return None\n", "test_solution.py": "from solution import first_positive\n\nassert first_positive([-1, -2, 3, 4]) == 3, first_positive([-1, -2, 3, 4])\nassert first_positive([1, 2, 3]) == 1, first_positive([1, 2, 3])\nassert first_positive([-1, -2, -3]) is None, first_positive([-1, -2, -3])\nassert first_positive([]) is None, first_positive([])\nassert first_positive([0, -1, 2]) == 2, first_positive([0, -1, 2])\nprint('PASS')\n"}, "test_command": "python test_solution.py"}
+{"task_id": "debug_missing_base_case", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def factorial(n):\n    \"\"\"Return n! for non-negative integers (factorial(0) == 1).\"\"\"\n    return n * factorial(n - 1)\n", "test_solution.py": "from solution import factorial\n\nassert factorial(5) == 120, factorial(5)\nassert factorial(1) == 1, factorial(1)\nassert factorial(0) == 1, factorial(0)\nprint('PASS')\n"}, "test_command": "python test_solution.py"}
+{"task_id": "debug_wrong_operator", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def square(x):\n    \"\"\"Return x squared (x * x).\"\"\"\n    return x * 2\n", "test_solution.py": "from solution import square\n\nassert square(3) == 9, square(3)\nassert square(0) == 0, square(0)\nassert square(-4) == 16, square(-4)\nassert square(2) == 4, square(2)\nprint('PASS')\n"}, "test_command": "python test_solution.py"}
+{"task_id": "debug_mutable_default_arg", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def append_to_new(item, items=[]):\n    \"\"\"Append item to a NEW empty list and return it. Each call must be independent.\"\"\"\n    items.append(item)\n    return items\n", "test_solution.py": "from solution import append_to_new\n\nassert append_to_new(1) == [1], append_to_new(1)\nassert append_to_new(2) == [2], append_to_new(2)\nassert append_to_new(3) == [3], append_to_new(3)\nprint('PASS')\n"}, "test_command": "python test_solution.py"}
diff --git a/tests/fixtures/skills/systematic_debugging/SKILL.md b/tests/fixtures/skills/systematic_debugging/SKILL.md
new file mode 100644
index 00000000..c0fc6567
--- /dev/null
+++ b/tests/fixtures/skills/systematic_debugging/SKILL.md
@@ -0,0 +1,14 @@
+---
+name: systematic-debugging
+description: Approach for debugging Python bugs in small programs.
+---
+
+# Systematic debugging
+
+When you encounter a bug in Python code, try to fix it.
+
+Common approaches: add print statements, comment things out, try
+random changes until the test passes. If one approach doesn't work,
+try another. Keep going until the test exits with code zero.
+
+Don't spend too much time reading the code — just try edits.
diff --git a/tests/validation/test_systematic_debugging_suite.py b/tests/validation/test_systematic_debugging_suite.py
new file mode 100644
index 00000000..9b8f2be9
--- /dev/null
+++ b/tests/validation/test_systematic_debugging_suite.py
@@ -0,0 +1,164 @@
+"""Sanity tests for the systematic-debugging planted-bug suite.
+
+The suite at ``evolution/validation/suites/systematic_debugging.jsonl``
+is the substrate for skill-side closed-loop validation. These tests
+prove the suite itself is well-formed and that every task's planted
+bug + test pair behaves as designed (test fails on baseline, passes
+when the documented fix is applied) — without these guards a typo in
+the test file or a too-permissive assertion would silently invalidate
+every closed-loop run.
+"""
+
+from __future__ import annotations
+
+import shlex
+import subprocess
+from pathlib import Path
+
+import pytest
+
+from evolution.validation.task import TaskSuite
+
+
+SUITE_PATH = (
+    Path(__file__).resolve().parents[2]
+    / "evolution"
+    / "validation"
+    / "suites"
+    / "systematic_debugging.jsonl"
+)
+
+
+# Known-good fix per task_id. Each value replaces solution.py wholesale
+# after the buggy fixture is materialized. The test then re-runs
+# test_solution.py to confirm the fix actually passes the asserts.
+_KNOWN_FIXES: dict[str, str] = {
+    "debug_off_by_one_range": (
+        "def get_range_inclusive(n):\n"
+        "    return list(range(1, n + 1))\n"
+    ),
+    "debug_inverted_condition": (
+        "def first_positive(values):\n"
+        "    for x in values:\n"
+        "        if x > 0:\n"
+        "            return x\n"
+        "    return None\n"
+    ),
+    "debug_missing_base_case": (
+        "def factorial(n):\n"
+        "    if n <= 1:\n"
+        "        return 1\n"
+        "    return n * factorial(n - 1)\n"
+    ),
+    "debug_wrong_operator": (
+        "def square(x):\n"
+        "    return x * x\n"
+    ),
+    "debug_mutable_default_arg": (
+        "def append_to_new(item, items=None):\n"
+        "    if items is None:\n"
+        "        items = []\n"
+        "    items.append(item)\n"
+        "    return items\n"
+    ),
+}
+
+
+@pytest.fixture(scope="module")
+def suite() -> TaskSuite:
+    return TaskSuite.from_jsonl(SUITE_PATH)
+
+
+class TestSuiteShape:
+    def test_loads_five_tasks(self, suite):
+        assert len(suite.tasks) == 5
+
+    def test_every_task_has_test_command(self, suite):
+        for task in suite.tasks:
+            assert task.test_command, f"{task.task_id}: missing test_command"
+
+    def test_every_task_has_solution_and_test_files(self, suite):
+        for task in suite.tasks:
+            files = set(task.fixture_setup.keys())
+            assert "solution.py" in files, f"{task.task_id}: no solution.py"
+            assert "test_solution.py" in files, f"{task.task_id}: no test_solution.py"
+
+    def test_every_task_id_has_a_known_fix(self, suite):
+        # Future-proofing: when adding tasks, the contributor must also
+        # add an entry to _KNOWN_FIXES so the round-trip tests below cover it.
+        suite_ids = {t.task_id for t in suite.tasks}
+        fix_ids = set(_KNOWN_FIXES.keys())
+        assert suite_ids == fix_ids, (
+            f"task ids and known-fix ids out of sync: "
+            f"missing fixes={suite_ids - fix_ids}, "
+            f"extra fixes={fix_ids - suite_ids}"
+        )
+
+    def test_test_command_is_runnable_form(self, suite):
+        # We use shlex.split (no shell) on the test_command. Each command
+        # should split cleanly without quoting tricks.
+        for task in suite.tasks:
+            parts = shlex.split(task.test_command)
+            assert len(parts) >= 2, f"{task.task_id}: test_command too short"
+
+
+class TestPlantedBugsAreReal:
+    """For every task: materializing the baseline buggy fixture and running
+    its test must exit non-zero. If this fails, the planted bug is no bug
+    at all — the suite isn't measuring what it claims to measure."""
+
+    @pytest.mark.parametrize(
+        "task_id",
+        list(_KNOWN_FIXES.keys()),
+        ids=list(_KNOWN_FIXES.keys()),
+    )
+    def test_baseline_buggy_code_fails(self, task_id, tmp_path, suite):
+        task = next(t for t in suite.tasks if t.task_id == task_id)
+        _materialize(task.fixture_setup, tmp_path)
+        result = subprocess.run(
+            shlex.split(task.test_command),
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+            timeout=15,
+        )
+        assert result.returncode != 0, (
+            f"{task_id}: baseline buggy code unexpectedly passed the test.\n"
+            f"stdout: {result.stdout}\nstderr: {result.stderr}"
+        )
+
+
+class TestKnownFixesPass:
+    """For every task: applying the known fix and re-running the test must
+    exit zero. If this fails, the test is pathologically broken (asserts
+    the wrong thing, has a syntax error, etc.) and would never accept
+    any candidate fix — including the right one."""
+
+    @pytest.mark.parametrize(
+        "task_id,fixed_solution",
+        list(_KNOWN_FIXES.items()),
+        ids=list(_KNOWN_FIXES.keys()),
+    )
+    def test_known_fix_passes(self, task_id, fixed_solution, tmp_path, suite):
+        task = next(t for t in suite.tasks if t.task_id == task_id)
+        _materialize(task.fixture_setup, tmp_path)
+        # Apply the known fix.
+        (tmp_path / "solution.py").write_text(fixed_solution)
+        result = subprocess.run(
+            shlex.split(task.test_command),
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+            timeout=15,
+        )
+        assert result.returncode == 0, (
+            f"{task_id}: known fix did not make the test pass.\n"
+            f"stdout: {result.stdout}\nstderr: {result.stderr}"
+        )
+
+
+def _materialize(setup: dict[str, str], target: Path) -> None:
+    for rel, content in setup.items():
+        dest = target / rel
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        dest.write_text(content)

From b3e8289f87c0ce03a07e054c23bce07c96d15a0d Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 17 May 2026 15:18:05 -0600
Subject: [PATCH 4/5] feat(evolve_skill): wire closed-loop infrastructure
 end-to-end
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removes the placeholder UsageError on --closed-loop-during-evolution.
The skill-evolution path now matches the tool-evolution path:
closed-loop validator feedback flows into the reflection LM, and
opt-in trainset mode lets behavioral verdicts break judge ties in
GEPA's sum(scores) acceptance rule.

Five coordinated changes:

- CLI grows --closed-loop-saturation-threshold, --closed-loop-min-iters,
  --closed-loop-window-size, --closed-loop-mode, --closed-loop-in-valset,
  mirroring evolve_tool. No --closed-loop-hermes-repo: SkillSource
  discovery + the installer's writable workdir replace that.
- New _maybe_build_closed_loop_cache_skill helper constructs the
  SkillFileInstaller, ClosedLoopValidator, and ClosedLoopFeedbackCache
  wired with write_text_artifact and .md suffix.
- SkillModule.forward grows an optional closed_loop_task_id parameter.
  When present, skips the predictor LM call and returns a Prediction
  carrying _closed_loop_task_id + _candidate_text markers the metric
  routes on.
- Trainset/valset get behavioral examples appended when mode is
  trainset or both.
- Metric construction passes closed_loop_cache through.

build_behavioral_examples gets a task_field kwarg (default "task"
preserves tool-path behavior; skill-side passes "task_input" so the
example shape matches SkillModule.forward).

Default --closed-loop-mode is feedback for skills (not trainset).
Skill bodies mutate heavily, so the "always" gate mode that trainset
needs would fire the validator on every novel candidate — costlier
than the tool path. Users opt into trainset explicitly.
---
 evolution/core/behavioral_example.py          |  25 +--
 evolution/skills/evolve_skill.py              | 192 +++++++++++++++++-
 evolution/skills/skill_module.py              |  16 +-
 tests/skills/test_evolve_skill_closed_loop.py | 179 ++++++++++++++++
 tests/tools/test_evolve_tool_closed_loop.py   |  27 ++-
 5 files changed, 407 insertions(+), 32 deletions(-)
 create mode 100644 tests/skills/test_evolve_skill_closed_loop.py

diff --git a/evolution/core/behavioral_example.py b/evolution/core/behavioral_example.py
index 6e7c47f9..51bee60e 100644
--- a/evolution/core/behavioral_example.py
+++ b/evolution/core/behavioral_example.py
@@ -7,10 +7,13 @@
 and get accepted.
 
 The example carries a ``closed_loop_task_id`` marker the metric routes on,
-plus a placeholder ``task`` value (the suite's ``user_message``) that
-``ToolModule.forward`` skips past on the behavioral branch. ``task`` and
-``closed_loop_task_id`` are both marked as input keys so DSPy passes them
-to ``forward()`` via ``program(**example.inputs())``.
+plus a placeholder task-input value (the suite's ``user_message``) that the
+module's ``forward()`` skips past on the behavioral branch. Both fields are
+marked as input keys so DSPy passes them via ``program(**example.inputs())``.
+
+``task_field`` parameterizes the input field name to match the host module's
+forward signature: ``ToolModule.forward(task=...)`` uses ``"task"`` (the
+default); ``SkillModule.forward(task_input=...)`` passes ``"task_input"``.
 """
 
 from __future__ import annotations
@@ -20,17 +23,15 @@
 from evolution.validation.task import TaskSuite
 
 
-def build_behavioral_examples(suite: TaskSuite) -> list[dspy.Example]:
-    """One example per task in ``suite``, stable order by ``task_id``.
-
-    The placeholder ``task`` value carries the original ``user_message`` for
-    debuggability; it isn't consumed by the behavioral metric branch.
-    """
+def build_behavioral_examples(
+    suite: TaskSuite, *, task_field: str = "task"
+) -> list[dspy.Example]:
+    """One example per task in ``suite``, stable order by ``task_id``."""
     examples = [
         dspy.Example(
-            task=task.user_message,
+            **{task_field: task.user_message},
             closed_loop_task_id=task.task_id,
-        ).with_inputs("task", "closed_loop_task_id")
+        ).with_inputs(task_field, "closed_loop_task_id")
         for task in sorted(suite.tasks, key=lambda t: t.task_id)
     ]
     return examples
diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
index 4e0b8744..80498848 100644
--- a/evolution/skills/evolve_skill.py
+++ b/evolution/skills/evolve_skill.py
@@ -461,6 +461,83 @@ def _emit_patch(baseline_text: str, evolved_text: str, path: Path) -> str:
     return "".join(diff_lines)
 
 
+def _maybe_build_closed_loop_cache_skill(
+    *,
+    skill_name: str,
+    skill_path: Path,
+    baseline_skill_body: str,
+    suite_path: Optional[Path],
+    saturation_threshold: float,
+    min_iters: int,
+    window_size: int,
+    gate_mode: str = "sampled",
+):
+    """Build a ClosedLoopFeedbackCache for the skill path; return None when disabled.
+
+    Mirrors evolve_tool's _maybe_build_closed_loop_cache. Local imports keep
+    the validation stack out of the cold path — most evolve_skill runs don't
+    set the flag and shouldn't pay the import cost.
+
+    Constructs:
+      - a per-process workdir under /tmp where the installer maintains a
+        writable copy of the baseline skill (decoupled from the user's
+        real HERMES_HOME / plugin cache)
+      - SkillFileInstaller pointing at that workdir
+      - ClosedLoopValidator + HermesAgentRunner
+      - ClosedLoopFeedbackCache wired with write_text_artifact (skill bodies
+        are raw text, not MCP manifests) and .md suffix
+    """
+    if suite_path is None:
+        return None
+    import tempfile
+
+    from evolution.core.closed_loop_feedback import (
+        ClosedLoopFeedbackCache,
+        write_text_artifact,
+    )
+    from evolution.validation.artifact_installer import SkillFileInstaller
+    from evolution.validation.hermes_runner import HermesAgentRunner
+    from evolution.validation.task import TaskSuite
+    from evolution.validation.validator import ClosedLoopValidator
+
+    workdir = Path(tempfile.mkdtemp(prefix="cl_skill_workdir_"))
+    installer = SkillFileInstaller(
+        skill_source_path=skill_path,
+        skill_name=skill_name,
+        workdir=workdir,
+    )
+    runner = HermesAgentRunner()
+    validator = ClosedLoopValidator(installer=installer, runner=runner)
+    suite = TaskSuite.from_jsonl(suite_path)
+    return ClosedLoopFeedbackCache(
+        validator=validator,
+        suite=suite,
+        artifact_name=skill_name,
+        baseline_artifact_text=baseline_skill_body,
+        saturation_threshold=saturation_threshold,
+        min_iters=min_iters,
+        window_size=window_size,
+        gate_mode=gate_mode,
+        artifact_writer=write_text_artifact,
+        artifact_suffix=".md",
+    )
+
+
+def _load_behavioral_examples_from_suite(suite_path: Path) -> list:
+    """Build behavioral dspy.Examples from a suite file.
+
+    Uses ``task_field="task_input"`` so the examples are shape-compatible
+    with ``SkillModule.forward(task_input=...)`` — tool-side uses ``"task"``.
+    Local-import path keeps the validation stack out of the cold path.
+    """
+    from evolution.core.behavioral_example import build_behavioral_examples
+    from evolution.validation.task import TaskSuite
+
+    return build_behavioral_examples(
+        TaskSuite.from_jsonl(suite_path), task_field="task_input"
+    )
+
+
 def _apply_in_place(skill_path: Path, evolved_full: str) -> bool:
     """Overwrite ``skill_path`` with ``evolved_full``.
 
@@ -516,6 +593,12 @@ def evolve(
     benchmark_timeout_seconds: int = 600,
     skip_preflight: bool = False,
     skip_cost_suggest: bool = False,
+    closed_loop_suite_path: Optional[Path] = None,
+    closed_loop_saturation_threshold: float = 0.95,
+    closed_loop_min_iters: int = 3,
+    closed_loop_window_size: int = 8,
+    closed_loop_mode: str = "feedback",
+    closed_loop_in_valset: bool = False,
 ):
     """Main evolution function — orchestrates the full optimization loop."""
 
@@ -735,6 +818,25 @@ def evolve(
 
             baseline_module = SkillModule(skill["body"])
 
+            # In behavioral-trainset modes the saturation gate would defeat
+            # the purpose — every novel candidate must score every time it's
+            # sampled. Otherwise default to "sampled" to keep cost bounded;
+            # skill bodies mutate heavily, so cache hit rate on the validator
+            # is lower than tool-path.
+            _cache_gate_mode = (
+                "always" if closed_loop_mode in ("trainset", "both") else "sampled"
+            )
+            closed_loop_cache = _maybe_build_closed_loop_cache_skill(
+                skill_name=skill_name,
+                skill_path=skill_path,
+                baseline_skill_body=skill["body"],
+                suite_path=closed_loop_suite_path,
+                saturation_threshold=closed_loop_saturation_threshold,
+                min_iters=closed_loop_min_iters,
+                window_size=closed_loop_window_size,
+                gate_mode=_cache_gate_mode,
+            )
+
             # Build the metric once: DSPy's LM cache lines up across GEPA's
             # per-iteration scoring and the holdout eval below. The [BUDGET]
             # feedback line targets growth_free_threshold (the zone where the
@@ -745,11 +847,29 @@ def evolve(
                 judge,
                 baseline_skill_text=skill["body"],
                 max_growth=config.growth_free_threshold,
+                closed_loop_cache=closed_loop_cache,
             )
 
             trainset = dataset.to_dspy_examples("train")
             valset = dataset.to_dspy_examples("val")
 
+            # Behavioral-example injection: each closed-loop task becomes an
+            # additional dspy.Example whose score contributes to GEPA's
+            # sum(minibatch_scores) acceptance — behavioral wins can break
+            # judge ties on saturated baselines.
+            if closed_loop_mode in ("trainset", "both"):
+                if closed_loop_suite_path is None:
+                    raise ValueError(
+                        f"--closed-loop-mode={closed_loop_mode} requires "
+                        "--closed-loop-during-evolution to be set"
+                    )
+                behavioral_examples = _load_behavioral_examples_from_suite(
+                    closed_loop_suite_path
+                )
+                trainset = trainset + behavioral_examples
+                if closed_loop_in_valset:
+                    valset = valset + behavioral_examples
+
             console.print(f"\n[bold cyan]Running GEPA optimization (budget={gepa_budget})...[/bold cyan]\n")
 
             start_time = time.time()
@@ -1377,10 +1497,55 @@ def evolve(
     "closed_loop_suite_path",
     default=None,
     type=click.Path(exists=True, dir_okay=False, path_type=Path),
-    help="Path to a JSONL task suite for in-loop closed-loop validation feedback. "
-         "Wired symmetrically with evolve_tool, but skill-side closed-loop "
-         "validation requires a SkillFileInstaller that doesn't exist yet — "
-         "setting this flag raises until that lands.",
+    help="Path to a JSONL task suite (e.g. evolution/validation/suites/"
+         "systematic_debugging.jsonl). When set, the framework runs the "
+         "closed-loop validator on saturating GEPA iterations and surfaces "
+         "verdicts into the reflection LM's feedback. Held-out from training "
+         "tasks (no overlap-detection enforcement).",
+)
+@click.option(
+    "--closed-loop-saturation-threshold",
+    default=0.95,
+    type=click.FloatRange(min=0.0, max=1.0),
+    help="Min judge score over the recent window for the saturation gate to "
+         "open (default 0.95).",
+)
+@click.option(
+    "--closed-loop-min-iters",
+    default=3,
+    type=click.IntRange(min=1),
+    help="Periodic-fire floor: fire closed-loop at least every N reflective "
+         "iterations even when the judge isn't saturating (default 3).",
+)
+@click.option(
+    "--closed-loop-window-size",
+    default=8,
+    type=click.IntRange(min=1),
+    help="Number of recent judge scores the saturation gate inspects (default 8).",
+)
+@click.option(
+    "--closed-loop-mode",
+    default="feedback",
+    type=click.Choice(["feedback", "trainset", "both"]),
+    help="How closed-loop signal participates in GEPA. 'feedback' (default) "
+         "appends a [CLOSED_LOOP] block to the reflection LM's input — "
+         "proposal-prompt signal only, no acceptance change. 'trainset' adds "
+         "behavioral dspy.Examples to the training set whose score (binary "
+         "pass/fail from the validator) contributes to GEPA's "
+         "sum(minibatch_scores) acceptance — lets behavioral wins break judge "
+         "ties on saturated baselines. 'both' does trainset + the [CLOSED_LOOP] "
+         "feedback block (most expensive). Skill bodies mutate heavily so "
+         "trainset/both fires the validator on every novel candidate; default "
+         "stays 'feedback' to keep cost bounded.",
+)
+@click.option(
+    "--closed-loop-in-valset/--no-closed-loop-in-valset",
+    "closed_loop_in_valset",
+    default=False,
+    help="When --closed-loop-mode is trainset or both, also include behavioral "
+         "examples in the valset (adds them to the Pareto frontier + holdout "
+         "scoring). Costs more — each accepted candidate triggers another full "
+         "eval pass over the behavioral examples. Default off.",
 )
 def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflection_model,
          eval_model, skill_source_dir, dry_run, seed, budget, no_fallback,
@@ -1393,14 +1558,13 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
          benchmark_cmd, benchmark_timeout_seconds,
          skip_preflight,
          skip_cost_suggest,
-         closed_loop_suite_path):
+         closed_loop_suite_path,
+         closed_loop_saturation_threshold,
+         closed_loop_min_iters,
+         closed_loop_window_size,
+         closed_loop_mode,
+         closed_loop_in_valset):
     """Evolve an agent skill using DSPy + GEPA optimization."""
-    if closed_loop_suite_path is not None:
-        raise click.UsageError(
-            "--closed-loop-during-evolution is wired on evolve_skill for CLI "
-            "consistency, but skill-side closed-loop validation requires a "
-            "SkillFileInstaller that doesn't exist yet."
-        )
     try:
         evolve(
             skill_name=skill,
@@ -1437,6 +1601,12 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
             benchmark_timeout_seconds=benchmark_timeout_seconds,
             skip_preflight=skip_preflight,
             skip_cost_suggest=skip_cost_suggest,
+            closed_loop_suite_path=closed_loop_suite_path,
+            closed_loop_saturation_threshold=closed_loop_saturation_threshold,
+            closed_loop_min_iters=closed_loop_min_iters,
+            closed_loop_window_size=closed_loop_window_size,
+            closed_loop_mode=closed_loop_mode,
+            closed_loop_in_valset=closed_loop_in_valset,
         )
     except HermesProviderError as exc:
         # Render a clean error panel instead of dumping a Python traceback
diff --git a/evolution/skills/skill_module.py b/evolution/skills/skill_module.py
index b7655733..fc823585 100644
--- a/evolution/skills/skill_module.py
+++ b/evolution/skills/skill_module.py
@@ -93,7 +93,21 @@ def __init__(self, skill_text: str):
         # install the skill body there so forward() reads exactly what GEPA writes.
         self.predictor.predict.signature = self.predictor.predict.signature.with_instructions(skill_text)
 
-    def forward(self, task_input: str) -> dspy.Prediction:
+    def forward(
+        self,
+        task_input: str,
+        closed_loop_task_id: Optional[str] = None,
+    ) -> dspy.Prediction:
+        if closed_loop_task_id is not None:
+            # Behavioral example: skip the predictor LM call. The metric's
+            # behavioral branch reads _closed_loop_task_id off the Prediction
+            # and routes to the closed-loop cache for scoring, so the LM call
+            # would be wasted spend.
+            return dspy.Prediction(
+                output="",
+                _closed_loop_task_id=closed_loop_task_id,
+                _candidate_text=self.skill_text,
+            )
         result = self.predictor(task_input=task_input)
         return dspy.Prediction(output=result.output)
 
diff --git a/tests/skills/test_evolve_skill_closed_loop.py b/tests/skills/test_evolve_skill_closed_loop.py
new file mode 100644
index 00000000..91dc4d99
--- /dev/null
+++ b/tests/skills/test_evolve_skill_closed_loop.py
@@ -0,0 +1,179 @@
+"""Tests for the closed-loop integration on the skill-evolution path.
+
+Exercises the integration seams independently of GEPA: SkillModule's
+behavioral branch, the cache-construction helper, the behavioral-example
+loader's task_input shape, and the gate_mode policy. End-to-end behavior
+is exercised by the manual smoke harness at
+``tests/manual/skill_closed_loop_smoke.py`` (heavy, real LM spend).
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+from evolution.core.behavioral_example import build_behavioral_examples
+from evolution.skills.evolve_skill import (
+    _load_behavioral_examples_from_suite,
+    _maybe_build_closed_loop_cache_skill,
+)
+from evolution.skills.skill_module import SkillModule
+from evolution.validation.task import TaskSuite
+
+
+_SUITE_FIXTURE = (
+    Path(__file__).resolve().parents[2]
+    / "evolution"
+    / "validation"
+    / "suites"
+    / "systematic_debugging.jsonl"
+)
+
+
+@pytest.fixture
+def fake_skill_path(tmp_path):
+    src = tmp_path / "systematic_debugging"
+    src.mkdir()
+    md = src / "SKILL.md"
+    md.write_text(
+        "---\nname: systematic_debugging\n---\n\n# Weak baseline\n\nTry stuff.\n"
+    )
+    return md
+
+
+# ---------------------------------------------------------------------------
+# SkillModule.forward behavioral branch
+# ---------------------------------------------------------------------------
+
+
+class TestSkillModuleBehavioralBranch:
+    def test_normal_branch_calls_predictor(self):
+        # closed_loop_task_id absent → take the LM path. We mock the
+        # predictor to avoid a real LM call.
+        module = SkillModule("body")
+        with patch.object(module, "predictor") as mock_pred:
+            mock_pred.return_value = type("R", (), {"output": "hello"})()
+            result = module.forward(task_input="do X")
+        assert result.output == "hello"
+        mock_pred.assert_called_once_with(task_input="do X")
+
+    def test_behavioral_branch_skips_predictor(self):
+        # closed_loop_task_id present → no LM call; marker fields are
+        # stuffed onto the Prediction for the metric to route. Real
+        # SkillModule (no mock) because the behavioral branch must not
+        # touch the predictor at all — if it did, this test would hang
+        # waiting for a non-existent LM configuration.
+        module = SkillModule("evolved body text")
+        result = module.forward(
+            task_input="placeholder",
+            closed_loop_task_id="t1",
+        )
+        assert result._closed_loop_task_id == "t1"
+        assert result._candidate_text == "evolved body text"
+        assert result.output == ""
+
+
+# ---------------------------------------------------------------------------
+# Behavioral-example loader (skill-side task_input shape)
+# ---------------------------------------------------------------------------
+
+
+class TestBehavioralExampleLoader:
+    def test_skill_examples_use_task_input_field(self, tmp_path):
+        suite_path = tmp_path / "s.jsonl"
+        suite_path.write_text(
+            '{"task_id": "a", "user_message": "debug a", "test_command": "python t.py"}\n'
+            '{"task_id": "b", "user_message": "debug b", "test_command": "python t.py"}\n'
+        )
+        examples = _load_behavioral_examples_from_suite(suite_path)
+        assert len(examples) == 2
+        # Skill modules take `task_input`; the loader must use that field name.
+        assert "task_input" in examples[0].inputs()
+        assert "closed_loop_task_id" in examples[0].inputs()
+        assert examples[0].task_input == "debug a"
+
+    def test_tool_side_default_still_uses_task_field(self, tmp_path):
+        # build_behavioral_examples default preserves tool-path shape.
+        suite_path = tmp_path / "s.jsonl"
+        suite_path.write_text(
+            '{"task_id": "a", "user_message": "do a", "expected_tools": ["patch"]}\n'
+        )
+        examples = build_behavioral_examples(TaskSuite.from_jsonl(suite_path))
+        assert "task" in examples[0].inputs()
+        assert examples[0].task == "do a"
+
+
+# ---------------------------------------------------------------------------
+# Cache construction helper
+# ---------------------------------------------------------------------------
+
+
+class TestMaybeBuildClosedLoopCacheSkill:
+    def test_none_suite_returns_none(self, fake_skill_path):
+        cache = _maybe_build_closed_loop_cache_skill(
+            skill_name="systematic_debugging",
+            skill_path=fake_skill_path,
+            baseline_skill_body="body",
+            suite_path=None,
+            saturation_threshold=0.95,
+            min_iters=3,
+            window_size=8,
+        )
+        assert cache is None
+
+    def test_constructs_cache_with_skill_installer_and_md_suffix(
+        self, fake_skill_path
+    ):
+        cache = _maybe_build_closed_loop_cache_skill(
+            skill_name="systematic_debugging",
+            skill_path=fake_skill_path,
+            baseline_skill_body="baseline body text",
+            suite_path=_SUITE_FIXTURE,
+            saturation_threshold=0.95,
+            min_iters=3,
+            window_size=8,
+        )
+        assert cache is not None
+        # Baseline written as raw text via write_text_artifact (no JSON envelope).
+        baseline_text = cache._baseline_path.read_text()
+        assert baseline_text == "baseline body text"
+        # Suffix is .md for skill artifacts (not the default .json).
+        assert cache._baseline_path.suffix == ".md"
+        # Default gate_mode for skills is "sampled" — caller passes "always"
+        # only when wiring trainset mode.
+        assert cache.gate_mode == "sampled"
+
+    def test_gate_mode_always_propagates(self, fake_skill_path):
+        cache = _maybe_build_closed_loop_cache_skill(
+            skill_name="systematic_debugging",
+            skill_path=fake_skill_path,
+            baseline_skill_body="body",
+            suite_path=_SUITE_FIXTURE,
+            saturation_threshold=0.95,
+            min_iters=3,
+            window_size=8,
+            gate_mode="always",
+        )
+        assert cache.gate_mode == "always"
+
+    def test_installer_skills_src_is_under_workdir(self, fake_skill_path):
+        # The cache's validator holds an installer with skills_src — confirming
+        # the wiring is intact so the runner sees the candidate.
+        cache = _maybe_build_closed_loop_cache_skill(
+            skill_name="systematic_debugging",
+            skill_path=fake_skill_path,
+            baseline_skill_body="body",
+            suite_path=_SUITE_FIXTURE,
+            saturation_threshold=0.95,
+            min_iters=3,
+            window_size=8,
+        )
+        installer = cache._validator.installer
+        assert installer.skills_src.name == "skills"
+        # The baseline skill was copied into the installer's workdir.
+        target = installer.skills_src / "systematic_debugging" / "SKILL.md"
+        assert target.is_file()
+        # The user's original SKILL.md is untouched.
+        assert "Weak baseline" in fake_skill_path.read_text()
diff --git a/tests/tools/test_evolve_tool_closed_loop.py b/tests/tools/test_evolve_tool_closed_loop.py
index 8e272ad4..f0c8c42b 100644
--- a/tests/tools/test_evolve_tool_closed_loop.py
+++ b/tests/tools/test_evolve_tool_closed_loop.py
@@ -142,18 +142,29 @@ def test_flag_with_repo_does_not_immediately_error(
         assert kwargs["closed_loop_hermes_repo"] == fake_hermes_repo
 
 
-class TestEvolveSkillFlagRaises:
-    def test_skill_side_flag_raises_with_clear_error(
+class TestEvolveSkillFlagThreadsThrough:
+    def test_skill_side_closed_loop_flag_passes_to_evolve(
         self, tmp_path, task_suite_file
     ):
+        # The skill-side --closed-loop-during-evolution flag is now fully
+        # wired (no longer a placeholder UsageError). Confirms the flag
+        # reaches evolve() — full end-to-end behavior is exercised by
+        # tests/skills/test_evolve_skill_closed_loop.py and the manual
+        # smoke harness.
         from evolution.skills.evolve_skill import main
         runner = CliRunner()
-        result = runner.invoke(main, [
-            "--skill", "some-skill",
-            "--closed-loop-during-evolution", str(task_suite_file),
-        ])
-        assert result.exit_code != 0
-        assert "SkillFileInstaller" in result.output
+        with patch("evolution.skills.evolve_skill.evolve") as fake_evolve:
+            fake_evolve.return_value = None
+            result = runner.invoke(main, [
+                "--skill", "some-skill",
+                "--closed-loop-during-evolution", str(task_suite_file),
+            ])
+        assert result.exit_code == 0, result.output
+        kwargs = fake_evolve.call_args.kwargs
+        assert kwargs["closed_loop_suite_path"] == task_suite_file
+        # Default mode is feedback for skills (skill bodies mutate heavily
+        # so opting into trainset is explicit).
+        assert kwargs["closed_loop_mode"] == "feedback"
 
 
 class TestClosedLoopModeFlag:

From 5c6a8065a9281c2a414e544433f014233e445ed7 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 17 May 2026 15:20:44 -0600
Subject: [PATCH 5/5] docs+chore: close PLAN deviation #3 + add skill
 closed-loop smoke harness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PLAN.md Phase 1 deviation #3 was the unfinished-skill-side-closed-loop
note. systematic-debugging is now built (commits 1-4); update the
deviation to reflect that, and explicitly call out which equivalents
remain deferred (arxiv known-paper-recall, github-code-review
planted-issue — same harness, different verdict mechanisms).

docs/workflows.md grows a "Skill-path equivalent" subsection under
Workflow 11 covering the two structural differences (test_command
verdict vs tool-call membership; SkillFileInstaller vs in-place tool
splice) and the rationale for the different default --closed-loop-mode
(feedback for skills, since heavy mutation makes "always" gate mode
3-10x costlier than tool-path).

tests/manual/skill_closed_loop_smoke.py drives the closed-loop layer
end-to-end against the fake systematic-debugging skill + the planted-bug
suite, with real hermes -z subprocesses. Pre-flight checks confirm
hermes and python are on PATH. Asserts the cache fires, per-task
verdicts come back, candidate-text caching dedups the second call,
and no orphaned per-task tmpdirs leak. Not in CI — heavyweight (10
hermes -z calls, real LM spend).
---
 PLAN.md                                 |   2 +-
 docs/workflows.md                       |  11 ++
 tests/manual/skill_closed_loop_smoke.py | 209 ++++++++++++++++++++++++
 3 files changed, 221 insertions(+), 1 deletion(-)
 create mode 100644 tests/manual/skill_closed_loop_smoke.py

diff --git a/PLAN.md b/PLAN.md
index a8566baf..a7b12172 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -372,7 +372,7 @@ The agent can self-invoke optimization:
 
 1. **Benchmark gating (TBLite/YC-Bench) was not wired as a built-in.** The plan made "TBLite within 2%" the next-phase gate. The framework went framework-agnostic (any agent that emits `SKILL.md`), so the Hermes-specific benchmark suites no longer apply uniformly. The built-in deploy gate is a paired-bootstrap CI on a held-out split of the skill's own eval set (`evolution/core/stats.py` + `_check_growth_with_quality_gate`). For users who want a benchmark-style gate, `--benchmark-cmd "<shell command>"` runs an arbitrary user-provided command after the built-in gate passes; nonzero exit flips the decision to reject with `reason="benchmark_failed"` and a `benchmark` block in `gate_decision.json`. This restores the plan author's "benchmarks as gates" vision in a framework-agnostic way: TBLite users wire `--benchmark-cmd "tblite-runner --threshold 0.02"`, no built-in coupling.
 2. **Eval datasets are larger than the planned 15–30 examples.** `eval_dataset_size=150` is the current default, sized so the bootstrap CI half-width can detect ±2% effects. Rationale and the supporting study live in `reports/calibration_findings.md`.
-3. **Source D (skill-specific auto-evaluation) is now built for tool descriptions; skill-side auto-eval still deferred.** `evolution/validation/closed_loop` drives a real Hermes Agent via `hermes -z` against a JSONL task suite, scores actual tool-selection behavior with both expected and forbidden tools, compares baseline vs. evolved with a two-condition decision rule (no aggregate regression + no per-task loss unless wins offset 2:1). v1 ships a 5-task suite for `patch`. Skill-side equivalents (planted-bug for `systematic-debugging`, known-paper recall for `arxiv`, planted-issue for `github-code-review`) remain deferred — same harness shape, needs different `ArtifactInstaller` impls.
+3. **Source D (skill-specific auto-evaluation) is built for tool descriptions and for `systematic-debugging`; remaining skill-side suites still deferred.** `evolution/validation/closed_loop` drives a real Hermes Agent via `hermes -z` against a JSONL task suite, scores actual tool-selection behavior with both expected and forbidden tools, compares baseline vs. evolved with a two-condition decision rule (no aggregate regression + no per-task loss unless wins offset 2:1). v1 ships a 5-task suite for `patch` (tool-side). Skill-side now ships `systematic-debugging` via a new `SkillFileInstaller` plus a `test_command` verdict mode (exit code of a planted test replaces tool-call membership scoring); same harness, different artifact + scoring shape. Remaining skill-side equivalents (known-paper recall for `arxiv`, planted-issue for `github-code-review`) remain deferred — same harness, different verdict mechanisms (rubric-based for arxiv, planted-issue identification for code-review).
 4. **The selection and gating layer is much thicker than the plan anticipated.** Knee-point Pareto selection, a budget-aware reflection-prompt proposer with growth/balanced/compression modes, paired-bootstrap CI, non-inferiority gate option, and the `gate_decision.json` v4 audit schema were added on top of raw GEPA. Rationale: raw "ship the best valset candidate" overfits at small N. See `docs/framework_advantages.md`.
 5. **CLI shipped as `python -m evolution.skills.evolve_skill`, not a `hermes evolve skill` subcommand.** The framework rebranded out of Hermes-only branding so it could target any agent framework whose skills are `SKILL.md` files.
 6. **`--run-tests` (target-repo pytest as a hard gate) was specified but never wired and has been removed.** Skill text doesn't touch target-repo Python, so coupling the deploy decision to the target repo's test suite would gate on unrelated signal. The CLI flag set a config field that no code read. The unused `EvolutionConfig.run_pytest`, `run_tblite`, `tblite_regression_threshold` fields and `ConstraintValidator.run_test_suite()` method were removed alongside the flag.
diff --git a/docs/workflows.md b/docs/workflows.md
index 6531e95d..01706618 100644
--- a/docs/workflows.md
+++ b/docs/workflows.md
@@ -479,6 +479,17 @@ Per-task scores are deterministic over candidate text (cache is keyed by `sha256
 
 `--closed-loop-mode both` does both: trainset behavioral examples for acceptance, plus the `[CLOSED_LOOP]` feedback block on non-behavioral examples for reflection.
 
+### Skill-path equivalent
+
+`evolve_skill` exposes the same `--closed-loop-*` flags. Two differences from the tool path:
+
+- **Verdict mechanism is `test_command`, not tool-call membership.** Skill-side suites set `"test_command": "python test_solution.py"` on each task; the validator runs that command in `fixture_dir` after the agent and passes iff exit code is zero. `expected_tools` / `forbidden_tools` aren't meaningful for "did the agent debug correctly"-shape verdicts. The decision rule (two-condition: aggregate no-regression + per-task wins offset losses 2:1) is unchanged.
+- **`SkillFileInstaller` instead of in-place tool description splice.** The user's actual skill may live in a read-only plugin cache, so the installer copies the baseline skill directory into a writable workdir at construction and mutates the copy. `HermesAgentRunner._prime_sandbox` reads `TaskRunContext.skills_src` and copies that workdir's `skills/` into each per-task sandbox so `hermes -z` discovers the candidate. The user's source skill is never touched.
+
+Default `--closed-loop-mode` is `feedback` (not `trainset`) on the skill side. Skill bodies mutate heavily, so the `gate_mode="always"` that trainset needs would fire the validator on every novel candidate — N tasks × 2 phases per fire. Opt into `trainset` / `both` explicitly when the cost is acceptable.
+
+Reference suite: `evolution/validation/suites/systematic_debugging.jsonl` (5 planted-bug tasks). Manual smoke harness: `tests/manual/skill_closed_loop_smoke.py`.
+
 ## Failure-mode summary
 
 | Trigger | Outcome | Where to look |
diff --git a/tests/manual/skill_closed_loop_smoke.py b/tests/manual/skill_closed_loop_smoke.py
new file mode 100644
index 00000000..df398d47
--- /dev/null
+++ b/tests/manual/skill_closed_loop_smoke.py
@@ -0,0 +1,209 @@
+"""Manual end-to-end smoke for skill-side closed-loop validation.
+
+Why this exists:
+  Unit tests mock the validator so they catch wiring bugs without paying
+  for real LM calls. They don't exercise the real Hermes agent against
+  the real planted-bug suite, so a regression in (e.g.) how Hermes
+  discovers skills inside the per-task sandbox, or how `python
+  test_solution.py` actually scores in the validator, would slip
+  through CI. This smoke runs the entire closed-loop layer end-to-end
+  against:
+    - the checked-in fake target skill at tests/fixtures/skills/systematic_debugging/
+    - the checked-in planted-bug suite at evolution/validation/suites/systematic_debugging.jsonl
+    - whatever Hermes is configured with (uses the user's real ~/.hermes/config.yaml)
+
+  Drives one ClosedLoopValidator.validate() call directly — 5 tasks ×
+  2 phases (baseline + evolved) = 10 hermes -z invocations. With a
+  cheap eval model (gpt-4o-mini-ish) this is pennies, not dollars.
+
+How to run:
+  uv run python tests/manual/skill_closed_loop_smoke.py
+
+  Exits 0 on success. Not part of CI — heavyweight (drives real
+  hermes -z subprocesses + real LM spend).
+
+Prerequisites:
+  - `hermes` on PATH and configured (`hermes model` to select one)
+  - Whatever credentials Hermes needs in `~/.hermes/auth.json`
+"""
+
+from __future__ import annotations
+
+import shutil
+import sys
+import tempfile
+import time
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+FAKE_SKILL_PATH = (
+    REPO_ROOT / "tests" / "fixtures" / "skills" / "systematic_debugging" / "SKILL.md"
+)
+SUITE_PATH = (
+    REPO_ROOT / "evolution" / "validation" / "suites" / "systematic_debugging.jsonl"
+)
+SKILL_NAME = "systematic_debugging"
+
+
+# A deliberately different candidate body so the validator has something
+# to do — also serves as a smoke for the structural-difference path.
+EVOLVED_BODY = (
+    "# Systematic debugging — evolved\n"
+    "\n"
+    "When a test fails:\n"
+    "1. Read the test to understand the spec.\n"
+    "2. Read the buggy code with the spec in mind.\n"
+    "3. Form a hypothesis about which line is wrong.\n"
+    "4. Make the smallest change you can.\n"
+    "5. Re-run the test. If it passes, stop. If not, revert and try again.\n"
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _watermark_tempdirs(prefix_substrings: list[str]) -> set[Path]:
+    tmp = Path(tempfile.gettempdir())
+    found: set[Path] = set()
+    for entry in tmp.iterdir():
+        if any(sub in entry.name for sub in prefix_substrings):
+            found.add(entry)
+    return found
+
+
+def _section(title: str) -> None:
+    print(f"\n{'=' * 60}\n  {title}\n{'=' * 60}")
+
+
+# ---------------------------------------------------------------------------
+# The smoke
+# ---------------------------------------------------------------------------
+
+
+def main() -> int:
+    _section("Pre-flight checks")
+    if not FAKE_SKILL_PATH.is_file():
+        print(f"  ✗ Missing fixture skill at {FAKE_SKILL_PATH}")
+        return 1
+    if not SUITE_PATH.is_file():
+        print(f"  ✗ Missing suite at {SUITE_PATH}")
+        return 1
+    if shutil.which("hermes") is None:
+        print(
+            "  ✗ `hermes` not on PATH. Install hermes-agent first "
+            "(see README); this smoke drives real hermes -z calls."
+        )
+        return 1
+    if shutil.which("python") is None:
+        print("  ✗ `python` not on PATH. Suite tasks use `python test_solution.py`.")
+        return 1
+    print(f"  ✓ Fixture skill present: {FAKE_SKILL_PATH}")
+    print(f"  ✓ Suite present: {SUITE_PATH}")
+    print(f"  ✓ hermes binary present: {shutil.which('hermes')}")
+
+    _section("Constructing closed-loop cache")
+    from evolution.skills.evolve_skill import _maybe_build_closed_loop_cache_skill
+    from evolution.skills.skill_module import load_skill
+
+    skill = load_skill(FAKE_SKILL_PATH)
+    print(f"  Baseline body ({len(skill['body'])} chars):")
+    print("  ┌" + "─" * 58)
+    for line in skill["body"].splitlines()[:5]:
+        print(f"  │ {line}")
+    if len(skill["body"].splitlines()) > 5:
+        print("  │ ...")
+    print("  └" + "─" * 58)
+
+    tempdirs_before = _watermark_tempdirs(["cl_skill_workdir_", "cl_feedback_"])
+    cache = _maybe_build_closed_loop_cache_skill(
+        skill_name=SKILL_NAME,
+        skill_path=FAKE_SKILL_PATH,
+        baseline_skill_body=skill["body"],
+        suite_path=SUITE_PATH,
+        saturation_threshold=0.95,
+        min_iters=1,
+        window_size=4,
+        gate_mode="always",  # force the validator to fire
+    )
+    assert cache is not None, "cache should be constructed when suite_path is set"
+    print(f"  ✓ Cache constructed (gate_mode={cache.gate_mode})")
+    installer = cache._validator.installer
+    print(f"  ✓ Installer skills_src: {installer.skills_src}")
+    target = installer.skills_src / SKILL_NAME / "SKILL.md"
+    assert target.is_file(), f"baseline skill not copied into workdir: {target}"
+    print(f"  ✓ Baseline staged at: {target}")
+
+    _section("Running validator (5 tasks × 2 phases = 10 hermes -z calls)")
+    print("  This invokes hermes -z 10 times. Wait ~60-180s depending on LM…\n")
+    start = time.time()
+    report = cache.get_or_run(EVOLVED_BODY)
+    elapsed = time.time() - start
+    print(f"\n  ✓ Validator completed in {elapsed:.1f}s")
+
+    if report is None:
+        print(
+            "  ✗ Validator returned None — gate closed or validator raised. "
+            "Re-run with logging at WARNING+ to see the cause."
+        )
+        return 1
+
+    _section("Report")
+    print(f"  Decision: {report.decision}")
+    for reason in report.decision_reasons:
+        print(f"    • {reason}")
+    print(
+        f"\n  Baseline:  {report.baseline.n_passed}/"
+        f"{report.baseline.n_passed + report.baseline.n_failed} "
+        f"({report.baseline.pass_rate:.0%})"
+    )
+    print(
+        f"  Evolved:   {report.evolved.n_passed}/"
+        f"{report.evolved.n_passed + report.evolved.n_failed} "
+        f"({report.evolved.pass_rate:.0%})"
+    )
+    print(f"  Δ:         {report.delta.pass_rate_change:+.2f}")
+    print(
+        f"  Per-task:  {report.delta.n_wins} wins, "
+        f"{report.delta.n_losses} losses, {report.delta.n_ties} ties"
+    )
+
+    print("\n  Per-task verdicts (baseline → evolved):")
+    b_by_id = {t.task_id: t for t in report.baseline.tasks}
+    for ev in report.evolved.tasks:
+        b = b_by_id.get(ev.task_id)
+        b_mark = "✓" if (b and b.passed) else ("◌" if (b and b.abstained) else "✗")
+        e_mark = "✓" if ev.passed else ("◌" if ev.abstained else "✗")
+        print(f"    [{b_mark} → {e_mark}] {ev.task_id}")
+
+    _section("Cache + temp dir housekeeping")
+    second_report = cache.get_or_run(EVOLVED_BODY)
+    assert second_report is report, (
+        "second get_or_run with same candidate should be a cache hit"
+    )
+    print("  ✓ Candidate-text cache hit on second call (no re-run)")
+
+    tempdirs_after = _watermark_tempdirs(["cl_skill_workdir_", "cl_feedback_"])
+    leaked = tempdirs_after - tempdirs_before
+    # Cache + installer workdirs persist until process exit by design.
+    # Count them — they should appear here. What we DON'T want to see is
+    # cl_fixture_ or cl_hermes_home_ persisting (those are per-task tmpdirs
+    # that the validator/runner cleans up in their own scope).
+    leaked_per_task = _watermark_tempdirs(["cl_fixture_", "cl_hermes_home_"])
+    print(f"  New persistent workdirs (expected, ≤ 2): {len(leaked)}")
+    print(f"  Orphaned per-task tmpdirs (should be 0): {len(leaked_per_task)}")
+    if leaked_per_task:
+        print(f"  ✗ Orphaned: {leaked_per_task}")
+        return 1
+
+    _section("All assertions passed")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())