From dc32179c89eb78320349c6281b5d2199b5cad41c Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 17 May 2026 14:56:15 -0600 Subject: [PATCH 1/5] feat(validation): test_command verdict mode for skill-shaped tasks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tool-side closed-loop scoring is a tool-call membership test (was an expected tool invoked, was a forbidden tool avoided). Skill-side suites need a different verdict: did the agent's edits make a planted test pass. Extends Task with an optional test_command field. When set, score_task runs the command in fixture_dir after the agent finishes and passes iff exit code is zero — tool-call membership is bypassed. Command failure modes (nonzero exit, timeout, FileNotFoundError) all map to "not passed" since the verdict is exit-code-driven. Runner errors still abstain. Uses shlex.split (no shell) so suite-controlled commands don't pick up shell metacharacters. Default timeout 60s. Tool-side suites (patch, write_file, search_files) are unchanged — they never set test_command and follow the existing tool-call rule. --- evolution/validation/report.py | 40 ++++++++++ evolution/validation/task.py | 26 +++++-- evolution/validation/validator.py | 2 + tests/validation/test_report.py | 119 ++++++++++++++++++++++++++++++ tests/validation/test_task.py | 25 +++++++ 5 files changed, 207 insertions(+), 5 deletions(-) diff --git a/evolution/validation/report.py b/evolution/validation/report.py index 6345832f..94de672d 100644 --- a/evolution/validation/report.py +++ b/evolution/validation/report.py @@ -7,6 +7,8 @@ from __future__ import annotations import json +import shlex +import subprocess from dataclasses import asdict, dataclass, field from pathlib import Path from typing import Any, Optional @@ -50,14 +52,31 @@ def score_task( expected_tools: tuple[str, ...], forbidden_tools: tuple[str, ...], run: AgentRunResult, + test_command: Optional[str] = None, + fixture_dir: Optional[Path] = None, + test_command_timeout_seconds: float = 60.0, ) -> tuple[bool, bool]: """Return (passed, abstained). Abstention takes precedence over pass/fail: a task that errored out in the runner is not evidence of the artifact's quality either way. + + When ``test_command`` is set (skill-side suites), the verdict is + "did the planted test pass after the agent's edits": the command + runs in ``fixture_dir`` with the given timeout, and passes iff exit + code is zero. ``expected_tools`` / ``forbidden_tools`` are ignored + in this mode. Command failure modes (nonzero exit, timeout, + FileNotFoundError) all map to ``(False, False)`` — "the test did + not pass," which is the meaningful verdict regardless of cause. """ if run.error is not None: return False, True + if test_command is not None: + if fixture_dir is None: + raise ValueError( + "score_task: fixture_dir is required when test_command is set" + ) + return _run_test_command(test_command, fixture_dir, test_command_timeout_seconds), False invoked = set(run.tool_calls_seq) if forbidden_tools and (invoked & set(forbidden_tools)): return False, False @@ -66,6 +85,27 @@ def score_task( return True, False +def _run_test_command(command: str, cwd: Path, timeout_seconds: float) -> bool: + """Run ``command`` in ``cwd``. Return True iff exit code is zero. + + Uses ``shlex.split`` (no shell) so suite-controlled commands don't + accidentally pick up shell metacharacters. All failure modes + (nonzero exit, timeout, FileNotFoundError, OSError) return False — + the test did not pass is the verdict the caller needs. + """ + try: + result = subprocess.run( + shlex.split(command), + cwd=cwd, + capture_output=True, + text=True, + timeout=timeout_seconds, + ) + return result.returncode == 0 + except (subprocess.TimeoutExpired, FileNotFoundError, OSError): + return False + + def summarize_phase(task_results: list[TaskResult]) -> PhaseResult: n_passed = sum(1 for r in task_results if r.passed and not r.abstained) n_abstained = sum(1 for r in task_results if r.abstained) diff --git a/evolution/validation/task.py b/evolution/validation/task.py index 72bd6834..209c8b38 100644 --- a/evolution/validation/task.py +++ b/evolution/validation/task.py @@ -6,6 +6,7 @@ import json from dataclasses import dataclass, field from pathlib import Path +from typing import Optional @dataclass(frozen=True) @@ -18,11 +19,19 @@ class Task: by that dir's absolute path so the task can reference the fixture files unambiguously. - ``expected_tools`` and ``forbidden_tools`` define the pass rule: - pass iff at least one expected tool was invoked AND no forbidden - tool was invoked. Both empty is a degenerate "no-tool task" — the - pass rule reduces to "agent didn't invoke any forbidden tools," - which trivially passes if forbidden is empty too. + Two verdict mechanisms are supported, depending on what the suite + is validating: + + - ``expected_tools`` / ``forbidden_tools`` (tool-side): pass iff at + least one expected tool was invoked AND no forbidden tool was + invoked. Both empty is a degenerate "no-tool task" — the pass + rule reduces to "agent didn't invoke any forbidden tools," which + trivially passes if forbidden is empty too. + - ``test_command`` (skill-side): pass iff the command exits zero + when run in ``fixture_dir`` after the agent finishes. Use when + the verdict is "did the agent's edits make the planted test + pass" rather than "did the agent invoke the right tools." When + set, takes precedence over the tool-call rule. """ task_id: str @@ -30,6 +39,7 @@ class Task: expected_tools: tuple[str, ...] = () forbidden_tools: tuple[str, ...] = () fixture_setup: dict[str, str] = field(default_factory=dict) + test_command: Optional[str] = None def render_message(self, fixture_dir: Path) -> str: """Substitute ``{fixture_dir}`` in the message with the resolved path. @@ -83,10 +93,16 @@ def _task_from_dict(obj: dict, *, source: str) -> Task: raise ValueError( f"{source}: fixture_setup must be a dict of relative_path -> content" ) + test_command = obj.get("test_command") + if test_command is not None and not isinstance(test_command, str): + raise ValueError( + f"{source}: test_command must be a string (got {type(test_command).__name__})" + ) return Task( task_id=obj["task_id"], user_message=obj["user_message"], expected_tools=tuple(obj.get("expected_tools") or ()), forbidden_tools=tuple(obj.get("forbidden_tools") or ()), fixture_setup=dict(fixture_setup), + test_command=test_command, ) diff --git a/evolution/validation/validator.py b/evolution/validation/validator.py index 56c078cb..da24959c 100644 --- a/evolution/validation/validator.py +++ b/evolution/validation/validator.py @@ -147,6 +147,8 @@ def _run_one_task(self, task: Task) -> TaskResult: expected_tools=task.expected_tools, forbidden_tools=task.forbidden_tools, run=run, + test_command=task.test_command, + fixture_dir=fixture_dir, ) return TaskResult( task_id=task.task_id, diff --git a/tests/validation/test_report.py b/tests/validation/test_report.py index abd02d75..ec8eaa8b 100644 --- a/tests/validation/test_report.py +++ b/tests/validation/test_report.py @@ -68,6 +68,125 @@ def test_error_marks_abstention(self): assert abstained +class TestScoreTaskTestCommandMode: + """When ``test_command`` is set on a task, the verdict is exit-code-driven, + not tool-call-driven. Used by skill-side suites (e.g., planted-bug: + "did the agent's edits make the test pass"). + """ + + @staticmethod + def _ok_run() -> AgentRunResult: + return AgentRunResult( + tool_calls_seq=[], final_text_tail="", duration_seconds=1.0, + ) + + def test_passes_on_exit_zero(self, tmp_path): + (tmp_path / "ok.py").write_text("import sys; sys.exit(0)\n") + passed, abstained = score_task( + expected_tools=(), forbidden_tools=(), run=self._ok_run(), + test_command="python ok.py", + fixture_dir=tmp_path, + ) + assert passed + assert not abstained + + def test_fails_on_nonzero_exit(self, tmp_path): + (tmp_path / "bad.py").write_text("import sys; sys.exit(1)\n") + passed, abstained = score_task( + expected_tools=(), forbidden_tools=(), run=self._ok_run(), + test_command="python bad.py", + fixture_dir=tmp_path, + ) + assert not passed + assert not abstained + + def test_timeout_marks_failed_not_abstained(self, tmp_path): + # Treat hangs as failure rather than abstention — a debugging + # task that goes infinite is the agent's failure to debug. + (tmp_path / "slow.py").write_text("import time; time.sleep(60)\n") + passed, abstained = score_task( + expected_tools=(), forbidden_tools=(), run=self._ok_run(), + test_command="python slow.py", + fixture_dir=tmp_path, + test_command_timeout_seconds=0.3, + ) + assert not passed + assert not abstained + + def test_cwd_is_fixture_dir(self, tmp_path): + # The test script verifies its own cwd matches the fixture dir. + (tmp_path / "cwd_check.py").write_text( + "import os, sys\n" + "sys.exit(0 if os.path.realpath(os.getcwd()) == sys.argv[1] else 1)\n" + ) + passed, _ = score_task( + expected_tools=(), forbidden_tools=(), run=self._ok_run(), + test_command=f"python cwd_check.py {tmp_path.resolve()}", + fixture_dir=tmp_path, + ) + assert passed + + def test_precedence_over_tool_call_rule(self, tmp_path): + # When test_command is set, the tool-call rule is ignored + # entirely — even if the agent invoked a forbidden tool. + (tmp_path / "ok.py").write_text("") # empty file → python ok.py exits 0 + run = AgentRunResult( + tool_calls_seq=["forbidden_tool"], final_text_tail="", duration_seconds=1.0, + ) + passed, _ = score_task( + expected_tools=("expected_tool",), + forbidden_tools=("forbidden_tool",), + run=run, + test_command="python ok.py", + fixture_dir=tmp_path, + ) + assert passed + + def test_command_not_found_fails(self, tmp_path): + passed, abstained = score_task( + expected_tools=(), forbidden_tools=(), run=self._ok_run(), + test_command="this-binary-does-not-exist-12345", + fixture_dir=tmp_path, + ) + assert not passed + assert not abstained + + def test_missing_fixture_dir_raises(self): + with pytest.raises(ValueError, match="fixture_dir is required"): + score_task( + expected_tools=(), forbidden_tools=(), run=self._ok_run(), + test_command="python ok.py", + ) + + def test_runner_error_still_abstains_with_test_command(self, tmp_path): + # Runner error takes precedence over test_command — same as it + # does over the tool-call rule. A subprocess crash that prevented + # the agent from running isn't evidence either way. + (tmp_path / "ok.py").write_text("") + run = AgentRunResult( + tool_calls_seq=[], final_text_tail="", duration_seconds=1.0, + error="hermes crashed", + ) + passed, abstained = score_task( + expected_tools=(), forbidden_tools=(), run=run, + test_command="python ok.py", + fixture_dir=tmp_path, + ) + assert not passed + assert abstained + + def test_tool_only_path_unchanged_when_test_command_absent(self): + # Regression guard for the existing patch/search_files/write_file suites. + run = AgentRunResult( + tool_calls_seq=["patch"], final_text_tail="", duration_seconds=1.0, + ) + passed, abstained = score_task( + expected_tools=("patch",), forbidden_tools=("write_file",), run=run, + ) + assert passed + assert not abstained + + class TestSummarizePhase: def test_counts_and_pass_rate(self): results = [ diff --git a/tests/validation/test_task.py b/tests/validation/test_task.py index b531605a..4c0b0908 100644 --- a/tests/validation/test_task.py +++ b/tests/validation/test_task.py @@ -130,3 +130,28 @@ def test_non_dict_fixture_setup_raises(self, tmp_path): }]) with pytest.raises(ValueError, match="fixture_setup must be a dict"): TaskSuite.from_jsonl(p) + + def test_test_command_parsed_when_present(self, tmp_path): + p = tmp_path / "suite.jsonl" + self._write_jsonl(p, [{ + "task_id": "t1", + "user_message": "debug it", + "test_command": "python test_solution.py", + }]) + suite = TaskSuite.from_jsonl(p) + assert suite.tasks[0].test_command == "python test_solution.py" + + def test_test_command_defaults_to_none(self, tmp_path): + p = tmp_path / "suite.jsonl" + self._write_jsonl(p, [{"task_id": "t1", "user_message": "do X"}]) + suite = TaskSuite.from_jsonl(p) + assert suite.tasks[0].test_command is None + + def test_test_command_non_string_raises(self, tmp_path): + p = tmp_path / "suite.jsonl" + self._write_jsonl(p, [{ + "task_id": "t", "user_message": "m", + "test_command": ["python", "x.py"], + }]) + with pytest.raises(ValueError, match="test_command must be a string"): + TaskSuite.from_jsonl(p) From 4b83e65d9756fed2181f6d61a8e52a3d2c7bf8ba Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 17 May 2026 15:06:32 -0600 Subject: [PATCH 2/5] feat(validation): SkillFileInstaller + cache artifact-writer injection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the plumbing needed to run closed-loop validation on a SKILL.md, not just a tool description. Three coordinated pieces: SkillFileInstaller copies the baseline skill directory (SKILL.md + any siblings) into a caller-owned writable workdir at construction. install() then atomically overwrites the target with candidate text. The original on-disk skill — possibly in a read-only plugin cache — is never touched. The installer exposes `skills_src`, the directory the runner needs to stage into its per-task sandbox so `hermes -z` discovers the candidate. ClosedLoopFeedbackCache grows an `artifact_writer` callable kwarg (default = the existing single-tool MCP manifest JSON, preserving the tool path bit-for-bit) and an `artifact_suffix` for the temp filenames. Skill-side callers inject `write_text_artifact` and `.md`. Constructor kwargs renamed `tool_name`→`artifact_name` and `baseline_description`→`baseline_artifact_text` for semantic neutrality; tool-side call site updated. HermesAgentRunner._prime_sandbox reads `TaskRunContext.skills_src` (new optional field) and copies its contents into `/skills/` per task — fresh copy each task so an in-task agent write can't corrupt the installer's persistent candidate state. Confirmed empirically that Hermes loads skills from `$HERMES_HOME/skills/`. ArtifactInstaller Protocol grows `verify_backup(backup_path)` so each installer validates its own backup format. Tool installer parses Python (as before); skill installer checks non-empty + UTF-8 decodable. --- evolution/core/closed_loop_feedback.py | 62 +++++++-- evolution/tools/evolve_tool.py | 4 +- evolution/validation/agent_runner.py | 7 + evolution/validation/artifact_installer.py | 96 ++++++++++++- evolution/validation/hermes_runner.py | 8 +- evolution/validation/validator.py | 16 ++- tests/core/test_closed_loop_feedback.py | 141 +++++++++++++++----- tests/validation/test_artifact_installer.py | 138 +++++++++++++++++++ tests/validation/test_hermes_runner.py | 61 +++++++++ tests/validation/test_safety.py | 5 + tests/validation/test_validator.py | 5 + 11 files changed, 480 insertions(+), 63 deletions(-) diff --git a/evolution/core/closed_loop_feedback.py b/evolution/core/closed_loop_feedback.py index c9702523..1cd40b44 100644 --- a/evolution/core/closed_loop_feedback.py +++ b/evolution/core/closed_loop_feedback.py @@ -27,7 +27,7 @@ import tempfile import threading from pathlib import Path -from typing import Literal, Optional +from typing import Callable, Literal, Optional from evolution.validation.report import TaskResult, ValidationReport from evolution.validation.task import TaskSuite @@ -42,6 +42,15 @@ GateMode = Literal["sampled", "always"] +ArtifactWriter = Callable[[str, Path], None] +"""Write candidate text to a path in the format the installer consumes. + +The cache calls this with ``(baseline_or_candidate_text, target_path)`` +before each validator run. The default writes a single-tool MCP manifest +JSON (the tool-side shape); skill-side passes a writer that drops raw +text directly into the path. +""" + logger = logging.getLogger(__name__) @@ -67,12 +76,14 @@ def __init__( *, validator: ClosedLoopValidator, suite: TaskSuite, - tool_name: str, - baseline_description: str, + artifact_name: str, + baseline_artifact_text: str, saturation_threshold: float = 0.95, min_iters: int = 3, window_size: int = 8, gate_mode: GateMode = "sampled", + artifact_writer: Optional[ArtifactWriter] = None, + artifact_suffix: str = ".json", ) -> None: if not (0.0 <= saturation_threshold <= 1.0): raise ValueError( @@ -88,19 +99,23 @@ def __init__( ) self._validator = validator self._suite = suite - self._tool_name = tool_name + self._artifact_name = artifact_name self.saturation_threshold = saturation_threshold self.min_iters = min_iters self.window_size = window_size self.gate_mode = gate_mode - self._tmp_dir = Path(tempfile.mkdtemp(prefix="cl_feedback_")) - self._baseline_path = self._tmp_dir / "baseline.json" - self._evolved_path = self._tmp_dir / "evolved.json" - self._baseline_path.write_text( - _manifest_json(tool_name, baseline_description) + self._artifact_writer: ArtifactWriter = ( + artifact_writer + if artifact_writer is not None + else _make_default_tool_writer(artifact_name) ) + self._tmp_dir = Path(tempfile.mkdtemp(prefix="cl_feedback_")) + self._baseline_path = self._tmp_dir / f"baseline{artifact_suffix}" + self._evolved_path = self._tmp_dir / f"evolved{artifact_suffix}" + self._artifact_writer(baseline_artifact_text, self._baseline_path) + self._cache: dict[str, ValidationReport] = {} self._judge_history: list[float] = [] self._iters_since_last_run = self.min_iters # allow first fire @@ -147,11 +162,9 @@ def get_or_run(self, candidate_text: str) -> Optional[ValidationReport]: if not self.should_run(): return None try: - self._evolved_path.write_text( - _manifest_json(self._tool_name, candidate_text) - ) + self._artifact_writer(candidate_text, self._evolved_path) inputs = ValidationInputs( - tool_name=self._tool_name, + tool_name=self._artifact_name, suite=self._suite, baseline_artifact=self._baseline_path, evolved_artifact=self._evolved_path, @@ -293,3 +306,26 @@ def _manifest_json(tool_name: str, description: str) -> str: }, indent=2, ) + + +def _make_default_tool_writer(tool_name: str) -> ArtifactWriter: + """Default ``artifact_writer`` for tool-side closed-loop. + + Writes a single-tool MCP manifest JSON — the shape + ``HermesToolDescriptionInstaller._extract_description`` consumes when + ``artifact_source.suffix == ".json"``. + """ + + def write(candidate_text: str, path: Path) -> None: + path.write_text(_manifest_json(tool_name, candidate_text)) + + return write + + +def write_text_artifact(candidate_text: str, path: Path) -> None: + """``artifact_writer`` for skill-side closed-loop: drop raw text into the path. + + The skill installer reads the whole file as the candidate SKILL.md + body, so no envelope is needed. + """ + path.write_text(candidate_text) diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py index 8e7e083a..31c39cc6 100644 --- a/evolution/tools/evolve_tool.py +++ b/evolution/tools/evolve_tool.py @@ -292,8 +292,8 @@ def _maybe_build_closed_loop_cache( return ClosedLoopFeedbackCache( validator=validator, suite=suite, - tool_name=tool_name, - baseline_description=baseline_description, + artifact_name=tool_name, + baseline_artifact_text=baseline_description, saturation_threshold=saturation_threshold, min_iters=min_iters, window_size=window_size, diff --git a/evolution/validation/agent_runner.py b/evolution/validation/agent_runner.py index 945c3053..dc4500d4 100644 --- a/evolution/validation/agent_runner.py +++ b/evolution/validation/agent_runner.py @@ -43,11 +43,18 @@ class TaskRunContext: ``fixture_dir`` is the per-task ``mkdtemp`` directory the validator populated from ``Task.fixture_setup`` before calling the runner. + + ``skills_src`` is the directory whose contents should be staged into + the runner's per-task sandbox under ``skills/`` — used for skill-side + closed-loop where the installer maintains a persistent writable + copy of the candidate skill that needs to be re-staged into each + task's ephemeral sandbox. """ user_message: str fixture_dir: Path extra_env: dict[str, str] = field(default_factory=dict) + skills_src: Optional[Path] = None class AgentRunner(Protocol): diff --git a/evolution/validation/artifact_installer.py b/evolution/validation/artifact_installer.py index d7cebfcf..44780ec2 100644 --- a/evolution/validation/artifact_installer.py +++ b/evolution/validation/artifact_installer.py @@ -1,9 +1,15 @@ """ArtifactInstaller Protocol — how the validator gets an artifact onto disk. -v1 only ships ``HermesToolDescriptionInstaller`` (splice into an -existing tool's description via ``HermesToolSource``). v2 will add -skill installers (drop a SKILL.md into the sandboxed HERMES_HOME's -skills dir) without changing ``ClosedLoopValidator``. +Two concrete installers ship: ``HermesToolDescriptionInstaller`` (splice +an evolved description into a Hermes tool-module ``*_SCHEMA`` file in +place) and ``SkillFileInstaller`` (write an evolved SKILL.md into a +caller-provided writable workdir, decoupled from the user's actual +HERMES_HOME / read-only plugin cache). + +Installers that need the runner to stage extra state into its per-task +sandbox (only ``SkillFileInstaller`` today, which needs the candidate +SKILL.md visible to ``hermes -z``) expose an optional ``skills_src`` +attribute the validator threads through ``TaskRunContext``. """ from __future__ import annotations @@ -11,9 +17,10 @@ import ast import hashlib import os +import shutil import tempfile from pathlib import Path -from typing import Protocol +from typing import Optional, Protocol from evolution.tools.hermes_source import HermesToolSource from evolution.tools.tool_source import ToolManifest @@ -32,6 +39,15 @@ def install(self, artifact_source: Path) -> str: validator can verify the file wasn't mutated between tasks.""" ... + def verify_backup(self, backup_path: Path) -> None: + """Validate the backup before trusting it for restore. + + Default behavior for Python-source artifacts: raise SyntaxError if + the backup doesn't parse. For non-Python artifacts (skills), the + installer overrides with a format-appropriate check (e.g., UTF-8 + decodability + non-empty).""" + ... + class HermesToolDescriptionInstaller: """Splice an evolved tool description into a Hermes ``*_SCHEMA`` file. @@ -89,6 +105,9 @@ def install(self, artifact_source: Path) -> str: ) return sha256_of(self.target_path) + def verify_backup(self, backup_path: Path) -> None: + verify_python_parses(backup_path) + def _extract_description(self, artifact_source: Path) -> str: """Return the description string for ``self.tool_name`` from ``artifact_source``. Dispatches on suffix so the installer can @@ -120,6 +139,73 @@ def _extract_description(self, artifact_source: Path) -> str: return manifest.find_tool(self.tool_name).description +class SkillFileInstaller: + """Write an evolved SKILL.md into a writable workdir for closed-loop validation. + + The user's actual skill may live in a read-only location (the Claude + Code plugin cache, a system-installed skill bundle, or a user + HERMES_HOME we don't want to mutate). The installer copies the + entire baseline skill directory once at construction into a + caller-owned ``workdir``, then ``install()`` writes candidate text + over the resulting target SKILL.md. The original location is never + touched. + + ``skills_src`` is the directory the runner copies into its per-task + sandbox so ``hermes -z`` discovers the candidate skill. Exposed + here so the validator can thread it through ``TaskRunContext`` + without a Hermes-specific code path. + """ + + def __init__( + self, + *, + skill_source_path: Path, + skill_name: str, + workdir: Path, + ) -> None: + if not skill_source_path.is_file(): + raise FileNotFoundError( + f"skill_source_path not found: {skill_source_path}" + ) + if not workdir.is_dir(): + raise NotADirectoryError( + f"workdir not found or not a directory: {workdir}" + ) + self.skill_name = skill_name + self.workdir = workdir + self.skills_src: Path = workdir / "skills" + skill_dest_dir = self.skills_src / skill_name + skill_dest_dir.parent.mkdir(parents=True, exist_ok=True) + source_dir = skill_source_path.parent + # Copy the entire skill directory (SKILL.md + any sibling files + # the skill references) so the candidate has the same surrounding + # context as the baseline. + shutil.copytree(source_dir, skill_dest_dir, dirs_exist_ok=False) + self.target_path: Path = skill_dest_dir / skill_source_path.name + + def install(self, artifact_source: Path) -> str: + """Atomically overwrite ``target_path`` with ``artifact_source`` contents. + + The artifact source is the candidate SKILL.md as written by the + cache's ``artifact_writer``. We just copy bytes — no parse, no + splice — since for skills the whole file is the artifact. + """ + atomic_write_bytes(self.target_path, artifact_source.read_bytes()) + return sha256_of(self.target_path) + + def verify_backup(self, backup_path: Path) -> None: + """Skills are UTF-8 text; reject empty or non-UTF-8 backups.""" + data = backup_path.read_bytes() + if not data: + raise ValueError(f"skill backup at {backup_path} is empty") + try: + data.decode("utf-8") + except UnicodeDecodeError as exc: + raise ValueError( + f"skill backup at {backup_path} is not valid UTF-8: {exc}" + ) from exc + + def sha256_of(path: Path) -> str: return hashlib.sha256(path.read_bytes()).hexdigest() diff --git a/evolution/validation/hermes_runner.py b/evolution/validation/hermes_runner.py index 6ea75e79..9c8940b5 100644 --- a/evolution/validation/hermes_runner.py +++ b/evolution/validation/hermes_runner.py @@ -61,7 +61,7 @@ def run(self, ctx: TaskRunContext) -> AgentRunResult: message = ctx.user_message sandbox = Path(tempfile.mkdtemp(prefix="cl_hermes_home_")) try: - self._prime_sandbox(sandbox) + self._prime_sandbox(sandbox, ctx) env = { **os.environ, "HERMES_HOME": str(sandbox), @@ -107,10 +107,14 @@ def run(self, ctx: TaskRunContext) -> AgentRunResult: finally: shutil.rmtree(sandbox, ignore_errors=True) - def _prime_sandbox(self, sandbox: Path) -> None: + def _prime_sandbox(self, sandbox: Path, ctx: TaskRunContext) -> None: (sandbox / "sessions").mkdir(parents=True, exist_ok=True) if self.user_config_path.exists(): shutil.copy2(self.user_config_path, sandbox / "config.yaml") + if ctx.skills_src is not None and ctx.skills_src.is_dir(): + # Copy (not symlink) so an in-task write by the agent corrupts + # only this sandbox, not the installer's persistent candidate. + shutil.copytree(ctx.skills_src, sandbox / "skills") def _find_latest_session(sessions_dir: Path) -> Optional[Path]: diff --git a/evolution/validation/validator.py b/evolution/validation/validator.py index da24959c..936692b5 100644 --- a/evolution/validation/validator.py +++ b/evolution/validation/validator.py @@ -29,7 +29,6 @@ ArtifactInstaller, atomic_write_bytes, sha256_of, - verify_python_parses, ) from evolution.validation.report import ( PhaseResult, @@ -88,11 +87,11 @@ def __init__(self, installer: ArtifactInstaller, runner: AgentRunner) -> None: def validate(self, inputs: ValidationInputs) -> ValidationReport: target = self.installer.target_path backup_path = target.with_suffix(target.suffix + _BACKUP_SUFFIX) - _refuse_if_stale_backup_exists(backup_path) + _refuse_if_stale_backup_exists(backup_path, self.installer) with _exclusive_lock(target.parent): atomic_write_bytes(backup_path, target.read_bytes()) - verify_python_parses(backup_path) + self.installer.verify_backup(backup_path) try: baseline_results = self._run_phase( inputs.suite, @@ -141,6 +140,7 @@ def _run_one_task(self, task: Task) -> TaskResult: ctx = TaskRunContext( user_message=task.render_message(fixture_dir), fixture_dir=fixture_dir, + skills_src=getattr(self.installer, "skills_src", None), ) run = self.runner.run(ctx) passed, abstained = score_task( @@ -161,14 +161,16 @@ def _run_one_task(self, task: Task) -> TaskResult: ) -def _refuse_if_stale_backup_exists(backup_path: Path) -> None: +def _refuse_if_stale_backup_exists( + backup_path: Path, installer: ArtifactInstaller +) -> None: if not backup_path.exists(): return try: - verify_python_parses(backup_path) - except SyntaxError as exc: + installer.verify_backup(backup_path) + except Exception as exc: raise StaleBackupError( - f"Stale backup at {backup_path} is corrupt (cannot parse as Python: " + f"Stale backup at {backup_path} is corrupt ({type(exc).__name__}: " f"{exc}). Inspect it manually; do not blindly restore." ) from exc raise StaleBackupError( diff --git a/tests/core/test_closed_loop_feedback.py b/tests/core/test_closed_loop_feedback.py index 0de6631c..291a0e14 100644 --- a/tests/core/test_closed_loop_feedback.py +++ b/tests/core/test_closed_loop_feedback.py @@ -90,8 +90,8 @@ def test_gate_closed_when_no_history(self, tmp_path): cache = ClosedLoopFeedbackCache( validator=MagicMock(), suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline desc", + artifact_name="write_file", + baseline_artifact_text="baseline desc", ) cache._iters_since_last_run = 0 # reset to test purely-history-based gate assert cache.should_run() is False @@ -100,8 +100,8 @@ def test_gate_open_on_saturated_window(self, tmp_path): cache = ClosedLoopFeedbackCache( validator=MagicMock(), suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", saturation_threshold=0.95, window_size=4, min_iters=999, # disable periodic fallback @@ -115,8 +115,8 @@ def test_gate_closed_on_unsaturated_window(self, tmp_path): cache = ClosedLoopFeedbackCache( validator=MagicMock(), suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", saturation_threshold=0.95, window_size=4, min_iters=999, @@ -130,8 +130,8 @@ def test_periodic_floor_triggers_even_when_unsaturated(self, tmp_path): cache = ClosedLoopFeedbackCache( validator=MagicMock(), suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", saturation_threshold=0.95, min_iters=3, window_size=4, @@ -150,8 +150,8 @@ def test_cache_hit_short_circuits_validator(self, tmp_path): cache = ClosedLoopFeedbackCache( validator=validator, suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", min_iters=1, ) cache.record_judge_score(0.99) # open the gate @@ -167,8 +167,8 @@ def test_gate_closed_returns_none(self, tmp_path): cache = ClosedLoopFeedbackCache( validator=validator, suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", saturation_threshold=0.95, min_iters=999, ) @@ -183,8 +183,8 @@ def test_concurrent_run_error_does_not_propagate(self, tmp_path, caplog): cache = ClosedLoopFeedbackCache( validator=validator, suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", min_iters=1, ) cache.record_judge_score(0.99) @@ -197,8 +197,8 @@ def test_cache_key_depends_on_candidate_and_suite_sha(self, tmp_path): cache = ClosedLoopFeedbackCache( validator=MagicMock(), suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", ) k1 = cache._key("desc A") k2 = cache._key("desc B") @@ -319,8 +319,8 @@ def test_two_threads_invoke_validator_exactly_once(self, tmp_path): cache = ClosedLoopFeedbackCache( validator=validator, suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", min_iters=1, ) cache.record_judge_score(0.99) @@ -349,8 +349,8 @@ def test_threshold_out_of_range_raises(self, tmp_path): ClosedLoopFeedbackCache( validator=MagicMock(), suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", saturation_threshold=1.5, ) @@ -359,8 +359,8 @@ def test_min_iters_zero_raises(self, tmp_path): ClosedLoopFeedbackCache( validator=MagicMock(), suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", min_iters=0, ) @@ -369,8 +369,8 @@ def test_invalid_gate_mode_raises(self, tmp_path): ClosedLoopFeedbackCache( validator=MagicMock(), suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", gate_mode="bogus", ) @@ -380,8 +380,8 @@ def test_always_mode_opens_gate_without_history(self, tmp_path): cache = ClosedLoopFeedbackCache( validator=MagicMock(), suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", gate_mode="always", ) # No record_judge_score calls — sampled mode would return False here. @@ -393,8 +393,8 @@ def test_always_mode_fires_validator_on_cache_miss(self, tmp_path): cache = ClosedLoopFeedbackCache( validator=validator, suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", gate_mode="always", ) # No judge scores recorded — sampled mode would return None. @@ -418,8 +418,8 @@ def test_returns_per_task_result_on_cache_hit(self, tmp_path): cache = ClosedLoopFeedbackCache( validator=validator, suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", gate_mode="always", ) t1 = cache.get_task_verdict("cand", "t1") @@ -433,8 +433,8 @@ def test_returns_none_on_cache_miss_with_closed_gate(self, tmp_path): cache = ClosedLoopFeedbackCache( validator=MagicMock(), suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", saturation_threshold=0.95, min_iters=999, gate_mode="sampled", # default; explicit for clarity @@ -455,8 +455,81 @@ def test_returns_none_when_task_id_not_in_report(self, tmp_path): cache = ClosedLoopFeedbackCache( validator=validator, suite=_build_suite(tmp_path), - tool_name="write_file", - baseline_description="baseline", + artifact_name="write_file", + baseline_artifact_text="baseline", gate_mode="always", ) assert cache.get_task_verdict("cand", "nonexistent_task") is None + + +class TestArtifactWriterInjection: + """The cache's default ``artifact_writer`` produces the single-tool MCP + manifest JSON the tool installer consumes. Skill-side callers inject + a writer that drops raw text directly. Tool-path behavior must be + bit-for-bit preserved when no writer is passed. + """ + + def test_default_writer_produces_mcp_manifest_json(self, tmp_path): + cache = ClosedLoopFeedbackCache( + validator=MagicMock(), + suite=_build_suite(tmp_path), + artifact_name="write_file", + baseline_artifact_text="baseline desc", + ) + # Default suffix is .json; default writer is the MCP manifest. + baseline_text = cache._baseline_path.read_text() + assert cache._baseline_path.suffix == ".json" + parsed = json.loads(baseline_text) + assert parsed["tools"][0]["name"] == "write_file" + assert parsed["tools"][0]["description"] == "baseline desc" + + def test_custom_writer_called_with_candidate_and_path(self, tmp_path): + calls: list[tuple[str, Path]] = [] + + def fake_writer(text: str, path: Path) -> None: + calls.append((text, path)) + path.write_text(f"WRAPPED::{text}") + + cache = ClosedLoopFeedbackCache( + validator=MagicMock(), + suite=_build_suite(tmp_path), + artifact_name="systematic_debugging", + baseline_artifact_text="baseline body", + artifact_writer=fake_writer, + artifact_suffix=".md", + ) + # Baseline was written once during construction. + assert len(calls) == 1 + assert calls[0][0] == "baseline body" + assert calls[0][1].suffix == ".md" + assert cache._baseline_path.read_text() == "WRAPPED::baseline body" + + def test_custom_writer_invoked_on_each_validate(self, tmp_path): + calls: list[str] = [] + + def fake_writer(text: str, path: Path) -> None: + calls.append(text) + path.write_text(text) + + validator = MagicMock() + validator.validate.return_value = _build_report() + cache = ClosedLoopFeedbackCache( + validator=validator, + suite=_build_suite(tmp_path), + artifact_name="systematic_debugging", + baseline_artifact_text="baseline", + artifact_writer=fake_writer, + artifact_suffix=".md", + gate_mode="always", + ) + cache.get_or_run("candidate-1") + cache.get_or_run("candidate-2") + # 1 baseline write + 2 evolved writes. + assert calls == ["baseline", "candidate-1", "candidate-2"] + + def test_write_text_artifact_helper_writes_plain_text(self, tmp_path): + from evolution.core.closed_loop_feedback import write_text_artifact + + path = tmp_path / "out.md" + write_text_artifact("hello world\n", path) + assert path.read_text() == "hello world\n" diff --git a/tests/validation/test_artifact_installer.py b/tests/validation/test_artifact_installer.py index 7309360a..95b1cfbd 100644 --- a/tests/validation/test_artifact_installer.py +++ b/tests/validation/test_artifact_installer.py @@ -123,3 +123,141 @@ def test_json_target_tool_missing_raises(self, fake_hermes_repo, tmp_path): ) with pytest.raises(KeyError): installer._extract_description(evolved_json) + + +# --------------------------------------------------------------------------- +# SkillFileInstaller +# --------------------------------------------------------------------------- + + +from evolution.validation.artifact_installer import SkillFileInstaller + + +@pytest.fixture +def baseline_skill(tmp_path): + """A read-only source skill directory the installer must not mutate.""" + src_root = tmp_path / "source_skills" / "systematic_debugging" + src_root.mkdir(parents=True) + skill_md = src_root / "SKILL.md" + skill_md.write_text( + "---\nname: systematic_debugging\n---\n\n# Baseline body\n" + ) + # Sibling file the skill might reference — must also be copied. + (src_root / "examples.md").write_text("baseline examples") + return skill_md + + +class TestSkillFileInstallerConstruction: + def test_copies_skill_directory_into_workdir(self, baseline_skill, tmp_path): + workdir = tmp_path / "workdir" + workdir.mkdir() + installer = SkillFileInstaller( + skill_source_path=baseline_skill, + skill_name="systematic_debugging", + workdir=workdir, + ) + # The target_path lives under the workdir. + assert installer.target_path == ( + workdir / "skills" / "systematic_debugging" / "SKILL.md" + ) + assert installer.target_path.is_file() + # Sibling files are copied too — agents may reference them. + sibling = workdir / "skills" / "systematic_debugging" / "examples.md" + assert sibling.is_file() + assert sibling.read_text() == "baseline examples" + + def test_skills_src_attribute_points_at_skills_root(self, baseline_skill, tmp_path): + # The validator threads this into TaskRunContext so the runner + # can stage it into its per-task sandbox. + workdir = tmp_path / "workdir" + workdir.mkdir() + installer = SkillFileInstaller( + skill_source_path=baseline_skill, + skill_name="systematic_debugging", + workdir=workdir, + ) + assert installer.skills_src == workdir / "skills" + + def test_missing_source_raises(self, tmp_path): + workdir = tmp_path / "workdir" + workdir.mkdir() + with pytest.raises(FileNotFoundError): + SkillFileInstaller( + skill_source_path=tmp_path / "does_not_exist.md", + skill_name="x", + workdir=workdir, + ) + + def test_missing_workdir_raises(self, baseline_skill, tmp_path): + with pytest.raises(NotADirectoryError): + SkillFileInstaller( + skill_source_path=baseline_skill, + skill_name="x", + workdir=tmp_path / "does_not_exist", + ) + + +class TestSkillFileInstallerInstall: + def test_install_overwrites_target_with_candidate_text(self, baseline_skill, tmp_path): + workdir = tmp_path / "workdir" + workdir.mkdir() + installer = SkillFileInstaller( + skill_source_path=baseline_skill, + skill_name="systematic_debugging", + workdir=workdir, + ) + candidate = tmp_path / "candidate.md" + candidate.write_text("# Evolved body\n") + sha = installer.install(candidate) + assert installer.target_path.read_text() == "# Evolved body\n" + # Returned sha matches the post-install bytes. + import hashlib + assert sha == hashlib.sha256(b"# Evolved body\n").hexdigest() + + def test_install_does_not_mutate_source(self, baseline_skill, tmp_path): + """The whole point of the workdir copy: the user's actual skill + on disk is never touched, even when we install a candidate.""" + workdir = tmp_path / "workdir" + workdir.mkdir() + original = baseline_skill.read_text() + installer = SkillFileInstaller( + skill_source_path=baseline_skill, + skill_name="systematic_debugging", + workdir=workdir, + ) + candidate = tmp_path / "candidate.md" + candidate.write_text("# Different body\n") + installer.install(candidate) + assert baseline_skill.read_text() == original + + +class TestSkillFileInstallerVerifyBackup: + def _installer(self, baseline_skill, tmp_path): + workdir = tmp_path / "workdir" + workdir.mkdir() + return SkillFileInstaller( + skill_source_path=baseline_skill, + skill_name="systematic_debugging", + workdir=workdir, + ) + + def test_accepts_valid_utf8_backup(self, baseline_skill, tmp_path): + installer = self._installer(baseline_skill, tmp_path) + backup = tmp_path / "backup.md" + backup.write_text("# Some skill text\n") + # Does not raise. + installer.verify_backup(backup) + + def test_rejects_empty_backup(self, baseline_skill, tmp_path): + installer = self._installer(baseline_skill, tmp_path) + backup = tmp_path / "backup.md" + backup.write_bytes(b"") + with pytest.raises(ValueError, match="empty"): + installer.verify_backup(backup) + + def test_rejects_invalid_utf8(self, baseline_skill, tmp_path): + installer = self._installer(baseline_skill, tmp_path) + backup = tmp_path / "backup.md" + backup.write_bytes(b"\xff\xfe\x00\x00invalid") + with pytest.raises(ValueError, match="not valid UTF-8"): + installer.verify_backup(backup) diff --git a/tests/validation/test_hermes_runner.py b/tests/validation/test_hermes_runner.py index 327c5b11..8bc066f5 100644 --- a/tests/validation/test_hermes_runner.py +++ b/tests/validation/test_hermes_runner.py @@ -254,3 +254,64 @@ def _fake_run(*args, **kwargs): assert sandbox_seen["config_present"] is True assert "hunter2" in sandbox_seen["config_text"] + + def test_skills_src_copied_into_sandbox_when_present(self, fixture_dir, tmp_path): + """When TaskRunContext.skills_src points at a directory, the runner + copies its contents into /skills/ so ``hermes -z`` + discovers any candidate skills installed there. + """ + skills_src = tmp_path / "candidate_skills" + (skills_src / "systematic_debugging").mkdir(parents=True) + (skills_src / "systematic_debugging" / "SKILL.md").write_text( + "---\nname: systematic_debugging\n---\n\nevolved body\n" + ) + runner = HermesAgentRunner(user_config_path=tmp_path / "nonexistent") + + sandbox_skills_state: dict = {} + + def _fake_run(*args, **kwargs): + sandbox = Path(kwargs["env"]["HERMES_HOME"]) + skill_path = sandbox / "skills" / "systematic_debugging" / "SKILL.md" + sandbox_skills_state["present"] = skill_path.is_file() + if sandbox_skills_state["present"]: + sandbox_skills_state["text"] = skill_path.read_text() + (sandbox / "sessions").mkdir(exist_ok=True) + _write_session( + sandbox / "sessions" / "session_test.json", + [{"role": "assistant", "content": "ok"}], + ) + return type("CP", (), {"returncode": 0, "stdout": "", "stderr": ""})() + + with patch("evolution.validation.hermes_runner.subprocess.run", side_effect=_fake_run): + runner.run(TaskRunContext( + user_message="debug it", + fixture_dir=fixture_dir, + skills_src=skills_src, + )) + + assert sandbox_skills_state["present"] is True + assert "evolved body" in sandbox_skills_state["text"] + + def test_skills_src_none_means_no_skills_dir_created(self, fixture_dir, tmp_path): + """Tool-side runs (no skills_src) must not create an empty skills/ + directory in the sandbox — keeps the legacy code path bit-for-bit.""" + runner = HermesAgentRunner(user_config_path=tmp_path / "nonexistent") + sandbox_seen: dict = {} + + def _fake_run(*args, **kwargs): + sandbox = Path(kwargs["env"]["HERMES_HOME"]) + sandbox_seen["has_skills_dir"] = (sandbox / "skills").exists() + (sandbox / "sessions").mkdir(exist_ok=True) + _write_session( + sandbox / "sessions" / "session_test.json", + [{"role": "assistant", "content": "ok"}], + ) + return type("CP", (), {"returncode": 0, "stdout": "", "stderr": ""})() + + with patch("evolution.validation.hermes_runner.subprocess.run", side_effect=_fake_run): + runner.run(TaskRunContext( + user_message="run", + fixture_dir=fixture_dir, + )) + + assert sandbox_seen["has_skills_dir"] is False diff --git a/tests/validation/test_safety.py b/tests/validation/test_safety.py index a74aab8b..721e9443 100644 --- a/tests/validation/test_safety.py +++ b/tests/validation/test_safety.py @@ -46,6 +46,11 @@ def install(self, artifact_source: Path) -> str: import hashlib return hashlib.sha256(self.target_path.read_bytes()).hexdigest() + def verify_backup(self, backup_path: Path) -> None: + # Stub: any non-empty backup is fine for tests. + if not backup_path.read_bytes(): + raise ValueError(f"empty backup at {backup_path}") + class _StubRunner: def __init__(self, results: list[AgentRunResult]): diff --git a/tests/validation/test_validator.py b/tests/validation/test_validator.py index 1de996d3..1e8417eb 100644 --- a/tests/validation/test_validator.py +++ b/tests/validation/test_validator.py @@ -21,6 +21,11 @@ def install(self, artifact_source: Path) -> str: import hashlib return hashlib.sha256(self.target_path.read_bytes()).hexdigest() + def verify_backup(self, backup_path: Path) -> None: + # Stub: any non-empty backup is fine for tests. + if not backup_path.read_bytes(): + raise ValueError(f"empty backup at {backup_path}") + class _ScriptedRunner: """Runner that returns different tool_calls_seq depending on which From c9dd6f91f5ec2cfa37d7eb711b9858646039bd7d Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 17 May 2026 15:08:38 -0600 Subject: [PATCH 3/5] feat(validation): systematic-debugging planted-bug suite + fake target skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five planted-bug tasks at evolution/validation/suites/systematic_debugging.jsonl covering bug shapes a debugging methodology should be able to handle: off-by-one in range, inverted condition, missing recursive base case, wrong arithmetic operator, mutable-default-argument aliasing. Each task ships: - solution.py with one planted bug and a docstring stating the spec - test_solution.py asserting the spec across the bug case plus enough edge cases that a too-narrow "fix" still fails - test_command="python test_solution.py" — the verdict from test_command-mode scoring Sanity tests in tests/validation/test_systematic_debugging_suite.py: - Suite shape: 5 tasks, every task has test_command, solution.py, and test_solution.py - Planted bugs are real: materializing the buggy fixture and running its test exits non-zero - Known fixes pass: applying the documented fix and re-running the test exits zero The last two are critical — they would catch a typo'd assertion or a pathologically-broken test that no candidate could satisfy. Fake target skill at tests/fixtures/skills/systematic_debugging/SKILL.md gives the closed-loop machinery a stable substrate for unit tests + manual smoke. Deliberately weak baseline ("try random changes") so GEPA has measurable headroom to evolve toward a real methodology. --- .../suites/systematic_debugging.jsonl | 20 +++ .../skills/systematic_debugging/SKILL.md | 14 ++ .../test_systematic_debugging_suite.py | 164 ++++++++++++++++++ 3 files changed, 198 insertions(+) create mode 100644 evolution/validation/suites/systematic_debugging.jsonl create mode 100644 tests/fixtures/skills/systematic_debugging/SKILL.md create mode 100644 tests/validation/test_systematic_debugging_suite.py diff --git a/evolution/validation/suites/systematic_debugging.jsonl b/evolution/validation/suites/systematic_debugging.jsonl new file mode 100644 index 00000000..3e1c390c --- /dev/null +++ b/evolution/validation/suites/systematic_debugging.jsonl @@ -0,0 +1,20 @@ +# Planted-bug suite for the `systematic-debugging` skill. +# +# Each task drops a small Python program with one planted bug + a test +# script that exercises the bug. The agent (driven by the skill under +# evaluation) is asked to debug and fix; the verdict is whether +# `python test_solution.py` exits zero after the agent's edits. +# +# Conventions for adding tasks: +# - solution.py contains exactly ONE bug, with a docstring stating +# the intended behavior so the spec is unambiguous. +# - test_solution.py asserts the spec across the bug case + at least +# one edge case (empty input, zero, negative, etc.) so a candidate +# "fix" that only handles the obvious case fails. +# - test_solution.py exits non-zero on baseline and exits zero only +# when the intended behavior holds. +{"task_id": "debug_off_by_one_range", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def get_range_inclusive(n):\n \"\"\"Return the integers from 1 to n inclusive (empty list when n <= 0).\"\"\"\n return list(range(n))\n", "test_solution.py": "from solution import get_range_inclusive\n\nassert get_range_inclusive(5) == [1, 2, 3, 4, 5], get_range_inclusive(5)\nassert get_range_inclusive(1) == [1], get_range_inclusive(1)\nassert get_range_inclusive(0) == [], get_range_inclusive(0)\nprint('PASS')\n"}, "test_command": "python test_solution.py"} +{"task_id": "debug_inverted_condition", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def first_positive(values):\n \"\"\"Return the first strictly-positive number in values, or None if none.\"\"\"\n for x in values:\n if x < 0:\n return x\n return None\n", "test_solution.py": "from solution import first_positive\n\nassert first_positive([-1, -2, 3, 4]) == 3, first_positive([-1, -2, 3, 4])\nassert first_positive([1, 2, 3]) == 1, first_positive([1, 2, 3])\nassert first_positive([-1, -2, -3]) is None, first_positive([-1, -2, -3])\nassert first_positive([]) is None, first_positive([])\nassert first_positive([0, -1, 2]) == 2, first_positive([0, -1, 2])\nprint('PASS')\n"}, "test_command": "python test_solution.py"} +{"task_id": "debug_missing_base_case", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def factorial(n):\n \"\"\"Return n! for non-negative integers (factorial(0) == 1).\"\"\"\n return n * factorial(n - 1)\n", "test_solution.py": "from solution import factorial\n\nassert factorial(5) == 120, factorial(5)\nassert factorial(1) == 1, factorial(1)\nassert factorial(0) == 1, factorial(0)\nprint('PASS')\n"}, "test_command": "python test_solution.py"} +{"task_id": "debug_wrong_operator", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def square(x):\n \"\"\"Return x squared (x * x).\"\"\"\n return x * 2\n", "test_solution.py": "from solution import square\n\nassert square(3) == 9, square(3)\nassert square(0) == 0, square(0)\nassert square(-4) == 16, square(-4)\nassert square(2) == 4, square(2)\nprint('PASS')\n"}, "test_command": "python test_solution.py"} +{"task_id": "debug_mutable_default_arg", "user_message": "There is a bug in solution.py. Read it, identify the bug, and edit the file to fix it. Then run `python test_solution.py` to confirm — the test exits zero when the bug is fixed.", "fixture_setup": {"solution.py": "def append_to_new(item, items=[]):\n \"\"\"Append item to a NEW empty list and return it. Each call must be independent.\"\"\"\n items.append(item)\n return items\n", "test_solution.py": "from solution import append_to_new\n\nassert append_to_new(1) == [1], append_to_new(1)\nassert append_to_new(2) == [2], append_to_new(2)\nassert append_to_new(3) == [3], append_to_new(3)\nprint('PASS')\n"}, "test_command": "python test_solution.py"} diff --git a/tests/fixtures/skills/systematic_debugging/SKILL.md b/tests/fixtures/skills/systematic_debugging/SKILL.md new file mode 100644 index 00000000..c0fc6567 --- /dev/null +++ b/tests/fixtures/skills/systematic_debugging/SKILL.md @@ -0,0 +1,14 @@ +--- +name: systematic-debugging +description: Approach for debugging Python bugs in small programs. +--- + +# Systematic debugging + +When you encounter a bug in Python code, try to fix it. + +Common approaches: add print statements, comment things out, try +random changes until the test passes. If one approach doesn't work, +try another. Keep going until the test exits with code zero. + +Don't spend too much time reading the code — just try edits. diff --git a/tests/validation/test_systematic_debugging_suite.py b/tests/validation/test_systematic_debugging_suite.py new file mode 100644 index 00000000..9b8f2be9 --- /dev/null +++ b/tests/validation/test_systematic_debugging_suite.py @@ -0,0 +1,164 @@ +"""Sanity tests for the systematic-debugging planted-bug suite. + +The suite at ``evolution/validation/suites/systematic_debugging.jsonl`` +is the substrate for skill-side closed-loop validation. These tests +prove the suite itself is well-formed and that every task's planted +bug + test pair behaves as designed (test fails on baseline, passes +when the documented fix is applied) — without these guards a typo in +the test file or a too-permissive assertion would silently invalidate +every closed-loop run. +""" + +from __future__ import annotations + +import shlex +import subprocess +from pathlib import Path + +import pytest + +from evolution.validation.task import TaskSuite + + +SUITE_PATH = ( + Path(__file__).resolve().parents[2] + / "evolution" + / "validation" + / "suites" + / "systematic_debugging.jsonl" +) + + +# Known-good fix per task_id. Each value replaces solution.py wholesale +# after the buggy fixture is materialized. The test then re-runs +# test_solution.py to confirm the fix actually passes the asserts. +_KNOWN_FIXES: dict[str, str] = { + "debug_off_by_one_range": ( + "def get_range_inclusive(n):\n" + " return list(range(1, n + 1))\n" + ), + "debug_inverted_condition": ( + "def first_positive(values):\n" + " for x in values:\n" + " if x > 0:\n" + " return x\n" + " return None\n" + ), + "debug_missing_base_case": ( + "def factorial(n):\n" + " if n <= 1:\n" + " return 1\n" + " return n * factorial(n - 1)\n" + ), + "debug_wrong_operator": ( + "def square(x):\n" + " return x * x\n" + ), + "debug_mutable_default_arg": ( + "def append_to_new(item, items=None):\n" + " if items is None:\n" + " items = []\n" + " items.append(item)\n" + " return items\n" + ), +} + + +@pytest.fixture(scope="module") +def suite() -> TaskSuite: + return TaskSuite.from_jsonl(SUITE_PATH) + + +class TestSuiteShape: + def test_loads_five_tasks(self, suite): + assert len(suite.tasks) == 5 + + def test_every_task_has_test_command(self, suite): + for task in suite.tasks: + assert task.test_command, f"{task.task_id}: missing test_command" + + def test_every_task_has_solution_and_test_files(self, suite): + for task in suite.tasks: + files = set(task.fixture_setup.keys()) + assert "solution.py" in files, f"{task.task_id}: no solution.py" + assert "test_solution.py" in files, f"{task.task_id}: no test_solution.py" + + def test_every_task_id_has_a_known_fix(self, suite): + # Future-proofing: when adding tasks, the contributor must also + # add an entry to _KNOWN_FIXES so the round-trip tests below cover it. + suite_ids = {t.task_id for t in suite.tasks} + fix_ids = set(_KNOWN_FIXES.keys()) + assert suite_ids == fix_ids, ( + f"task ids and known-fix ids out of sync: " + f"missing fixes={suite_ids - fix_ids}, " + f"extra fixes={fix_ids - suite_ids}" + ) + + def test_test_command_is_runnable_form(self, suite): + # We use shlex.split (no shell) on the test_command. Each command + # should split cleanly without quoting tricks. + for task in suite.tasks: + parts = shlex.split(task.test_command) + assert len(parts) >= 2, f"{task.task_id}: test_command too short" + + +class TestPlantedBugsAreReal: + """For every task: materializing the baseline buggy fixture and running + its test must exit non-zero. If this fails, the planted bug is no bug + at all — the suite isn't measuring what it claims to measure.""" + + @pytest.mark.parametrize( + "task_id", + list(_KNOWN_FIXES.keys()), + ids=list(_KNOWN_FIXES.keys()), + ) + def test_baseline_buggy_code_fails(self, task_id, tmp_path, suite): + task = next(t for t in suite.tasks if t.task_id == task_id) + _materialize(task.fixture_setup, tmp_path) + result = subprocess.run( + shlex.split(task.test_command), + cwd=tmp_path, + capture_output=True, + text=True, + timeout=15, + ) + assert result.returncode != 0, ( + f"{task_id}: baseline buggy code unexpectedly passed the test.\n" + f"stdout: {result.stdout}\nstderr: {result.stderr}" + ) + + +class TestKnownFixesPass: + """For every task: applying the known fix and re-running the test must + exit zero. If this fails, the test is pathologically broken (asserts + the wrong thing, has a syntax error, etc.) and would never accept + any candidate fix — including the right one.""" + + @pytest.mark.parametrize( + "task_id,fixed_solution", + list(_KNOWN_FIXES.items()), + ids=list(_KNOWN_FIXES.keys()), + ) + def test_known_fix_passes(self, task_id, fixed_solution, tmp_path, suite): + task = next(t for t in suite.tasks if t.task_id == task_id) + _materialize(task.fixture_setup, tmp_path) + # Apply the known fix. + (tmp_path / "solution.py").write_text(fixed_solution) + result = subprocess.run( + shlex.split(task.test_command), + cwd=tmp_path, + capture_output=True, + text=True, + timeout=15, + ) + assert result.returncode == 0, ( + f"{task_id}: known fix did not make the test pass.\n" + f"stdout: {result.stdout}\nstderr: {result.stderr}" + ) + + +def _materialize(setup: dict[str, str], target: Path) -> None: + for rel, content in setup.items(): + dest = target / rel + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_text(content) From b3e8289f87c0ce03a07e054c23bce07c96d15a0d Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 17 May 2026 15:18:05 -0600 Subject: [PATCH 4/5] feat(evolve_skill): wire closed-loop infrastructure end-to-end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the placeholder UsageError on --closed-loop-during-evolution. The skill-evolution path now matches the tool-evolution path: closed-loop validator feedback flows into the reflection LM, and opt-in trainset mode lets behavioral verdicts break judge ties in GEPA's sum(scores) acceptance rule. Five coordinated changes: - CLI grows --closed-loop-saturation-threshold, --closed-loop-min-iters, --closed-loop-window-size, --closed-loop-mode, --closed-loop-in-valset, mirroring evolve_tool. No --closed-loop-hermes-repo: SkillSource discovery + the installer's writable workdir replace that. - New _maybe_build_closed_loop_cache_skill helper constructs the SkillFileInstaller, ClosedLoopValidator, and ClosedLoopFeedbackCache wired with write_text_artifact and .md suffix. - SkillModule.forward grows an optional closed_loop_task_id parameter. When present, skips the predictor LM call and returns a Prediction carrying _closed_loop_task_id + _candidate_text markers the metric routes on. - Trainset/valset get behavioral examples appended when mode is trainset or both. - Metric construction passes closed_loop_cache through. build_behavioral_examples gets a task_field kwarg (default "task" preserves tool-path behavior; skill-side passes "task_input" so the example shape matches SkillModule.forward). Default --closed-loop-mode is feedback for skills (not trainset). Skill bodies mutate heavily, so the "always" gate mode that trainset needs would fire the validator on every novel candidate — costlier than the tool path. Users opt into trainset explicitly. --- evolution/core/behavioral_example.py | 25 +-- evolution/skills/evolve_skill.py | 192 +++++++++++++++++- evolution/skills/skill_module.py | 16 +- tests/skills/test_evolve_skill_closed_loop.py | 179 ++++++++++++++++ tests/tools/test_evolve_tool_closed_loop.py | 27 ++- 5 files changed, 407 insertions(+), 32 deletions(-) create mode 100644 tests/skills/test_evolve_skill_closed_loop.py diff --git a/evolution/core/behavioral_example.py b/evolution/core/behavioral_example.py index 6e7c47f9..51bee60e 100644 --- a/evolution/core/behavioral_example.py +++ b/evolution/core/behavioral_example.py @@ -7,10 +7,13 @@ and get accepted. The example carries a ``closed_loop_task_id`` marker the metric routes on, -plus a placeholder ``task`` value (the suite's ``user_message``) that -``ToolModule.forward`` skips past on the behavioral branch. ``task`` and -``closed_loop_task_id`` are both marked as input keys so DSPy passes them -to ``forward()`` via ``program(**example.inputs())``. +plus a placeholder task-input value (the suite's ``user_message``) that the +module's ``forward()`` skips past on the behavioral branch. Both fields are +marked as input keys so DSPy passes them via ``program(**example.inputs())``. + +``task_field`` parameterizes the input field name to match the host module's +forward signature: ``ToolModule.forward(task=...)`` uses ``"task"`` (the +default); ``SkillModule.forward(task_input=...)`` passes ``"task_input"``. """ from __future__ import annotations @@ -20,17 +23,15 @@ from evolution.validation.task import TaskSuite -def build_behavioral_examples(suite: TaskSuite) -> list[dspy.Example]: - """One example per task in ``suite``, stable order by ``task_id``. - - The placeholder ``task`` value carries the original ``user_message`` for - debuggability; it isn't consumed by the behavioral metric branch. - """ +def build_behavioral_examples( + suite: TaskSuite, *, task_field: str = "task" +) -> list[dspy.Example]: + """One example per task in ``suite``, stable order by ``task_id``.""" examples = [ dspy.Example( - task=task.user_message, + **{task_field: task.user_message}, closed_loop_task_id=task.task_id, - ).with_inputs("task", "closed_loop_task_id") + ).with_inputs(task_field, "closed_loop_task_id") for task in sorted(suite.tasks, key=lambda t: t.task_id) ] return examples diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index 4e0b8744..80498848 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -461,6 +461,83 @@ def _emit_patch(baseline_text: str, evolved_text: str, path: Path) -> str: return "".join(diff_lines) +def _maybe_build_closed_loop_cache_skill( + *, + skill_name: str, + skill_path: Path, + baseline_skill_body: str, + suite_path: Optional[Path], + saturation_threshold: float, + min_iters: int, + window_size: int, + gate_mode: str = "sampled", +): + """Build a ClosedLoopFeedbackCache for the skill path; return None when disabled. + + Mirrors evolve_tool's _maybe_build_closed_loop_cache. Local imports keep + the validation stack out of the cold path — most evolve_skill runs don't + set the flag and shouldn't pay the import cost. + + Constructs: + - a per-process workdir under /tmp where the installer maintains a + writable copy of the baseline skill (decoupled from the user's + real HERMES_HOME / plugin cache) + - SkillFileInstaller pointing at that workdir + - ClosedLoopValidator + HermesAgentRunner + - ClosedLoopFeedbackCache wired with write_text_artifact (skill bodies + are raw text, not MCP manifests) and .md suffix + """ + if suite_path is None: + return None + import tempfile + + from evolution.core.closed_loop_feedback import ( + ClosedLoopFeedbackCache, + write_text_artifact, + ) + from evolution.validation.artifact_installer import SkillFileInstaller + from evolution.validation.hermes_runner import HermesAgentRunner + from evolution.validation.task import TaskSuite + from evolution.validation.validator import ClosedLoopValidator + + workdir = Path(tempfile.mkdtemp(prefix="cl_skill_workdir_")) + installer = SkillFileInstaller( + skill_source_path=skill_path, + skill_name=skill_name, + workdir=workdir, + ) + runner = HermesAgentRunner() + validator = ClosedLoopValidator(installer=installer, runner=runner) + suite = TaskSuite.from_jsonl(suite_path) + return ClosedLoopFeedbackCache( + validator=validator, + suite=suite, + artifact_name=skill_name, + baseline_artifact_text=baseline_skill_body, + saturation_threshold=saturation_threshold, + min_iters=min_iters, + window_size=window_size, + gate_mode=gate_mode, + artifact_writer=write_text_artifact, + artifact_suffix=".md", + ) + + +def _load_behavioral_examples_from_suite(suite_path: Path) -> list: + """Build behavioral dspy.Examples from a suite file. + + Uses ``task_field="task_input"`` so the examples are shape-compatible + with ``SkillModule.forward(task_input=...)`` — tool-side uses ``"task"``. + Local-import path keeps the validation stack out of the cold path. + """ + from evolution.core.behavioral_example import build_behavioral_examples + from evolution.validation.task import TaskSuite + + return build_behavioral_examples( + TaskSuite.from_jsonl(suite_path), task_field="task_input" + ) + + def _apply_in_place(skill_path: Path, evolved_full: str) -> bool: """Overwrite ``skill_path`` with ``evolved_full``. @@ -516,6 +593,12 @@ def evolve( benchmark_timeout_seconds: int = 600, skip_preflight: bool = False, skip_cost_suggest: bool = False, + closed_loop_suite_path: Optional[Path] = None, + closed_loop_saturation_threshold: float = 0.95, + closed_loop_min_iters: int = 3, + closed_loop_window_size: int = 8, + closed_loop_mode: str = "feedback", + closed_loop_in_valset: bool = False, ): """Main evolution function — orchestrates the full optimization loop.""" @@ -735,6 +818,25 @@ def evolve( baseline_module = SkillModule(skill["body"]) + # In behavioral-trainset modes the saturation gate would defeat + # the purpose — every novel candidate must score every time it's + # sampled. Otherwise default to "sampled" to keep cost bounded; + # skill bodies mutate heavily, so cache hit rate on the validator + # is lower than tool-path. + _cache_gate_mode = ( + "always" if closed_loop_mode in ("trainset", "both") else "sampled" + ) + closed_loop_cache = _maybe_build_closed_loop_cache_skill( + skill_name=skill_name, + skill_path=skill_path, + baseline_skill_body=skill["body"], + suite_path=closed_loop_suite_path, + saturation_threshold=closed_loop_saturation_threshold, + min_iters=closed_loop_min_iters, + window_size=closed_loop_window_size, + gate_mode=_cache_gate_mode, + ) + # Build the metric once: DSPy's LM cache lines up across GEPA's # per-iteration scoring and the holdout eval below. The [BUDGET] # feedback line targets growth_free_threshold (the zone where the @@ -745,11 +847,29 @@ def evolve( judge, baseline_skill_text=skill["body"], max_growth=config.growth_free_threshold, + closed_loop_cache=closed_loop_cache, ) trainset = dataset.to_dspy_examples("train") valset = dataset.to_dspy_examples("val") + # Behavioral-example injection: each closed-loop task becomes an + # additional dspy.Example whose score contributes to GEPA's + # sum(minibatch_scores) acceptance — behavioral wins can break + # judge ties on saturated baselines. + if closed_loop_mode in ("trainset", "both"): + if closed_loop_suite_path is None: + raise ValueError( + f"--closed-loop-mode={closed_loop_mode} requires " + "--closed-loop-during-evolution to be set" + ) + behavioral_examples = _load_behavioral_examples_from_suite( + closed_loop_suite_path + ) + trainset = trainset + behavioral_examples + if closed_loop_in_valset: + valset = valset + behavioral_examples + console.print(f"\n[bold cyan]Running GEPA optimization (budget={gepa_budget})...[/bold cyan]\n") start_time = time.time() @@ -1377,10 +1497,55 @@ def evolve( "closed_loop_suite_path", default=None, type=click.Path(exists=True, dir_okay=False, path_type=Path), - help="Path to a JSONL task suite for in-loop closed-loop validation feedback. " - "Wired symmetrically with evolve_tool, but skill-side closed-loop " - "validation requires a SkillFileInstaller that doesn't exist yet — " - "setting this flag raises until that lands.", + help="Path to a JSONL task suite (e.g. evolution/validation/suites/" + "systematic_debugging.jsonl). When set, the framework runs the " + "closed-loop validator on saturating GEPA iterations and surfaces " + "verdicts into the reflection LM's feedback. Held-out from training " + "tasks (no overlap-detection enforcement).", +) +@click.option( + "--closed-loop-saturation-threshold", + default=0.95, + type=click.FloatRange(min=0.0, max=1.0), + help="Min judge score over the recent window for the saturation gate to " + "open (default 0.95).", +) +@click.option( + "--closed-loop-min-iters", + default=3, + type=click.IntRange(min=1), + help="Periodic-fire floor: fire closed-loop at least every N reflective " + "iterations even when the judge isn't saturating (default 3).", +) +@click.option( + "--closed-loop-window-size", + default=8, + type=click.IntRange(min=1), + help="Number of recent judge scores the saturation gate inspects (default 8).", +) +@click.option( + "--closed-loop-mode", + default="feedback", + type=click.Choice(["feedback", "trainset", "both"]), + help="How closed-loop signal participates in GEPA. 'feedback' (default) " + "appends a [CLOSED_LOOP] block to the reflection LM's input — " + "proposal-prompt signal only, no acceptance change. 'trainset' adds " + "behavioral dspy.Examples to the training set whose score (binary " + "pass/fail from the validator) contributes to GEPA's " + "sum(minibatch_scores) acceptance — lets behavioral wins break judge " + "ties on saturated baselines. 'both' does trainset + the [CLOSED_LOOP] " + "feedback block (most expensive). Skill bodies mutate heavily so " + "trainset/both fires the validator on every novel candidate; default " + "stays 'feedback' to keep cost bounded.", +) +@click.option( + "--closed-loop-in-valset/--no-closed-loop-in-valset", + "closed_loop_in_valset", + default=False, + help="When --closed-loop-mode is trainset or both, also include behavioral " + "examples in the valset (adds them to the Pareto frontier + holdout " + "scoring). Costs more — each accepted candidate triggers another full " + "eval pass over the behavioral examples. Default off.", ) def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflection_model, eval_model, skill_source_dir, dry_run, seed, budget, no_fallback, @@ -1393,14 +1558,13 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti benchmark_cmd, benchmark_timeout_seconds, skip_preflight, skip_cost_suggest, - closed_loop_suite_path): + closed_loop_suite_path, + closed_loop_saturation_threshold, + closed_loop_min_iters, + closed_loop_window_size, + closed_loop_mode, + closed_loop_in_valset): """Evolve an agent skill using DSPy + GEPA optimization.""" - if closed_loop_suite_path is not None: - raise click.UsageError( - "--closed-loop-during-evolution is wired on evolve_skill for CLI " - "consistency, but skill-side closed-loop validation requires a " - "SkillFileInstaller that doesn't exist yet." - ) try: evolve( skill_name=skill, @@ -1437,6 +1601,12 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti benchmark_timeout_seconds=benchmark_timeout_seconds, skip_preflight=skip_preflight, skip_cost_suggest=skip_cost_suggest, + closed_loop_suite_path=closed_loop_suite_path, + closed_loop_saturation_threshold=closed_loop_saturation_threshold, + closed_loop_min_iters=closed_loop_min_iters, + closed_loop_window_size=closed_loop_window_size, + closed_loop_mode=closed_loop_mode, + closed_loop_in_valset=closed_loop_in_valset, ) except HermesProviderError as exc: # Render a clean error panel instead of dumping a Python traceback diff --git a/evolution/skills/skill_module.py b/evolution/skills/skill_module.py index b7655733..fc823585 100644 --- a/evolution/skills/skill_module.py +++ b/evolution/skills/skill_module.py @@ -93,7 +93,21 @@ def __init__(self, skill_text: str): # install the skill body there so forward() reads exactly what GEPA writes. self.predictor.predict.signature = self.predictor.predict.signature.with_instructions(skill_text) - def forward(self, task_input: str) -> dspy.Prediction: + def forward( + self, + task_input: str, + closed_loop_task_id: Optional[str] = None, + ) -> dspy.Prediction: + if closed_loop_task_id is not None: + # Behavioral example: skip the predictor LM call. The metric's + # behavioral branch reads _closed_loop_task_id off the Prediction + # and routes to the closed-loop cache for scoring, so the LM call + # would be wasted spend. + return dspy.Prediction( + output="", + _closed_loop_task_id=closed_loop_task_id, + _candidate_text=self.skill_text, + ) result = self.predictor(task_input=task_input) return dspy.Prediction(output=result.output) diff --git a/tests/skills/test_evolve_skill_closed_loop.py b/tests/skills/test_evolve_skill_closed_loop.py new file mode 100644 index 00000000..91dc4d99 --- /dev/null +++ b/tests/skills/test_evolve_skill_closed_loop.py @@ -0,0 +1,179 @@ +"""Tests for the closed-loop integration on the skill-evolution path. + +Exercises the integration seams independently of GEPA: SkillModule's +behavioral branch, the cache-construction helper, the behavioral-example +loader's task_input shape, and the gate_mode policy. End-to-end behavior +is exercised by the manual smoke harness at +``tests/manual/skill_closed_loop_smoke.py`` (heavy, real LM spend). +""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import patch + +import pytest + +from evolution.core.behavioral_example import build_behavioral_examples +from evolution.skills.evolve_skill import ( + _load_behavioral_examples_from_suite, + _maybe_build_closed_loop_cache_skill, +) +from evolution.skills.skill_module import SkillModule +from evolution.validation.task import TaskSuite + + +_SUITE_FIXTURE = ( + Path(__file__).resolve().parents[2] + / "evolution" + / "validation" + / "suites" + / "systematic_debugging.jsonl" +) + + +@pytest.fixture +def fake_skill_path(tmp_path): + src = tmp_path / "systematic_debugging" + src.mkdir() + md = src / "SKILL.md" + md.write_text( + "---\nname: systematic_debugging\n---\n\n# Weak baseline\n\nTry stuff.\n" + ) + return md + + +# --------------------------------------------------------------------------- +# SkillModule.forward behavioral branch +# --------------------------------------------------------------------------- + + +class TestSkillModuleBehavioralBranch: + def test_normal_branch_calls_predictor(self): + # closed_loop_task_id absent → take the LM path. We mock the + # predictor to avoid a real LM call. + module = SkillModule("body") + with patch.object(module, "predictor") as mock_pred: + mock_pred.return_value = type("R", (), {"output": "hello"})() + result = module.forward(task_input="do X") + assert result.output == "hello" + mock_pred.assert_called_once_with(task_input="do X") + + def test_behavioral_branch_skips_predictor(self): + # closed_loop_task_id present → no LM call; marker fields are + # stuffed onto the Prediction for the metric to route. Real + # SkillModule (no mock) because the behavioral branch must not + # touch the predictor at all — if it did, this test would hang + # waiting for a non-existent LM configuration. + module = SkillModule("evolved body text") + result = module.forward( + task_input="placeholder", + closed_loop_task_id="t1", + ) + assert result._closed_loop_task_id == "t1" + assert result._candidate_text == "evolved body text" + assert result.output == "" + + +# --------------------------------------------------------------------------- +# Behavioral-example loader (skill-side task_input shape) +# --------------------------------------------------------------------------- + + +class TestBehavioralExampleLoader: + def test_skill_examples_use_task_input_field(self, tmp_path): + suite_path = tmp_path / "s.jsonl" + suite_path.write_text( + '{"task_id": "a", "user_message": "debug a", "test_command": "python t.py"}\n' + '{"task_id": "b", "user_message": "debug b", "test_command": "python t.py"}\n' + ) + examples = _load_behavioral_examples_from_suite(suite_path) + assert len(examples) == 2 + # Skill modules take `task_input`; the loader must use that field name. + assert "task_input" in examples[0].inputs() + assert "closed_loop_task_id" in examples[0].inputs() + assert examples[0].task_input == "debug a" + + def test_tool_side_default_still_uses_task_field(self, tmp_path): + # build_behavioral_examples default preserves tool-path shape. + suite_path = tmp_path / "s.jsonl" + suite_path.write_text( + '{"task_id": "a", "user_message": "do a", "expected_tools": ["patch"]}\n' + ) + examples = build_behavioral_examples(TaskSuite.from_jsonl(suite_path)) + assert "task" in examples[0].inputs() + assert examples[0].task == "do a" + + +# --------------------------------------------------------------------------- +# Cache construction helper +# --------------------------------------------------------------------------- + + +class TestMaybeBuildClosedLoopCacheSkill: + def test_none_suite_returns_none(self, fake_skill_path): + cache = _maybe_build_closed_loop_cache_skill( + skill_name="systematic_debugging", + skill_path=fake_skill_path, + baseline_skill_body="body", + suite_path=None, + saturation_threshold=0.95, + min_iters=3, + window_size=8, + ) + assert cache is None + + def test_constructs_cache_with_skill_installer_and_md_suffix( + self, fake_skill_path + ): + cache = _maybe_build_closed_loop_cache_skill( + skill_name="systematic_debugging", + skill_path=fake_skill_path, + baseline_skill_body="baseline body text", + suite_path=_SUITE_FIXTURE, + saturation_threshold=0.95, + min_iters=3, + window_size=8, + ) + assert cache is not None + # Baseline written as raw text via write_text_artifact (no JSON envelope). + baseline_text = cache._baseline_path.read_text() + assert baseline_text == "baseline body text" + # Suffix is .md for skill artifacts (not the default .json). + assert cache._baseline_path.suffix == ".md" + # Default gate_mode for skills is "sampled" — caller passes "always" + # only when wiring trainset mode. + assert cache.gate_mode == "sampled" + + def test_gate_mode_always_propagates(self, fake_skill_path): + cache = _maybe_build_closed_loop_cache_skill( + skill_name="systematic_debugging", + skill_path=fake_skill_path, + baseline_skill_body="body", + suite_path=_SUITE_FIXTURE, + saturation_threshold=0.95, + min_iters=3, + window_size=8, + gate_mode="always", + ) + assert cache.gate_mode == "always" + + def test_installer_skills_src_is_under_workdir(self, fake_skill_path): + # The cache's validator holds an installer with skills_src — confirming + # the wiring is intact so the runner sees the candidate. + cache = _maybe_build_closed_loop_cache_skill( + skill_name="systematic_debugging", + skill_path=fake_skill_path, + baseline_skill_body="body", + suite_path=_SUITE_FIXTURE, + saturation_threshold=0.95, + min_iters=3, + window_size=8, + ) + installer = cache._validator.installer + assert installer.skills_src.name == "skills" + # The baseline skill was copied into the installer's workdir. + target = installer.skills_src / "systematic_debugging" / "SKILL.md" + assert target.is_file() + # The user's original SKILL.md is untouched. + assert "Weak baseline" in fake_skill_path.read_text() diff --git a/tests/tools/test_evolve_tool_closed_loop.py b/tests/tools/test_evolve_tool_closed_loop.py index 8e272ad4..f0c8c42b 100644 --- a/tests/tools/test_evolve_tool_closed_loop.py +++ b/tests/tools/test_evolve_tool_closed_loop.py @@ -142,18 +142,29 @@ def test_flag_with_repo_does_not_immediately_error( assert kwargs["closed_loop_hermes_repo"] == fake_hermes_repo -class TestEvolveSkillFlagRaises: - def test_skill_side_flag_raises_with_clear_error( +class TestEvolveSkillFlagThreadsThrough: + def test_skill_side_closed_loop_flag_passes_to_evolve( self, tmp_path, task_suite_file ): + # The skill-side --closed-loop-during-evolution flag is now fully + # wired (no longer a placeholder UsageError). Confirms the flag + # reaches evolve() — full end-to-end behavior is exercised by + # tests/skills/test_evolve_skill_closed_loop.py and the manual + # smoke harness. from evolution.skills.evolve_skill import main runner = CliRunner() - result = runner.invoke(main, [ - "--skill", "some-skill", - "--closed-loop-during-evolution", str(task_suite_file), - ]) - assert result.exit_code != 0 - assert "SkillFileInstaller" in result.output + with patch("evolution.skills.evolve_skill.evolve") as fake_evolve: + fake_evolve.return_value = None + result = runner.invoke(main, [ + "--skill", "some-skill", + "--closed-loop-during-evolution", str(task_suite_file), + ]) + assert result.exit_code == 0, result.output + kwargs = fake_evolve.call_args.kwargs + assert kwargs["closed_loop_suite_path"] == task_suite_file + # Default mode is feedback for skills (skill bodies mutate heavily + # so opting into trainset is explicit). + assert kwargs["closed_loop_mode"] == "feedback" class TestClosedLoopModeFlag: From 5c6a8065a9281c2a414e544433f014233e445ed7 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 17 May 2026 15:20:44 -0600 Subject: [PATCH 5/5] docs+chore: close PLAN deviation #3 + add skill closed-loop smoke harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PLAN.md Phase 1 deviation #3 was the unfinished-skill-side-closed-loop note. systematic-debugging is now built (commits 1-4); update the deviation to reflect that, and explicitly call out which equivalents remain deferred (arxiv known-paper-recall, github-code-review planted-issue — same harness, different verdict mechanisms). docs/workflows.md grows a "Skill-path equivalent" subsection under Workflow 11 covering the two structural differences (test_command verdict vs tool-call membership; SkillFileInstaller vs in-place tool splice) and the rationale for the different default --closed-loop-mode (feedback for skills, since heavy mutation makes "always" gate mode 3-10x costlier than tool-path). tests/manual/skill_closed_loop_smoke.py drives the closed-loop layer end-to-end against the fake systematic-debugging skill + the planted-bug suite, with real hermes -z subprocesses. Pre-flight checks confirm hermes and python are on PATH. Asserts the cache fires, per-task verdicts come back, candidate-text caching dedups the second call, and no orphaned per-task tmpdirs leak. Not in CI — heavyweight (10 hermes -z calls, real LM spend). --- PLAN.md | 2 +- docs/workflows.md | 11 ++ tests/manual/skill_closed_loop_smoke.py | 209 ++++++++++++++++++++++++ 3 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 tests/manual/skill_closed_loop_smoke.py diff --git a/PLAN.md b/PLAN.md index a8566baf..a7b12172 100644 --- a/PLAN.md +++ b/PLAN.md @@ -372,7 +372,7 @@ The agent can self-invoke optimization: 1. **Benchmark gating (TBLite/YC-Bench) was not wired as a built-in.** The plan made "TBLite within 2%" the next-phase gate. The framework went framework-agnostic (any agent that emits `SKILL.md`), so the Hermes-specific benchmark suites no longer apply uniformly. The built-in deploy gate is a paired-bootstrap CI on a held-out split of the skill's own eval set (`evolution/core/stats.py` + `_check_growth_with_quality_gate`). For users who want a benchmark-style gate, `--benchmark-cmd ""` runs an arbitrary user-provided command after the built-in gate passes; nonzero exit flips the decision to reject with `reason="benchmark_failed"` and a `benchmark` block in `gate_decision.json`. This restores the plan author's "benchmarks as gates" vision in a framework-agnostic way: TBLite users wire `--benchmark-cmd "tblite-runner --threshold 0.02"`, no built-in coupling. 2. **Eval datasets are larger than the planned 15–30 examples.** `eval_dataset_size=150` is the current default, sized so the bootstrap CI half-width can detect ±2% effects. Rationale and the supporting study live in `reports/calibration_findings.md`. -3. **Source D (skill-specific auto-evaluation) is now built for tool descriptions; skill-side auto-eval still deferred.** `evolution/validation/closed_loop` drives a real Hermes Agent via `hermes -z` against a JSONL task suite, scores actual tool-selection behavior with both expected and forbidden tools, compares baseline vs. evolved with a two-condition decision rule (no aggregate regression + no per-task loss unless wins offset 2:1). v1 ships a 5-task suite for `patch`. Skill-side equivalents (planted-bug for `systematic-debugging`, known-paper recall for `arxiv`, planted-issue for `github-code-review`) remain deferred — same harness shape, needs different `ArtifactInstaller` impls. +3. **Source D (skill-specific auto-evaluation) is built for tool descriptions and for `systematic-debugging`; remaining skill-side suites still deferred.** `evolution/validation/closed_loop` drives a real Hermes Agent via `hermes -z` against a JSONL task suite, scores actual tool-selection behavior with both expected and forbidden tools, compares baseline vs. evolved with a two-condition decision rule (no aggregate regression + no per-task loss unless wins offset 2:1). v1 ships a 5-task suite for `patch` (tool-side). Skill-side now ships `systematic-debugging` via a new `SkillFileInstaller` plus a `test_command` verdict mode (exit code of a planted test replaces tool-call membership scoring); same harness, different artifact + scoring shape. Remaining skill-side equivalents (known-paper recall for `arxiv`, planted-issue for `github-code-review`) remain deferred — same harness, different verdict mechanisms (rubric-based for arxiv, planted-issue identification for code-review). 4. **The selection and gating layer is much thicker than the plan anticipated.** Knee-point Pareto selection, a budget-aware reflection-prompt proposer with growth/balanced/compression modes, paired-bootstrap CI, non-inferiority gate option, and the `gate_decision.json` v4 audit schema were added on top of raw GEPA. Rationale: raw "ship the best valset candidate" overfits at small N. See `docs/framework_advantages.md`. 5. **CLI shipped as `python -m evolution.skills.evolve_skill`, not a `hermes evolve skill` subcommand.** The framework rebranded out of Hermes-only branding so it could target any agent framework whose skills are `SKILL.md` files. 6. **`--run-tests` (target-repo pytest as a hard gate) was specified but never wired and has been removed.** Skill text doesn't touch target-repo Python, so coupling the deploy decision to the target repo's test suite would gate on unrelated signal. The CLI flag set a config field that no code read. The unused `EvolutionConfig.run_pytest`, `run_tblite`, `tblite_regression_threshold` fields and `ConstraintValidator.run_test_suite()` method were removed alongside the flag. diff --git a/docs/workflows.md b/docs/workflows.md index 6531e95d..01706618 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -479,6 +479,17 @@ Per-task scores are deterministic over candidate text (cache is keyed by `sha256 `--closed-loop-mode both` does both: trainset behavioral examples for acceptance, plus the `[CLOSED_LOOP]` feedback block on non-behavioral examples for reflection. +### Skill-path equivalent + +`evolve_skill` exposes the same `--closed-loop-*` flags. Two differences from the tool path: + +- **Verdict mechanism is `test_command`, not tool-call membership.** Skill-side suites set `"test_command": "python test_solution.py"` on each task; the validator runs that command in `fixture_dir` after the agent and passes iff exit code is zero. `expected_tools` / `forbidden_tools` aren't meaningful for "did the agent debug correctly"-shape verdicts. The decision rule (two-condition: aggregate no-regression + per-task wins offset losses 2:1) is unchanged. +- **`SkillFileInstaller` instead of in-place tool description splice.** The user's actual skill may live in a read-only plugin cache, so the installer copies the baseline skill directory into a writable workdir at construction and mutates the copy. `HermesAgentRunner._prime_sandbox` reads `TaskRunContext.skills_src` and copies that workdir's `skills/` into each per-task sandbox so `hermes -z` discovers the candidate. The user's source skill is never touched. + +Default `--closed-loop-mode` is `feedback` (not `trainset`) on the skill side. Skill bodies mutate heavily, so the `gate_mode="always"` that trainset needs would fire the validator on every novel candidate — N tasks × 2 phases per fire. Opt into `trainset` / `both` explicitly when the cost is acceptable. + +Reference suite: `evolution/validation/suites/systematic_debugging.jsonl` (5 planted-bug tasks). Manual smoke harness: `tests/manual/skill_closed_loop_smoke.py`. + ## Failure-mode summary | Trigger | Outcome | Where to look | diff --git a/tests/manual/skill_closed_loop_smoke.py b/tests/manual/skill_closed_loop_smoke.py new file mode 100644 index 00000000..df398d47 --- /dev/null +++ b/tests/manual/skill_closed_loop_smoke.py @@ -0,0 +1,209 @@ +"""Manual end-to-end smoke for skill-side closed-loop validation. + +Why this exists: + Unit tests mock the validator so they catch wiring bugs without paying + for real LM calls. They don't exercise the real Hermes agent against + the real planted-bug suite, so a regression in (e.g.) how Hermes + discovers skills inside the per-task sandbox, or how `python + test_solution.py` actually scores in the validator, would slip + through CI. This smoke runs the entire closed-loop layer end-to-end + against: + - the checked-in fake target skill at tests/fixtures/skills/systematic_debugging/ + - the checked-in planted-bug suite at evolution/validation/suites/systematic_debugging.jsonl + - whatever Hermes is configured with (uses the user's real ~/.hermes/config.yaml) + + Drives one ClosedLoopValidator.validate() call directly — 5 tasks × + 2 phases (baseline + evolved) = 10 hermes -z invocations. With a + cheap eval model (gpt-4o-mini-ish) this is pennies, not dollars. + +How to run: + uv run python tests/manual/skill_closed_loop_smoke.py + + Exits 0 on success. Not part of CI — heavyweight (drives real + hermes -z subprocesses + real LM spend). + +Prerequisites: + - `hermes` on PATH and configured (`hermes model` to select one) + - Whatever credentials Hermes needs in `~/.hermes/auth.json` +""" + +from __future__ import annotations + +import shutil +import sys +import tempfile +import time +from pathlib import Path + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +REPO_ROOT = Path(__file__).resolve().parents[2] +FAKE_SKILL_PATH = ( + REPO_ROOT / "tests" / "fixtures" / "skills" / "systematic_debugging" / "SKILL.md" +) +SUITE_PATH = ( + REPO_ROOT / "evolution" / "validation" / "suites" / "systematic_debugging.jsonl" +) +SKILL_NAME = "systematic_debugging" + + +# A deliberately different candidate body so the validator has something +# to do — also serves as a smoke for the structural-difference path. +EVOLVED_BODY = ( + "# Systematic debugging — evolved\n" + "\n" + "When a test fails:\n" + "1. Read the test to understand the spec.\n" + "2. Read the buggy code with the spec in mind.\n" + "3. Form a hypothesis about which line is wrong.\n" + "4. Make the smallest change you can.\n" + "5. Re-run the test. If it passes, stop. If not, revert and try again.\n" +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _watermark_tempdirs(prefix_substrings: list[str]) -> set[Path]: + tmp = Path(tempfile.gettempdir()) + found: set[Path] = set() + for entry in tmp.iterdir(): + if any(sub in entry.name for sub in prefix_substrings): + found.add(entry) + return found + + +def _section(title: str) -> None: + print(f"\n{'=' * 60}\n {title}\n{'=' * 60}") + + +# --------------------------------------------------------------------------- +# The smoke +# --------------------------------------------------------------------------- + + +def main() -> int: + _section("Pre-flight checks") + if not FAKE_SKILL_PATH.is_file(): + print(f" ✗ Missing fixture skill at {FAKE_SKILL_PATH}") + return 1 + if not SUITE_PATH.is_file(): + print(f" ✗ Missing suite at {SUITE_PATH}") + return 1 + if shutil.which("hermes") is None: + print( + " ✗ `hermes` not on PATH. Install hermes-agent first " + "(see README); this smoke drives real hermes -z calls." + ) + return 1 + if shutil.which("python") is None: + print(" ✗ `python` not on PATH. Suite tasks use `python test_solution.py`.") + return 1 + print(f" ✓ Fixture skill present: {FAKE_SKILL_PATH}") + print(f" ✓ Suite present: {SUITE_PATH}") + print(f" ✓ hermes binary present: {shutil.which('hermes')}") + + _section("Constructing closed-loop cache") + from evolution.skills.evolve_skill import _maybe_build_closed_loop_cache_skill + from evolution.skills.skill_module import load_skill + + skill = load_skill(FAKE_SKILL_PATH) + print(f" Baseline body ({len(skill['body'])} chars):") + print(" ┌" + "─" * 58) + for line in skill["body"].splitlines()[:5]: + print(f" │ {line}") + if len(skill["body"].splitlines()) > 5: + print(" │ ...") + print(" └" + "─" * 58) + + tempdirs_before = _watermark_tempdirs(["cl_skill_workdir_", "cl_feedback_"]) + cache = _maybe_build_closed_loop_cache_skill( + skill_name=SKILL_NAME, + skill_path=FAKE_SKILL_PATH, + baseline_skill_body=skill["body"], + suite_path=SUITE_PATH, + saturation_threshold=0.95, + min_iters=1, + window_size=4, + gate_mode="always", # force the validator to fire + ) + assert cache is not None, "cache should be constructed when suite_path is set" + print(f" ✓ Cache constructed (gate_mode={cache.gate_mode})") + installer = cache._validator.installer + print(f" ✓ Installer skills_src: {installer.skills_src}") + target = installer.skills_src / SKILL_NAME / "SKILL.md" + assert target.is_file(), f"baseline skill not copied into workdir: {target}" + print(f" ✓ Baseline staged at: {target}") + + _section("Running validator (5 tasks × 2 phases = 10 hermes -z calls)") + print(" This invokes hermes -z 10 times. Wait ~60-180s depending on LM…\n") + start = time.time() + report = cache.get_or_run(EVOLVED_BODY) + elapsed = time.time() - start + print(f"\n ✓ Validator completed in {elapsed:.1f}s") + + if report is None: + print( + " ✗ Validator returned None — gate closed or validator raised. " + "Re-run with logging at WARNING+ to see the cause." + ) + return 1 + + _section("Report") + print(f" Decision: {report.decision}") + for reason in report.decision_reasons: + print(f" • {reason}") + print( + f"\n Baseline: {report.baseline.n_passed}/" + f"{report.baseline.n_passed + report.baseline.n_failed} " + f"({report.baseline.pass_rate:.0%})" + ) + print( + f" Evolved: {report.evolved.n_passed}/" + f"{report.evolved.n_passed + report.evolved.n_failed} " + f"({report.evolved.pass_rate:.0%})" + ) + print(f" Δ: {report.delta.pass_rate_change:+.2f}") + print( + f" Per-task: {report.delta.n_wins} wins, " + f"{report.delta.n_losses} losses, {report.delta.n_ties} ties" + ) + + print("\n Per-task verdicts (baseline → evolved):") + b_by_id = {t.task_id: t for t in report.baseline.tasks} + for ev in report.evolved.tasks: + b = b_by_id.get(ev.task_id) + b_mark = "✓" if (b and b.passed) else ("◌" if (b and b.abstained) else "✗") + e_mark = "✓" if ev.passed else ("◌" if ev.abstained else "✗") + print(f" [{b_mark} → {e_mark}] {ev.task_id}") + + _section("Cache + temp dir housekeeping") + second_report = cache.get_or_run(EVOLVED_BODY) + assert second_report is report, ( + "second get_or_run with same candidate should be a cache hit" + ) + print(" ✓ Candidate-text cache hit on second call (no re-run)") + + tempdirs_after = _watermark_tempdirs(["cl_skill_workdir_", "cl_feedback_"]) + leaked = tempdirs_after - tempdirs_before + # Cache + installer workdirs persist until process exit by design. + # Count them — they should appear here. What we DON'T want to see is + # cl_fixture_ or cl_hermes_home_ persisting (those are per-task tmpdirs + # that the validator/runner cleans up in their own scope). + leaked_per_task = _watermark_tempdirs(["cl_fixture_", "cl_hermes_home_"]) + print(f" New persistent workdirs (expected, ≤ 2): {len(leaked)}") + print(f" Orphaned per-task tmpdirs (should be 0): {len(leaked_per_task)}") + if leaked_per_task: + print(f" ✗ Orphaned: {leaked_per_task}") + return 1 + + _section("All assertions passed") + return 0 + + +if __name__ == "__main__": + sys.exit(main())