From 1a5714eea4bf689984799116f9304fd8840a48d1 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 31 May 2026 20:28:54 -0600
Subject: [PATCH 01/23] feat(prompts): add PromptSource protocol +
 SectionDescriptor

---
 evolution/prompts/__init__.py       |  2 +-
 evolution/prompts/prompt_source.py  | 53 ++++++++++++++++++++++++++++
 tests/prompts/__init__.py           |  0
 tests/prompts/test_prompt_source.py | 54 +++++++++++++++++++++++++++++
 4 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 evolution/prompts/prompt_source.py
 create mode 100644 tests/prompts/__init__.py
 create mode 100644 tests/prompts/test_prompt_source.py

diff --git a/evolution/prompts/__init__.py b/evolution/prompts/__init__.py
index 85704c83..c342d5d0 100644
--- a/evolution/prompts/__init__.py
+++ b/evolution/prompts/__init__.py
@@ -1 +1 @@
-"""Phase placeholder: prompts evolution."""
+"""Phase 3: system prompt section evolution."""
diff --git a/evolution/prompts/prompt_source.py b/evolution/prompts/prompt_source.py
new file mode 100644
index 00000000..5bd5ca5b
--- /dev/null
+++ b/evolution/prompts/prompt_source.py
@@ -0,0 +1,53 @@
+"""PromptSource Protocol — adapters that read, write, and enumerate named prompt sections.
+
+Phase 3 integrates via in-place splice-and-restore (see
+``HermesPromptSectionInstaller``), so the runtime override seam lives in
+the installer, not here. A PromptSource only needs to read the baseline,
+persist an evolved value, and enumerate what's targetable.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Protocol, runtime_checkable
+
+
+@dataclass(frozen=True)
+class SectionDescriptor:
+    """Immutable metadata about one evolvable prompt section.
+
+    ``applicability`` is informational at design time: v1 does no runtime
+    filtering on it, but downstream joint-optimization work will consume
+    it (e.g. family-targeted sections evaluated only against that family).
+    It is compared but kept out of ``__hash__`` so instances stay hashable.
+    """
+
+    name: str
+    current_text: str
+    source_path: Path
+    applicability: dict[str, str] = field(default_factory=dict, hash=False)
+
+
+@runtime_checkable  # isinstance() checks member presence only, not signatures
+class PromptSource(Protocol):
+    """Adapter contract for prompt-section evolution targets."""
+
+    name: str  # adapter identifier (HermesPromptSource uses "hermes_prompt_source")
+
+    def read(self, section_name: str) -> str:
+        """Return the canonical baseline text of the named section."""
+        ...
+
+    def write(self, section_name: str, new_text: str) -> None:
+        """Persist evolved text to the canonical source.
+
+        Used both at deploy time and as the splice primitive the
+        closed-loop installer drives during validation (the installer
+        owns the backup/restore around the mutation).
+        """
+        ...
+
+    def list_sections(self) -> list[SectionDescriptor]:
+        """Enumerate all evolvable sections this source can target."""
+        ...
diff --git a/tests/prompts/__init__.py b/tests/prompts/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/prompts/test_prompt_source.py b/tests/prompts/test_prompt_source.py
new file mode 100644
index 00000000..41703567
--- /dev/null
+++ b/tests/prompts/test_prompt_source.py
@@ -0,0 +1,54 @@
+"""Tests for the PromptSource protocol contract."""
+from __future__ import annotations
+
+import dataclasses
+from pathlib import Path
+
+from evolution.prompts.prompt_source import PromptSource, SectionDescriptor
+
+
+def test_section_descriptor_is_frozen():
+    """Field assignment on a SectionDescriptor must be rejected."""
+    frozen = SectionDescriptor(
+        "MEMORY_GUIDANCE", "baseline text", Path("/tmp/fake.py")
+    )
+    assert dataclasses.is_dataclass(frozen)
+    try:
+        frozen.name = "OTHER"
+    except dataclasses.FrozenInstanceError:
+        pass
+    else:
+        raise AssertionError("SectionDescriptor must be frozen")
+
+
+def test_prompt_source_protocol_runtime_checkable():
+    """A concrete class implementing the three methods satisfies isinstance()."""
+
+    class StubSource:
+        name = "stub"
+
+        def read(self, section_name: str) -> str:
+            return "stub"
+
+        def write(self, section_name: str, new_text: str) -> None:
+            return None
+
+        def list_sections(self) -> list[SectionDescriptor]:
+            return []
+
+    assert isinstance(StubSource(), PromptSource)
+
+
+def test_prompt_source_protocol_rejects_incomplete():
+    """Missing a required method => not a PromptSource."""
+
+    class MissingWrite:
+        name = "incomplete"
+
+        def read(self, section_name: str) -> str:
+            return "x"
+
+        def list_sections(self) -> list[SectionDescriptor]:
+            return []
+
+    assert not isinstance(MissingWrite(), PromptSource)

From 2b857acfae4d8dc7e3aa9c0f27cb5bc5f0975d5b Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 31 May 2026 20:29:27 -0600
Subject: [PATCH 02/23] feat(prompts): HermesPromptSource AST-based read

---
 evolution/prompts/hermes_prompt_source.py  | 64 +++++++++++++++++++++
 tests/prompts/test_hermes_prompt_source.py | 66 ++++++++++++++++++++++
 2 files changed, 130 insertions(+)
 create mode 100644 evolution/prompts/hermes_prompt_source.py
 create mode 100644 tests/prompts/test_hermes_prompt_source.py

diff --git a/evolution/prompts/hermes_prompt_source.py b/evolution/prompts/hermes_prompt_source.py
new file mode 100644
index 00000000..1b4412c3
--- /dev/null
+++ b/evolution/prompts/hermes_prompt_source.py
@@ -0,0 +1,64 @@
+"""HermesPromptSource — read/write named string constants in Hermes prompt_builder.py.
+
+Walks ``agent/prompt_builder.py`` for top-level ``NAME = "..."`` (or
+concatenated-string) assignments. v1 supports string-typed constants
+only; dict-typed constants (like ``PLATFORM_HINTS``) raise KeyError on
+read.
+"""
+
+from __future__ import annotations
+
+import ast
+import logging
+from pathlib import Path
+
+from evolution.prompts.prompt_source import SectionDescriptor
+
+logger = logging.getLogger(__name__)
+
+
+class HermesPromptSource:
+    """Read/write named string constants in Hermes prompt_builder.py."""
+
+    name = "hermes_prompt_source"  # satisfies the PromptSource ``name`` member
+
+    def __init__(self, hermes_repo: Path) -> None:
+        self.hermes_repo = Path(hermes_repo)  # normalizes str input to a Path
+        self.prompt_builder_path = self.hermes_repo / "agent" / "prompt_builder.py"
+        if not self.prompt_builder_path.is_file():  # fail fast, not on first read
+            raise FileNotFoundError(
+                f"prompt_builder.py not found at {self.prompt_builder_path}"
+            )
+
+    def read(self, section_name: str) -> str:
+        constants = self._parse_string_constants()  # file is re-parsed on every call
+        if section_name not in constants:
+            raise KeyError(
+                f"section {section_name!r} not found in {self.prompt_builder_path} "
+                f"(v1 only supports top-level string-typed constants). "
+                f"Available: {sorted(constants)}"
+            )
+        return constants[section_name][0]
+
+    def _parse_string_constants(self) -> dict[str, tuple[str, ast.Constant]]:
+        """Return ``{name: (value, value_ast_node)}`` for every top-level
+        string-typed assignment in prompt_builder.py.
+
+        Concatenated-string forms like ``X = ("a" "b" "c")`` are folded to
+        a single ``ast.Constant`` by the parser, so they read back as one
+        string. The AST node is retained so ``write`` can splice by byte
+        offset.
+        """
+        source = self.prompt_builder_path.read_text(encoding="utf-8")
+        tree = ast.parse(source, filename=str(self.prompt_builder_path))
+        out: dict[str, tuple[str, ast.Constant]] = {}
+        for node in tree.body:  # module-level statements only
+            if not (isinstance(node, ast.Assign) and len(node.targets) == 1):
+                continue  # skips ``A = B = "..."`` and annotated (AnnAssign) forms
+            target = node.targets[0]
+            if not isinstance(target, ast.Name):
+                continue  # skips tuple/attribute/subscript targets
+            value = node.value
+            if isinstance(value, ast.Constant) and isinstance(value.value, str):
+                out[target.id] = (value.value, value)
+        return out
diff --git a/tests/prompts/test_hermes_prompt_source.py b/tests/prompts/test_hermes_prompt_source.py
new file mode 100644
index 00000000..aa1794ef
--- /dev/null
+++ b/tests/prompts/test_hermes_prompt_source.py
@@ -0,0 +1,66 @@
+"""Tests for HermesPromptSource — AST-based read/write/list."""
+from __future__ import annotations
+
+import textwrap
+from pathlib import Path
+
+import pytest
+
+from evolution.prompts.hermes_prompt_source import HermesPromptSource
+
+
+@pytest.fixture
+def fake_hermes_repo(tmp_path: Path) -> Path:
+    """A tmp hermes-agent-like checkout with a stub prompt_builder.py."""
+    (tmp_path / "agent").mkdir()
+    pb = tmp_path / "agent" / "prompt_builder.py"
+    pb.write_text(textwrap.dedent('''\
+        """Stub prompt_builder for tests."""
+        import os
+
+        MEMORY_GUIDANCE = (
+            "You have persistent memory across sessions. "
+            "Save durable facts."
+        )
+
+        SKILLS_GUIDANCE = "After completing a complex task, save the approach."
+
+        PLATFORM_HINTS = {
+            "cli": "You are a CLI AI Agent.",
+        }
+
+        def _not_a_constant():
+            return "ignored"
+    '''))
+    return tmp_path
+
+
+def test_read_concatenated_string_constant(fake_hermes_repo: Path):
+    source = HermesPromptSource(hermes_repo=fake_hermes_repo)
+    text = source.read("MEMORY_GUIDANCE")
+    assert "persistent memory" in text
+    assert "Save durable facts." in text
+
+
+def test_read_simple_string_constant(fake_hermes_repo: Path):
+    source = HermesPromptSource(hermes_repo=fake_hermes_repo)
+    text = source.read("SKILLS_GUIDANCE")
+    assert text == "After completing a complex task, save the approach."
+
+
+def test_read_skips_dict_constants(fake_hermes_repo: Path):
+    """PLATFORM_HINTS is a dict; v1 doesn't support dict-shape sections."""
+    source = HermesPromptSource(hermes_repo=fake_hermes_repo)
+    with pytest.raises(KeyError, match="PLATFORM_HINTS"):
+        source.read("PLATFORM_HINTS")
+
+
+def test_read_unknown_constant_raises(fake_hermes_repo: Path):
+    source = HermesPromptSource(hermes_repo=fake_hermes_repo)
+    with pytest.raises(KeyError, match="NONEXISTENT"):
+        source.read("NONEXISTENT")
+
+
+def test_missing_prompt_builder_raises(tmp_path: Path):
+    with pytest.raises(FileNotFoundError, match="prompt_builder.py"):
+        HermesPromptSource(hermes_repo=tmp_path)

From c9f48e7c4d62533c2530fd16c1a30902c58d0a9e Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 31 May 2026 20:30:05 -0600
Subject: [PATCH 03/23] feat(prompts): HermesPromptSource AST-based write

---
 evolution/prompts/hermes_prompt_source.py  | 62 ++++++++++++++++++++++
 tests/prompts/test_hermes_prompt_source.py | 29 ++++++++++
 2 files changed, 91 insertions(+)

diff --git a/evolution/prompts/hermes_prompt_source.py b/evolution/prompts/hermes_prompt_source.py
index 1b4412c3..bae944f3 100644
--- a/evolution/prompts/hermes_prompt_source.py
+++ b/evolution/prompts/hermes_prompt_source.py
@@ -10,6 +10,9 @@
 
 import ast
 import logging
+import os
+import shutil
+import tempfile
 from pathlib import Path
 
 from evolution.prompts.prompt_source import SectionDescriptor
@@ -62,3 +65,62 @@ def _parse_string_constants(self) -> dict[str, tuple[str, ast.Constant]]:
             if isinstance(value, ast.Constant) and isinstance(value.value, str):
                 out[target.id] = (value.value, value)
         return out
+
+    def write(self, section_name: str, new_text: str) -> None:
+        """Splice ``new_text`` into the named constant in place.
+
+        Uses ``repr()`` for the replacement literal so the new text
+        round-trips byte-equal regardless of embedded newlines, quotes,
+        or backslashes. Other constants are left verbatim.
+
+        The write is atomic (tempfile + ``os.replace``) and guarded: the
+        new bytes must parse as Python before the original is replaced.
+        A botched splice (only possible if AST extraction were wrong)
+        raises and leaves ``prompt_builder.py`` untouched, rather than
+        leaving the user's Hermes unstartable.
+        """
+        constants = self._parse_string_constants()
+        if section_name not in constants:
+            raise KeyError(
+                f"section {section_name!r} not found in {self.prompt_builder_path}"
+            )
+        _, value_node = constants[section_name]  # only the node's position is needed
+        data = self.prompt_builder_path.read_bytes()  # AST columns are UTF-8 byte offsets
+        start_offset = _byte_offset(data, value_node.lineno, value_node.col_offset)
+        end_offset = _byte_offset(
+            data, value_node.end_lineno, value_node.end_col_offset
+        )
+        replacement = repr(new_text).encode("utf-8")  # one literal replaces the whole span
+        new_bytes = data[:start_offset] + replacement + data[end_offset:]
+
+        try:
+            ast.parse(new_bytes, filename=str(self.prompt_builder_path))  # syntax check only
+        except SyntaxError as exc:
+            raise RuntimeError(
+                f"Refusing to write {self.prompt_builder_path}: spliced output "
+                f"would not parse as Python ({exc}). Original file untouched."
+            ) from exc
+
+        self._atomic_write_bytes(self.prompt_builder_path, new_bytes)  # only after validation
+
+    @staticmethod
+    def _atomic_write_bytes(path: Path, data: bytes) -> None:
+        fd, tmp_name = tempfile.mkstemp(dir=path.parent, suffix=path.suffix)  # beside target
+        tmp_path = Path(tmp_name)
+        try:
+            with os.fdopen(fd, "wb") as fh:  # the with-block closes the mkstemp descriptor
+                fh.write(data)
+            shutil.copymode(path, tmp_path)  # mkstemp files are owner-only; mirror target's mode
+            os.replace(tmp_path, path)  # rename the finished file over the target
+        except BaseException:  # also covers KeyboardInterrupt: drop the temp file, re-raise
+            tmp_path.unlink(missing_ok=True)
+            raise
+
+
+def _byte_offset(data: bytes, lineno: int, col_offset: int) -> int:
+    """Translate an AST position (1-based line number, 0-based byte
+    column) into an absolute byte offset within ``data``."""
+    rows = data.splitlines(keepends=True)
+    if not 1 <= lineno <= len(rows):
+        raise ValueError(f"lineno {lineno} out of range [1, {len(rows)}]")
+    return len(b"".join(rows[: lineno - 1])) + col_offset
diff --git a/tests/prompts/test_hermes_prompt_source.py b/tests/prompts/test_hermes_prompt_source.py
index aa1794ef..e177c7ab 100644
--- a/tests/prompts/test_hermes_prompt_source.py
+++ b/tests/prompts/test_hermes_prompt_source.py
@@ -64,3 +64,32 @@ def test_read_unknown_constant_raises(fake_hermes_repo: Path):
 def test_missing_prompt_builder_raises(tmp_path: Path):
     with pytest.raises(FileNotFoundError, match="prompt_builder.py"):
         HermesPromptSource(hermes_repo=tmp_path)
+
+
+def test_write_replaces_string_constant(fake_hermes_repo: Path):
+    source = HermesPromptSource(hermes_repo=fake_hermes_repo)
+    new_text = "Replacement guidance for memory."
+    source.write("MEMORY_GUIDANCE", new_text)
+    assert source.read("MEMORY_GUIDANCE") == new_text
+    # Confirm SKILLS_GUIDANCE was untouched.
+    assert source.read("SKILLS_GUIDANCE") == (
+        "After completing a complex task, save the approach."
+    )
+
+
+def test_write_preserves_file_parseability(fake_hermes_repo: Path):
+    source = HermesPromptSource(hermes_repo=fake_hermes_repo)
+    # A value with newlines, quotes, and backslashes — repr() must
+    # produce a literal that round-trips byte-equal.
+    tricky = 'line one\nline two with "quotes" and \\ backslash'
+    source.write("MEMORY_GUIDANCE", tricky)
+    assert source.read("MEMORY_GUIDANCE") == tricky
+    # File must still be valid Python; parse bytes so the locale is not involved.
+    import ast as _ast
+    _ast.parse((fake_hermes_repo / "agent" / "prompt_builder.py").read_bytes())
+
+
+def test_write_unknown_section_raises(fake_hermes_repo: Path):
+    source = HermesPromptSource(hermes_repo=fake_hermes_repo)
+    with pytest.raises(KeyError, match="NONEXISTENT"):
+        source.write("NONEXISTENT", "x")

From 22fac1cd8a34176f238ec023e20d167ddeb95c21 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 31 May 2026 20:30:31 -0600
Subject: [PATCH 04/23] feat(prompts): HermesPromptSource section enumeration

---
 evolution/prompts/hermes_prompt_source.py  | 11 +++++++++++
 tests/prompts/test_hermes_prompt_source.py | 17 +++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/evolution/prompts/hermes_prompt_source.py b/evolution/prompts/hermes_prompt_source.py
index bae944f3..b92935b2 100644
--- a/evolution/prompts/hermes_prompt_source.py
+++ b/evolution/prompts/hermes_prompt_source.py
@@ -43,6 +43,17 @@ def read(self, section_name: str) -> str:
             )
         return constants[section_name][0]
 
+    def list_sections(self) -> list[SectionDescriptor]:
+        """Return one descriptor per string constant, sorted by name."""
+        parsed = self._parse_string_constants()
+        path = self.prompt_builder_path
+        return [
+            SectionDescriptor(
+                name=section, current_text=parsed[section][0], source_path=path
+            )
+            for section in sorted(parsed)
+        ]
+
     def _parse_string_constants(self) -> dict[str, tuple[str, ast.Constant]]:
         """Return ``{name: (value, value_ast_node)}`` for every top-level
         string-typed assignment in prompt_builder.py.
diff --git a/tests/prompts/test_hermes_prompt_source.py b/tests/prompts/test_hermes_prompt_source.py
index e177c7ab..c631a435 100644
--- a/tests/prompts/test_hermes_prompt_source.py
+++ b/tests/prompts/test_hermes_prompt_source.py
@@ -93,3 +93,20 @@ def test_write_unknown_section_raises(fake_hermes_repo: Path):
     source = HermesPromptSource(hermes_repo=fake_hermes_repo)
     with pytest.raises(KeyError, match="NONEXISTENT"):
         source.write("NONEXISTENT", "x")
+
+
+def test_list_sections_enumerates_string_constants(fake_hermes_repo: Path):
+    source = HermesPromptSource(hermes_repo=fake_hermes_repo)
+    sections = source.list_sections()
+    names = {s.name for s in sections}
+    assert "MEMORY_GUIDANCE" in names
+    assert "SKILLS_GUIDANCE" in names
+    assert "PLATFORM_HINTS" not in names  # dict-typed → excluded
+
+
+def test_list_sections_populates_descriptors(fake_hermes_repo: Path):
+    source = HermesPromptSource(hermes_repo=fake_hermes_repo)
+    by_name = {s.name: s for s in source.list_sections()}
+    skills = by_name["SKILLS_GUIDANCE"]
+    assert skills.current_text == "After completing a complex task, save the approach."
+    assert skills.source_path == fake_hermes_repo / "agent" / "prompt_builder.py"

From 9f2f4a6727bf5e575fb3282a7fbf36e9e225a077 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 31 May 2026 20:31:19 -0600
Subject: [PATCH 05/23] feat(validation): extend Task with
 expected_save_content

---
 evolution/validation/task.py  | 13 +++++++++++++
 tests/validation/test_task.py | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/evolution/validation/task.py b/evolution/validation/task.py
index 209c8b38..b7833fa0 100644
--- a/evolution/validation/task.py
+++ b/evolution/validation/task.py
@@ -32,6 +32,11 @@ class Task:
       the verdict is "did the agent's edits make the planted test
       pass" rather than "did the agent invoke the right tools." When
       set, takes precedence over the tool-call rule.
+
+    ``expected_save_content`` is an optional rubric (not exact text)
+    describing what a good ``memory(action='save')`` would contain. It
+    feeds the prompt-section compound verdict's Layer 2 content judge; it
+    has no effect on the Layer 1 tool-call rule above.
     """
 
     task_id: str
@@ -40,6 +45,7 @@ class Task:
     forbidden_tools: tuple[str, ...] = ()
     fixture_setup: dict[str, str] = field(default_factory=dict)
     test_command: Optional[str] = None
+    expected_save_content: Optional[str] = None
 
     def render_message(self, fixture_dir: Path) -> str:
         """Substitute ``{fixture_dir}`` in the message with the resolved path.
@@ -98,6 +104,12 @@ def _task_from_dict(obj: dict, *, source: str) -> Task:
         raise ValueError(
             f"{source}: test_command must be a string (got {type(test_command).__name__})"
         )
+    expected_save_content = obj.get("expected_save_content")
+    if expected_save_content is not None and not isinstance(expected_save_content, str):
+        raise ValueError(
+            f"{source}: expected_save_content must be a string "
+            f"(got {type(expected_save_content).__name__})"
+        )
     return Task(
         task_id=obj["task_id"],
         user_message=obj["user_message"],
@@ -105,4 +117,5 @@ def _task_from_dict(obj: dict, *, source: str) -> Task:
         forbidden_tools=tuple(obj.get("forbidden_tools") or ()),
         fixture_setup=dict(fixture_setup),
         test_command=test_command,
+        expected_save_content=expected_save_content,
     )
diff --git a/tests/validation/test_task.py b/tests/validation/test_task.py
index 4c0b0908..ae658bda 100644
--- a/tests/validation/test_task.py
+++ b/tests/validation/test_task.py
@@ -155,3 +155,35 @@ def test_test_command_non_string_raises(self, tmp_path):
         }])
         with pytest.raises(ValueError, match="test_command must be a string"):
             TaskSuite.from_jsonl(p)
+
+
+class TestExpectedSaveContent:
+    def _write_jsonl(self, path: Path, rows: list[dict]) -> None:
+        path.write_text("\n".join(json.dumps(r) for r in rows) + "\n")
+
+    def test_round_trips_from_jsonl(self, tmp_path):
+        p = tmp_path / "suite.jsonl"
+        self._write_jsonl(p, [{
+            "task_id": "save-pref-001",
+            "user_message": "I prefer uv over pip for Python projects.",
+            "expected_tools": ["memory"],
+            "expected_save_content": "user prefers uv over pip",
+        }])
+        suite = TaskSuite.from_jsonl(p)
+        assert suite.tasks[0].expected_save_content == "user prefers uv over pip"
+
+    def test_defaults_to_none(self, tmp_path):
+        p = tmp_path / "suite.jsonl"
+        self._write_jsonl(p, [{
+            "task_id": "t1", "user_message": "hi", "expected_tools": ["memory"],
+        }])
+        suite = TaskSuite.from_jsonl(p)
+        assert suite.tasks[0].expected_save_content is None
+
+    def test_non_string_raises(self, tmp_path):
+        p = tmp_path / "suite.jsonl"
+        self._write_jsonl(p, [{
+            "task_id": "t", "user_message": "m", "expected_save_content": 42,
+        }])
+        with pytest.raises(ValueError, match="expected_save_content must be a string"):
+            TaskSuite.from_jsonl(p)

From b217e8b0f7f52a03ec619ff9bd327cf9041adc6f Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 31 May 2026 20:32:49 -0600
Subject: [PATCH 06/23] feat(validation): capture tool call args in session
 parser

---
 evolution/validation/agent_runner.py   |  7 ++++
 evolution/validation/hermes_runner.py  | 48 ++++++++++++++++++++++++
 tests/validation/test_hermes_runner.py | 52 ++++++++++++++++++++++++++
 3 files changed, 107 insertions(+)

diff --git a/evolution/validation/agent_runner.py b/evolution/validation/agent_runner.py
index dc4500d4..df9a4d7c 100644
--- a/evolution/validation/agent_runner.py
+++ b/evolution/validation/agent_runner.py
@@ -21,6 +21,12 @@ class AgentRunResult:
     tool_call dicts) the agent invoked during the session. The validator
     only needs names for the expected / forbidden membership tests.
 
+    ``tool_calls_with_args`` carries the same calls in order as
+    ``{"name", "arguments"}`` dicts (arguments parsed from the
+    LLM-emitted JSON). The compound-verdict Layer 2 judge needs the
+    argument payloads — e.g. the content of a ``memory(action='save')``
+    call — which ``tool_calls_seq`` discards.
+
     ``error`` is set when the runner itself failed to drive the agent
     (subprocess timeout, no session JSON written, parse failure). It's
     distinct from "agent invoked a tool that failed" — that's still a
@@ -35,6 +41,7 @@ class AgentRunResult:
     model_name: Optional[str] = None
     error: Optional[str] = None
     session_path: Optional[Path] = None
+    tool_calls_with_args: list[dict] = field(default_factory=list)
 
 
 @dataclass(frozen=True)
diff --git a/evolution/validation/hermes_runner.py b/evolution/validation/hermes_runner.py
index f93e118c..82ddcde0 100644
--- a/evolution/validation/hermes_runner.py
+++ b/evolution/validation/hermes_runner.py
@@ -210,6 +210,7 @@ def parse_session_result(
 
     messages = data.get("messages") or []
     tool_calls_seq = _extract_tool_call_names(messages)
+    tool_calls_with_args = _extract_tool_calls_with_args(messages)
     final_text_tail = _extract_final_text_tail(messages)
     model_name = data.get("model")
 
@@ -219,6 +220,7 @@ def parse_session_result(
         duration_seconds=duration_seconds,
         model_name=model_name,
         session_path=session_path,
+        tool_calls_with_args=tool_calls_with_args,
     )
 
 
@@ -256,6 +258,52 @@ def _call_name(call: dict) -> Optional[str]:
     return str(flat) if flat else None
 
 
+def _extract_tool_calls_with_args(messages: list[dict]) -> list[dict]:
+    """Return ``[{name, arguments}, ...]`` for each assistant tool call.
+
+    Arguments are parsed from the LLM-emitted JSON string. Malformed or
+    non-object arguments fall back to ``{}`` rather than dropping the
+    call — the Layer 2 judge can still treat "memory was invoked with
+    empty args" as a behavior signal. Non-dict messages/calls are skipped.
+    Handles OpenAI-nested and flat shapes, like ``_extract_tool_call_names``.
+    """
+    out: list[dict] = []
+    for msg in messages:
+        if not isinstance(msg, dict) or msg.get("role") != "assistant":
+            continue
+        for call in msg.get("tool_calls") or []:
+            if not isinstance(call, dict):
+                continue
+            name = _call_name(call)
+            if not name:
+                continue
+            args_raw = _call_arguments_raw(call)
+            try:
+                args = json.loads(args_raw) if args_raw else {}
+            except (json.JSONDecodeError, TypeError):
+                args = {}
+            if not isinstance(args, dict):
+                args = {}  # e.g. a JSON list or scalar: keep the call, drop the payload
+            out.append({"name": name, "arguments": args})
+    return out
+
+
+def _call_arguments_raw(call: dict) -> str:
+    """Return the call's arguments as a JSON string, or "" if none is usable.
+
+    The nested ``function.arguments`` slot is tried before the flat key.
+    """
+    fn = call.get("function")
+    candidates = [fn.get("arguments")] if isinstance(fn, dict) else []
+    candidates.append(call.get("arguments"))
+    for payload in candidates:
+        if isinstance(payload, str):
+            return payload
+        if isinstance(payload, dict):
+            return json.dumps(payload)
+    return ""
+
+
 def _extract_final_text_tail(messages: list[dict]) -> str:
     """Last 4096 chars of the last assistant message with text content."""
     for msg in reversed(messages):
diff --git a/tests/validation/test_hermes_runner.py b/tests/validation/test_hermes_runner.py
index d970c29e..da1b2fea 100644
--- a/tests/validation/test_hermes_runner.py
+++ b/tests/validation/test_hermes_runner.py
@@ -173,6 +173,58 @@ def test_call_missing_name_skipped(self, tmp_path):
         assert result.tool_calls_seq == ["patch"]
 
 
+class TestParseToolCallArgs:
+    def test_captures_tool_call_args(self, tmp_path):
+        """Sessions with tool calls must surface name AND parsed args."""
+        p = tmp_path / "session.json"
+        _write_session(p, [
+            {"role": "user", "content": "save a fact"},
+            {"role": "assistant", "content": "", "tool_calls": [
+                {"function": {
+                    "name": "memory",
+                    "arguments": json.dumps({
+                        "action": "save",
+                        "content": "user prefers terse responses",
+                    }),
+                }}
+            ]},
+            {"role": "tool", "content": "ok"},
+            {"role": "assistant", "content": "Saved."},
+        ])
+        result = parse_session_result(p, duration_seconds=1.0)
+        assert result.tool_calls_seq == ["memory"]
+        assert len(result.tool_calls_with_args) == 1
+        call = result.tool_calls_with_args[0]
+        assert call["name"] == "memory"
+        assert call["arguments"]["action"] == "save"
+        assert call["arguments"]["content"] == "user prefers terse responses"
+
+    def test_handles_malformed_args(self, tmp_path):
+        """Malformed tool-call arguments JSON must not crash — fall back to {}."""
+        p = tmp_path / "session.json"
+        _write_session(p, [
+            {"role": "assistant", "tool_calls": [
+                {"function": {"name": "memory", "arguments": "{not-json"}}
+            ]},
+        ])
+        result = parse_session_result(p, duration_seconds=1.0)
+        assert result.tool_calls_seq == ["memory"]
+        assert result.tool_calls_with_args == [{"name": "memory", "arguments": {}}]
+
+    def test_handles_flat_dict_args(self, tmp_path):
+        """Flat tool_call shape with an already-parsed dict argument."""
+        p = tmp_path / "session.json"
+        _write_session(p, [
+            {"role": "assistant", "tool_calls": [
+                {"name": "memory", "arguments": {"action": "delete", "key": "x"}}
+            ]},
+        ])
+        result = parse_session_result(p, duration_seconds=1.0)
+        assert result.tool_calls_with_args == [
+            {"name": "memory", "arguments": {"action": "delete", "key": "x"}}
+        ]
+
+
 class TestHermesAgentRunnerSubprocess:
     """The subprocess invocation layer: env + cwd + args plumbing."""
 

From 3385837c45535dbc75c68c94fe3511d6daaa7c40 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 31 May 2026 20:33:37 -0600
Subject: [PATCH 07/23] feat(validation): HermesPromptSectionInstaller

---
 evolution/validation/artifact_installer.py  | 32 ++++++++++
 tests/validation/test_artifact_installer.py | 66 +++++++++++++++++++++
 2 files changed, 98 insertions(+)

diff --git a/evolution/validation/artifact_installer.py b/evolution/validation/artifact_installer.py
index 44780ec2..2e56df3e 100644
--- a/evolution/validation/artifact_installer.py
+++ b/evolution/validation/artifact_installer.py
@@ -22,6 +22,7 @@
 from pathlib import Path
 from typing import Optional, Protocol
 
+from evolution.prompts.hermes_prompt_source import HermesPromptSource
 from evolution.tools.hermes_source import HermesToolSource
 from evolution.tools.tool_source import ToolManifest
 
@@ -139,6 +140,37 @@ def _extract_description(self, artifact_source: Path) -> str:
             return manifest.find_tool(self.tool_name).description
 
 
+class HermesPromptSectionInstaller:
+    """Splice an evolved prompt section into Hermes ``agent/prompt_builder.py``.
+
+    The artifact source is a plain-text file holding the candidate
+    section body. ``install`` reads that text and asks ``HermesPromptSource``
+    to splice it into the named string constant in place; the validator's
+    backup/flock/sha-drift machinery (shared with the tool-description
+    path) guards the live checkout and restores it afterward.
+
+    Constraint: the target section must be a top-level string constant
+    (the same shape ``HermesPromptSource`` reads). Dict-typed sections
+    like ``PLATFORM_HINTS`` are not installable.
+    """
+
+    def __init__(self, hermes_repo: Path, section_name: str) -> None:
+        self.hermes_repo = Path(hermes_repo)
+        self.section_name = section_name  # not validated here; checked when install() writes
+        self._source = HermesPromptSource(self.hermes_repo)  # FileNotFoundError if file missing
+        self.target_path = self._source.prompt_builder_path
+
+    def install(self, artifact_source: Path) -> str:
+        """Splice the candidate section text from ``artifact_source`` into the
+        live ``prompt_builder.py`` and return the post-install sha256."""
+        new_text = artifact_source.read_text(encoding="utf-8")  # used verbatim, never stripped
+        self._source.write(self.section_name, new_text)  # KeyError if not a string constant
+        return sha256_of(self.target_path)
+
+    def verify_backup(self, backup_path: Path) -> None:
+        verify_python_parses(backup_path)  # NOTE(review): presumably raises if unparsable
+
+
 class SkillFileInstaller:
     """Write an evolved SKILL.md into a writable workdir for closed-loop validation.
 
diff --git a/tests/validation/test_artifact_installer.py b/tests/validation/test_artifact_installer.py
index 95b1cfbd..d6d6ea4c 100644
--- a/tests/validation/test_artifact_installer.py
+++ b/tests/validation/test_artifact_installer.py
@@ -261,3 +261,69 @@ def test_rejects_invalid_utf8(self, baseline_skill, tmp_path):
         backup.write_bytes(b"\xff\xfe\x00\x00invalid")
         with pytest.raises(ValueError, match="not valid UTF-8"):
             installer.verify_backup(backup)
+
+
+# ---- HermesPromptSectionInstaller ----
+
+import textwrap
+
+from evolution.validation.artifact_installer import (
+    HermesPromptSectionInstaller,
+    sha256_of,
+)
+
+
+def _fake_hermes_repo(tmp_path: Path) -> Path:
+    (tmp_path / "agent").mkdir()
+    (tmp_path / "agent" / "prompt_builder.py").write_text(textwrap.dedent('''\
+        """Stub prompt_builder."""
+
+        MEMORY_GUIDANCE = (
+            "You have persistent memory. "
+            "Save durable facts."
+        )
+
+        SKILLS_GUIDANCE = "After a complex task, save the approach."
+    '''))
+    return tmp_path
+
+
+class TestHermesPromptSectionInstaller:
+    def test_target_path_is_prompt_builder(self, tmp_path):
+        repo = _fake_hermes_repo(tmp_path)
+        installer = HermesPromptSectionInstaller(repo, "MEMORY_GUIDANCE")
+        assert installer.target_path == repo / "agent" / "prompt_builder.py"
+
+    def test_install_splices_candidate_and_returns_sha(self, tmp_path):
+        repo = _fake_hermes_repo(tmp_path)
+        installer = HermesPromptSectionInstaller(repo, "MEMORY_GUIDANCE")
+        candidate = tmp_path / "candidate.txt"
+        candidate.write_text("EVOLVED memory guidance body.")
+
+        returned_sha = installer.install(candidate)
+
+        pb = repo / "agent" / "prompt_builder.py"
+        assert returned_sha == sha256_of(pb)
+        # The new text is live; the sibling section is untouched.
+        from evolution.prompts.hermes_prompt_source import HermesPromptSource
+        src = HermesPromptSource(repo)
+        assert src.read("MEMORY_GUIDANCE") == "EVOLVED memory guidance body."
+        assert src.read("SKILLS_GUIDANCE") == "After a complex task, save the approach."
+        # File still parses.
+        import ast
+        ast.parse(pb.read_text())
+
+    def test_verify_backup_rejects_non_python(self, tmp_path):
+        repo = _fake_hermes_repo(tmp_path)
+        installer = HermesPromptSectionInstaller(repo, "MEMORY_GUIDANCE")
+        bad = tmp_path / "bad.cl_backup"
+        bad.write_text("def broken(:\n")
+        with pytest.raises(SyntaxError):
+            installer.verify_backup(bad)
+
+    def test_verify_backup_accepts_valid_python(self, tmp_path):
+        repo = _fake_hermes_repo(tmp_path)
+        installer = HermesPromptSectionInstaller(repo, "MEMORY_GUIDANCE")
+        good = tmp_path / "good.cl_backup"
+        good.write_text("X = 'ok'\n")
+        installer.verify_backup(good)  # must not raise

From 552a408008ad50beb2de5cadf5d7358d96dcb27d Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 31 May 2026 20:35:19 -0600
Subject: [PATCH 08/23] feat(validation): optional Layer 2 judge in
 ClosedLoopValidator + score_task

---
 evolution/validation/report.py     | 22 ++++++++-
 evolution/validation/validator.py  | 18 +++++++-
 tests/validation/test_report.py    | 71 ++++++++++++++++++++++++++++++
 tests/validation/test_validator.py | 49 +++++++++++++++++++++
 4 files changed, 157 insertions(+), 3 deletions(-)

diff --git a/evolution/validation/report.py b/evolution/validation/report.py
index 94de672d..707cedff 100644
--- a/evolution/validation/report.py
+++ b/evolution/validation/report.py
@@ -11,7 +11,7 @@
 import subprocess
 from dataclasses import asdict, dataclass, field
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 
 from rich.console import Console
 from rich.table import Table
@@ -55,6 +55,8 @@ def score_task(
     test_command: Optional[str] = None,
     fixture_dir: Optional[Path] = None,
     test_command_timeout_seconds: float = 60.0,
+    layer2_judge_fn: Optional[Callable[[list[dict]], float]] = None,
+    layer2_threshold: float = 0.7,
 ) -> tuple[bool, bool]:
     """Return (passed, abstained).
 
@@ -68,6 +70,16 @@ def score_task(
     in this mode. Command failure modes (nonzero exit, timeout,
     FileNotFoundError) all map to ``(False, False)`` — "the test did
     not pass," which is the meaningful verdict regardless of cause.
+
+    Layer 2 (compound verdict, prompt-section suites): when
+    ``layer2_judge_fn`` is provided, a task passes only if Layer 1
+    (trigger membership) passes AND the judge returns a score
+    ``>= layer2_threshold``. The judge receives the subset of
+    ``run.tool_calls_with_args`` whose name is ``memory`` (each item the
+    call's ``arguments`` dict). Layer 2 is short-circuited when Layer 1
+    fails — the judge is never called, so no LLM cost is spent on a task
+    that already failed the trigger test. ``test_command`` mode ignores
+    Layer 2.
     """
     if run.error is not None:
         return False, True
@@ -82,6 +94,14 @@ def score_task(
         return False, False
     if expected_tools and not (invoked & set(expected_tools)):
         return False, False
+    if layer2_judge_fn is not None:
+        memory_calls = [
+            c["arguments"]
+            for c in run.tool_calls_with_args
+            if c.get("name") == "memory"
+        ]
+        if layer2_judge_fn(memory_calls) < layer2_threshold:
+            return False, False
     return True, False
 
 
diff --git a/evolution/validation/validator.py b/evolution/validation/validator.py
index 936692b5..2f788c93 100644
--- a/evolution/validation/validator.py
+++ b/evolution/validation/validator.py
@@ -22,7 +22,7 @@
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Iterator, Optional
+from typing import Callable, Iterator, Optional
 
 from evolution.validation.agent_runner import AgentRunner, TaskRunContext
 from evolution.validation.artifact_installer import (
@@ -80,9 +80,21 @@ class ClosedLoopValidator:
     times and aggregate.
     """
 
-    def __init__(self, installer: ArtifactInstaller, runner: AgentRunner) -> None:
+    def __init__(
+        self,
+        installer: ArtifactInstaller,
+        runner: AgentRunner,
+        *,
+        layer2_judge_fn: Optional[Callable[[list[dict]], float]] = None,
+        layer2_threshold: float = 0.7,
+    ) -> None:
         self.installer = installer
         self.runner = runner
+        # Optional compound-verdict Layer 2 (prompt-section suites). When
+        # unset, scoring is Layer 1 only — the tool-description path is
+        # unchanged.
+        self.layer2_judge_fn = layer2_judge_fn
+        self.layer2_threshold = layer2_threshold
 
     def validate(self, inputs: ValidationInputs) -> ValidationReport:
         target = self.installer.target_path
@@ -149,6 +161,8 @@ def _run_one_task(self, task: Task) -> TaskResult:
                 run=run,
                 test_command=task.test_command,
                 fixture_dir=fixture_dir,
+                layer2_judge_fn=self.layer2_judge_fn,
+                layer2_threshold=self.layer2_threshold,
             )
             return TaskResult(
                 task_id=task.task_id,
diff --git a/tests/validation/test_report.py b/tests/validation/test_report.py
index ec8eaa8b..98b5d9ab 100644
--- a/tests/validation/test_report.py
+++ b/tests/validation/test_report.py
@@ -68,6 +68,77 @@ def test_error_marks_abstention(self):
         assert abstained
 
 
+class TestScoreTaskLayer2:
+    """Compound verdict: Layer 1 (trigger) + optional Layer 2 (content judge)."""
+
+    def _save_run(self, content: str = "good") -> AgentRunResult:
+        return AgentRunResult(
+            tool_calls_seq=["memory"], final_text_tail="", duration_seconds=0.0,
+            tool_calls_with_args=[
+                {"name": "memory", "arguments": {"action": "save", "content": content}}
+            ],
+        )
+
+    def test_no_judge_is_layer1_only(self):
+        passed, abstained = score_task(
+            expected_tools=("memory",), forbidden_tools=(), run=self._save_run(),
+        )
+        assert passed and not abstained
+
+    def test_passes_when_both_layers_ok(self):
+        passed, abstained = score_task(
+            expected_tools=("memory",), forbidden_tools=(), run=self._save_run(),
+            layer2_judge_fn=lambda calls: 0.9, layer2_threshold=0.7,
+        )
+        assert passed and not abstained
+
+    def test_fails_when_layer2_below_threshold(self):
+        passed, abstained = score_task(
+            expected_tools=("memory",), forbidden_tools=(), run=self._save_run("bad"),
+            layer2_judge_fn=lambda calls: 0.5, layer2_threshold=0.7,
+        )
+        assert not passed and not abstained
+
+    def test_layer1_failure_short_circuits_judge(self):
+        """Layer 1 fail => plain failure (not abstention), judge never called."""
+        run = AgentRunResult(
+            tool_calls_seq=[], final_text_tail="", duration_seconds=0.0,
+            tool_calls_with_args=[],
+        )
+        calls_seen = []
+
+        def judge_fn(memory_calls):
+            calls_seen.append(memory_calls)
+            return 1.0
+
+        passed, abstained = score_task(
+            expected_tools=("memory",), forbidden_tools=(), run=run,
+            layer2_judge_fn=judge_fn, layer2_threshold=0.7,
+        )
+        assert not passed and not abstained
+        assert calls_seen == []
+
+    def test_judge_receives_only_memory_call_args(self):
+        run = AgentRunResult(
+            tool_calls_seq=["read_file", "memory"], final_text_tail="", duration_seconds=0.0,
+            tool_calls_with_args=[
+                {"name": "read_file", "arguments": {"path": "x"}},
+                {"name": "memory", "arguments": {"action": "save", "content": "c"}},
+            ],
+        )
+        received = []
+
+        def judge_fn(memory_calls):
+            received.append(memory_calls)
+            return 1.0
+
+        score_task(
+            expected_tools=("memory",), forbidden_tools=(), run=run,
+            layer2_judge_fn=judge_fn, layer2_threshold=0.7,
+        )
+        assert received == [[{"action": "save", "content": "c"}]]
+
+
 class TestScoreTaskTestCommandMode:
     """When ``test_command`` is set on a task, the verdict is exit-code-driven,
     not tool-call-driven. Used by skill-side suites (e.g., planted-bug:
diff --git a/tests/validation/test_validator.py b/tests/validation/test_validator.py
index 1e8417eb..3326c7a1 100644
--- a/tests/validation/test_validator.py
+++ b/tests/validation/test_validator.py
@@ -54,6 +54,55 @@ def _write_suite(tmp_path: Path, tasks: list[dict]) -> TaskSuite:
     return TaskSuite.from_jsonl(p)
 
 
+class TestClosedLoopValidatorLayer2:
+    def test_layer2_judge_threaded_into_scoring(self, tmp_path):
+        """A configured Layer 2 judge runs per scored task and can fail a
+        task whose Layer 1 trigger passed."""
+        target = tmp_path / "prompt_builder.py"
+        target.write_text("MEMORY_GUIDANCE = 'orig'\n")
+        baseline = tmp_path / "baseline.txt"
+        baseline.write_text("baseline body")
+        evolved = tmp_path / "evolved.txt"
+        evolved.write_text("evolved body")
+
+        suite = _write_suite(tmp_path, [
+            {"task_id": "t1", "user_message": "save", "expected_tools": ["memory"]},
+        ])
+
+        class _MemoryRunner:
+            def __init__(self, target_path):
+                self.target_path = target_path
+
+            def run(self, ctx):
+                return AgentRunResult(
+                    tool_calls_seq=["memory"], final_text_tail="ok",
+                    duration_seconds=0.1, model_name="test-model",
+                    tool_calls_with_args=[
+                        {"name": "memory", "arguments": {"action": "save", "content": "x"}}
+                    ],
+                )
+
+        judged = []
+
+        def judge_fn(memory_calls):
+            judged.append(memory_calls)
+            return 0.2  # below threshold → Layer 2 fails the task
+
+        validator = ClosedLoopValidator(
+            _StubInstaller(target), _MemoryRunner(target),
+            layer2_judge_fn=judge_fn, layer2_threshold=0.7,
+        )
+        report = validator.validate(ValidationInputs(
+            tool_name="MEMORY_GUIDANCE", suite=suite,
+            baseline_artifact=baseline, evolved_artifact=evolved,
+        ))
+        # Judge invoked once per phase (baseline + evolved) on the one task.
+        assert len(judged) == 2
+        # Both phases fail Layer 2 → 0 pass rate, no regression decision.
+        assert report.baseline.pass_rate == 0.0
+        assert report.evolved.pass_rate == 0.0
+
+
 class TestClosedLoopValidatorHappyPath:
     def test_pass_when_evolved_strictly_improves(self, tmp_path):
         target = tmp_path / "tool.py"

From 8435864354d4275ab73128f4e77a65347c899659 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 31 May 2026 20:36:23 -0600
Subject: [PATCH 09/23] feat(prompts): SaveCallJudge signature + scorer

---
 evolution/prompts/prompt_judge.py  | 107 +++++++++++++++++++++++++++++
 tests/prompts/test_prompt_judge.py |  61 ++++++++++++++++
 2 files changed, 168 insertions(+)
 create mode 100644 evolution/prompts/prompt_judge.py
 create mode 100644 tests/prompts/test_prompt_judge.py

diff --git a/evolution/prompts/prompt_judge.py b/evolution/prompts/prompt_judge.py
new file mode 100644
index 00000000..6e693867
--- /dev/null
+++ b/evolution/prompts/prompt_judge.py
@@ -0,0 +1,107 @@
+"""LLM-as-judge for memory-save calls — scores args against MEMORY_GUIDANCE rules.
+
+Layer 2 of the compound verdict. Layer 1 (trigger membership) is handled
+by ``score_task``'s existing expected_tools / forbidden_tools logic.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import dspy
+
+from evolution.core.config import EvolutionConfig
+from evolution.core.fitness import _clamp_to_unit
+
+logger = logging.getLogger(__name__)
+
+MAX_JUDGED_CALLS_PER_TASK = 5
+"""Cap on how many save calls per task the judge will score. Excess calls
+beyond the cap score 0 each — bounds cost on pathological cases where the
+agent saves on every turn."""
+
+
+class SaveCallSignature(dspy.Signature):
+    """Score a memory-save call against MEMORY_GUIDANCE's rules.
+
+    Output ``quality`` (0.0-1.0): how well ``saved_content`` follows the
+    rules — durable (not stale in a week), declarative phrasing (not
+    imperative), focused on facts that prevent future correction, and NOT
+    task progress, PR numbers, or completed-work logs.
+    """
+    # ``quality`` is declared ``str``; ``SaveCallJudge.score`` hands the raw value to ``_clamp_to_unit``.
+    task: str = dspy.InputField(desc="The user task that prompted the save")
+    expected_content: str = dspy.InputField(
+        desc="A rubric for what the saved content should resemble (not exact text)"
+    )
+    saved_content: str = dspy.InputField(desc="The content the agent actually saved")
+    quality: str = dspy.OutputField(
+        desc="0.0-1.0 quality score per MEMORY_GUIDANCE rules"
+    )
+    feedback: str = dspy.OutputField(
+        desc="One-sentence diagnosis of any rule violation; empty if quality is 1.0"
+    )
+
+
+class SaveCallJudge:
+    """LLM scorer for individual memory-save calls."""
+
+    def __init__(self, config: EvolutionConfig):
+        self.config = config
+        self.judge = dspy.ChainOfThought(SaveCallSignature)
+
+    def score(self, *, task: str, expected_content: str, saved_content: str) -> float:
+        """Judge one save call; returns ``quality`` via ``_clamp_to_unit``."""
+        _lm = self.config.get_lm("eval")
+        # Merge into one dict so a judge setting already present in
+        # ``lm_kwargs`` is overridden instead of raising a duplicate-keyword
+        # TypeError at call time.
+        lm = dspy.LM(_lm.model, **{
+            **_lm.lm_kwargs,
+            "temperature": 0.0, "max_tokens": 1000,
+            "request_timeout": 60, "num_retries": 5,
+        })
+        with dspy.context(lm=lm):
+            result = self.judge(
+                task=task, expected_content=expected_content,
+                saved_content=saved_content,
+            )
+        return _clamp_to_unit(result.quality)
+
+
+def judge_save_calls(
+    *,
+    judge: SaveCallJudge | None,
+    calls: list[dict[str, Any]],
+    expected_content: str | None,
+    task_text: str = "",
+) -> float:
+    """Return the mean Layer 2 quality over one task's memory-save calls.
+
+    ``calls`` holds the ``arguments`` dicts of the task's ``memory`` tool
+    calls (a filtered view of ``tool_calls_with_args``); entries whose
+    ``action`` is not ``'save'`` are ignored.
+
+    The result is 1.0 when there is nothing to judge: no save calls were
+    made (a missing save is Layer 1's concern, Layer 2 only grades what
+    was actually saved), or no judge / rubric is configured.
+
+    At most ``MAX_JUDGED_CALLS_PER_TASK`` saves are sent to the judge;
+    every save past the cap contributes 0 to the mean.
+    """
+    saves = [args for args in calls if args.get("action") == "save"]
+    nothing_to_judge = not saves or judge is None or expected_content is None
+    if nothing_to_judge:
+        return 1.0
+
+    judged_scores = [
+        judge.score(
+            task=task_text,
+            expected_content=expected_content,
+            saved_content=str(args.get("content", "")),
+        )
+        for args in saves[:MAX_JUDGED_CALLS_PER_TASK]
+    ]
+    # Dividing by every save (not just the judged ones) scores the overflow as 0.
+    return sum(judged_scores) / len(saves)
diff --git a/tests/prompts/test_prompt_judge.py b/tests/prompts/test_prompt_judge.py
new file mode 100644
index 00000000..49ad13aa
--- /dev/null
+++ b/tests/prompts/test_prompt_judge.py
@@ -0,0 +1,61 @@
+"""Tests for the SaveCallJudge — scores memory-save args against MEMORY_GUIDANCE rules."""
+from __future__ import annotations
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from evolution.prompts.prompt_judge import SaveCallJudge, judge_save_calls
+
+
+def test_no_save_calls_yields_default():
+    """No save calls at all → score 1.0 (vacuously correct). Layer 1 catches
+    'should have saved but didn't'; Layer 2 only scores content of calls made."""
+    assert judge_save_calls(judge=None, calls=[], expected_content=None) == 1.0
+
+
+def test_invokes_judge_per_call_and_means():
+    fake_judge = MagicMock(spec=SaveCallJudge)
+    fake_judge.score.side_effect = [0.8, 0.6]
+    calls = [
+        {"action": "save", "content": "user prefers concise responses"},
+        {"action": "save", "content": "completed phase 3"},
+    ]
+    score = judge_save_calls(
+        judge=fake_judge, calls=calls,
+        expected_content="user preference about response style",
+    )
+    assert score == pytest.approx(0.7)
+    assert fake_judge.score.call_count == 2
+
+
+def test_caps_at_five_calls():
+    """Pathological: agent saves on every turn. Judge at most 5; excess score 0."""
+    fake_judge = MagicMock(spec=SaveCallJudge)
+    fake_judge.score.return_value = 1.0
+    calls = [{"action": "save", "content": f"item {i}"} for i in range(10)]
+    score = judge_save_calls(judge=fake_judge, calls=calls, expected_content="any")
+    # 5 scored 1.0, 5 unjudged scored 0 → mean 0.5
+    assert score == pytest.approx(0.5)
+    assert fake_judge.score.call_count == 5
+
+
+def test_filters_non_save_actions():
+    fake_judge = MagicMock(spec=SaveCallJudge)
+    fake_judge.score.return_value = 1.0
+    calls = [
+        {"action": "delete", "key": "x"},
+        {"action": "save", "content": "real save"},
+    ]
+    score = judge_save_calls(judge=fake_judge, calls=calls, expected_content="any")
+    assert score == pytest.approx(1.0)
+    assert fake_judge.score.call_count == 1
+
+
+def test_none_judge_or_expected_is_vacuous_pass():
+    """A save call exists but no judge/rubric configured → don't penalize."""
+    calls = [{"action": "save", "content": "x"}]
+    assert judge_save_calls(judge=None, calls=calls, expected_content="r") == 1.0
+    fake = MagicMock(spec=SaveCallJudge)
+    assert judge_save_calls(judge=fake, calls=calls, expected_content=None) == 1.0
+    fake.score.assert_not_called()

From 2365031189f28cd2649e22d3f972830c852d5d18 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 31 May 2026 20:37:38 -0600
Subject: [PATCH 10/23] feat(prompts): PromptModule DSPy wrapper

---
 evolution/prompts/prompt_module.py  | 113 ++++++++++++++++++++++++++++
 tests/prompts/test_prompt_module.py |  44 +++++++++++
 2 files changed, 157 insertions(+)
 create mode 100644 evolution/prompts/prompt_module.py
 create mode 100644 tests/prompts/test_prompt_module.py

diff --git a/evolution/prompts/prompt_module.py b/evolution/prompts/prompt_module.py
new file mode 100644
index 00000000..ad8f3487
--- /dev/null
+++ b/evolution/prompts/prompt_module.py
@@ -0,0 +1,113 @@
+"""PromptModule — DSPy module wrapping a prompt-section candidate.
+
+Unlike ``ToolModule``, the predictor here is a passthrough: there is no
+cheap "select a tool from the manifest" classification GEPA can score
+without a real agent. Every meaningful eval requires a Hermes subprocess
+(via closed-loop). The predictor exists only to give GEPA a place to hang
+the candidate text via ``signature.instructions`` — GEPA mutates the
+instructions, the framework extracts the candidate via ``section_text``,
+and the closed-loop scorer runs it against the real agent.
+
+Sentinel markers wrap the candidate region so ``section_text`` reads it
+back unambiguously after GEPA's edits.
+
+DO NOT "simplify" by dropping the predictor wrapper. GEPA discovers
+optimization targets via ``dspy.Module.named_predictors()``, which only
+returns objects with the predictor interface. A bare module with no
+predictor child has nothing for GEPA to mutate.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+import dspy
+
+
+class SentinelParseError(ValueError):
+    """A section's sentinels are missing, duplicated, or in the wrong order."""
+
+
+def _open_sentinel(section_name: str) -> str:
+    return "<!-- SECTION:{} -->".format(section_name)
+
+
+def _close_sentinel(section_name: str) -> str:
+    return "<!-- /SECTION:{} -->".format(section_name)
+
+
+def _render_instructions(section_name: str, candidate_text: str) -> str:
+    """Return the optimizer preamble followed by the sentinel-wrapped candidate."""
+    return (
+        f"The following is a candidate for the {section_name} section of an agent's system "
+        "prompt. Iteration mutates only the text between the sentinel markers below.\n\n"
+        f"{_open_sentinel(section_name)}{candidate_text}{_close_sentinel(section_name)}"
+    )
+
+
+def _extract_from_sentinels(instructions: str, section_name: str) -> str:
+    """Return the text between the section's sentinels, or raise
+    ``SentinelParseError`` if either marker is absent, repeated, or reversed."""
+    opener, closer = _open_sentinel(section_name), _close_sentinel(section_name)
+    n_open, n_close = instructions.count(opener), instructions.count(closer)
+    tally = f"(open={n_open}, close={n_close})"
+    if not (n_open and n_close):
+        raise SentinelParseError(
+            f"sentinels for {section_name!r} not found in instructions {tally}"
+        )
+    if max(n_open, n_close) > 1:
+        raise SentinelParseError(
+            f"sentinels for {section_name!r} appear multiple times {tally}"
+        )
+    # Exactly one of each marker remains, so index() cannot raise here.
+    body_start = instructions.index(opener) + len(opener)
+    body_end = instructions.index(closer)
+    if body_end < body_start:
+        raise SentinelParseError(
+            f"closing sentinel for {section_name!r} precedes opening sentinel"
+        )
+    return instructions[body_start:body_end]
+
+
+class PromptPassthroughSignature(dspy.Signature):
+    """Carrier for the candidate section text via signature.instructions.
+
+    The input/output fields are placeholders; the real evaluation happens
+    behaviorally via closed-loop, routed by the metric's behavioral branch.
+    """
+    # ``PromptModule.__init__`` replaces this signature's instructions with the rendered candidate.
+    task: str = dspy.InputField(desc="Placeholder; real evaluation is behavioral")
+    response: str = dspy.OutputField(desc="Placeholder")
+
+
+class PromptModule(dspy.Module):
+    """DSPy module whose single predictor carries a prompt-section candidate."""
+
+    def __init__(self, section_name: str, candidate_text: str):
+        super().__init__()
+        self.section_name = section_name
+        self.passthrough = dspy.ChainOfThought(PromptPassthroughSignature)
+        # Seed the predictor's instructions with the sentinel-wrapped
+        # candidate; ``section_text`` reads it back from the same place.
+        rendered = _render_instructions(section_name, candidate_text)
+        predictor = self.passthrough.predict
+        predictor.signature = predictor.signature.with_instructions(rendered)
+
+    def forward(
+        self,
+        task: str,
+        closed_loop_task_id: Optional[str] = None,
+    ) -> dspy.Prediction:
+        # No LM call happens here: the prediction only carries the task id
+        # and current candidate, which the metric picks up via getattr.
+        candidate = self.section_text
+        return dspy.Prediction(
+            response="",
+            _closed_loop_task_id=closed_loop_task_id, _candidate_text=candidate,
+        )
+
+    @property
+    def section_text(self) -> str:
+        """The candidate currently held between the sentinels in the instructions."""
+        predictor = self.passthrough.predict
+        return _extract_from_sentinels(predictor.signature.instructions, self.section_name)
diff --git a/tests/prompts/test_prompt_module.py b/tests/prompts/test_prompt_module.py
new file mode 100644
index 00000000..b369f003
--- /dev/null
+++ b/tests/prompts/test_prompt_module.py
@@ -0,0 +1,44 @@
+"""PromptModule — DSPy wrapper exposing the candidate section as predictor instructions."""
+from __future__ import annotations
+
+from evolution.prompts.prompt_module import PromptModule
+
+
+def test_stores_candidate_in_predictor_instructions():
+    module = PromptModule(
+        section_name="MEMORY_GUIDANCE",
+        candidate_text="evolved candidate body",
+    )
+    instructions = module.passthrough.predict.signature.instructions
+    assert "evolved candidate body" in instructions
+    assert "MEMORY_GUIDANCE" in instructions
+
+
+def test_section_text_extracts_current_candidate():
+    module = PromptModule(section_name="MEMORY_GUIDANCE", candidate_text="v1")
+    assert module.section_text == "v1"
+    # Simulate a GEPA mutation of the instructions.
+    new_instructions = module.passthrough.predict.signature.instructions.replace(
+        "v1", "v2-mutated"
+    )
+    module.passthrough.predict.signature = (
+        module.passthrough.predict.signature.with_instructions(new_instructions)
+    )
+    assert module.section_text == "v2-mutated"
+
+
+def test_forward_routes_behavioral():
+    """forward always returns the candidate + task id for behavioral scoring —
+    there's no cheap predictor score for a prompt section."""
+    module = PromptModule(section_name="MEMORY_GUIDANCE", candidate_text="evolved body")
+    pred = module.forward(task="anything", closed_loop_task_id="task-001")
+    assert pred._candidate_text == "evolved body"
+    assert pred._closed_loop_task_id == "task-001"
+
+
+def test_named_predictors_exposes_target():
+    """GEPA discovers mutation targets via named_predictors(); the passthrough
+    predictor must be visible there."""
+    module = PromptModule(section_name="MEMORY_GUIDANCE", candidate_text="x")
+    names = [name for name, _ in module.named_predictors()]
+    assert any("passthrough" in n for n in names)

From 625f979bef68518d0ba42522dc0a97d98b5db0e7 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 31 May 2026 20:39:02 -0600
Subject: [PATCH 11/23] feat(prompts): GEPA fitness metric + memoizing splice
 scorer

---
 evolution/prompts/prompt_judge.py  | 84 +++++++++++++++++++++++++++++-
 tests/prompts/test_prompt_judge.py | 71 +++++++++++++++++++++++++
 2 files changed, 154 insertions(+), 1 deletion(-)

diff --git a/evolution/prompts/prompt_judge.py b/evolution/prompts/prompt_judge.py
index 6e693867..b4e705a8 100644
--- a/evolution/prompts/prompt_judge.py
+++ b/evolution/prompts/prompt_judge.py
@@ -7,7 +7,7 @@
 from __future__ import annotations
 
 import logging
-from typing import Any
+from typing import Any, Callable, Optional
 
 import dspy
 
@@ -105,3 +105,85 @@ def judge_save_calls(
         ))
     scores.extend([0.0] * unjudged_count)
     return sum(scores) / len(scores)
+
+
+def make_prompt_fitness_metric(
+    *,
+    baseline_text: str,
+    max_growth: float,
+    closed_loop_scorer: Optional[Callable[[str, str], float]] = None,
+) -> Callable:
+    """Return a 5-argument GEPA metric that scores a prompt-section candidate.
+
+    The metric reads two attributes off the prediction via ``getattr``:
+    ``_closed_loop_task_id`` and ``_candidate_text``. Without a task id it
+    returns score 0 plus a diagnostic feedback string, so a wiring mistake
+    shows up in GEPA feedback instead of producing a plausible score.
+
+    Otherwise the score is ``closed_loop_scorer(task_id, candidate_text)``,
+    or 0 when no scorer was given (handy for dry-run wiring tests that
+    must not spawn agents). When ``baseline_text`` is non-empty the
+    feedback reports candidate length, baseline length and the ceiling
+    ``int(len(baseline_text) * (1 + max_growth))``; the ceiling is only
+    reported here, it does not alter the score.
+    """
+    baseline_len = len(baseline_text or "")
+    ceiling = int(baseline_len * (1 + max_growth)) if baseline_len else 0
+    missing_task_id_feedback = (
+        "No closed_loop_task_id on prediction — prompt-section eval "
+        "requires behavioral routing. Check that the dataset builder "
+        "set the closed_loop_task_id input field."
+    )
+
+    def _budget_note(candidate_len: int) -> str:
+        if not baseline_len:
+            return ""
+        return (
+            f"[BUDGET] candidate={candidate_len} chars, "
+            f"baseline={baseline_len} chars, ceiling={ceiling} chars"
+        )
+
+    def metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
+        task_id = getattr(pred, "_closed_loop_task_id", None)
+        if task_id is None:
+            return dspy.Prediction(score=0.0, feedback=missing_task_id_feedback)
+        candidate = getattr(pred, "_candidate_text", "") or ""
+        if closed_loop_scorer is None:
+            score = 0.0
+        else:
+            score = closed_loop_scorer(task_id, candidate)
+        return dspy.Prediction(score=score, feedback=_budget_note(len(candidate)))
+
+    return metric
+
+
+_UNSET = object()
+
+
+def make_memoizing_splice_scorer(
+    *,
+    install_fn: Callable[[str], None],
+    score_fn: Callable[[str], float],
+) -> Callable[[str, str], float]:
+    """Build ``closed_loop_scorer(task_id, candidate_text) -> float`` that
+    splices a candidate only when it changes.
+
+    ``install_fn(candidate_text)`` runs only when ``candidate_text`` differs
+    from the last successfully installed value, so consecutive tasks for one
+    candidate reuse the live splice; ``score_fn(task_id)`` scores against
+    whatever is installed. A raising ``install_fn`` clears the memo, so the
+    next call installs again. Backup/restore of the mutated source is the
+    caller's responsibility — wrap the whole GEPA run, not each call.
+    """
+    state: dict[str, Any] = {"installed": _UNSET}
+
+    def scorer(task_id: str, candidate_text: str) -> float:
+        if state["installed"] != candidate_text:
+            # Forget the old value first: a failed install leaves the target
+            # in an unknown state, so nothing may be assumed installed.
+            state["installed"] = _UNSET
+            install_fn(candidate_text)
+            state["installed"] = candidate_text
+        return score_fn(task_id)
+
+    return scorer
diff --git a/tests/prompts/test_prompt_judge.py b/tests/prompts/test_prompt_judge.py
index 49ad13aa..c0e620a7 100644
--- a/tests/prompts/test_prompt_judge.py
+++ b/tests/prompts/test_prompt_judge.py
@@ -59,3 +59,74 @@ def test_none_judge_or_expected_is_vacuous_pass():
     fake = MagicMock(spec=SaveCallJudge)
     assert judge_save_calls(judge=fake, calls=calls, expected_content=None) == 1.0
     fake.score.assert_not_called()
+
+
+# ---- make_prompt_fitness_metric ----
+
+from evolution.prompts.prompt_judge import (
+    make_memoizing_splice_scorer,
+    make_prompt_fitness_metric,
+)
+
+
+def _behavioral_pred(task_id="task-001", candidate="evolved body"):
+    pred = type("Pred", (), {})()
+    pred._closed_loop_task_id = task_id
+    pred._candidate_text = candidate
+    return pred
+
+
+def test_metric_routes_behavioral_through_scorer():
+    seen = []
+
+    def fake_scorer(task_id, candidate_text):
+        seen.append((task_id, candidate_text))
+        return 0.85
+
+    metric = make_prompt_fitness_metric(
+        baseline_text="baseline", max_growth=0.2, closed_loop_scorer=fake_scorer,
+    )
+    result = metric(gold=object(), pred=_behavioral_pred())
+    assert result.score == 0.85
+    assert seen == [("task-001", "evolved body")]
+    assert "BUDGET" in result.feedback  # length feedback present
+
+
+def test_metric_without_task_id_scores_zero():
+    metric = make_prompt_fitness_metric(
+        baseline_text="b", max_growth=0.2, closed_loop_scorer=lambda *_: 1.0,
+    )
+    pred = type("Pred", (), {})()  # no _closed_loop_task_id
+    result = metric(gold=object(), pred=pred)
+    assert result.score == 0.0
+    assert "behavioral" in result.feedback.lower()
+
+
+def test_metric_without_scorer_scores_zero():
+    metric = make_prompt_fitness_metric(
+        baseline_text="b", max_growth=0.2, closed_loop_scorer=None,
+    )
+    result = metric(gold=object(), pred=_behavioral_pred())
+    assert result.score == 0.0
+
+
+# ---- make_memoizing_splice_scorer ----
+
+def test_memoizing_scorer_splices_only_on_candidate_change():
+    installs: list[str] = []
+    scores = {"task-a": 0.7, "task-b": 0.9}
+
+    scorer = make_memoizing_splice_scorer(
+        install_fn=lambda text: installs.append(text),
+        score_fn=lambda task_id: scores[task_id],
+    )
+    # Same candidate across two tasks → one install.
+    assert scorer("task-a", "cand-1") == 0.7
+    assert scorer("task-b", "cand-1") == 0.9
+    assert installs == ["cand-1"]
+    # New candidate → re-splice.
+    assert scorer("task-a", "cand-2") == 0.7
+    assert installs == ["cand-1", "cand-2"]
+    # Back to a prior candidate is NOT cached across changes → re-splice.
+    assert scorer("task-a", "cand-1") == 0.7
+    assert installs == ["cand-1", "cand-2", "cand-1"]

From 725908ba4a5d69eeec7605d910919edfb7d22f65 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Sun, 31 May 2026 20:41:13 -0600
Subject: [PATCH 12/23] feat(prompts): memory_guidance dataset builder +
 curated eval suite

---
 evolution/core/dataset_builder.py             | 99 +++++++++++++++++++
 .../validation/suites/memory_guidance.jsonl   | 23 +++++
 tests/core/test_dataset_builder.py            | 42 ++++++++
 3 files changed, 164 insertions(+)
 create mode 100644 evolution/validation/suites/memory_guidance.jsonl

diff --git a/evolution/core/dataset_builder.py b/evolution/core/dataset_builder.py
index 239e149c..28ac5f69 100644
--- a/evolution/core/dataset_builder.py
+++ b/evolution/core/dataset_builder.py
@@ -485,3 +485,102 @@ def load(path: Path, seed: int = 42) -> EvalDataset:
             val_ratio=0.25,
             holdout_ratio=0.25,
         )
+
+
+MEMORY_GUIDANCE_CATEGORIES = (
+    "save-preference",
+    "save-correction",
+    "dont-save-task-progress",
+    "dont-save-completed-work-log",
+    "declarative-vs-imperative",
+)
+
+_MEMORY_GUIDANCE_CATEGORY_PROMPTS = {
+    "save-preference": (
+        "Generate ONE closed-loop eval task (category: save-preference) where the "
+        "user explicitly states a durable preference the agent SHOULD save to "
+        "memory. Output a single JSON object with fields: user_message, "
+        "expected_tools=[\"memory\"], expected_save_content (a rubric describing "
+        "what a good save would look like — not exact text)."
+    ),
+    "save-correction": (
+        "Generate ONE closed-loop eval task (category: save-correction) where the "
+        "user corrects the agent on a recurring pattern (e.g. 'no, I use uv not "
+        "pip'). The agent SHOULD save the correction. Output a single JSON object "
+        "with fields: user_message, expected_tools=[\"memory\"], "
+        "expected_save_content."
+    ),
+    "dont-save-task-progress": (
+        "Generate ONE closed-loop eval task (category: dont-save-task-progress) "
+        "where the user asks the agent to complete a task (write code, fix a bug). "
+        "The agent SHOULD NOT save task progress to memory. Output a single JSON "
+        "object with fields: user_message, expected_tools=[], "
+        "forbidden_tools=[\"memory\"]."
+    ),
+    "dont-save-completed-work-log": (
+        "Generate ONE closed-loop eval task (category: dont-save-completed-work-log) "
+        "where the user asks for a summary of work done. The agent SHOULD NOT log "
+        "the work to memory. Output a single JSON object with fields: user_message, "
+        "expected_tools=[], forbidden_tools=[\"memory\"]."
+    ),
+    "declarative-vs-imperative": (
+        "Generate ONE closed-loop eval task (category: declarative-vs-imperative) "
+        "where the user states a preference in imperative form ('always respond "
+        "concisely'). The agent SHOULD save it in declarative form ('user prefers "
+        "concise responses'). Output a single JSON object with fields: "
+        "user_message, expected_tools=[\"memory\"], expected_save_content "
+        "(specifying the declarative-phrasing rubric)."
+    ),
+}
+
+
+def build_memory_guidance_dataset(
+    *,
+    lm_call,
+    n_per_category: int = 10,
+) -> list[dict]:
+    """Synthesize MEMORY_GUIDANCE closed-loop tasks, category by category.
+
+    For each entry of ``MEMORY_GUIDANCE_CATEGORIES`` the matching prompt is
+    sent to ``lm_call`` (prompt string in, response text out)
+    ``n_per_category`` times. Every response that parses to a JSON object
+    becomes one row; its ``task_id`` is overwritten with
+    ``"<category>-<index:03d>"`` so ids are unique whatever the LM emitted,
+    and ``expected_tools`` defaults to ``[]`` when absent. Responses that do
+    not parse are logged at WARNING and dropped rather than aborting the
+    build.
+
+    Returns the rows as one flat list of dicts, in category order, ready to be
+    written to a JSONL suite (consumable by ``TaskSuite.from_jsonl``).
+    """
+    rows: list[dict] = []
+    for category in MEMORY_GUIDANCE_CATEGORIES:
+        category_prompt = _MEMORY_GUIDANCE_CATEGORY_PROMPTS[category]
+        for attempt in range(n_per_category):
+            parsed = _parse_memory_task_row(lm_call(category_prompt))
+            if parsed is None:
+                logger.warning(
+                    "build_memory_guidance_dataset: unparseable row for "
+                    "category %s index %d", category, attempt,
+                )
+            else:
+                parsed["task_id"] = f"{category}-{attempt:03d}"
+                parsed.setdefault("expected_tools", [])
+                rows.append(parsed)
+    return rows
+
+
+def _parse_memory_task_row(raw: str) -> "dict | None":
+    """Parse one JSON object from an LM response; return the dict, or None if
+    none decodes. Fences, prose, and stray braces around the object are fine."""
+    try:
+        obj = json.loads(raw)
+    except (json.JSONDecodeError, TypeError):
+        text, obj = str(raw), None
+        start = text.find("{")
+        while obj is None and start != -1:  # raw_decode stops at the object's end
+            try:
+                obj, _ = json.JSONDecoder().raw_decode(text, start)
+            except json.JSONDecodeError:
+                start = text.find("{", start + 1)
+    return obj if isinstance(obj, dict) else None
diff --git a/evolution/validation/suites/memory_guidance.jsonl b/evolution/validation/suites/memory_guidance.jsonl
new file mode 100644
index 00000000..a91999c0
--- /dev/null
+++ b/evolution/validation/suites/memory_guidance.jsonl
@@ -0,0 +1,23 @@
+# Closed-loop validation suite for the Hermes MEMORY_GUIDANCE prompt section.
+# Compound verdict: Layer 1 = memory(save) trigger membership; Layer 2 =
+# content judge against expected_save_content rubric (save categories only).
+# Curated golden set across five categories. Full synthetic expansion is
+# produced by evolution.core.dataset_builder.build_memory_guidance_dataset.
+#
+# --- save-preference: durable preference stated → SHOULD save ---
+{"task_id": "save-pref-uv", "user_message": "Heads up for going forward: I always use uv for Python dependency management, never pip or poetry.", "expected_tools": ["memory"], "expected_save_content": "A durable, declarative fact that the user prefers/uses uv for Python dependency management. Should not be phrased as a one-off task or include transient detail."}
+{"task_id": "save-pref-tabs", "user_message": "Just so you know for all my projects: I use tabs, not spaces, for indentation.", "expected_tools": ["memory"], "expected_save_content": "A durable fact that the user prefers tabs over spaces for indentation across projects. Declarative, not imperative."}
+{"task_id": "save-pref-timezone", "user_message": "For scheduling things, remember I'm based in US Pacific time.", "expected_tools": ["memory"], "expected_save_content": "A durable fact recording the user's timezone (US Pacific). Stable, reusable across sessions."}
+# --- save-correction: user corrects a recurring agent pattern → SHOULD save ---
+{"task_id": "save-corr-pkgmgr", "user_message": "No — stop suggesting npm. I use pnpm for every JS project. Please remember that.", "expected_tools": ["memory"], "expected_save_content": "A durable fact that the user uses pnpm (not npm) for JS projects, capturing the correction so the agent stops re-suggesting npm."}
+{"task_id": "save-corr-name", "user_message": "You keep calling the service 'auth-svc'. Its real name is 'identity-gateway'. Remember that for next time.", "expected_tools": ["memory"], "expected_save_content": "A durable fact mapping the service's correct name (identity-gateway), capturing the user's correction."}
+# --- dont-save-task-progress: complete a task → SHOULD NOT save ---
+{"task_id": "nosave-fix-bug", "user_message": "There's an off-by-one in {fixture_dir}/loop.py — the range should be inclusive of the last index. Fix it.", "expected_tools": [], "forbidden_tools": ["memory"], "fixture_setup": {"loop.py": "def last_items(xs):\n    return xs[0:len(xs)-1]\n"}}
+{"task_id": "nosave-write-fn", "user_message": "Add a function `is_even(n)` to {fixture_dir}/util.py that returns True for even integers.", "expected_tools": [], "forbidden_tools": ["memory"], "fixture_setup": {"util.py": "# helpers\n"}}
+# --- dont-save-completed-work-log: summary of work done → SHOULD NOT save ---
+{"task_id": "nosave-summary", "user_message": "Give me a quick summary of what you changed in the last edit.", "expected_tools": [], "forbidden_tools": ["memory"]}
+{"task_id": "nosave-standup", "user_message": "Write a one-line standup update describing the bug we just fixed.", "expected_tools": [], "forbidden_tools": ["memory"]}
+# --- declarative-vs-imperative: imperative preference → SHOULD save, declaratively ---
+{"task_id": "decl-concise", "user_message": "Always answer me concisely — no preamble, just the answer.", "expected_tools": ["memory"], "expected_save_content": "The preference saved in DECLARATIVE form (e.g. 'user prefers concise answers without preamble'), not copied verbatim as an imperative directive ('always answer concisely')."}
+{"task_id": "decl-no-emoji", "user_message": "Never use emoji in your responses to me.", "expected_tools": ["memory"], "expected_save_content": "A declarative fact that the user prefers responses without emoji (e.g. 'user prefers no emoji in responses'), not stored as a raw imperative command."}
+{"task_id": "decl-tests-first", "user_message": "From now on, write tests before implementation when you work on my code.", "expected_tools": ["memory"], "expected_save_content": "A declarative fact capturing the user's preference for test-first development (e.g. 'user prefers tests written before implementation'), phrased as a durable preference rather than an imperative."}
diff --git a/tests/core/test_dataset_builder.py b/tests/core/test_dataset_builder.py
index b2e9f67a..57273920 100644
--- a/tests/core/test_dataset_builder.py
+++ b/tests/core/test_dataset_builder.py
@@ -171,3 +171,45 @@ def test_lm_constructed_with_bumped_max_tokens(self):
             f"max_tokens regressed from 16000 to {kwargs['max_tokens']}; "
             "JSON truncation will reappear at eval_dataset_size>=60"
         )
+
+
+class TestBuildMemoryGuidanceDataset:
+    def test_uses_all_five_categories(self):
+        from evolution.core.dataset_builder import build_memory_guidance_dataset
+
+        fake_lm = MagicMock()
+        fake_lm.return_value = (
+            '{"task_id": "raw", "user_message": "x", '
+            '"expected_tools": ["memory"], '
+            '"expected_save_content": "preference rubric"}'
+        )
+        examples = build_memory_guidance_dataset(lm_call=fake_lm, n_per_category=2)
+        # 5 categories × 2 tasks each
+        assert len(examples) == 10
+        categories = [
+            "save-preference",
+            "save-correction",
+            "dont-save-task-progress",
+            "dont-save-completed-work-log",
+            "declarative-vs-imperative",
+        ]
+        invoked_prompts = [c.args[0] for c in fake_lm.call_args_list]
+        for cat in categories:
+            assert any(cat in p for p in invoked_prompts), f"category {cat!r} not prompted"
+
+    def test_stamps_unique_task_ids(self):
+        from evolution.core.dataset_builder import build_memory_guidance_dataset
+
+        fake_lm = MagicMock(return_value=(
+            '{"user_message": "x", "expected_tools": ["memory"]}'
+        ))
+        examples = build_memory_guidance_dataset(lm_call=fake_lm, n_per_category=2)
+        ids = [e["task_id"] for e in examples]
+        assert len(ids) == len(set(ids)), "task_ids must be unique"
+
+    def test_skips_unparseable_rows(self):
+        from evolution.core.dataset_builder import build_memory_guidance_dataset
+
+        fake_lm = MagicMock(return_value="not json at all")
+        examples = build_memory_guidance_dataset(lm_call=fake_lm, n_per_category=1)
+        assert examples == []

From d17ead826cd2001e637bf563c5c5f7097ba20fa4 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Mon, 1 Jun 2026 08:02:15 -0600
Subject: [PATCH 13/23] feat(prompts): PromptSectionProposer
 (sentinel-preserving GEPA proposal fn)

---
 evolution/prompts/prompt_proposer.py  | 159 ++++++++++++++++++++++++++
 tests/prompts/test_prompt_proposer.py |  82 +++++++++++++
 2 files changed, 241 insertions(+)
 create mode 100644 evolution/prompts/prompt_proposer.py
 create mode 100644 tests/prompts/test_prompt_proposer.py

diff --git a/evolution/prompts/prompt_proposer.py b/evolution/prompts/prompt_proposer.py
new file mode 100644
index 00000000..b2d00d7d
--- /dev/null
+++ b/evolution/prompts/prompt_proposer.py
@@ -0,0 +1,159 @@
+"""GEPA instruction_proposer for prompt-section evolution.
+
+Mirrors ``BudgetAwareToolProposer``: subclasses ``BudgetAwareProposer`` for
+the budget-tracking infrastructure but installs a prompt-section reflection
+template whose hard constraint is sentinel preservation. ``__call__`` runs the
+inherited proposer LM, then passes the candidate through ``extract_and_rebuild``
+so only the sentinel-delimited region survives.
+
+On ``SentinelParseError`` the call re-raises (after incrementing
+``sentinel_failures``) rather than returning the parent unchanged — GEPA's
+reflective_mutation path skips the iteration, avoiding a phantom
+identical-to-parent candidate that would pollute the selection pool.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Mapping, Sequence
+
+import dspy
+
+from evolution.prompts.prompt_module import (
+    SentinelParseError,
+    _extract_from_sentinels,
+    _render_instructions,
+)
+from evolution.skills.budget_aware_proposer import BudgetAwareProposer
+
+logger = logging.getLogger(__name__)
+
+
+_PROMPT_PROPOSER_TEMPLATE = """\
+You are revising one section ({section_name}) of an agent's system prompt.
+The instruction below wraps the current candidate text between the markers
+`<!-- SECTION:{section_name} -->` and `<!-- /SECTION:{section_name} -->`.
+
+Hard constraint - sentinel preservation:
+Modify only the text between those two markers. Do not change the markers
+themselves, and do not add any text outside them.
+
+Length budget: at most {target_chars} characters for the section body (between
+the markers). The current body is {baseline_chars} characters.
+
+Hard constraint - grounding citation:
+Every change must quote or paraphrase a specific phrase from the feedback. If a
+failure is not actionable from the section text (model error, judge
+disagreement, out-of-distribution input), skip it.
+
+Your task: rewrite the current section to fix the failures shown below,
+modifying only the sentinel-delimited region for {section_name}.
+
+Steps:
+1. Read each failure in the feedback. Classify it as (a) the agent misapplied
+   existing guidance -> refine the wording, (b) the agent lacked guidance it
+   needed -> add it, or (c) not actionable from the section text -> skip.
+2. Apply changes only for (a) and (b), only inside the sentinel region.
+3. For each change, name the specific feedback phrase that grounded it.
+4. Match the voice and density of the existing section.
+5. If more additions are warranted than fit within {target_chars}, address the
+   most-grounded failures first; GEPA will run again with the updated baseline.
+
+If the feedback below is empty or contains no concrete failures, return the
+current instruction unchanged.
+
+Output the full instruction text (markers included, only the sentinel-delimited
+region modified). No preamble, no markdown fences, no explanation.
+"""
+
+
+def extract_and_rebuild(candidate: str, section_name: str) -> str:
+    """Re-render the instructions around the candidate's own section body.
+
+    Pure function — testable without LM mocks. The body is read from between
+    the sentinels of ``candidate``; a ``SentinelParseError`` from that
+    extraction (sentinels not preserved) propagates to the caller.
+    """
+    section_body = _extract_from_sentinels(candidate, section_name)
+    return _render_instructions(section_name, section_body)
+
+
+class _PromptProposalSignature(dspy.Signature):
+    """Placeholder; overwritten per-instance via with_instructions so the
+    section-specific template (section_name, target_chars, baseline_chars
+    baked in) is installed."""
+
+    current_instruction: str = dspy.InputField(
+        desc="The current instruction with the sentinel-wrapped section body"
+    )
+    examples_with_feedback: str = dspy.InputField(
+        desc="Failure feedback from the eval to ground refinements in"
+    )
+    improved_instruction: str = dspy.OutputField(
+        desc="The revised instruction with only the sentinel region modified"
+    )
+
+
+class PromptSectionProposer(BudgetAwareProposer):
+    """GEPA-compatible ProposalFn that rewrites one sentinel-delimited section."""
+
+    component_name = "passthrough.predict"  # candidate-dict key read and written
+
+    def __init__(
+        self,
+        section_name: str,
+        baseline_chars: int,
+        max_growth: float = 0.2,
+        safety_margin: float = 0.10,
+    ) -> None:
+        super().__init__(
+            baseline_chars=baseline_chars,
+            max_growth=max_growth,
+            safety_margin=safety_margin,
+        )
+        self.section_name = section_name
+        self.sentinel_failures = 0  # bumped once per SentinelParseError in __call__
+
+        template = _PROMPT_PROPOSER_TEMPLATE.format(
+            section_name=section_name,
+            target_chars=self.target_chars,
+            baseline_chars=baseline_chars,
+        )
+        self.propose = dspy.Predict(
+            _PromptProposalSignature.with_instructions(template)
+        )
+
+    def __call__(
+        self,
+        candidate: dict[str, str],
+        reflective_dataset: Mapping[str, Sequence[Mapping[str, Any]]],
+        components_to_update: list[str],
+    ) -> dict[str, str]:
+        if self.component_name not in components_to_update:
+            return {}  # our component was not selected for update
+        if self.component_name not in candidate:
+            return {}  # candidate carries no text for our component
+
+        current_instruction = candidate[self.component_name]
+        feedback = self._format_examples(
+            reflective_dataset.get(self.component_name, [])
+        )
+        prediction = self.propose(
+            current_instruction=current_instruction,
+            examples_with_feedback=feedback,
+        )
+        new_candidate = prediction.improved_instruction
+
+        try:
+            rebuilt = extract_and_rebuild(new_candidate, self.section_name)
+        except SentinelParseError as exc:
+            self.sentinel_failures += 1
+            excerpt = new_candidate[:200] + ("..." if len(new_candidate) > 200 else "")
+            logger.warning(
+                "PromptSectionProposer: sentinel parse failure (#%d) for %r: %s. "
+                "Candidate excerpt: %r",
+                self.sentinel_failures, self.section_name, exc, excerpt,
+            )
+            raise  # propagate rather than return the parent unchanged
+
+        return {self.component_name: rebuilt}
diff --git a/tests/prompts/test_prompt_proposer.py b/tests/prompts/test_prompt_proposer.py
new file mode 100644
index 00000000..41ecd45e
--- /dev/null
+++ b/tests/prompts/test_prompt_proposer.py
@@ -0,0 +1,82 @@
+"""Tests for PromptSectionProposer — sentinel-preserving GEPA proposal fn."""
+from __future__ import annotations
+
+from unittest.mock import MagicMock
+
+import dspy
+
+from evolution.prompts.prompt_module import (
+    _close_sentinel,
+    _open_sentinel,
+    _render_instructions,
+)
+from evolution.prompts.prompt_proposer import (
+    PromptSectionProposer,
+    extract_and_rebuild,
+)
+
+
+SECTION = "MEMORY_GUIDANCE"
+
+
+def _wrapped(body: str) -> str:
+    return _render_instructions(SECTION, body)
+
+
+def test_extract_and_rebuild_round_trips_sentinels():
+    candidate = _wrapped("a refined body")
+    rebuilt = extract_and_rebuild(candidate, SECTION)
+    # The rebuilt instructions still carry intact sentinels around the new body.
+    assert _open_sentinel(SECTION) in rebuilt
+    assert _close_sentinel(SECTION) in rebuilt
+    assert "a refined body" in rebuilt
+
+
+def test_proposer_only_acts_on_its_component():
+    proposer = PromptSectionProposer(
+        section_name=SECTION, baseline_chars=100,
+    )
+    # A request that doesn't include our component returns empty.
+    out = proposer(
+        candidate={"passthrough.predict": _wrapped("x")},
+        reflective_dataset={},
+        components_to_update=["something.else"],
+    )
+    assert out == {}
+
+
+def test_proposer_rebuilds_sentinel_region(monkeypatch):
+    proposer = PromptSectionProposer(section_name=SECTION, baseline_chars=100)
+
+    # Stub the LM proposal: return a full-instructions string with the
+    # sentinel region edited.
+    fake_pred = MagicMock()
+    fake_pred.improved_instruction = _wrapped("LM-revised memory guidance")
+    proposer.propose = MagicMock(return_value=fake_pred)
+
+    out = proposer(
+        candidate={"passthrough.predict": _wrapped("original")},
+        reflective_dataset={"passthrough.predict": [{"Feedback": "be clearer"}]},
+        components_to_update=["passthrough.predict"],
+    )
+    assert "passthrough.predict" in out
+    assert "LM-revised memory guidance" in out["passthrough.predict"]
+    assert _open_sentinel(SECTION) in out["passthrough.predict"]
+
+
+def test_proposer_raises_on_sentinel_loss():
+    from evolution.prompts.prompt_module import SentinelParseError
+
+    proposer = PromptSectionProposer(section_name=SECTION, baseline_chars=100)
+    fake_pred = MagicMock()
+    fake_pred.improved_instruction = "the model dropped the sentinels entirely"
+    proposer.propose = MagicMock(return_value=fake_pred)
+
+    import pytest
+    with pytest.raises(SentinelParseError):
+        proposer(
+            candidate={"passthrough.predict": _wrapped("original")},
+            reflective_dataset={"passthrough.predict": [{"Feedback": "x"}]},
+            components_to_update=["passthrough.predict"],
+        )
+    assert proposer.sentinel_failures == 1

From 6b13272af13f5e8d995e731cda1042c701594c94 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Mon, 1 Jun 2026 08:12:22 -0600
Subject: [PATCH 14/23] =?UTF-8?q?feat(prompts):=20evolve=5Fprompt=5Fsectio?=
 =?UTF-8?q?n=20CLI=20=E2=80=94=20GEPA=20+=20saturation=20+=20budget=20+=20?=
 =?UTF-8?q?closed-loop=20deploy=20gate?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wires HermesPromptSectionInstaller + HermesAgentRunner + ClosedLoopValidator
into a full-parity evolution pipeline for prompt sections. GEPA mutates via
PromptSectionProposer; the inner loop scores through a serialized memoizing
splice scorer; the deploy gate runs baseline-vs-evolved closed-loop on the
holdout suite. Saturation pre-flight default-denies a saturated baseline;
budget cap aborts on overrun.

ClosedLoopValidator's Layer 2 hook becomes a per-task judge factory so the
content judge can read each task's expected_save_content rubric. The memoizing
scorer serializes splice+run under a lock — dspy.Evaluate is multi-threaded but
prompt_builder.py is a single shared file. PR automation is deferred for prompt
sections (copying a full evolved file over origin/base would pollute the diff
with the local override-hook commit).
---
 evolution/prompts/evolve_prompt_section.py  | 632 ++++++++++++++++++++
 evolution/prompts/prompt_judge.py           |  19 +-
 evolution/validation/validator.py           |  21 +-
 tests/prompts/test_evolve_prompt_section.py | 110 ++++
 tests/validation/test_validator.py          |  16 +-
 5 files changed, 784 insertions(+), 14 deletions(-)
 create mode 100644 evolution/prompts/evolve_prompt_section.py
 create mode 100644 tests/prompts/test_evolve_prompt_section.py

diff --git a/evolution/prompts/evolve_prompt_section.py b/evolution/prompts/evolve_prompt_section.py
new file mode 100644
index 00000000..288b8737
--- /dev/null
+++ b/evolution/prompts/evolve_prompt_section.py
@@ -0,0 +1,632 @@
+"""Evolve a named system-prompt section in Hermes ``prompt_builder.py`` via DSPy + GEPA.
+
+Mirrors ``evolution.tools.evolve_tool`` but for prompt sections, with the
+splice-and-restore integration model (see ``HermesPromptSectionInstaller``).
+The whole evaluation is behavioral: every candidate is spliced into the live
+``prompt_builder.py`` and scored by a real ``hermes -z`` subprocess, so the
+deploy gate is a ``ClosedLoopValidator`` run rather than a synthetic-judge
+holdout.
+
+Usage:
+    python -m evolution.prompts.evolve_prompt_section \\
+        --section MEMORY_GUIDANCE \\
+        --hermes-repo ~/src/NousResearch/hermes-agent \\
+        --tasks evolution/validation/suites/memory_guidance.jsonl \\
+        --iterations 10
+"""
+
+from __future__ import annotations
+
+import fcntl
+import json
+import logging
+import random
+import sys
+import tempfile
+import threading
+import time
+from contextlib import contextmanager
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Iterator, Optional
+
+import click
+import dspy
+from rich.console import Console
+
+from evolution.core.config import EvolutionConfig
+from evolution.core.hermes_provider import instantiate_lm, resolve_default_lm
+from evolution.core.lm_timing_callback import (
+    COST_LEDGER,
+    CostCeilingExceeded,
+    register_litellm_cost_callback,
+    register_litellm_failure_callback,
+)
+from evolution.core.pr_automation import disabled_pr_block
+from evolution.core.quality_gate import write_gate_decision
+from evolution.core.run_inputs import build_run_inputs
+from evolution.core.saturation_check import (
+    is_non_interactive,
+    interactive_confirm,
+    render_saturation_panel,
+    saturation_preflight,
+)
+from evolution.prompts.hermes_prompt_source import HermesPromptSource
+from evolution.prompts.prompt_judge import (
+    SaveCallJudge,
+    judge_save_calls,
+    make_memoizing_splice_scorer,
+    make_prompt_fitness_metric,
+)
+from evolution.prompts.prompt_module import PromptModule, _extract_from_sentinels
+from evolution.prompts.prompt_proposer import PromptSectionProposer
+from evolution.validation.agent_runner import TaskRunContext
+from evolution.validation.artifact_installer import (
+    HermesPromptSectionInstaller,
+    atomic_write_bytes,
+)
+from evolution.validation.hermes_runner import (
+    DEFAULT_TASK_TIMEOUT_SECONDS,
+    HermesAgentRunner,
+)
+from evolution.validation.report import score_task
+from evolution.validation.task import Task, TaskSuite
+from evolution.validation.validator import ClosedLoopValidator, ValidationInputs
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+    datefmt="%Y/%m/%d %H:%M:%S",
+)
+logger = logging.getLogger(__name__)
+console = Console()
+
+_GATE_SCHEMA_VERSION = "5"
+_BACKUP_SUFFIX = ".cl_backup"
+_LOCK_FILENAME = ".cl_validation.lock"
+
+
+def _split_train_holdout(
+    tasks: tuple[Task, ...], *, holdout_ratio: float, seed: int
+) -> tuple[list[Task], list[Task]]:
+    """Shuffle ``tasks`` with a seeded RNG and cut them into (train, holdout).
+
+    With >= 2 tasks each side gets at least one; a lone task goes to holdout.
+    """
+    shuffled = list(tasks)
+    random.Random(seed).shuffle(shuffled)
+    total = len(shuffled)
+    cut = max(1, int(round(total * holdout_ratio)))
+    if total > 1:
+        cut = min(cut, total - 1)
+    else:
+        cut = total
+    return shuffled[cut:], shuffled[:cut]
+
+
+def _behavioral_examples(tasks: list[Task]) -> list[dspy.Example]:
+    """Build GEPA examples whose inputs drive ``PromptModule.forward`` into the
+    behavioral branch (task message + closed_loop_task_id)."""
+    return [
+        dspy.Example(
+            task=t.user_message,
+            closed_loop_task_id=t.task_id,
+        ).with_inputs("task", "closed_loop_task_id")
+        for t in tasks
+    ]
+
+
+def _make_layer2_factory(judge: Optional[SaveCallJudge]):
+    """Per-task Layer 2 scorer: binds the task's rubric + message into a
+    ``score_task``-shaped ``Callable[[list[dict]], float]``. Returns ``None``
+    for tasks without an ``expected_save_content`` rubric (no content to
+    judge)."""
+
+    def factory(task: Task):
+        if task.expected_save_content is None:
+            return None
+
+        def judge_fn(memory_calls: list[dict]) -> float:
+            return judge_save_calls(
+                judge=judge,
+                calls=memory_calls,
+                expected_content=task.expected_save_content,
+                task_text=task.user_message,
+            )
+
+        return judge_fn
+
+    return factory
+
+
+def _section_text_from_candidate(candidate: Any, section_name: str) -> str:
+    """Return the sentinel-delimited section body of a GEPA candidate, given
+    either a component dict or a module exposing ``passthrough.predict``."""
+    if not isinstance(candidate, dict):
+        signature = candidate.passthrough.predict.signature
+        return _extract_from_sentinels(signature.instructions or "", section_name)
+    rendered = candidate.get("passthrough.predict", "")
+    return _extract_from_sentinels(rendered, section_name)
+
+
+@contextmanager
+def _prompt_builder_guard(target_path: Path) -> Iterator[None]:
+    """Back up ``prompt_builder.py`` + hold the shared closed-loop flock for the
+    duration of GEPA evolution, then restore the original bytes on exit.
+
+    The lock is taken before the stale-backup check, so a concurrent run's live
+    backup surfaces as "another run holds the lock", never as stale. Raises
+    ``RuntimeError`` in both cases. Do not nest inside the deploy-gate
+    ``ClosedLoopValidator``, which acquires the same lock itself.
+    """
+    backup_path = target_path.with_suffix(target_path.suffix + _BACKUP_SUFFIX)
+    lock_path = target_path.parent / _LOCK_FILENAME
+    lock_fd = open(lock_path, "w")
+    try:
+        try:
+            fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
+        except BlockingIOError as exc:
+            raise RuntimeError(
+                f"Another harness run holds {lock_path}. "
+                f"Wait for it to finish."
+            ) from exc
+        if backup_path.exists():
+            raise RuntimeError(
+                f"Stale backup at {backup_path} — a prior run did not clean up. "
+                f"Restore {target_path} from it manually, then retry."
+            )
+        atomic_write_bytes(backup_path, target_path.read_bytes())
+        try:
+            yield
+        finally:
+            atomic_write_bytes(target_path, backup_path.read_bytes())
+            backup_path.unlink(missing_ok=True)
+    finally:
+        fcntl.flock(lock_fd.fileno(), fcntl.LOCK_UN)
+        lock_fd.close()
+
+
+def _run_one_task_score(
+    task: Task,
+    *,
+    runner: HermesAgentRunner,
+    layer2_factory,
+    layer2_threshold: float,
+) -> float:
+    """Score one task against whichever section is currently spliced in.
+
+    Writes the task's fixtures into a throwaway directory, runs the agent, and
+    returns 1.0 on pass, else 0.0. Abstentions also score 0.0 in-loop; the
+    deploy gate is what handles abstentions properly."""
+    with tempfile.TemporaryDirectory(prefix="ps_inner_") as scratch:
+        fixture_dir = Path(scratch)
+        # Materialize the task's fixture files under the scratch directory.
+        for rel, body in task.fixture_setup.items():
+            target = fixture_dir / rel
+            target.parent.mkdir(parents=True, exist_ok=True)
+            target.write_text(body)
+        message = task.render_message(fixture_dir)
+        ctx = TaskRunContext(user_message=message, fixture_dir=fixture_dir)
+        run = runner.run(ctx)
+        passed, abstained = score_task(
+            expected_tools=task.expected_tools,
+            forbidden_tools=task.forbidden_tools,
+            run=run,
+            test_command=task.test_command,
+            fixture_dir=fixture_dir,
+            layer2_judge_fn=layer2_factory(task),
+            layer2_threshold=layer2_threshold,
+        )
+        # An abstention is scored exactly like an outright failure here.
+        return 1.0 if passed and not abstained else 0.0
+
+
+def evolve_prompt_section(
+    section_name: str,
+    hermes_repo: Path,
+    tasks_path: Path,
+    *,
+    iterations: int = 10,
+    holdout_ratio: float = 0.5,
+    seed: int = 42,
+    max_growth: float = 0.2,
+    optimizer_model: Optional[str] = None,
+    reflection_model: Optional[str] = None,
+    eval_model: Optional[str] = None,
+    agent_model: Optional[str] = None,
+    layer2_threshold: float = 0.7,
+    task_timeout_seconds: int = DEFAULT_TASK_TIMEOUT_SECONDS,
+    max_total_cost_usd: Optional[float] = 150.0,
+    gepa_minibatch_size: int = 3,
+    gepa_acceptance: str = "improvement-or-equal",
+    skip_saturation_check: bool = False,
+    force_saturation_check: bool = False,
+    apply: bool = False,
+    create_pr_flag: bool = False,
+    dry_run: bool = False,
+    output_dir: Optional[Path] = None,
+) -> dict[str, Any]:
+    """Evolve one prompt section end-to-end. Returns a dict with ``decision`` and ``reason``."""
+    hermes_repo = Path(hermes_repo).resolve()
+    source = HermesPromptSource(hermes_repo)
+    baseline_text = source.read(section_name)
+    baseline_chars = len(baseline_text)
+    # --- Task suite: load the JSONL, then split into train / holdout ---
+    suite = TaskSuite.from_jsonl(tasks_path)
+    train_tasks, holdout_tasks = _split_train_holdout(
+        suite.tasks, holdout_ratio=holdout_ratio, seed=seed
+    )
+    # --- Output directory: output/prompts/<section>/<timestamp> unless given ---
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    if output_dir is None:
+        output_dir = Path("output") / "prompts" / section_name / timestamp
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # --- Run configuration (the judge model reuses ``eval_model``) ---
+    config = EvolutionConfig(
+        iterations=iterations,
+        optimizer_model=optimizer_model,
+        reflection_model=reflection_model,
+        eval_model=eval_model,
+        judge_model=eval_model,
+        seed=seed,
+        reflection_minibatch_size=gepa_minibatch_size,
+        gepa_acceptance=gepa_acceptance.replace("-", "_"),
+    )
+
+    console.print(
+        f"\n[bold cyan]Prompt Section Self-Evolution[/bold cyan] — "
+        f"Evolving section: [bold]{section_name}[/bold]\n"
+    )
+    console.print(f"  Hermes repo: {hermes_repo}")
+    console.print(f"  Baseline ({baseline_chars} chars): {baseline_text[:80]}…")
+    console.print(
+        f"  Tasks: {len(suite.tasks)} ({len(train_tasks)} train / "
+        f"{len(holdout_tasks)} holdout), sha256 {suite.sha256[:12]}…"
+    )
+    console.print(f"  Output dir: {output_dir}")
+    # run_inputs + section_payload are embedded in every gate decision below.
+    run_inputs = build_run_inputs(
+        config=config,
+        iterations=iterations,
+        optimizer_model=optimizer_model,
+        quality_gate_preset="default",
+        eval_source="closed_loop",
+        gepa_acceptance=config.gepa_acceptance,
+        create_pr=create_pr_flag,
+    )
+    section_payload = {
+        "artifact_type": "prompt_section",
+        "target_section": section_name,
+        "baseline_chars": baseline_chars,
+    }
+
+    if dry_run:
+        console.print("[yellow]Dry run — skipping all LM/agent work.[/yellow]")
+        # Exercise the module + proposer wiring without spending money.
+        _ = PromptModule(section_name, baseline_text)
+        _ = PromptSectionProposer(section_name, baseline_chars=baseline_chars)
+        decision_payload = {
+            "schema_version": _GATE_SCHEMA_VERSION,
+            "decision": "dry_run",
+            "reason": "dry_run",
+            "decision_signal": "closed_loop",
+            "run_inputs": run_inputs,
+            "pr_created": disabled_pr_block(),
+            **section_payload,
+        }
+        write_gate_decision(output_dir, decision_payload)
+        return {"decision": "dry_run", "reason": "dry_run"}
+    # --- Cost tracking: register callbacks, reset the ledger, set the ceiling ---
+    register_litellm_failure_callback()
+    register_litellm_cost_callback()
+    COST_LEDGER.reset()
+    COST_LEDGER.set_ceiling(max_total_cost_usd)
+    if max_total_cost_usd is not None:
+        console.print(f"  Cost ceiling: ${max_total_cost_usd:.2f}")
+    # --- Closed-loop plumbing: installer, agent runner, Layer 2 judge ---
+    installer = HermesPromptSectionInstaller(hermes_repo, section_name)
+    runner = HermesAgentRunner(
+        timeout_seconds=task_timeout_seconds, model=agent_model
+    )
+    judge = SaveCallJudge(config)
+    layer2_factory = _make_layer2_factory(judge)
+
+    tasks_by_id = {t.task_id: t for t in suite.tasks}
+
+    def install_candidate(candidate_text: str) -> None:
+        source.write(section_name, candidate_text)
+
+    def score_task_id(task_id: str) -> float:
+        return _run_one_task_score(
+            tasks_by_id[task_id],
+            runner=runner,
+            layer2_factory=layer2_factory,
+            layer2_threshold=layer2_threshold,
+        )
+
+    # One lock serializes splice+run across dspy.Evaluate's thread pool — the
+    # spliced prompt_builder.py is a single shared mutable file.
+    scorer = make_memoizing_splice_scorer(
+        install_fn=install_candidate,
+        score_fn=score_task_id,
+        lock=threading.Lock(),
+    )
+
+    metric = make_prompt_fitness_metric(
+        baseline_text=baseline_text,
+        max_growth=max_growth,
+        closed_loop_scorer=scorer,
+    )
+    # --- LMs: temperature-0 eval LM; temperature-1, uncached reflection LM ---
+    eval_lm = instantiate_lm(
+        resolve_default_lm(role="eval", explicit_model=eval_model),
+        temperature=0.0, request_timeout=120, num_retries=3,
+    )
+    reflection_lm = instantiate_lm(
+        resolve_default_lm(
+            role="reflection", explicit_model=reflection_model or optimizer_model
+        ),
+        temperature=1.0, max_tokens=32000, cache=False,
+        request_timeout=300, num_retries=2,
+    )
+    # --- GEPA inputs: baseline module, proposer, train / val examples ---
+    baseline_module = PromptModule(section_name, baseline_text)
+    proposer = PromptSectionProposer(section_name, baseline_chars=baseline_chars)
+    trainset = _behavioral_examples(train_tasks)
+    valset = _behavioral_examples(holdout_tasks)
+
+    try:
+        start_time = time.time()
+        with _prompt_builder_guard(installer.target_path):
+            # --- Saturation pre-flight (baseline behavior on holdout) ---
+            if not skip_saturation_check:
+                sat_report = saturation_preflight(
+                    baseline_module=baseline_module,
+                    holdout_examples=_behavioral_examples(holdout_tasks),
+                    metric=metric,
+                    lm=eval_lm,
+                    baseline_artifact_text=baseline_text,
+                )
+                render_saturation_panel(sat_report, console=console)
+                if sat_report.band != "healthy" and not force_saturation_check:
+                    if is_non_interactive():
+                        console.print(
+                            "[yellow]Non-interactive context; refusing to "
+                            "proceed (saturated baseline). Pass "
+                            "--force-saturation-check to override.[/yellow]"
+                        )
+                        write_gate_decision(output_dir, {
+                            "schema_version": _GATE_SCHEMA_VERSION,
+                            "decision": "denied",
+                            "reason": "saturated_baseline",
+                            "decision_signal": "closed_loop",
+                            "saturation_band": sat_report.band,
+                            "run_inputs": run_inputs,
+                            "pr_created": disabled_pr_block(),
+                            **section_payload,
+                        })
+                        return {"decision": "denied", "reason": "saturated_baseline"}
+                    if not interactive_confirm():
+                        console.print("[yellow]Aborted by user.[/yellow]")
+                        return {"decision": "aborted", "reason": "user_abort"}
+
+            # --- GEPA optimization ---
+            console.print(
+                f"\n[bold cyan]Running GEPA (max_full_evals={iterations})[/bold cyan]\n"
+            )
+            optimizer = dspy.GEPA(
+                metric=metric,
+                max_full_evals=iterations,
+                reflection_lm=reflection_lm,
+                seed=config.seed,
+                track_stats=True,
+                instruction_proposer=proposer,
+                reflection_minibatch_size=config.reflection_minibatch_size,
+                gepa_kwargs={"acceptance_criterion": config.gepa_acceptance},
+            )
+            optimized = optimizer.compile(
+                baseline_module, trainset=trainset, valset=valset
+            )
+
+        # Guard released here — prompt_builder.py is restored to baseline.
+        elapsed = time.time() - start_time
+
+        if hasattr(optimized, "detailed_results"):
+            details = optimized.detailed_results
+            evolved_text = _section_text_from_candidate(
+                details.candidates[details.best_idx], section_name
+            )
+            console.print(
+                f"\n[bold]Candidate selection[/bold]: GEPA val-argmax "
+                f"(candidate {details.best_idx}, "
+                f"val={details.val_aggregate_scores[details.best_idx]:.3f}, "
+                f"{len(evolved_text)} chars)"
+            )
+        else:
+            evolved_text = optimized.section_text
+
+        # --- Deploy gate: closed-loop baseline vs evolved on the holdout suite ---
+        console.print(
+            f"\n[bold]Deploy gate[/bold]: closed-loop on "
+            f"{len(holdout_tasks)} holdout tasks"
+        )
+        holdout_suite = TaskSuite(
+            path=suite.path, sha256=suite.sha256, tasks=tuple(holdout_tasks)
+        )
+        baseline_file = output_dir / "baseline_section.txt"
+        evolved_file = output_dir / "evolved_section.txt"
+        baseline_file.write_text(baseline_text, encoding="utf-8")
+        evolved_file.write_text(evolved_text, encoding="utf-8")
+
+        validator = ClosedLoopValidator(
+            installer=installer,
+            runner=runner,
+            layer2_judge_factory=layer2_factory,
+            layer2_threshold=layer2_threshold,
+        )
+        report = validator.validate(ValidationInputs(
+            tool_name=section_name,
+            suite=holdout_suite,
+            baseline_artifact=baseline_file,
+            evolved_artifact=evolved_file,
+        ))
+        deploy = report.decision == "pass"
+    except CostCeilingExceeded as exc:
+        console.print(f"[red]✗ Cost ceiling exceeded: {exc}[/red]")
+        write_gate_decision(output_dir, {
+            "schema_version": _GATE_SCHEMA_VERSION,
+            "decision": "aborted",
+            "reason": "cost_ceiling_exceeded",
+            "decision_signal": "closed_loop",
+            "cost": COST_LEDGER.summary(),
+            "run_inputs": run_inputs,
+            "pr_created": disabled_pr_block(),
+            **section_payload,
+        })
+        return {"decision": "aborted", "reason": "cost_ceiling_exceeded"}
+
+    # PR automation for prompt sections is deferred: create_pr copies a full
+    # evolved file over origin/<base>'s prompt_builder.py, but our local
+    # checkout carries the (unmerged) override-hook commit, which would
+    # pollute the PR diff with unrelated changes. Until a section-scoped PR
+    # path lands, --create-pr is recorded as skipped; use --apply + a manual PR.
+    pr_block = disabled_pr_block()
+    if create_pr_flag:
+        pr_block = {
+            "status": "skipped",
+            "reason": "prompt-section PR automation deferred (would pollute diff "
+                      "with the local override-hook commit); use --apply + manual PR",
+            "url": None,
+        }
+    # --- Final gate decision record (written for deploy and reject alike) ---
+    decision_payload = {
+        "schema_version": _GATE_SCHEMA_VERSION,
+        "decision": "deploy" if deploy else "reject",
+        "reason": "passed" if deploy else "closed_loop_gate",
+        "decision_signal": "closed_loop",
+        "baseline_chars": baseline_chars,
+        "evolved_chars": len(evolved_text),
+        "growth_pct": (len(evolved_text) - baseline_chars) / max(1, baseline_chars),
+        "closed_loop": {
+            "decision": report.decision,
+            "decision_reasons": report.decision_reasons,
+            "baseline_pass_rate": report.baseline.pass_rate,
+            "evolved_pass_rate": report.evolved.pass_rate,
+            "n_wins": report.delta.n_wins,
+            "n_losses": report.delta.n_losses,
+            "n_ties": report.delta.n_ties,
+        },
+        "sentinel_failures": proposer.sentinel_failures,
+        "elapsed_seconds": elapsed,
+        "cost": COST_LEDGER.summary(),
+        "run_inputs": run_inputs,
+        "pr_created": pr_block,
+        **section_payload,
+    }
+    gate_path = write_gate_decision(output_dir, decision_payload)
+    console.print(f"  [dim]Gate decision logged to {gate_path}[/dim]")
+
+    if not deploy:
+        console.print(
+            f"[red]✗ Evolved section REJECTED by closed-loop gate "
+            f"({report.decision}) — not deploying[/red]"
+        )
+        return {"decision": "reject", "reason": "closed_loop_gate"}
+
+    console.print(
+        f"[green]✓ Evolved section PASSED "
+        f"(baseline {report.baseline.pass_rate:.2f} → "
+        f"evolved {report.evolved.pass_rate:.2f}, "
+        f"{report.delta.n_wins}W/{report.delta.n_losses}L)[/green]"
+    )
+    if apply:
+        source.write(section_name, evolved_text)
+        console.print(
+            f"  [green]✓ Applied evolved {section_name} to {installer.target_path}[/green]"
+        )
+
+    return {
+        "decision": "deploy",
+        "reason": "passed",
+        "evolved_chars": len(evolved_text),
+        "applied": apply,
+    }
+
+
+@click.command()
+@click.option("--section", "section_name", required=True,
+              help="The prompt_builder.py string constant to evolve (e.g. MEMORY_GUIDANCE).")
+@click.option("--hermes-repo", required=True,
+              type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
+              help="Path to your hermes-agent checkout.")
+@click.option("--tasks", "tasks_path", required=True,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path),
+              help="Path to a JSONL eval suite (e.g. suites/memory_guidance.jsonl).")
+@click.option("--iterations", default=10, type=click.IntRange(min=1),
+              help="GEPA max_full_evals (default 10).")
+@click.option("--holdout-ratio", default=0.5, type=click.FloatRange(0.0, 1.0),
+              help="Fraction of tasks held out for the deploy gate (default 0.5).")
+@click.option("--seed", default=42, type=int, help="Split + GEPA seed.")
+@click.option("--max-growth", default=0.2, type=float,
+              help="Section length budget as a fraction over baseline (default 0.2).")
+@click.option("--optimizer-model", default=None)
+@click.option("--reflection-model", default=None)
+@click.option("--eval-model", default=None, help="Judge model for Layer 2 content scoring.")
+@click.option("--agent-model", default=None,
+              help="Model the hermes -z agent runs as (deliberately weaker exposes more signal).")
+@click.option("--layer2-threshold", default=0.7, type=click.FloatRange(0.0, 1.0),
+              help="Min content-judge score for a save task to pass (default 0.7).")
+@click.option("--task-timeout-seconds", default=DEFAULT_TASK_TIMEOUT_SECONDS,
+              type=click.IntRange(min=1))
+@click.option("--max-cost-usd", "max_total_cost_usd", default=150.0, type=float,
+              help="Abort if cumulative spend exceeds this (default $150).")
+@click.option("--gepa-minibatch-size", default=3, type=click.IntRange(min=1))
+@click.option("--gepa-acceptance", default="improvement-or-equal",
+              type=click.Choice(["improvement-or-equal", "strict-improvement"]))
+@click.option("--skip-saturation-check", is_flag=True, default=False)
+@click.option("--force-saturation-check", is_flag=True, default=False,
+              help="Proceed even if the baseline looks saturated.")
+@click.option("--apply", is_flag=True, default=False,
+              help="On a passing gate, write the evolved section into prompt_builder.py.")
+@click.option("--create-pr", "create_pr_flag", is_flag=True, default=False,
+              help="(Deferred for prompt sections — recorded as skipped.)")
+@click.option("--dry-run", is_flag=True, default=False,
+              help="Exercise wiring without any LM/agent calls.")
+@click.option("--output-dir", default=None,
+              type=click.Path(file_okay=False, dir_okay=True, path_type=Path))
+def main(section_name, hermes_repo, tasks_path, iterations, holdout_ratio, seed,
+         max_growth, optimizer_model, reflection_model, eval_model, agent_model,
+         layer2_threshold, task_timeout_seconds, max_total_cost_usd,
+         gepa_minibatch_size, gepa_acceptance, skip_saturation_check,
+         force_saturation_check, apply, create_pr_flag, dry_run, output_dir):
+    """Evolve one Hermes system-prompt section via GEPA + closed-loop validation."""
+    # Forward every CLI option to the library entry point unchanged.
+    outcome = evolve_prompt_section(
+        section_name=section_name,
+        hermes_repo=hermes_repo,
+        tasks_path=tasks_path,
+        iterations=iterations,
+        holdout_ratio=holdout_ratio,
+        seed=seed,
+        max_growth=max_growth,
+        optimizer_model=optimizer_model,
+        reflection_model=reflection_model,
+        eval_model=eval_model,
+        agent_model=agent_model,
+        layer2_threshold=layer2_threshold,
+        task_timeout_seconds=task_timeout_seconds,
+        max_total_cost_usd=max_total_cost_usd,
+        gepa_minibatch_size=gepa_minibatch_size,
+        gepa_acceptance=gepa_acceptance,
+        skip_saturation_check=skip_saturation_check,
+        force_saturation_check=force_saturation_check,
+        apply=apply, create_pr_flag=create_pr_flag, dry_run=dry_run,
+        output_dir=output_dir,
+    )
+    succeeded = outcome["decision"] in {"deploy", "dry_run"}
+    sys.exit(0 if succeeded else 1)
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    main()  # click command: reads its options from the command line
diff --git a/evolution/prompts/prompt_judge.py b/evolution/prompts/prompt_judge.py
index b4e705a8..8d5a9624 100644
--- a/evolution/prompts/prompt_judge.py
+++ b/evolution/prompts/prompt_judge.py
@@ -7,6 +7,7 @@
 from __future__ import annotations
 
 import logging
+import threading
 from typing import Any, Callable, Optional
 
 import dspy
@@ -164,6 +165,7 @@ def make_memoizing_splice_scorer(
     *,
     install_fn: Callable[[str], None],
     score_fn: Callable[[str], float],
+    lock: Optional[threading.Lock] = None,
 ) -> Callable[[str, str], float]:
     """Build ``closed_loop_scorer(task_id, candidate_text) -> float`` that
     splices a candidate only when it changes.
@@ -174,16 +176,25 @@ def make_memoizing_splice_scorer(
     tasks for the same candidate reuse the live splice. ``score_fn(task_id)``
     runs the task through the agent with whatever candidate is installed.
 
+    The splice + run is serialized under ``lock`` (a fresh ``threading.Lock``
+    by default). ``dspy.Evaluate`` scores with a thread pool, but the spliced
+    ``prompt_builder.py`` is one shared mutable file — without serialization a
+    second thread could re-splice a different candidate while the first thread's
+    ``hermes -z`` subprocess is mid-read. Behavioral scoring is therefore
+    effectively serial; that's an accepted v1 cost of splice-and-restore.
+
     Backup/restore of the mutated source is the caller's responsibility — wrap
     the whole GEPA run, not each call (the per-run guard mirrors
     ``ClosedLoopValidator``'s splice-once-per-phase shape).
     """
     state: dict[str, Any] = {"installed": _UNSET}
+    lock = lock if lock is not None else threading.Lock()
 
     def scorer(task_id: str, candidate_text: str) -> float:
-        if state["installed"] != candidate_text:
-            install_fn(candidate_text)
-            state["installed"] = candidate_text
-        return score_fn(task_id)
+        with lock:
+            if state["installed"] != candidate_text:
+                install_fn(candidate_text)
+                state["installed"] = candidate_text
+            return score_fn(task_id)
 
     return scorer
diff --git a/evolution/validation/validator.py b/evolution/validation/validator.py
index 2f788c93..de248f07 100644
--- a/evolution/validation/validator.py
+++ b/evolution/validation/validator.py
@@ -85,15 +85,19 @@ def __init__(
         installer: ArtifactInstaller,
         runner: AgentRunner,
         *,
-        layer2_judge_fn: Optional[Callable[[list[dict]], float]] = None,
+        layer2_judge_factory: Optional[
+            Callable[[Task], Optional[Callable[[list[dict]], float]]]
+        ] = None,
         layer2_threshold: float = 0.7,
     ) -> None:
         self.installer = installer
         self.runner = runner
-        # Optional compound-verdict Layer 2 (prompt-section suites). When
-        # unset, scoring is Layer 1 only — the tool-description path is
-        # unchanged.
-        self.layer2_judge_fn = layer2_judge_fn
+        # Optional compound-verdict Layer 2 (prompt-section suites). The
+        # factory builds a per-task scorer from the task — prompt-section
+        # judging needs the task's expected_save_content rubric and message,
+        # which a single global fn couldn't carry. When unset, scoring is
+        # Layer 1 only and the tool-description path is unchanged.
+        self.layer2_judge_factory = layer2_judge_factory
         self.layer2_threshold = layer2_threshold
 
     def validate(self, inputs: ValidationInputs) -> ValidationReport:
@@ -155,13 +159,18 @@ def _run_one_task(self, task: Task) -> TaskResult:
                 skills_src=getattr(self.installer, "skills_src", None),
             )
             run = self.runner.run(ctx)
+            layer2_judge_fn = (
+                self.layer2_judge_factory(task)
+                if self.layer2_judge_factory is not None
+                else None
+            )
             passed, abstained = score_task(
                 expected_tools=task.expected_tools,
                 forbidden_tools=task.forbidden_tools,
                 run=run,
                 test_command=task.test_command,
                 fixture_dir=fixture_dir,
-                layer2_judge_fn=self.layer2_judge_fn,
+                layer2_judge_fn=layer2_judge_fn,
                 layer2_threshold=self.layer2_threshold,
             )
             return TaskResult(
diff --git a/tests/prompts/test_evolve_prompt_section.py b/tests/prompts/test_evolve_prompt_section.py
new file mode 100644
index 00000000..34081c58
--- /dev/null
+++ b/tests/prompts/test_evolve_prompt_section.py
@@ -0,0 +1,110 @@
+"""Wiring tests for evolve_prompt_section — pure helpers + dry-run (no LM/agent)."""
+from __future__ import annotations
+
+import json
+import textwrap
+from pathlib import Path
+
+from click.testing import CliRunner
+
+from evolution.prompts.evolve_prompt_section import (
+    _make_layer2_factory,
+    _section_text_from_candidate,
+    _split_train_holdout,
+    evolve_prompt_section,
+    main,
+)
+from evolution.prompts.prompt_module import PromptModule
+from evolution.validation.task import Task
+
+
+def _task(task_id: str, rubric: str | None = None) -> Task:
+    """Minimal memory-tool ``Task`` with an optional save-content rubric."""
+    fields = {"task_id": task_id, "user_message": "m", "expected_tools": ("memory",)}
+    fields["expected_save_content"] = rubric
+    return Task(**fields)
+
+
+def test_split_is_deterministic_and_non_empty():
+    """Same seed, same split; both sides are non-empty and cover every task."""
+    pool = tuple(_task(f"t{n}") for n in range(10))
+    first = _split_train_holdout(pool, holdout_ratio=0.5, seed=42)
+    again = _split_train_holdout(pool, holdout_ratio=0.5, seed=42)
+    for side, side_again in zip(first, again):
+        assert [t.task_id for t in side] == [t.task_id for t in side_again]
+    assert all(first) and sum(map(len, first)) == 10
+
+
+def test_split_keeps_both_sides_non_empty_at_extremes():
+    """A holdout ratio of 1.0 must still leave at least one training task."""
+    pool = tuple(_task(f"t{n}") for n in range(4))
+    assert all(_split_train_holdout(pool, holdout_ratio=1.0, seed=1))
+
+
+def test_layer2_factory_returns_none_without_rubric():
+    """No rubric yields ``None``; a rubric yields a callable scorer."""
+    assert _make_layer2_factory(judge=None)(_task("t1", rubric=None)) is None
+    assert callable(_make_layer2_factory(judge=None)(_task("t2", rubric="a rubric")))
+
+
+def test_section_text_from_candidate_module_and_dict():
+    """Module and component-dict candidates both yield the spliced body."""
+    section, body = "MEMORY_GUIDANCE", "candidate body"
+    module = PromptModule(section, body)
+    assert _section_text_from_candidate(module, section) == body
+    # The dict form carries the same instructions under ``passthrough.predict``.
+    as_components = {
+        "passthrough.predict": module.passthrough.predict.signature.instructions
+    }
+    assert _section_text_from_candidate(as_components, section) == body
+
+
+def _fake_repo(tmp_path: Path) -> Path:
+    """Fake checkout whose ``agent/prompt_builder.py`` defines MEMORY_GUIDANCE."""
+    (tmp_path / "agent").mkdir()
+    stub = textwrap.dedent('    MEMORY_GUIDANCE = "Save durable facts about the user."\n')
+    (tmp_path / "agent" / "prompt_builder.py").write_text(stub)
+    return tmp_path
+
+
+def _suite(tmp_path: Path) -> Path:
+    """Write a two-task JSONL suite: ``s1`` expects memory, ``n1`` forbids it."""
+    save = {"task_id": "s1", "user_message": "I use uv.",
+            "expected_tools": ["memory"], "expected_save_content": "prefers uv"}
+    skip = {"task_id": "n1", "user_message": "summarize work",
+            "expected_tools": [], "forbidden_tools": ["memory"]}
+    path = tmp_path / "suite.jsonl"
+    path.write_text("".join(json.dumps(row) + "\n" for row in (save, skip)))
+    return path
+
+
+def test_dry_run_writes_gate_decision(tmp_path):
+    """Dry run logs a ``dry_run`` gate decision and leaves prompt_builder.py as-is."""
+    repo = _fake_repo(tmp_path)
+    out = tmp_path / "out"
+    builder = repo / "agent" / "prompt_builder.py"
+    original_bytes = builder.read_bytes()
+    result = evolve_prompt_section(
+        section_name="MEMORY_GUIDANCE", hermes_repo=repo,
+        tasks_path=_suite(tmp_path), dry_run=True, output_dir=out,
+    )
+    assert result["decision"] == "dry_run"
+    gate = json.loads((out / "gate_decision.json").read_text())
+    assert gate["artifact_type"] == "prompt_section"
+    assert gate["target_section"] == "MEMORY_GUIDANCE"
+    # The baseline file must be byte-identical after a dry run (untouched).
+    assert builder.read_bytes() == original_bytes
+
+
+def test_cli_dry_run_exits_zero(tmp_path):
+    """``--dry-run`` through the click entry point exits with status 0."""
+    # Fixtures are built in the same order as before: repo first, then suite.
+    argv = [
+        "--section", "MEMORY_GUIDANCE",
+        "--hermes-repo", str(_fake_repo(tmp_path)),
+        "--tasks", str(_suite(tmp_path)),
+        "--dry-run",
+        "--output-dir", str(tmp_path / "out"),
+    ]
+    outcome = CliRunner().invoke(main, argv)
+    assert outcome.exit_code == 0, outcome.output
diff --git a/tests/validation/test_validator.py b/tests/validation/test_validator.py
index 3326c7a1..8767d1a8 100644
--- a/tests/validation/test_validator.py
+++ b/tests/validation/test_validator.py
@@ -83,14 +83,20 @@ def run(self, ctx):
                 )
 
         judged = []
+        tasks_seen = []
 
-        def judge_fn(memory_calls):
-            judged.append(memory_calls)
-            return 0.2  # below threshold → Layer 2 fails the task
+        def judge_factory(task):
+            tasks_seen.append(task.task_id)
+
+            def judge_fn(memory_calls):
+                judged.append(memory_calls)
+                return 0.2  # below threshold → Layer 2 fails the task
+
+            return judge_fn
 
         validator = ClosedLoopValidator(
             _StubInstaller(target), _MemoryRunner(target),
-            layer2_judge_fn=judge_fn, layer2_threshold=0.7,
+            layer2_judge_factory=judge_factory, layer2_threshold=0.7,
         )
         report = validator.validate(ValidationInputs(
             tool_name="MEMORY_GUIDANCE", suite=suite,
@@ -98,6 +104,8 @@ def judge_fn(memory_calls):
         ))
         # Judge invoked once per phase (baseline + evolved) on the one task.
         assert len(judged) == 2
+        # Factory received the task each phase.
+        assert tasks_seen == ["t1", "t1"]
         # Both phases fail Layer 2 → 0 pass rate, no regression decision.
         assert report.baseline.pass_rate == 0.0
         assert report.evolved.pass_rate == 0.0

From fc853abcb5d3ba33fb0c7b3e0f50169d6c5a4b41 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Mon, 1 Jun 2026 12:59:18 -0600
Subject: [PATCH 15/23] fix(validation): read agent sessions from hermes
 state.db
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Modern hermes -z one-shot mode is ephemeral — it prints only the final
response and no longer writes session_*.json. Sessions now persist to a
SQLite state.db in HERMES_HOME. The runner globbed for the obsolete JSON
files, so every closed-loop run abstained ('no session JSON'). Read the
most-recent session's messages from state.db instead; the tool_calls column
holds the same OpenAI-nested shape the extractors already parse, so the
message-extraction core is shared between the JSON and DB paths. Unblocks all
closed-loop validation (tools, skills, and prompt sections).
---
 evolution/validation/hermes_runner.py  | 127 ++++++++++++++++++++-----
 tests/validation/test_hermes_runner.py | 110 +++++++++++++++++++--
 2 files changed, 205 insertions(+), 32 deletions(-)

diff --git a/evolution/validation/hermes_runner.py b/evolution/validation/hermes_runner.py
index 82ddcde0..27498461 100644
--- a/evolution/validation/hermes_runner.py
+++ b/evolution/validation/hermes_runner.py
@@ -18,6 +18,7 @@
 import logging
 import os
 import shutil
+import sqlite3
 import subprocess
 import tempfile
 import time
@@ -153,15 +154,17 @@ def run(self, ctx: TaskRunContext) -> AgentRunResult:
                 )
             duration = time.time() - start
 
-            session_path = _find_latest_session(sandbox / "sessions")
-            if session_path is None:
+            # Modern hermes persists the session to a SQLite ``state.db`` in
+            # HERMES_HOME (one-shot ``-z`` no longer writes ``session_*.json``).
+            db_path = sandbox / "state.db"
+            if not db_path.is_file():
                 return AgentRunResult(
                     tool_calls_seq=[],
                     final_text_tail="",
                     duration_seconds=duration,
-                    error="no session JSON written by hermes -z",
+                    error="no session written by hermes -z (state.db absent)",
                 )
-            return parse_session_result(session_path, duration_seconds=duration)
+            return parse_session_from_db(db_path, duration_seconds=duration)
         finally:
             shutil.rmtree(sandbox, ignore_errors=True)
 
@@ -175,17 +178,6 @@ def _prime_sandbox(self, sandbox: Path, ctx: TaskRunContext) -> None:
             shutil.copytree(ctx.skills_src, sandbox / "skills")
 
 
-def _find_latest_session(sessions_dir: Path) -> Optional[Path]:
-    if not sessions_dir.exists():
-        return None
-    candidates = sorted(
-        sessions_dir.glob("session_*.json"),
-        key=lambda p: p.stat().st_mtime,
-        reverse=True,
-    )
-    return candidates[0] if candidates else None
-
-
 def parse_session_result(
     session_path: Path,
     *,
@@ -193,9 +185,9 @@ def parse_session_result(
 ) -> AgentRunResult:
     """Read a Hermes session JSON and extract the tool-call sequence + final text.
 
-    Public for tests: hand-crafted fixture JSONs in
-    ``tests/validation/test_hermes_runner.py`` exercise this directly
-    rather than going through the subprocess layer.
+    Retained for the legacy ``session_*.json`` shape and unit tests that
+    exercise the message extractors with hand-crafted fixtures. The live
+    runner reads ``state.db`` via ``parse_session_from_db``.
     """
     try:
         data = json.loads(session_path.read_text())
@@ -209,18 +201,103 @@ def parse_session_result(
         )
 
     messages = data.get("messages") or []
-    tool_calls_seq = _extract_tool_call_names(messages)
-    tool_calls_with_args = _extract_tool_calls_with_args(messages)
-    final_text_tail = _extract_final_text_tail(messages)
-    model_name = data.get("model")
+    return _result_from_messages(
+        messages,
+        duration_seconds=duration_seconds,
+        model_name=data.get("model"),
+        session_path=session_path,
+    )
+
 
+def parse_session_from_db(
+    db_path: Path,
+    *,
+    duration_seconds: float,
+) -> AgentRunResult:
+    """Reconstruct an ``AgentRunResult`` from a Hermes ``state.db``.
+
+    Modern hermes persists each session to SQLite. We read the most-recent
+    session's messages and normalize them into the same message-dict shape the
+    legacy JSON path produced, so the existing extractors work unchanged. The
+    ``messages.tool_calls`` column holds the OpenAI-nested
+    ``{"function": {"name", "arguments"}}`` list verbatim.
+    """
+    try:
+        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
+    except sqlite3.Error as exc:
+        return AgentRunResult(
+            tool_calls_seq=[],
+            final_text_tail="",
+            duration_seconds=duration_seconds,
+            error=f"could not open session DB at {db_path}: {exc}",
+            session_path=db_path,
+        )
+    try:
+        conn.row_factory = sqlite3.Row
+        session = conn.execute(
+            "SELECT id, model FROM sessions ORDER BY started_at DESC LIMIT 1"
+        ).fetchone()
+        if session is None:
+            return AgentRunResult(
+                tool_calls_seq=[],
+                final_text_tail="",
+                duration_seconds=duration_seconds,
+                error=f"session DB at {db_path} has no sessions",
+                session_path=db_path,
+            )
+        rows = conn.execute(
+            "SELECT role, content, tool_calls FROM messages "
+            "WHERE session_id = ? ORDER BY id",
+            (session["id"],),
+        ).fetchall()
+    except sqlite3.Error as exc:
+        return AgentRunResult(
+            tool_calls_seq=[],
+            final_text_tail="",
+            duration_seconds=duration_seconds,
+            error=f"could not read session DB at {db_path}: {exc}",
+            session_path=db_path,
+        )
+    finally:
+        conn.close()
+
+    messages: list[dict] = []
+    for row in rows:
+        raw_calls = row["tool_calls"]
+        parsed_calls: Any = None
+        if raw_calls:
+            try:
+                parsed_calls = json.loads(raw_calls)
+            except (json.JSONDecodeError, TypeError):
+                parsed_calls = None
+        messages.append({
+            "role": row["role"],
+            "content": row["content"] or "",
+            "tool_calls": parsed_calls,
+        })
+    return _result_from_messages(
+        messages,
+        duration_seconds=duration_seconds,
+        model_name=session["model"],
+        session_path=db_path,
+    )
+
+
+def _result_from_messages(
+    messages: list[dict],
+    *,
+    duration_seconds: float,
+    model_name: Optional[str],
+    session_path: Optional[Path],
+) -> AgentRunResult:
+    """Build an ``AgentRunResult`` from a normalized message list."""
     return AgentRunResult(
-        tool_calls_seq=tool_calls_seq,
-        final_text_tail=final_text_tail,
+        tool_calls_seq=_extract_tool_call_names(messages),
+        final_text_tail=_extract_final_text_tail(messages),
         duration_seconds=duration_seconds,
         model_name=model_name,
         session_path=session_path,
-        tool_calls_with_args=tool_calls_with_args,
+        tool_calls_with_args=_extract_tool_calls_with_args(messages),
     )
 
 
diff --git a/tests/validation/test_hermes_runner.py b/tests/validation/test_hermes_runner.py
index da1b2fea..243d3dbf 100644
--- a/tests/validation/test_hermes_runner.py
+++ b/tests/validation/test_hermes_runner.py
@@ -11,6 +11,7 @@
 from __future__ import annotations
 
 import json
+import sqlite3
 from pathlib import Path
 from unittest.mock import patch
 
@@ -20,10 +21,45 @@
 from evolution.validation.hermes_runner import (
     HermesAgentRunner,
     _strip_litellm_provider_prefix,
+    parse_session_from_db,
     parse_session_result,
 )
 
 
+def _make_state_db(path: Path, *, session_id: str, model: str, messages: list[dict],
+                   started_at: float = 1.0) -> None:
+    """Create a minimal hermes-shaped state.db with one session + messages.
+
+    Each ``messages`` entry: ``{"role", "content"?, "tool_calls"?}`` where
+    ``tool_calls`` is a Python list serialized to the ``tool_calls`` TEXT
+    column (the OpenAI-nested shape hermes stores).
+    """
+    conn = sqlite3.connect(path)
+    conn.executescript(
+        """
+        CREATE TABLE sessions (id TEXT PRIMARY KEY, model TEXT, started_at REAL);
+        CREATE TABLE messages (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            session_id TEXT, role TEXT, content TEXT, tool_calls TEXT
+        );
+        """
+    )
+    conn.execute(
+        "INSERT INTO sessions (id, model, started_at) VALUES (?, ?, ?)",
+        (session_id, model, started_at),
+    )
+    for m in messages:
+        tc = m.get("tool_calls")
+        conn.execute(
+            "INSERT INTO messages (session_id, role, content, tool_calls) "
+            "VALUES (?, ?, ?, ?)",
+            (session_id, m["role"], m.get("content"),
+             json.dumps(tc) if tc is not None else None),
+        )
+    conn.commit()
+    conn.close()
+
+
 class TestStripLitellmProviderPrefix:
     """The hermes -m flag interprets '<word>/<model>' as openrouter-style
     routing. Direct-provider users naturally pass litellm-formatted names
@@ -225,6 +261,65 @@ def test_handles_flat_dict_args(self, tmp_path):
         ]
 
 
+class TestParseSessionFromDb:
+    """The state.db parse layer — modern hermes persists sessions to SQLite."""
+
+    def test_extracts_tool_calls_and_args(self, tmp_path):
+        db = tmp_path / "state.db"
+        _make_state_db(db, session_id="s1", model="gpt-5.4-mini", messages=[
+            {"role": "user", "content": "remember I use uv"},
+            {"role": "assistant", "tool_calls": [
+                {"type": "function", "function": {
+                    "name": "memory",
+                    "arguments": json.dumps({"action": "add", "content": "uses uv"}),
+                }}
+            ]},
+            {"role": "tool", "content": "ok"},
+            {"role": "assistant", "content": "Saved."},
+        ])
+        result = parse_session_from_db(db, duration_seconds=2.0)
+        assert result.error is None
+        assert result.model_name == "gpt-5.4-mini"
+        assert result.tool_calls_seq == ["memory"]
+        assert result.tool_calls_with_args == [
+            {"name": "memory", "arguments": {"action": "add", "content": "uses uv"}}
+        ]
+        assert result.final_text_tail == "Saved."
+
+    def test_no_sessions_is_error(self, tmp_path):
+        db = tmp_path / "state.db"
+        conn = sqlite3.connect(db)
+        conn.executescript(
+            "CREATE TABLE sessions (id TEXT, model TEXT, started_at REAL);"
+            "CREATE TABLE messages (id INTEGER PRIMARY KEY, session_id TEXT, "
+            "role TEXT, content TEXT, tool_calls TEXT);"
+        )
+        conn.commit()
+        conn.close()
+        result = parse_session_from_db(db, duration_seconds=1.0)
+        assert result.error is not None
+        assert "no sessions" in result.error
+
+    def test_picks_most_recent_session(self, tmp_path):
+        db = tmp_path / "state.db"
+        _make_state_db(db, session_id="old", model="m", started_at=1.0, messages=[
+            {"role": "assistant", "tool_calls": [{"function": {"name": "patch"}}]},
+        ])
+        # Add a newer session with a different tool call.
+        conn = sqlite3.connect(db)
+        conn.execute("INSERT INTO sessions (id, model, started_at) VALUES (?,?,?)",
+                     ("new", "m", 2.0))
+        conn.execute(
+            "INSERT INTO messages (session_id, role, content, tool_calls) VALUES (?,?,?,?)",
+            ("new", "assistant", None,
+             json.dumps([{"function": {"name": "write_file"}}])),
+        )
+        conn.commit()
+        conn.close()
+        result = parse_session_from_db(db, duration_seconds=1.0)
+        assert result.tool_calls_seq == ["write_file"]
+
+
 class TestHermesAgentRunnerSubprocess:
     """The subprocess invocation layer: env + cwd + args plumbing."""
 
@@ -242,12 +337,13 @@ def _fake_run(*args, **kwargs):
             captured["args"] = args[0] if args else kwargs.get("args")
             captured["env"] = kwargs.get("env")
             captured["cwd"] = kwargs.get("cwd")
-            # Drop a minimal session JSON so the parse layer succeeds.
+            # Drop a minimal state.db so the parse layer succeeds.
             sandbox = Path(kwargs["env"]["HERMES_HOME"])
-            (sandbox / "sessions").mkdir(exist_ok=True)
-            _write_session(
-                sandbox / "sessions" / "session_test.json",
-                [{"role": "assistant", "tool_calls": [{"function": {"name": "patch"}}]}],
+            _make_state_db(
+                sandbox / "state.db",
+                session_id="s1", model="test-model",
+                messages=[{"role": "assistant", "tool_calls": [
+                    {"function": {"name": "patch"}}]}],
             )
             return type("CP", (), {"returncode": 0, "stdout": "", "stderr": ""})()
 
@@ -302,7 +398,7 @@ def test_no_session_written_returns_error_result(self, fixture_dir, tmp_path):
         runner = HermesAgentRunner(user_config_path=tmp_path / "x")
 
         def _fake_run(*args, **kwargs):
-            # Don't drop a session JSON.
+            # Don't write a state.db.
             return type("CP", (), {"returncode": 0, "stdout": "", "stderr": ""})()
 
         with patch("evolution.validation.hermes_runner.subprocess.run", side_effect=_fake_run):
@@ -311,7 +407,7 @@ def _fake_run(*args, **kwargs):
                 fixture_dir=fixture_dir,
             ))
         assert result.error is not None
-        assert "no session JSON" in result.error
+        assert "state.db absent" in result.error
 
     def test_user_config_copied_into_sandbox_when_exists(self, fixture_dir, tmp_path):
         user_config = tmp_path / "user_config.yaml"

From 63f3fd270d0dff40d00169b4dc4fb6aa14460e8a Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Mon, 1 Jun 2026 12:59:18 -0600
Subject: [PATCH 16/23] fix(prompts): judge real memory actions (add/replace),
 not 'save'

The Hermes memory tool's content-bearing actions are add and replace (full
set: add/replace/remove/read); there is no 'save' action. The Layer 2 filter
matched the nonexistent 'save', so it never scored any real call. Match
SAVE_ACTIONS = {add, replace} instead.
---
 evolution/prompts/prompt_judge.py  | 12 +++++++++---
 tests/prompts/test_prompt_judge.py | 14 ++++++++------
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/evolution/prompts/prompt_judge.py b/evolution/prompts/prompt_judge.py
index 8d5a9624..8d6c0b4b 100644
--- a/evolution/prompts/prompt_judge.py
+++ b/evolution/prompts/prompt_judge.py
@@ -22,6 +22,12 @@
 beyond the cap score 0 each — bounds cost on pathological cases where the
 agent saves on every turn."""
 
+SAVE_ACTIONS = frozenset({"add", "replace"})
+"""Hermes ``memory`` tool actions that persist content worth judging. The
+tool's full action set is add / replace / remove / read (see
+``tools/memory_tool.py``); only ``add`` and ``replace`` carry a ``content``
+payload, so only those are content-judged. ``remove`` / ``read`` are not saves."""
+
 
 class SaveCallSignature(dspy.Signature):
     """Score a memory-save call against MEMORY_GUIDANCE's rules.
@@ -81,14 +87,14 @@ def judge_save_calls(
     """Aggregate the Layer 2 score across a task's memory-save calls.
 
     ``calls`` is the subset of ``tool_calls_with_args`` whose name is
-    ``memory`` — each item the call's ``arguments`` dict. Only
-    ``action == 'save'`` calls are judged.
+    ``memory`` — each item is the call's ``arguments`` dict. Only content-bearing
+    save actions (``add`` / ``replace``, see ``SAVE_ACTIONS``) are judged.
 
     Returns 1.0 when no save calls were made (Layer 1 catches the
     "should-have-saved-but-didn't" failure; Layer 2 only scores what
     actually happened) and also when no judge/rubric is configured.
     """
-    save_calls = [c for c in calls if c.get("action") == "save"]
+    save_calls = [c for c in calls if c.get("action") in SAVE_ACTIONS]
     if not save_calls:
         return 1.0
     if judge is None or expected_content is None:
diff --git a/tests/prompts/test_prompt_judge.py b/tests/prompts/test_prompt_judge.py
index c0e620a7..e1e4efe8 100644
--- a/tests/prompts/test_prompt_judge.py
+++ b/tests/prompts/test_prompt_judge.py
@@ -18,8 +18,8 @@ def test_invokes_judge_per_call_and_means():
     fake_judge = MagicMock(spec=SaveCallJudge)
     fake_judge.score.side_effect = [0.8, 0.6]
     calls = [
-        {"action": "save", "content": "user prefers concise responses"},
-        {"action": "save", "content": "completed phase 3"},
+        {"action": "add", "content": "user prefers concise responses"},
+        {"action": "replace", "content": "completed phase 3"},
     ]
     score = judge_save_calls(
         judge=fake_judge, calls=calls,
@@ -33,7 +33,7 @@ def test_caps_at_five_calls():
     """Pathological: agent saves on every turn. Judge at most 5; excess score 0."""
     fake_judge = MagicMock(spec=SaveCallJudge)
     fake_judge.score.return_value = 1.0
-    calls = [{"action": "save", "content": f"item {i}"} for i in range(10)]
+    calls = [{"action": "add", "content": f"item {i}"} for i in range(10)]
     score = judge_save_calls(judge=fake_judge, calls=calls, expected_content="any")
     # 5 scored 1.0, 5 unjudged scored 0 → mean 0.5
     assert score == pytest.approx(0.5)
@@ -41,11 +41,13 @@ def test_caps_at_five_calls():
 
 
 def test_filters_non_save_actions():
+    """Only content-bearing actions (add/replace) are judged; remove/read skipped."""
     fake_judge = MagicMock(spec=SaveCallJudge)
     fake_judge.score.return_value = 1.0
     calls = [
-        {"action": "delete", "key": "x"},
-        {"action": "save", "content": "real save"},
+        {"action": "remove", "old_text": "x"},
+        {"action": "read"},
+        {"action": "add", "content": "real save"},
     ]
     score = judge_save_calls(judge=fake_judge, calls=calls, expected_content="any")
     assert score == pytest.approx(1.0)
@@ -54,7 +56,7 @@ def test_filters_non_save_actions():
 
 def test_none_judge_or_expected_is_vacuous_pass():
     """A save call exists but no judge/rubric configured → don't penalize."""
-    calls = [{"action": "save", "content": "x"}]
+    calls = [{"action": "add", "content": "x"}]
     assert judge_save_calls(judge=None, calls=calls, expected_content="r") == 1.0
     fake = MagicMock(spec=SaveCallJudge)
     assert judge_save_calls(judge=fake, calls=calls, expected_content=None) == 1.0

From f685314eb10b611d6dc00c54b64211feb30d7764 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Mon, 1 Jun 2026 12:59:18 -0600
Subject: [PATCH 17/23] fix(prompts): invoke passthrough predictor so GEPA can
 reflect
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PromptModule.forward returned a Prediction without calling the predictor, so
GEPA captured no trace for passthrough.predict and make_reflective_dataset
raised 'No valid predictions found' every iteration — no candidate was ever
proposed. The tool path gets traces from synthetic examples; prompt sections
are pure-behavioral, so forward must call the passthrough to produce a trace.
The predictor output stays a placeholder; the real score is the metric's
behavioral branch.
---
 evolution/prompts/prompt_module.py  | 13 ++++++++++---
 tests/prompts/test_prompt_module.py | 15 +++++++++++----
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/evolution/prompts/prompt_module.py b/evolution/prompts/prompt_module.py
index ad8f3487..9cd9de0b 100644
--- a/evolution/prompts/prompt_module.py
+++ b/evolution/prompts/prompt_module.py
@@ -98,10 +98,17 @@ def forward(
         task: str,
         closed_loop_task_id: Optional[str] = None,
     ) -> dspy.Prediction:
-        # Always route behaviorally — there is no cheap predictor score for
-        # a prompt section. The metric reads these via getattr.
+        # Invoke the passthrough predictor so GEPA captures a trace for
+        # ``passthrough.predict``. Without a traced predictor call, GEPA's
+        # make_reflective_dataset finds "no valid predictions" and never
+        # proposes a mutation (the tool path gets traces from synthetic
+        # examples; prompt sections are pure-behavioral, so the trace has
+        # to come from here). The predictor's output is a placeholder — the
+        # real score comes from the metric's behavioral branch, which reads
+        # the candidate text + task id attached below.
+        result = self.passthrough(task=task)
         return dspy.Prediction(
-            response="",
+            response=getattr(result, "response", ""),
             _closed_loop_task_id=closed_loop_task_id,
             _candidate_text=self.section_text,
         )
diff --git a/tests/prompts/test_prompt_module.py b/tests/prompts/test_prompt_module.py
index b369f003..667c88cd 100644
--- a/tests/prompts/test_prompt_module.py
+++ b/tests/prompts/test_prompt_module.py
@@ -27,11 +27,18 @@ def test_section_text_extracts_current_candidate():
     assert module.section_text == "v2-mutated"
 
 
-def test_forward_routes_behavioral():
-    """forward always returns the candidate + task id for behavioral scoring —
-    there's no cheap predictor score for a prompt section."""
+def test_forward_invokes_predictor_and_attaches_behavioral_fields():
+    """forward calls the passthrough predictor (so GEPA gets a trace) and
+    attaches the candidate text + task id for the metric's behavioral branch."""
+    import dspy
+    from dspy.utils.dummies import DummyLM
+
     module = PromptModule(section_name="MEMORY_GUIDANCE", candidate_text="evolved body")
-    pred = module.forward(task="anything", closed_loop_task_id="task-001")
+    # DummyLM lets the real predictor run offline (no network), so section_text
+    # still resolves while a predictor trace is produced for GEPA.
+    with dspy.context(lm=DummyLM([{"reasoning": "n/a", "response": "placeholder"}])):
+        pred = module.forward(task="anything", closed_loop_task_id="task-001")
+
     assert pred._candidate_text == "evolved body"
     assert pred._closed_loop_task_id == "task-001"
 

From 3c96e57c9abd627cff6563d3e66a49f9cfadf9ba Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Mon, 1 Jun 2026 13:02:22 -0600
Subject: [PATCH 18/23] test(validation): use real memory actions (add) in
 compound-verdict fixtures

---
 tests/validation/test_report.py    | 6 +++---
 tests/validation/test_validator.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/validation/test_report.py b/tests/validation/test_report.py
index 98b5d9ab..1000f9ca 100644
--- a/tests/validation/test_report.py
+++ b/tests/validation/test_report.py
@@ -75,7 +75,7 @@ def _save_run(self, content: str = "good") -> AgentRunResult:
         return AgentRunResult(
             tool_calls_seq=["memory"], final_text_tail="", duration_seconds=0.0,
             tool_calls_with_args=[
-                {"name": "memory", "arguments": {"action": "save", "content": content}}
+                {"name": "memory", "arguments": {"action": "add", "content": content}}
             ],
         )
 
@@ -123,7 +123,7 @@ def test_judge_receives_only_memory_call_args(self):
             tool_calls_seq=["read_file", "memory"], final_text_tail="", duration_seconds=0.0,
             tool_calls_with_args=[
                 {"name": "read_file", "arguments": {"path": "x"}},
-                {"name": "memory", "arguments": {"action": "save", "content": "c"}},
+                {"name": "memory", "arguments": {"action": "add", "content": "c"}},
             ],
         )
         received = []
@@ -136,7 +136,7 @@ def judge_fn(memory_calls):
             expected_tools=("memory",), forbidden_tools=(), run=run,
             layer2_judge_fn=judge_fn, layer2_threshold=0.7,
         )
-        assert received == [[{"action": "save", "content": "c"}]]
+        assert received == [[{"action": "add", "content": "c"}]]
 
 
 class TestScoreTaskTestCommandMode:
diff --git a/tests/validation/test_validator.py b/tests/validation/test_validator.py
index 8767d1a8..433e323a 100644
--- a/tests/validation/test_validator.py
+++ b/tests/validation/test_validator.py
@@ -78,7 +78,7 @@ def run(self, ctx):
                     tool_calls_seq=["memory"], final_text_tail="ok",
                     duration_seconds=0.1, model_name="test-model",
                     tool_calls_with_args=[
-                        {"name": "memory", "arguments": {"action": "save", "content": "x"}}
+                        {"name": "memory", "arguments": {"action": "add", "content": "x"}}
                     ],
                 )
 

From 384a6d472a4944ea57a713e282794e17f1cc5df7 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Mon, 1 Jun 2026 19:14:55 -0600
Subject: [PATCH 19/23] fix(prompts): configure global LM so GEPA worker
 threads can run the predictor

The forward() trace fix was necessary but insufficient: GEPA evaluates the
module in worker threads that don't inherit the saturation pre-flight's
dspy.context(lm=...), so the passthrough predictor raised 'No LM is loaded',
GEPA captured no trajectories, and no candidate was ever proposed. Set the
global default LM via dspy.configure (matching evolve_tool), which the
parallelizer propagates to worker threads. GEPA now scores the valset
correctly and the proposer fires; on a saturated target it correctly declines
to mutate (no failures to ground a change in).
---
 evolution/prompts/evolve_prompt_section.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/evolution/prompts/evolve_prompt_section.py b/evolution/prompts/evolve_prompt_section.py
index 288b8737..bd8d227e 100644
--- a/evolution/prompts/evolve_prompt_section.py
+++ b/evolution/prompts/evolve_prompt_section.py
@@ -39,6 +39,7 @@
 from evolution.core.lm_timing_callback import (
     COST_LEDGER,
     CostCeilingExceeded,
+    LMTimingCallback,
     register_litellm_cost_callback,
     register_litellm_failure_callback,
 )
@@ -363,6 +364,15 @@ def score_task_id(task_id: str) -> float:
         resolve_default_lm(role="eval", explicit_model=eval_model),
         temperature=0.0, request_timeout=120, num_retries=3,
     )
+    # Set the global default LM so the passthrough predictor resolves an LM
+    # inside GEPA's worker threads (dspy.context only covers the saturation
+    # pre-flight's own eval). Without this, forward()'s passthrough call raises
+    # "No LM is loaded" in GEPA threads → no trajectories → no proposal.
+    dspy.configure(
+        lm=eval_lm,
+        warn_on_type_mismatch=False,
+        callbacks=[LMTimingCallback()],
+    )
     reflection_lm = instantiate_lm(
         resolve_default_lm(
             role="reflection", explicit_model=reflection_model or optimizer_model

From d44b81f2687cff5a9a671440b3d9286ace95465c Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Mon, 1 Jun 2026 20:42:57 -0600
Subject: [PATCH 20/23] feat(prompts): --baseline-override-file to evolve from
 arbitrary starting text
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lets evolution start from text other than the live section — e.g. a
deliberately-weakened or adversarial baseline to create headroom for
demonstrating a real mutation, or a regression-injection ablation. The live
section remains the splice/restore target (backed up + restored), so the user's
file is never left mutated; --apply still writes the evolved text. Verified
end-to-end: an adversarial 'never save' baseline scored 0.67, GEPA proposed a
corrected section, deploy gate measured 0.67 -> 1.00 (2W/0L).
---
 evolution/prompts/evolve_prompt_section.py  | 21 +++++++++++++++++++--
 tests/prompts/test_evolve_prompt_section.py | 18 ++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/evolution/prompts/evolve_prompt_section.py b/evolution/prompts/evolve_prompt_section.py
index bd8d227e..ef5ae3d1 100644
--- a/evolution/prompts/evolve_prompt_section.py
+++ b/evolution/prompts/evolve_prompt_section.py
@@ -246,11 +246,21 @@ def evolve_prompt_section(
     create_pr_flag: bool = False,
     dry_run: bool = False,
     output_dir: Optional[Path] = None,
+    baseline_override_file: Optional[Path] = None,
 ) -> dict[str, Any]:
     """Evolve one prompt section end-to-end. Returns a summary dict."""
     hermes_repo = Path(hermes_repo).resolve()
     source = HermesPromptSource(hermes_repo)
-    baseline_text = source.read(section_name)
+    # The live section is always the splice/restore target. ``baseline_override_file``
+    # lets evolution START from different text (e.g. a deliberately-weakened
+    # baseline to create headroom, or a regression-injection ablation) without
+    # touching the real file — the guard still backs up and restores the live
+    # section. ``--apply`` writes the evolved text into the live section as usual.
+    source.read(section_name)  # validate the section exists / is a string constant
+    if baseline_override_file is not None:
+        baseline_text = Path(baseline_override_file).read_text(encoding="utf-8")
+    else:
+        baseline_text = source.read(section_name)
     baseline_chars = len(baseline_text)
 
     suite = TaskSuite.from_jsonl(tasks_path)
@@ -605,11 +615,17 @@ def score_task_id(task_id: str) -> float:
               help="Exercise wiring without any LM/agent calls.")
 @click.option("--output-dir", default=None,
               type=click.Path(file_okay=False, dir_okay=True, path_type=Path))
+@click.option("--baseline-override-file", default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path),
+              help="Start evolution from this text instead of the live section "
+                   "(e.g. a weakened baseline to create headroom). The live file "
+                   "is still backed up + restored; --apply writes the evolved text.")
 def main(section_name, hermes_repo, tasks_path, iterations, holdout_ratio, seed,
          max_growth, optimizer_model, reflection_model, eval_model, agent_model,
          layer2_threshold, task_timeout_seconds, max_total_cost_usd,
          gepa_minibatch_size, gepa_acceptance, skip_saturation_check,
-         force_saturation_check, apply, create_pr_flag, dry_run, output_dir):
+         force_saturation_check, apply, create_pr_flag, dry_run, output_dir,
+         baseline_override_file):
     """Evolve one Hermes system-prompt section via GEPA + closed-loop validation."""
     result = evolve_prompt_section(
         section_name=section_name,
@@ -634,6 +650,7 @@ def main(section_name, hermes_repo, tasks_path, iterations, holdout_ratio, seed,
         create_pr_flag=create_pr_flag,
         dry_run=dry_run,
         output_dir=output_dir,
+        baseline_override_file=baseline_override_file,
     )
     sys.exit(0 if result["decision"] in {"deploy", "dry_run"} else 1)
 
diff --git a/tests/prompts/test_evolve_prompt_section.py b/tests/prompts/test_evolve_prompt_section.py
index 34081c58..62e85ea9 100644
--- a/tests/prompts/test_evolve_prompt_section.py
+++ b/tests/prompts/test_evolve_prompt_section.py
@@ -96,6 +96,24 @@ def test_dry_run_writes_gate_decision(tmp_path):
     ).read_text()
 
 
+def test_baseline_override_file_replaces_live_section(tmp_path):
+    repo = _fake_repo(tmp_path)
+    suite = _suite(tmp_path)
+    override = tmp_path / "weak.txt"
+    override.write_text("a deliberately weak baseline")
+    out = tmp_path / "out"
+    evolve_prompt_section(
+        section_name="MEMORY_GUIDANCE", hermes_repo=repo, tasks_path=suite,
+        dry_run=True, output_dir=out, baseline_override_file=override,
+    )
+    gate = json.loads((out / "gate_decision.json").read_text())
+    assert gate["baseline_chars"] == len("a deliberately weak baseline")
+    # The live file is never touched by an override dry run.
+    assert "Save durable facts about the user." in (
+        repo / "agent" / "prompt_builder.py"
+    ).read_text()
+
+
 def test_cli_dry_run_exits_zero(tmp_path):
     repo = _fake_repo(tmp_path)
     suite = _suite(tmp_path)

From 621b23da6a26440f922c03ee40a83ca0745a7220 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Mon, 1 Jun 2026 20:44:27 -0600
Subject: [PATCH 21/23] =?UTF-8?q?docs(plan):=20Phase=203=20deviations=20?=
 =?UTF-8?q?=E2=80=94=20splice-and-restore,=20compound=20verdict,=20state.d?=
 =?UTF-8?q?b=20runner=20fix,=20adversarial-baseline=20proof?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 PLAN.md | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/PLAN.md b/PLAN.md
index 5c587f64..d03e5001 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -466,6 +466,8 @@ These descriptions are sent with every API call as part of the tool schema — e
 
 **Goal:** Optimize the sections of the system prompt that guide agent behavior.
 
+**Status:** ✅ Complete (MEMORY_GUIDANCE proof point). See "Deviations from plan" at the end of this section.
+
 **Prerequisite:** Phase 2 gate passed — benchmark gating validated, GEPA producing sensible text mutations.
 
 **Week 1 (Build):** Build section-as-DSPy-parameter wrapper for the 5 evolvable prompt sections. Build behavioral test suite generator. This is the riskiest tier so far — system prompt changes affect everything.
@@ -535,6 +537,26 @@ The system prompt is assembled in `run_agent.py` / `agent/prompt_builder.py` fro
 - Identity section must retain core traits (helpful, direct, admits uncertainty)
 - Platform hints must remain platform-accurate (don't tell Telegram to use ANSI codes)
 
+**Deviations from plan (Phase 3):**
+
+1. **Integration is in-place splice-and-restore, not an env-var hook or a plugin.** The design's primary path routed candidate overrides through an upstream `HERMES_PROMPT_OVERRIDES_JSON` env var; that hook was not accepted upstream, so depending on it would make the framework a local-only patch that silently no-ops on any hermes pull. A plugin alternative was ruled out as non-viable: consumers bind the constants at import time (`from agent.prompt_builder import MEMORY_GUIDANCE` in `run_agent.py` and `agent/system_prompt.py`), so a plugin's `register()` runs too late to reach them. Phase 3 instead splices the candidate directly into `agent/prompt_builder.py` (byte-precise AST replacement via `repr()`, parse-checked) and restores from an atomic backup, reusing Phase 2's `ClosedLoopValidator` flock + sha-drift + stale-backup machinery. No upstream dependency; runs against stock Hermes. This also **collapsed the planned parallel `PromptSectionValidator`** into a small `HermesPromptSectionInstaller` (an `ArtifactInstaller`) plus a one-method Layer-2 hook on the shared validator — less code than the design called for.
+
+2. **One target section per run; MEMORY_GUIDANCE is the only proof point.** Joint multi-section optimization and identity/persona evolution are deferred — joint runs carry Phase 2's "stealing selections" risk, and `DEFAULT_AGENT_IDENTITY` has no tool-call anchor for the verdict. The `PromptSource` abstraction supports the other string sections (`SKILLS_GUIDANCE`, `SESSION_SEARCH_GUIDANCE`, etc.) with no refactor; dict-typed sections like `PLATFORM_HINTS` are out of scope for v1 (string constants only).
+
+3. **Verdict is compound (tool-membership + LLM content judge), threaded per-task.** Layer 1 is the Phase 2 expected/forbidden rule on whether `memory` was invoked; Layer 2 is an LLM judge scoring the saved content against each task's `expected_save_content` rubric. The validator builds the judge per-task (a factory) so the content judge sees the task's rubric and message — a fixed global judge couldn't. Note the real Hermes `memory` tool actions that persist content are **`add`/`replace`**, not `save`; the tool's schema enum is add/replace/remove (there is no `read` action).
+
+4. **Eval suite ships as a curated 12-task golden set, not 50 synthetic + 10 golden.** A hand-authored `memory_guidance.jsonl` spans the five categories (save-preference, save-correction, dont-save-task-progress, dont-save-completed-work-log, declarative-vs-imperative). The synthetic generator (`build_memory_guidance_dataset`) is built and unit-tested, but full synthetic expansion via a funded generation run is deferred — curation gives a higher-signal first suite and avoids upfront generation spend.
+
+5. **PR automation is deferred for prompt sections.** `create_pr` atomically copies a full evolved artifact over `origin/<base>`'s file; deriving an evolved `prompt_builder.py` from the local checkout would carry the unmerged override-hook commit into the PR diff. `--create-pr` is accepted but records a skipped block; the deploy path is `--apply` (writes the evolved section into the live file) plus a manual PR. A section-scoped PR path (splice into `origin/<base>`'s file, not the local one) is future work.
+
+6. **The shared closed-loop runner had to be rebuilt for current Hermes.** Surfaced by the Phase 3 end-to-end smoke test: `hermes -z` one-shot mode is now ephemeral — it prints only the final response and no longer writes `session_*.json`; sessions persist to a SQLite `state.db` in `HERMES_HOME`. `HermesAgentRunner` globbed for the obsolete JSON files, so **every** closed-loop run had been silently abstaining ("no session JSON") across Phases 1–3. The runner now reads the most-recent session's messages from `state.db` (the `tool_calls` column carries the tool-call list in the flat `{"name", "arguments"}` shape; the extractors also accept the older OpenAI-nested `{"function": {...}}` shape). This is a shared-infrastructure fix that unblocks all closed-loop validation, not just prompts.
+
+7. **Behavioral eval is serialized, and agent-subprocess cost is invisible to the budget cap.** Because every candidate is spliced into one shared `prompt_builder.py`, the GEPA inner-loop scorer serializes splice+run under a lock (DSPy's evaluator is multi-threaded; the shared file is not). Per-section closed-loop is therefore effectively serial — an accepted v1 cost of the splice-and-restore model. The agent's own LM spend happens inside the `hermes` child process, invisible to the in-process cost ledger, so `--max-cost-usd` bounds only judge + reflection + passthrough spend; `sessions.actual_cost_usd` in `state.db` could close that gap later.
+
+8. **Saturation default-deny confirmed on a capable agent; a demonstrated improvement required an adversarial baseline.** With `gpt-5.4-mini`, both the live `MEMORY_GUIDANCE` and a *passively*-weakened baseline scored 1.0 / 6 holdout — `no_headroom`, correctly default-denied. This matches Phase 2's "regression-catching, not improvement-finding on tuned artifacts" finding and the binary model-tier effect (a capable agent saves correctly regardless of vague guidance). A real mutation was demonstrated only by *actively misdirecting* the baseline: an adversarial "never proactively save" section scored 0.67, GEPA's reflective proposer inverted it to restore proactive saving (and made it shorter), and the deploy gate measured **0.67 → 1.00, 2 wins / 0 losses → deploy**. The `--baseline-override-file` flag enables this ablation (and regression-injection testing generally) without mutating the live section.
+
+9. **Benchmark gating again not built in (same as Phases 1–2).** The built-in deploy gate is paired-bootstrap CI plus the dual-condition rule on the holdout; `--benchmark-cmd` remains the external-benchmark hook. TBLite / YC-Bench wiring is left to the user's `--benchmark-cmd`.
+
 ### Phase 4: Code Evolution via Darwinian Evolver
 
 **Goal:** Evolve tool implementation code for better performance and fewer bugs.

From b681c08c47d012fb9aec3d83cd44194d5bf000e4 Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Mon, 1 Jun 2026 21:53:27 -0600
Subject: [PATCH 22/23] =?UTF-8?q?fix(prompts,validation):=20address=20PR?=
 =?UTF-8?q?=20review=20=E2=80=94=20abstain=20on=20corrupt=20sessions,=20do?=
 =?UTF-8?q?c/comment=20accuracy,=20guards=20+=20tests?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Critical: a malformed tool_calls column in state.db now abstains (error set +
  logged) instead of reading as 'agent invoked no tools', which scored a
  DB-format regression as a fake behavioral failure and contaminated fitness.
- Surface previously-silent fallbacks: malformed tool-call args, a memory call
  with no save action, and an unparseable judge score now log.
- Doc/comment accuracy: memory action is add (not the nonexistent 'save'); tool
  schema enum is add/replace/remove (not 'read'); state.db tool_calls is the
  flat shape (nested handled for compat); the memoizing-scorer/validator splice
  cadence; the guard wraps pre-flight + GEPA; _closed_loop_task_id is set by
  PromptModule.forward.
- Reject a <2-task suite up front (empty GEPA trainset otherwise).
- Tests: parse_session_from_db malformed/corrupt/missing-table matrix; the
  _prompt_builder_guard restore round-trip, stale-backup refusal, and concurrent
  lock refusal; the single-task-suite guard.
---
 evolution/prompts/evolve_prompt_section.py  | 12 ++++-
 evolution/prompts/prompt_judge.py           | 46 +++++++++++++-----
 evolution/validation/agent_runner.py        |  4 +-
 evolution/validation/hermes_runner.py       | 30 ++++++++++--
 evolution/validation/task.py                |  2 +-
 tests/prompts/test_evolve_prompt_section.py | 54 +++++++++++++++++++++
 tests/validation/test_hermes_runner.py      | 38 +++++++++++++++
 7 files changed, 166 insertions(+), 20 deletions(-)

diff --git a/evolution/prompts/evolve_prompt_section.py b/evolution/prompts/evolve_prompt_section.py
index ef5ae3d1..4558e5c8 100644
--- a/evolution/prompts/evolve_prompt_section.py
+++ b/evolution/prompts/evolve_prompt_section.py
@@ -153,9 +153,11 @@ def _section_text_from_candidate(candidate: Any, section_name: str) -> str:
 @contextmanager
 def _prompt_builder_guard(target_path: Path) -> Iterator[None]:
     """Back up ``prompt_builder.py`` + hold the shared closed-loop flock for the
-    duration of GEPA evolution, then restore the original bytes on exit.
+    duration of the saturation pre-flight + GEPA evolution, then restore the
+    original bytes on exit.
 
-    The GEPA inner loop splices candidates directly into the live file; this
+    The pre-flight and GEPA inner loop splice candidates directly into the live
+    file; this
     guard guarantees the user's checkout is byte-restored afterward and that no
     concurrent harness run (which uses the same lock + backup names) mutates it
     mid-flight. Sequenced before the deploy-gate ``ClosedLoopValidator``, which
@@ -264,6 +266,12 @@ def evolve_prompt_section(
     baseline_chars = len(baseline_text)
 
     suite = TaskSuite.from_jsonl(tasks_path)
+    if len(suite.tasks) < 2:
+        raise ValueError(
+            f"{tasks_path} has {len(suite.tasks)} task(s); need at least 2 so the "
+            f"split yields a non-empty GEPA trainset and a non-empty deploy-gate "
+            f"holdout."
+        )
     train_tasks, holdout_tasks = _split_train_holdout(
         suite.tasks, holdout_ratio=holdout_ratio, seed=seed
     )
diff --git a/evolution/prompts/prompt_judge.py b/evolution/prompts/prompt_judge.py
index 8d6c0b4b..d6e20cc4 100644
--- a/evolution/prompts/prompt_judge.py
+++ b/evolution/prompts/prompt_judge.py
@@ -24,9 +24,9 @@
 
 SAVE_ACTIONS = frozenset({"add", "replace"})
 """Hermes ``memory`` tool actions that persist content worth judging. The
-tool's full action set is add / replace / remove / read (see
-``tools/memory_tool.py``); only ``add`` and ``replace`` carry a ``content``
-payload, so only those are content-judged. ``remove`` / ``read`` are not saves."""
+tool's schema enum is add / replace / remove (see ``tools/memory_tool.py``);
+only ``add`` and ``replace`` carry a ``content`` payload, so only those are
+content-judged. ``remove`` is not a save."""
 
 
 class SaveCallSignature(dspy.Signature):
@@ -74,6 +74,17 @@ def score(self, *, task: str, expected_content: str, saved_content: str) -> floa
                 expected_content=expected_content,
                 saved_content=saved_content,
             )
+        # _clamp_to_unit returns a neutral 0.5 on unparseable output. A 0.5 is
+        # below the default 0.7 threshold, so a garbled judge response silently
+        # fails an otherwise-good save — log the raw value so that's debuggable
+        # rather than indistinguishable from a real mediocre score.
+        try:
+            float(str(result.quality).strip())
+        except (ValueError, TypeError):
+            logger.warning(
+                "SaveCallJudge: unparseable quality %r from judge LM; "
+                "falling back to neutral 0.5", result.quality,
+            )
         return _clamp_to_unit(result.quality)
 
 
@@ -90,12 +101,22 @@ def judge_save_calls(
     ``memory`` — each item the call's ``arguments`` dict. Only content-bearing
     save actions (``add`` / ``replace``, see ``SAVE_ACTIONS``) are judged.
 
-    Returns 1.0 when no save calls were made (Layer 1 catches the
-    "should-have-saved-but-didn't" failure; Layer 2 only scores what
-    actually happened) and also when no judge/rubric is configured.
+    Returns 1.0 when no save calls were made (Layer 1 catches the case where
+    ``memory`` was never invoked; note it does NOT backstop a ``memory`` call
+    with a non-save action like ``remove`` — that still scores a vacuous 1.0
+    here) and also when no judge/rubric is configured.
     """
     save_calls = [c for c in calls if c.get("action") in SAVE_ACTIONS]
     if not save_calls:
+        # Distinguish "no memory call" (expected, silent) from "memory was
+        # invoked but nothing matched SAVE_ACTIONS" (worth surfacing — a save
+        # we can't score, e.g. an action rename or malformed empty-args call).
+        if calls:
+            logger.info(
+                "judge_save_calls: %d memory call(s) but no save action "
+                "(actions=%s); returning vacuous 1.0",
+                len(calls), [c.get("action") for c in calls],
+            )
         return 1.0
     if judge is None or expected_content is None:
         return 1.0
@@ -123,9 +144,11 @@ def make_prompt_fitness_metric(
     """Build the GEPA-shaped 5-arg fitness metric for a prompt section.
 
     All prompt-section eval is behavioral (a real Hermes subprocess), so
-    every prediction must carry ``_closed_loop_task_id`` (set by the
-    dataset builder) and ``_candidate_text`` (set by ``PromptModule``).
-    Predictions missing the task id are degenerate — they score 0 with a
+    every prediction must carry ``_closed_loop_task_id`` and
+    ``_candidate_text`` — both attached by ``PromptModule.forward`` (the task
+    id flows in as the ``closed_loop_task_id`` input field built by
+    ``_behavioral_examples``). Predictions missing the task id are degenerate
+    — they score 0 with a
     diagnostic so the misconfiguration is visible in GEPA feedback rather
     than silently scoring well.
 
@@ -190,8 +213,9 @@ def make_memoizing_splice_scorer(
     effectively serial; that's an accepted v1 cost of splice-and-restore.
 
     Backup/restore of the mutated source is the caller's responsibility — wrap
-    the whole GEPA run, not each call (the per-run guard mirrors
-    ``ClosedLoopValidator``'s splice-once-per-phase shape).
+    the whole GEPA run, not each call. This mirrors ``ClosedLoopValidator``,
+    which backs up once and restores once around both phases (it re-splices the
+    artifact on every task inside a phase, not once per phase).
     """
     state: dict[str, Any] = {"installed": _UNSET}
     lock = lock if lock is not None else threading.Lock()
diff --git a/evolution/validation/agent_runner.py b/evolution/validation/agent_runner.py
index df9a4d7c..6d36227c 100644
--- a/evolution/validation/agent_runner.py
+++ b/evolution/validation/agent_runner.py
@@ -24,11 +24,11 @@ class AgentRunResult:
     ``tool_calls_with_args`` carries the same calls in order as
     ``{"name", "arguments"}`` dicts (arguments parsed from the
     LLM-emitted JSON). The compound-verdict Layer 2 judge needs the
-    argument payloads — e.g. the content of a ``memory(action='save')``
+    argument payloads — e.g. the content of a ``memory(action='add')``
     call — which ``tool_calls_seq`` discards.
 
     ``error`` is set when the runner itself failed to drive the agent
-    (subprocess timeout, no session JSON written, parse failure). It's
+    (subprocess timeout, no session written, parse failure). It's
     distinct from "agent invoked a tool that failed" — that's still a
     valid run, just one where the agent struggled. Tasks with ``error``
     are counted as *abstentions* in the report, not as failures, so a
diff --git a/evolution/validation/hermes_runner.py b/evolution/validation/hermes_runner.py
index 27498461..e0f904ec 100644
--- a/evolution/validation/hermes_runner.py
+++ b/evolution/validation/hermes_runner.py
@@ -219,8 +219,14 @@ def parse_session_from_db(
     Modern hermes persists each session to SQLite. We read the most-recent
     session's messages and normalize them into the same message-dict shape the
     legacy JSON path produced, so the existing extractors work unchanged. The
-    ``messages.tool_calls`` column holds the OpenAI-nested
-    ``{"function": {"name", "arguments"}}`` list verbatim.
+    ``messages.tool_calls`` column holds the tool-call list verbatim — current
+    hermes writes the flat ``{"name", "arguments"}`` shape; the extractors also
+    accept the older OpenAI-nested ``{"function": {...}}`` shape.
+
+    A row whose ``tool_calls`` column won't parse as JSON aborts with an
+    ``error`` result (the task abstains) rather than being silently read as
+    "agent invoked no tools" — that would score a DB-format regression as a
+    behavioral failure and contaminate the fitness signal.
     """
     try:
         conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
@@ -268,8 +274,19 @@ def parse_session_from_db(
         if raw_calls:
             try:
                 parsed_calls = json.loads(raw_calls)
-            except (json.JSONDecodeError, TypeError):
-                parsed_calls = None
+            except (json.JSONDecodeError, TypeError) as exc:
+                logger.warning(
+                    "malformed tool_calls JSON in session %s at %s (%s); "
+                    "abstaining rather than scoring the task as a no-op",
+                    session["id"], db_path, exc,
+                )
+                return AgentRunResult(
+                    tool_calls_seq=[],
+                    final_text_tail="",
+                    duration_seconds=duration_seconds,
+                    error=f"malformed tool_calls JSON in session DB at {db_path}: {exc}",
+                    session_path=db_path,
+                )
         messages.append({
             "role": row["role"],
             "content": row["content"] or "",
@@ -358,6 +375,11 @@ def _extract_tool_calls_with_args(messages: list[dict]) -> list[dict]:
             try:
                 args = json.loads(args_raw) if args_raw else {}
             except (json.JSONDecodeError, TypeError):
+                logger.warning(
+                    "malformed arguments for tool call %r (%r); using {} — a "
+                    "content judge will see an empty-args call",
+                    name, str(args_raw)[:120],  # TypeError path: args_raw may be non-str, so don't slice it raw
+                )
                 args = {}
             if not isinstance(args, dict):
                 args = {}
diff --git a/evolution/validation/task.py b/evolution/validation/task.py
index b7833fa0..3b6f4bbe 100644
--- a/evolution/validation/task.py
+++ b/evolution/validation/task.py
@@ -34,7 +34,7 @@ class Task:
       set, takes precedence over the tool-call rule.
 
     ``expected_save_content`` is an optional rubric (not exact text)
-    describing what a good ``memory(action='save')`` would contain. It
+    describing what a good ``memory(action='add')`` would contain. It
     feeds the prompt-section compound verdict's Layer 2 content judge; it
     has no effect on the Layer 1 tool-call rule above.
     """
diff --git a/tests/prompts/test_evolve_prompt_section.py b/tests/prompts/test_evolve_prompt_section.py
index 62e85ea9..50c0efc9 100644
--- a/tests/prompts/test_evolve_prompt_section.py
+++ b/tests/prompts/test_evolve_prompt_section.py
@@ -7,8 +7,15 @@
 
 from click.testing import CliRunner
 
+import fcntl
+
+import pytest
+
 from evolution.prompts.evolve_prompt_section import (
+    _BACKUP_SUFFIX,
+    _LOCK_FILENAME,
     _make_layer2_factory,
+    _prompt_builder_guard,
     _section_text_from_candidate,
     _split_train_holdout,
     evolve_prompt_section,
@@ -114,6 +121,53 @@ def test_baseline_override_file_replaces_live_section(tmp_path):
     ).read_text()
 
 
+class TestPromptBuilderGuard:
+    def test_restores_bytes_even_on_exception(self, tmp_path):
+        target = tmp_path / "pb.py"
+        target.write_text("ORIGINAL = 1\n")
+        original = target.read_bytes()
+        with pytest.raises(RuntimeError, match="boom"):
+            with _prompt_builder_guard(target):
+                target.write_text("MUTATED = 2\n")
+                raise RuntimeError("boom")
+        assert target.read_bytes() == original
+        assert not target.with_suffix(target.suffix + _BACKUP_SUFFIX).exists()
+
+    def test_refuses_stale_backup(self, tmp_path):
+        target = tmp_path / "pb.py"
+        target.write_text("X = 1\n")
+        target.with_suffix(target.suffix + _BACKUP_SUFFIX).write_text("stale")
+        with pytest.raises(RuntimeError, match="[Ss]tale backup"):
+            with _prompt_builder_guard(target):
+                pass
+
+    def test_refuses_when_another_run_holds_the_lock(self, tmp_path):
+        target = tmp_path / "pb.py"
+        target.write_text("X = 1\n")
+        other = open(target.parent / _LOCK_FILENAME, "w")
+        fcntl.flock(other.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
+        try:
+            with pytest.raises(RuntimeError, match="holds"):
+                with _prompt_builder_guard(target):
+                    pass
+        finally:
+            fcntl.flock(other.fileno(), fcntl.LOCK_UN)
+            other.close()
+
+
+def test_rejects_single_task_suite(tmp_path):
+    repo = _fake_repo(tmp_path)
+    suite = tmp_path / "one.jsonl"
+    suite.write_text(json.dumps({
+        "task_id": "only", "user_message": "x", "expected_tools": ["memory"],
+    }) + "\n")
+    with pytest.raises(ValueError, match="at least 2"):
+        evolve_prompt_section(
+            section_name="MEMORY_GUIDANCE", hermes_repo=repo, tasks_path=suite,
+            dry_run=True, output_dir=tmp_path / "out",
+        )
+
+
 def test_cli_dry_run_exits_zero(tmp_path):
     repo = _fake_repo(tmp_path)
     suite = _suite(tmp_path)
diff --git a/tests/validation/test_hermes_runner.py b/tests/validation/test_hermes_runner.py
index 243d3dbf..3fb83f20 100644
--- a/tests/validation/test_hermes_runner.py
+++ b/tests/validation/test_hermes_runner.py
@@ -319,6 +319,44 @@ def test_picks_most_recent_session(self, tmp_path):
         result = parse_session_from_db(db, duration_seconds=1.0)
         assert result.tool_calls_seq == ["write_file"]
 
+    def test_malformed_tool_calls_column_abstains(self, tmp_path):
+        """A corrupt tool_calls column must abstain (error set), not read as
+        'agent invoked no tools' (which would score a hard behavioral fail)."""
+        db = tmp_path / "state.db"
+        _make_state_db(db, session_id="s1", model="m",
+                       messages=[{"role": "user", "content": "hi"}])
+        conn = sqlite3.connect(db)
+        conn.execute(
+            "INSERT INTO messages (session_id, role, content, tool_calls) VALUES (?,?,?,?)",
+            ("s1", "assistant", "", "{not-valid-json"),
+        )
+        conn.commit()
+        conn.close()
+        result = parse_session_from_db(db, duration_seconds=1.0)
+        assert result.error is not None
+        assert "malformed tool_calls" in result.error
+        assert result.tool_calls_seq == []
+
+    def test_corrupt_db_file_errors(self, tmp_path):
+        bad = tmp_path / "state.db"
+        bad.write_bytes(b"this is not a sqlite database at all")
+        result = parse_session_from_db(bad, duration_seconds=1.0)
+        assert result.error is not None
+        assert "could not" in result.error  # open or read, depending on sqlite
+
+    def test_missing_messages_table_errors(self, tmp_path):
+        db = tmp_path / "state.db"
+        conn = sqlite3.connect(db)
+        conn.executescript(
+            "CREATE TABLE sessions (id TEXT, model TEXT, started_at REAL);"
+            "INSERT INTO sessions VALUES ('s1', 'm', 1.0);"
+        )
+        conn.commit()
+        conn.close()
+        result = parse_session_from_db(db, duration_seconds=1.0)
+        assert result.error is not None
+        assert "could not read" in result.error
+
 
 class TestHermesAgentRunnerSubprocess:
     """The subprocess invocation layer: env + cwd + args plumbing."""

From 5658e7bb1df36ae4866b8780d2dd78aee5cd61db Mon Sep 17 00:00:00 2001
From: Justin Ramos <justin.ramos@gmail.com>
Date: Tue, 2 Jun 2026 07:43:12 -0600
Subject: [PATCH 23/23] refactor(prompts): narrow PromptSource Protocol to read
 + write
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

read/write are the only members the evolution driver exercises (the runtime
override seam moved to HermesPromptSectionInstaller). name and list_sections
had no production consumer, so they're no longer part of the shared contract —
list_sections + SectionDescriptor remain as concrete conveniences on
HermesPromptSource for a future --list-sections affordance. Every member of a
Protocol is a cost on every future implementer; this keeps the contract to
exactly what's shared.
---
 evolution/prompts/prompt_source.py  | 21 ++++++++++++---------
 tests/prompts/test_prompt_source.py | 13 ++-----------
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/evolution/prompts/prompt_source.py b/evolution/prompts/prompt_source.py
index 5bd5ca5b..0d056d92 100644
--- a/evolution/prompts/prompt_source.py
+++ b/evolution/prompts/prompt_source.py
@@ -1,9 +1,13 @@
-"""PromptSource Protocol — adapters that read, write, and enumerate named prompt sections.
+"""PromptSource Protocol — adapters that read and write named prompt sections.
 
 Phase 3 integrates via in-place splice-and-restore (see
 ``HermesPromptSectionInstaller``), so the runtime override seam lives in
-the installer, not here. A PromptSource only needs to read the baseline,
-persist an evolved value, and enumerate what's targetable.
+the installer, not here. The contract is deliberately just read + write: the
+driver reads the baseline and persists/splices an evolved value, and nothing
+more is shared across implementers. Enumeration (``list_sections`` →
+``SectionDescriptor``) is a concrete convenience on ``HermesPromptSource`` for
+a future ``--list-sections`` affordance, not part of the contract every
+adapter must satisfy.
 """
 
 from __future__ import annotations
@@ -31,9 +35,12 @@ class SectionDescriptor:
 
 @runtime_checkable
 class PromptSource(Protocol):
-    """Adapter contract for prompt-section evolution targets."""
+    """Adapter contract for prompt-section evolution targets: read + write.
 
-    name: str
+    Kept minimal on purpose — these are the only members the evolution driver
+    exercises. Concrete adapters may offer more (e.g. ``HermesPromptSource``
+    also enumerates sections), but those are not part of the shared contract.
+    """
 
     def read(self, section_name: str) -> str:
         """Return the canonical baseline text of the named section."""
@@ -47,7 +54,3 @@ def write(self, section_name: str, new_text: str) -> None:
         owns the backup/restore around the mutation).
         """
         ...
-
-    def list_sections(self) -> list[SectionDescriptor]:
-        """Enumerate all evolvable sections this source can target."""
-        ...
diff --git a/tests/prompts/test_prompt_source.py b/tests/prompts/test_prompt_source.py
index 41703567..89e3d3ff 100644
--- a/tests/prompts/test_prompt_source.py
+++ b/tests/prompts/test_prompt_source.py
@@ -22,20 +22,16 @@ def test_section_descriptor_is_frozen():
 
 
 def test_prompt_source_protocol_runtime_checkable():
-    """A concrete class implementing the three methods satisfies isinstance()."""
+    """read + write are the whole contract — a class with just those satisfies
+    isinstance(), with no need to enumerate or carry a name."""
 
     class StubSource:
-        name = "stub"
-
         def read(self, section_name: str) -> str:
             return "stub"
 
         def write(self, section_name: str, new_text: str) -> None:
             return None
 
-        def list_sections(self) -> list[SectionDescriptor]:
-            return []
-
     assert isinstance(StubSource(), PromptSource)
 
 
@@ -43,12 +39,7 @@ def test_prompt_source_protocol_rejects_incomplete():
     """Missing a required method => not a PromptSource."""
 
     class MissingWrite:
-        name = "incomplete"
-
         def read(self, section_name: str) -> str:
             return "x"
 
-        def list_sections(self) -> list[SectionDescriptor]:
-            return []
-
     assert not isinstance(MissingWrite(), PromptSource)