From 1a5714eea4bf689984799116f9304fd8840a48d1 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 31 May 2026 20:28:54 -0600 Subject: [PATCH 01/23] feat(prompts): add PromptSource protocol + SectionDescriptor --- evolution/prompts/__init__.py | 2 +- evolution/prompts/prompt_source.py | 53 ++++++++++++++++++++++++++++ tests/prompts/__init__.py | 0 tests/prompts/test_prompt_source.py | 54 +++++++++++++++++++++++++++++ 4 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 evolution/prompts/prompt_source.py create mode 100644 tests/prompts/__init__.py create mode 100644 tests/prompts/test_prompt_source.py diff --git a/evolution/prompts/__init__.py b/evolution/prompts/__init__.py index 85704c83..c342d5d0 100644 --- a/evolution/prompts/__init__.py +++ b/evolution/prompts/__init__.py @@ -1 +1 @@ -"""Phase placeholder: prompts evolution.""" +"""Phase 3: system prompt section evolution.""" diff --git a/evolution/prompts/prompt_source.py b/evolution/prompts/prompt_source.py new file mode 100644 index 00000000..5bd5ca5b --- /dev/null +++ b/evolution/prompts/prompt_source.py @@ -0,0 +1,53 @@ +"""PromptSource Protocol — adapters that read, write, and enumerate named prompt sections. + +Phase 3 integrates via in-place splice-and-restore (see +``HermesPromptSectionInstaller``), so the runtime override seam lives in +the installer, not here. A PromptSource only needs to read the baseline, +persist an evolved value, and enumerate what's targetable. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Protocol, runtime_checkable + + +@dataclass(frozen=True) +class SectionDescriptor: + """Metadata about an evolvable prompt section. + + ``applicability`` is informational at design time; it's not used for + runtime filtering in v1, but downstream joint-optimization work will + consume it (e.g., model-family-targeted sections only get evaluated + against that family). 
+ """ + + name: str + current_text: str + source_path: Path + applicability: dict[str, str] = field(default_factory=dict) + + +@runtime_checkable +class PromptSource(Protocol): + """Adapter contract for prompt-section evolution targets.""" + + name: str + + def read(self, section_name: str) -> str: + """Return the canonical baseline text of the named section.""" + ... + + def write(self, section_name: str, new_text: str) -> None: + """Persist evolved text to the canonical source. + + Used both at deploy time and as the splice primitive the + closed-loop installer drives during validation (the installer + owns the backup/restore around the mutation). + """ + ... + + def list_sections(self) -> list[SectionDescriptor]: + """Enumerate all evolvable sections this source can target.""" + ... diff --git a/tests/prompts/__init__.py b/tests/prompts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/prompts/test_prompt_source.py b/tests/prompts/test_prompt_source.py new file mode 100644 index 00000000..41703567 --- /dev/null +++ b/tests/prompts/test_prompt_source.py @@ -0,0 +1,54 @@ +"""Tests for the PromptSource protocol contract.""" +from __future__ import annotations + +import dataclasses +from pathlib import Path + +from evolution.prompts.prompt_source import PromptSource, SectionDescriptor + + +def test_section_descriptor_is_frozen(): + descriptor = SectionDescriptor( + name="MEMORY_GUIDANCE", + current_text="baseline text", + source_path=Path("/tmp/fake.py"), + ) + assert dataclasses.is_dataclass(descriptor) + try: + descriptor.name = "OTHER" + except dataclasses.FrozenInstanceError: + return + raise AssertionError("SectionDescriptor must be frozen") + + +def test_prompt_source_protocol_runtime_checkable(): + """A concrete class implementing the three methods satisfies isinstance().""" + + class StubSource: + name = "stub" + + def read(self, section_name: str) -> str: + return "stub" + + def write(self, section_name: str, new_text: str) -> 
None: + return None + + def list_sections(self) -> list[SectionDescriptor]: + return [] + + assert isinstance(StubSource(), PromptSource) + + +def test_prompt_source_protocol_rejects_incomplete(): + """Missing a required method => not a PromptSource.""" + + class MissingWrite: + name = "incomplete" + + def read(self, section_name: str) -> str: + return "x" + + def list_sections(self) -> list[SectionDescriptor]: + return [] + + assert not isinstance(MissingWrite(), PromptSource) From 2b857acfae4d8dc7e3aa9c0f27cb5bc5f0975d5b Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 31 May 2026 20:29:27 -0600 Subject: [PATCH 02/23] feat(prompts): HermesPromptSource AST-based read --- evolution/prompts/hermes_prompt_source.py | 64 +++++++++++++++++++++ tests/prompts/test_hermes_prompt_source.py | 66 ++++++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 evolution/prompts/hermes_prompt_source.py create mode 100644 tests/prompts/test_hermes_prompt_source.py diff --git a/evolution/prompts/hermes_prompt_source.py b/evolution/prompts/hermes_prompt_source.py new file mode 100644 index 00000000..1b4412c3 --- /dev/null +++ b/evolution/prompts/hermes_prompt_source.py @@ -0,0 +1,64 @@ +"""HermesPromptSource — read/write named string constants in Hermes prompt_builder.py. + +Walks ``agent/prompt_builder.py`` for top-level ``NAME = "..."`` (or +concatenated-string) assignments. v1 supports string-typed constants +only; dict-typed constants (like ``PLATFORM_HINTS``) raise KeyError on +read. 
+""" + +from __future__ import annotations + +import ast +import logging +from pathlib import Path + +from evolution.prompts.prompt_source import SectionDescriptor + +logger = logging.getLogger(__name__) + + +class HermesPromptSource: + """Read/write named string constants in Hermes prompt_builder.py.""" + + name = "hermes_prompt_source" + + def __init__(self, hermes_repo: Path) -> None: + self.hermes_repo = Path(hermes_repo) + self.prompt_builder_path = self.hermes_repo / "agent" / "prompt_builder.py" + if not self.prompt_builder_path.is_file(): + raise FileNotFoundError( + f"prompt_builder.py not found at {self.prompt_builder_path}" + ) + + def read(self, section_name: str) -> str: + constants = self._parse_string_constants() + if section_name not in constants: + raise KeyError( + f"section {section_name!r} not found in {self.prompt_builder_path} " + f"(v1 only supports top-level string-typed constants). " + f"Available: {sorted(constants)}" + ) + return constants[section_name][0] + + def _parse_string_constants(self) -> dict[str, tuple[str, ast.Constant]]: + """Return ``{name: (value, value_ast_node)}`` for every top-level + string-typed assignment in prompt_builder.py. + + Concatenated-string forms like ``X = ("a" "b" "c")`` are folded to + a single ``ast.Constant`` by the parser, so they read back as one + string. The AST node is retained so ``write`` can splice by byte + offset. 
+ """ + source = self.prompt_builder_path.read_text(encoding="utf-8") + tree = ast.parse(source, filename=str(self.prompt_builder_path)) + out: dict[str, tuple[str, ast.Constant]] = {} + for node in tree.body: + if not (isinstance(node, ast.Assign) and len(node.targets) == 1): + continue + target = node.targets[0] + if not isinstance(target, ast.Name): + continue + value = node.value + if isinstance(value, ast.Constant) and isinstance(value.value, str): + out[target.id] = (value.value, value) + return out diff --git a/tests/prompts/test_hermes_prompt_source.py b/tests/prompts/test_hermes_prompt_source.py new file mode 100644 index 00000000..aa1794ef --- /dev/null +++ b/tests/prompts/test_hermes_prompt_source.py @@ -0,0 +1,66 @@ +"""Tests for HermesPromptSource — AST-based read/write/list.""" +from __future__ import annotations + +import textwrap +from pathlib import Path + +import pytest + +from evolution.prompts.hermes_prompt_source import HermesPromptSource + + +@pytest.fixture +def fake_hermes_repo(tmp_path: Path) -> Path: + """A tmp hermes-agent-like checkout with a stub prompt_builder.py.""" + (tmp_path / "agent").mkdir() + pb = tmp_path / "agent" / "prompt_builder.py" + pb.write_text(textwrap.dedent('''\ + """Stub prompt_builder for tests.""" + import os + + MEMORY_GUIDANCE = ( + "You have persistent memory across sessions. " + "Save durable facts." + ) + + SKILLS_GUIDANCE = "After completing a complex task, save the approach." + + PLATFORM_HINTS = { + "cli": "You are a CLI AI Agent.", + } + + def _not_a_constant(): + return "ignored" + ''')) + return tmp_path + + +def test_read_concatenated_string_constant(fake_hermes_repo: Path): + source = HermesPromptSource(hermes_repo=fake_hermes_repo) + text = source.read("MEMORY_GUIDANCE") + assert "persistent memory" in text + assert "Save durable facts." 
in text + + +def test_read_simple_string_constant(fake_hermes_repo: Path): + source = HermesPromptSource(hermes_repo=fake_hermes_repo) + text = source.read("SKILLS_GUIDANCE") + assert text == "After completing a complex task, save the approach." + + +def test_read_skips_dict_constants(fake_hermes_repo: Path): + """PLATFORM_HINTS is a dict; v1 doesn't support dict-shape sections.""" + source = HermesPromptSource(hermes_repo=fake_hermes_repo) + with pytest.raises(KeyError, match="PLATFORM_HINTS"): + source.read("PLATFORM_HINTS") + + +def test_read_unknown_constant_raises(fake_hermes_repo: Path): + source = HermesPromptSource(hermes_repo=fake_hermes_repo) + with pytest.raises(KeyError, match="NONEXISTENT"): + source.read("NONEXISTENT") + + +def test_missing_prompt_builder_raises(tmp_path: Path): + with pytest.raises(FileNotFoundError, match="prompt_builder.py"): + HermesPromptSource(hermes_repo=tmp_path) From c9f48e7c4d62533c2530fd16c1a30902c58d0a9e Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 31 May 2026 20:30:05 -0600 Subject: [PATCH 03/23] feat(prompts): HermesPromptSource AST-based write --- evolution/prompts/hermes_prompt_source.py | 62 ++++++++++++++++++++++ tests/prompts/test_hermes_prompt_source.py | 29 ++++++++++ 2 files changed, 91 insertions(+) diff --git a/evolution/prompts/hermes_prompt_source.py b/evolution/prompts/hermes_prompt_source.py index 1b4412c3..bae944f3 100644 --- a/evolution/prompts/hermes_prompt_source.py +++ b/evolution/prompts/hermes_prompt_source.py @@ -10,6 +10,9 @@ import ast import logging +import os +import shutil +import tempfile from pathlib import Path from evolution.prompts.prompt_source import SectionDescriptor @@ -62,3 +65,62 @@ def _parse_string_constants(self) -> dict[str, tuple[str, ast.Constant]]: if isinstance(value, ast.Constant) and isinstance(value.value, str): out[target.id] = (value.value, value) return out + + def write(self, section_name: str, new_text: str) -> None: + """Splice ``new_text`` into the named 
constant in place. + + Uses ``repr()`` for the replacement literal so the new text + round-trips byte-equal regardless of embedded newlines, quotes, + or backslashes. Other constants are left verbatim. + + The write is atomic (tempfile + ``os.replace``) and guarded: the + new bytes must parse as Python before the original is replaced. + A botched splice (only possible if AST extraction were wrong) + raises and leaves ``prompt_builder.py`` untouched, rather than + leaving the user's Hermes unstartable. + """ + constants = self._parse_string_constants() + if section_name not in constants: + raise KeyError( + f"section {section_name!r} not found in {self.prompt_builder_path}" + ) + _, value_node = constants[section_name] + data = self.prompt_builder_path.read_bytes() + start_offset = _byte_offset(data, value_node.lineno, value_node.col_offset) + end_offset = _byte_offset( + data, value_node.end_lineno, value_node.end_col_offset + ) + replacement = repr(new_text).encode("utf-8") + new_bytes = data[:start_offset] + replacement + data[end_offset:] + + try: + ast.parse(new_bytes, filename=str(self.prompt_builder_path)) + except SyntaxError as exc: + raise RuntimeError( + f"Refusing to write {self.prompt_builder_path}: spliced output " + f"would not parse as Python ({exc}). Original file untouched." 
+ ) from exc + + self._atomic_write_bytes(self.prompt_builder_path, new_bytes) + + @staticmethod + def _atomic_write_bytes(path: Path, data: bytes) -> None: + fd, tmp_name = tempfile.mkstemp(dir=path.parent, suffix=path.suffix) + tmp_path = Path(tmp_name) + try: + with os.fdopen(fd, "wb") as fh: + fh.write(data) + shutil.copymode(path, tmp_path) + os.replace(tmp_path, path) + except BaseException: + tmp_path.unlink(missing_ok=True) + raise + + +def _byte_offset(data: bytes, lineno: int, col_offset: int) -> int: + """Convert an AST position (1-based line, 0-based byte column) to an + absolute byte offset into ``data``.""" + lines = data.splitlines(keepends=True) + if lineno < 1 or lineno > len(lines): + raise ValueError(f"lineno {lineno} out of range [1, {len(lines)}]") + return sum(len(line) for line in lines[: lineno - 1]) + col_offset diff --git a/tests/prompts/test_hermes_prompt_source.py b/tests/prompts/test_hermes_prompt_source.py index aa1794ef..e177c7ab 100644 --- a/tests/prompts/test_hermes_prompt_source.py +++ b/tests/prompts/test_hermes_prompt_source.py @@ -64,3 +64,32 @@ def test_read_unknown_constant_raises(fake_hermes_repo: Path): def test_missing_prompt_builder_raises(tmp_path: Path): with pytest.raises(FileNotFoundError, match="prompt_builder.py"): HermesPromptSource(hermes_repo=tmp_path) + + +def test_write_replaces_string_constant(fake_hermes_repo: Path): + source = HermesPromptSource(hermes_repo=fake_hermes_repo) + new_text = "Replacement guidance for memory." + source.write("MEMORY_GUIDANCE", new_text) + assert source.read("MEMORY_GUIDANCE") == new_text + # Confirm SKILLS_GUIDANCE was untouched. + assert source.read("SKILLS_GUIDANCE") == ( + "After completing a complex task, save the approach." + ) + + +def test_write_preserves_file_parseability(fake_hermes_repo: Path): + source = HermesPromptSource(hermes_repo=fake_hermes_repo) + # A value with newlines, quotes, and backslashes — repr() must + # produce a literal that round-trips byte-equal. 
+ tricky = 'line one\nline two with "quotes" and \\ backslash' + source.write("MEMORY_GUIDANCE", tricky) + assert source.read("MEMORY_GUIDANCE") == tricky + # File must still be valid Python. + import ast as _ast + _ast.parse((fake_hermes_repo / "agent" / "prompt_builder.py").read_text()) + + +def test_write_unknown_section_raises(fake_hermes_repo: Path): + source = HermesPromptSource(hermes_repo=fake_hermes_repo) + with pytest.raises(KeyError, match="NONEXISTENT"): + source.write("NONEXISTENT", "x") From 22fac1cd8a34176f238ec023e20d167ddeb95c21 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 31 May 2026 20:30:31 -0600 Subject: [PATCH 04/23] feat(prompts): HermesPromptSource section enumeration --- evolution/prompts/hermes_prompt_source.py | 11 +++++++++++ tests/prompts/test_hermes_prompt_source.py | 17 +++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/evolution/prompts/hermes_prompt_source.py b/evolution/prompts/hermes_prompt_source.py index bae944f3..b92935b2 100644 --- a/evolution/prompts/hermes_prompt_source.py +++ b/evolution/prompts/hermes_prompt_source.py @@ -43,6 +43,17 @@ def read(self, section_name: str) -> str: ) return constants[section_name][0] + def list_sections(self) -> list[SectionDescriptor]: + constants = self._parse_string_constants() + return [ + SectionDescriptor( + name=name, + current_text=text, + source_path=self.prompt_builder_path, + ) + for name, (text, _node) in sorted(constants.items()) + ] + def _parse_string_constants(self) -> dict[str, tuple[str, ast.Constant]]: """Return ``{name: (value, value_ast_node)}`` for every top-level string-typed assignment in prompt_builder.py. 
diff --git a/tests/prompts/test_hermes_prompt_source.py b/tests/prompts/test_hermes_prompt_source.py index e177c7ab..c631a435 100644 --- a/tests/prompts/test_hermes_prompt_source.py +++ b/tests/prompts/test_hermes_prompt_source.py @@ -93,3 +93,20 @@ def test_write_unknown_section_raises(fake_hermes_repo: Path): source = HermesPromptSource(hermes_repo=fake_hermes_repo) with pytest.raises(KeyError, match="NONEXISTENT"): source.write("NONEXISTENT", "x") + + +def test_list_sections_enumerates_string_constants(fake_hermes_repo: Path): + source = HermesPromptSource(hermes_repo=fake_hermes_repo) + sections = source.list_sections() + names = {s.name for s in sections} + assert "MEMORY_GUIDANCE" in names + assert "SKILLS_GUIDANCE" in names + assert "PLATFORM_HINTS" not in names # dict-typed → excluded + + +def test_list_sections_populates_descriptors(fake_hermes_repo: Path): + source = HermesPromptSource(hermes_repo=fake_hermes_repo) + by_name = {s.name: s for s in source.list_sections()} + skills = by_name["SKILLS_GUIDANCE"] + assert skills.current_text == "After completing a complex task, save the approach." + assert skills.source_path == fake_hermes_repo / "agent" / "prompt_builder.py" From 9f2f4a6727bf5e575fb3282a7fbf36e9e225a077 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 31 May 2026 20:31:19 -0600 Subject: [PATCH 05/23] feat(validation): extend Task with expected_save_content --- evolution/validation/task.py | 13 +++++++++++++ tests/validation/test_task.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/evolution/validation/task.py b/evolution/validation/task.py index 209c8b38..b7833fa0 100644 --- a/evolution/validation/task.py +++ b/evolution/validation/task.py @@ -32,6 +32,11 @@ class Task: the verdict is "did the agent's edits make the planted test pass" rather than "did the agent invoke the right tools." When set, takes precedence over the tool-call rule. 
+ + ``expected_save_content`` is an optional rubric (not exact text) + describing what a good ``memory(action='save')`` would contain. It + feeds the prompt-section compound verdict's Layer 2 content judge; it + has no effect on the Layer 1 tool-call rule above. """ task_id: str @@ -40,6 +45,7 @@ class Task: forbidden_tools: tuple[str, ...] = () fixture_setup: dict[str, str] = field(default_factory=dict) test_command: Optional[str] = None + expected_save_content: Optional[str] = None def render_message(self, fixture_dir: Path) -> str: """Substitute ``{fixture_dir}`` in the message with the resolved path. @@ -98,6 +104,12 @@ def _task_from_dict(obj: dict, *, source: str) -> Task: raise ValueError( f"{source}: test_command must be a string (got {type(test_command).__name__})" ) + expected_save_content = obj.get("expected_save_content") + if expected_save_content is not None and not isinstance(expected_save_content, str): + raise ValueError( + f"{source}: expected_save_content must be a string " + f"(got {type(expected_save_content).__name__})" + ) return Task( task_id=obj["task_id"], user_message=obj["user_message"], @@ -105,4 +117,5 @@ def _task_from_dict(obj: dict, *, source: str) -> Task: forbidden_tools=tuple(obj.get("forbidden_tools") or ()), fixture_setup=dict(fixture_setup), test_command=test_command, + expected_save_content=expected_save_content, ) diff --git a/tests/validation/test_task.py b/tests/validation/test_task.py index 4c0b0908..ae658bda 100644 --- a/tests/validation/test_task.py +++ b/tests/validation/test_task.py @@ -155,3 +155,35 @@ def test_test_command_non_string_raises(self, tmp_path): }]) with pytest.raises(ValueError, match="test_command must be a string"): TaskSuite.from_jsonl(p) + + +class TestExpectedSaveContent: + def _write_jsonl(self, path: Path, rows: list[dict]) -> None: + path.write_text("\n".join(json.dumps(r) for r in rows) + "\n") + + def test_round_trips_from_jsonl(self, tmp_path): + p = tmp_path / "suite.jsonl" + 
self._write_jsonl(p, [{ + "task_id": "save-pref-001", + "user_message": "I prefer uv over pip for Python projects.", + "expected_tools": ["memory"], + "expected_save_content": "user prefers uv over pip", + }]) + suite = TaskSuite.from_jsonl(p) + assert suite.tasks[0].expected_save_content == "user prefers uv over pip" + + def test_defaults_to_none(self, tmp_path): + p = tmp_path / "suite.jsonl" + self._write_jsonl(p, [{ + "task_id": "t1", "user_message": "hi", "expected_tools": ["memory"], + }]) + suite = TaskSuite.from_jsonl(p) + assert suite.tasks[0].expected_save_content is None + + def test_non_string_raises(self, tmp_path): + p = tmp_path / "suite.jsonl" + self._write_jsonl(p, [{ + "task_id": "t", "user_message": "m", "expected_save_content": 42, + }]) + with pytest.raises(ValueError, match="expected_save_content must be a string"): + TaskSuite.from_jsonl(p) From b217e8b0f7f52a03ec619ff9bd327cf9041adc6f Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 31 May 2026 20:32:49 -0600 Subject: [PATCH 06/23] feat(validation): capture tool call args in session parser --- evolution/validation/agent_runner.py | 7 ++++ evolution/validation/hermes_runner.py | 48 ++++++++++++++++++++++++ tests/validation/test_hermes_runner.py | 52 ++++++++++++++++++++++++++ 3 files changed, 107 insertions(+) diff --git a/evolution/validation/agent_runner.py b/evolution/validation/agent_runner.py index dc4500d4..df9a4d7c 100644 --- a/evolution/validation/agent_runner.py +++ b/evolution/validation/agent_runner.py @@ -21,6 +21,12 @@ class AgentRunResult: tool_call dicts) the agent invoked during the session. The validator only needs names for the expected / forbidden membership tests. + ``tool_calls_with_args`` carries the same calls in order as + ``{"name", "arguments"}`` dicts (arguments parsed from the + LLM-emitted JSON). The compound-verdict Layer 2 judge needs the + argument payloads — e.g. the content of a ``memory(action='save')`` + call — which ``tool_calls_seq`` discards. 
+ ``error`` is set when the runner itself failed to drive the agent (subprocess timeout, no session JSON written, parse failure). It's distinct from "agent invoked a tool that failed" — that's still a @@ -35,6 +41,7 @@ class AgentRunResult: model_name: Optional[str] = None error: Optional[str] = None session_path: Optional[Path] = None + tool_calls_with_args: list[dict] = field(default_factory=list) @dataclass(frozen=True) diff --git a/evolution/validation/hermes_runner.py b/evolution/validation/hermes_runner.py index f93e118c..82ddcde0 100644 --- a/evolution/validation/hermes_runner.py +++ b/evolution/validation/hermes_runner.py @@ -210,6 +210,7 @@ def parse_session_result( messages = data.get("messages") or [] tool_calls_seq = _extract_tool_call_names(messages) + tool_calls_with_args = _extract_tool_calls_with_args(messages) final_text_tail = _extract_final_text_tail(messages) model_name = data.get("model") @@ -219,6 +220,7 @@ def parse_session_result( duration_seconds=duration_seconds, model_name=model_name, session_path=session_path, + tool_calls_with_args=tool_calls_with_args, ) @@ -256,6 +258,52 @@ def _call_name(call: dict) -> Optional[str]: return str(flat) if flat else None +def _extract_tool_calls_with_args(messages: list[dict]) -> list[dict]: + """Return ``[{name, arguments}, ...]`` for each assistant tool call. + + Arguments are parsed from the LLM-emitted JSON string. Malformed or + non-object arguments fall back to ``{}`` rather than dropping the + call — the Layer 2 judge can still treat "memory was invoked with + empty args" as a behavior signal. Handles both OpenAI-nested and flat + tool_call shapes, mirroring ``_extract_tool_call_names``. 
+ """ + out: list[dict] = [] + for msg in messages: + if msg.get("role") != "assistant": + continue + for call in msg.get("tool_calls") or []: + if not isinstance(call, dict): + continue + name = _call_name(call) + if not name: + continue + args_raw = _call_arguments_raw(call) + try: + args = json.loads(args_raw) if args_raw else {} + except (json.JSONDecodeError, TypeError): + args = {} + if not isinstance(args, dict): + args = {} + out.append({"name": name, "arguments": args}) + return out + + +def _call_arguments_raw(call: dict) -> str: + fn = call.get("function") + if isinstance(fn, dict): + nested = fn.get("arguments") + if isinstance(nested, str): + return nested + if isinstance(nested, dict): + return json.dumps(nested) + flat = call.get("arguments") + if isinstance(flat, str): + return flat + if isinstance(flat, dict): + return json.dumps(flat) + return "" + + def _extract_final_text_tail(messages: list[dict]) -> str: """Last 4096 chars of the last assistant message with text content.""" for msg in reversed(messages): diff --git a/tests/validation/test_hermes_runner.py b/tests/validation/test_hermes_runner.py index d970c29e..da1b2fea 100644 --- a/tests/validation/test_hermes_runner.py +++ b/tests/validation/test_hermes_runner.py @@ -173,6 +173,58 @@ def test_call_missing_name_skipped(self, tmp_path): assert result.tool_calls_seq == ["patch"] +class TestParseToolCallArgs: + def test_captures_tool_call_args(self, tmp_path): + """Sessions with tool calls must surface name AND parsed args.""" + p = tmp_path / "session.json" + _write_session(p, [ + {"role": "user", "content": "save a fact"}, + {"role": "assistant", "content": "", "tool_calls": [ + {"function": { + "name": "memory", + "arguments": json.dumps({ + "action": "save", + "content": "user prefers terse responses", + }), + }} + ]}, + {"role": "tool", "content": "ok"}, + {"role": "assistant", "content": "Saved."}, + ]) + result = parse_session_result(p, duration_seconds=1.0) + assert result.tool_calls_seq 
== ["memory"] + assert len(result.tool_calls_with_args) == 1 + call = result.tool_calls_with_args[0] + assert call["name"] == "memory" + assert call["arguments"]["action"] == "save" + assert call["arguments"]["content"] == "user prefers terse responses" + + def test_handles_malformed_args(self, tmp_path): + """Malformed tool-call arguments JSON must not crash — fall back to {}.""" + p = tmp_path / "session.json" + _write_session(p, [ + {"role": "assistant", "tool_calls": [ + {"function": {"name": "memory", "arguments": "{not-json"}} + ]}, + ]) + result = parse_session_result(p, duration_seconds=1.0) + assert result.tool_calls_seq == ["memory"] + assert result.tool_calls_with_args == [{"name": "memory", "arguments": {}}] + + def test_handles_flat_dict_args(self, tmp_path): + """Flat tool_call shape with an already-parsed dict argument.""" + p = tmp_path / "session.json" + _write_session(p, [ + {"role": "assistant", "tool_calls": [ + {"name": "memory", "arguments": {"action": "delete", "key": "x"}} + ]}, + ]) + result = parse_session_result(p, duration_seconds=1.0) + assert result.tool_calls_with_args == [ + {"name": "memory", "arguments": {"action": "delete", "key": "x"}} + ] + + class TestHermesAgentRunnerSubprocess: """The subprocess invocation layer: env + cwd + args plumbing.""" From 3385837c45535dbc75c68c94fe3511d6daaa7c40 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 31 May 2026 20:33:37 -0600 Subject: [PATCH 07/23] feat(validation): HermesPromptSectionInstaller --- evolution/validation/artifact_installer.py | 32 ++++++++++ tests/validation/test_artifact_installer.py | 66 +++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/evolution/validation/artifact_installer.py b/evolution/validation/artifact_installer.py index 44780ec2..2e56df3e 100644 --- a/evolution/validation/artifact_installer.py +++ b/evolution/validation/artifact_installer.py @@ -22,6 +22,7 @@ from pathlib import Path from typing import Optional, Protocol +from 
evolution.prompts.hermes_prompt_source import HermesPromptSource from evolution.tools.hermes_source import HermesToolSource from evolution.tools.tool_source import ToolManifest @@ -139,6 +140,37 @@ def _extract_description(self, artifact_source: Path) -> str: return manifest.find_tool(self.tool_name).description +class HermesPromptSectionInstaller: + """Splice an evolved prompt section into Hermes ``agent/prompt_builder.py``. + + The artifact source is a plain-text file holding the candidate + section body. ``install`` reads that text and asks ``HermesPromptSource`` + to splice it into the named string constant in place; the validator's + backup/flock/sha-drift machinery (shared with the tool-description + path) guards the live checkout and restores it afterward. + + Constraint: the target section must be a top-level string constant + (the same shape ``HermesPromptSource`` reads). Dict-typed sections + like ``PLATFORM_HINTS`` are not installable. + """ + + def __init__(self, hermes_repo: Path, section_name: str) -> None: + self.hermes_repo = Path(hermes_repo) + self.section_name = section_name + self._source = HermesPromptSource(self.hermes_repo) + self.target_path = self._source.prompt_builder_path + + def install(self, artifact_source: Path) -> str: + """Splice the candidate section text from ``artifact_source`` into the + live ``prompt_builder.py`` and return the post-install sha256.""" + new_text = artifact_source.read_text(encoding="utf-8") + self._source.write(self.section_name, new_text) + return sha256_of(self.target_path) + + def verify_backup(self, backup_path: Path) -> None: + verify_python_parses(backup_path) + + class SkillFileInstaller: """Write an evolved SKILL.md into a writable workdir for closed-loop validation. 
diff --git a/tests/validation/test_artifact_installer.py b/tests/validation/test_artifact_installer.py index 95b1cfbd..d6d6ea4c 100644 --- a/tests/validation/test_artifact_installer.py +++ b/tests/validation/test_artifact_installer.py @@ -261,3 +261,69 @@ def test_rejects_invalid_utf8(self, baseline_skill, tmp_path): backup.write_bytes(b"\xff\xfe\x00\x00invalid") with pytest.raises(ValueError, match="not valid UTF-8"): installer.verify_backup(backup) + + +# ---- HermesPromptSectionInstaller ---- + +import textwrap + +from evolution.validation.artifact_installer import ( + HermesPromptSectionInstaller, + sha256_of, +) + + +def _fake_hermes_repo(tmp_path: Path) -> Path: + (tmp_path / "agent").mkdir() + (tmp_path / "agent" / "prompt_builder.py").write_text(textwrap.dedent('''\ + """Stub prompt_builder.""" + + MEMORY_GUIDANCE = ( + "You have persistent memory. " + "Save durable facts." + ) + + SKILLS_GUIDANCE = "After a complex task, save the approach." + ''')) + return tmp_path + + +class TestHermesPromptSectionInstaller: + def test_target_path_is_prompt_builder(self, tmp_path): + repo = _fake_hermes_repo(tmp_path) + installer = HermesPromptSectionInstaller(repo, "MEMORY_GUIDANCE") + assert installer.target_path == repo / "agent" / "prompt_builder.py" + + def test_install_splices_candidate_and_returns_sha(self, tmp_path): + repo = _fake_hermes_repo(tmp_path) + installer = HermesPromptSectionInstaller(repo, "MEMORY_GUIDANCE") + candidate = tmp_path / "candidate.txt" + candidate.write_text("EVOLVED memory guidance body.") + + returned_sha = installer.install(candidate) + + pb = repo / "agent" / "prompt_builder.py" + assert returned_sha == sha256_of(pb) + # The new text is live; the sibling section is untouched. + from evolution.prompts.hermes_prompt_source import HermesPromptSource + src = HermesPromptSource(repo) + assert src.read("MEMORY_GUIDANCE") == "EVOLVED memory guidance body." + assert src.read("SKILLS_GUIDANCE") == "After a complex task, save the approach." 
+ # File still parses. + import ast + ast.parse(pb.read_text()) + + def test_verify_backup_rejects_non_python(self, tmp_path): + repo = _fake_hermes_repo(tmp_path) + installer = HermesPromptSectionInstaller(repo, "MEMORY_GUIDANCE") + bad = tmp_path / "bad.cl_backup" + bad.write_text("def broken(:\n") + with pytest.raises(SyntaxError): + installer.verify_backup(bad) + + def test_verify_backup_accepts_valid_python(self, tmp_path): + repo = _fake_hermes_repo(tmp_path) + installer = HermesPromptSectionInstaller(repo, "MEMORY_GUIDANCE") + good = tmp_path / "good.cl_backup" + good.write_text("X = 'ok'\n") + installer.verify_backup(good) # must not raise From 552a408008ad50beb2de5cadf5d7358d96dcb27d Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 31 May 2026 20:35:19 -0600 Subject: [PATCH 08/23] feat(validation): optional Layer 2 judge in ClosedLoopValidator + score_task --- evolution/validation/report.py | 22 ++++++++- evolution/validation/validator.py | 18 +++++++- tests/validation/test_report.py | 71 ++++++++++++++++++++++++++++++ tests/validation/test_validator.py | 49 +++++++++++++++++++++ 4 files changed, 157 insertions(+), 3 deletions(-) diff --git a/evolution/validation/report.py b/evolution/validation/report.py index 94de672d..707cedff 100644 --- a/evolution/validation/report.py +++ b/evolution/validation/report.py @@ -11,7 +11,7 @@ import subprocess from dataclasses import asdict, dataclass, field from pathlib import Path -from typing import Any, Optional +from typing import Any, Callable, Optional from rich.console import Console from rich.table import Table @@ -55,6 +55,8 @@ def score_task( test_command: Optional[str] = None, fixture_dir: Optional[Path] = None, test_command_timeout_seconds: float = 60.0, + layer2_judge_fn: Optional[Callable[[list[dict]], float]] = None, + layer2_threshold: float = 0.7, ) -> tuple[bool, bool]: """Return (passed, abstained). @@ -68,6 +70,16 @@ def score_task( in this mode. 
Command failure modes (nonzero exit, timeout, FileNotFoundError) all map to ``(False, False)`` — "the test did not pass," which is the meaningful verdict regardless of cause. + + Layer 2 (compound verdict, prompt-section suites): when + ``layer2_judge_fn`` is provided, a task passes only if Layer 1 + (trigger membership) passes AND the judge returns a score + ``>= layer2_threshold``. The judge receives the subset of + ``run.tool_calls_with_args`` whose name is ``memory`` (each item the + call's ``arguments`` dict). Layer 2 is short-circuited when Layer 1 + fails — the judge is never called, so no LLM cost is spent on a task + that already failed the trigger test. ``test_command`` mode ignores + Layer 2. """ if run.error is not None: return False, True @@ -82,6 +94,14 @@ def score_task( return False, False if expected_tools and not (invoked & set(expected_tools)): return False, False + if layer2_judge_fn is not None: + memory_calls = [ + c["arguments"] + for c in run.tool_calls_with_args + if c.get("name") == "memory" + ] + if layer2_judge_fn(memory_calls) < layer2_threshold: + return False, False return True, False diff --git a/evolution/validation/validator.py b/evolution/validation/validator.py index 936692b5..2f788c93 100644 --- a/evolution/validation/validator.py +++ b/evolution/validation/validator.py @@ -22,7 +22,7 @@ from contextlib import contextmanager from dataclasses import dataclass from pathlib import Path -from typing import Iterator, Optional +from typing import Callable, Iterator, Optional from evolution.validation.agent_runner import AgentRunner, TaskRunContext from evolution.validation.artifact_installer import ( @@ -80,9 +80,21 @@ class ClosedLoopValidator: times and aggregate. 
""" - def __init__(self, installer: ArtifactInstaller, runner: AgentRunner) -> None: + def __init__( + self, + installer: ArtifactInstaller, + runner: AgentRunner, + *, + layer2_judge_fn: Optional[Callable[[list[dict]], float]] = None, + layer2_threshold: float = 0.7, + ) -> None: self.installer = installer self.runner = runner + # Optional compound-verdict Layer 2 (prompt-section suites). When + # unset, scoring is Layer 1 only — the tool-description path is + # unchanged. + self.layer2_judge_fn = layer2_judge_fn + self.layer2_threshold = layer2_threshold def validate(self, inputs: ValidationInputs) -> ValidationReport: target = self.installer.target_path @@ -149,6 +161,8 @@ def _run_one_task(self, task: Task) -> TaskResult: run=run, test_command=task.test_command, fixture_dir=fixture_dir, + layer2_judge_fn=self.layer2_judge_fn, + layer2_threshold=self.layer2_threshold, ) return TaskResult( task_id=task.task_id, diff --git a/tests/validation/test_report.py b/tests/validation/test_report.py index ec8eaa8b..98b5d9ab 100644 --- a/tests/validation/test_report.py +++ b/tests/validation/test_report.py @@ -68,6 +68,77 @@ def test_error_marks_abstention(self): assert abstained +class TestScoreTaskLayer2: + """Compound verdict: Layer 1 (trigger) + optional Layer 2 (content judge).""" + + def _save_run(self, content: str = "good") -> AgentRunResult: + return AgentRunResult( + tool_calls_seq=["memory"], final_text_tail="", duration_seconds=0.0, + tool_calls_with_args=[ + {"name": "memory", "arguments": {"action": "save", "content": content}} + ], + ) + + def test_no_judge_is_layer1_only(self): + passed, abstained = score_task( + expected_tools=("memory",), forbidden_tools=(), run=self._save_run(), + ) + assert passed and not abstained + + def test_passes_when_both_layers_ok(self): + passed, abstained = score_task( + expected_tools=("memory",), forbidden_tools=(), run=self._save_run(), + layer2_judge_fn=lambda calls: 0.9, layer2_threshold=0.7, + ) + assert passed and not 
abstained + + def test_fails_when_layer2_below_threshold(self): + passed, abstained = score_task( + expected_tools=("memory",), forbidden_tools=(), run=self._save_run("bad"), + layer2_judge_fn=lambda calls: 0.5, layer2_threshold=0.7, + ) + assert not passed and not abstained + + def test_layer1_failure_short_circuits_judge(self): + """Layer 1 fail => judge never called (no LLM cost on a failed trigger).""" + run = AgentRunResult( + tool_calls_seq=[], final_text_tail="", duration_seconds=0.0, + tool_calls_with_args=[], + ) + calls_seen = [] + + def judge_fn(memory_calls): + calls_seen.append(memory_calls) + return 1.0 + + passed, abstained = score_task( + expected_tools=("memory",), forbidden_tools=(), run=run, + layer2_judge_fn=judge_fn, layer2_threshold=0.7, + ) + assert not passed + assert calls_seen == [] + + def test_judge_receives_only_memory_call_args(self): + run = AgentRunResult( + tool_calls_seq=["read_file", "memory"], final_text_tail="", duration_seconds=0.0, + tool_calls_with_args=[ + {"name": "read_file", "arguments": {"path": "x"}}, + {"name": "memory", "arguments": {"action": "save", "content": "c"}}, + ], + ) + received = [] + + def judge_fn(memory_calls): + received.append(memory_calls) + return 1.0 + + score_task( + expected_tools=("memory",), forbidden_tools=(), run=run, + layer2_judge_fn=judge_fn, layer2_threshold=0.7, + ) + assert received == [[{"action": "save", "content": "c"}]] + + class TestScoreTaskTestCommandMode: """When ``test_command`` is set on a task, the verdict is exit-code-driven, not tool-call-driven. 
Used by skill-side suites (e.g., planted-bug: diff --git a/tests/validation/test_validator.py b/tests/validation/test_validator.py index 1e8417eb..3326c7a1 100644 --- a/tests/validation/test_validator.py +++ b/tests/validation/test_validator.py @@ -54,6 +54,55 @@ def _write_suite(tmp_path: Path, tasks: list[dict]) -> TaskSuite: return TaskSuite.from_jsonl(p) +class TestClosedLoopValidatorLayer2: + def test_layer2_judge_threaded_into_scoring(self, tmp_path): + """A configured Layer 2 judge runs per scored task and can fail a + task whose Layer 1 trigger passed.""" + target = tmp_path / "prompt_builder.py" + target.write_text("MEMORY_GUIDANCE = 'orig'\n") + baseline = tmp_path / "baseline.txt" + baseline.write_text("baseline body") + evolved = tmp_path / "evolved.txt" + evolved.write_text("evolved body") + + suite = _write_suite(tmp_path, [ + {"task_id": "t1", "user_message": "save", "expected_tools": ["memory"]}, + ]) + + class _MemoryRunner: + def __init__(self, target_path): + self.target_path = target_path + + def run(self, ctx): + return AgentRunResult( + tool_calls_seq=["memory"], final_text_tail="ok", + duration_seconds=0.1, model_name="test-model", + tool_calls_with_args=[ + {"name": "memory", "arguments": {"action": "save", "content": "x"}} + ], + ) + + judged = [] + + def judge_fn(memory_calls): + judged.append(memory_calls) + return 0.2 # below threshold → Layer 2 fails the task + + validator = ClosedLoopValidator( + _StubInstaller(target), _MemoryRunner(target), + layer2_judge_fn=judge_fn, layer2_threshold=0.7, + ) + report = validator.validate(ValidationInputs( + tool_name="MEMORY_GUIDANCE", suite=suite, + baseline_artifact=baseline, evolved_artifact=evolved, + )) + # Judge invoked once per phase (baseline + evolved) on the one task. + assert len(judged) == 2 + # Both phases fail Layer 2 → 0 pass rate, no regression decision. 
+ assert report.baseline.pass_rate == 0.0 + assert report.evolved.pass_rate == 0.0 + + class TestClosedLoopValidatorHappyPath: def test_pass_when_evolved_strictly_improves(self, tmp_path): target = tmp_path / "tool.py" From 8435864354d4275ab73128f4e77a65347c899659 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 31 May 2026 20:36:23 -0600 Subject: [PATCH 09/23] feat(prompts): SaveCallJudge signature + scorer --- evolution/prompts/prompt_judge.py | 107 +++++++++++++++++++++++++++++ tests/prompts/test_prompt_judge.py | 61 ++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 evolution/prompts/prompt_judge.py create mode 100644 tests/prompts/test_prompt_judge.py diff --git a/evolution/prompts/prompt_judge.py b/evolution/prompts/prompt_judge.py new file mode 100644 index 00000000..6e693867 --- /dev/null +++ b/evolution/prompts/prompt_judge.py @@ -0,0 +1,107 @@ +"""LLM-as-judge for memory-save calls — scores args against MEMORY_GUIDANCE rules. + +Layer 2 of the compound verdict. Layer 1 (trigger membership) is handled +by ``score_task``'s existing expected_tools / forbidden_tools logic. +""" + +from __future__ import annotations + +import logging +from typing import Any + +import dspy + +from evolution.core.config import EvolutionConfig +from evolution.core.fitness import _clamp_to_unit + +logger = logging.getLogger(__name__) + +MAX_JUDGED_CALLS_PER_TASK = 5 +"""Cap on how many save calls per task the judge will score. Excess calls +beyond the cap score 0 each — bounds cost on pathological cases where the +agent saves on every turn.""" + + +class SaveCallSignature(dspy.Signature): + """Score a memory-save call against MEMORY_GUIDANCE's rules. + + Output ``quality`` (0.0-1.0): how well ``saved_content`` follows the + rules — durable (not stale in a week), declarative phrasing (not + imperative), focused on facts that prevent future correction, and NOT + task progress, PR numbers, or completed-work logs. 
+ """ + + task: str = dspy.InputField(desc="The user task that prompted the save") + expected_content: str = dspy.InputField( + desc="A rubric for what the saved content should resemble (not exact text)" + ) + saved_content: str = dspy.InputField(desc="The content the agent actually saved") + quality: str = dspy.OutputField( + desc="0.0-1.0 quality score per MEMORY_GUIDANCE rules" + ) + feedback: str = dspy.OutputField( + desc="One-sentence diagnosis of any rule violation; empty if quality is 1.0" + ) + + +class SaveCallJudge: + """LLM scorer for individual memory-save calls.""" + + def __init__(self, config: EvolutionConfig): + self.config = config + self.judge = dspy.ChainOfThought(SaveCallSignature) + + def score(self, *, task: str, expected_content: str, saved_content: str) -> float: + _lm = self.config.get_lm("eval") + lm = dspy.LM( + _lm.model, + **_lm.lm_kwargs, + temperature=0.0, + max_tokens=1000, + request_timeout=60, + num_retries=5, + ) + with dspy.context(lm=lm): + result = self.judge( + task=task, + expected_content=expected_content, + saved_content=saved_content, + ) + return _clamp_to_unit(result.quality) + + +def judge_save_calls( + *, + judge: SaveCallJudge | None, + calls: list[dict[str, Any]], + expected_content: str | None, + task_text: str = "", +) -> float: + """Aggregate the Layer 2 score across a task's memory-save calls. + + ``calls`` is the subset of ``tool_calls_with_args`` whose name is + ``memory`` — each item the call's ``arguments`` dict. Only + ``action == 'save'`` calls are judged. + + Returns 1.0 when no save calls were made (Layer 1 catches the + "should-have-saved-but-didn't" failure; Layer 2 only scores what + actually happened) and also when no judge/rubric is configured. 
+ """ + save_calls = [c for c in calls if c.get("action") == "save"] + if not save_calls: + return 1.0 + if judge is None or expected_content is None: + return 1.0 + + judged = save_calls[:MAX_JUDGED_CALLS_PER_TASK] + unjudged_count = max(0, len(save_calls) - MAX_JUDGED_CALLS_PER_TASK) + + scores: list[float] = [] + for call in judged: + scores.append(judge.score( + task=task_text, + expected_content=expected_content, + saved_content=str(call.get("content", "")), + )) + scores.extend([0.0] * unjudged_count) + return sum(scores) / len(scores) diff --git a/tests/prompts/test_prompt_judge.py b/tests/prompts/test_prompt_judge.py new file mode 100644 index 00000000..49ad13aa --- /dev/null +++ b/tests/prompts/test_prompt_judge.py @@ -0,0 +1,61 @@ +"""Tests for the SaveCallJudge — scores memory-save args against MEMORY_GUIDANCE rules.""" +from __future__ import annotations + +from unittest.mock import MagicMock + +import pytest + +from evolution.prompts.prompt_judge import SaveCallJudge, judge_save_calls + + +def test_no_save_calls_yields_default(): + """No save calls at all → score 1.0 (vacuously correct). Layer 1 catches + 'should have saved but didn't'; Layer 2 only scores content of calls made.""" + assert judge_save_calls(judge=None, calls=[], expected_content=None) == 1.0 + + +def test_invokes_judge_per_call_and_means(): + fake_judge = MagicMock(spec=SaveCallJudge) + fake_judge.score.side_effect = [0.8, 0.6] + calls = [ + {"action": "save", "content": "user prefers concise responses"}, + {"action": "save", "content": "completed phase 3"}, + ] + score = judge_save_calls( + judge=fake_judge, calls=calls, + expected_content="user preference about response style", + ) + assert score == pytest.approx(0.7) + assert fake_judge.score.call_count == 2 + + +def test_caps_at_five_calls(): + """Pathological: agent saves on every turn. 
Judge at most 5; excess score 0.""" + fake_judge = MagicMock(spec=SaveCallJudge) + fake_judge.score.return_value = 1.0 + calls = [{"action": "save", "content": f"item {i}"} for i in range(10)] + score = judge_save_calls(judge=fake_judge, calls=calls, expected_content="any") + # 5 scored 1.0, 5 unjudged scored 0 → mean 0.5 + assert score == pytest.approx(0.5) + assert fake_judge.score.call_count == 5 + + +def test_filters_non_save_actions(): + fake_judge = MagicMock(spec=SaveCallJudge) + fake_judge.score.return_value = 1.0 + calls = [ + {"action": "delete", "key": "x"}, + {"action": "save", "content": "real save"}, + ] + score = judge_save_calls(judge=fake_judge, calls=calls, expected_content="any") + assert score == pytest.approx(1.0) + assert fake_judge.score.call_count == 1 + + +def test_none_judge_or_expected_is_vacuous_pass(): + """A save call exists but no judge/rubric configured → don't penalize.""" + calls = [{"action": "save", "content": "x"}] + assert judge_save_calls(judge=None, calls=calls, expected_content="r") == 1.0 + fake = MagicMock(spec=SaveCallJudge) + assert judge_save_calls(judge=fake, calls=calls, expected_content=None) == 1.0 + fake.score.assert_not_called() From 2365031189f28cd2649e22d3f972830c852d5d18 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 31 May 2026 20:37:38 -0600 Subject: [PATCH 10/23] feat(prompts): PromptModule DSPy wrapper --- evolution/prompts/prompt_module.py | 113 ++++++++++++++++++++++++++++ tests/prompts/test_prompt_module.py | 44 +++++++++++ 2 files changed, 157 insertions(+) create mode 100644 evolution/prompts/prompt_module.py create mode 100644 tests/prompts/test_prompt_module.py diff --git a/evolution/prompts/prompt_module.py b/evolution/prompts/prompt_module.py new file mode 100644 index 00000000..ad8f3487 --- /dev/null +++ b/evolution/prompts/prompt_module.py @@ -0,0 +1,113 @@ +"""PromptModule — DSPy module wrapping a prompt-section candidate. 
+ +Unlike ``ToolModule``, the predictor here is a passthrough: there is no +cheap "select a tool from the manifest" classification GEPA can score +without a real agent. Every meaningful eval requires a Hermes subprocess +(via closed-loop). The predictor exists only to give GEPA a place to hang +the candidate text via ``signature.instructions`` — GEPA mutates the +instructions, the framework extracts the candidate via ``section_text``, +and the closed-loop scorer runs it against the real agent. + +Sentinel markers wrap the candidate region so ``section_text`` reads it +back unambiguously after GEPA's edits. + +DO NOT "simplify" by dropping the predictor wrapper. GEPA discovers +optimization targets via ``dspy.Module.named_predictors()``, which only +returns objects with the predictor interface. A bare module with no +predictor child has nothing for GEPA to mutate. +""" + +from __future__ import annotations + +from typing import Optional + +import dspy + + +class SentinelParseError(ValueError): + """The candidate sentinels are missing, duplicated, or malformed.""" + + +def _open_sentinel(section_name: str) -> str: + return f"[[BEGIN_CANDIDATE:{section_name}]]" + + +def _close_sentinel(section_name: str) -> str: + return f"[[END_CANDIDATE:{section_name}]]" + + +def _render_instructions(section_name: str, candidate_text: str) -> str: + return ( + f"The following is a candidate for the {section_name} section of an " + f"agent's system prompt. 
Iteration mutates only the text between the " + f"sentinel markers below.\n\n" + f"{_open_sentinel(section_name)}{candidate_text}{_close_sentinel(section_name)}" + ) + + +def _extract_from_sentinels(instructions: str, section_name: str) -> str: + open_marker = _open_sentinel(section_name) + close_marker = _close_sentinel(section_name) + open_count = instructions.count(open_marker) + close_count = instructions.count(close_marker) + if open_count == 0 or close_count == 0: + raise SentinelParseError( + f"sentinels for {section_name!r} not found in instructions " + f"(open={open_count}, close={close_count})" + ) + if open_count > 1 or close_count > 1: + raise SentinelParseError( + f"sentinels for {section_name!r} appear multiple times " + f"(open={open_count}, close={close_count})" + ) + start = instructions.find(open_marker) + len(open_marker) + end = instructions.find(close_marker) + if end < start: + raise SentinelParseError( + f"closing sentinel for {section_name!r} precedes opening sentinel" + ) + return instructions[start:end] + + +class PromptPassthroughSignature(dspy.Signature): + """Carrier for the candidate section text via signature.instructions. + + The input/output fields are placeholders; the real evaluation happens + behaviorally via closed-loop, routed by the metric's behavioral branch. 
+ """ + + task: str = dspy.InputField(desc="Placeholder; real evaluation is behavioral") + response: str = dspy.OutputField(desc="Placeholder") + + +class PromptModule(dspy.Module): + """DSPy module hosting a prompt-section candidate as predictor instructions.""" + + def __init__(self, section_name: str, candidate_text: str): + super().__init__() + self.section_name = section_name + self.passthrough = dspy.ChainOfThought(PromptPassthroughSignature) + self.passthrough.predict.signature = ( + self.passthrough.predict.signature.with_instructions( + _render_instructions(section_name, candidate_text) + ) + ) + + def forward( + self, + task: str, + closed_loop_task_id: Optional[str] = None, + ) -> dspy.Prediction: + # Always route behaviorally — there is no cheap predictor score for + # a prompt section. The metric reads these via getattr. + return dspy.Prediction( + response="", + _closed_loop_task_id=closed_loop_task_id, + _candidate_text=self.section_text, + ) + + @property + def section_text(self) -> str: + """Extract the current candidate text from the predictor's instructions.""" + instructions = self.passthrough.predict.signature.instructions + return _extract_from_sentinels(instructions, self.section_name) diff --git a/tests/prompts/test_prompt_module.py b/tests/prompts/test_prompt_module.py new file mode 100644 index 00000000..b369f003 --- /dev/null +++ b/tests/prompts/test_prompt_module.py @@ -0,0 +1,44 @@ +"""PromptModule — DSPy wrapper exposing the candidate section as predictor instructions.""" +from __future__ import annotations + +from evolution.prompts.prompt_module import PromptModule + + +def test_stores_candidate_in_predictor_instructions(): + module = PromptModule( + section_name="MEMORY_GUIDANCE", + candidate_text="evolved candidate body", + ) + instructions = module.passthrough.predict.signature.instructions + assert "evolved candidate body" in instructions + assert "MEMORY_GUIDANCE" in instructions + + +def 
test_section_text_extracts_current_candidate(): + module = PromptModule(section_name="MEMORY_GUIDANCE", candidate_text="v1") + assert module.section_text == "v1" + # Simulate a GEPA mutation of the instructions. + new_instructions = module.passthrough.predict.signature.instructions.replace( + "v1", "v2-mutated" + ) + module.passthrough.predict.signature = ( + module.passthrough.predict.signature.with_instructions(new_instructions) + ) + assert module.section_text == "v2-mutated" + + +def test_forward_routes_behavioral(): + """forward always returns the candidate + task id for behavioral scoring — + there's no cheap predictor score for a prompt section.""" + module = PromptModule(section_name="MEMORY_GUIDANCE", candidate_text="evolved body") + pred = module.forward(task="anything", closed_loop_task_id="task-001") + assert pred._candidate_text == "evolved body" + assert pred._closed_loop_task_id == "task-001" + + +def test_named_predictors_exposes_target(): + """GEPA discovers mutation targets via named_predictors(); the passthrough + predictor must be visible there.""" + module = PromptModule(section_name="MEMORY_GUIDANCE", candidate_text="x") + names = [name for name, _ in module.named_predictors()] + assert any("passthrough" in n for n in names) From 625f979bef68518d0ba42522dc0a97d98b5db0e7 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 31 May 2026 20:39:02 -0600 Subject: [PATCH 11/23] feat(prompts): GEPA fitness metric + memoizing splice scorer --- evolution/prompts/prompt_judge.py | 84 +++++++++++++++++++++++++++++- tests/prompts/test_prompt_judge.py | 71 +++++++++++++++++++++++++ 2 files changed, 154 insertions(+), 1 deletion(-) diff --git a/evolution/prompts/prompt_judge.py b/evolution/prompts/prompt_judge.py index 6e693867..b4e705a8 100644 --- a/evolution/prompts/prompt_judge.py +++ b/evolution/prompts/prompt_judge.py @@ -7,7 +7,7 @@ from __future__ import annotations import logging -from typing import Any +from typing import Any, Callable, Optional 
import dspy @@ -105,3 +105,85 @@ def judge_save_calls( )) scores.extend([0.0] * unjudged_count) return sum(scores) / len(scores) + + +def make_prompt_fitness_metric( + *, + baseline_text: str, + max_growth: float, + closed_loop_scorer: Optional[Callable[[str, str], float]] = None, +) -> Callable: + """Build the GEPA-shaped 5-arg fitness metric for a prompt section. + + All prompt-section eval is behavioral (a real Hermes subprocess), so + every prediction must carry ``_closed_loop_task_id`` (set by the + dataset builder) and ``_candidate_text`` (set by ``PromptModule``). + Predictions missing the task id are degenerate — they score 0 with a + diagnostic so the misconfiguration is visible in GEPA feedback rather + than silently scoring well. + + ``closed_loop_scorer(task_id, candidate_text) -> float`` runs one + closed-loop trial and returns its [0, 1] score. ``None`` disables + behavioral scoring (predictions score 0) — useful for dry-run wiring + tests that don't want to spawn agents. + """ + baseline_len = len(baseline_text or "") + target_len = int(baseline_len * (1 + max_growth)) if baseline_len else 0 + + def metric(gold, pred, trace=None, pred_name=None, pred_trace=None): + task_id = getattr(pred, "_closed_loop_task_id", None) + if task_id is None: + return dspy.Prediction( + score=0.0, + feedback=( + "No closed_loop_task_id on prediction — prompt-section eval " + "requires behavioral routing. Check that the dataset builder " + "set the closed_loop_task_id input field." 
+ ), + ) + candidate_text = getattr(pred, "_candidate_text", "") or "" + score = 0.0 + if closed_loop_scorer is not None: + score = closed_loop_scorer(task_id, candidate_text) + + feedback = "" + if baseline_len: + feedback = ( + f"[BUDGET] candidate={len(candidate_text)} chars, " + f"baseline={baseline_len} chars, ceiling={target_len} chars" + ) + return dspy.Prediction(score=score, feedback=feedback) + + return metric + + +_UNSET = object() + + +def make_memoizing_splice_scorer( + *, + install_fn: Callable[[str], None], + score_fn: Callable[[str], float], +) -> Callable[[str, str], float]: + """Build ``closed_loop_scorer(task_id, candidate_text) -> float`` that + splices a candidate only when it changes. + + GEPA evaluates a candidate across many tasks in a row. Splice-and-restore + is expensive, so this scorer calls ``install_fn(candidate_text)`` only when + ``candidate_text`` differs from the currently-installed value; consecutive + tasks for the same candidate reuse the live splice. ``score_fn(task_id)`` + runs the task through the agent with whatever candidate is installed. + + Backup/restore of the mutated source is the caller's responsibility — wrap + the whole GEPA run, not each call (the per-run guard mirrors + ``ClosedLoopValidator``'s splice-once-per-phase shape). 
+ """ + state: dict[str, Any] = {"installed": _UNSET} + + def scorer(task_id: str, candidate_text: str) -> float: + if state["installed"] != candidate_text: + install_fn(candidate_text) + state["installed"] = candidate_text + return score_fn(task_id) + + return scorer diff --git a/tests/prompts/test_prompt_judge.py b/tests/prompts/test_prompt_judge.py index 49ad13aa..c0e620a7 100644 --- a/tests/prompts/test_prompt_judge.py +++ b/tests/prompts/test_prompt_judge.py @@ -59,3 +59,74 @@ def test_none_judge_or_expected_is_vacuous_pass(): fake = MagicMock(spec=SaveCallJudge) assert judge_save_calls(judge=fake, calls=calls, expected_content=None) == 1.0 fake.score.assert_not_called() + + +# ---- make_prompt_fitness_metric ---- + +from evolution.prompts.prompt_judge import ( + make_memoizing_splice_scorer, + make_prompt_fitness_metric, +) + + +def _behavioral_pred(task_id="task-001", candidate="evolved body"): + pred = type("Pred", (), {})() + pred._closed_loop_task_id = task_id + pred._candidate_text = candidate + return pred + + +def test_metric_routes_behavioral_through_scorer(): + seen = [] + + def fake_scorer(task_id, candidate_text): + seen.append((task_id, candidate_text)) + return 0.85 + + metric = make_prompt_fitness_metric( + baseline_text="baseline", max_growth=0.2, closed_loop_scorer=fake_scorer, + ) + result = metric(gold=object(), pred=_behavioral_pred()) + assert result.score == 0.85 + assert seen == [("task-001", "evolved body")] + assert "BUDGET" in result.feedback # length feedback present + + +def test_metric_without_task_id_scores_zero(): + metric = make_prompt_fitness_metric( + baseline_text="b", max_growth=0.2, closed_loop_scorer=lambda *_: 1.0, + ) + pred = type("Pred", (), {})() # no _closed_loop_task_id + result = metric(gold=object(), pred=pred) + assert result.score == 0.0 + assert "behavioral" in result.feedback.lower() + + +def test_metric_without_scorer_scores_zero(): + metric = make_prompt_fitness_metric( + baseline_text="b", max_growth=0.2, 
closed_loop_scorer=None, + ) + result = metric(gold=object(), pred=_behavioral_pred()) + assert result.score == 0.0 + + +# ---- make_memoizing_splice_scorer ---- + +def test_memoizing_scorer_splices_only_on_candidate_change(): + installs: list[str] = [] + scores = {"task-a": 0.7, "task-b": 0.9} + + scorer = make_memoizing_splice_scorer( + install_fn=lambda text: installs.append(text), + score_fn=lambda task_id: scores[task_id], + ) + # Same candidate across two tasks → one install. + assert scorer("task-a", "cand-1") == 0.7 + assert scorer("task-b", "cand-1") == 0.9 + assert installs == ["cand-1"] + # New candidate → re-splice. + assert scorer("task-a", "cand-2") == 0.7 + assert installs == ["cand-1", "cand-2"] + # Back to a prior candidate is NOT cached across changes → re-splice. + assert scorer("task-a", "cand-1") == 0.7 + assert installs == ["cand-1", "cand-2", "cand-1"] From 725908ba4a5d69eeec7605d910919edfb7d22f65 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Sun, 31 May 2026 20:41:13 -0600 Subject: [PATCH 12/23] feat(prompts): memory_guidance dataset builder + curated eval suite --- evolution/core/dataset_builder.py | 99 +++++++++++++++++++ .../validation/suites/memory_guidance.jsonl | 23 +++++ tests/core/test_dataset_builder.py | 42 ++++++++ 3 files changed, 164 insertions(+) create mode 100644 evolution/validation/suites/memory_guidance.jsonl diff --git a/evolution/core/dataset_builder.py b/evolution/core/dataset_builder.py index 239e149c..28ac5f69 100644 --- a/evolution/core/dataset_builder.py +++ b/evolution/core/dataset_builder.py @@ -485,3 +485,102 @@ def load(path: Path, seed: int = 42) -> EvalDataset: val_ratio=0.25, holdout_ratio=0.25, ) + + +MEMORY_GUIDANCE_CATEGORIES = ( + "save-preference", + "save-correction", + "dont-save-task-progress", + "dont-save-completed-work-log", + "declarative-vs-imperative", +) + +_MEMORY_GUIDANCE_CATEGORY_PROMPTS = { + "save-preference": ( + "Generate ONE closed-loop eval task (category: save-preference) where 
the " + "user explicitly states a durable preference the agent SHOULD save to " + "memory. Output a single JSON object with fields: user_message, " + "expected_tools=[\"memory\"], expected_save_content (a rubric describing " + "what a good save would look like — not exact text)." + ), + "save-correction": ( + "Generate ONE closed-loop eval task (category: save-correction) where the " + "user corrects the agent on a recurring pattern (e.g. 'no, I use uv not " + "pip'). The agent SHOULD save the correction. Output a single JSON object " + "with fields: user_message, expected_tools=[\"memory\"], " + "expected_save_content." + ), + "dont-save-task-progress": ( + "Generate ONE closed-loop eval task (category: dont-save-task-progress) " + "where the user asks the agent to complete a task (write code, fix a bug). " + "The agent SHOULD NOT save task progress to memory. Output a single JSON " + "object with fields: user_message, expected_tools=[], " + "forbidden_tools=[\"memory\"]." + ), + "dont-save-completed-work-log": ( + "Generate ONE closed-loop eval task (category: dont-save-completed-work-log) " + "where the user asks for a summary of work done. The agent SHOULD NOT log " + "the work to memory. Output a single JSON object with fields: user_message, " + "expected_tools=[], forbidden_tools=[\"memory\"]." + ), + "declarative-vs-imperative": ( + "Generate ONE closed-loop eval task (category: declarative-vs-imperative) " + "where the user states a preference in imperative form ('always respond " + "concisely'). The agent SHOULD save it in declarative form ('user prefers " + "concise responses'). Output a single JSON object with fields: " + "user_message, expected_tools=[\"memory\"], expected_save_content " + "(specifying the declarative-phrasing rubric)." + ), +} + + +def build_memory_guidance_dataset( + *, + lm_call, + n_per_category: int = 10, +) -> list[dict]: + """Generate synthetic MEMORY_GUIDANCE eval tasks across the 5 categories. 
+ + ``lm_call`` is a callable taking a prompt string and returning a JSON + object (one task) as text. The builder issues ``n_per_category`` calls + per category and stamps a unique ``task_id`` on each parsed row so the + output is a valid closed-loop suite regardless of what the LM emits for + the id. Rows the LM returns that don't parse as a JSON object are + skipped (logged), not fatal — a single noisy generation shouldn't abort + the whole build. + + Returns a flat list of Task-shaped dicts ready to write to a JSONL suite + (consumable by ``TaskSuite.from_jsonl``). + """ + out: list[dict] = [] + for category in MEMORY_GUIDANCE_CATEGORIES: + prompt = _MEMORY_GUIDANCE_CATEGORY_PROMPTS[category] + for index in range(n_per_category): + raw = lm_call(prompt) + row = _parse_memory_task_row(raw) + if row is None: + logger.warning( + "build_memory_guidance_dataset: unparseable row for " + "category %s index %d", category, index, + ) + continue + row["task_id"] = f"{category}-{index:03d}" + row.setdefault("expected_tools", []) + out.append(row) + return out + + +def _parse_memory_task_row(raw: str): + """Parse a single JSON object from an LM response. Returns the dict, or + None if the text isn't a JSON object (tolerant of fenced/extra prose).""" + try: + obj = json.loads(raw) + except (json.JSONDecodeError, TypeError): + match = re.search(r"\{.*\}", str(raw), re.DOTALL) + if not match: + return None + try: + obj = json.loads(match.group()) + except json.JSONDecodeError: + return None + return obj if isinstance(obj, dict) else None diff --git a/evolution/validation/suites/memory_guidance.jsonl b/evolution/validation/suites/memory_guidance.jsonl new file mode 100644 index 00000000..a91999c0 --- /dev/null +++ b/evolution/validation/suites/memory_guidance.jsonl @@ -0,0 +1,23 @@ +# Closed-loop validation suite for the Hermes MEMORY_GUIDANCE prompt section. 
+# Compound verdict: Layer 1 = memory(save) trigger membership; Layer 2 = +# content judge against expected_save_content rubric (save categories only). +# Curated golden set across five categories. Full synthetic expansion is +# produced by evolution.core.dataset_builder.build_memory_guidance_dataset. +# +# --- save-preference: durable preference stated → SHOULD save --- +{"task_id": "save-pref-uv", "user_message": "Heads up for going forward: I always use uv for Python dependency management, never pip or poetry.", "expected_tools": ["memory"], "expected_save_content": "A durable, declarative fact that the user prefers/uses uv for Python dependency management. Should not be phrased as a one-off task or include transient detail."} +{"task_id": "save-pref-tabs", "user_message": "Just so you know for all my projects: I use tabs, not spaces, for indentation.", "expected_tools": ["memory"], "expected_save_content": "A durable fact that the user prefers tabs over spaces for indentation across projects. Declarative, not imperative."} +{"task_id": "save-pref-timezone", "user_message": "For scheduling things, remember I'm based in US Pacific time.", "expected_tools": ["memory"], "expected_save_content": "A durable fact recording the user's timezone (US Pacific). Stable, reusable across sessions."} +# --- save-correction: user corrects a recurring agent pattern → SHOULD save --- +{"task_id": "save-corr-pkgmgr", "user_message": "No — stop suggesting npm. I use pnpm for every JS project. Please remember that.", "expected_tools": ["memory"], "expected_save_content": "A durable fact that the user uses pnpm (not npm) for JS projects, capturing the correction so the agent stops re-suggesting npm."} +{"task_id": "save-corr-name", "user_message": "You keep calling the service 'auth-svc'. Its real name is 'identity-gateway'. 
Remember that for next time.", "expected_tools": ["memory"], "expected_save_content": "A durable fact mapping the service's correct name (identity-gateway), capturing the user's correction."} +# --- dont-save-task-progress: complete a task → SHOULD NOT save --- +{"task_id": "nosave-fix-bug", "user_message": "There's an off-by-one in {fixture_dir}/loop.py — the range should be inclusive of the last index. Fix it.", "expected_tools": [], "forbidden_tools": ["memory"], "fixture_setup": {"loop.py": "def last_items(xs):\n return xs[0:len(xs)-1]\n"}} +{"task_id": "nosave-write-fn", "user_message": "Add a function `is_even(n)` to {fixture_dir}/util.py that returns True for even integers.", "expected_tools": [], "forbidden_tools": ["memory"], "fixture_setup": {"util.py": "# helpers\n"}} +# --- dont-save-completed-work-log: summary of work done → SHOULD NOT save --- +{"task_id": "nosave-summary", "user_message": "Give me a quick summary of what you changed in the last edit.", "expected_tools": [], "forbidden_tools": ["memory"]} +{"task_id": "nosave-standup", "user_message": "Write a one-line standup update describing the bug we just fixed.", "expected_tools": [], "forbidden_tools": ["memory"]} +# --- declarative-vs-imperative: imperative preference → SHOULD save, declaratively --- +{"task_id": "decl-concise", "user_message": "Always answer me concisely — no preamble, just the answer.", "expected_tools": ["memory"], "expected_save_content": "The preference saved in DECLARATIVE form (e.g. 'user prefers concise answers without preamble'), not copied verbatim as an imperative directive ('always answer concisely')."} +{"task_id": "decl-no-emoji", "user_message": "Never use emoji in your responses to me.", "expected_tools": ["memory"], "expected_save_content": "A declarative fact that the user prefers responses without emoji (e.g. 
'user prefers no emoji in responses'), not stored as a raw imperative command."} +{"task_id": "decl-tests-first", "user_message": "From now on, write tests before implementation when you work on my code.", "expected_tools": ["memory"], "expected_save_content": "A declarative fact capturing the user's preference for test-first development (e.g. 'user prefers tests written before implementation'), phrased as a durable preference rather than an imperative."} diff --git a/tests/core/test_dataset_builder.py b/tests/core/test_dataset_builder.py index b2e9f67a..57273920 100644 --- a/tests/core/test_dataset_builder.py +++ b/tests/core/test_dataset_builder.py @@ -171,3 +171,45 @@ def test_lm_constructed_with_bumped_max_tokens(self): f"max_tokens regressed from 16000 to {kwargs['max_tokens']}; " "JSON truncation will reappear at eval_dataset_size>=60" ) + + +class TestBuildMemoryGuidanceDataset: + def test_uses_all_five_categories(self): + from evolution.core.dataset_builder import build_memory_guidance_dataset + + fake_lm = MagicMock() + fake_lm.return_value = ( + '{"task_id": "raw", "user_message": "x", ' + '"expected_tools": ["memory"], ' + '"expected_save_content": "preference rubric"}' + ) + examples = build_memory_guidance_dataset(lm_call=fake_lm, n_per_category=2) + # 5 categories × 2 tasks each + assert len(examples) == 10 + categories = [ + "save-preference", + "save-correction", + "dont-save-task-progress", + "dont-save-completed-work-log", + "declarative-vs-imperative", + ] + invoked_prompts = [c.args[0] for c in fake_lm.call_args_list] + for cat in categories: + assert any(cat in p for p in invoked_prompts), f"category {cat!r} not prompted" + + def test_stamps_unique_task_ids(self): + from evolution.core.dataset_builder import build_memory_guidance_dataset + + fake_lm = MagicMock(return_value=( + '{"user_message": "x", "expected_tools": ["memory"]}' + )) + examples = build_memory_guidance_dataset(lm_call=fake_lm, n_per_category=2) + ids = [e["task_id"] for e in 
examples] + assert len(ids) == len(set(ids)), "task_ids must be unique" + + def test_skips_unparseable_rows(self): + from evolution.core.dataset_builder import build_memory_guidance_dataset + + fake_lm = MagicMock(return_value="not json at all") + examples = build_memory_guidance_dataset(lm_call=fake_lm, n_per_category=1) + assert examples == [] From d17ead826cd2001e637bf563c5c5f7097ba20fa4 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Mon, 1 Jun 2026 08:02:15 -0600 Subject: [PATCH 13/23] feat(prompts): PromptSectionProposer (sentinel-preserving GEPA proposal fn) --- evolution/prompts/prompt_proposer.py | 159 ++++++++++++++++++++++++++ tests/prompts/test_prompt_proposer.py | 82 +++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 evolution/prompts/prompt_proposer.py create mode 100644 tests/prompts/test_prompt_proposer.py diff --git a/evolution/prompts/prompt_proposer.py b/evolution/prompts/prompt_proposer.py new file mode 100644 index 00000000..b2d00d7d --- /dev/null +++ b/evolution/prompts/prompt_proposer.py @@ -0,0 +1,159 @@ +"""GEPA instruction_proposer for prompt-section evolution. + +Mirrors ``BudgetAwareToolProposer``: subclasses ``BudgetAwareProposer`` for +the budget-tracking infrastructure but installs a prompt-section reflection +template whose hard constraint is sentinel preservation. ``__call__`` runs the +inherited proposer LM, then passes the candidate through ``extract_and_rebuild`` +so only the sentinel-delimited region survives. + +On ``SentinelParseError`` the call re-raises (after incrementing +``sentinel_failures``) rather than returning the parent unchanged — GEPA's +reflective_mutation path skips the iteration, avoiding a phantom +identical-to-parent candidate that would pollute the selection pool. 
+""" + +from __future__ import annotations + +import logging +from typing import Any, Mapping, Sequence + +import dspy + +from evolution.prompts.prompt_module import ( + SentinelParseError, + _extract_from_sentinels, + _render_instructions, +) +from evolution.skills.budget_aware_proposer import BudgetAwareProposer + +logger = logging.getLogger(__name__) + + +_PROMPT_PROPOSER_TEMPLATE = """\ +You are revising one section ({section_name}) of an agent's system prompt. +The instruction below wraps the current candidate text between the markers +`` and ``. + +Hard constraint - sentinel preservation: +Modify only the text between those two markers. Do not change the markers +themselves, and do not add any text outside them. + +Length budget: at most {target_chars} characters for the section body (between +the markers). The current body is {baseline_chars} characters. + +Hard constraint - grounding citation: +Every change must quote or paraphrase a specific phrase from the feedback. If a +failure is not actionable from the section text (model error, judge +disagreement, out-of-distribution input), skip it. + +Your task: rewrite the current section to fix the failures shown below, +modifying only the sentinel-delimited region for {section_name}. + +Steps: +1. Read each failure in the feedback. Classify it as (a) the agent misapplied + existing guidance -> refine the wording, (b) the agent lacked guidance it + needed -> add it, or (c) not actionable from the section text -> skip. +2. Apply changes only for (a) and (b), only inside the sentinel region. +3. For each change, name the specific feedback phrase that grounded it. +4. Match the voice and density of the existing section. +5. If more additions are warranted than fit within {target_chars}, address the + most-grounded failures first; GEPA will run again with the updated baseline. + +If the feedback below is empty or contains no concrete failures, return the +current instruction unchanged. 
def extract_and_rebuild(candidate: str, section_name: str) -> str:
    """Re-render full instructions around the sentinel region of ``candidate``.

    The body between the sentinels for ``section_name`` is extracted and
    handed to ``_render_instructions``; the rendered result is returned.

    Pure function — no LM involved, so it is testable without mocks. A
    ``SentinelParseError`` raised by the extraction step (the candidate did
    not preserve the sentinels) propagates to the caller.
    """
    return _render_instructions(
        section_name,
        _extract_from_sentinels(candidate, section_name),
    )


class _PromptProposalSignature(dspy.Signature):
    """Placeholder; overwritten per-instance via with_instructions so the
    section-specific template (section_name, target_chars, baseline_chars
    baked in) is installed."""

    # NOTE: the docstring above and the ``desc`` strings below are runtime
    # text handed to DSPy, not plain documentation — keep them verbatim.
    current_instruction: str = dspy.InputField(
        desc="The current instruction with the sentinel-wrapped section body"
    )
    examples_with_feedback: str = dspy.InputField(
        desc="Failure feedback from the eval to ground refinements in"
    )
    improved_instruction: str = dspy.OutputField(
        desc="The revised instruction with only the sentinel region modified"
    )


class PromptSectionProposer(BudgetAwareProposer):
    """GEPA-compatible proposal function for one prompt section.

    Only the ``passthrough.predict`` component is handled. Every proposal is
    passed through ``extract_and_rebuild``; a proposal that lost its sentinels
    is counted in ``sentinel_failures`` and the ``SentinelParseError`` is
    re-raised instead of returning the parent unchanged.
    """

    component_name = "passthrough.predict"

    def __init__(
        self,
        section_name: str,
        baseline_chars: int,
        max_growth: float = 0.2,
        safety_margin: float = 0.10,
    ) -> None:
        # The parent owns the budget bookkeeping; ``self.target_chars`` is
        # read from it below, so this call has to come first.
        super().__init__(
            baseline_chars=baseline_chars,
            max_growth=max_growth,
            safety_margin=safety_margin,
        )
        self.section_name = section_name
        self.sentinel_failures = 0

        # Bake the section name and both character counts into the template.
        instructions = _PROMPT_PROPOSER_TEMPLATE.format(
            section_name=section_name,
            target_chars=self.target_chars,
            baseline_chars=baseline_chars,
        )
        signature = _PromptProposalSignature.with_instructions(instructions)
        self.propose = dspy.Predict(signature)

    def __call__(
        self,
        candidate: dict[str, str],
        reflective_dataset: Mapping[str, Sequence[Mapping[str, Any]]],
        components_to_update: list[str],
    ) -> dict[str, str]:
        """Propose a revised instruction for ``component_name``.

        Returns ``{}`` when the component was not requested or is missing
        from ``candidate``; otherwise a one-entry dict holding the rebuilt
        instruction text.

        Raises:
            SentinelParseError: the proposal did not keep the sentinels.
        """
        key = self.component_name
        if key not in components_to_update or key not in candidate:
            return {}

        examples = reflective_dataset.get(key, [])
        prediction = self.propose(
            current_instruction=candidate[key],
            examples_with_feedback=self._format_examples(examples),
        )
        proposed = prediction.improved_instruction

        try:
            rebuilt = extract_and_rebuild(proposed, self.section_name)
        except SentinelParseError as exc:
            self.sentinel_failures += 1
            # Cap the logged excerpt so one bad generation cannot flood logs.
            suffix = "..." if len(proposed) > 200 else ""
            excerpt = proposed[:200] + suffix
            logger.warning(
                "PromptSectionProposer: sentinel parse failure (#%d) for %r: %s. "
                "Candidate excerpt: %r",
                self.sentinel_failures, self.section_name, exc, excerpt,
            )
            raise

        return {key: rebuilt}
+ assert _open_sentinel(SECTION) in rebuilt + assert _close_sentinel(SECTION) in rebuilt + assert "a refined body" in rebuilt + + +def test_proposer_only_acts_on_its_component(): + proposer = PromptSectionProposer( + section_name=SECTION, baseline_chars=100, + ) + # A request that doesn't include our component returns empty. + out = proposer( + candidate={"passthrough.predict": _wrapped("x")}, + reflective_dataset={}, + components_to_update=["something.else"], + ) + assert out == {} + + +def test_proposer_rebuilds_sentinel_region(monkeypatch): + proposer = PromptSectionProposer(section_name=SECTION, baseline_chars=100) + + # Stub the LM proposal: return a full-instructions string with the + # sentinel region edited. + fake_pred = MagicMock() + fake_pred.improved_instruction = _wrapped("LM-revised memory guidance") + proposer.propose = MagicMock(return_value=fake_pred) + + out = proposer( + candidate={"passthrough.predict": _wrapped("original")}, + reflective_dataset={"passthrough.predict": [{"Feedback": "be clearer"}]}, + components_to_update=["passthrough.predict"], + ) + assert "passthrough.predict" in out + assert "LM-revised memory guidance" in out["passthrough.predict"] + assert _open_sentinel(SECTION) in out["passthrough.predict"] + + +def test_proposer_raises_on_sentinel_loss(): + from evolution.prompts.prompt_module import SentinelParseError + + proposer = PromptSectionProposer(section_name=SECTION, baseline_chars=100) + fake_pred = MagicMock() + fake_pred.improved_instruction = "the model dropped the sentinels entirely" + proposer.propose = MagicMock(return_value=fake_pred) + + import pytest + with pytest.raises(SentinelParseError): + proposer( + candidate={"passthrough.predict": _wrapped("original")}, + reflective_dataset={"passthrough.predict": [{"Feedback": "x"}]}, + components_to_update=["passthrough.predict"], + ) + assert proposer.sentinel_failures == 1 From 6b13272af13f5e8d995e731cda1042c701594c94 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: 
Mon, 1 Jun 2026 08:12:22 -0600 Subject: [PATCH 14/23] =?UTF-8?q?feat(prompts):=20evolve=5Fprompt=5Fsectio?= =?UTF-8?q?n=20CLI=20=E2=80=94=20GEPA=20+=20saturation=20+=20budget=20+=20?= =?UTF-8?q?closed-loop=20deploy=20gate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires HermesPromptSectionInstaller + HermesAgentRunner + ClosedLoopValidator into a full-parity evolution pipeline for prompt sections. GEPA mutates via PromptSectionProposer; the inner loop scores through a serialized memoizing splice scorer; the deploy gate runs baseline-vs-evolved closed-loop on the holdout suite. Saturation pre-flight default-denies a saturated baseline; budget cap aborts on overrun. ClosedLoopValidator's Layer 2 hook becomes a per-task judge factory so the content judge can read each task's expected_save_content rubric. The memoizing scorer serializes splice+run under a lock — dspy.Evaluate is multi-threaded but prompt_builder.py is a single shared file. PR automation is deferred for prompt sections (copying a full evolved file over origin/base would pollute the diff with the local override-hook commit). --- evolution/prompts/evolve_prompt_section.py | 632 ++++++++++++++++++++ evolution/prompts/prompt_judge.py | 19 +- evolution/validation/validator.py | 21 +- tests/prompts/test_evolve_prompt_section.py | 110 ++++ tests/validation/test_validator.py | 16 +- 5 files changed, 784 insertions(+), 14 deletions(-) create mode 100644 evolution/prompts/evolve_prompt_section.py create mode 100644 tests/prompts/test_evolve_prompt_section.py diff --git a/evolution/prompts/evolve_prompt_section.py b/evolution/prompts/evolve_prompt_section.py new file mode 100644 index 00000000..288b8737 --- /dev/null +++ b/evolution/prompts/evolve_prompt_section.py @@ -0,0 +1,632 @@ +"""Evolve a named system-prompt section in Hermes ``prompt_builder.py`` via DSPy + GEPA. 
+ +Mirrors ``evolution.tools.evolve_tool`` but for prompt sections, with the +splice-and-restore integration model (see ``HermesPromptSectionInstaller``). +The whole evaluation is behavioral: every candidate is spliced into the live +``prompt_builder.py`` and scored by a real ``hermes -z`` subprocess, so the +deploy gate is a ``ClosedLoopValidator`` run rather than a synthetic-judge +holdout. + +Usage: + python -m evolution.prompts.evolve_prompt_section \\ + --section MEMORY_GUIDANCE \\ + --hermes-repo ~/src/NousResearch/hermes-agent \\ + --tasks evolution/validation/suites/memory_guidance.jsonl \\ + --iterations 10 +""" + +from __future__ import annotations + +import fcntl +import json +import logging +import random +import sys +import tempfile +import threading +import time +from contextlib import contextmanager +from datetime import datetime +from pathlib import Path +from typing import Any, Iterator, Optional + +import click +import dspy +from rich.console import Console + +from evolution.core.config import EvolutionConfig +from evolution.core.hermes_provider import instantiate_lm, resolve_default_lm +from evolution.core.lm_timing_callback import ( + COST_LEDGER, + CostCeilingExceeded, + register_litellm_cost_callback, + register_litellm_failure_callback, +) +from evolution.core.pr_automation import disabled_pr_block +from evolution.core.quality_gate import write_gate_decision +from evolution.core.run_inputs import build_run_inputs +from evolution.core.saturation_check import ( + is_non_interactive, + interactive_confirm, + render_saturation_panel, + saturation_preflight, +) +from evolution.prompts.hermes_prompt_source import HermesPromptSource +from evolution.prompts.prompt_judge import ( + SaveCallJudge, + judge_save_calls, + make_memoizing_splice_scorer, + make_prompt_fitness_metric, +) +from evolution.prompts.prompt_module import PromptModule, _extract_from_sentinels +from evolution.prompts.prompt_proposer import PromptSectionProposer +from 
evolution.validation.agent_runner import TaskRunContext +from evolution.validation.artifact_installer import ( + HermesPromptSectionInstaller, + atomic_write_bytes, +) +from evolution.validation.hermes_runner import ( + DEFAULT_TASK_TIMEOUT_SECONDS, + HermesAgentRunner, +) +from evolution.validation.report import score_task +from evolution.validation.task import Task, TaskSuite +from evolution.validation.validator import ClosedLoopValidator, ValidationInputs + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + datefmt="%Y/%m/%d %H:%M:%S", +) +logger = logging.getLogger(__name__) +console = Console() + +_GATE_SCHEMA_VERSION = "5" +_BACKUP_SUFFIX = ".cl_backup" +_LOCK_FILENAME = ".cl_validation.lock" + + +def _split_train_holdout( + tasks: tuple[Task, ...], *, holdout_ratio: float, seed: int +) -> tuple[list[Task], list[Task]]: + """Deterministic train/holdout split, stratified only by shuffle+seed. + + Guarantees at least one task on each side when there are >= 2 tasks so + GEPA has something to train on and the deploy gate has something to + evaluate. + """ + ordered = list(tasks) + random.Random(seed).shuffle(ordered) + n_holdout = max(1, int(round(len(ordered) * holdout_ratio))) + n_holdout = min(n_holdout, len(ordered) - 1) if len(ordered) > 1 else len(ordered) + holdout = ordered[:n_holdout] + train = ordered[n_holdout:] + return train, holdout + + +def _behavioral_examples(tasks: list[Task]) -> list[dspy.Example]: + """Build GEPA examples whose inputs drive ``PromptModule.forward`` into the + behavioral branch (task message + closed_loop_task_id).""" + return [ + dspy.Example( + task=t.user_message, + closed_loop_task_id=t.task_id, + ).with_inputs("task", "closed_loop_task_id") + for t in tasks + ] + + +def _make_layer2_factory(judge: Optional[SaveCallJudge]): + """Per-task Layer 2 scorer: binds the task's rubric + message into a + ``score_task``-shaped ``Callable[[list[dict]], float]``. 
def _make_layer2_factory(judge: Optional[SaveCallJudge]):
    """Build the per-task Layer 2 judge factory.

    The returned ``factory(task)`` yields ``None`` when the task has no
    ``expected_save_content`` rubric (nothing to judge). Otherwise it yields
    a ``Callable[[list[dict]], float]`` that forwards the memory calls, the
    task's rubric and its user message to ``judge_save_calls``.
    """

    def factory(task: Task):
        if task.expected_save_content is None:
            return None

        def judge_fn(memory_calls: list[dict]) -> float:
            return judge_save_calls(
                judge=judge,
                calls=memory_calls,
                expected_content=task.expected_save_content,
                task_text=task.user_message,
            )

        return judge_fn

    return factory


def _section_text_from_candidate(candidate: Any, section_name: str) -> str:
    """Return the sentinel-delimited section body of a GEPA candidate.

    ``candidate`` is either a component dict (instructions under
    ``"passthrough.predict"``) or a module whose
    ``passthrough.predict.signature.instructions`` holds them. Missing or
    empty instructions are passed to the extractor as ``""``.
    """
    if isinstance(candidate, dict):
        instructions = candidate.get("passthrough.predict", "")
    else:
        instructions = candidate.passthrough.predict.signature.instructions or ""
    return _extract_from_sentinels(instructions, section_name)


@contextmanager
def _prompt_builder_guard(target_path: Path) -> Iterator[None]:
    """Hold the closed-loop flock and byte-restore ``target_path`` on exit.

    The GEPA inner loop splices candidates directly into the live file; this
    guard backs the file up, yields, then restores the original bytes and
    removes the backup. It is sequenced before the deploy-gate
    ``ClosedLoopValidator``, which acquires the same lock itself — never
    nested.

    The lock is taken *before* looking for a leftover backup. While another
    run holds the lock its backup legitimately exists, so checking first
    would misreport a live run as a stale backup and tell the user to
    restore by hand mid-run. Once the lock is ours, an existing backup can
    only be left over from a run that did not clean up.

    Raises:
        RuntimeError: another run holds the lock, or a stale backup exists.
    """
    backup_path = target_path.with_suffix(target_path.suffix + _BACKUP_SUFFIX)
    lock_path = target_path.parent / _LOCK_FILENAME
    with open(lock_path, "w") as lock_fd:
        try:
            fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError as exc:
            raise RuntimeError(
                f"Another harness run holds {target_path.parent / _LOCK_FILENAME}. "
                f"Wait for it to finish."
            ) from exc
        try:
            if backup_path.exists():
                raise RuntimeError(
                    f"Stale backup at {backup_path} — a prior run did not clean up. "
                    f"Restore {target_path} from it manually, then retry."
                )
            atomic_write_bytes(backup_path, target_path.read_bytes())
            try:
                yield
            finally:
                # Restore first; the backup is only removed once the restore
                # succeeded, so a failed restore leaves it for recovery.
                atomic_write_bytes(target_path, backup_path.read_bytes())
                backup_path.unlink(missing_ok=True)
        finally:
            fcntl.flock(lock_fd.fileno(), fcntl.LOCK_UN)


def _run_one_task_score(
    task: Task,
    *,
    runner: HermesAgentRunner,
    layer2_factory,
    layer2_threshold: float,
) -> float:
    """Score one task against whatever section is currently spliced in.

    Fixture files are materialised in a throwaway directory, the agent is
    run once, and ``score_task`` decides the outcome. Returns 1.0 on a pass,
    else 0.0. Abstentions also score 0.0 in-loop — the deploy gate handles
    abstentions properly.
    """
    with tempfile.TemporaryDirectory(prefix="ps_inner_") as fixture_tmp:
        fixture_dir = Path(fixture_tmp)
        for relative_path, content in task.fixture_setup.items():
            dest = fixture_dir / relative_path
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_text(content)
        ctx = TaskRunContext(
            user_message=task.render_message(fixture_dir),
            fixture_dir=fixture_dir,
        )
        run = runner.run(ctx)
        passed, abstained = score_task(
            expected_tools=task.expected_tools,
            forbidden_tools=task.forbidden_tools,
            run=run,
            test_command=task.test_command,
            fixture_dir=fixture_dir,
            layer2_judge_fn=layer2_factory(task),
            layer2_threshold=layer2_threshold,
        )
    if abstained:
        return 0.0
    return 1.0 if passed else 0.0


def evolve_prompt_section(
    section_name: str,
    hermes_repo: Path,
    tasks_path: Path,
    *,
    iterations: int = 10,
    holdout_ratio: float = 0.5,
    seed: int = 42,
    max_growth: float = 0.2,
    optimizer_model: Optional[str] = None,
    reflection_model: Optional[str] = None,
    eval_model: Optional[str] = None,
    agent_model: Optional[str] = None,
    layer2_threshold: float = 0.7,
    task_timeout_seconds: int = DEFAULT_TASK_TIMEOUT_SECONDS,
    max_total_cost_usd: Optional[float] = 150.0,
    gepa_minibatch_size: int = 3,
    gepa_acceptance: str = "improvement-or-equal",
    skip_saturation_check: bool = False,
    force_saturation_check: bool = False,
    apply: bool = False,
    create_pr_flag: bool = False,
    dry_run: bool = False,
    output_dir: Optional[Path] = None,
) -> dict[str, Any]:
    """Evolve one prompt section end-to-end and return a summary dict.

    Pipeline: read the baseline section, split the task suite into train and
    holdout, optionally run the saturation pre-flight, run GEPA under
    ``_prompt_builder_guard``, then run the closed-loop deploy gate
    (baseline vs evolved) on the holdout tasks. A gate-decision record is
    written to ``output_dir`` on every exit path.

    Args:
        section_name: Name of the section to evolve.
        hermes_repo: Path to the hermes-agent checkout.
        tasks_path: JSONL task suite.
        max_growth: Length budget as a fraction over baseline. Passed to
            both the fitness metric and the proposer so the budget the LM is
            told about matches the one it is scored against.
        max_total_cost_usd: Cost ceiling; ``None`` disables the printout of
            the ceiling (the value is still handed to the cost ledger).
        apply: On a passing gate, write the evolved section back.
        create_pr_flag: Recorded as skipped (PR automation is deferred).
        dry_run: Exercise module/proposer wiring without LM or agent work.
        output_dir: Where artifacts go; defaults to
            ``output/prompts/<section>/<timestamp>``.

    Returns:
        A dict with at least ``decision`` and ``reason``; a deploy result
        also carries ``evolved_chars`` and ``applied``.
    """
    hermes_repo = Path(hermes_repo).resolve()
    source = HermesPromptSource(hermes_repo)
    baseline_text = source.read(section_name)
    baseline_chars = len(baseline_text)

    suite = TaskSuite.from_jsonl(tasks_path)
    train_tasks, holdout_tasks = _split_train_holdout(
        suite.tasks, holdout_ratio=holdout_ratio, seed=seed
    )

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    if output_dir is None:
        output_dir = Path("output") / "prompts" / section_name / timestamp
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    config = EvolutionConfig(
        iterations=iterations,
        optimizer_model=optimizer_model,
        reflection_model=reflection_model,
        eval_model=eval_model,
        judge_model=eval_model,
        seed=seed,
        reflection_minibatch_size=gepa_minibatch_size,
        gepa_acceptance=gepa_acceptance.replace("-", "_"),
    )

    console.print(
        f"\n[bold cyan]Prompt Section Self-Evolution[/bold cyan] — "
        f"Evolving section: [bold]{section_name}[/bold]\n"
    )
    console.print(f"  Hermes repo: {hermes_repo}")
    console.print(f"  Baseline ({baseline_chars} chars): {baseline_text[:80]}…")
    console.print(
        f"  Tasks: {len(suite.tasks)} ({len(train_tasks)} train / "
        f"{len(holdout_tasks)} holdout), sha256 {suite.sha256[:12]}…"
    )
    console.print(f"  Output dir: {output_dir}")

    run_inputs = build_run_inputs(
        config=config,
        iterations=iterations,
        optimizer_model=optimizer_model,
        quality_gate_preset="default",
        eval_source="closed_loop",
        gepa_acceptance=config.gepa_acceptance,
        create_pr=create_pr_flag,
    )
    section_payload = {
        "artifact_type": "prompt_section",
        "target_section": section_name,
        "baseline_chars": baseline_chars,
    }

    if dry_run:
        console.print("[yellow]Dry run — skipping all LM/agent work.[/yellow]")
        # Exercise the module + proposer wiring without spending money.
        _ = PromptModule(section_name, baseline_text)
        _ = PromptSectionProposer(
            section_name, baseline_chars=baseline_chars, max_growth=max_growth
        )
        decision_payload = {
            "schema_version": _GATE_SCHEMA_VERSION,
            "decision": "dry_run",
            "reason": "dry_run",
            "decision_signal": "closed_loop",
            "run_inputs": run_inputs,
            "pr_created": disabled_pr_block(),
            **section_payload,
        }
        write_gate_decision(output_dir, decision_payload)
        return {"decision": "dry_run", "reason": "dry_run"}

    register_litellm_failure_callback()
    register_litellm_cost_callback()
    COST_LEDGER.reset()
    COST_LEDGER.set_ceiling(max_total_cost_usd)
    if max_total_cost_usd is not None:
        console.print(f"  Cost ceiling: ${max_total_cost_usd:.2f}")

    installer = HermesPromptSectionInstaller(hermes_repo, section_name)
    runner = HermesAgentRunner(
        timeout_seconds=task_timeout_seconds, model=agent_model
    )
    judge = SaveCallJudge(config)
    layer2_factory = _make_layer2_factory(judge)

    tasks_by_id = {t.task_id: t for t in suite.tasks}

    def install_candidate(candidate_text: str) -> None:
        source.write(section_name, candidate_text)

    def score_task_id(task_id: str) -> float:
        return _run_one_task_score(
            tasks_by_id[task_id],
            runner=runner,
            layer2_factory=layer2_factory,
            layer2_threshold=layer2_threshold,
        )

    # One lock serializes splice+run across dspy.Evaluate's thread pool — the
    # spliced prompt_builder.py is a single shared mutable file.
    scorer = make_memoizing_splice_scorer(
        install_fn=install_candidate,
        score_fn=score_task_id,
        lock=threading.Lock(),
    )

    metric = make_prompt_fitness_metric(
        baseline_text=baseline_text,
        max_growth=max_growth,
        closed_loop_scorer=scorer,
    )

    eval_lm = instantiate_lm(
        resolve_default_lm(role="eval", explicit_model=eval_model),
        temperature=0.0, request_timeout=120, num_retries=3,
    )
    reflection_lm = instantiate_lm(
        resolve_default_lm(
            role="reflection", explicit_model=reflection_model or optimizer_model
        ),
        temperature=1.0, max_tokens=32000, cache=False,
        request_timeout=300, num_retries=2,
    )

    baseline_module = PromptModule(section_name, baseline_text)
    # Same max_growth as the metric above: otherwise the proposer's prompt
    # would advertise the default budget while candidates are scored
    # against the caller's.
    proposer = PromptSectionProposer(
        section_name, baseline_chars=baseline_chars, max_growth=max_growth
    )
    trainset = _behavioral_examples(train_tasks)
    valset = _behavioral_examples(holdout_tasks)

    try:
        start_time = time.time()
        with _prompt_builder_guard(installer.target_path):
            # --- Saturation pre-flight (baseline behavior on holdout) ---
            if not skip_saturation_check:
                sat_report = saturation_preflight(
                    baseline_module=baseline_module,
                    holdout_examples=valset,
                    metric=metric,
                    lm=eval_lm,
                    baseline_artifact_text=baseline_text,
                )
                render_saturation_panel(sat_report, console=console)
                if sat_report.band != "healthy" and not force_saturation_check:
                    if is_non_interactive():
                        console.print(
                            "[yellow]Non-interactive context; refusing to "
                            "proceed (saturated baseline). Pass "
                            "--force-saturation-check to override.[/yellow]"
                        )
                        write_gate_decision(output_dir, {
                            "schema_version": _GATE_SCHEMA_VERSION,
                            "decision": "denied",
                            "reason": "saturated_baseline",
                            "decision_signal": "closed_loop",
                            "saturation_band": sat_report.band,
                            "run_inputs": run_inputs,
                            "pr_created": disabled_pr_block(),
                            **section_payload,
                        })
                        return {"decision": "denied", "reason": "saturated_baseline"}
                    if not interactive_confirm():
                        console.print("[yellow]Aborted by user.[/yellow]")
                        # Record the abort like every other exit path does.
                        write_gate_decision(output_dir, {
                            "schema_version": _GATE_SCHEMA_VERSION,
                            "decision": "aborted",
                            "reason": "user_abort",
                            "decision_signal": "closed_loop",
                            "saturation_band": sat_report.band,
                            "run_inputs": run_inputs,
                            "pr_created": disabled_pr_block(),
                            **section_payload,
                        })
                        return {"decision": "aborted", "reason": "user_abort"}

            # --- GEPA optimization ---
            console.print(
                f"\n[bold cyan]Running GEPA (max_full_evals={iterations})[/bold cyan]\n"
            )
            optimizer = dspy.GEPA(
                metric=metric,
                max_full_evals=iterations,
                reflection_lm=reflection_lm,
                seed=config.seed,
                track_stats=True,
                instruction_proposer=proposer,
                reflection_minibatch_size=config.reflection_minibatch_size,
                gepa_kwargs={"acceptance_criterion": config.gepa_acceptance},
            )
            optimized = optimizer.compile(
                baseline_module, trainset=trainset, valset=valset
            )

        # Guard released here — prompt_builder.py is restored to baseline.
        elapsed = time.time() - start_time

        if hasattr(optimized, "detailed_results"):
            details = optimized.detailed_results
            evolved_text = _section_text_from_candidate(
                details.candidates[details.best_idx], section_name
            )
            console.print(
                f"\n[bold]Candidate selection[/bold]: GEPA val-argmax "
                f"(candidate {details.best_idx}, "
                f"val={details.val_aggregate_scores[details.best_idx]:.3f}, "
                f"{len(evolved_text)} chars)"
            )
        else:
            evolved_text = optimized.section_text

        # --- Deploy gate: closed-loop baseline vs evolved on the holdout suite ---
        console.print(
            f"\n[bold]Deploy gate[/bold]: closed-loop on "
            f"{len(holdout_tasks)} holdout tasks"
        )
        holdout_suite = TaskSuite(
            path=suite.path, sha256=suite.sha256, tasks=tuple(holdout_tasks)
        )
        baseline_file = output_dir / "baseline_section.txt"
        evolved_file = output_dir / "evolved_section.txt"
        baseline_file.write_text(baseline_text, encoding="utf-8")
        evolved_file.write_text(evolved_text, encoding="utf-8")

        validator = ClosedLoopValidator(
            installer=installer,
            runner=runner,
            layer2_judge_factory=layer2_factory,
            layer2_threshold=layer2_threshold,
        )
        report = validator.validate(ValidationInputs(
            tool_name=section_name,
            suite=holdout_suite,
            baseline_artifact=baseline_file,
            evolved_artifact=evolved_file,
        ))
        deploy = report.decision == "pass"
    except CostCeilingExceeded as exc:
        console.print(f"[red]✗ Cost ceiling exceeded: {exc}[/red]")
        write_gate_decision(output_dir, {
            "schema_version": _GATE_SCHEMA_VERSION,
            "decision": "aborted",
            "reason": "cost_ceiling_exceeded",
            "decision_signal": "closed_loop",
            "cost": COST_LEDGER.summary(),
            "run_inputs": run_inputs,
            "pr_created": disabled_pr_block(),
            **section_payload,
        })
        return {"decision": "aborted", "reason": "cost_ceiling_exceeded"}

    # PR automation for prompt sections is deferred: create_pr copies a full
    # evolved file over origin/<base>'s prompt_builder.py, but our local
    # checkout carries the (unmerged) override-hook commit, which would
    # pollute the PR diff with unrelated changes. Until a section-scoped PR
    # path lands, --create-pr is recorded as skipped; use --apply + a manual PR.
    pr_block = disabled_pr_block()
    if create_pr_flag:
        pr_block = {
            "status": "skipped",
            "reason": "prompt-section PR automation deferred (would pollute diff "
                      "with the local override-hook commit); use --apply + manual PR",
            "url": None,
        }

    decision_payload = {
        "schema_version": _GATE_SCHEMA_VERSION,
        "decision": "deploy" if deploy else "reject",
        "reason": "passed" if deploy else "closed_loop_gate",
        "decision_signal": "closed_loop",
        "baseline_chars": baseline_chars,
        "evolved_chars": len(evolved_text),
        # Stored as a fraction (0.2 == +20%), despite the key name.
        "growth_pct": (len(evolved_text) - baseline_chars) / max(1, baseline_chars),
        "closed_loop": {
            "decision": report.decision,
            "decision_reasons": report.decision_reasons,
            "baseline_pass_rate": report.baseline.pass_rate,
            "evolved_pass_rate": report.evolved.pass_rate,
            "n_wins": report.delta.n_wins,
            "n_losses": report.delta.n_losses,
            "n_ties": report.delta.n_ties,
        },
        "sentinel_failures": proposer.sentinel_failures,
        "elapsed_seconds": elapsed,
        "cost": COST_LEDGER.summary(),
        "run_inputs": run_inputs,
        "pr_created": pr_block,
        **section_payload,
    }
    gate_path = write_gate_decision(output_dir, decision_payload)
    console.print(f"  [dim]Gate decision logged to {gate_path}[/dim]")

    if not deploy:
        console.print(
            f"[red]✗ Evolved section REJECTED by closed-loop gate "
            f"({report.decision}) — not deploying[/red]"
        )
        return {"decision": "reject", "reason": "closed_loop_gate"}

    console.print(
        f"[green]✓ Evolved section PASSED "
        f"(baseline {report.baseline.pass_rate:.2f} → "
        f"evolved {report.evolved.pass_rate:.2f}, "
        f"{report.delta.n_wins}W/{report.delta.n_losses}L)[/green]"
    )
    if apply:
        source.write(section_name, evolved_text)
        console.print(
            f"  [green]✓ Applied evolved {section_name} to {installer.target_path}[/green]"
        )

    return {
        "decision": "deploy",
        "reason": "passed",
        "evolved_chars": len(evolved_text),
        "applied": apply,
    }
+@click.option("--gepa-acceptance", default="improvement-or-equal", + type=click.Choice(["improvement-or-equal", "strict-improvement"])) +@click.option("--skip-saturation-check", is_flag=True, default=False) +@click.option("--force-saturation-check", is_flag=True, default=False, + help="Proceed even if the baseline looks saturated.") +@click.option("--apply", is_flag=True, default=False, + help="On a passing gate, write the evolved section into prompt_builder.py.") +@click.option("--create-pr", "create_pr_flag", is_flag=True, default=False, + help="(Deferred for prompt sections — recorded as skipped.)") +@click.option("--dry-run", is_flag=True, default=False, + help="Exercise wiring without any LM/agent calls.") +@click.option("--output-dir", default=None, + type=click.Path(file_okay=False, dir_okay=True, path_type=Path)) +def main(section_name, hermes_repo, tasks_path, iterations, holdout_ratio, seed, + max_growth, optimizer_model, reflection_model, eval_model, agent_model, + layer2_threshold, task_timeout_seconds, max_total_cost_usd, + gepa_minibatch_size, gepa_acceptance, skip_saturation_check, + force_saturation_check, apply, create_pr_flag, dry_run, output_dir): + """Evolve one Hermes system-prompt section via GEPA + closed-loop validation.""" + result = evolve_prompt_section( + section_name=section_name, + hermes_repo=hermes_repo, + tasks_path=tasks_path, + iterations=iterations, + holdout_ratio=holdout_ratio, + seed=seed, + max_growth=max_growth, + optimizer_model=optimizer_model, + reflection_model=reflection_model, + eval_model=eval_model, + agent_model=agent_model, + layer2_threshold=layer2_threshold, + task_timeout_seconds=task_timeout_seconds, + max_total_cost_usd=max_total_cost_usd, + gepa_minibatch_size=gepa_minibatch_size, + gepa_acceptance=gepa_acceptance, + skip_saturation_check=skip_saturation_check, + force_saturation_check=force_saturation_check, + apply=apply, + create_pr_flag=create_pr_flag, + dry_run=dry_run, + output_dir=output_dir, + ) + 
sys.exit(0 if result["decision"] in {"deploy", "dry_run"} else 1) + + +if __name__ == "__main__": + main() diff --git a/evolution/prompts/prompt_judge.py b/evolution/prompts/prompt_judge.py index b4e705a8..8d5a9624 100644 --- a/evolution/prompts/prompt_judge.py +++ b/evolution/prompts/prompt_judge.py @@ -7,6 +7,7 @@ from __future__ import annotations import logging +import threading from typing import Any, Callable, Optional import dspy @@ -164,6 +165,7 @@ def make_memoizing_splice_scorer( *, install_fn: Callable[[str], None], score_fn: Callable[[str], float], + lock: Optional[threading.Lock] = None, ) -> Callable[[str, str], float]: """Build ``closed_loop_scorer(task_id, candidate_text) -> float`` that splices a candidate only when it changes. @@ -174,16 +176,25 @@ def make_memoizing_splice_scorer( tasks for the same candidate reuse the live splice. ``score_fn(task_id)`` runs the task through the agent with whatever candidate is installed. + The splice + run is serialized under ``lock`` (a fresh ``threading.Lock`` + by default). ``dspy.Evaluate`` scores with a thread pool, but the spliced + ``prompt_builder.py`` is one shared mutable file — without serialization a + second thread could re-splice a different candidate while the first thread's + ``hermes -z`` subprocess is mid-read. Behavioral scoring is therefore + effectively serial; that's an accepted v1 cost of splice-and-restore. + Backup/restore of the mutated source is the caller's responsibility — wrap the whole GEPA run, not each call (the per-run guard mirrors ``ClosedLoopValidator``'s splice-once-per-phase shape). 
""" state: dict[str, Any] = {"installed": _UNSET} + lock = lock if lock is not None else threading.Lock() def scorer(task_id: str, candidate_text: str) -> float: - if state["installed"] != candidate_text: - install_fn(candidate_text) - state["installed"] = candidate_text - return score_fn(task_id) + with lock: + if state["installed"] != candidate_text: + install_fn(candidate_text) + state["installed"] = candidate_text + return score_fn(task_id) return scorer diff --git a/evolution/validation/validator.py b/evolution/validation/validator.py index 2f788c93..de248f07 100644 --- a/evolution/validation/validator.py +++ b/evolution/validation/validator.py @@ -85,15 +85,19 @@ def __init__( installer: ArtifactInstaller, runner: AgentRunner, *, - layer2_judge_fn: Optional[Callable[[list[dict]], float]] = None, + layer2_judge_factory: Optional[ + Callable[[Task], Optional[Callable[[list[dict]], float]]] + ] = None, layer2_threshold: float = 0.7, ) -> None: self.installer = installer self.runner = runner - # Optional compound-verdict Layer 2 (prompt-section suites). When - # unset, scoring is Layer 1 only — the tool-description path is - # unchanged. - self.layer2_judge_fn = layer2_judge_fn + # Optional compound-verdict Layer 2 (prompt-section suites). The + # factory builds a per-task scorer from the task — prompt-section + # judging needs the task's expected_save_content rubric and message, + # which a single global fn couldn't carry. When unset, scoring is + # Layer 1 only and the tool-description path is unchanged. 
+ self.layer2_judge_factory = layer2_judge_factory self.layer2_threshold = layer2_threshold def validate(self, inputs: ValidationInputs) -> ValidationReport: @@ -155,13 +159,18 @@ def _run_one_task(self, task: Task) -> TaskResult: skills_src=getattr(self.installer, "skills_src", None), ) run = self.runner.run(ctx) + layer2_judge_fn = ( + self.layer2_judge_factory(task) + if self.layer2_judge_factory is not None + else None + ) passed, abstained = score_task( expected_tools=task.expected_tools, forbidden_tools=task.forbidden_tools, run=run, test_command=task.test_command, fixture_dir=fixture_dir, - layer2_judge_fn=self.layer2_judge_fn, + layer2_judge_fn=layer2_judge_fn, layer2_threshold=self.layer2_threshold, ) return TaskResult( diff --git a/tests/prompts/test_evolve_prompt_section.py b/tests/prompts/test_evolve_prompt_section.py new file mode 100644 index 00000000..34081c58 --- /dev/null +++ b/tests/prompts/test_evolve_prompt_section.py @@ -0,0 +1,110 @@ +"""Wiring tests for evolve_prompt_section — pure helpers + dry-run (no LM/agent).""" +from __future__ import annotations + +import json +import textwrap +from pathlib import Path + +from click.testing import CliRunner + +from evolution.prompts.evolve_prompt_section import ( + _make_layer2_factory, + _section_text_from_candidate, + _split_train_holdout, + evolve_prompt_section, + main, +) +from evolution.prompts.prompt_module import PromptModule +from evolution.validation.task import Task + + +def _task(task_id: str, rubric: str | None = None) -> Task: + return Task( + task_id=task_id, user_message="m", expected_tools=("memory",), + expected_save_content=rubric, + ) + + +def test_split_is_deterministic_and_non_empty(): + tasks = tuple(_task(f"t{i}") for i in range(10)) + train1, holdout1 = _split_train_holdout(tasks, holdout_ratio=0.5, seed=42) + train2, holdout2 = _split_train_holdout(tasks, holdout_ratio=0.5, seed=42) + assert [t.task_id for t in train1] == [t.task_id for t in train2] + assert [t.task_id for t 
in holdout1] == [t.task_id for t in holdout2] + assert train1 and holdout1 + assert len(train1) + len(holdout1) == 10 + + +def test_split_keeps_both_sides_non_empty_at_extremes(): + tasks = tuple(_task(f"t{i}") for i in range(4)) + train, holdout = _split_train_holdout(tasks, holdout_ratio=1.0, seed=1) + assert train and holdout # never starve the train side + + +def test_layer2_factory_returns_none_without_rubric(): + factory = _make_layer2_factory(judge=None) + assert factory(_task("t1", rubric=None)) is None + assert callable(factory(_task("t2", rubric="a rubric"))) + + +def test_section_text_from_candidate_module_and_dict(): + module = PromptModule("MEMORY_GUIDANCE", "candidate body") + assert _section_text_from_candidate(module, "MEMORY_GUIDANCE") == "candidate body" + instructions = module.passthrough.predict.signature.instructions + assert ( + _section_text_from_candidate( + {"passthrough.predict": instructions}, "MEMORY_GUIDANCE" + ) + == "candidate body" + ) + + +def _fake_repo(tmp_path: Path) -> Path: + (tmp_path / "agent").mkdir() + (tmp_path / "agent" / "prompt_builder.py").write_text(textwrap.dedent('''\ + MEMORY_GUIDANCE = "Save durable facts about the user." 
+ ''')) + return tmp_path + + +def _suite(tmp_path: Path) -> Path: + p = tmp_path / "suite.jsonl" + p.write_text("\n".join(json.dumps(r) for r in [ + {"task_id": "s1", "user_message": "I use uv.", + "expected_tools": ["memory"], "expected_save_content": "prefers uv"}, + {"task_id": "n1", "user_message": "summarize work", + "expected_tools": [], "forbidden_tools": ["memory"]}, + ]) + "\n") + return p + + +def test_dry_run_writes_gate_decision(tmp_path): + repo = _fake_repo(tmp_path) + suite = _suite(tmp_path) + out = tmp_path / "out" + result = evolve_prompt_section( + section_name="MEMORY_GUIDANCE", hermes_repo=repo, tasks_path=suite, + dry_run=True, output_dir=out, + ) + assert result["decision"] == "dry_run" + gate = json.loads((out / "gate_decision.json").read_text()) + assert gate["artifact_type"] == "prompt_section" + assert gate["target_section"] == "MEMORY_GUIDANCE" + # The baseline file must be byte-identical after a dry run (untouched). + assert "Save durable facts about the user." 
in ( + repo / "agent" / "prompt_builder.py" + ).read_text() + + +def test_cli_dry_run_exits_zero(tmp_path): + repo = _fake_repo(tmp_path) + suite = _suite(tmp_path) + runner = CliRunner() + res = runner.invoke(main, [ + "--section", "MEMORY_GUIDANCE", + "--hermes-repo", str(repo), + "--tasks", str(suite), + "--dry-run", + "--output-dir", str(tmp_path / "out"), + ]) + assert res.exit_code == 0, res.output diff --git a/tests/validation/test_validator.py b/tests/validation/test_validator.py index 3326c7a1..8767d1a8 100644 --- a/tests/validation/test_validator.py +++ b/tests/validation/test_validator.py @@ -83,14 +83,20 @@ def run(self, ctx): ) judged = [] + tasks_seen = [] - def judge_fn(memory_calls): - judged.append(memory_calls) - return 0.2 # below threshold → Layer 2 fails the task + def judge_factory(task): + tasks_seen.append(task.task_id) + + def judge_fn(memory_calls): + judged.append(memory_calls) + return 0.2 # below threshold → Layer 2 fails the task + + return judge_fn validator = ClosedLoopValidator( _StubInstaller(target), _MemoryRunner(target), - layer2_judge_fn=judge_fn, layer2_threshold=0.7, + layer2_judge_factory=judge_factory, layer2_threshold=0.7, ) report = validator.validate(ValidationInputs( tool_name="MEMORY_GUIDANCE", suite=suite, @@ -98,6 +104,8 @@ def judge_fn(memory_calls): )) # Judge invoked once per phase (baseline + evolved) on the one task. assert len(judged) == 2 + # Factory received the task each phase. + assert tasks_seen == ["t1", "t1"] # Both phases fail Layer 2 → 0 pass rate, no regression decision. 
assert report.baseline.pass_rate == 0.0 assert report.evolved.pass_rate == 0.0 From fc853abcb5d3ba33fb0c7b3e0f50169d6c5a4b41 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Mon, 1 Jun 2026 12:59:18 -0600 Subject: [PATCH 15/23] fix(validation): read agent sessions from hermes state.db MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modern hermes -z one-shot mode is ephemeral — it prints only the final response and no longer writes session_*.json. Sessions now persist to a SQLite state.db in HERMES_HOME. The runner globbed for the obsolete JSON files, so every closed-loop run abstained ('no session JSON'). Read the most-recent session's messages from state.db instead; the tool_calls column holds the same OpenAI-nested shape the extractors already parse, so the message-extraction core is shared between the JSON and DB paths. Unblocks all closed-loop validation (tools, skills, and prompt sections). --- evolution/validation/hermes_runner.py | 127 ++++++++++++++++++++----- tests/validation/test_hermes_runner.py | 110 +++++++++++++++++++-- 2 files changed, 205 insertions(+), 32 deletions(-) diff --git a/evolution/validation/hermes_runner.py b/evolution/validation/hermes_runner.py index 82ddcde0..27498461 100644 --- a/evolution/validation/hermes_runner.py +++ b/evolution/validation/hermes_runner.py @@ -18,6 +18,7 @@ import logging import os import shutil +import sqlite3 import subprocess import tempfile import time @@ -153,15 +154,17 @@ def run(self, ctx: TaskRunContext) -> AgentRunResult: ) duration = time.time() - start - session_path = _find_latest_session(sandbox / "sessions") - if session_path is None: + # Modern hermes persists the session to a SQLite ``state.db`` in + # HERMES_HOME (one-shot ``-z`` no longer writes ``session_*.json``). 
+ db_path = sandbox / "state.db" + if not db_path.is_file(): return AgentRunResult( tool_calls_seq=[], final_text_tail="", duration_seconds=duration, - error="no session JSON written by hermes -z", + error="no session written by hermes -z (state.db absent)", ) - return parse_session_result(session_path, duration_seconds=duration) + return parse_session_from_db(db_path, duration_seconds=duration) finally: shutil.rmtree(sandbox, ignore_errors=True) @@ -175,17 +178,6 @@ def _prime_sandbox(self, sandbox: Path, ctx: TaskRunContext) -> None: shutil.copytree(ctx.skills_src, sandbox / "skills") -def _find_latest_session(sessions_dir: Path) -> Optional[Path]: - if not sessions_dir.exists(): - return None - candidates = sorted( - sessions_dir.glob("session_*.json"), - key=lambda p: p.stat().st_mtime, - reverse=True, - ) - return candidates[0] if candidates else None - - def parse_session_result( session_path: Path, *, @@ -193,9 +185,9 @@ def parse_session_result( ) -> AgentRunResult: """Read a Hermes session JSON and extract the tool-call sequence + final text. - Public for tests: hand-crafted fixture JSONs in - ``tests/validation/test_hermes_runner.py`` exercise this directly - rather than going through the subprocess layer. + Retained for the legacy ``session_*.json`` shape and unit tests that + exercise the message extractors with hand-crafted fixtures. The live + runner reads ``state.db`` via ``parse_session_from_db``. 
""" try: data = json.loads(session_path.read_text()) @@ -209,18 +201,103 @@ def parse_session_result( ) messages = data.get("messages") or [] - tool_calls_seq = _extract_tool_call_names(messages) - tool_calls_with_args = _extract_tool_calls_with_args(messages) - final_text_tail = _extract_final_text_tail(messages) - model_name = data.get("model") + return _result_from_messages( + messages, + duration_seconds=duration_seconds, + model_name=data.get("model"), + session_path=session_path, + ) + +def parse_session_from_db( + db_path: Path, + *, + duration_seconds: float, +) -> AgentRunResult: + """Reconstruct an ``AgentRunResult`` from a Hermes ``state.db``. + + Modern hermes persists each session to SQLite. We read the most-recent + session's messages and normalize them into the same message-dict shape the + legacy JSON path produced, so the existing extractors work unchanged. The + ``messages.tool_calls`` column holds the OpenAI-nested + ``{"function": {"name", "arguments"}}`` list verbatim. + """ + try: + conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True) + except sqlite3.Error as exc: + return AgentRunResult( + tool_calls_seq=[], + final_text_tail="", + duration_seconds=duration_seconds, + error=f"could not open session DB at {db_path}: {exc}", + session_path=db_path, + ) + try: + conn.row_factory = sqlite3.Row + session = conn.execute( + "SELECT id, model FROM sessions ORDER BY started_at DESC LIMIT 1" + ).fetchone() + if session is None: + return AgentRunResult( + tool_calls_seq=[], + final_text_tail="", + duration_seconds=duration_seconds, + error=f"session DB at {db_path} has no sessions", + session_path=db_path, + ) + rows = conn.execute( + "SELECT role, content, tool_calls FROM messages " + "WHERE session_id = ? 
ORDER BY id", + (session["id"],), + ).fetchall() + except sqlite3.Error as exc: + return AgentRunResult( + tool_calls_seq=[], + final_text_tail="", + duration_seconds=duration_seconds, + error=f"could not read session DB at {db_path}: {exc}", + session_path=db_path, + ) + finally: + conn.close() + + messages: list[dict] = [] + for row in rows: + raw_calls = row["tool_calls"] + parsed_calls: Any = None + if raw_calls: + try: + parsed_calls = json.loads(raw_calls) + except (json.JSONDecodeError, TypeError): + parsed_calls = None + messages.append({ + "role": row["role"], + "content": row["content"] or "", + "tool_calls": parsed_calls, + }) + return _result_from_messages( + messages, + duration_seconds=duration_seconds, + model_name=session["model"], + session_path=db_path, + ) + + +def _result_from_messages( + messages: list[dict], + *, + duration_seconds: float, + model_name: Optional[str], + session_path: Optional[Path], +) -> AgentRunResult: + """Build an ``AgentRunResult`` from a normalized message list.""" return AgentRunResult( - tool_calls_seq=tool_calls_seq, - final_text_tail=final_text_tail, + tool_calls_seq=_extract_tool_call_names(messages), + final_text_tail=_extract_final_text_tail(messages), duration_seconds=duration_seconds, model_name=model_name, session_path=session_path, - tool_calls_with_args=tool_calls_with_args, + tool_calls_with_args=_extract_tool_calls_with_args(messages), ) diff --git a/tests/validation/test_hermes_runner.py b/tests/validation/test_hermes_runner.py index da1b2fea..243d3dbf 100644 --- a/tests/validation/test_hermes_runner.py +++ b/tests/validation/test_hermes_runner.py @@ -11,6 +11,7 @@ from __future__ import annotations import json +import sqlite3 from pathlib import Path from unittest.mock import patch @@ -20,10 +21,45 @@ from evolution.validation.hermes_runner import ( HermesAgentRunner, _strip_litellm_provider_prefix, + parse_session_from_db, parse_session_result, ) +def _make_state_db(path: Path, *, session_id: str, model: 
str, messages: list[dict], + started_at: float = 1.0) -> None: + """Create a minimal hermes-shaped state.db with one session + messages. + + Each ``messages`` entry: ``{"role", "content"?, "tool_calls"?}`` where + ``tool_calls`` is a Python list serialized to the ``tool_calls`` TEXT + column (the OpenAI-nested shape hermes stores). + """ + conn = sqlite3.connect(path) + conn.executescript( + """ + CREATE TABLE sessions (id TEXT PRIMARY KEY, model TEXT, started_at REAL); + CREATE TABLE messages ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT, role TEXT, content TEXT, tool_calls TEXT + ); + """ + ) + conn.execute( + "INSERT INTO sessions (id, model, started_at) VALUES (?, ?, ?)", + (session_id, model, started_at), + ) + for m in messages: + tc = m.get("tool_calls") + conn.execute( + "INSERT INTO messages (session_id, role, content, tool_calls) " + "VALUES (?, ?, ?, ?)", + (session_id, m["role"], m.get("content"), + json.dumps(tc) if tc is not None else None), + ) + conn.commit() + conn.close() + + class TestStripLitellmProviderPrefix: """The hermes -m flag interprets '/' as openrouter-style routing. 
Direct-provider users naturally pass litellm-formatted names @@ -225,6 +261,65 @@ def test_handles_flat_dict_args(self, tmp_path): ] +class TestParseSessionFromDb: + """The state.db parse layer — modern hermes persists sessions to SQLite.""" + + def test_extracts_tool_calls_and_args(self, tmp_path): + db = tmp_path / "state.db" + _make_state_db(db, session_id="s1", model="gpt-5.4-mini", messages=[ + {"role": "user", "content": "remember I use uv"}, + {"role": "assistant", "tool_calls": [ + {"type": "function", "function": { + "name": "memory", + "arguments": json.dumps({"action": "add", "content": "uses uv"}), + }} + ]}, + {"role": "tool", "content": "ok"}, + {"role": "assistant", "content": "Saved."}, + ]) + result = parse_session_from_db(db, duration_seconds=2.0) + assert result.error is None + assert result.model_name == "gpt-5.4-mini" + assert result.tool_calls_seq == ["memory"] + assert result.tool_calls_with_args == [ + {"name": "memory", "arguments": {"action": "add", "content": "uses uv"}} + ] + assert result.final_text_tail == "Saved." + + def test_no_sessions_is_error(self, tmp_path): + db = tmp_path / "state.db" + conn = sqlite3.connect(db) + conn.executescript( + "CREATE TABLE sessions (id TEXT, model TEXT, started_at REAL);" + "CREATE TABLE messages (id INTEGER PRIMARY KEY, session_id TEXT, " + "role TEXT, content TEXT, tool_calls TEXT);" + ) + conn.commit() + conn.close() + result = parse_session_from_db(db, duration_seconds=1.0) + assert result.error is not None + assert "no sessions" in result.error + + def test_picks_most_recent_session(self, tmp_path): + db = tmp_path / "state.db" + _make_state_db(db, session_id="old", model="m", started_at=1.0, messages=[ + {"role": "assistant", "tool_calls": [{"function": {"name": "patch"}}]}, + ]) + # Add a newer session with a different tool call. 
+ conn = sqlite3.connect(db) + conn.execute("INSERT INTO sessions (id, model, started_at) VALUES (?,?,?)", + ("new", "m", 2.0)) + conn.execute( + "INSERT INTO messages (session_id, role, content, tool_calls) VALUES (?,?,?,?)", + ("new", "assistant", None, + json.dumps([{"function": {"name": "write_file"}}])), + ) + conn.commit() + conn.close() + result = parse_session_from_db(db, duration_seconds=1.0) + assert result.tool_calls_seq == ["write_file"] + + class TestHermesAgentRunnerSubprocess: """The subprocess invocation layer: env + cwd + args plumbing.""" @@ -242,12 +337,13 @@ def _fake_run(*args, **kwargs): captured["args"] = args[0] if args else kwargs.get("args") captured["env"] = kwargs.get("env") captured["cwd"] = kwargs.get("cwd") - # Drop a minimal session JSON so the parse layer succeeds. + # Drop a minimal state.db so the parse layer succeeds. sandbox = Path(kwargs["env"]["HERMES_HOME"]) - (sandbox / "sessions").mkdir(exist_ok=True) - _write_session( - sandbox / "sessions" / "session_test.json", - [{"role": "assistant", "tool_calls": [{"function": {"name": "patch"}}]}], + _make_state_db( + sandbox / "state.db", + session_id="s1", model="test-model", + messages=[{"role": "assistant", "tool_calls": [ + {"function": {"name": "patch"}}]}], ) return type("CP", (), {"returncode": 0, "stdout": "", "stderr": ""})() @@ -302,7 +398,7 @@ def test_no_session_written_returns_error_result(self, fixture_dir, tmp_path): runner = HermesAgentRunner(user_config_path=tmp_path / "x") def _fake_run(*args, **kwargs): - # Don't drop a session JSON. + # Don't write a state.db. 
return type("CP", (), {"returncode": 0, "stdout": "", "stderr": ""})() with patch("evolution.validation.hermes_runner.subprocess.run", side_effect=_fake_run): @@ -311,7 +407,7 @@ def _fake_run(*args, **kwargs): fixture_dir=fixture_dir, )) assert result.error is not None - assert "no session JSON" in result.error + assert "state.db absent" in result.error def test_user_config_copied_into_sandbox_when_exists(self, fixture_dir, tmp_path): user_config = tmp_path / "user_config.yaml" From 63f3fd270d0dff40d00169b4dc4fb6aa14460e8a Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Mon, 1 Jun 2026 12:59:18 -0600 Subject: [PATCH 16/23] fix(prompts): judge real memory actions (add/replace), not 'save' The Hermes memory tool's content-bearing actions are add and replace (full set: add/replace/remove/read); there is no 'save' action. The Layer 2 filter matched the nonexistent 'save', so it never scored any real call. Match SAVE_ACTIONS = {add, replace} instead. --- evolution/prompts/prompt_judge.py | 12 +++++++++--- tests/prompts/test_prompt_judge.py | 14 ++++++++------ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/evolution/prompts/prompt_judge.py b/evolution/prompts/prompt_judge.py index 8d5a9624..8d6c0b4b 100644 --- a/evolution/prompts/prompt_judge.py +++ b/evolution/prompts/prompt_judge.py @@ -22,6 +22,12 @@ beyond the cap score 0 each — bounds cost on pathological cases where the agent saves on every turn.""" +SAVE_ACTIONS = frozenset({"add", "replace"}) +"""Hermes ``memory`` tool actions that persist content worth judging. The +tool's full action set is add / replace / remove / read (see +``tools/memory_tool.py``); only ``add`` and ``replace`` carry a ``content`` +payload, so only those are content-judged. ``remove`` / ``read`` are not saves.""" + class SaveCallSignature(dspy.Signature): """Score a memory-save call against MEMORY_GUIDANCE's rules. @@ -81,14 +87,14 @@ def judge_save_calls( """Aggregate the Layer 2 score across a task's memory-save calls. 
``calls`` is the subset of ``tool_calls_with_args`` whose name is - ``memory`` — each item the call's ``arguments`` dict. Only - ``action == 'save'`` calls are judged. + ``memory`` — each item the call's ``arguments`` dict. Only content-bearing + save actions (``add`` / ``replace``, see ``SAVE_ACTIONS``) are judged. Returns 1.0 when no save calls were made (Layer 1 catches the "should-have-saved-but-didn't" failure; Layer 2 only scores what actually happened) and also when no judge/rubric is configured. """ - save_calls = [c for c in calls if c.get("action") == "save"] + save_calls = [c for c in calls if c.get("action") in SAVE_ACTIONS] if not save_calls: return 1.0 if judge is None or expected_content is None: diff --git a/tests/prompts/test_prompt_judge.py b/tests/prompts/test_prompt_judge.py index c0e620a7..e1e4efe8 100644 --- a/tests/prompts/test_prompt_judge.py +++ b/tests/prompts/test_prompt_judge.py @@ -18,8 +18,8 @@ def test_invokes_judge_per_call_and_means(): fake_judge = MagicMock(spec=SaveCallJudge) fake_judge.score.side_effect = [0.8, 0.6] calls = [ - {"action": "save", "content": "user prefers concise responses"}, - {"action": "save", "content": "completed phase 3"}, + {"action": "add", "content": "user prefers concise responses"}, + {"action": "replace", "content": "completed phase 3"}, ] score = judge_save_calls( judge=fake_judge, calls=calls, @@ -33,7 +33,7 @@ def test_caps_at_five_calls(): """Pathological: agent saves on every turn. 
Judge at most 5; excess score 0.""" fake_judge = MagicMock(spec=SaveCallJudge) fake_judge.score.return_value = 1.0 - calls = [{"action": "save", "content": f"item {i}"} for i in range(10)] + calls = [{"action": "add", "content": f"item {i}"} for i in range(10)] score = judge_save_calls(judge=fake_judge, calls=calls, expected_content="any") # 5 scored 1.0, 5 unjudged scored 0 → mean 0.5 assert score == pytest.approx(0.5) @@ -41,11 +41,13 @@ def test_caps_at_five_calls(): def test_filters_non_save_actions(): + """Only content-bearing actions (add/replace) are judged; remove/read skipped.""" fake_judge = MagicMock(spec=SaveCallJudge) fake_judge.score.return_value = 1.0 calls = [ - {"action": "delete", "key": "x"}, - {"action": "save", "content": "real save"}, + {"action": "remove", "old_text": "x"}, + {"action": "read"}, + {"action": "add", "content": "real save"}, ] score = judge_save_calls(judge=fake_judge, calls=calls, expected_content="any") assert score == pytest.approx(1.0) @@ -54,7 +56,7 @@ def test_filters_non_save_actions(): def test_none_judge_or_expected_is_vacuous_pass(): """A save call exists but no judge/rubric configured → don't penalize.""" - calls = [{"action": "save", "content": "x"}] + calls = [{"action": "add", "content": "x"}] assert judge_save_calls(judge=None, calls=calls, expected_content="r") == 1.0 fake = MagicMock(spec=SaveCallJudge) assert judge_save_calls(judge=fake, calls=calls, expected_content=None) == 1.0 From f685314eb10b611d6dc00c54b64211feb30d7764 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Mon, 1 Jun 2026 12:59:18 -0600 Subject: [PATCH 17/23] fix(prompts): invoke passthrough predictor so GEPA can reflect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PromptModule.forward returned a Prediction without calling the predictor, so GEPA captured no trace for passthrough.predict and make_reflective_dataset raised 'No valid predictions found' every iteration — no candidate was ever proposed. 
The tool path gets traces from synthetic examples; prompt sections are pure-behavioral, so forward must call the passthrough to produce a trace. The predictor output stays a placeholder; the real score is the metric's behavioral branch. --- evolution/prompts/prompt_module.py | 13 ++++++++++--- tests/prompts/test_prompt_module.py | 15 +++++++++++---- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/evolution/prompts/prompt_module.py b/evolution/prompts/prompt_module.py index ad8f3487..9cd9de0b 100644 --- a/evolution/prompts/prompt_module.py +++ b/evolution/prompts/prompt_module.py @@ -98,10 +98,17 @@ def forward( task: str, closed_loop_task_id: Optional[str] = None, ) -> dspy.Prediction: - # Always route behaviorally — there is no cheap predictor score for - # a prompt section. The metric reads these via getattr. + # Invoke the passthrough predictor so GEPA captures a trace for + # ``passthrough.predict``. Without a traced predictor call, GEPA's + # make_reflective_dataset finds "no valid predictions" and never + # proposes a mutation (the tool path gets traces from synthetic + # examples; prompt sections are pure-behavioral, so the trace has + # to come from here). The predictor's output is a placeholder — the + # real score comes from the metric's behavioral branch, which reads + # the candidate text + task id attached below. 
+ result = self.passthrough(task=task) return dspy.Prediction( - response="", + response=getattr(result, "response", ""), _closed_loop_task_id=closed_loop_task_id, _candidate_text=self.section_text, ) diff --git a/tests/prompts/test_prompt_module.py b/tests/prompts/test_prompt_module.py index b369f003..667c88cd 100644 --- a/tests/prompts/test_prompt_module.py +++ b/tests/prompts/test_prompt_module.py @@ -27,11 +27,18 @@ def test_section_text_extracts_current_candidate(): assert module.section_text == "v2-mutated" -def test_forward_routes_behavioral(): - """forward always returns the candidate + task id for behavioral scoring — - there's no cheap predictor score for a prompt section.""" +def test_forward_invokes_predictor_and_attaches_behavioral_fields(): + """forward calls the passthrough predictor (so GEPA gets a trace) and + attaches the candidate text + task id for the metric's behavioral branch.""" + import dspy + from dspy.utils.dummies import DummyLM + module = PromptModule(section_name="MEMORY_GUIDANCE", candidate_text="evolved body") - pred = module.forward(task="anything", closed_loop_task_id="task-001") + # DummyLM lets the real predictor run offline (no network), so section_text + # still resolves while a predictor trace is produced for GEPA. 
+ with dspy.context(lm=DummyLM([{"reasoning": "n/a", "response": "placeholder"}])): + pred = module.forward(task="anything", closed_loop_task_id="task-001") + assert pred._candidate_text == "evolved body" assert pred._closed_loop_task_id == "task-001" From 3c96e57c9abd627cff6563d3e66a49f9cfadf9ba Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Mon, 1 Jun 2026 13:02:22 -0600 Subject: [PATCH 18/23] test(validation): use real memory actions (add) in compound-verdict fixtures --- tests/validation/test_report.py | 6 +++--- tests/validation/test_validator.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/validation/test_report.py b/tests/validation/test_report.py index 98b5d9ab..1000f9ca 100644 --- a/tests/validation/test_report.py +++ b/tests/validation/test_report.py @@ -75,7 +75,7 @@ def _save_run(self, content: str = "good") -> AgentRunResult: return AgentRunResult( tool_calls_seq=["memory"], final_text_tail="", duration_seconds=0.0, tool_calls_with_args=[ - {"name": "memory", "arguments": {"action": "save", "content": content}} + {"name": "memory", "arguments": {"action": "add", "content": content}} ], ) @@ -123,7 +123,7 @@ def test_judge_receives_only_memory_call_args(self): tool_calls_seq=["read_file", "memory"], final_text_tail="", duration_seconds=0.0, tool_calls_with_args=[ {"name": "read_file", "arguments": {"path": "x"}}, - {"name": "memory", "arguments": {"action": "save", "content": "c"}}, + {"name": "memory", "arguments": {"action": "add", "content": "c"}}, ], ) received = [] @@ -136,7 +136,7 @@ def judge_fn(memory_calls): expected_tools=("memory",), forbidden_tools=(), run=run, layer2_judge_fn=judge_fn, layer2_threshold=0.7, ) - assert received == [[{"action": "save", "content": "c"}]] + assert received == [[{"action": "add", "content": "c"}]] class TestScoreTaskTestCommandMode: diff --git a/tests/validation/test_validator.py b/tests/validation/test_validator.py index 8767d1a8..433e323a 100644 --- 
a/tests/validation/test_validator.py +++ b/tests/validation/test_validator.py @@ -78,7 +78,7 @@ def run(self, ctx): tool_calls_seq=["memory"], final_text_tail="ok", duration_seconds=0.1, model_name="test-model", tool_calls_with_args=[ - {"name": "memory", "arguments": {"action": "save", "content": "x"}} + {"name": "memory", "arguments": {"action": "add", "content": "x"}} ], ) From 384a6d472a4944ea57a713e282794e17f1cc5df7 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Mon, 1 Jun 2026 19:14:55 -0600 Subject: [PATCH 19/23] fix(prompts): configure global LM so GEPA worker threads can run the predictor The forward() trace fix was necessary but insufficient: GEPA evaluates the module in worker threads that don't inherit the saturation pre-flight's dspy.context(lm=...), so the passthrough predictor raised 'No LM is loaded', captured no trajectories, and never proposed. Set the global default LM via dspy.configure (matching evolve_tool), which the parallelizer propagates to worker threads. GEPA now scores the valset correctly and the proposer fires; on a saturated target it correctly declines to mutate (no failures to ground a change in). 
--- evolution/prompts/evolve_prompt_section.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/evolution/prompts/evolve_prompt_section.py b/evolution/prompts/evolve_prompt_section.py index 288b8737..bd8d227e 100644 --- a/evolution/prompts/evolve_prompt_section.py +++ b/evolution/prompts/evolve_prompt_section.py @@ -39,6 +39,7 @@ from evolution.core.lm_timing_callback import ( COST_LEDGER, CostCeilingExceeded, + LMTimingCallback, register_litellm_cost_callback, register_litellm_failure_callback, ) @@ -363,6 +364,15 @@ def score_task_id(task_id: str) -> float: resolve_default_lm(role="eval", explicit_model=eval_model), temperature=0.0, request_timeout=120, num_retries=3, ) + # Set the global default LM so the passthrough predictor resolves an LM + # inside GEPA's worker threads (dspy.context only covers the saturation + # pre-flight's own eval). Without this, forward()'s passthrough call raises + # "No LM is loaded" in GEPA threads → no trajectories → no proposal. + dspy.configure( + lm=eval_lm, + warn_on_type_mismatch=False, + callbacks=[LMTimingCallback()], + ) reflection_lm = instantiate_lm( resolve_default_lm( role="reflection", explicit_model=reflection_model or optimizer_model From d44b81f2687cff5a9a671440b3d9286ace95465c Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Mon, 1 Jun 2026 20:42:57 -0600 Subject: [PATCH 20/23] feat(prompts): --baseline-override-file to evolve from arbitrary starting text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lets evolution start from text other than the live section — e.g. a deliberately-weakened or adversarial baseline to create headroom for demonstrating a real mutation, or a regression-injection ablation. The live section remains the splice/restore target (backed up + restored), so the user's file is never left mutated; --apply still writes the evolved text. 
Verified end-to-end: an adversarial 'never save' baseline scored 0.67, GEPA proposed a corrected section, deploy gate measured 0.67 -> 1.00 (2W/0L). --- evolution/prompts/evolve_prompt_section.py | 21 +++++++++++++++++++-- tests/prompts/test_evolve_prompt_section.py | 18 ++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/evolution/prompts/evolve_prompt_section.py b/evolution/prompts/evolve_prompt_section.py index bd8d227e..ef5ae3d1 100644 --- a/evolution/prompts/evolve_prompt_section.py +++ b/evolution/prompts/evolve_prompt_section.py @@ -246,11 +246,21 @@ def evolve_prompt_section( create_pr_flag: bool = False, dry_run: bool = False, output_dir: Optional[Path] = None, + baseline_override_file: Optional[Path] = None, ) -> dict[str, Any]: """Evolve one prompt section end-to-end. Returns a summary dict.""" hermes_repo = Path(hermes_repo).resolve() source = HermesPromptSource(hermes_repo) - baseline_text = source.read(section_name) + # The live section is always the splice/restore target. ``baseline_override`` + # lets evolution START from different text (e.g. a deliberately-weakened + # baseline to create headroom, or a regression-injection ablation) without + # touching the real file — the guard still backs up and restores the live + # section. ``--apply`` writes the evolved text into the live section as usual. 
+ source.read(section_name) # validate the section exists / is a string constant + if baseline_override_file is not None: + baseline_text = Path(baseline_override_file).read_text(encoding="utf-8") + else: + baseline_text = source.read(section_name) baseline_chars = len(baseline_text) suite = TaskSuite.from_jsonl(tasks_path) @@ -605,11 +615,17 @@ def score_task_id(task_id: str) -> float: help="Exercise wiring without any LM/agent calls.") @click.option("--output-dir", default=None, type=click.Path(file_okay=False, dir_okay=True, path_type=Path)) +@click.option("--baseline-override-file", default=None, + type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path), + help="Start evolution from this text instead of the live section " + "(e.g. a weakened baseline to create headroom). The live file " + "is still backed up + restored; --apply writes the evolved text.") def main(section_name, hermes_repo, tasks_path, iterations, holdout_ratio, seed, max_growth, optimizer_model, reflection_model, eval_model, agent_model, layer2_threshold, task_timeout_seconds, max_total_cost_usd, gepa_minibatch_size, gepa_acceptance, skip_saturation_check, - force_saturation_check, apply, create_pr_flag, dry_run, output_dir): + force_saturation_check, apply, create_pr_flag, dry_run, output_dir, + baseline_override_file): """Evolve one Hermes system-prompt section via GEPA + closed-loop validation.""" result = evolve_prompt_section( section_name=section_name, @@ -634,6 +650,7 @@ def main(section_name, hermes_repo, tasks_path, iterations, holdout_ratio, seed, create_pr_flag=create_pr_flag, dry_run=dry_run, output_dir=output_dir, + baseline_override_file=baseline_override_file, ) sys.exit(0 if result["decision"] in {"deploy", "dry_run"} else 1) diff --git a/tests/prompts/test_evolve_prompt_section.py b/tests/prompts/test_evolve_prompt_section.py index 34081c58..62e85ea9 100644 --- a/tests/prompts/test_evolve_prompt_section.py +++ b/tests/prompts/test_evolve_prompt_section.py 
@@ -96,6 +96,24 @@ def test_dry_run_writes_gate_decision(tmp_path): ).read_text() +def test_baseline_override_file_replaces_live_section(tmp_path): + repo = _fake_repo(tmp_path) + suite = _suite(tmp_path) + override = tmp_path / "weak.txt" + override.write_text("a deliberately weak baseline") + out = tmp_path / "out" + evolve_prompt_section( + section_name="MEMORY_GUIDANCE", hermes_repo=repo, tasks_path=suite, + dry_run=True, output_dir=out, baseline_override_file=override, + ) + gate = json.loads((out / "gate_decision.json").read_text()) + assert gate["baseline_chars"] == len("a deliberately weak baseline") + # The live file is never touched by an override dry run. + assert "Save durable facts about the user." in ( + repo / "agent" / "prompt_builder.py" + ).read_text() + + def test_cli_dry_run_exits_zero(tmp_path): repo = _fake_repo(tmp_path) suite = _suite(tmp_path) From 621b23da6a26440f922c03ee40a83ca0745a7220 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Mon, 1 Jun 2026 20:44:27 -0600 Subject: [PATCH 21/23] =?UTF-8?q?docs(plan):=20Phase=203=20deviations=20?= =?UTF-8?q?=E2=80=94=20splice-and-restore,=20compound=20verdict,=20state.d?= =?UTF-8?q?b=20runner=20fix,=20adversarial-baseline=20proof?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PLAN.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/PLAN.md b/PLAN.md index 5c587f64..d03e5001 100644 --- a/PLAN.md +++ b/PLAN.md @@ -466,6 +466,8 @@ These descriptions are sent with every API call as part of the tool schema — e **Goal:** Optimize the sections of the system prompt that guide agent behavior. +**Status:** ✅ Complete (MEMORY_GUIDANCE proof point). See "Deviations from plan" at the end of this section. + **Prerequisite:** Phase 2 gate passed — benchmark gating validated, GEPA producing sensible text mutations. **Week 1 (Build):** Build section-as-DSPy-parameter wrapper for the 5 evolvable prompt sections. 
Build behavioral test suite generator. This is the riskiest tier so far — system prompt changes affect everything. @@ -535,6 +537,26 @@ The system prompt is assembled in `run_agent.py` / `agent/prompt_builder.py` fro - Identity section must retain core traits (helpful, direct, admits uncertainty) - Platform hints must remain platform-accurate (don't tell Telegram to use ANSI codes) +**Deviations from plan (Phase 3):** + +1. **Integration is in-place splice-and-restore, not an env-var hook or a plugin.** The design's primary path routed candidate overrides through an upstream `HERMES_PROMPT_OVERRIDES_JSON` env var; that hook was not accepted upstream, so depending on it would make the framework a local-only patch that silently no-ops on any hermes pull. A plugin alternative was ruled out as non-viable: consumers bind the constants at import time (`from agent.prompt_builder import MEMORY_GUIDANCE` in `run_agent.py` and `agent/system_prompt.py`), so a plugin's `register()` runs too late to reach them. Phase 3 instead splices the candidate directly into `agent/prompt_builder.py` (byte-precise AST replacement via `repr()`, parse-checked) and restores from an atomic backup, reusing Phase 2's `ClosedLoopValidator` flock + sha-drift + stale-backup machinery. No upstream dependency; runs against stock Hermes. This also **collapsed the planned parallel `PromptSectionValidator`** into a small `HermesPromptSectionInstaller` (an `ArtifactInstaller`) plus a one-method Layer-2 hook on the shared validator — less code than the design called for. + +2. **One target section per run; MEMORY_GUIDANCE is the only proof point.** Joint multi-section optimization and identity/persona evolution are deferred — joint runs carry Phase 2's "stealing selections" risk, and `DEFAULT_AGENT_IDENTITY` has no tool-call anchor for the verdict. The `PromptSource` abstraction supports the other string sections (`SKILLS_GUIDANCE`, `SESSION_SEARCH_GUIDANCE`, etc.) 
with no refactor; dict-typed sections like `PLATFORM_HINTS` are out of scope for v1 (string constants only). + +3. **Verdict is compound (tool-membership + LLM content judge), threaded per-task.** Layer 1 is the Phase 2 expected/forbidden rule on whether `memory` was invoked; Layer 2 is an LLM judge scoring the saved content against each task's `expected_save_content` rubric. The validator builds the judge per-task (a factory) so the content judge sees the task's rubric and message — a fixed global judge couldn't. Note the real Hermes `memory` tool actions are **`add`/`replace`** (content-bearing), not `save`; the full set is add/replace/remove. + +4. **Eval suite ships as a curated 12-task golden set, not 50 synthetic + 10 golden.** A hand-authored `memory_guidance.jsonl` spans the five categories (save-preference, save-correction, dont-save-task-progress, dont-save-completed-work-log, declarative-vs-imperative). The synthetic generator (`build_memory_guidance_dataset`) is built and unit-tested, but full synthetic expansion via a funded generation run is deferred — curation gives a higher-signal first suite and avoids upfront generation spend. + +5. **PR automation is deferred for prompt sections.** `create_pr` atomically copies a full evolved artifact over `origin/`'s file; deriving an evolved `prompt_builder.py` from the local checkout would carry the unmerged override-hook commit into the PR diff. `--create-pr` is accepted but records a skipped block; the deploy path is `--apply` (writes the evolved section into the live file) plus a manual PR. A section-scoped PR path (splice into `origin/`'s file, not the local one) is future work. + +6. **The shared closed-loop runner had to be rebuilt for current Hermes.** Surfaced by the Phase 3 end-to-end smoke: `hermes -z` one-shot mode is now ephemeral — it prints only the final response and no longer writes `session_*.json`; sessions persist to a SQLite `state.db` in `HERMES_HOME`.
`HermesAgentRunner` globbed for the obsolete JSON files, so **every** closed-loop run had been silently abstaining ("no session JSON") across Phases 1–3. The runner now reads the most-recent session's messages from `state.db` (the `tool_calls` column carries the flat `{"name", "arguments"}` tool-call shape; the extractors also accept the older OpenAI-nested shape). This is a shared-infrastructure fix that unblocks all closed-loop validation, not just prompts. + +7. **Behavioral eval is serialized, and agent-subprocess cost is invisible to the budget cap.** Because every candidate is spliced into one shared `prompt_builder.py`, the GEPA inner-loop scorer serializes splice+run under a lock (DSPy's evaluator is multi-threaded; the shared file is not). Per-section closed-loop is therefore effectively serial — an accepted v1 cost of the splice-and-restore model. The agent's own LM spend happens inside the `hermes` child process, invisible to the in-process cost ledger, so `--max-cost-usd` bounds only judge + reflection + passthrough spend; `sessions.actual_cost_usd` in `state.db` could close that gap later. + +8. **Saturation default-deny confirmed on a capable agent; a demonstrated improvement required an adversarial baseline.** With `gpt-5.4-mini`, both the live `MEMORY_GUIDANCE` and a *passively*-weakened baseline scored 1.0 / 6 holdout — `no_headroom`, correctly default-denied. This matches Phase 2's "regression-catching, not improvement-finding on tuned artifacts" finding and the binary model-tier effect (a capable agent saves correctly regardless of vague guidance). A real mutation was demonstrated only by *actively misdirecting* the baseline: an adversarial "never proactively save" section scored 0.67, GEPA's reflective proposer inverted it to restore proactive saving (and made it shorter), and the deploy gate measured **0.67 → 1.00, 2 wins / 0 losses → deploy**. The `--baseline-override-file` flag enables this ablation (and regression-injection testing generally) without mutating the live section. + +9.
**Benchmark gating again not built in (same as Phases 1–2).** The built-in deploy gate is paired-bootstrap CI plus the dual-condition rule on the holdout; `--benchmark-cmd` remains the external-benchmark hook. TBLite / YC-Bench wiring is left to the user's `--benchmark-cmd`. + ### Phase 4: Code Evolution via Darwinian Evolver **Goal:** Evolve tool implementation code for better performance and fewer bugs. From b681c08c47d012fb9aec3d83cd44194d5bf000e4 Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Mon, 1 Jun 2026 21:53:27 -0600 Subject: [PATCH 22/23] =?UTF-8?q?fix(prompts,validation):=20address=20PR?= =?UTF-8?q?=20review=20=E2=80=94=20abstain=20on=20corrupt=20sessions,=20do?= =?UTF-8?q?c/comment=20accuracy,=20guards=20+=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Critical: a malformed tool_calls column in state.db now abstains (error set + logged) instead of reading as 'agent invoked no tools', which scored a DB-format regression as a fake behavioral failure and contaminated fitness. - Surface previously-silent fallbacks: malformed tool-call args, a memory call with no save action, and an unparseable judge score now log. - Doc/comment accuracy: memory action is add (not the nonexistent 'save'); tool schema enum is add/replace/remove (not 'read'); state.db tool_calls is the flat shape (nested handled for compat); the memoizing-scorer/validator splice cadence; the guard wraps pre-flight + GEPA; _closed_loop_task_id is set by PromptModule.forward. - Reject a <2-task suite up front (empty GEPA trainset otherwise). - Tests: parse_session_from_db malformed/corrupt/missing-table matrix; the _prompt_builder_guard restore round-trip, stale-backup refusal, and concurrent lock refusal; the single-task-suite guard. 
--- evolution/prompts/evolve_prompt_section.py | 12 ++++- evolution/prompts/prompt_judge.py | 46 +++++++++++++----- evolution/validation/agent_runner.py | 4 +- evolution/validation/hermes_runner.py | 30 ++++++++++-- evolution/validation/task.py | 2 +- tests/prompts/test_evolve_prompt_section.py | 54 +++++++++++++++++++++ tests/validation/test_hermes_runner.py | 38 +++++++++++++++ 7 files changed, 166 insertions(+), 20 deletions(-) diff --git a/evolution/prompts/evolve_prompt_section.py b/evolution/prompts/evolve_prompt_section.py index ef5ae3d1..4558e5c8 100644 --- a/evolution/prompts/evolve_prompt_section.py +++ b/evolution/prompts/evolve_prompt_section.py @@ -153,9 +153,11 @@ def _section_text_from_candidate(candidate: Any, section_name: str) -> str: @contextmanager def _prompt_builder_guard(target_path: Path) -> Iterator[None]: """Back up ``prompt_builder.py`` + hold the shared closed-loop flock for the - duration of GEPA evolution, then restore the original bytes on exit. + duration of the saturation pre-flight + GEPA evolution, then restore the + original bytes on exit. - The GEPA inner loop splices candidates directly into the live file; this + The pre-flight and GEPA inner loop splice candidates directly into the live + file; this guard guarantees the user's checkout is byte-restored afterward and that no concurrent harness run (which uses the same lock + backup names) mutates it mid-flight. Sequenced before the deploy-gate ``ClosedLoopValidator``, which @@ -264,6 +266,12 @@ def evolve_prompt_section( baseline_chars = len(baseline_text) suite = TaskSuite.from_jsonl(tasks_path) + if len(suite.tasks) < 2: + raise ValueError( + f"{tasks_path} has {len(suite.tasks)} task(s); need at least 2 so the " + f"split yields a non-empty GEPA trainset and a non-empty deploy-gate " + f"holdout." 
+ ) train_tasks, holdout_tasks = _split_train_holdout( suite.tasks, holdout_ratio=holdout_ratio, seed=seed ) diff --git a/evolution/prompts/prompt_judge.py b/evolution/prompts/prompt_judge.py index 8d6c0b4b..d6e20cc4 100644 --- a/evolution/prompts/prompt_judge.py +++ b/evolution/prompts/prompt_judge.py @@ -24,9 +24,9 @@ SAVE_ACTIONS = frozenset({"add", "replace"}) """Hermes ``memory`` tool actions that persist content worth judging. The -tool's full action set is add / replace / remove / read (see -``tools/memory_tool.py``); only ``add`` and ``replace`` carry a ``content`` -payload, so only those are content-judged. ``remove`` / ``read`` are not saves.""" +tool's schema enum is add / replace / remove (see ``tools/memory_tool.py``); +only ``add`` and ``replace`` carry a ``content`` payload, so only those are +content-judged. ``remove`` is not a save.""" class SaveCallSignature(dspy.Signature): @@ -74,6 +74,17 @@ def score(self, *, task: str, expected_content: str, saved_content: str) -> floa expected_content=expected_content, saved_content=saved_content, ) + # _clamp_to_unit returns a neutral 0.5 on unparseable output. A 0.5 is + # below the default 0.7 threshold, so a garbled judge response silently + # fails an otherwise-good save — log the raw value so that's debuggable + # rather than indistinguishable from a real mediocre score. + try: + float(str(result.quality).strip()) + except (ValueError, TypeError): + logger.warning( + "SaveCallJudge: unparseable quality %r from judge LM; " + "falling back to neutral 0.5", result.quality, + ) return _clamp_to_unit(result.quality) @@ -90,12 +101,22 @@ def judge_save_calls( ``memory`` — each item the call's ``arguments`` dict. Only content-bearing save actions (``add`` / ``replace``, see ``SAVE_ACTIONS``) are judged. - Returns 1.0 when no save calls were made (Layer 1 catches the - "should-have-saved-but-didn't" failure; Layer 2 only scores what - actually happened) and also when no judge/rubric is configured. 
+ Returns 1.0 when no save calls were made (Layer 1 catches the case where + ``memory`` was never invoked; note it does NOT backstop a ``memory`` call + with a non-save action like ``remove`` — that still scores a vacuous 1.0 + here) and also when no judge/rubric is configured. """ save_calls = [c for c in calls if c.get("action") in SAVE_ACTIONS] if not save_calls: + # Distinguish "no memory call" (expected, silent) from "memory was + # invoked but nothing matched SAVE_ACTIONS" (worth surfacing — a save + # we can't score, e.g. an action rename or malformed empty-args call). + if calls: + logger.info( + "judge_save_calls: %d memory call(s) but no save action " + "(actions=%s); returning vacuous 1.0", + len(calls), [c.get("action") for c in calls], + ) return 1.0 if judge is None or expected_content is None: return 1.0 @@ -123,9 +144,11 @@ def make_prompt_fitness_metric( """Build the GEPA-shaped 5-arg fitness metric for a prompt section. All prompt-section eval is behavioral (a real Hermes subprocess), so - every prediction must carry ``_closed_loop_task_id`` (set by the - dataset builder) and ``_candidate_text`` (set by ``PromptModule``). - Predictions missing the task id are degenerate — they score 0 with a + every prediction must carry ``_closed_loop_task_id`` and + ``_candidate_text`` — both attached by ``PromptModule.forward`` (the task + id flows in as the ``closed_loop_task_id`` input field built by + ``_behavioral_examples``). Predictions missing the task id are degenerate + — they score 0 with a diagnostic so the misconfiguration is visible in GEPA feedback rather than silently scoring well. @@ -190,8 +213,9 @@ def make_memoizing_splice_scorer( effectively serial; that's an accepted v1 cost of splice-and-restore. Backup/restore of the mutated source is the caller's responsibility — wrap - the whole GEPA run, not each call (the per-run guard mirrors - ``ClosedLoopValidator``'s splice-once-per-phase shape). + the whole GEPA run, not each call. 
This mirrors ``ClosedLoopValidator``, + which backs up once and restores once around both phases (it re-splices the + artifact on every task inside a phase, not once per phase). """ state: dict[str, Any] = {"installed": _UNSET} lock = lock if lock is not None else threading.Lock() diff --git a/evolution/validation/agent_runner.py b/evolution/validation/agent_runner.py index df9a4d7c..6d36227c 100644 --- a/evolution/validation/agent_runner.py +++ b/evolution/validation/agent_runner.py @@ -24,11 +24,11 @@ class AgentRunResult: ``tool_calls_with_args`` carries the same calls in order as ``{"name", "arguments"}`` dicts (arguments parsed from the LLM-emitted JSON). The compound-verdict Layer 2 judge needs the - argument payloads — e.g. the content of a ``memory(action='save')`` + argument payloads — e.g. the content of a ``memory(action='add')`` call — which ``tool_calls_seq`` discards. ``error`` is set when the runner itself failed to drive the agent - (subprocess timeout, no session JSON written, parse failure). It's + (subprocess timeout, no session written, parse failure). It's distinct from "agent invoked a tool that failed" — that's still a valid run, just one where the agent struggled. Tasks with ``error`` are counted as *abstentions* in the report, not as failures, so a diff --git a/evolution/validation/hermes_runner.py b/evolution/validation/hermes_runner.py index 27498461..e0f904ec 100644 --- a/evolution/validation/hermes_runner.py +++ b/evolution/validation/hermes_runner.py @@ -219,8 +219,14 @@ def parse_session_from_db( Modern hermes persists each session to SQLite. We read the most-recent session's messages and normalize them into the same message-dict shape the legacy JSON path produced, so the existing extractors work unchanged. The - ``messages.tool_calls`` column holds the OpenAI-nested - ``{"function": {"name", "arguments"}}`` list verbatim. 
+ ``messages.tool_calls`` column holds the tool-call list verbatim — current + hermes writes the flat ``{"name", "arguments"}`` shape; the extractors also + accept the older OpenAI-nested ``{"function": {...}}`` shape. + + A row whose ``tool_calls`` column won't parse as JSON aborts with an + ``error`` result (the task abstains) rather than being silently read as + "agent invoked no tools" — that would score a DB-format regression as a + behavioral failure and contaminate the fitness signal. """ try: conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True) @@ -268,8 +274,19 @@ def parse_session_from_db( if raw_calls: try: parsed_calls = json.loads(raw_calls) - except (json.JSONDecodeError, TypeError): - parsed_calls = None + except (json.JSONDecodeError, TypeError) as exc: + logger.warning( + "malformed tool_calls JSON in session %s at %s (%s); " + "abstaining rather than scoring the task as a no-op", + session["id"], db_path, exc, + ) + return AgentRunResult( + tool_calls_seq=[], + final_text_tail="", + duration_seconds=duration_seconds, + error=f"malformed tool_calls JSON in session DB at {db_path}: {exc}", + session_path=db_path, + ) messages.append({ "role": row["role"], "content": row["content"] or "", @@ -358,6 +375,11 @@ def _extract_tool_calls_with_args(messages: list[dict]) -> list[dict]: try: args = json.loads(args_raw) if args_raw else {} except (json.JSONDecodeError, TypeError): + logger.warning( + "malformed arguments for tool call %r (%r); using {} — a " + "content judge will see an empty-args call", + name, args_raw[:120], + ) args = {} if not isinstance(args, dict): args = {} diff --git a/evolution/validation/task.py b/evolution/validation/task.py index b7833fa0..3b6f4bbe 100644 --- a/evolution/validation/task.py +++ b/evolution/validation/task.py @@ -34,7 +34,7 @@ class Task: set, takes precedence over the tool-call rule. 
``expected_save_content`` is an optional rubric (not exact text) - describing what a good ``memory(action='save')`` would contain. It + describing what a good ``memory(action='add')`` would contain. It feeds the prompt-section compound verdict's Layer 2 content judge; it has no effect on the Layer 1 tool-call rule above. """ diff --git a/tests/prompts/test_evolve_prompt_section.py b/tests/prompts/test_evolve_prompt_section.py index 62e85ea9..50c0efc9 100644 --- a/tests/prompts/test_evolve_prompt_section.py +++ b/tests/prompts/test_evolve_prompt_section.py @@ -7,8 +7,15 @@ from click.testing import CliRunner +import fcntl + +import pytest + from evolution.prompts.evolve_prompt_section import ( + _BACKUP_SUFFIX, + _LOCK_FILENAME, _make_layer2_factory, + _prompt_builder_guard, _section_text_from_candidate, _split_train_holdout, evolve_prompt_section, @@ -114,6 +121,53 @@ def test_baseline_override_file_replaces_live_section(tmp_path): ).read_text() +class TestPromptBuilderGuard: + def test_restores_bytes_even_on_exception(self, tmp_path): + target = tmp_path / "pb.py" + target.write_text("ORIGINAL = 1\n") + original = target.read_bytes() + with pytest.raises(RuntimeError, match="boom"): + with _prompt_builder_guard(target): + target.write_text("MUTATED = 2\n") + raise RuntimeError("boom") + assert target.read_bytes() == original + assert not target.with_suffix(target.suffix + _BACKUP_SUFFIX).exists() + + def test_refuses_stale_backup(self, tmp_path): + target = tmp_path / "pb.py" + target.write_text("X = 1\n") + target.with_suffix(target.suffix + _BACKUP_SUFFIX).write_text("stale") + with pytest.raises(RuntimeError, match="[Ss]tale backup"): + with _prompt_builder_guard(target): + pass + + def test_refuses_when_another_run_holds_the_lock(self, tmp_path): + target = tmp_path / "pb.py" + target.write_text("X = 1\n") + other = open(target.parent / _LOCK_FILENAME, "w") + fcntl.flock(other.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) + try: + with pytest.raises(RuntimeError, 
match="holds"): + with _prompt_builder_guard(target): + pass + finally: + fcntl.flock(other.fileno(), fcntl.LOCK_UN) + other.close() + + +def test_rejects_single_task_suite(tmp_path): + repo = _fake_repo(tmp_path) + suite = tmp_path / "one.jsonl" + suite.write_text(json.dumps({ + "task_id": "only", "user_message": "x", "expected_tools": ["memory"], + }) + "\n") + with pytest.raises(ValueError, match="at least 2"): + evolve_prompt_section( + section_name="MEMORY_GUIDANCE", hermes_repo=repo, tasks_path=suite, + dry_run=True, output_dir=tmp_path / "out", + ) + + def test_cli_dry_run_exits_zero(tmp_path): repo = _fake_repo(tmp_path) suite = _suite(tmp_path) diff --git a/tests/validation/test_hermes_runner.py b/tests/validation/test_hermes_runner.py index 243d3dbf..3fb83f20 100644 --- a/tests/validation/test_hermes_runner.py +++ b/tests/validation/test_hermes_runner.py @@ -319,6 +319,44 @@ def test_picks_most_recent_session(self, tmp_path): result = parse_session_from_db(db, duration_seconds=1.0) assert result.tool_calls_seq == ["write_file"] + def test_malformed_tool_calls_column_abstains(self, tmp_path): + """A corrupt tool_calls column must abstain (error set), not read as + 'agent invoked no tools' (which would score a hard behavioral fail).""" + db = tmp_path / "state.db" + _make_state_db(db, session_id="s1", model="m", + messages=[{"role": "user", "content": "hi"}]) + conn = sqlite3.connect(db) + conn.execute( + "INSERT INTO messages (session_id, role, content, tool_calls) VALUES (?,?,?,?)", + ("s1", "assistant", "", "{not-valid-json"), + ) + conn.commit() + conn.close() + result = parse_session_from_db(db, duration_seconds=1.0) + assert result.error is not None + assert "malformed tool_calls" in result.error + assert result.tool_calls_seq == [] + + def test_corrupt_db_file_errors(self, tmp_path): + bad = tmp_path / "state.db" + bad.write_bytes(b"this is not a sqlite database at all") + result = parse_session_from_db(bad, duration_seconds=1.0) + assert 
result.error is not None + assert "could not" in result.error # open or read, depending on sqlite + + def test_missing_messages_table_errors(self, tmp_path): + db = tmp_path / "state.db" + conn = sqlite3.connect(db) + conn.executescript( + "CREATE TABLE sessions (id TEXT, model TEXT, started_at REAL);" + "INSERT INTO sessions VALUES ('s1', 'm', 1.0);" + ) + conn.commit() + conn.close() + result = parse_session_from_db(db, duration_seconds=1.0) + assert result.error is not None + assert "could not read" in result.error + class TestHermesAgentRunnerSubprocess: """The subprocess invocation layer: env + cwd + args plumbing.""" From 5658e7bb1df36ae4866b8780d2dd78aee5cd61db Mon Sep 17 00:00:00 2001 From: Justin Ramos Date: Tue, 2 Jun 2026 07:43:12 -0600 Subject: [PATCH 23/23] refactor(prompts): narrow PromptSource Protocol to read + write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit read/write are the only members the evolution driver exercises (the runtime override seam moved to HermesPromptSectionInstaller). name and list_sections had no production consumer, so they're no longer part of the shared contract — list_sections + SectionDescriptor remain as concrete conveniences on HermesPromptSource for a future --list-sections affordance. Every member of a Protocol is a cost on every future implementer; this keeps the contract to exactly what's shared. --- evolution/prompts/prompt_source.py | 21 ++++++++++++--------- tests/prompts/test_prompt_source.py | 13 ++----------- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/evolution/prompts/prompt_source.py b/evolution/prompts/prompt_source.py index 5bd5ca5b..0d056d92 100644 --- a/evolution/prompts/prompt_source.py +++ b/evolution/prompts/prompt_source.py @@ -1,9 +1,13 @@ -"""PromptSource Protocol — adapters that read, write, and enumerate named prompt sections. +"""PromptSource Protocol — adapters that read and write named prompt sections. 
Phase 3 integrates via in-place splice-and-restore (see ``HermesPromptSectionInstaller``), so the runtime override seam lives in -the installer, not here. A PromptSource only needs to read the baseline, -persist an evolved value, and enumerate what's targetable. +the installer, not here. The contract is deliberately just read + write: the +driver reads the baseline and persists/splices an evolved value, and nothing +more is shared across implementers. Enumeration (``list_sections`` → +``SectionDescriptor``) is a concrete convenience on ``HermesPromptSource`` for +a future ``--list-sections`` affordance, not part of the contract every +adapter must satisfy. """ from __future__ import annotations @@ -31,9 +35,12 @@ class SectionDescriptor: @runtime_checkable class PromptSource(Protocol): - """Adapter contract for prompt-section evolution targets.""" + """Adapter contract for prompt-section evolution targets: read + write. - name: str + Kept minimal on purpose — these are the only members the evolution driver + exercises. Concrete adapters may offer more (e.g. ``HermesPromptSource`` + also enumerates sections), but those are not part of the shared contract. + """ def read(self, section_name: str) -> str: """Return the canonical baseline text of the named section.""" @@ -47,7 +54,3 @@ def write(self, section_name: str, new_text: str) -> None: owns the backup/restore around the mutation). """ ... - - def list_sections(self) -> list[SectionDescriptor]: - """Enumerate all evolvable sections this source can target.""" - ... 
diff --git a/tests/prompts/test_prompt_source.py b/tests/prompts/test_prompt_source.py index 41703567..89e3d3ff 100644 --- a/tests/prompts/test_prompt_source.py +++ b/tests/prompts/test_prompt_source.py @@ -22,20 +22,16 @@ def test_section_descriptor_is_frozen(): def test_prompt_source_protocol_runtime_checkable(): - """A concrete class implementing the three methods satisfies isinstance().""" + """read + write are the whole contract — a class with just those satisfies + isinstance(), with no need to enumerate or carry a name.""" class StubSource: - name = "stub" - def read(self, section_name: str) -> str: return "stub" def write(self, section_name: str, new_text: str) -> None: return None - def list_sections(self) -> list[SectionDescriptor]: - return [] - assert isinstance(StubSource(), PromptSource) @@ -43,12 +39,7 @@ def test_prompt_source_protocol_rejects_incomplete(): """Missing a required method => not a PromptSource.""" class MissingWrite: - name = "incomplete" - def read(self, section_name: str) -> str: return "x" - def list_sections(self) -> list[SectionDescriptor]: - return [] - assert not isinstance(MissingWrite(), PromptSource)