From a7681c89941637f64613161f7276b64873d1335c Mon Sep 17 00:00:00 2001 From: "Matt (via Claude Code)" Date: Mon, 20 Apr 2026 01:19:56 -0500 Subject: [PATCH 1/4] refactor: split managed_agents.py (620 LOC) into a package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seven-submodule decomposition along SDK-resource-lifecycle seams: managed_agents/__init__.py barrel + full docstring with all the Step-0 smoke test SDK quirks managed_agents/_constants.py (35) beta headers + make_client + the $0.08/hr session rate managed_agents/environments.py (54) create / archive env managed_agents/skills.py (167) upload + 3-step archive dance + archive_skill_safe + name extractor managed_agents/agents.py (57) create / archive competitor agent managed_agents/sessions.py (124) create / archive session + send_user_message + event polling managed_agents/output.py (211) post-run trace introspection — written_files, bash-write parsing, token usage, runtime cost Every public name is re-exported from the package __init__ so 38 call sites keep their ``from skillforge.agents import managed_agents`` + ``managed_agents.upload_skill(...)`` usage unchanged. Tests against two private helpers (_extract_skill_name_from_md, _normalize_output_path) were accessing them on the module directly; those are re-exported through the barrel so test patches continue to resolve. QA: ruff + mypy + 411 pytest (unchanged) all green. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- skillforge/agents/managed_agents.py | 620 ------------------ skillforge/agents/managed_agents/__init__.py | 97 +++ .../agents/managed_agents/_constants.py | 35 + skillforge/agents/managed_agents/agents.py | 59 ++ .../agents/managed_agents/environments.py | 55 ++ skillforge/agents/managed_agents/output.py | 208 ++++++ skillforge/agents/managed_agents/sessions.py | 124 ++++ skillforge/agents/managed_agents/skills.py | 163 +++++ 8 files changed, 741 insertions(+), 620 deletions(-) delete mode 100644 skillforge/agents/managed_agents.py create mode 100644 skillforge/agents/managed_agents/__init__.py create mode 100644 skillforge/agents/managed_agents/_constants.py create mode 100644 skillforge/agents/managed_agents/agents.py create mode 100644 skillforge/agents/managed_agents/environments.py create mode 100644 skillforge/agents/managed_agents/output.py create mode 100644 skillforge/agents/managed_agents/sessions.py create mode 100644 skillforge/agents/managed_agents/skills.py diff --git a/skillforge/agents/managed_agents.py b/skillforge/agents/managed_agents.py deleted file mode 100644 index dd6430f..0000000 --- a/skillforge/agents/managed_agents.py +++ /dev/null @@ -1,620 +0,0 @@ -"""Thin typed wrapper around the Anthropic Managed Agents + Skills beta APIs. - -Hides SDK quirks discovered during the Step 0 smoke test -(``scripts/smoke_skill_upload.py``): - -- Skill uploads must place ``SKILL.md`` inside a top-level folder; a bare - ``SKILL.md`` filename returns 400. -- ``beta.skills.delete()`` does NOT auto-clean versions — the 3-step - dance is required (``versions.list`` → ``versions.delete`` per version - → ``skills.delete``). -- Anthropic ships built-in skills (xlsx/pptx/pdf/docx) with - ``source="anthropic"``. Cleanup must NEVER attempt to delete them — the - guard is enforced here. 
-- ``beta.sessions.events.stream()`` is unusable: the SDK routes it through - the Anthropic Messages API SSE decoder which only recognizes - ``message_start``/``content_block_delta``/etc. and silently filters out - every Managed Agents event type. We poll ``events.list(order="asc")`` - instead. -- Tool name in ``agent_toolset_20260401`` is ``write`` (not - ``write_file``). Input shape: ``{"file_path": str, "content": str}``. - Bash tool input is ``{"command": str}`` only. -- Token usage path: ``event.model_usage.input_tokens`` / - ``.output_tokens`` / ``.cache_creation_input_tokens`` / - ``.cache_read_input_tokens`` on ``span.model_request_end`` events. -- Session runtime cost = (``status_idle.processed_at`` - - ``status_running.processed_at``) hours × $0.08. - -This module is the ONLY place that imports beta resource paths from the -``anthropic`` SDK. ``competitor_managed.py`` and the engine consume only -the wrapper's typed return values. -""" - -from __future__ import annotations - -import asyncio -import contextlib -import re -import time -from collections.abc import AsyncIterator -from datetime import datetime -from typing import Any - -from anthropic import AsyncAnthropic - -from skillforge.config import ANTHROPIC_API_KEY - -# --------------------------------------------------------------------------- -# Beta header constants — pinned per PLAN-V1.2 architectural decision #6. -# Treat any version bump as a plan-edit event, not a silent dependency -# update. Update both the constant and the bump notes in the journal. -# --------------------------------------------------------------------------- - -MANAGED_AGENTS_BETA: str = "managed-agents-2026-04-01" -SKILLS_BETA: str = "skills-2025-10-02" - -# Built-in skill source — never delete. Confirmed via Step 0 inspection of -# the four pre-existing skills (xlsx/pptx/pdf/docx) on the org. -ANTHROPIC_SKILL_SOURCE = "anthropic" - -# $0.08 per session-hour metered while status == running. 
Mirrors the -# constant in skillforge.config; duplicated here so this module can be -# imported standalone without pulling the whole config tree. -SESSION_RUNTIME_USD_PER_HOUR = 0.08 - - -# --------------------------------------------------------------------------- -# Client construction -# --------------------------------------------------------------------------- - - -def make_client(timeout: float = 600.0) -> AsyncAnthropic: - """Construct an AsyncAnthropic client wired to skillforge config. - - The caller is responsible for closing the client (``await client.close()``) - or using it as an async context manager. - """ - return AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=timeout) - - -# --------------------------------------------------------------------------- -# Environment lifecycle -# --------------------------------------------------------------------------- - - -async def create_environment( - client: AsyncAnthropic, - *, - run_id: str, - packages: list[str] | None = None, -) -> str: - """Create a cloud environment with the given pip packages pre-installed. - - Returns the environment id. The caller stores it on the EvolutionRun - and reuses it across all competitor sessions in that run. - """ - pkg_list = packages if packages is not None else ["pytest", "ruff"] - resp = await client.beta.environments.create( - name=f"sf-run-{run_id[:12]}", - config={ - "type": "cloud", - "packages": { - "type": "packages", - "pip": pkg_list, - }, - }, - betas=[MANAGED_AGENTS_BETA], - ) - return resp.id - - -async def archive_environment(client: AsyncAnthropic, environment_id: str) -> None: - """Best-effort environment teardown. Logs and swallows errors. - - Cleanup must never block. The ``leaked_environments`` counterpart - would go here if we needed bookkeeping; for now we accept the - leak — environments are cheap and Anthropic GCs them. 
- """ - with contextlib.suppress(Exception): - await client.beta.environments.archive( - environment_id, - betas=[MANAGED_AGENTS_BETA], - ) - - -# --------------------------------------------------------------------------- -# Skill upload + 3-step delete dance -# --------------------------------------------------------------------------- - - -async def upload_skill( - client: AsyncAnthropic, - *, - name: str, - skill_md: str, -) -> str: - """Upload a SKILL.md as a versioned org-level custom skill. - - Two empirical constraints from Step 0: - - 1. The file must live inside a top-level folder — passing a bare - ``SKILL.md`` filename returns ``400 SKILL.md file must be exactly - in the top-level folder.`` - 2. **The folder name must MATCH the ``name:`` field in the SKILL.md - frontmatter** — surfaced during the live end-to-end smoke. The - ``name`` argument to this function is therefore IGNORED for the - folder/upload — we always extract the actual frontmatter name and - use that. The ``name`` arg is still used as the ``display_title`` - (which can be anything human-readable). - - The Anthropic Skills API hard-requires the payload to start literally - with ``---``. A UTF-8 BOM or stray leading whitespace — which neither - our structural validator nor JSON round-tripping strips — is enough - to earn a ``400 SKILL.md must start with YAML frontmatter (---)``. - We normalize here so the ~1% of model outputs with a leading BOM or - whitespace still upload cleanly instead of falling back to inline. - - Returns the new ``skill_id``. The caller is responsible for archiving it - via :func:`archive_skill` after the session completes. - """ - # Strip leading BOM + whitespace the API is strict about; don't touch - # the rest of the body so checksum/fitness stays stable. 
- normalized = skill_md.lstrip("\ufeff \t\r\n") - if not normalized.startswith("---"): - raise ValueError( - "upload_skill: skill_md does not start with YAML frontmatter (---) " - "after stripping BOM/whitespace — refusing to call the API" - ) - folder = _extract_skill_name_from_md(normalized) or name - resp = await client.beta.skills.create( - display_title=name, - files=[ - ( - f"{folder}/SKILL.md", - normalized.encode("utf-8"), - "text/markdown", - ) - ], - betas=[SKILLS_BETA], - ) - return resp.id - - -_SKILL_NAME_RE = re.compile(r"^name:\s*(?P<name>[^\s\n]+)\s*$", re.MULTILINE) - - -def _extract_skill_name_from_md(skill_md: str) -> str | None: - """Pull the ``name`` field out of a SKILL.md's YAML frontmatter. - - Robust to variations in YAML formatting — uses a simple regex against - the raw text instead of parsing YAML, because the API's matching is - string-literal so we want exactly what's in the file. Returns None - if no name field is found. - """ - if not skill_md.startswith("---"): - return None - try: - _, fm_block, _ = skill_md.split("---", 2) - except ValueError: - return None - match = _SKILL_NAME_RE.search(fm_block) - if not match: - return None - return match.group("name").strip() - - -async def archive_skill(client: AsyncAnthropic, skill_id: str) -> None: - """Tear down a custom skill via the 3-step delete dance. - - Steps: - 1. ``versions.list(skill_id)`` — paginator over version objects - 2. ``versions.delete(version=ver_str, skill_id=skill_id)`` for each - 3. ``skills.delete(skill_id)`` - - **Anthropic built-in skills are protected**: we never list or delete - a skill we did not upload. The caller is responsible for passing - only ``skill_id``s that came from :func:`upload_skill`. As a - belt-and-suspenders, we re-fetch the skill via ``retrieve`` and - refuse to proceed if its ``source`` is ``anthropic``. - - Best-effort: any error in the dance is raised so the caller can log - a leak in the ``leaked_skills`` table. 
Use :func:`archive_skill_safe` - if you want a swallow-and-log variant. - """ - # Built-in guard - try: - existing = await client.beta.skills.retrieve(skill_id, betas=[SKILLS_BETA]) - source = getattr(existing, "source", None) - if source == ANTHROPIC_SKILL_SOURCE: - raise PermissionError( - f"refusing to archive Anthropic built-in skill {skill_id} " - f"(source={source!r})" - ) - except PermissionError: - raise - except Exception: # noqa: BLE001 - # If retrieve fails (skill already gone? auth issue?), proceed — - # the delete dance will surface a clearer error if there's a - # real problem. - pass - - # Step 1+2: enumerate and delete versions - versions_page = await client.beta.skills.versions.list( - skill_id, betas=[SKILLS_BETA] - ) - async for version in versions_page: - ver = getattr(version, "version", None) - if ver is None and hasattr(version, "model_dump"): - ver = version.model_dump().get("version") - if ver is None: - continue - await client.beta.skills.versions.delete( - version=str(ver), - skill_id=skill_id, - betas=[SKILLS_BETA], - ) - - # Step 3: delete the skill itself - await client.beta.skills.delete(skill_id, betas=[SKILLS_BETA]) - - -async def archive_skill_safe( - client: AsyncAnthropic, - skill_id: str, -) -> tuple[bool, str | None]: - """Swallow-and-log variant. Returns ``(success, error_message)``.""" - try: - await archive_skill(client, skill_id) - return True, None - except Exception as exc: # noqa: BLE001 - return False, f"{exc.__class__.__name__}: {str(exc)[:300]}" - - -# --------------------------------------------------------------------------- -# Agent lifecycle -# --------------------------------------------------------------------------- - - -async def create_competitor_agent( - client: AsyncAnthropic, - *, - name: str, - model: str, - system_prompt: str, - skill_id: str | None = None, -) -> str: - """Create a Managed Agent for one competitor run. 
- - The agent is configured with the standard ``agent_toolset_20260401`` - (bash/edit/read/write/glob/grep/web_fetch/web_search) and an optional - custom skill linked via the ``skills`` field. - - The Advisor Strategy (``advisor_20260301``) is intentionally NOT - wired here — Step 0 confirmed it's not yet supported in the SDK or - on our beta access. When it lands, add a second tool entry behind a - ``COMPETITOR_ADVISOR`` flag. - """ - kwargs: dict[str, Any] = { - "name": name, - "model": model, - "system": system_prompt, - "tools": [{"type": "agent_toolset_20260401"}], - "betas": [MANAGED_AGENTS_BETA], - } - if skill_id is not None: - # BetaManagedAgentsCustomSkillParams shape: - # {"skill_id": str, "type": "custom", "version": Optional[str]} - # Empirical errors during the e2e smoke caught two prior shape - # mistakes: type="skill" (must be "custom"), id=... (must be - # skill_id=...). Both surfaced as 400 invalid_request_error. - kwargs["skills"] = [{"skill_id": skill_id, "type": "custom"}] - resp = await client.beta.agents.create(**kwargs) - return resp.id - - -async def archive_agent(client: AsyncAnthropic, agent_id: str) -> None: - """Best-effort agent teardown.""" - with contextlib.suppress(Exception): - await client.beta.agents.archive(agent_id, betas=[MANAGED_AGENTS_BETA]) - - -# --------------------------------------------------------------------------- -# Session lifecycle -# --------------------------------------------------------------------------- - - -async def create_session( - client: AsyncAnthropic, - *, - agent_id: str, - environment_id: str, - title: str | None = None, -) -> str: - """Create a session and return its id.""" - kwargs: dict[str, Any] = { - "agent": agent_id, - "environment_id": environment_id, - "betas": [MANAGED_AGENTS_BETA], - } - if title is not None: - kwargs["title"] = title - resp = await client.beta.sessions.create(**kwargs) - return resp.id - - -async def archive_session(client: AsyncAnthropic, session_id: str) -> None: - 
"""Best-effort session teardown.""" - with contextlib.suppress(Exception): - await client.beta.sessions.archive( - session_id, betas=[MANAGED_AGENTS_BETA] - ) - - -async def send_user_message( - client: AsyncAnthropic, - session_id: str, - text: str, -) -> None: - """Send a single ``user.message`` event into a session.""" - await client.beta.sessions.events.send( - session_id, - events=[ - { - "type": "user.message", - "content": [{"type": "text", "text": text}], - } - ], - betas=[MANAGED_AGENTS_BETA], - ) - - -# --------------------------------------------------------------------------- -# Event polling — replaces the broken events.stream() -# --------------------------------------------------------------------------- - - -async def iter_session_events( - client: AsyncAnthropic, - session_id: str, - *, - deadline_seconds: float = 300.0, - poll_interval: float = 2.0, - page_limit: int = 100, -) -> AsyncIterator[dict]: - """Yield session events as plain dicts until ``session.status_idle`` arrives. - - Polls ``beta.sessions.events.list(order="asc")`` every ``poll_interval`` - seconds. Yields each new event exactly once (deduped by ``id``). - Stops on the first ``session.status_idle`` event OR when - ``deadline_seconds`` elapses. - - Why polling instead of ``events.stream()``: the SDK's stream wrapper - routes through the Anthropic Messages API SSE decoder, which only - recognizes Messages event names and silently filters out every - Managed Agents event type. ``events.list()`` returns structured - ``BetaManagedAgentsSessionEvent`` objects directly. See PLAN-V1.2 - §"Step 0 empirical findings" for the full investigation. 
- """ - deadline = time.monotonic() + deadline_seconds - seen_ids: set[str] = set() - idle_seen = False - - while time.monotonic() < deadline and not idle_seen: - page = await client.beta.sessions.events.list( - session_id, - limit=page_limit, - order="asc", - betas=[MANAGED_AGENTS_BETA], - ) - async for ev in page: - ev_id = getattr(ev, "id", None) - if ev_id is None or ev_id in seen_ids: - continue - seen_ids.add(ev_id) - d = ev.model_dump() if hasattr(ev, "model_dump") else dict(ev) - yield d - if d.get("type") == "session.status_idle": - idle_seen = True - break - - if idle_seen: - return - await asyncio.sleep(poll_interval) - - -# --------------------------------------------------------------------------- -# Event parsing helpers -# --------------------------------------------------------------------------- - - -def extract_written_files(events: list[dict]) -> dict[str, str]: - """Reconstruct ``output_files`` from a session's event stream. - - Strategy: - 1. Walk all ``agent.tool_use`` events with ``name == "write"``. - The ``input.file_path`` and ``input.content`` keys are present - and complete (verified in Step 0). - 2. Walk all ``agent.tool_use`` events with ``name == "bash"``. - Parse the ``input.command`` for common file-write idioms: - heredoc redirects (``cat > path << EOF ... EOF``), simple - redirects (``echo "..." > path``), ``tee path <<<``, - ``printf "..." > path``. Best-effort — bash output is opaque - and the command may use shell expansion that we can't safely - eval. - - All paths are normalized to RELATIVE form: leading slashes are - stripped (the agent typically writes to absolute paths inside its - cloud sandbox, but L1's deterministic runner consumes relative - paths under a temp dir). The smoke test caught this — writing to - ``/output/solution.py`` in the cloud became ``Path('/') / '/output'`` - on the local FS and crashed L1's mkdir with a read-only filesystem - error. 
- - Later writes to the same path overwrite earlier ones (last-write-wins). - Files written via the ``edit`` tool are NOT captured here — that - tool produces a patch event, not a content event. v1.3 follow-up. - """ - out: dict[str, str] = {} - - for ev in events: - if ev.get("type") != "agent.tool_use": - continue - name = ev.get("name", "") - inp = ev.get("input") or {} - if not isinstance(inp, dict): - continue - - if name == "write": - path = inp.get("file_path") - content = inp.get("content") - if isinstance(path, str) and isinstance(content, str): - out[_normalize_output_path(path)] = content - - elif name == "bash": - cmd = inp.get("command") - if not isinstance(cmd, str): - continue - for path, content in _parse_bash_writes(cmd): - out[_normalize_output_path(path)] = content - - return out - - -def _normalize_output_path(path: str) -> str: - """Strip leading slashes so the path is relative for L1 consumption. - - Also collapses ``./`` prefixes and any leading whitespace. The result - is always safe to pass to ``Path(tmp_dir) / normalized_path`` without - accidentally jumping out of the temp dir via an absolute path or a - parent traversal. - """ - p = path.strip().lstrip("/") - while p.startswith("./"): - p = p[2:] - return p - - -_HEREDOC_RE = re.compile( - # cat redirects stdout to a file (`cat > path`); tee takes the path as a - # positional arg (`tee path`). Make the `>` optional so both work. - r"(?:cat|tee)\s*(?:-[a-z]+\s*)*(?:>\s*)?(?P<path>['\"]?\S+['\"]?)\s*" - r"<<\s*['\"]?(?P<delim>\w+)['\"]?\n(?P<body>.*?)\n(?P=delim)\s*$", - re.DOTALL | re.MULTILINE, -) -_SIMPLE_REDIRECT_RE = re.compile( - r"echo\s+(?P<content>['\"][^'\"]*['\"]|\S+)\s*>\s*(?P<path>['\"]?\S+['\"]?)" -) - - -def _parse_bash_writes(command: str) -> list[tuple[str, str]]: - """Best-effort parser for shell file-write idioms in a bash command string. - - Recognizes: - - ``cat > path << EOF ... EOF`` and ``cat > path << 'EOF' ... EOF`` - - ``tee path << EOF ... 
EOF`` - - ``echo "content" > path`` - - Returns a list of ``(path, content)`` tuples. Strips quoting from - paths. Returns an empty list if nothing recognizable matches. - """ - results: list[tuple[str, str]] = [] - - for match in _HEREDOC_RE.finditer(command): - path = match.group("path").strip().strip("'\"") - body = match.group("body") - results.append((path, body)) - - for match in _SIMPLE_REDIRECT_RE.finditer(command): - path = match.group("path").strip().strip("'\"") - content = match.group("content").strip().strip("'\"") - results.append((path, content)) - - return results - - -def compute_token_usage(events: list[dict]) -> dict[str, int]: - """Sum token usage across all ``span.model_request_end`` events. - - Returns a dict with ``input``, ``output``, ``cache_creation_input``, - ``cache_read_input``, and ``n_requests`` keys. Missing fields default - to 0. Field paths verified in Step 0: - ``event.model_usage.{input_tokens, output_tokens, - cache_creation_input_tokens, cache_read_input_tokens}``. - """ - totals = { - "input": 0, - "output": 0, - "cache_creation_input": 0, - "cache_read_input": 0, - "n_requests": 0, - } - for ev in events: - if ev.get("type") != "span.model_request_end": - continue - usage = ev.get("model_usage") or {} - if not isinstance(usage, dict): - continue - totals["input"] += int(usage.get("input_tokens") or 0) - totals["output"] += int(usage.get("output_tokens") or 0) - totals["cache_creation_input"] += int(usage.get("cache_creation_input_tokens") or 0) - totals["cache_read_input"] += int(usage.get("cache_read_input_tokens") or 0) - totals["n_requests"] += 1 - return totals - - -def compute_session_runtime_hours(events: list[dict]) -> float: - """Return ``(idle_time - running_time)`` in hours, or 0.0 if either is missing. - - Used to compute the session-runtime line item in - ``CompetitionResult.cost_breakdown`` — multiply the result by - :data:`SESSION_RUNTIME_USD_PER_HOUR` (``$0.08``) for USD. 
- """ - running_at: datetime | None = None - idle_at: datetime | None = None - - for ev in events: - etype = ev.get("type") - ts_raw = ev.get("processed_at") - if ts_raw is None: - continue - try: - if isinstance(ts_raw, datetime): - ts = ts_raw - else: - ts = datetime.fromisoformat(str(ts_raw).replace("Z", "+00:00")) - except (ValueError, TypeError): - continue - if etype == "session.status_running" and running_at is None: - running_at = ts - elif etype == "session.status_idle": - idle_at = ts - - if running_at is None or idle_at is None: - return 0.0 - delta = (idle_at - running_at).total_seconds() - if delta < 0: - return 0.0 - return delta / 3600.0 - - -def session_was_skill_loaded(events: list[dict], skill_id: str | None) -> bool: - """Return True if any event indicates the agent loaded the custom skill. - - For now, this is a heuristic: if the session was created with a - ``skill_id`` AND the agent emitted at least one tool_use after - ``session.status_running``, we consider the skill "loaded" (the - agent had access and chose to use tools). Refine in v1.3 once - Anthropic exposes a ``skill_load`` or equivalent event. - - Returns False if ``skill_id`` is None (no skill was attached). - """ - if skill_id is None: - return False - seen_running = False - for ev in events: - etype = ev.get("type") - if etype == "session.status_running": - seen_running = True - elif seen_running and etype == "agent.tool_use": - return True - return False diff --git a/skillforge/agents/managed_agents/__init__.py b/skillforge/agents/managed_agents/__init__.py new file mode 100644 index 0000000..ab42d30 --- /dev/null +++ b/skillforge/agents/managed_agents/__init__.py @@ -0,0 +1,97 @@ +"""Thin typed wrapper around the Anthropic Managed Agents + Skills beta APIs. 
+ +Hides SDK quirks discovered during the Step 0 smoke test +(``scripts/smoke_skill_upload.py``): + +- Skill uploads must place ``SKILL.md`` inside a top-level folder that + matches the frontmatter ``name:`` field; a bare filename returns 400. +- ``beta.skills.delete()`` does NOT auto-clean versions — the 3-step + dance is required (``versions.list`` → ``versions.delete`` per + version → ``skills.delete``). +- Anthropic ships built-in skills (xlsx/pptx/pdf/docx) with + ``source="anthropic"``. Cleanup must NEVER attempt to delete them — + the guard is enforced in ``skills.archive_skill``. +- ``beta.sessions.events.stream()`` is unusable: the SDK routes it + through the Messages API SSE decoder which silently filters out every + Managed Agents event type. ``sessions.iter_session_events`` polls + ``events.list(order="asc")`` instead. +- Tool name in ``agent_toolset_20260401`` is ``write`` (not + ``write_file``). Input shape: ``{"file_path": str, "content": str}``. + Bash tool input is ``{"command": str}``. +- Token usage path: ``event.model_usage.input_tokens`` / + ``.output_tokens`` / ``.cache_creation_input_tokens`` / + ``.cache_read_input_tokens`` on ``span.model_request_end`` events. +- Session runtime cost = (``status_idle.processed_at`` - + ``status_running.processed_at``) hours × $0.08. + +This package is the ONLY place that imports beta resource paths from +the ``anthropic`` SDK. ``competitor_managed`` and the engine consume +only the wrapper's typed return values. + +Public surface is re-exported here so import sites keep reading +``from skillforge.agents import managed_agents`` and calling +``managed_agents.upload_skill(...)`` etc. 
+""" + +from __future__ import annotations + +from skillforge.agents.managed_agents._constants import ( + ANTHROPIC_SKILL_SOURCE, + MANAGED_AGENTS_BETA, + SESSION_RUNTIME_USD_PER_HOUR, + SKILLS_BETA, + make_client, +) +from skillforge.agents.managed_agents.agents import archive_agent, create_competitor_agent +from skillforge.agents.managed_agents.environments import ( + archive_environment, + create_environment, +) +from skillforge.agents.managed_agents.output import ( + _normalize_output_path, + compute_session_runtime_hours, + compute_token_usage, + extract_written_files, + session_was_skill_loaded, +) +from skillforge.agents.managed_agents.sessions import ( + archive_session, + create_session, + iter_session_events, + send_user_message, +) +from skillforge.agents.managed_agents.skills import ( + _extract_skill_name_from_md, + archive_skill, + archive_skill_safe, + upload_skill, +) + +__all__ = [ + # Constants + client + "ANTHROPIC_SKILL_SOURCE", + "MANAGED_AGENTS_BETA", + "SESSION_RUNTIME_USD_PER_HOUR", + "SKILLS_BETA", + "make_client", + # Environments + "create_environment", + "archive_environment", + # Skills + "upload_skill", + "archive_skill", + "archive_skill_safe", + # Agents + "create_competitor_agent", + "archive_agent", + # Sessions + "create_session", + "archive_session", + "send_user_message", + "iter_session_events", + # Output introspection + "extract_written_files", + "compute_token_usage", + "compute_session_runtime_hours", + "session_was_skill_loaded", +] diff --git a/skillforge/agents/managed_agents/_constants.py b/skillforge/agents/managed_agents/_constants.py new file mode 100644 index 0000000..23c0b61 --- /dev/null +++ b/skillforge/agents/managed_agents/_constants.py @@ -0,0 +1,35 @@ +"""Pinned beta headers, static constants, and the shared client factory. + +The constants are called out as a plan-edit event — any version bump +to ``MANAGED_AGENTS_BETA`` / ``SKILLS_BETA`` should land with a journal +entry explaining the upgrade. 
+""" + +from __future__ import annotations + +from anthropic import AsyncAnthropic + +from skillforge.config import ANTHROPIC_API_KEY + +# Pinned per PLAN-V1.2 architectural decision #6. Treat any version +# bump as a plan-edit event, not a silent dependency update. +MANAGED_AGENTS_BETA: str = "managed-agents-2026-04-01" +SKILLS_BETA: str = "skills-2025-10-02" + +# Built-in skill source — never delete. Confirmed via Step 0 inspection +# of the four pre-existing Anthropic skills (xlsx/pptx/pdf/docx) on the org. +ANTHROPIC_SKILL_SOURCE = "anthropic" + +# $0.08 per session-hour metered while status == running. Mirrors the +# constant in skillforge.config; duplicated here so this module can be +# imported standalone without pulling the whole config tree. +SESSION_RUNTIME_USD_PER_HOUR = 0.08 + + +def make_client(timeout: float = 600.0) -> AsyncAnthropic: + """Construct an AsyncAnthropic client wired to skillforge config. + + The caller is responsible for closing the client (``await client.close()``) + or using it as an async context manager. 
+ """ + return AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=timeout) diff --git a/skillforge/agents/managed_agents/agents.py b/skillforge/agents/managed_agents/agents.py new file mode 100644 index 0000000..7babc47 --- /dev/null +++ b/skillforge/agents/managed_agents/agents.py @@ -0,0 +1,59 @@ +"""Competitor agent lifecycle — create / archive beta agents.""" + +from __future__ import annotations + +import contextlib +from typing import Any + +from anthropic import AsyncAnthropic + +from skillforge.agents.managed_agents._constants import MANAGED_AGENTS_BETA + +# --------------------------------------------------------------------------- +# Agent lifecycle +# --------------------------------------------------------------------------- + + +async def create_competitor_agent( + client: AsyncAnthropic, + *, + name: str, + model: str, + system_prompt: str, + skill_id: str | None = None, +) -> str: + """Create a Managed Agent for one competitor run. + + The agent is configured with the standard ``agent_toolset_20260401`` + (bash/edit/read/write/glob/grep/web_fetch/web_search) and an optional + custom skill linked via the ``skills`` field. + + The Advisor Strategy (``advisor_20260301``) is intentionally NOT + wired here — Step 0 confirmed it's not yet supported in the SDK or + on our beta access. When it lands, add a second tool entry behind a + ``COMPETITOR_ADVISOR`` flag. + """ + kwargs: dict[str, Any] = { + "name": name, + "model": model, + "system": system_prompt, + "tools": [{"type": "agent_toolset_20260401"}], + "betas": [MANAGED_AGENTS_BETA], + } + if skill_id is not None: + # BetaManagedAgentsCustomSkillParams shape: + # {"skill_id": str, "type": "custom", "version": Optional[str]} + # Empirical errors during the e2e smoke caught two prior shape + # mistakes: type="skill" (must be "custom"), id=... (must be + # skill_id=...). Both surfaced as 400 invalid_request_error. 
+ kwargs["skills"] = [{"skill_id": skill_id, "type": "custom"}] + resp = await client.beta.agents.create(**kwargs) + return resp.id + + +async def archive_agent(client: AsyncAnthropic, agent_id: str) -> None: + """Best-effort agent teardown.""" + with contextlib.suppress(Exception): + await client.beta.agents.archive(agent_id, betas=[MANAGED_AGENTS_BETA]) + + diff --git a/skillforge/agents/managed_agents/environments.py b/skillforge/agents/managed_agents/environments.py new file mode 100644 index 0000000..19b7671 --- /dev/null +++ b/skillforge/agents/managed_agents/environments.py @@ -0,0 +1,55 @@ +"""Environment lifecycle — create / archive per-run Managed Agents environments.""" + +from __future__ import annotations + +import contextlib + +from anthropic import AsyncAnthropic + +from skillforge.agents.managed_agents._constants import MANAGED_AGENTS_BETA + +# --------------------------------------------------------------------------- +# Environment lifecycle +# --------------------------------------------------------------------------- + + +async def create_environment( + client: AsyncAnthropic, + *, + run_id: str, + packages: list[str] | None = None, +) -> str: + """Create a cloud environment with the given pip packages pre-installed. + + Returns the environment id. The caller stores it on the EvolutionRun + and reuses it across all competitor sessions in that run. + """ + pkg_list = packages if packages is not None else ["pytest", "ruff"] + resp = await client.beta.environments.create( + name=f"sf-run-{run_id[:12]}", + config={ + "type": "cloud", + "packages": { + "type": "packages", + "pip": pkg_list, + }, + }, + betas=[MANAGED_AGENTS_BETA], + ) + return resp.id + + +async def archive_environment(client: AsyncAnthropic, environment_id: str) -> None: + """Best-effort environment teardown. Logs and swallows errors. + + Cleanup must never block. 
The ``leaked_environments`` counterpart + would go here if we needed bookkeeping; for now we accept the + leak — environments are cheap and Anthropic GCs them. + """ + with contextlib.suppress(Exception): + await client.beta.environments.archive( + environment_id, + betas=[MANAGED_AGENTS_BETA], + ) + + diff --git a/skillforge/agents/managed_agents/output.py b/skillforge/agents/managed_agents/output.py new file mode 100644 index 0000000..4157cc5 --- /dev/null +++ b/skillforge/agents/managed_agents/output.py @@ -0,0 +1,208 @@ +"""Post-run event-stream introspection. + +Pulls the written-file map out of trace events, parses bash +``cat <<'EOF' > path`` writes, and computes token usage + session +runtime cost. All pure functions — no network, no mutation. +""" + +from __future__ import annotations + +import re +from datetime import datetime + +# --------------------------------------------------------------------------- +# Event parsing helpers +# --------------------------------------------------------------------------- + + +def extract_written_files(events: list[dict]) -> dict[str, str]: + """Reconstruct ``output_files`` from a session's event stream. + + Strategy: + 1. Walk all ``agent.tool_use`` events with ``name == "write"``. + The ``input.file_path`` and ``input.content`` keys are present + and complete (verified in Step 0). + 2. Walk all ``agent.tool_use`` events with ``name == "bash"``. + Parse the ``input.command`` for common file-write idioms: + heredoc redirects (``cat > path << EOF ... EOF``), simple + redirects (``echo "..." > path``), ``tee path <<<``, + ``printf "..." > path``. Best-effort — bash output is opaque + and the command may use shell expansion that we can't safely + eval. + + All paths are normalized to RELATIVE form: leading slashes are + stripped (the agent typically writes to absolute paths inside its + cloud sandbox, but L1's deterministic runner consumes relative + paths under a temp dir). 
The smoke test caught this — writing to + ``/output/solution.py`` in the cloud became ``Path('/') / '/output'`` + on the local FS and crashed L1's mkdir with a read-only filesystem + error. + + Later writes to the same path overwrite earlier ones (last-write-wins). + Files written via the ``edit`` tool are NOT captured here — that + tool produces a patch event, not a content event. v1.3 follow-up. + """ + out: dict[str, str] = {} + + for ev in events: + if ev.get("type") != "agent.tool_use": + continue + name = ev.get("name", "") + inp = ev.get("input") or {} + if not isinstance(inp, dict): + continue + + if name == "write": + path = inp.get("file_path") + content = inp.get("content") + if isinstance(path, str) and isinstance(content, str): + out[_normalize_output_path(path)] = content + + elif name == "bash": + cmd = inp.get("command") + if not isinstance(cmd, str): + continue + for path, content in _parse_bash_writes(cmd): + out[_normalize_output_path(path)] = content + + return out + + +def _normalize_output_path(path: str) -> str: + """Strip leading slashes so the path is relative for L1 consumption. + + Also collapses ``./`` prefixes and any leading whitespace. The result + is always safe to pass to ``Path(tmp_dir) / normalized_path`` without + accidentally jumping out of the temp dir via an absolute path or a + parent traversal. + """ + p = path.strip().lstrip("/") + while p.startswith("./"): + p = p[2:] + return p + + +_HEREDOC_RE = re.compile( + # cat redirects stdout to a file (`cat > path`); tee takes the path as a + # positional arg (`tee path`). Make the `>` optional so both work. 
+    r"(?:cat|tee)\s*(?:-[a-z]+\s*)*(?:>\s*)?(?P<path>['\"]?\S+['\"]?)\s*"
+    r"<<\s*['\"]?(?P<delim>\w+)['\"]?\n(?P<body>.*?)\n(?P=delim)\s*$",
+    re.DOTALL | re.MULTILINE,
+)
+_SIMPLE_REDIRECT_RE = re.compile(
+    r"echo\s+(?P<content>['\"][^'\"]*['\"]|\S+)\s*>\s*(?P<path>['\"]?\S+['\"]?)"
+)
+
+
+def _parse_bash_writes(command: str) -> list[tuple[str, str]]:
+    """Best-effort parser for shell file-write idioms in a bash command string.
+
+    Recognizes:
+    - ``cat > path << EOF ... EOF`` and ``cat > path << 'EOF' ... EOF``
+    - ``tee path << EOF ... EOF``
+    - ``echo "content" > path``
+
+    Returns a list of ``(path, content)`` tuples. Strips quoting from
+    paths. Returns an empty list if nothing recognizable matches.
+    """
+    results: list[tuple[str, str]] = []
+
+    for match in _HEREDOC_RE.finditer(command):
+        path = match.group("path").strip().strip("'\"")
+        body = match.group("body")
+        results.append((path, body))
+
+    for match in _SIMPLE_REDIRECT_RE.finditer(command):
+        path = match.group("path").strip().strip("'\"")
+        content = match.group("content").strip().strip("'\"")
+        results.append((path, content))
+
+    return results
+
+
+def compute_token_usage(events: list[dict]) -> dict[str, int]:
+    """Sum token usage across all ``span.model_request_end`` events.
+
+    Returns a dict with ``input``, ``output``, ``cache_creation_input``,
+    ``cache_read_input``, and ``n_requests`` keys. Missing fields default
+    to 0. Field paths verified in Step 0:
+    ``event.model_usage.{input_tokens, output_tokens,
+    cache_creation_input_tokens, cache_read_input_tokens}``.
+ """ + totals = { + "input": 0, + "output": 0, + "cache_creation_input": 0, + "cache_read_input": 0, + "n_requests": 0, + } + for ev in events: + if ev.get("type") != "span.model_request_end": + continue + usage = ev.get("model_usage") or {} + if not isinstance(usage, dict): + continue + totals["input"] += int(usage.get("input_tokens") or 0) + totals["output"] += int(usage.get("output_tokens") or 0) + totals["cache_creation_input"] += int(usage.get("cache_creation_input_tokens") or 0) + totals["cache_read_input"] += int(usage.get("cache_read_input_tokens") or 0) + totals["n_requests"] += 1 + return totals + + +def compute_session_runtime_hours(events: list[dict]) -> float: + """Return ``(idle_time - running_time)`` in hours, or 0.0 if either is missing. + + Used to compute the session-runtime line item in + ``CompetitionResult.cost_breakdown`` — multiply the result by + :data:`SESSION_RUNTIME_USD_PER_HOUR` (``$0.08``) for USD. + """ + running_at: datetime | None = None + idle_at: datetime | None = None + + for ev in events: + etype = ev.get("type") + ts_raw = ev.get("processed_at") + if ts_raw is None: + continue + try: + if isinstance(ts_raw, datetime): + ts = ts_raw + else: + ts = datetime.fromisoformat(str(ts_raw).replace("Z", "+00:00")) + except (ValueError, TypeError): + continue + if etype == "session.status_running" and running_at is None: + running_at = ts + elif etype == "session.status_idle": + idle_at = ts + + if running_at is None or idle_at is None: + return 0.0 + delta = (idle_at - running_at).total_seconds() + if delta < 0: + return 0.0 + return delta / 3600.0 + + +def session_was_skill_loaded(events: list[dict], skill_id: str | None) -> bool: + """Return True if any event indicates the agent loaded the custom skill. 
+ + For now, this is a heuristic: if the session was created with a + ``skill_id`` AND the agent emitted at least one tool_use after + ``session.status_running``, we consider the skill "loaded" (the + agent had access and chose to use tools). Refine in v1.3 once + Anthropic exposes a ``skill_load`` or equivalent event. + + Returns False if ``skill_id`` is None (no skill was attached). + """ + if skill_id is None: + return False + seen_running = False + for ev in events: + etype = ev.get("type") + if etype == "session.status_running": + seen_running = True + elif seen_running and etype == "agent.tool_use": + return True + return False diff --git a/skillforge/agents/managed_agents/sessions.py b/skillforge/agents/managed_agents/sessions.py new file mode 100644 index 0000000..682c8ae --- /dev/null +++ b/skillforge/agents/managed_agents/sessions.py @@ -0,0 +1,124 @@ +"""Session lifecycle + event iteration + user-message dispatch. + +Poll-based iteration intentionally avoids ``events.stream()`` — the SDK +routes that through the Messages API SSE decoder which silently filters +out every Managed Agents event type. See the package docstring for the +smoke-test findings. 
+""" + +from __future__ import annotations + +import asyncio +import contextlib +import time +from collections.abc import AsyncIterator +from typing import Any + +from anthropic import AsyncAnthropic + +from skillforge.agents.managed_agents._constants import MANAGED_AGENTS_BETA + +# --------------------------------------------------------------------------- +# Session lifecycle +# --------------------------------------------------------------------------- + + +async def create_session( + client: AsyncAnthropic, + *, + agent_id: str, + environment_id: str, + title: str | None = None, +) -> str: + """Create a session and return its id.""" + kwargs: dict[str, Any] = { + "agent": agent_id, + "environment_id": environment_id, + "betas": [MANAGED_AGENTS_BETA], + } + if title is not None: + kwargs["title"] = title + resp = await client.beta.sessions.create(**kwargs) + return resp.id + + +async def archive_session(client: AsyncAnthropic, session_id: str) -> None: + """Best-effort session teardown.""" + with contextlib.suppress(Exception): + await client.beta.sessions.archive( + session_id, betas=[MANAGED_AGENTS_BETA] + ) + + +async def send_user_message( + client: AsyncAnthropic, + session_id: str, + text: str, +) -> None: + """Send a single ``user.message`` event into a session.""" + await client.beta.sessions.events.send( + session_id, + events=[ + { + "type": "user.message", + "content": [{"type": "text", "text": text}], + } + ], + betas=[MANAGED_AGENTS_BETA], + ) + + +# --------------------------------------------------------------------------- +# Event polling — replaces the broken events.stream() +# --------------------------------------------------------------------------- + + +async def iter_session_events( + client: AsyncAnthropic, + session_id: str, + *, + deadline_seconds: float = 300.0, + poll_interval: float = 2.0, + page_limit: int = 100, +) -> AsyncIterator[dict]: + """Yield session events as plain dicts until ``session.status_idle`` arrives. 
+ + Polls ``beta.sessions.events.list(order="asc")`` every ``poll_interval`` + seconds. Yields each new event exactly once (deduped by ``id``). + Stops on the first ``session.status_idle`` event OR when + ``deadline_seconds`` elapses. + + Why polling instead of ``events.stream()``: the SDK's stream wrapper + routes through the Anthropic Messages API SSE decoder, which only + recognizes Messages event names and silently filters out every + Managed Agents event type. ``events.list()`` returns structured + ``BetaManagedAgentsSessionEvent`` objects directly. See PLAN-V1.2 + §"Step 0 empirical findings" for the full investigation. + """ + deadline = time.monotonic() + deadline_seconds + seen_ids: set[str] = set() + idle_seen = False + + while time.monotonic() < deadline and not idle_seen: + page = await client.beta.sessions.events.list( + session_id, + limit=page_limit, + order="asc", + betas=[MANAGED_AGENTS_BETA], + ) + async for ev in page: + ev_id = getattr(ev, "id", None) + if ev_id is None or ev_id in seen_ids: + continue + seen_ids.add(ev_id) + d = ev.model_dump() if hasattr(ev, "model_dump") else dict(ev) + yield d + if d.get("type") == "session.status_idle": + idle_seen = True + break + + if idle_seen: + return + await asyncio.sleep(poll_interval) + + diff --git a/skillforge/agents/managed_agents/skills.py b/skillforge/agents/managed_agents/skills.py new file mode 100644 index 0000000..e9d6d9d --- /dev/null +++ b/skillforge/agents/managed_agents/skills.py @@ -0,0 +1,163 @@ +"""Skill lifecycle — upload, archive, and archive-safe helpers. + +All the SDK quirks called out in the package docstring (folder name +matching frontmatter, 3-step delete dance, never-delete-Anthropic-skills +guard, BOM normalization) live here. 
+""" + +from __future__ import annotations + +import re + +from anthropic import AsyncAnthropic + +from skillforge.agents.managed_agents._constants import ANTHROPIC_SKILL_SOURCE, SKILLS_BETA + +# --------------------------------------------------------------------------- +# Skill upload + 3-step delete dance +# --------------------------------------------------------------------------- + + +async def upload_skill( + client: AsyncAnthropic, + *, + name: str, + skill_md: str, +) -> str: + """Upload a SKILL.md as a versioned org-level custom skill. + + Two empirical constraints from Step 0: + + 1. The file must live inside a top-level folder — passing a bare + ``SKILL.md`` filename returns ``400 SKILL.md file must be exactly + in the top-level folder.`` + 2. **The folder name must MATCH the ``name:`` field in the SKILL.md + frontmatter** — surfaced during the live end-to-end smoke. The + ``name`` argument to this function is therefore IGNORED for the + folder/upload — we always extract the actual frontmatter name and + use that. The ``name`` arg is still used as the ``display_title`` + (which can be anything human-readable). + + The Anthropic Skills API hard-requires the payload to start literally + with ``---``. A UTF-8 BOM or stray leading whitespace — which neither + our structural validator nor JSON round-tripping strips — is enough + to earn a ``400 SKILL.md must start with YAML frontmatter (---)``. + We normalize here so the ~1% of model outputs with a leading BOM or + whitespace still upload cleanly instead of falling back to inline. + + Returns the new ``skill_id``. The caller is responsible for archiving it + via :func:`archive_skill` after the session completes. + """ + # Strip leading BOM + whitespace the API is strict about; don't touch + # the rest of the body so checksum/fitness stays stable. 
+    normalized = skill_md.lstrip("\ufeff \t\r\n")
+    if not normalized.startswith("---"):
+        raise ValueError(
+            "upload_skill: skill_md does not start with YAML frontmatter (---) "
+            "after stripping BOM/whitespace — refusing to call the API"
+        )
+    folder = _extract_skill_name_from_md(normalized) or name
+    resp = await client.beta.skills.create(
+        display_title=name,
+        files=[
+            (
+                f"{folder}/SKILL.md",
+                normalized.encode("utf-8"),
+                "text/markdown",
+            )
+        ],
+        betas=[SKILLS_BETA],
+    )
+    return resp.id
+
+
+_SKILL_NAME_RE = re.compile(r"^name:\s*(?P<name>[^\s\n]+)\s*$", re.MULTILINE)
+
+
+def _extract_skill_name_from_md(skill_md: str) -> str | None:
+    """Pull the ``name`` field out of a SKILL.md's YAML frontmatter.
+
+    Robust to variations in YAML formatting — uses a simple regex against
+    the raw text instead of parsing YAML, because the API's matching is
+    string-literal so we want exactly what's in the file. Returns None
+    if no name field is found.
+    """
+    if not skill_md.startswith("---"):
+        return None
+    try:
+        _, fm_block, _ = skill_md.split("---", 2)
+    except ValueError:
+        return None
+    match = _SKILL_NAME_RE.search(fm_block)
+    if not match:
+        return None
+    return match.group("name").strip()
+
+
+async def archive_skill(client: AsyncAnthropic, skill_id: str) -> None:
+    """Tear down a custom skill via the 3-step delete dance.
+
+    Steps:
+      1. ``versions.list(skill_id)`` — paginator over version objects
+      2. ``versions.delete(version=ver_str, skill_id=skill_id)`` for each
+      3. ``skills.delete(skill_id)``
+
+    **Anthropic built-in skills are protected**: we never list or delete
+    a skill we did not upload. The caller is responsible for passing
+    only ``skill_id``s that came from :func:`upload_skill`. As a
+    belt-and-suspenders, we re-fetch the skill via ``retrieve`` and
+    refuse to proceed if its ``source`` is ``anthropic``.
+
+    Best-effort: any error in the dance is raised so the caller can log
+    a leak in the ``leaked_skills`` table.
Use :func:`archive_skill_safe` + if you want a swallow-and-log variant. + """ + # Built-in guard + try: + existing = await client.beta.skills.retrieve(skill_id, betas=[SKILLS_BETA]) + source = getattr(existing, "source", None) + if source == ANTHROPIC_SKILL_SOURCE: + raise PermissionError( + f"refusing to archive Anthropic built-in skill {skill_id} " + f"(source={source!r})" + ) + except PermissionError: + raise + except Exception: # noqa: BLE001 + # If retrieve fails (skill already gone? auth issue?), proceed — + # the delete dance will surface a clearer error if there's a + # real problem. + pass + + # Step 1+2: enumerate and delete versions + versions_page = await client.beta.skills.versions.list( + skill_id, betas=[SKILLS_BETA] + ) + async for version in versions_page: + ver = getattr(version, "version", None) + if ver is None and hasattr(version, "model_dump"): + ver = version.model_dump().get("version") + if ver is None: + continue + await client.beta.skills.versions.delete( + version=str(ver), + skill_id=skill_id, + betas=[SKILLS_BETA], + ) + + # Step 3: delete the skill itself + await client.beta.skills.delete(skill_id, betas=[SKILLS_BETA]) + + +async def archive_skill_safe( + client: AsyncAnthropic, + skill_id: str, +) -> tuple[bool, str | None]: + """Swallow-and-log variant. 
Returns ``(success, error_message)``.""" + try: + await archive_skill(client, skill_id) + return True, None + except Exception as exc: # noqa: BLE001 + return False, f"{exc.__class__.__name__}: {str(exc)[:300]}" + + From a6edb81572fbfbd537a3f85ac22a12baee79ad07 Mon Sep 17 00:00:00 2001 From: "Matt (via Claude Code)" Date: Mon, 20 Apr 2026 01:26:15 -0500 Subject: [PATCH 2/4] refactor: split variant_evolution.py (620 LOC) into a package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Decomposed by orchestration level — the mini-evolution loop, the assembly step, and the top-level run entry each live in their own file: variant_evolution/__init__.py barrel + re-exports run_variant_evolution variant_evolution/_helpers.py constants + _tier_sort_key + _aggregate_fitness variant_evolution/dimension.py _run_dimension_mini_evolution (challenge -> spawn -> compete -> score -> judge -> breed -> pick winner) variant_evolution/assembly.py _real_assembly (Engineer call + integration check) variant_evolution/main.py run_variant_evolution orchestrator Largest submodule is dimension.py at 345 LOC, under the 500-LOC ceiling in docs/clean-code.md §2. Prior to this split, the monolith held a single 311-LOC function (_run_dimension_mini_evolution) alongside the assembly logic and the main loop — the file was 620 LOC and every refactor touched everything. Test-access surface preserved: tests/test_variant_evolution.py imports _aggregate_fitness and _tier_sort_key directly from the package, so the __init__ re-exports them. Also rolled in: _extract_skill_name_from_md and _normalize_output_path added to the managed_agents package __all__ (they were already re-exported for test access, just needed the __all__ entry to satisfy F401). QA: ruff + mypy + 411 pytest all green. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- skillforge/agents/managed_agents/__init__.py | 3 + .../engine/variant_evolution/__init__.py | 33 ++ .../engine/variant_evolution/_helpers.py | 31 ++ .../engine/variant_evolution/assembly.py | 81 +++++ .../dimension.py} | 297 +----------------- skillforge/engine/variant_evolution/main.py | 174 ++++++++++ 6 files changed, 333 insertions(+), 286 deletions(-) create mode 100644 skillforge/engine/variant_evolution/__init__.py create mode 100644 skillforge/engine/variant_evolution/_helpers.py create mode 100644 skillforge/engine/variant_evolution/assembly.py rename skillforge/engine/{variant_evolution.py => variant_evolution/dimension.py} (53%) create mode 100644 skillforge/engine/variant_evolution/main.py diff --git a/skillforge/agents/managed_agents/__init__.py b/skillforge/agents/managed_agents/__init__.py index ab42d30..258b1d7 100644 --- a/skillforge/agents/managed_agents/__init__.py +++ b/skillforge/agents/managed_agents/__init__.py @@ -94,4 +94,7 @@ "compute_token_usage", "compute_session_runtime_hours", "session_was_skill_loaded", + # Private helpers re-exported for test access + "_extract_skill_name_from_md", + "_normalize_output_path", ] diff --git a/skillforge/engine/variant_evolution/__init__.py b/skillforge/engine/variant_evolution/__init__.py new file mode 100644 index 0000000..f48f9b9 --- /dev/null +++ b/skillforge/engine/variant_evolution/__init__.py @@ -0,0 +1,33 @@ +"""Variant evolution orchestrator (v2.0 Wave 3-1). + +Atomic-mode entry point. When ``run.evolution_mode == "atomic"`` the +parent ``run_evolution`` dispatcher delegates to ``run_variant_evolution`` +(re-exported below). The orchestrator runs one mini-evolution per +variant dimension recorded against the parent run, then calls the +Engineer to assemble the winners into a composite skill. 
+ +Submodule layout: + + _helpers.py shared constants + small pure helpers + dimension.py per-dimension mini-evolution (challenge -> spawn -> + compete -> score -> judge -> breed -> pick winner) + assembly.py composite assembly via the Engineer + main.py top-level run_variant_evolution orchestrator + +The mini-evolutions reuse existing helpers (Spawner, Competitor, +judging pipeline) directly rather than recursing into ``run_evolution`` +itself — recursion would force a second event loop and complicate the +parent run's event stream. +""" + +from __future__ import annotations + +from skillforge.engine.variant_evolution._helpers import _aggregate_fitness, _tier_sort_key +from skillforge.engine.variant_evolution.main import run_variant_evolution + +__all__ = [ + "run_variant_evolution", + # Private helpers re-exported for test access. + "_aggregate_fitness", + "_tier_sort_key", +] diff --git a/skillforge/engine/variant_evolution/_helpers.py b/skillforge/engine/variant_evolution/_helpers.py new file mode 100644 index 0000000..89533f4 --- /dev/null +++ b/skillforge/engine/variant_evolution/_helpers.py @@ -0,0 +1,31 @@ +"""Shared helpers + defaults for variant_evolution orchestration.""" + +from __future__ import annotations + +from skillforge.models import SkillGenome, VariantEvolution + +# Atomic-mode defaults — small populations because the per-dimension +# challenge is narrow. Wave 1 of Phase 3 kept gen=1 (no breeding loop yet); +# post-v2.0 item 4 bumped the default to 2 so the default produces one +# round of breeding after gen 0. Existing VariantEvolution rows with +# ``num_generations=1`` still work — the loop collapses to a single pass. 
+DEFAULT_VARIANT_POP = 2 +DEFAULT_VARIANT_GENS = 2 +DEFAULT_VARIANT_CONCURRENCY = 3 + + +def _tier_sort_key(ve: VariantEvolution) -> tuple[int, str]: + """Sort foundation dimensions before capability dimensions.""" + order = {"foundation": 0, "capability": 1} + return (order.get(ve.tier, 99), ve.dimension) + + +def _aggregate_fitness(skill: SkillGenome) -> float: + """Compute a single fitness number for ranking variants.""" + if skill.pareto_objectives: + vals = list(skill.pareto_objectives.values()) + return sum(vals) / max(1, len(vals)) + if skill.deterministic_scores: + vals = list(skill.deterministic_scores.values()) + return sum(vals) / max(1, len(vals)) + return 0.0 diff --git a/skillforge/engine/variant_evolution/assembly.py b/skillforge/engine/variant_evolution/assembly.py new file mode 100644 index 0000000..6cb343a --- /dev/null +++ b/skillforge/engine/variant_evolution/assembly.py @@ -0,0 +1,81 @@ +"""Composite assembly — call the Engineer to merge winning variants.""" + +from __future__ import annotations + +import logging + +from skillforge.engine.events import emit +from skillforge.engine.variant_evolution._helpers import _aggregate_fitness +from skillforge.models import SkillGenome +from skillforge.models.run import EvolutionRun + +logger = logging.getLogger("skillforge.engine.variant_evolution.assembly") + + +async def _real_assembly( + run: EvolutionRun, + foundation_winner: SkillGenome | None, + capability_winners: list[SkillGenome], +) -> SkillGenome: + """Phase 4 real assembly — invoke the Engineer agent + integration test. + + Falls back to a "use the highest-fitness winner as-is" path when no + foundation variant exists (some atomic decompositions only have + capability dimensions). Otherwise runs the full Engineer flow: + weave → validate → optionally refine → persist composite. 
+ """ + if foundation_winner is None: + if not capability_winners: + raise RuntimeError("assembly: no winners to assemble from") + # Edge case: no foundation tier in this decomposition. Use the + # highest-fitness capability as the de facto skeleton and emit + # a stub assembly_complete. Wave 4 polish will extend the + # Engineer to handle capability-only assemblies. + await emit( + run.id, + "assembly_started", + capability_count=len(capability_winners), + mode="capability_only_fallback", + ) + composite = max(capability_winners, key=_aggregate_fitness) + await emit( + run.id, + "assembly_complete", + composite_skill_id=composite.id, + capability_count=len(capability_winners), + integration_passed=True, + mode="capability_only_fallback", + ) + return composite + + # Resolve the family for the Engineer call + from skillforge.db.queries import get_family + + family = await get_family(run.family_id) if run.family_id else None + if family is None: + # Defensive fallback — synthesize a minimal SkillFamily so the + # Engineer call still has metadata to work with. The orchestrator + # logs a warning but doesn't block. 
+ from skillforge.models import SkillFamily + + logger.warning( + "run=%s atomic assembly: no family found for family_id=%s; " + "using a synthesized SkillFamily for the Engineer call", + run.id[:8], + run.family_id, + ) + family = SkillFamily( + id=run.family_id or "fam_unknown", + slug="composite", + label="Composite", + specialization=run.specialization, + ) + + from skillforge.engine.assembly import assemble_skill + + composite, _report = await assemble_skill( + run, family, foundation_winner, capability_winners + ) + return composite + + diff --git a/skillforge/engine/variant_evolution.py b/skillforge/engine/variant_evolution/dimension.py similarity index 53% rename from skillforge/engine/variant_evolution.py rename to skillforge/engine/variant_evolution/dimension.py index 4972e41..b574e87 100644 --- a/skillforge/engine/variant_evolution.py +++ b/skillforge/engine/variant_evolution/dimension.py @@ -1,42 +1,9 @@ -"""Variant evolution orchestrator (v2.0 Wave 3-1). - -Atomic-mode entry point. When ``run.evolution_mode == "atomic"`` the parent -``run_evolution`` dispatcher delegates here. The orchestrator runs one -mini-evolution per variant dimension recorded against the parent run, then -calls a stub assembly step that returns the winning foundation as the -composite skill (Phase 4 will replace the stub with the real Engineer). - -Per-dimension flow: - - 1. Read all ``variant_evolutions`` rows for ``run.id``, sorted so - foundation dimensions come before capability dimensions. - 2. For each dimension, run a tiny mini-evolution: - a. Mark the row ``status="running"``, emit - ``variant_evolution_started``. - b. Design ONE focused challenge via - ``challenge_designer.design_variant_challenge``. - c. Spawn ``population_size`` variants via - ``spawner.spawn_variant_gen0`` — capability variants receive the - winning foundation as grounding context. - d. Run each spawned variant through the Competitor against the - single focused challenge. - e. 
Run the judging pipeline against the gathered results. - f. Pick the highest-fitness variant as the winner. Persist it as a - ``Variant`` row tied back to the family + the - ``VariantEvolution`` id. - g. Mark the ``VariantEvolution`` row ``status="complete"`` with - ``winner_variant_id`` and ``completed_at``. Emit - ``variant_evolution_complete``. - 3. After every dimension is done, call the assembly stub. - The Phase 4 Engineer will replace this stub. - 4. Set ``run.best_skill`` to the assembled composite, persist, and let - the parent ``run_evolution`` finalize. - -The mini-evolutions reuse existing helpers (Spawner, Competitor, judging -pipeline) directly rather than recursing into ``run_evolution`` itself. -Recursion would force a second event loop and complicate the parent run's -event stream — direct helper calls keep the event order deterministic and -the wall-clock budget bounded. +"""Per-dimension mini-evolution. + +Takes one ``VariantEvolution`` row, runs the full small-scale pipeline +(challenge design → spawn → compete → score → judge → breed → pick +winner), and returns the winning Variant + its genome. Called once per +dimension by ``main.run_variant_evolution``. """ from __future__ import annotations @@ -47,7 +14,6 @@ from datetime import UTC, datetime from skillforge.db.queries import ( - get_variant_evolutions_for_run, get_variants_for_family, save_challenge, save_genome, @@ -56,43 +22,14 @@ save_variant_evolution, ) from skillforge.engine.events import emit -from skillforge.models import ( - Generation, - SkillGenome, - Variant, - VariantEvolution, +from skillforge.engine.variant_evolution._helpers import ( + DEFAULT_VARIANT_CONCURRENCY, + _aggregate_fitness, ) +from skillforge.models import Generation, SkillGenome, Variant, VariantEvolution from skillforge.models.run import EvolutionRun -logger = logging.getLogger("skillforge.engine.variant_evolution") - -# Atomic-mode defaults — small populations because the per-dimension -# challenge is narrow. 
Wave 1 of Phase 3 keeps gen=1 (no breeding loop yet); -# Wave 4 will introduce per-dimension breeding. -DEFAULT_VARIANT_POP = 2 -# Post-v2.0 item 4: multi-generation breeding loops are now supported inside -# _run_dimension_mini_evolution. Bumped to 2 so the default produces one -# round of breeding after gen 0. Existing VariantEvolution rows with -# ``num_generations=1`` still work — the loop collapses to a single pass. -DEFAULT_VARIANT_GENS = 2 -DEFAULT_VARIANT_CONCURRENCY = 3 - - -def _tier_sort_key(ve: VariantEvolution) -> tuple[int, str]: - """Sort foundation dimensions before capability dimensions.""" - order = {"foundation": 0, "capability": 1} - return (order.get(ve.tier, 99), ve.dimension) - - -def _aggregate_fitness(skill: SkillGenome) -> float: - """Compute a single fitness number for ranking variants.""" - if skill.pareto_objectives: - vals = list(skill.pareto_objectives.values()) - return sum(vals) / max(1, len(vals)) - if skill.deterministic_scores: - vals = list(skill.deterministic_scores.values()) - return sum(vals) / max(1, len(vals)) - return 0.0 +logger = logging.getLogger("skillforge.engine.variant_evolution.dimension") async def _run_dimension_mini_evolution( @@ -406,215 +343,3 @@ async def _run_dimension_mini_evolution( return variant, winner_genome -async def _real_assembly( - run: EvolutionRun, - foundation_winner: SkillGenome | None, - capability_winners: list[SkillGenome], -) -> SkillGenome: - """Phase 4 real assembly — invoke the Engineer agent + integration test. - - Falls back to a "use the highest-fitness winner as-is" path when no - foundation variant exists (some atomic decompositions only have - capability dimensions). Otherwise runs the full Engineer flow: - weave → validate → optionally refine → persist composite. - """ - if foundation_winner is None: - if not capability_winners: - raise RuntimeError("assembly: no winners to assemble from") - # Edge case: no foundation tier in this decomposition. 
Use the - # highest-fitness capability as the de facto skeleton and emit - # a stub assembly_complete. Wave 4 polish will extend the - # Engineer to handle capability-only assemblies. - await emit( - run.id, - "assembly_started", - capability_count=len(capability_winners), - mode="capability_only_fallback", - ) - composite = max(capability_winners, key=_aggregate_fitness) - await emit( - run.id, - "assembly_complete", - composite_skill_id=composite.id, - capability_count=len(capability_winners), - integration_passed=True, - mode="capability_only_fallback", - ) - return composite - - # Resolve the family for the Engineer call - from skillforge.db.queries import get_family - - family = await get_family(run.family_id) if run.family_id else None - if family is None: - # Defensive fallback — synthesize a minimal SkillFamily so the - # Engineer call still has metadata to work with. The orchestrator - # logs a warning but doesn't block. - from skillforge.models import SkillFamily - - logger.warning( - "run=%s atomic assembly: no family found for family_id=%s; " - "using a synthesized SkillFamily for the Engineer call", - run.id[:8], - run.family_id, - ) - family = SkillFamily( - id=run.family_id or "fam_unknown", - slug="composite", - label="Composite", - specialization=run.specialization, - ) - - from skillforge.engine.assembly import assemble_skill - - composite, _report = await assemble_skill( - run, family, foundation_winner, capability_winners - ) - return composite - - -async def run_variant_evolution(run: EvolutionRun) -> EvolutionRun: - """Top-level atomic-mode orchestrator. - - Reads the ``variant_evolutions`` rows for ``run.id``, processes each - dimension in tier order, and stamps ``run.best_skill`` with the - assembled composite. Falls back to molecular mode and logs a warning - if no dimensions are recorded against the run (defensive — the - Taxonomist should always create them at submission time for atomic). 
- """ - all_rows = await get_variant_evolutions_for_run(run.id) - - # Filter to rows that actually need work. Rows already in a terminal - # state (complete/failed) from prior runs must NOT be re-processed — - # that was causing 4x API spend on re-runs because the live test's - # hardcoded run_id accumulates stale rows across test invocations. - # "running" is included because a previous crash may have left a row - # stuck mid-processing; we let the orchestrator retry it. - pending = [ - v for v in all_rows if v.status not in {"complete", "failed"} - ] - skipped = len(all_rows) - len(pending) - if skipped: - logger.info( - "run=%s atomic mode: skipping %d terminal variant_evolutions " - "(%d pending)", - run.id[:8], - skipped, - len(pending), - ) - - if not pending: - logger.warning( - "run=%s atomic mode requested but no pending variant_evolutions; " - "falling back to molecular pipeline", - run.id[:8], - ) - # Caller (run_evolution dispatcher) will handle the fallback - run.evolution_mode = "molecular" - return run - - pending.sort(key=_tier_sort_key) - foundation_winner: SkillGenome | None = None - capability_winners: list[SkillGenome] = [] - - # --- Managed Agents environment (shared across all dimensions) --- - from skillforge.config import COMPETITOR_BACKEND - - env_id: str | None = None - managed_client = None - if COMPETITOR_BACKEND == "managed": - from skillforge.agents import managed_agents - - try: - logger.info("run=%s creating managed environment...", run.id[:8]) - managed_client = managed_agents.make_client() - env_id = await managed_agents.create_environment( - managed_client, run_id=run.id - ) - logger.info("run=%s managed environment ready: %s", run.id[:8], env_id) - await emit(run.id, "managed_environment_ready", environment_id=env_id) - except Exception as exc: # noqa: BLE001 — managed-env boundary: any SDK failure must be captured - logger.exception("run=%s managed environment creation failed", run.id[:8]) - run.status = "failed" - 
run.failure_reason = f"managed environment creation failed: {exc}" - await save_run(run) - return run - - logger.info( - "run=%s atomic mode: %d variant_evolutions queued", - run.id[:8], - len(pending), - ) - - try: - for vevo in pending: - # Apply default population size if the row was created without one - if vevo.population_size <= 0: - vevo.population_size = DEFAULT_VARIANT_POP - if vevo.num_generations <= 0: - vevo.num_generations = DEFAULT_VARIANT_GENS - - vevo.status = "running" - await save_variant_evolution(vevo) - await emit( - run.id, - "variant_evolution_started", - variant_evolution_id=vevo.id, - dimension=vevo.dimension, - tier=vevo.tier, - population_size=vevo.population_size, - ) - - try: - _variant, winner_genome = await _run_dimension_mini_evolution( - run=run, - vevo=vevo, - foundation_winner=foundation_winner, - env_id=env_id, - ) - except Exception as exc: # noqa: BLE001 — one bad dimension must not crash the whole atomic run - logger.exception( - "run=%s dimension %s mini-evolution failed", - run.id[:8], - vevo.dimension, - ) - vevo.status = "failed" - await save_variant_evolution(vevo) - await emit( - run.id, - "variant_evolution_complete", - variant_evolution_id=vevo.id, - dimension=vevo.dimension, - tier=vevo.tier, - status="failed", - error=str(exc), - ) - raise - - await emit( - run.id, - "variant_evolution_complete", - variant_evolution_id=vevo.id, - dimension=vevo.dimension, - tier=vevo.tier, - winner_variant_id=vevo.winner_variant_id, - status="complete", - ) - - if vevo.tier == "foundation": - foundation_winner = winner_genome - else: - capability_winners.append(winner_genome) - - composite = await _real_assembly(run, foundation_winner, capability_winners) - run.best_skill = composite - finally: - # Tear down managed environment - if env_id is not None and managed_client is not None: - try: - from skillforge.agents import managed_agents as _ma - await _ma.archive_environment(managed_client, env_id) - logger.info("run=%s managed 
environment archived: %s", run.id[:8], env_id) - except Exception: # noqa: BLE001 - logger.warning("run=%s managed environment cleanup failed", run.id[:8]) - return run diff --git a/skillforge/engine/variant_evolution/main.py b/skillforge/engine/variant_evolution/main.py new file mode 100644 index 0000000..96ddac3 --- /dev/null +++ b/skillforge/engine/variant_evolution/main.py @@ -0,0 +1,174 @@ +"""Top-level atomic-mode orchestrator. + +Reads the run's variant_evolutions rows, runs one mini-evolution per +dimension, then assembles the winners via the Engineer. +""" + +from __future__ import annotations + +import logging + +from skillforge.db.queries import ( + get_variant_evolutions_for_run, + save_run, + save_variant_evolution, +) +from skillforge.engine.events import emit +from skillforge.engine.variant_evolution._helpers import ( + DEFAULT_VARIANT_GENS, + DEFAULT_VARIANT_POP, + _tier_sort_key, +) +from skillforge.engine.variant_evolution.assembly import _real_assembly +from skillforge.engine.variant_evolution.dimension import _run_dimension_mini_evolution +from skillforge.models import SkillGenome +from skillforge.models.run import EvolutionRun + +logger = logging.getLogger("skillforge.engine.variant_evolution") + + +async def run_variant_evolution(run: EvolutionRun) -> EvolutionRun: + """Top-level atomic-mode orchestrator. + + Reads the ``variant_evolutions`` rows for ``run.id``, processes each + dimension in tier order, and stamps ``run.best_skill`` with the + assembled composite. Falls back to molecular mode and logs a warning + if no dimensions are recorded against the run (defensive — the + Taxonomist should always create them at submission time for atomic). + """ + all_rows = await get_variant_evolutions_for_run(run.id) + + # Filter to rows that actually need work. 
Rows already in a terminal + # state (complete/failed) from prior runs must NOT be re-processed — + # that was causing 4x API spend on re-runs because the live test's + # hardcoded run_id accumulates stale rows across test invocations. + # "running" is included because a previous crash may have left a row + # stuck mid-processing; we let the orchestrator retry it. + pending = [ + v for v in all_rows if v.status not in {"complete", "failed"} + ] + skipped = len(all_rows) - len(pending) + if skipped: + logger.info( + "run=%s atomic mode: skipping %d terminal variant_evolutions " + "(%d pending)", + run.id[:8], + skipped, + len(pending), + ) + + if not pending: + logger.warning( + "run=%s atomic mode requested but no pending variant_evolutions; " + "falling back to molecular pipeline", + run.id[:8], + ) + # Caller (run_evolution dispatcher) will handle the fallback + run.evolution_mode = "molecular" + return run + + pending.sort(key=_tier_sort_key) + foundation_winner: SkillGenome | None = None + capability_winners: list[SkillGenome] = [] + + # --- Managed Agents environment (shared across all dimensions) --- + from skillforge.config import COMPETITOR_BACKEND + + env_id: str | None = None + managed_client = None + if COMPETITOR_BACKEND == "managed": + from skillforge.agents import managed_agents + + try: + logger.info("run=%s creating managed environment...", run.id[:8]) + managed_client = managed_agents.make_client() + env_id = await managed_agents.create_environment( + managed_client, run_id=run.id + ) + logger.info("run=%s managed environment ready: %s", run.id[:8], env_id) + await emit(run.id, "managed_environment_ready", environment_id=env_id) + except Exception as exc: # noqa: BLE001 — managed-env boundary: any SDK failure must be captured + logger.exception("run=%s managed environment creation failed", run.id[:8]) + run.status = "failed" + run.failure_reason = f"managed environment creation failed: {exc}" + await save_run(run) + return run + + logger.info( + 
"run=%s atomic mode: %d variant_evolutions queued", + run.id[:8], + len(pending), + ) + + try: + for vevo in pending: + # Apply default population size if the row was created without one + if vevo.population_size <= 0: + vevo.population_size = DEFAULT_VARIANT_POP + if vevo.num_generations <= 0: + vevo.num_generations = DEFAULT_VARIANT_GENS + + vevo.status = "running" + await save_variant_evolution(vevo) + await emit( + run.id, + "variant_evolution_started", + variant_evolution_id=vevo.id, + dimension=vevo.dimension, + tier=vevo.tier, + population_size=vevo.population_size, + ) + + try: + _variant, winner_genome = await _run_dimension_mini_evolution( + run=run, + vevo=vevo, + foundation_winner=foundation_winner, + env_id=env_id, + ) + except Exception as exc: # noqa: BLE001 — one bad dimension must not crash the whole atomic run + logger.exception( + "run=%s dimension %s mini-evolution failed", + run.id[:8], + vevo.dimension, + ) + vevo.status = "failed" + await save_variant_evolution(vevo) + await emit( + run.id, + "variant_evolution_complete", + variant_evolution_id=vevo.id, + dimension=vevo.dimension, + tier=vevo.tier, + status="failed", + error=str(exc), + ) + raise + + await emit( + run.id, + "variant_evolution_complete", + variant_evolution_id=vevo.id, + dimension=vevo.dimension, + tier=vevo.tier, + winner_variant_id=vevo.winner_variant_id, + status="complete", + ) + + if vevo.tier == "foundation": + foundation_winner = winner_genome + else: + capability_winners.append(winner_genome) + + composite = await _real_assembly(run, foundation_winner, capability_winners) + run.best_skill = composite + finally: + # Tear down managed environment + if env_id is not None and managed_client is not None: + try: + from skillforge.agents import managed_agents as _ma + await _ma.archive_environment(managed_client, env_id) + logger.info("run=%s managed environment archived: %s", run.id[:8], env_id) + except Exception: # noqa: BLE001 + logger.warning("run=%s managed environment 
cleanup failed", run.id[:8]) + return run From be09bd985984f9b71c1faa181bcaa5c3da2e8f6b Mon Sep 17 00:00:00 2001 From: "Matt (via Claude Code)" Date: Mon, 20 Apr 2026 01:45:35 -0500 Subject: [PATCH 3/4] refactor: split breeder.py (629 LOC) into a package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Decomposed by responsibility. The six section comments in the monolith ("slot allocation", "ranking", "main breed", "mutation prompts", "lessons + reports", "bible publishing") each correspond to a submodule: breeder/__init__.py barrel — re-exports breed + compute_slots + rank_skills + publish_findings_to_bible, plus legacy aliases (breed_next_gen, spawn_gen0, BIBLE_DIR) that tests patch on the package root breeder/_ranking.py compute_slots + rank_skills + _aggregate_fitness breeder/_prompts.py diagnostic + crossover instruction templates + breeding-context formatter (pure) breeder/_reports.py _extract_lessons_and_report + siblings (LLM calls) breeder/main.py breed() + _carry_elite (orchestrator) breeder/bible.py publish_findings_to_bible (disk I/O) Largest submodule is _reports.py at 213 LOC, under the 500-LOC Python ceiling in docs/clean-code.md §2. Test-patch compatibility ------------------------ Tests patch three functions on the package root: ``breeder.breed_next_gen``, ``breeder.spawn_gen0``, ``breeder._extract_lessons_and_report``. Those patches don't propagate to bindings in submodules, so ``main.breed()`` now resolves each through the package namespace at call time (``_pkg().breed_next_gen`` etc.). BIBLE_DIR follows the same pattern in bible.py. QA: ruff + mypy + 411 pytest (unchanged) all green. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- skillforge/agents/breeder.py | 629 -------------------------- skillforge/agents/breeder/__init__.py | 69 +++ skillforge/agents/breeder/_prompts.py | 89 ++++ skillforge/agents/breeder/_ranking.py | 79 ++++ skillforge/agents/breeder/_reports.py | 213 +++++++++ skillforge/agents/breeder/bible.py | 139 ++++++ skillforge/agents/breeder/main.py | 146 ++++++ 7 files changed, 735 insertions(+), 629 deletions(-) delete mode 100644 skillforge/agents/breeder.py create mode 100644 skillforge/agents/breeder/__init__.py create mode 100644 skillforge/agents/breeder/_prompts.py create mode 100644 skillforge/agents/breeder/_ranking.py create mode 100644 skillforge/agents/breeder/_reports.py create mode 100644 skillforge/agents/breeder/bible.py create mode 100644 skillforge/agents/breeder/main.py diff --git a/skillforge/agents/breeder.py b/skillforge/agents/breeder.py deleted file mode 100644 index e2d0aac..0000000 --- a/skillforge/agents/breeder.py +++ /dev/null @@ -1,629 +0,0 @@ -"""Breeder — reflective mutation, multi-parent crossover, learning log, bible publishing. - -Inspired by GEPA's Actionable Side Information: mutations are diagnostic, not -random. The Breeder reads execution traces and trait attribution from the judging -pipeline, identifies root causes of failures, and proposes targeted fixes. - -Responsibilities: -- Elitism: top N Skills survive unchanged (N scales with population size) -- Reflective crossover: combine traits from 2-3 parents guided by attribution -- Diagnostic mutation: fix specific causes surfaced by trait attribution -- Joint component mutation: frontmatter + body + scripts mutate together -- Wildcard: 1+ slots per generation for fresh Skills -- Learning log maintenance: append new lessons each generation -- Bible publishing: extract generalizable findings to ``bible/findings/`` - -Slot allocation scales with ``target_pop_size`` (never hardcoded; see PLAN.md -§Cross-cutting contracts #11). 
-""" - -from __future__ import annotations - -import json -import logging -import re -from datetime import UTC, datetime - -from anthropic import AsyncAnthropic - -from skillforge.agents._llm import stream_text -from skillforge.agents.spawner import breed_next_gen, spawn_gen0 -from skillforge.config import ( - ANTHROPIC_API_KEY, - BIBLE_DIR, - BREEDER_CALL_MODE, - model_for, -) -from skillforge.models import Generation, SkillGenome - -logger = logging.getLogger("skillforge.agents.breeder") - -# --------------------------------------------------------------------------- -# Slot allocation -# --------------------------------------------------------------------------- - - -def compute_slots(target_pop_size: int) -> dict[str, int]: - """Allocate breeding slots as a function of ``target_pop_size``. - - Formula (from PLAN.md §Step 6e Breeder): - - elitism = max(1, target_pop_size // 5 * 2) ~40% floor 1 - wildcards = max(1, target_pop_size // 10) ~10% floor 1 - remainder = target_pop_size - elitism - wildcards - diagnostic = remainder // 2 - crossover = remainder - diagnostic - - Worked examples: - pop_size=3 → elitism=1, wildcards=1, diagnostic=0, crossover=1 (sum 3) - pop_size=5 → elitism=2, wildcards=1, diagnostic=1, crossover=1 (sum 5) - pop_size=10 → elitism=4, wildcards=1, diagnostic=2, crossover=3 (sum 10) - """ - if target_pop_size < 1: - raise ValueError(f"target_pop_size must be >=1, got {target_pop_size}") - - elitism = max(1, (target_pop_size // 5) * 2) - wildcards = max(1, target_pop_size // 10) - - # Ensure elitism + wildcards doesn't exceed target (pathological tiny sizes) - if elitism + wildcards > target_pop_size: - elitism = max(1, target_pop_size - 1) - wildcards = max(0, target_pop_size - elitism) - - remainder = target_pop_size - elitism - wildcards - diagnostic = remainder // 2 - crossover = remainder - diagnostic - - slots = { - "elitism": elitism, - "wildcards": wildcards, - "diagnostic": diagnostic, - "crossover": crossover, - } - assert 
sum(slots.values()) == target_pop_size, ( - f"slot sum {sum(slots.values())} != target {target_pop_size}: {slots}" - ) - return slots - - -# --------------------------------------------------------------------------- -# Ranking -# --------------------------------------------------------------------------- - - -def _aggregate_fitness(skill: SkillGenome) -> float: - """Scalar aggregate of Pareto objectives for ranking (charts/selection). - - The Pareto front is the real answer; this scalar is a summary for - ordering within the front (and for ranking Skills OFF the front). - """ - if not skill.pareto_objectives: - return 0.0 - return sum(skill.pareto_objectives.values()) / len(skill.pareto_objectives) - - -def rank_skills(generation: Generation) -> list[SkillGenome]: - """Return generation.skills sorted by (is_pareto_optimal desc, fitness desc).""" - return sorted( - generation.skills, - key=lambda s: (s.is_pareto_optimal, _aggregate_fitness(s)), - reverse=True, - ) - - -# --------------------------------------------------------------------------- -# Main breed() entry point -# --------------------------------------------------------------------------- - - -async def breed( - generation: Generation, - learning_log: list[str], - specialization: str, - target_pop_size: int, -) -> tuple[list[SkillGenome], list[str], str]: - """Produce the next generation from a ranked current generation. - - Returns ``(next_gen_skills, new_learning_log_entries, breeding_report)``. - - The slot allocation scales with ``target_pop_size`` — see ``compute_slots``. - The function guarantees ``len(next_gen_skills) == target_pop_size``. 
- """ - slots = compute_slots(target_pop_size) - ranked = rank_skills(generation) - - next_gen: list[SkillGenome] = [] - - # --- Elitism: top-N survive unchanged (but bump generations_survived) --- - elites = ranked[: slots["elitism"]] - for elite in elites: - carried = _carry_elite(elite) - next_gen.append(carried) - - # --- Diagnostic mutation: pick low-scoring Skills, ask LLM for targeted fixes --- - low_scorers = ranked[-slots["diagnostic"] :] if slots["diagnostic"] > 0 else [] - diagnostic_instructions = _build_diagnostic_instructions( - low_scorers, learning_log, slots["diagnostic"] - ) - if slots["diagnostic"] > 0 and low_scorers: - try: - diagnostic_children = await breed_next_gen( - parents=low_scorers, - learning_log=learning_log, - breeding_instructions=diagnostic_instructions, - ) - next_gen.extend(diagnostic_children[: slots["diagnostic"]]) - except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed - # Fall through — wildcard slots below absorb the shortfall. 
- logger.exception("breeder.diagnostic_failed") - - # --- Reflective crossover: combine 2-3 Pareto-optimal parents --- - pareto_parents = [s for s in ranked if s.is_pareto_optimal][:3] - if not pareto_parents: - # Fallback: use top 3 by fitness if nobody is Pareto-optimal - pareto_parents = ranked[:3] - - crossover_instructions = _build_crossover_instructions( - pareto_parents, learning_log, slots["crossover"] - ) - if slots["crossover"] > 0 and pareto_parents: - try: - crossover_children = await breed_next_gen( - parents=pareto_parents, - learning_log=learning_log, - breeding_instructions=crossover_instructions, - ) - next_gen.extend(crossover_children[: slots["crossover"]]) - except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed - logger.exception("breeder.crossover_failed") - - # --- Wildcard: fresh Skills via spawn_gen0 --- - if slots["wildcards"] > 0: - try: - wildcards = await spawn_gen0( - specialization=specialization, - pop_size=slots["wildcards"], - ) - # Mark wildcards as mutations on the next generation - next_gen_num = generation.number + 1 - for w in wildcards: - w.generation = next_gen_num - w.mutations = ["wildcard"] - w.mutation_rationale = "Wildcard slot: fresh spawn to prevent convergence" - next_gen.extend(wildcards) - except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed - logger.exception("breeder.wildcard_spawn_failed") - - # --- Trim or pad to exactly target_pop_size --- - next_gen = next_gen[:target_pop_size] - - # If we fell short (any slot failed), pad with elites cloned forward - while len(next_gen) < target_pop_size and ranked: - next_gen.append(_carry_elite(ranked[0])) - - assert len(next_gen) == target_pop_size, ( - f"breeder produced {len(next_gen)} children, expected {target_pop_size}" - ) - - # --- Stamp generation number on everything --- - next_gen_num = generation.number + 1 - for child in next_gen: - child.generation = next_gen_num - - 
# --- Extract new learning log entries + write breeding report --- - new_lessons, breeding_report = await _extract_lessons_and_report( - generation, learning_log, slots, elites, pareto_parents - ) - - return (next_gen, new_lessons, breeding_report) - - -def _carry_elite(skill: SkillGenome) -> SkillGenome: - """Return an elite skill carried forward with bumped metadata.""" - import copy - - carried = copy.deepcopy(skill) - carried.generations_survived += 1 - carried.mutations = ["elitism"] - carried.mutation_rationale = "Elitism: top-ranked parent carried forward unchanged" - # Bump maturity if the skill is surviving well - if carried.generations_survived >= 3 and carried.maturity == "tested": - carried.maturity = "hardened" - elif carried.generations_survived >= 2 and carried.maturity == "draft": - carried.maturity = "tested" - return carried - - -# --------------------------------------------------------------------------- -# Prompt builders -# --------------------------------------------------------------------------- - - -def _build_diagnostic_instructions( - low_scorers: list[SkillGenome], - learning_log: list[str], - n_children: int, -) -> str: - """Build breeding instructions for diagnostic mutation of low scorers.""" - if not low_scorers or n_children == 0: - return "" - - diagnoses = [] - for skill in low_scorers: - worst_traits = sorted( - skill.trait_attribution.items(), - key=lambda kv: kv[1], - )[:3] - trait_notes = "\n".join( - f" - {t}: contribution {c:.2f} — {skill.trait_diagnostics.get(t, 'no diagnosis')}" - for t, c in worst_traits - ) - diagnoses.append( - f" Skill {skill.id[:8]}:\n" - f" aggregate fitness: {_aggregate_fitness(skill):.2f}\n" - f" worst traits:\n{trait_notes}" - ) - - log_section = "\n".join(f" - {entry}" for entry in learning_log[-10:]) - - return ( - f"Produce exactly {n_children} child Skill(s) by DIAGNOSTIC MUTATION of the " - "low-scoring parent(s) below. 
For each child, identify the root cause of " - "the parent's low fitness (from the trait diagnostics), and make a TARGETED " - "fix — rewrite or remove the underperforming instructions, tighten vague " - "phrasing, add concrete examples for ignored rules, or rescope the trait.\n\n" - "Do NOT make random changes. Every mutation must cite a specific parent " - "trait and explain (in mutation_rationale) how the child addresses it.\n\n" - f"Low-scoring parents:\n{chr(10).join(diagnoses)}\n\n" - f"Recent lessons (learning log):\n{log_section or ' (none yet)'}" - ) - - -def _build_crossover_instructions( - parents: list[SkillGenome], - learning_log: list[str], - n_children: int, -) -> str: - """Build instructions for reflective crossover across 2-3 parents.""" - if not parents or n_children == 0: - return "" - - parent_notes = [] - for p in parents: - best_traits = sorted( - p.trait_attribution.items(), - key=lambda kv: kv[1], - reverse=True, - )[:3] - trait_summary = ", ".join(f"{t}:{c:+.2f}" for t, c in best_traits) or "(no attribution)" - parent_notes.append( - f" Parent {p.id[:8]} (fitness {_aggregate_fitness(p):.2f}): " - f"best traits → {trait_summary}" - ) - - log_section = "\n".join(f" - {entry}" for entry in learning_log[-10:]) - - return ( - f"Produce exactly {n_children} child Skill(s) by REFLECTIVE CROSSOVER of the " - f"Pareto-optimal parents below. Combine the HIGH-CONTRIBUTING traits from " - "each parent into each child, preserving the causal mechanism that made " - "each trait successful (not just the surface phrasing).\n\n" - "Crossover is NOT concatenation. 
For each child, explain (in mutation_rationale) " - "which traits from which parents were combined and WHY those particular " - "traits work together.\n\n" - f"Pareto-optimal parents:\n{chr(10).join(parent_notes)}\n\n" - f"Recent lessons (learning log):\n{log_section or ' (none yet)'}" - ) - - -# --------------------------------------------------------------------------- -# Learning log extraction + breeding report -# --------------------------------------------------------------------------- - - -async def _extract_lessons_and_report( - generation: Generation, - learning_log: list[str], - slots: dict[str, int], - elites: list[SkillGenome], - pareto_parents: list[SkillGenome], -) -> tuple[list[str], str]: - """Ask the LLM for (a) new learning log entries and (b) a breeding report. - - Dispatches on ``config.BREEDER_CALL_MODE``: - - "separate" (default): two LLM calls, one for lessons, one for report - - "consolidated" (Flex-3 cost saver): one structured call returning both - """ - context = _build_breeding_context(generation, slots, elites, pareto_parents) - - if BREEDER_CALL_MODE == "consolidated": - return await _extract_consolidated(context, learning_log) - lessons = await _extract_lessons(context, learning_log) - report = await _extract_breeding_report(context, slots, elites, pareto_parents) - return lessons, report - - -def _build_breeding_context( - generation: Generation, - slots: dict[str, int], - elites: list[SkillGenome], - pareto_parents: list[SkillGenome], -) -> str: - """Summarize this generation's results for the Breeder's LLM prompts.""" - elite_section = "\n".join( - f" - {s.id[:8]} fitness={_aggregate_fitness(s):.2f} traits={s.traits[:3]}" - for s in elites - ) or " (none)" - - pareto_section = "\n".join( - f" - {s.id[:8]} fitness={_aggregate_fitness(s):.2f}" - for s in pareto_parents - ) or " (none)" - - # Top 3 trait contributions across all results - all_traits: dict[str, list[float]] = {} - for r in generation.results: - for trait, contrib in 
r.trait_contribution.items(): - all_traits.setdefault(trait, []).append(contrib) - trait_means = sorted( - [(t, sum(vs) / len(vs)) for t, vs in all_traits.items()], - key=lambda kv: kv[1], - reverse=True, - ) - top_traits = "\n".join( - f" - {t}: {m:+.2f} (from trace attribution)" for t, m in trait_means[:5] - ) or " (no trait data)" - - return ( - f"Generation {generation.number} summary:\n" - f" population: {len(generation.skills)}\n" - f" best_fitness: {generation.best_fitness:.3f}\n" - f" avg_fitness: {generation.avg_fitness:.3f}\n" - f" pareto_front_size: {len(generation.pareto_front)}\n" - f"\n" - f"Slot allocation for next gen: {slots}\n" - f"\n" - f"Elites (carrying forward):\n{elite_section}\n" - f"\n" - f"Pareto-optimal parents selected for crossover:\n{pareto_section}\n" - f"\n" - f"Top-contributing traits this generation:\n{top_traits}\n" - ) - - -async def _extract_lessons(context: str, learning_log: list[str]) -> list[str]: - """Single LLM call extracting generalizable lessons as a JSON array.""" - recent_log = "\n".join(f"- {e}" for e in learning_log[-10:]) - - prompt = ( - "You are the Breeder agent for a population-based evolution of Claude Agent Skills. " - "Based on the generation summary below, identify 1-3 NEW generalizable lessons " - "about Skill authoring that this generation revealed. Do NOT repeat lessons from " - "the existing learning log. Lessons should be actionable for future breeding, " - "generic enough to apply across domains, and grounded in the trait attribution data.\n\n" - f"## Generation summary\n{context}\n\n" - f"## Existing learning log (don't repeat these)\n{recent_log or '(empty)'}\n\n" - "## Response format\n" - 'Respond with ONLY a JSON array of 1-3 strings, like ["lesson 1", "lesson 2"]. ' - "No prose before or after." 
- ) - - try: - client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=300.0) - text = await stream_text( - client, - model=model_for("breeder"), - max_tokens=500, - messages=[{"role": "user", "content": prompt}], - ) - except Exception: - # Degrade gracefully — a breeder that blocks on LLM hiccups would - # stall the whole run. The SDK has many concrete error types across - # versions; catching at the boundary keeps the engine moving. - logger.exception("breeder.lesson_extraction_failed") - return ["(lesson extraction failed)"] - - match = re.search(r"\[.*\]", text, re.DOTALL) - if not match: - return [] - try: - lessons = json.loads(match.group(0)) - except json.JSONDecodeError: - return [] - return [str(lesson) for lesson in lessons if isinstance(lesson, str)][:3] - - -async def _extract_breeding_report( - context: str, - slots: dict[str, int], - elites: list[SkillGenome], - pareto_parents: list[SkillGenome], -) -> str: - """Single LLM call producing a human-readable breeding report.""" - prompt = ( - "You are the Breeder agent for SkillForge. Write a 2-paragraph breeding report " - "explaining the decisions for the next generation. Paragraph 1: what this " - "generation revealed about trait fitness and which skills earned elite/Pareto " - "status. Paragraph 2: the strategy for the next generation's diagnostic " - "mutations and crossovers. Be specific, cite skill IDs by their 8-char prefix, " - "and reference trait contributions when they shaped a decision.\n\n" - f"## Generation summary\n{context}\n\n" - "Respond with ONLY the report prose. No headings." - ) - - try: - client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=300.0) - return await stream_text( - client, - model=model_for("breeder"), - max_tokens=800, - messages=[{"role": "user", "content": prompt}], - ) - except Exception: - # Degrade gracefully — see _extract_lessons for rationale. 
- logger.exception("breeder.report_extraction_failed") - return "(breeding report failed)" - - -async def _extract_consolidated( - context: str, - learning_log: list[str], -) -> tuple[list[str], str]: - """Flex-3 cost saver: one LLM call produces both lessons and report as JSON.""" - recent_log = "\n".join(f"- {e}" for e in learning_log[-10:]) - - prompt = ( - "You are the Breeder agent for SkillForge. Given the generation summary below, " - "produce BOTH: (1) 1-3 NEW generalizable lessons about Skill authoring, and " - "(2) a 2-paragraph breeding report explaining the decisions.\n\n" - f"## Generation summary\n{context}\n\n" - f"## Existing learning log (don't repeat)\n{recent_log or '(empty)'}\n\n" - "## Response format\n" - "Respond with ONLY a JSON object matching:\n" - '{\n' - ' "lessons": ["lesson 1", "lesson 2"],\n' - ' "report": "Paragraph 1...\\n\\nParagraph 2..."\n' - '}\n' - "No prose before or after the JSON." - ) - - try: - client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=300.0) - text = await stream_text( - client, - model=model_for("breeder"), - max_tokens=1200, - messages=[{"role": "user", "content": prompt}], - ) - except Exception: - # Degrade gracefully — see _extract_lessons for rationale. 
- logger.exception("breeder.consolidated_extraction_failed") - return (["(consolidated extraction failed)"], "") - - match = re.search(r"\{.*\}", text, re.DOTALL) - if not match: - return ([], "") - try: - raw = json.loads(match.group(0)) - except json.JSONDecodeError: - return ([], "") - - lessons = [str(entry) for entry in raw.get("lessons", []) if isinstance(entry, str)][:3] - report = str(raw.get("report", "")) - return (lessons, report) - - -# --------------------------------------------------------------------------- -# Bible publishing -# --------------------------------------------------------------------------- - - -def publish_findings_to_bible( - new_entries: list[str], - run_id: str, - generation: int, -) -> None: - """Write new learning-log entries as numbered finding files under bible/findings/. - - Each finding gets its own file following the schema in bible/README.md. - Also appends a summary line to bible/evolution-log.md. - - Failures here are logged but never raised — we don't want a bible write - failure to abort an evolution run. 
- """ - if not new_entries: - return - - findings_dir = BIBLE_DIR / "findings" - try: - findings_dir.mkdir(parents=True, exist_ok=True) - except OSError: - logger.exception("bible.findings_dir_mkdir_failed") - return - - # Determine the next finding number by scanning existing files - existing_nums = [] - for f in findings_dir.glob("*.md"): - match = re.match(r"^(\d{3})-", f.name) - if match: - existing_nums.append(int(match.group(1))) - next_num = (max(existing_nums) + 1) if existing_nums else 1 - - timestamp = datetime.now(UTC).strftime("%Y-%m-%d") - - for entry in new_entries: - if not entry or entry.startswith("("): - # Skip error placeholders - continue - slug = _slugify(entry)[:40] - filename = f"{next_num:03d}-{slug}.md" - content = _finding_markdown( - num=next_num, - title=entry, - body=entry, - run_id=run_id, - generation=generation, - timestamp=timestamp, - ) - try: - (findings_dir / filename).write_text(content) - except OSError: - logger.exception("bible.finding_write_failed", extra={"filename": filename}) - continue - next_num += 1 - - # Append to evolution log - log_path = BIBLE_DIR / "evolution-log.md" - try: - if log_path.exists(): - existing = log_path.read_text() - else: - existing = "# Evolution Log\n\n*Chronological log of all SkillForge evolution runs.*\n\n" - entry_line = f"- **{timestamp}** — run `{run_id[:8]}` gen {generation}: {len(new_entries)} new finding(s)\n" - log_path.write_text(existing + entry_line) - except OSError: - logger.exception("bible.evolution_log_write_failed") - - -def _slugify(text: str) -> str: - """Kebab-case a string for use in a filename.""" - slug = re.sub(r"[^a-z0-9]+", "-", text.lower()).strip("-") - return slug or "untitled" - - -def _finding_markdown( - num: int, - title: str, - body: str, - run_id: str, - generation: int, - timestamp: str, -) -> str: - """Render a finding markdown file per bible/README.md schema.""" - short_title = title.split(".")[0][:60] if "." 
in title else title[:60] - return f"""# Finding {num:03d}: {short_title} - -**Discovered**: {timestamp} -**Evolution Run**: {run_id} -**Generation**: {generation} -**Status**: finding - -## Observation - -{body} - -## Evidence - -Automatically extracted from the generation {generation} trait attribution -and trace analysis by the Breeder agent. See run `{run_id}` in the -SkillForge database for the raw scores and traces. - -## Mechanism - -*To be filled in if this finding replicates across 3+ runs and gets -promoted to a pattern.* - -## Recommendation - -*To be filled in upon promotion.* -""" diff --git a/skillforge/agents/breeder/__init__.py b/skillforge/agents/breeder/__init__.py new file mode 100644 index 0000000..c10ce3d --- /dev/null +++ b/skillforge/agents/breeder/__init__.py @@ -0,0 +1,69 @@ +"""Breeder — reflective mutation, multi-parent crossover, learning log, bible publishing. + +Inspired by GEPA's Actionable Side Information: mutations are diagnostic, +not random. The Breeder reads execution traces and trait attribution +from the judging pipeline, identifies root causes of failures, and +proposes targeted fixes. + +Responsibilities: +- Elitism: top N Skills survive unchanged +- Reflective crossover: combine traits from 2-3 parents guided by attribution +- Diagnostic mutation: fix specific causes surfaced by trait attribution +- Joint component mutation: frontmatter + body + scripts mutate together +- Wildcard: 1+ slots per generation for fresh Skills +- Learning log maintenance: append new lessons each generation +- Bible publishing: extract generalizable findings to ``bible/findings/`` + +Slot allocation scales with ``target_pop_size`` (never hardcoded; see +``_ranking.compute_slots`` for the formula). 
+ +Submodule layout: + + _ranking.py compute_slots + rank_skills + _aggregate_fitness (pure) + _prompts.py _build_diagnostic_instructions + _build_crossover_instructions + + _build_breeding_context (pure string-templating) + _reports.py _extract_lessons_and_report + _extract_lessons + + _extract_breeding_report + _extract_consolidated + (LLM-calling; degrades gracefully on SDK errors) + main.py breed() + _carry_elite (top-level orchestrator) + bible.py publish_findings_to_bible (disk I/O, fire-and-forget) +""" + +from __future__ import annotations + +# Re-expose imports the old breeder.py module aliased so test patches +# targeting ``skillforge.agents.breeder.breed_next_gen`` and +# ``skillforge.agents.breeder.BIBLE_DIR`` continue to resolve. +from skillforge.agents.breeder._ranking import ( + _aggregate_fitness, + compute_slots, + rank_skills, +) +from skillforge.agents.breeder._reports import ( + _extract_breeding_report, + _extract_consolidated, + _extract_lessons, + _extract_lessons_and_report, +) +from skillforge.agents.breeder.bible import publish_findings_to_bible +from skillforge.agents.breeder.main import _carry_elite, breed +from skillforge.agents.spawner import breed_next_gen, spawn_gen0 +from skillforge.config import BIBLE_DIR + +__all__ = [ + "breed", + "compute_slots", + "rank_skills", + "publish_findings_to_bible", + # Re-exports for test-patch stability. + "breed_next_gen", + "spawn_gen0", + "BIBLE_DIR", + # Private helpers re-exported for test access. + "_aggregate_fitness", + "_carry_elite", + "_extract_lessons_and_report", + "_extract_lessons", + "_extract_breeding_report", + "_extract_consolidated", +] diff --git a/skillforge/agents/breeder/_prompts.py b/skillforge/agents/breeder/_prompts.py new file mode 100644 index 0000000..8c0204a --- /dev/null +++ b/skillforge/agents/breeder/_prompts.py @@ -0,0 +1,89 @@ +"""Breeding-instruction prompt builders. + +Pure string-templating functions — no LLM calls, no I/O. 
The actual +breeding happens in ``main.breed()`` which feeds these prompts to the +Spawner. +""" + +from __future__ import annotations + +from skillforge.agents.breeder._ranking import _aggregate_fitness +from skillforge.models import SkillGenome + + +def _build_diagnostic_instructions( + low_scorers: list[SkillGenome], + learning_log: list[str], + n_children: int, +) -> str: + """Build breeding instructions for diagnostic mutation of low scorers.""" + if not low_scorers or n_children == 0: + return "" + + diagnoses = [] + for skill in low_scorers: + worst_traits = sorted( + skill.trait_attribution.items(), + key=lambda kv: kv[1], + )[:3] + trait_notes = "\n".join( + f" - {t}: contribution {c:.2f} — {skill.trait_diagnostics.get(t, 'no diagnosis')}" + for t, c in worst_traits + ) + diagnoses.append( + f" Skill {skill.id[:8]}:\n" + f" aggregate fitness: {_aggregate_fitness(skill):.2f}\n" + f" worst traits:\n{trait_notes}" + ) + + log_section = "\n".join(f" - {entry}" for entry in learning_log[-10:]) + + return ( + f"Produce exactly {n_children} child Skill(s) by DIAGNOSTIC MUTATION of the " + "low-scoring parent(s) below. For each child, identify the root cause of " + "the parent's low fitness (from the trait diagnostics), and make a TARGETED " + "fix — rewrite or remove the underperforming instructions, tighten vague " + "phrasing, add concrete examples for ignored rules, or rescope the trait.\n\n" + "Do NOT make random changes. 
Every mutation must cite a specific parent " + "trait and explain (in mutation_rationale) how the child addresses it.\n\n" + f"Low-scoring parents:\n{chr(10).join(diagnoses)}\n\n" + f"Recent lessons (learning log):\n{log_section or ' (none yet)'}" + ) + + +def _build_crossover_instructions( + parents: list[SkillGenome], + learning_log: list[str], + n_children: int, +) -> str: + """Build instructions for reflective crossover across 2-3 parents.""" + if not parents or n_children == 0: + return "" + + parent_notes = [] + for p in parents: + best_traits = sorted( + p.trait_attribution.items(), + key=lambda kv: kv[1], + reverse=True, + )[:3] + trait_summary = ", ".join(f"{t}:{c:+.2f}" for t, c in best_traits) or "(no attribution)" + parent_notes.append( + f" Parent {p.id[:8]} (fitness {_aggregate_fitness(p):.2f}): " + f"best traits → {trait_summary}" + ) + + log_section = "\n".join(f" - {entry}" for entry in learning_log[-10:]) + + return ( + f"Produce exactly {n_children} child Skill(s) by REFLECTIVE CROSSOVER of the " + f"Pareto-optimal parents below. Combine the HIGH-CONTRIBUTING traits from " + "each parent into each child, preserving the causal mechanism that made " + "each trait successful (not just the surface phrasing).\n\n" + "Crossover is NOT concatenation. For each child, explain (in mutation_rationale) " + "which traits from which parents were combined and WHY those particular " + "traits work together.\n\n" + f"Pareto-optimal parents:\n{chr(10).join(parent_notes)}\n\n" + f"Recent lessons (learning log):\n{log_section or ' (none yet)'}" + ) + diff --git a/skillforge/agents/breeder/_ranking.py b/skillforge/agents/breeder/_ranking.py new file mode 100644 index 0000000..2583e6d --- /dev/null +++ b/skillforge/agents/breeder/_ranking.py @@ -0,0 +1,79 @@ +"""Pure ranking helpers — slot allocation + fitness aggregation + sorting. + +No I/O, no LLM calls. 
Used by the main ``breed()`` orchestrator and by +``_build_breeding_context`` when it needs to format a ranked list. +""" + +from __future__ import annotations + +from skillforge.models import Generation, SkillGenome + + +def compute_slots(target_pop_size: int) -> dict[str, int]: + """Allocate breeding slots as a function of ``target_pop_size``. + + Formula (from PLAN.md §Step 6e Breeder): + + elitism = max(1, target_pop_size // 5 * 2) ~40% floor 1 + wildcards = max(1, target_pop_size // 10) ~10% floor 1 + remainder = target_pop_size - elitism - wildcards + diagnostic = remainder // 2 + crossover = remainder - diagnostic + + Worked examples: + pop_size=3 → elitism=1, wildcards=1, diagnostic=0, crossover=1 (sum 3) + pop_size=5 → elitism=2, wildcards=1, diagnostic=1, crossover=1 (sum 5) + pop_size=10 → elitism=4, wildcards=1, diagnostic=2, crossover=3 (sum 10) + """ + if target_pop_size < 1: + raise ValueError(f"target_pop_size must be >=1, got {target_pop_size}") + + elitism = max(1, (target_pop_size // 5) * 2) + wildcards = max(1, target_pop_size // 10) + + # Ensure elitism + wildcards doesn't exceed target (pathological tiny sizes) + if elitism + wildcards > target_pop_size: + elitism = max(1, target_pop_size - 1) + wildcards = max(0, target_pop_size - elitism) + + remainder = target_pop_size - elitism - wildcards + diagnostic = remainder // 2 + crossover = remainder - diagnostic + + slots = { + "elitism": elitism, + "wildcards": wildcards, + "diagnostic": diagnostic, + "crossover": crossover, + } + assert sum(slots.values()) == target_pop_size, ( + f"slot sum {sum(slots.values())} != target {target_pop_size}: {slots}" + ) + return slots + + +# --------------------------------------------------------------------------- +# Ranking +# --------------------------------------------------------------------------- + + +def _aggregate_fitness(skill: SkillGenome) -> float: + """Scalar aggregate of Pareto objectives for ranking (charts/selection). 
+ + The Pareto front is the real answer; this scalar is a summary for + ordering within the front (and for ranking Skills OFF the front). + """ + if not skill.pareto_objectives: + return 0.0 + return sum(skill.pareto_objectives.values()) / len(skill.pareto_objectives) + + +def rank_skills(generation: Generation) -> list[SkillGenome]: + """Return generation.skills sorted by (is_pareto_optimal desc, fitness desc).""" + return sorted( + generation.skills, + key=lambda s: (s.is_pareto_optimal, _aggregate_fitness(s)), + reverse=True, + ) + + diff --git a/skillforge/agents/breeder/_reports.py b/skillforge/agents/breeder/_reports.py new file mode 100644 index 0000000..1e89442 --- /dev/null +++ b/skillforge/agents/breeder/_reports.py @@ -0,0 +1,213 @@ +"""Breeder's reflection step — post-generation lessons + written report. + +Calls the LLM to distill what this generation revealed about trait +fitness and to write a paragraph explaining the breeding decisions. +Degrades gracefully on SDK errors (see docs/clean-code.md §4). +""" + +from __future__ import annotations + +import json +import logging +import re + +from anthropic import AsyncAnthropic + +from skillforge.agents._llm import stream_text +from skillforge.agents.breeder._ranking import _aggregate_fitness +from skillforge.config import ANTHROPIC_API_KEY, BREEDER_CALL_MODE, model_for +from skillforge.models import Generation, SkillGenome + +logger = logging.getLogger("skillforge.agents.breeder.reports") + + +async def _extract_lessons_and_report( + generation: Generation, + learning_log: list[str], + slots: dict[str, int], + elites: list[SkillGenome], + pareto_parents: list[SkillGenome], +) -> tuple[list[str], str]: + """Ask the LLM for (a) new learning log entries and (b) a breeding report. 
+ + Dispatches on ``config.BREEDER_CALL_MODE``: + - "separate" (default): two LLM calls, one for lessons, one for report + - "consolidated" (Flex-3 cost saver): one structured call returning both + """ + context = _build_breeding_context(generation, slots, elites, pareto_parents) + + if BREEDER_CALL_MODE == "consolidated": + return await _extract_consolidated(context, learning_log) + lessons = await _extract_lessons(context, learning_log) + report = await _extract_breeding_report(context, slots, elites, pareto_parents) + return lessons, report + + +def _build_breeding_context( + generation: Generation, + slots: dict[str, int], + elites: list[SkillGenome], + pareto_parents: list[SkillGenome], +) -> str: + """Summarize this generation's results for the Breeder's LLM prompts.""" + elite_section = "\n".join( + f" - {s.id[:8]} fitness={_aggregate_fitness(s):.2f} traits={s.traits[:3]}" + for s in elites + ) or " (none)" + + pareto_section = "\n".join( + f" - {s.id[:8]} fitness={_aggregate_fitness(s):.2f}" + for s in pareto_parents + ) or " (none)" + + # Top 3 trait contributions across all results + all_traits: dict[str, list[float]] = {} + for r in generation.results: + for trait, contrib in r.trait_contribution.items(): + all_traits.setdefault(trait, []).append(contrib) + trait_means = sorted( + [(t, sum(vs) / len(vs)) for t, vs in all_traits.items()], + key=lambda kv: kv[1], + reverse=True, + ) + top_traits = "\n".join( + f" - {t}: {m:+.2f} (from trace attribution)" for t, m in trait_means[:5] + ) or " (no trait data)" + + return ( + f"Generation {generation.number} summary:\n" + f" population: {len(generation.skills)}\n" + f" best_fitness: {generation.best_fitness:.3f}\n" + f" avg_fitness: {generation.avg_fitness:.3f}\n" + f" pareto_front_size: {len(generation.pareto_front)}\n" + f"\n" + f"Slot allocation for next gen: {slots}\n" + f"\n" + f"Elites (carrying forward):\n{elite_section}\n" + f"\n" + f"Pareto-optimal parents selected for crossover:\n{pareto_section}\n" 
+ f"\n" + f"Top-contributing traits this generation:\n{top_traits}\n" + ) + + +async def _extract_lessons(context: str, learning_log: list[str]) -> list[str]: + """Single LLM call extracting generalizable lessons as a JSON array.""" + recent_log = "\n".join(f"- {e}" for e in learning_log[-10:]) + + prompt = ( + "You are the Breeder agent for a population-based evolution of Claude Agent Skills. " + "Based on the generation summary below, identify 1-3 NEW generalizable lessons " + "about Skill authoring that this generation revealed. Do NOT repeat lessons from " + "the existing learning log. Lessons should be actionable for future breeding, " + "generic enough to apply across domains, and grounded in the trait attribution data.\n\n" + f"## Generation summary\n{context}\n\n" + f"## Existing learning log (don't repeat these)\n{recent_log or '(empty)'}\n\n" + "## Response format\n" + 'Respond with ONLY a JSON array of 1-3 strings, like ["lesson 1", "lesson 2"]. ' + "No prose before or after." + ) + + try: + client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=300.0) + text = await stream_text( + client, + model=model_for("breeder"), + max_tokens=500, + messages=[{"role": "user", "content": prompt}], + ) + except Exception: + # Degrade gracefully — a breeder that blocks on LLM hiccups would + # stall the whole run. The SDK has many concrete error types across + # versions; catching at the boundary keeps the engine moving. 
+ logger.exception("breeder.lesson_extraction_failed") + return ["(lesson extraction failed)"] + + match = re.search(r"\[.*\]", text, re.DOTALL) + if not match: + return [] + try: + lessons = json.loads(match.group(0)) + except json.JSONDecodeError: + return [] + return [str(lesson) for lesson in lessons if isinstance(lesson, str)][:3] + + +async def _extract_breeding_report( + context: str, + slots: dict[str, int], + elites: list[SkillGenome], + pareto_parents: list[SkillGenome], +) -> str: + """Single LLM call producing a human-readable breeding report.""" + prompt = ( + "You are the Breeder agent for SkillForge. Write a 2-paragraph breeding report " + "explaining the decisions for the next generation. Paragraph 1: what this " + "generation revealed about trait fitness and which skills earned elite/Pareto " + "status. Paragraph 2: the strategy for the next generation's diagnostic " + "mutations and crossovers. Be specific, cite skill IDs by their 8-char prefix, " + "and reference trait contributions when they shaped a decision.\n\n" + f"## Generation summary\n{context}\n\n" + "Respond with ONLY the report prose. No headings." + ) + + try: + client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=300.0) + return await stream_text( + client, + model=model_for("breeder"), + max_tokens=800, + messages=[{"role": "user", "content": prompt}], + ) + except Exception: + # Degrade gracefully — see _extract_lessons for rationale. + logger.exception("breeder.report_extraction_failed") + return "(breeding report failed)" + + +async def _extract_consolidated( + context: str, + learning_log: list[str], +) -> tuple[list[str], str]: + """Flex-3 cost saver: one LLM call produces both lessons and report as JSON.""" + recent_log = "\n".join(f"- {e}" for e in learning_log[-10:]) + + prompt = ( + "You are the Breeder agent for SkillForge. 
Given the generation summary below, " + "produce BOTH: (1) 1-3 NEW generalizable lessons about Skill authoring, and " + "(2) a 2-paragraph breeding report explaining the decisions.\n\n" + f"## Generation summary\n{context}\n\n" + f"## Existing learning log (don't repeat)\n{recent_log or '(empty)'}\n\n" + "## Response format\n" + "Respond with ONLY a JSON object matching:\n" + '{\n' + ' "lessons": ["lesson 1", "lesson 2"],\n' + ' "report": "Paragraph 1...\\n\\nParagraph 2..."\n' + '}\n' + "No prose before or after the JSON." + ) + + try: + client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=300.0) + text = await stream_text( + client, + model=model_for("breeder"), + max_tokens=1200, + messages=[{"role": "user", "content": prompt}], + ) + except Exception: + # Degrade gracefully — see _extract_lessons for rationale. + logger.exception("breeder.consolidated_extraction_failed") + return (["(consolidated extraction failed)"], "") + + match = re.search(r"\{.*\}", text, re.DOTALL) + if not match: + return ([], "") + try: + raw = json.loads(match.group(0)) + except json.JSONDecodeError: + return ([], "") + + lessons = [str(entry) for entry in raw.get("lessons", []) if isinstance(entry, str)][:3] + report = str(raw.get("report", "")) + return (lessons, report) diff --git a/skillforge/agents/breeder/bible.py b/skillforge/agents/breeder/bible.py new file mode 100644 index 0000000..5ff5fd1 --- /dev/null +++ b/skillforge/agents/breeder/bible.py @@ -0,0 +1,139 @@ +"""Write learning-log entries out to ``bible/findings/`` on disk. + +All I/O lives here; the caller just passes in the new entries + run +metadata and expects best-effort persistence. Failures are logged, +never raised — a bible write must not abort an evolution run. 
+""" + +from __future__ import annotations + +import logging +import re +from datetime import UTC, datetime + +logger = logging.getLogger("skillforge.agents.breeder.bible") + + +def _resolve_bible_dir(): + """Look up BIBLE_DIR through the breeder package's namespace. + + The test suite patches ``skillforge.agents.breeder.BIBLE_DIR`` to + redirect writes to a tmp_path fixture. Reading the attribute fresh + each call (instead of binding at import time) keeps that patch + observable after the monolithic module was split into a package. + """ + from skillforge.agents import breeder as _pkg + + return _pkg.BIBLE_DIR + + +def publish_findings_to_bible( + new_entries: list[str], + run_id: str, + generation: int, +) -> None: + """Write new learning-log entries as numbered finding files under bible/findings/. + + Each finding gets its own file following the schema in bible/README.md. + Also appends a summary line to bible/evolution-log.md. + + Failures here are logged but never raised — we don't want a bible write + failure to abort an evolution run. 
+ """ + if not new_entries: + return + + bible_dir = _resolve_bible_dir() + findings_dir = bible_dir / "findings" + try: + findings_dir.mkdir(parents=True, exist_ok=True) + except OSError: + logger.exception("bible.findings_dir_mkdir_failed") + return + + # Determine the next finding number by scanning existing files + existing_nums = [] + for f in findings_dir.glob("*.md"): + match = re.match(r"^(\d{3})-", f.name) + if match: + existing_nums.append(int(match.group(1))) + next_num = (max(existing_nums) + 1) if existing_nums else 1 + + timestamp = datetime.now(UTC).strftime("%Y-%m-%d") + + for entry in new_entries: + if not entry or entry.startswith("("): + # Skip error placeholders + continue + slug = _slugify(entry)[:40] + filename = f"{next_num:03d}-{slug}.md" + content = _finding_markdown( + num=next_num, + title=entry, + body=entry, + run_id=run_id, + generation=generation, + timestamp=timestamp, + ) + try: + (findings_dir / filename).write_text(content) + except OSError: + logger.exception("bible.finding_write_failed", extra={"filename": filename}) + continue + next_num += 1 + + # Append to evolution log + log_path = bible_dir / "evolution-log.md" + try: + if log_path.exists(): + existing = log_path.read_text() + else: + existing = "# Evolution Log\n\n*Chronological log of all SkillForge evolution runs.*\n\n" + entry_line = f"- **{timestamp}** — run `{run_id[:8]}` gen {generation}: {len(new_entries)} new finding(s)\n" + log_path.write_text(existing + entry_line) + except OSError: + logger.exception("bible.evolution_log_write_failed") + + +def _slugify(text: str) -> str: + """Kebab-case a string for use in a filename.""" + slug = re.sub(r"[^a-z0-9]+", "-", text.lower()).strip("-") + return slug or "untitled" + + +def _finding_markdown( + num: int, + title: str, + body: str, + run_id: str, + generation: int, + timestamp: str, +) -> str: + """Render a finding markdown file per bible/README.md schema.""" + short_title = title.split(".")[0][:60] if "." 
in title else title[:60] + return f"""# Finding {num:03d}: {short_title} + +**Discovered**: {timestamp} +**Evolution Run**: {run_id} +**Generation**: {generation} +**Status**: finding + +## Observation + +{body} + +## Evidence + +Automatically extracted from the generation {generation} trait attribution +and trace analysis by the Breeder agent. See run `{run_id}` in the +SkillForge database for the raw scores and traces. + +## Mechanism + +*To be filled in if this finding replicates across 3+ runs and gets +promoted to a pattern.* + +## Recommendation + +*To be filled in upon promotion.* +""" diff --git a/skillforge/agents/breeder/main.py b/skillforge/agents/breeder/main.py new file mode 100644 index 0000000..5de2acf --- /dev/null +++ b/skillforge/agents/breeder/main.py @@ -0,0 +1,146 @@ +"""Main breed() orchestrator — allocate slots, run subagents, pad + return. + +``breed_next_gen`` / ``spawn_gen0`` / ``_extract_lessons_and_report`` are +resolved through the package namespace at call time (not bound at import) +so that tests which ``patch("skillforge.agents.breeder.breed_next_gen")`` +still intercept the call after the monolithic module was split. +""" + +from __future__ import annotations + +import logging + +from skillforge.agents.breeder._prompts import ( + _build_crossover_instructions, + _build_diagnostic_instructions, +) +from skillforge.agents.breeder._ranking import compute_slots, rank_skills +from skillforge.models import Generation, SkillGenome + +logger = logging.getLogger("skillforge.agents.breeder") + + +def _pkg(): + """Return the breeder package so attribute lookups honor test patches.""" + from skillforge.agents import breeder as _breeder_pkg + + return _breeder_pkg + + +async def breed( + generation: Generation, + learning_log: list[str], + specialization: str, + target_pop_size: int, +) -> tuple[list[SkillGenome], list[str], str]: + """Produce the next generation from a ranked current generation. 
+ + Returns ``(next_gen_skills, new_learning_log_entries, breeding_report)``. + + The slot allocation scales with ``target_pop_size`` — see ``compute_slots``. + The function guarantees ``len(next_gen_skills) == target_pop_size``. + """ + slots = compute_slots(target_pop_size) + ranked = rank_skills(generation) + + next_gen: list[SkillGenome] = [] + + # --- Elitism: top-N survive unchanged (but bump generations_survived) --- + elites = ranked[: slots["elitism"]] + for elite in elites: + carried = _carry_elite(elite) + next_gen.append(carried) + + # --- Diagnostic mutation: pick low-scoring Skills, ask LLM for targeted fixes --- + low_scorers = ranked[-slots["diagnostic"] :] if slots["diagnostic"] > 0 else [] + diagnostic_instructions = _build_diagnostic_instructions( + low_scorers, learning_log, slots["diagnostic"] + ) + if slots["diagnostic"] > 0 and low_scorers: + try: + diagnostic_children = await _pkg().breed_next_gen( + parents=low_scorers, + learning_log=learning_log, + breeding_instructions=diagnostic_instructions, + ) + next_gen.extend(diagnostic_children[: slots["diagnostic"]]) + except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed + # Fall through — wildcard slots below absorb the shortfall. 
+ logger.exception("breeder.diagnostic_failed") + + # --- Reflective crossover: combine 2-3 Pareto-optimal parents --- + pareto_parents = [s for s in ranked if s.is_pareto_optimal][:3] + if not pareto_parents: + # Fallback: use top 3 by fitness if nobody is Pareto-optimal + pareto_parents = ranked[:3] + + crossover_instructions = _build_crossover_instructions( + pareto_parents, learning_log, slots["crossover"] + ) + if slots["crossover"] > 0 and pareto_parents: + try: + crossover_children = await _pkg().breed_next_gen( + parents=pareto_parents, + learning_log=learning_log, + breeding_instructions=crossover_instructions, + ) + next_gen.extend(crossover_children[: slots["crossover"]]) + except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed + logger.exception("breeder.crossover_failed") + + # --- Wildcard: fresh Skills via spawn_gen0 --- + if slots["wildcards"] > 0: + try: + wildcards = await _pkg().spawn_gen0( + specialization=specialization, + pop_size=slots["wildcards"], + ) + # Mark wildcards as mutations on the next generation + next_gen_num = generation.number + 1 + for w in wildcards: + w.generation = next_gen_num + w.mutations = ["wildcard"] + w.mutation_rationale = "Wildcard slot: fresh spawn to prevent convergence" + next_gen.extend(wildcards) + except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed + logger.exception("breeder.wildcard_spawn_failed") + + # --- Trim or pad to exactly target_pop_size --- + next_gen = next_gen[:target_pop_size] + + # If we fell short (any slot failed), pad with elites cloned forward + while len(next_gen) < target_pop_size and ranked: + next_gen.append(_carry_elite(ranked[0])) + + assert len(next_gen) == target_pop_size, ( + f"breeder produced {len(next_gen)} children, expected {target_pop_size}" + ) + + # --- Stamp generation number on everything --- + next_gen_num = generation.number + 1 + for child in next_gen: + child.generation = 
next_gen_num + + # --- Extract new learning log entries + write breeding report --- + new_lessons, breeding_report = await _pkg()._extract_lessons_and_report( + generation, learning_log, slots, elites, pareto_parents + ) + + return (next_gen, new_lessons, breeding_report) + + +def _carry_elite(skill: SkillGenome) -> SkillGenome: + """Return an elite skill carried forward with bumped metadata.""" + import copy + + carried = copy.deepcopy(skill) + carried.generations_survived += 1 + carried.mutations = ["elitism"] + carried.mutation_rationale = "Elitism: top-ranked parent carried forward unchanged" + # Bump maturity if the skill is surviving well + if carried.generations_survived >= 3 and carried.maturity == "tested": + carried.maturity = "hardened" + elif carried.generations_survived >= 2 and carried.maturity == "draft": + carried.maturity = "tested" + return carried + From e73dcfc896cbdb18b9e28bced5bb3cfee6cc3bf9 Mon Sep 17 00:00:00 2001 From: "Matt (via Claude Code)" Date: Mon, 20 Apr 2026 02:04:21 -0500 Subject: [PATCH 4/4] refactor: split spawner.py (763 LOC) into a package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Decomposed along the pure-planner / thin-I/O-shell seam called out in docs/clean-code.md §7. Four submodules: spawner/__init__.py barrel — re-exports four entry points plus every helper tests patch on the package root (_generate, _read_bible_patterns, BIBLE_DIR, ...) spawner/_helpers.py _generate (LLM streaming) + _parse_genomes + _auto_repair_missing_references + _validate_genomes + _read_bible_patterns spawner/_prompts.py all _build_*_system_prompt string templates + embedded JSON schema constants (pure — no I/O, no LLM calls) spawner/main.py four public entry points: spawn_gen0, breed_next_gen, spawn_from_parent, spawn_variant_gen0 Largest submodule is main.py at 411 LOC, under the 500-LOC ceiling. 
Test-patch compatibility ------------------------ Same pattern as the breeder split: tests patch ``spawner._generate``, ``spawner._read_bible_patterns``, and ``spawner.BIBLE_DIR`` on the package root. Those patches do not propagate to direct imports in submodules, so ``main._generate`` / ``main._read_bible_patterns`` and ``_helpers._read_bible_patterns`` now resolve the reference through the package namespace at call time. Without these shims the test suite made real LLM calls for 11 minutes before first failure — the fix is load-bearing for both test speed and API-cost safety. QA: ruff + mypy (83 files) + 411 pytest all green. Co-Authored-By: Claude Opus 4.7 (1M context) --- skillforge/agents/spawner.py | 763 -------------------------- skillforge/agents/spawner/__init__.py | 54 ++ skillforge/agents/spawner/_helpers.py | 187 +++++++ skillforge/agents/spawner/_prompts.py | 217 ++++++++ skillforge/agents/spawner/main.py | 411 ++++++++++++++ 5 files changed, 869 insertions(+), 763 deletions(-) delete mode 100644 skillforge/agents/spawner.py create mode 100644 skillforge/agents/spawner/__init__.py create mode 100644 skillforge/agents/spawner/_helpers.py create mode 100644 skillforge/agents/spawner/_prompts.py create mode 100644 skillforge/agents/spawner/main.py diff --git a/skillforge/agents/spawner.py b/skillforge/agents/spawner.py deleted file mode 100644 index 096bbc7..0000000 --- a/skillforge/agents/spawner.py +++ /dev/null @@ -1,763 +0,0 @@ -"""Spawner — creates gen 0 populations and breeds next generations. - -Gen 0: reads the golden template from ``config.GOLDEN_TEMPLATE_DIR`` and -``bible/patterns/*.md``, generates ``pop_size`` diverse Skills varying content -while preserving structure. - -Gen 1+: takes parent genomes + breeding instructions from the Breeder and -produces child Skills. The Spawner MUST enforce all authoring constraints -from ``engine.sandbox.validate_skill_structure``. 
- -Uses the Anthropic Messages API directly (NOT the Agent SDK's query()) because -this is a pure generation task with no tool use. The Agent SDK's query() is -for agentic loops with tools and hung the overnight live test. -""" - -from __future__ import annotations - -import re -import uuid - -from anthropic import AsyncAnthropic - -from skillforge.agents._json import extract_json_array -from skillforge.config import ANTHROPIC_API_KEY, BIBLE_DIR, GOLDEN_TEMPLATE_DIR, model_for -from skillforge.engine.sandbox import validate_skill_structure -from skillforge.errors import ParseError -from skillforge.models import SkillGenome - -# JSON schema for spawner responses -_SPAWN_SCHEMA_DESCRIPTION = """[ - { - "name": "kebab-case-name", - "skill_md_content": "---\\nname: ...\\n---\\n\\n# Skill\\n\\n...", - "supporting_files": {"scripts/validate.sh": "#!/bin/bash\\n..."}, - "traits": ["imperative-phrasing", "tests-first"], - "meta_strategy": "plan-first TDD" - } -]""" - -_BREED_SCHEMA_DESCRIPTION = """[ - { - "name": "kebab-case-name", - "skill_md_content": "---\\nname: ...\\n---\\n\\n# Skill\\n\\n...", - "supporting_files": {"scripts/validate.sh": "#!/bin/bash\\n..."}, - "traits": ["imperative-phrasing", "tests-first"], - "meta_strategy": "plan-first TDD", - "parent_ids": ["uuid-1", "uuid-2"], - "mutations": ["changed-meta-strategy", "added-examples"], - "mutation_rationale": "Switched to TDD-first based on parent attribution data" - } -]""" - - -def _read_bible_patterns() -> str: - """Concatenate all .md files under BIBLE_DIR/patterns in sorted order. - - Returns empty string if the directory doesn't exist or is empty. 
- """ - patterns_dir = BIBLE_DIR / "patterns" - if not patterns_dir.exists(): - return "" - - parts: list[str] = [] - for p in sorted(patterns_dir.glob("*.md")): - try: - parts.append(p.read_text()) - except (OSError, UnicodeDecodeError): - continue - - return "\n\n---\n\n".join(parts) - - -def _extract_response_text(response) -> str: - """Extract text from an Anthropic Messages API response. - - The response's ``content`` is a list of content blocks; extract any - that have a ``.text`` attribute. - """ - if not response.content: - return "" - parts: list[str] = [] - for block in response.content: - text = getattr(block, "text", None) - if text: - parts.append(text) - return "\n".join(parts) - - -def _save_debug_response(label: str, text: str) -> None: - """Write the last raw LLM response to /tmp for post-hoc debugging. - - Non-fatal — any write error is silently swallowed. This is for diagnosing - parse failures during live runs; in production the text is ephemeral. - """ - try: - from pathlib import Path - - path = Path("/tmp") / f"sf-{label}.txt" - path.write_text(text) - except OSError: - pass - - -def _parse_genomes( - raw: list[dict], - generation: int, - parent_ids: list[str] | None = None, -) -> list[SkillGenome]: - """Convert raw dicts from Claude's response into SkillGenome objects.""" - genomes: list[SkillGenome] = [] - for item in raw: - genome = SkillGenome( - id=str(uuid.uuid4()), - generation=generation, - skill_md_content=item.get("skill_md_content", ""), - supporting_files=item.get("supporting_files", {}), - traits=item.get("traits", []), - meta_strategy=item.get("meta_strategy", ""), - parent_ids=parent_ids or item.get("parent_ids", []), - mutations=item.get("mutations", []), - mutation_rationale=item.get("mutation_rationale", ""), - maturity="draft", - ) - genomes.append(genome) - return genomes - - -async def _generate(prompt: str) -> str: - """Streaming Anthropic API call. Returns the full assistant text response. 
- - The Spawner generates structured JSON output containing multiple SKILL.md - files (up to ~5KB per skill × pop_size = 25KB+ at pop_size=5). Non-streaming - requests get server-disconnected around the 3-4 minute mark on prompts this - size. Streaming keeps the connection alive via incremental chunks and handles - long generations reliably. - - ``max_tokens`` is set to 32000 to fit a full population of rich SKILL.md - files with supporting scripts. Claude Sonnet 4.6 supports up to 64K output - tokens in streaming mode; 32K is plenty for realistic populations while - keeping a sane ceiling. - """ - client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=600.0) - parts: list[str] = [] - async with client.messages.stream( - model=model_for("spawner"), - max_tokens=32000, - messages=[{"role": "user", "content": prompt}], - ) as stream: - async for text in stream.text_stream: - parts.append(text) - return "".join(parts) - - -# Pulls ${CLAUDE_SKILL_DIR}/ references out of a SKILL.md body. -# Must match the regex in ``engine.sandbox.validate_skill_structure`` rule 8 -# exactly — see that function for the source-of-truth behavior. -_REF_PATH_RE = re.compile(r"\$\{CLAUDE_SKILL_DIR\}/([^\s`)\"']+)") - - -def _auto_repair_missing_references(genome: SkillGenome) -> int: - """Stub out ``${CLAUDE_SKILL_DIR}/`` refs missing from supporting_files. - - Cheap-tier Haiku routinely emits SKILL.md bodies that reference - ``references/*-guide.md`` in prose but forget to include the file in - ``supporting_files``. Validator rule 8 rejects those genomes, which - in atomic mode (pop=2, 1 retry) was killing the whole run 1-of-3 times. - - Rather than burn another LLM call on a retry that often reproduces - the same oversight, we stub each missing reference with a minimal - placeholder. The skill still renders, the reference still resolves - at runtime, and the genome passes validation. 
The Breeder can flesh - out the stubs in later generations if fitness signal suggests they - carry weight. - - Returns the count of paths that were stubbed (0 if everything already - resolved, which is the expected Sonnet-tier case). - """ - stubbed = 0 - for match in _REF_PATH_RE.finditer(genome.skill_md_content): - rel_path = match.group(1).rstrip(".,;:)") - if rel_path in genome.supporting_files: - continue - filename = rel_path.rsplit("/", 1)[-1] - placeholder_title = filename.removesuffix(".md").replace("-", " ").title() - genome.supporting_files[rel_path] = ( - f"# {placeholder_title}\n\n" - f"_Placeholder — stubbed by the spawner's auto-repair pass " - f"because the generating LLM referenced this file but did not " - f"emit its contents. Replace with domain-specific material " - f"during a later generation._\n" - ) - stubbed += 1 - return stubbed - - -def _validate_genomes( - genomes: list[SkillGenome], -) -> tuple[list[SkillGenome], dict[int, list[str]]]: - """Validate each genome; returns (valid_genomes, {idx: violations}). - - Runs the reference-path auto-repair pass before validation so cheap-tier - LLM drift on rule 8 (missing supporting_files entries) doesn't kill a - whole population. The repair only adds files; it never touches the - skill_md body. - """ - valid: list[SkillGenome] = [] - invalid: dict[int, list[str]] = {} - for i, genome in enumerate(genomes): - _auto_repair_missing_references(genome) - violations = validate_skill_structure(genome) - if violations: - invalid[i] = violations - else: - valid.append(genome) - return valid, invalid - - -def _build_spawn_system_prompt( - specialization: str, - pop_size: int, - template: str, - bible_patterns: str, -) -> str: - """Build the system prompt for gen 0 spawn.""" - bible_section = ( - f"\n\n## Validated Patterns (apply these)\n\n{bible_patterns}" - if bible_patterns - else "" - ) - return ( - f"You are a Skill author for the Claude Agent SDK. 
Your task is to generate " - f"{pop_size} DIVERSE candidate Skills for the following specialization:\n\n" - f"SPECIALIZATION: {specialization}\n\n" - "Each Skill must:\n" - "1. Follow the exact YAML frontmatter + markdown structure of the template below\n" - "2. Include 'Use when' in the first 250 chars of the description\n" - "3. Have a name matching the regex ^[a-z0-9]+(-[a-z0-9]+)*$\n" - "4. Contain at least 2 example blocks (**Example or ## Example)\n" - "5. Keep the body under 500 lines\n" - "6. Have a description under 1024 characters\n" - "7. NOT use 'anthropic' or 'claude' in the name\n" - "8. Only reference paths in ${CLAUDE_SKILL_DIR}/... that are included in supporting_files\n\n" - "## Golden Template\n\n" - f"{template}" - f"{bible_section}\n\n" - f"Return ONLY a JSON array of exactly {pop_size} skill objects. " - "No prose before or after — ONLY the JSON array. Use this schema:\n" - f"{_SPAWN_SCHEMA_DESCRIPTION}\n" - "Vary the approach, strategy, instruction style, and examples across all skills " - "while preserving the template structure." 
- ) - - -def _build_breed_system_prompt( - parents: list[SkillGenome], - learning_log: list[str], - breeding_instructions: str, - bible_patterns: str, -) -> str: - """Build the system prompt for next-gen breeding.""" - bible_section = ( - f"\n\n## Validated Patterns\n\n{bible_patterns}" if bible_patterns else "" - ) - - parents_section = "\n\n".join( - f"### Parent {i + 1} (id: {p.id})\n" - f"**Traits**: {p.traits}\n" - f"**Meta-strategy**: {p.meta_strategy}\n" - f"**Trait attribution**: {p.trait_attribution}\n" - f"**Trait diagnostics**: {p.trait_diagnostics}\n\n" - f"**SKILL.md content**:\n```\n{p.skill_md_content}\n```" - for i, p in enumerate(parents) - ) - - learning_section = ( - "\n".join(f"- {entry}" for entry in learning_log) - if learning_log - else "(no entries yet)" - ) - - return ( - "You are a Skill evolutionary breeder for the Claude Agent SDK.\n\n" - "## Breeding Instructions (from Breeder agent)\n\n" - f"{breeding_instructions}\n\n" - "## Parent Skills\n\n" - f"{parents_section}\n\n" - "## Learning Log (failures and lessons from all prior generations)\n\n" - f"{learning_section}" - f"{bible_section}\n\n" - "## Rules for child Skills\n" - "1. Follow YAML frontmatter + markdown structure of the parents\n" - "2. Include 'Use when' in first 250 chars of description\n" - "3. Name must match ^[a-z0-9]+(-[a-z0-9]+)*$ and NOT contain 'anthropic' or 'claude'\n" - "4. At least 2 example blocks (**Example or ## Example)\n" - "5. Body under 500 lines, description under 1024 characters\n" - "6. Only reference ${CLAUDE_SKILL_DIR}/... paths that are in supporting_files\n\n" - "Return ONLY a JSON array of child skill objects. 
Use this schema:\n" - f"{_BREED_SCHEMA_DESCRIPTION}" - ) - - -def _build_repair_prompt( - original_prompt: str, - violations_by_idx: dict[int, list[str]], - genomes: list[SkillGenome], -) -> str: - """Build a reprompt asking Claude to fix specific violations.""" - violation_lines: list[str] = [] - for idx, viols in violations_by_idx.items(): - genome_name = genomes[idx].skill_md_content[:50].replace("\n", " ") - violation_lines.append( - f"Skill index {idx} ({genome_name!r}): {'; '.join(viols)}" - ) - violations_str = "\n".join(violation_lines) - - return ( - "Your previous response contained invalid Skills. " - "Fix the following violations and return a corrected JSON array:\n\n" - f"{violations_str}\n\n" - "Return ONLY the complete corrected JSON array — all skills, not just the fixed ones." - ) - - -async def spawn_gen0(specialization: str, pop_size: int) -> list[SkillGenome]: - """Generate ``pop_size`` diverse gen 0 Skills for the specialization. - - Args: - specialization: Description of the Skill domain. - pop_size: Number of candidate Skills to generate. - - Returns: - A list of ``pop_size`` validated SkillGenome objects at generation 0. - - Raises: - ValueError: if Skills remain invalid after 1 retry. - """ - template = (GOLDEN_TEMPLATE_DIR / "SKILL.md").read_text() - bible_patterns = _read_bible_patterns() - - system_prompt = _build_spawn_system_prompt( - specialization, pop_size, template, bible_patterns - ) - - # Attempt 1 - text = await _generate(system_prompt) - _save_debug_response("spawn_gen0_attempt1", text) - - try: - raw = extract_json_array(text) - genomes = _parse_genomes(raw, generation=0) - valid_genomes, invalid = _validate_genomes(genomes) - first_attempt_failed = False - except (ValueError, ParseError): - # JSON parse failure — treat as if everything was invalid so the - # retry path runs. 
- genomes = [] - valid_genomes = [] - invalid = {} - first_attempt_failed = True - - if not first_attempt_failed and not invalid: - return valid_genomes - - # Attempt 2 — retry. Use the same prompt if JSON parse failed (Claude - # just didn't follow instructions), or a targeted repair prompt if - # the skills parsed but failed validation. - if first_attempt_failed: - retry_prompt = ( - system_prompt - + "\n\nCRITICAL: Your previous response did not contain a valid JSON " - "array. You must respond with ONLY a JSON array — no prose, no " - "markdown before or after the array. The array must start with [ " - "and end with ]. No explanations." - ) - else: - retry_prompt = _build_repair_prompt(system_prompt, invalid, genomes) - - text = await _generate(retry_prompt) - _save_debug_response("spawn_gen0_attempt2", text) - - try: - raw2 = extract_json_array(text) - except (ValueError, ParseError) as exc: - raise ValueError( - f"spawner failed to produce valid JSON on retry: {exc}. " - f"See /tmp/sf-spawn_gen0_attempt2.txt for the raw response." - ) from exc - - genomes2 = _parse_genomes(raw2, generation=0) - valid_genomes2, still_invalid = _validate_genomes(genomes2) - - if still_invalid: - all_violations = [ - f"skill {i}: {'; '.join(v)}" for i, v in still_invalid.items() - ] - raise ValueError( - "spawner produced invalid skills after retry: " - + "; ".join(all_violations) - ) - - return valid_genomes2 - - -async def breed_next_gen( - parents: list[SkillGenome], - learning_log: list[str], - breeding_instructions: str, -) -> list[SkillGenome]: - """Produce a child population from parents + Breeder's instructions. - - Args: - parents: Parent SkillGenome objects (with trait_attribution populated). - learning_log: Accumulated lessons from all prior generations. - breeding_instructions: Free-text directives from the Breeder agent. - - Returns: - A list of validated child SkillGenome objects at generation+1. - - Raises: - ValueError: if children remain invalid after 1 retry. 
- """ - bible_patterns = _read_bible_patterns() - parent_ids = [p.id for p in parents] - next_generation = (parents[0].generation + 1) if parents else 1 - - system_prompt = _build_breed_system_prompt( - parents, learning_log, breeding_instructions, bible_patterns - ) - - # Attempt 1 - text = await _generate(system_prompt) - - try: - raw = extract_json_array(text) - except (ValueError, ParseError) as exc: - raise ValueError( - f"spawner breed_next_gen failed to produce valid JSON: {exc}" - ) from exc - - # Parse with generation and parent_ids from raw (each child should specify its own parent_ids) - children: list[SkillGenome] = [] - for item in raw: - child = SkillGenome( - id=str(uuid.uuid4()), - generation=next_generation, - skill_md_content=item.get("skill_md_content", ""), - supporting_files=item.get("supporting_files", {}), - traits=item.get("traits", []), - meta_strategy=item.get("meta_strategy", ""), - parent_ids=item.get("parent_ids", parent_ids), - mutations=item.get("mutations", []), - mutation_rationale=item.get("mutation_rationale", ""), - maturity="draft", - ) - children.append(child) - - valid_children, invalid = _validate_genomes(children) - - if not invalid: - return valid_children - - # Attempt 2 — repair - repair_prompt = _build_repair_prompt(system_prompt, invalid, children) - text = await _generate(repair_prompt) - - try: - raw2 = extract_json_array(text) - except (ValueError, ParseError) as exc: - raise ValueError( - f"spawner breed_next_gen failed to produce valid JSON on retry: {exc}" - ) from exc - - children2: list[SkillGenome] = [] - for item in raw2: - child = SkillGenome( - id=str(uuid.uuid4()), - generation=next_generation, - skill_md_content=item.get("skill_md_content", ""), - supporting_files=item.get("supporting_files", {}), - traits=item.get("traits", []), - meta_strategy=item.get("meta_strategy", ""), - parent_ids=item.get("parent_ids", parent_ids), - mutations=item.get("mutations", []), - 
mutation_rationale=item.get("mutation_rationale", ""), - maturity="draft", - ) - children2.append(child) - - valid_children2, still_invalid = _validate_genomes(children2) - - if still_invalid: - all_violations = [ - f"skill {i}: {'; '.join(v)}" for i, v in still_invalid.items() - ] - raise ValueError( - "spawner produced invalid skills after retry: " - + "; ".join(all_violations) - ) - - return valid_children2 - - -# --------------------------------------------------------------------------- -# spawn_from_parent — gen 0 from an existing Skill (seed fork or upload) -# --------------------------------------------------------------------------- - -async def spawn_from_parent( - parent: SkillGenome, - pop_size: int, -) -> list[SkillGenome]: - """Generate a gen 0 population using an existing Skill as the seed parent. - - The parent itself is carried forward as the elite (slot 0) and ``pop_size - 1`` - diverse mutations are synthesized around it. Used by the Registry fork-and- - evolve flow and the upload-and-evolve flow — both just hand us an existing - genome to evolve forward instead of spawning from the golden template. - - Args: - parent: The seed SkillGenome to evolve from (untouched in the output). - pop_size: Total population size including the elite parent. - - Returns: - A list of ``pop_size`` SkillGenome objects at generation 0. The first - entry is the parent (re-id'd, elite); the rest are mutations. 
- """ - if pop_size < 1: - raise ValueError(f"pop_size must be ≥ 1, got {pop_size}") - - bible_patterns = _read_bible_patterns() - - # The elite: clone the parent with a fresh id, retain content + traits - elite = SkillGenome( - id=str(uuid.uuid4()), - generation=0, - skill_md_content=parent.skill_md_content, - frontmatter=dict(parent.frontmatter), - supporting_files=dict(parent.supporting_files), - traits=list(parent.traits), - meta_strategy=parent.meta_strategy, - parent_ids=[parent.id], - mutations=["elite-carry"], - mutation_rationale="Seed parent carried forward as elite.", - maturity=parent.maturity or "draft", - ) - - if pop_size == 1: - return [elite] - - num_mutants = pop_size - 1 - system_prompt = f"""You are evolving an existing Claude Agent Skill by producing {num_mutants} diverse mutations. - -The parent Skill is below. Your job is to produce {num_mutants} variant Skills that preserve the parent's core capability but explore different: -- Description phrasing + trigger expansion -- Instruction structure (more/fewer numbered steps, different section ordering) -- Trait emphasis (lean harder into some traits, introduce new ones) -- Example diversity (different I/O pairs) - -Each mutation must still satisfy every constraint in the bible (≤250 char description, "Use when" + "NOT for" clauses, ≤500 line body, 2-3 diverse examples, valid YAML frontmatter, unique name matching `^[a-z0-9]+(-[a-z0-9]+)*$`). - -## Bible patterns (non-negotiable) - -{bible_patterns} - -## Parent Skill - -``` -{parent.skill_md_content} -``` - -Parent traits: {", ".join(parent.traits) if parent.traits else "(none)"} -Parent strategy: {parent.meta_strategy} - -## Output - -Return a JSON array of exactly {num_mutants} skills. Each entry is a JSON object with fields: -- `skill_md_content`: the full SKILL.md (YAML frontmatter + body) -- `traits`: list of trait strings -- `meta_strategy`: 1-2 sentences -- `mutations`: list of mutation-type strings (e.g. 
["description-expansion", "example-swap"]) -- `mutation_rationale`: why these mutations were made - -Do NOT modify the parent. Do NOT return fewer or more than {num_mutants} entries. Each mutation must have a UNIQUE `name` field in its frontmatter. -""" - - text = await _generate(system_prompt) - - try: - raw = extract_json_array(text) - except (ValueError, ParseError): - # If the LLM refused or produced garbage, fall back to elite-only - # (graceful degradation — evolution can still proceed with just the parent) - return [elite] - - mutants: list[SkillGenome] = [] - for item in raw[:num_mutants]: - mutants.append( - SkillGenome( - id=str(uuid.uuid4()), - generation=0, - skill_md_content=item.get("skill_md_content", ""), - supporting_files=item.get("supporting_files", {}), - traits=item.get("traits", []), - meta_strategy=item.get("meta_strategy", ""), - parent_ids=[parent.id], - mutations=item.get("mutations", []), - mutation_rationale=item.get("mutation_rationale", ""), - maturity="draft", - ) - ) - - # Drop any mutants that fail validation — keep the elite always - valid_mutants, _ = _validate_genomes(mutants) - return [elite, *valid_mutants][:pop_size] - - -# --------------------------------------------------------------------------- -# v2.0 — focused per-dimension variant spawner -# --------------------------------------------------------------------------- - - -def _build_variant_spawn_prompt( - specialization: str, - dimension: dict, - foundation_genome: SkillGenome | None, - pop_size: int, - template: str, -) -> str: - """System prompt for spawning N focused mini-SKILL.md variants for one dimension.""" - name = dimension.get("name", "") - tier = dimension.get("tier", "") - description = dimension.get("description", "") - evaluation_focus = dimension.get("evaluation_focus", "") - - foundation_block = "" - if foundation_genome is not None and tier == "capability": - # Capability variants get the winning foundation as grounding so they - # plug into a 
consistent skeleton during Engineer assembly later. - foundation_block = ( - "\n## Foundation context (capability variants must plug into this)\n\n" - "The following foundation variant has already won its tier. Your " - "capability variants will be assembled with it later, so they " - "MUST be compatible with its directory layout, naming, and fixture " - "philosophy. Reference the foundation's scripts and conventions " - "in your workflow steps.\n\n" - "```markdown\n" - f"{foundation_genome.skill_md_content[:2000]}\n" - "```\n" - ) - - return ( - f"## Specialization\n\n{specialization}\n\n" - f"## Variant dimension you are spawning for\n\n" - f"- Name: `{name}`\n" - f"- Tier: {tier}\n" - f"- Description: {description}\n" - f"- Evaluation focus: {evaluation_focus}\n" - f"{foundation_block}\n" - f"## Your job\n\n" - f"Spawn {pop_size} DIVERSE mini-skill packages that each take a " - f"DIFFERENT angle on the dimension above. Gen 0 exists to explore — " - f"do not produce N near-duplicates and do not kitchen-sink one " - f"variant with every approach.\n\n" - "**One dimension, one angle per variant.** Each variant's SKILL.md " - "body must focus on the single dimension named above and avoid " - "drifting into adjacent dimensions.\n\n" - "## Golden template\n\n" - f"```markdown\n{template}\n```\n\n" - "## Hard rules (validator-enforced)\n\n" - "- `name`: kebab-case, matches `^[a-z0-9]+(-[a-z0-9]+)*$`\n" - "- `description`: ≤250 chars, pushy routing pattern\n" - "- Body: ≤500 lines\n" - "- 2-3 diverse I/O examples mandatory\n" - "- The body MUST mention the dimension name somewhere\n" - "- All scripts/references referenced from SKILL.md use the\n" - " `${CLAUDE_SKILL_DIR}/...` path convention\n\n" - "## Output format\n\n" - f"Return ONLY a JSON array of exactly {pop_size} objects. The " - "``skill_md_content`` field MUST contain the FULL SKILL.md — " - "starting with ``---`` (YAML frontmatter), then the body. 
Do NOT " - "separate frontmatter into its own field; it must be embedded in " - "``skill_md_content`` as the validator expects a complete SKILL.md.\n\n" - "Schema:\n" - '```json\n[\n {\n' - ' "name": "kebab-case-name",\n' - ' "skill_md_content": "---\\nname: ...\\ndescription: >-\\n ...\\n---\\n\\n# Display Name\\n\\n## Quick Start\\n...",\n' - ' "supporting_files": {"scripts/score.py": "...", ' - '"scripts/validate.sh": "..."},\n' - ' "traits": ["trait1", "trait2"],\n' - ' "meta_strategy": "one-liner approach description"\n' - " }\n]\n```\n" - "No prose before or after — ONLY the JSON array." - ) - - -async def spawn_variant_gen0( - specialization: str, - dimension: dict, - foundation_genome: SkillGenome | None, - pop_size: int = 2, -) -> list[SkillGenome]: - """Spawn ``pop_size`` focused mini-skill variants for a single dimension. - - Args: - specialization: The parent skill family's specialization string. - dimension: A dict with at minimum ``name`` and ``tier`` keys; may - include ``description`` and ``evaluation_focus``. Matches the - shape of ``TaxonomistOutput.variant_dimensions``. - foundation_genome: For capability variants, the winning foundation - genome to use as grounding context. Pass ``None`` for foundation - variants. - pop_size: How many variants to spawn (default 2 for atomic mode). - - Returns: - A list of ``pop_size`` SkillGenome objects at generation 0. Each is - validated against the standard authoring constraints. Invalid - variants are dropped — the caller may receive fewer than - ``pop_size`` if the model produces malformed output, but never more. - - Raises: - ValueError: if no valid variants survive validation after one retry. 
- """ - if pop_size < 1: - raise ValueError(f"pop_size must be ≥ 1, got {pop_size}") - - template = (GOLDEN_TEMPLATE_DIR / "SKILL.md").read_text() - system_prompt = _build_variant_spawn_prompt( - specialization, dimension, foundation_genome, pop_size, template - ) - - text = await _generate(system_prompt) - _save_debug_response(f"spawn_variant_gen0_{dimension.get('name', 'unknown')}", text) - - try: - raw = extract_json_array(text) - except (ValueError, ParseError): - # One retry with a stricter formatting reminder - retry_prompt = ( - system_prompt - + "\n\nCRITICAL: Your previous response did not contain a valid " - "JSON array. Respond with ONLY a JSON array — no prose, no " - "markdown fences." - ) - text = await _generate(retry_prompt) - raw = extract_json_array(text) - - genomes = _parse_genomes(raw, generation=0) - valid_genomes, invalid = _validate_genomes(genomes) - - if not valid_genomes: - violations = [f"skill {i}: {'; '.join(v)}" for i, v in invalid.items()] - raise ValueError( - "spawn_variant_gen0 produced no valid variants: " - + "; ".join(violations) - ) - - # Stamp dimension metadata into the frontmatter so the Reviewer knows - # how to scope L3/L4 evaluation. Validator doesn't require it but it's - # the right shape for downstream consumers. - for genome in valid_genomes: - genome.frontmatter["dimension"] = dimension.get("name", "") - genome.frontmatter["tier"] = dimension.get("tier", "") - - return valid_genomes[:pop_size] diff --git a/skillforge/agents/spawner/__init__.py b/skillforge/agents/spawner/__init__.py new file mode 100644 index 0000000..ed34c6d --- /dev/null +++ b/skillforge/agents/spawner/__init__.py @@ -0,0 +1,54 @@ +"""Spawner — creates gen 0 populations and breeds next generations. + +Gen 0: reads the golden template from ``config.GOLDEN_TEMPLATE_DIR`` and +``bible/patterns/*.md``, generates ``pop_size`` diverse Skills varying +content while preserving structure. 
+ +Gen 1+: takes parent genomes + breeding instructions from the Breeder +and produces child Skills. The Spawner MUST enforce all authoring +constraints from ``engine.sandbox.validate_skill_structure``. + +Uses the Anthropic Messages API directly (NOT the Agent SDK's query()) +because this is a pure generation task with no tool use. The Agent SDK's +query() is for agentic loops with tools and hung the overnight live test. + +Submodule layout: + + _helpers.py _generate (LLM call) + _parse_genomes + validation + + auto-repair + bible-pattern reader + _prompts.py all _build_*_prompt functions (pure string templating) + main.py four public entry points — spawn_gen0, breed_next_gen, + spawn_from_parent, spawn_variant_gen0 +""" + +from __future__ import annotations + +# Helpers re-exported for tests that patch them on the package root. +from skillforge.agents.spawner._helpers import ( + _auto_repair_missing_references, + _generate, + _parse_genomes, + _read_bible_patterns, + _validate_genomes, +) +from skillforge.agents.spawner.main import ( + breed_next_gen, + spawn_from_parent, + spawn_gen0, + spawn_variant_gen0, +) +from skillforge.config import BIBLE_DIR + +__all__ = [ + "spawn_gen0", + "breed_next_gen", + "spawn_from_parent", + "spawn_variant_gen0", + # Private helpers re-exported for test access. + "_auto_repair_missing_references", + "_generate", + "_parse_genomes", + "_read_bible_patterns", + "_validate_genomes", + "BIBLE_DIR", +] diff --git a/skillforge/agents/spawner/_helpers.py b/skillforge/agents/spawner/_helpers.py new file mode 100644 index 0000000..78ddda3 --- /dev/null +++ b/skillforge/agents/spawner/_helpers.py @@ -0,0 +1,187 @@ +"""Shared Spawner helpers — bible reading, response extraction, debug dumps, +genome parsing, auto-repair, structural validation, and the streaming LLM call. 
+ +Extracted from the monolithic spawner so the per-entry-point modules +(``gen0``, ``breed``, ``from_parent``, ``variant``) share one private +implementation layer without re-declaring helpers. +""" + +from __future__ import annotations + +import re +import uuid + +from anthropic import AsyncAnthropic + +from skillforge.config import ANTHROPIC_API_KEY, model_for +from skillforge.engine.sandbox import validate_skill_structure +from skillforge.models import SkillGenome + +# Pulls ${CLAUDE_SKILL_DIR}/ references out of a SKILL.md body. +# Must match the regex in ``engine.sandbox.validate_skill_structure`` rule 8. +_REF_PATH_RE = re.compile(r"\$\{CLAUDE_SKILL_DIR\}/([^\s`)\"']+)") + + +def _read_bible_patterns() -> str: + """Concatenate all .md files under BIBLE_DIR/patterns in sorted order. + + Returns empty string if the directory doesn't exist or is empty. + Looks up BIBLE_DIR through the package namespace so tests that + monkeypatch ``skillforge.agents.spawner.BIBLE_DIR`` intercept the + lookup. + """ + from skillforge.agents import spawner as _pkg + + patterns_dir = _pkg.BIBLE_DIR / "patterns" + if not patterns_dir.exists(): + return "" + + parts: list[str] = [] + for p in sorted(patterns_dir.glob("*.md")): + try: + parts.append(p.read_text()) + except (OSError, UnicodeDecodeError): + continue + + return "\n\n---\n\n".join(parts) + + +def _extract_response_text(response) -> str: + """Extract text from an Anthropic Messages API response. + + The response's ``content`` is a list of content blocks; extract any + that have a ``.text`` attribute. + """ + if not response.content: + return "" + parts: list[str] = [] + for block in response.content: + text = getattr(block, "text", None) + if text: + parts.append(text) + return "\n".join(parts) + + +def _save_debug_response(label: str, text: str) -> None: + """Write the last raw LLM response to /tmp for post-hoc debugging. + + Non-fatal — any write error is silently swallowed. 
This is for + diagnosing parse failures during live runs; in production the text + is ephemeral. + """ + try: + from pathlib import Path + + path = Path("/tmp") / f"sf-{label}.txt" + path.write_text(text) + except OSError: + pass + + +def _parse_genomes( + raw: list[dict], + generation: int, + parent_ids: list[str] | None = None, +) -> list[SkillGenome]: + """Convert raw dicts from Claude's response into SkillGenome objects.""" + genomes: list[SkillGenome] = [] + for item in raw: + genome = SkillGenome( + id=str(uuid.uuid4()), + generation=generation, + skill_md_content=item.get("skill_md_content", ""), + supporting_files=item.get("supporting_files", {}), + traits=item.get("traits", []), + meta_strategy=item.get("meta_strategy", ""), + parent_ids=parent_ids or item.get("parent_ids", []), + mutations=item.get("mutations", []), + mutation_rationale=item.get("mutation_rationale", ""), + maturity="draft", + ) + genomes.append(genome) + return genomes + + +async def _generate(prompt: str) -> str: + """Streaming Anthropic API call. Returns the full assistant text response. + + The Spawner generates structured JSON output containing multiple + SKILL.md files (up to ~5KB per skill × pop_size = 25KB+ at pop_size=5). + Non-streaming requests get server-disconnected around the 3-4 minute + mark on prompts this size. Streaming keeps the connection alive via + incremental chunks and handles long generations reliably. + + ``max_tokens`` is 32000 to fit a full population of rich SKILL.md + files with supporting scripts. Claude Sonnet 4.6 supports up to 64K + output tokens in streaming mode; 32K is plenty while keeping a sane + ceiling. 
+ """ + client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=600.0) + parts: list[str] = [] + async with client.messages.stream( + model=model_for("spawner"), + max_tokens=32000, + messages=[{"role": "user", "content": prompt}], + ) as stream: + async for text in stream.text_stream: + parts.append(text) + return "".join(parts) + + +def _auto_repair_missing_references(genome: SkillGenome) -> int: + """Stub out ``${CLAUDE_SKILL_DIR}/`` refs missing from supporting_files. + + Cheap-tier Haiku routinely emits SKILL.md bodies that reference + ``references/*-guide.md`` in prose but forget to include the file in + ``supporting_files``. Validator rule 8 rejects those genomes, which + in atomic mode (pop=2, 1 retry) was killing the whole run 1-of-3 + times. + + Rather than burn another LLM call on a retry that often reproduces + the same oversight, we stub each missing reference with a minimal + placeholder. The skill still renders, the reference still resolves + at runtime, and the genome passes validation. The Breeder can flesh + out the stubs in later generations if fitness signal suggests they + carry weight. + + Returns the count of paths that were stubbed (0 if everything + already resolved, which is the expected Sonnet-tier case). + """ + stubbed = 0 + for match in _REF_PATH_RE.finditer(genome.skill_md_content): + rel_path = match.group(1).rstrip(".,;:)") + if rel_path in genome.supporting_files: + continue + filename = rel_path.rsplit("/", 1)[-1] + placeholder_title = filename.removesuffix(".md").replace("-", " ").title() + genome.supporting_files[rel_path] = ( + f"# {placeholder_title}\n\n" + f"_Placeholder — stubbed by the spawner's auto-repair pass " + f"because the generating LLM referenced this file but did not " + f"emit its contents. 
Replace with domain-specific material " + f"during a later generation._\n" + ) + stubbed += 1 + return stubbed + + +def _validate_genomes( + genomes: list[SkillGenome], +) -> tuple[list[SkillGenome], dict[int, list[str]]]: + """Validate each genome; returns (valid_genomes, {idx: violations}). + + Runs the reference-path auto-repair pass before validation so + cheap-tier LLM drift on rule 8 (missing supporting_files entries) + doesn't kill a whole population. The repair only adds files; it + never touches the skill_md body. + """ + valid: list[SkillGenome] = [] + invalid: dict[int, list[str]] = {} + for i, genome in enumerate(genomes): + _auto_repair_missing_references(genome) + violations = validate_skill_structure(genome) + if violations: + invalid[i] = violations + else: + valid.append(genome) + return valid, invalid diff --git a/skillforge/agents/spawner/_prompts.py b/skillforge/agents/spawner/_prompts.py new file mode 100644 index 0000000..818d9dc --- /dev/null +++ b/skillforge/agents/spawner/_prompts.py @@ -0,0 +1,217 @@ +"""Spawner prompt-string builders. + +Pure string templating — no I/O, no LLM calls. The four entry points +(``gen0``, ``breed``, ``from_parent``, ``variant``) feed the strings +produced here into ``_helpers._generate``. + +The embedded JSON schema descriptions (``_SPAWN_SCHEMA_DESCRIPTION`` +etc.) double as prompt-documentation for Claude and as the contract the +Spawner validates against on the way back in. 
+""" + +from __future__ import annotations + +from skillforge.models import SkillGenome + +_SPAWN_SCHEMA_DESCRIPTION = """[ + { + "name": "kebab-case-name", + "skill_md_content": "---\\nname: ...\\n---\\n\\n# Skill\\n\\n...", + "supporting_files": {"scripts/validate.sh": "#!/bin/bash\\n..."}, + "traits": ["imperative-phrasing", "tests-first"], + "meta_strategy": "plan-first TDD" + } +]""" + +_BREED_SCHEMA_DESCRIPTION = """[ + { + "name": "kebab-case-name", + "skill_md_content": "---\\nname: ...\\n---\\n\\n# Skill\\n\\n...", + "supporting_files": {"scripts/validate.sh": "#!/bin/bash\\n..."}, + "traits": ["imperative-phrasing", "tests-first"], + "meta_strategy": "plan-first TDD", + "parent_ids": ["uuid-1", "uuid-2"], + "mutations": ["changed-meta-strategy", "added-examples"], + "mutation_rationale": "Switched to TDD-first based on parent attribution data" + } +]""" + + +def _build_spawn_system_prompt( + specialization: str, + pop_size: int, + template: str, + bible_patterns: str, +) -> str: + """Build the system prompt for gen 0 spawn.""" + bible_section = ( + f"\n\n## Validated Patterns (apply these)\n\n{bible_patterns}" + if bible_patterns + else "" + ) + return ( + f"You are a Skill author for the Claude Agent SDK. Your task is to generate " + f"{pop_size} DIVERSE candidate Skills for the following specialization:\n\n" + f"SPECIALIZATION: {specialization}\n\n" + "Each Skill must:\n" + "1. Follow the exact YAML frontmatter + markdown structure of the template below\n" + "2. Include 'Use when' in the first 250 chars of the description\n" + "3. Have a name matching the regex ^[a-z0-9]+(-[a-z0-9]+)*$\n" + "4. Contain at least 2 example blocks (**Example or ## Example)\n" + "5. Keep the body under 500 lines\n" + "6. Have a description under 1024 characters\n" + "7. NOT use 'anthropic' or 'claude' in the name\n" + "8. Only reference paths in ${CLAUDE_SKILL_DIR}/... 
that are included in supporting_files\n\n" + "## Golden Template\n\n" + f"{template}" + f"{bible_section}\n\n" + f"Return ONLY a JSON array of exactly {pop_size} skill objects. " + "No prose before or after — ONLY the JSON array. Use this schema:\n" + f"{_SPAWN_SCHEMA_DESCRIPTION}\n" + "Vary the approach, strategy, instruction style, and examples across all skills " + "while preserving the template structure." + ) + + +def _build_breed_system_prompt( + parents: list[SkillGenome], + learning_log: list[str], + breeding_instructions: str, + bible_patterns: str, +) -> str: + """Build the system prompt for next-gen breeding.""" + bible_section = ( + f"\n\n## Validated Patterns\n\n{bible_patterns}" if bible_patterns else "" + ) + + parents_section = "\n\n".join( + f"### Parent {i + 1} (id: {p.id})\n" + f"**Traits**: {p.traits}\n" + f"**Meta-strategy**: {p.meta_strategy}\n" + f"**Trait attribution**: {p.trait_attribution}\n" + f"**Trait diagnostics**: {p.trait_diagnostics}\n\n" + f"**SKILL.md content**:\n```\n{p.skill_md_content}\n```" + for i, p in enumerate(parents) + ) + + learning_section = ( + "\n".join(f"- {entry}" for entry in learning_log) + if learning_log + else "(no entries yet)" + ) + + return ( + "You are a Skill evolutionary breeder for the Claude Agent SDK.\n\n" + "## Breeding Instructions (from Breeder agent)\n\n" + f"{breeding_instructions}\n\n" + "## Parent Skills\n\n" + f"{parents_section}\n\n" + "## Learning Log (failures and lessons from all prior generations)\n\n" + f"{learning_section}" + f"{bible_section}\n\n" + "## Rules for child Skills\n" + "1. Follow YAML frontmatter + markdown structure of the parents\n" + "2. Include 'Use when' in first 250 chars of description\n" + "3. Name must match ^[a-z0-9]+(-[a-z0-9]+)*$ and NOT contain 'anthropic' or 'claude'\n" + "4. At least 2 example blocks (**Example or ## Example)\n" + "5. Body under 500 lines, description under 1024 characters\n" + "6. Only reference ${CLAUDE_SKILL_DIR}/... 
paths that are in supporting_files\n\n" + "Return ONLY a JSON array of child skill objects. Use this schema:\n" + f"{_BREED_SCHEMA_DESCRIPTION}" + ) + + +def _build_repair_prompt( + original_prompt: str, + violations_by_idx: dict[int, list[str]], + genomes: list[SkillGenome], +) -> str: + """Build a reprompt asking Claude to fix specific violations.""" + violation_lines: list[str] = [] + for idx, viols in violations_by_idx.items(): + genome_name = genomes[idx].skill_md_content[:50].replace("\n", " ") + violation_lines.append( + f"Skill index {idx} ({genome_name!r}): {'; '.join(viols)}" + ) + violations_str = "\n".join(violation_lines) + + return ( + "Your previous response contained invalid Skills. " + "Fix the following violations and return a corrected JSON array:\n\n" + f"{violations_str}\n\n" + "Return ONLY the complete corrected JSON array — all skills, not just the fixed ones." + ) + +def _build_variant_spawn_prompt( + specialization: str, + dimension: dict, + foundation_genome: SkillGenome | None, + pop_size: int, + template: str, +) -> str: + """System prompt for spawning N focused mini-SKILL.md variants for one dimension.""" + name = dimension.get("name", "") + tier = dimension.get("tier", "") + description = dimension.get("description", "") + evaluation_focus = dimension.get("evaluation_focus", "") + + foundation_block = "" + if foundation_genome is not None and tier == "capability": + # Capability variants get the winning foundation as grounding so they + # plug into a consistent skeleton during Engineer assembly later. + foundation_block = ( + "\n## Foundation context (capability variants must plug into this)\n\n" + "The following foundation variant has already won its tier. Your " + "capability variants will be assembled with it later, so they " + "MUST be compatible with its directory layout, naming, and fixture " + "philosophy. 
 Reference the foundation's scripts and conventions "
            "in your workflow steps.\n\n"
            "```markdown\n"
            # Foundation SKILL.md is truncated to 2000 chars to bound prompt size.
            f"{foundation_genome.skill_md_content[:2000]}\n"
            "```\n"
        )

    return (
        f"## Specialization\n\n{specialization}\n\n"
        f"## Variant dimension you are spawning for\n\n"
        f"- Name: `{name}`\n"
        f"- Tier: {tier}\n"
        f"- Description: {description}\n"
        f"- Evaluation focus: {evaluation_focus}\n"
        f"{foundation_block}\n"
        f"## Your job\n\n"
        f"Spawn {pop_size} DIVERSE mini-skill packages that each take a "
        f"DIFFERENT angle on the dimension above. Gen 0 exists to explore — "
        f"do not produce N near-duplicates and do not kitchen-sink one "
        f"variant with every approach.\n\n"
        "**One dimension, one angle per variant.** Each variant's SKILL.md "
        "body must focus on the single dimension named above and avoid "
        "drifting into adjacent dimensions.\n\n"
        "## Golden template\n\n"
        f"```markdown\n{template}\n```\n\n"
        "## Hard rules (validator-enforced)\n\n"
        "- `name`: kebab-case, matches `^[a-z0-9]+(-[a-z0-9]+)*$`\n"
        "- `description`: ≤250 chars, pushy routing pattern\n"
        "- Body: ≤500 lines\n"
        "- 2-3 diverse I/O examples mandatory\n"
        "- The body MUST mention the dimension name somewhere\n"
        "- All scripts/references referenced from SKILL.md use the\n"
        "  `${CLAUDE_SKILL_DIR}/...` path convention\n\n"
        "## Output format\n\n"
        f"Return ONLY a JSON array of exactly {pop_size} objects. The "
        "``skill_md_content`` field MUST contain the FULL SKILL.md — "
        "starting with ``---`` (YAML frontmatter), then the body.
 Do NOT "
        "separate frontmatter into its own field; it must be embedded in "
        "``skill_md_content`` as the validator expects a complete SKILL.md.\n\n"
        "Schema:\n"
        # NOTE(review): intra-literal indentation below was lost in transit
        # and is reproduced as displayed — confirm against the original file.
        '```json\n[\n {\n'
        ' "name": "kebab-case-name",\n'
        ' "skill_md_content": "---\\nname: ...\\ndescription: >-\\n ...\\n---\\n\\n# Display Name\\n\\n## Quick Start\\n...",\n'
        ' "supporting_files": {"scripts/score.py": "...", '
        '"scripts/validate.sh": "..."},\n'
        ' "traits": ["trait1", "trait2"],\n'
        ' "meta_strategy": "one-liner approach description"\n'
        " }\n]\n```\n"
        "No prose before or after — ONLY the JSON array."
    )
diff --git a/skillforge/agents/spawner/main.py b/skillforge/agents/spawner/main.py
new file mode 100644
index 0000000..193ccfd
--- /dev/null
+++ b/skillforge/agents/spawner/main.py
@@ -0,0 +1,411 @@
"""Spawner entry points.

Four top-level coroutines:
- ``spawn_gen0`` fresh population from a specialization string
- ``breed_next_gen`` child skills from ranked parents + instructions
- ``spawn_from_parent`` fork-and-evolve from a single seed genome
- ``spawn_variant_gen0`` per-dimension atomic variants

All four share the same generate/parse/validate/repair loop, differ
only in the prompt they feed the LLM and their retry cadence.
"""

from __future__ import annotations

import uuid

from skillforge.agents._json import extract_json_array
from skillforge.agents.spawner._helpers import (
    _parse_genomes,
    _save_debug_response,
    _validate_genomes,
)
from skillforge.agents.spawner._prompts import (
    _build_breed_system_prompt,
    _build_repair_prompt,
    _build_spawn_system_prompt,
    _build_variant_spawn_prompt,
)
from skillforge.config import GOLDEN_TEMPLATE_DIR
from skillforge.errors import ParseError
from skillforge.models import SkillGenome


async def _generate(prompt: str) -> str:
    """Dispatch to the real ``_generate`` via the package namespace.

    Tests patch ``skillforge.agents.spawner._generate`` to intercept LLM
    calls.
 Binding the helper at import time would shadow that patch;
    this indirection resolves the attribute on the package root at call
    time so the patch takes effect.
    """
    from skillforge.agents import spawner as _pkg

    return await _pkg._generate(prompt)


def _read_bible_patterns() -> str:
    """Same lazy-lookup pattern as ``_generate`` — tests sometimes patch
    ``skillforge.agents.spawner._read_bible_patterns``."""
    from skillforge.agents import spawner as _pkg

    return _pkg._read_bible_patterns()


async def spawn_gen0(specialization: str, pop_size: int) -> list[SkillGenome]:
    """Generate ``pop_size`` diverse gen 0 Skills for the specialization.

    Args:
        specialization: Description of the Skill domain.
        pop_size: Number of candidate Skills to generate.

    Returns:
        A list of ``pop_size`` validated SkillGenome objects at generation 0.

    Raises:
        ValueError: if Skills remain invalid after 1 retry.
    """
    template = (GOLDEN_TEMPLATE_DIR / "SKILL.md").read_text()
    bible_patterns = _read_bible_patterns()

    system_prompt = _build_spawn_system_prompt(
        specialization, pop_size, template, bible_patterns
    )

    # Attempt 1
    text = await _generate(system_prompt)
    _save_debug_response("spawn_gen0_attempt1", text)

    try:
        raw = extract_json_array(text)
        genomes = _parse_genomes(raw, generation=0)
        valid_genomes, invalid = _validate_genomes(genomes)
        first_attempt_failed = False
    except (ValueError, ParseError):
        # JSON parse failure — treat as if everything was invalid so the
        # retry path runs.
        genomes = []
        valid_genomes = []
        invalid = {}
        first_attempt_failed = True

    if not first_attempt_failed and not invalid:
        return valid_genomes

    # Attempt 2 — retry. Use the same prompt if JSON parse failed (Claude
    # just didn't follow instructions), or a targeted repair prompt if
    # the skills parsed but failed validation.
    if first_attempt_failed:
        retry_prompt = (
            system_prompt
            + "\n\nCRITICAL: Your previous response did not contain a valid JSON "
            "array. You must respond with ONLY a JSON array — no prose, no "
            "markdown before or after the array. The array must start with [ "
            "and end with ]. No explanations."
        )
    else:
        retry_prompt = _build_repair_prompt(system_prompt, invalid, genomes)

    text = await _generate(retry_prompt)
    _save_debug_response("spawn_gen0_attempt2", text)

    try:
        raw2 = extract_json_array(text)
    except (ValueError, ParseError) as exc:
        # Second parse failure is terminal; point the operator at the dump
        # written by _save_debug_response above.
        raise ValueError(
            f"spawner failed to produce valid JSON on retry: {exc}. "
            f"See /tmp/sf-spawn_gen0_attempt2.txt for the raw response."
        ) from exc

    genomes2 = _parse_genomes(raw2, generation=0)
    valid_genomes2, still_invalid = _validate_genomes(genomes2)

    if still_invalid:
        all_violations = [
            f"skill {i}: {'; '.join(v)}" for i, v in still_invalid.items()
        ]
        raise ValueError(
            "spawner produced invalid skills after retry: "
            + "; ".join(all_violations)
        )

    return valid_genomes2


async def breed_next_gen(
    parents: list[SkillGenome],
    learning_log: list[str],
    breeding_instructions: str,
) -> list[SkillGenome]:
    """Produce a child population from parents + Breeder's instructions.

    Args:
        parents: Parent SkillGenome objects (with trait_attribution populated).
        learning_log: Accumulated lessons from all prior generations.
        breeding_instructions: Free-text directives from the Breeder agent.

    Returns:
        A list of validated child SkillGenome objects at generation+1.

    Raises:
        ValueError: if children remain invalid after 1 retry.
+ """ + bible_patterns = _read_bible_patterns() + parent_ids = [p.id for p in parents] + next_generation = (parents[0].generation + 1) if parents else 1 + + system_prompt = _build_breed_system_prompt( + parents, learning_log, breeding_instructions, bible_patterns + ) + + # Attempt 1 + text = await _generate(system_prompt) + + try: + raw = extract_json_array(text) + except (ValueError, ParseError) as exc: + raise ValueError( + f"spawner breed_next_gen failed to produce valid JSON: {exc}" + ) from exc + + # Parse with generation and parent_ids from raw (each child should specify its own parent_ids) + children: list[SkillGenome] = [] + for item in raw: + child = SkillGenome( + id=str(uuid.uuid4()), + generation=next_generation, + skill_md_content=item.get("skill_md_content", ""), + supporting_files=item.get("supporting_files", {}), + traits=item.get("traits", []), + meta_strategy=item.get("meta_strategy", ""), + parent_ids=item.get("parent_ids", parent_ids), + mutations=item.get("mutations", []), + mutation_rationale=item.get("mutation_rationale", ""), + maturity="draft", + ) + children.append(child) + + valid_children, invalid = _validate_genomes(children) + + if not invalid: + return valid_children + + # Attempt 2 — repair + repair_prompt = _build_repair_prompt(system_prompt, invalid, children) + text = await _generate(repair_prompt) + + try: + raw2 = extract_json_array(text) + except (ValueError, ParseError) as exc: + raise ValueError( + f"spawner breed_next_gen failed to produce valid JSON on retry: {exc}" + ) from exc + + children2: list[SkillGenome] = [] + for item in raw2: + child = SkillGenome( + id=str(uuid.uuid4()), + generation=next_generation, + skill_md_content=item.get("skill_md_content", ""), + supporting_files=item.get("supporting_files", {}), + traits=item.get("traits", []), + meta_strategy=item.get("meta_strategy", ""), + parent_ids=item.get("parent_ids", parent_ids), + mutations=item.get("mutations", []), + 
            mutation_rationale=item.get("mutation_rationale", ""),
            maturity="draft",
        )
        children2.append(child)

    valid_children2, still_invalid = _validate_genomes(children2)

    if still_invalid:
        all_violations = [
            f"skill {i}: {'; '.join(v)}" for i, v in still_invalid.items()
        ]
        raise ValueError(
            "spawner produced invalid skills after retry: "
            + "; ".join(all_violations)
        )

    return valid_children2


async def spawn_from_parent(
    parent: SkillGenome,
    pop_size: int,
) -> list[SkillGenome]:
    """Generate a gen 0 population using an existing Skill as the seed parent.

    The parent itself is carried forward as the elite (slot 0) and ``pop_size - 1``
    diverse mutations are synthesized around it. Used by the Registry fork-and-
    evolve flow and the upload-and-evolve flow — both just hand us an existing
    genome to evolve forward instead of spawning from the golden template.

    Args:
        parent: The seed SkillGenome to evolve from (untouched in the output).
        pop_size: Total population size including the elite parent.

    Returns:
        A list of ``pop_size`` SkillGenome objects at generation 0. The first
        entry is the parent (re-id'd, elite); the rest are mutations.
+ """ + if pop_size < 1: + raise ValueError(f"pop_size must be ≥ 1, got {pop_size}") + + bible_patterns = _read_bible_patterns() + + # The elite: clone the parent with a fresh id, retain content + traits + elite = SkillGenome( + id=str(uuid.uuid4()), + generation=0, + skill_md_content=parent.skill_md_content, + frontmatter=dict(parent.frontmatter), + supporting_files=dict(parent.supporting_files), + traits=list(parent.traits), + meta_strategy=parent.meta_strategy, + parent_ids=[parent.id], + mutations=["elite-carry"], + mutation_rationale="Seed parent carried forward as elite.", + maturity=parent.maturity or "draft", + ) + + if pop_size == 1: + return [elite] + + num_mutants = pop_size - 1 + system_prompt = f"""You are evolving an existing Claude Agent Skill by producing {num_mutants} diverse mutations. + +The parent Skill is below. Your job is to produce {num_mutants} variant Skills that preserve the parent's core capability but explore different: +- Description phrasing + trigger expansion +- Instruction structure (more/fewer numbered steps, different section ordering) +- Trait emphasis (lean harder into some traits, introduce new ones) +- Example diversity (different I/O pairs) + +Each mutation must still satisfy every constraint in the bible (≤250 char description, "Use when" + "NOT for" clauses, ≤500 line body, 2-3 diverse examples, valid YAML frontmatter, unique name matching `^[a-z0-9]+(-[a-z0-9]+)*$`). + +## Bible patterns (non-negotiable) + +{bible_patterns} + +## Parent Skill + +``` +{parent.skill_md_content} +``` + +Parent traits: {", ".join(parent.traits) if parent.traits else "(none)"} +Parent strategy: {parent.meta_strategy} + +## Output + +Return a JSON array of exactly {num_mutants} skills. Each entry is a JSON object with fields: +- `skill_md_content`: the full SKILL.md (YAML frontmatter + body) +- `traits`: list of trait strings +- `meta_strategy`: 1-2 sentences +- `mutations`: list of mutation-type strings (e.g. 
["description-expansion", "example-swap"]) +- `mutation_rationale`: why these mutations were made + +Do NOT modify the parent. Do NOT return fewer or more than {num_mutants} entries. Each mutation must have a UNIQUE `name` field in its frontmatter. +""" + + text = await _generate(system_prompt) + + try: + raw = extract_json_array(text) + except (ValueError, ParseError): + # If the LLM refused or produced garbage, fall back to elite-only + # (graceful degradation — evolution can still proceed with just the parent) + return [elite] + + mutants: list[SkillGenome] = [] + for item in raw[:num_mutants]: + mutants.append( + SkillGenome( + id=str(uuid.uuid4()), + generation=0, + skill_md_content=item.get("skill_md_content", ""), + supporting_files=item.get("supporting_files", {}), + traits=item.get("traits", []), + meta_strategy=item.get("meta_strategy", ""), + parent_ids=[parent.id], + mutations=item.get("mutations", []), + mutation_rationale=item.get("mutation_rationale", ""), + maturity="draft", + ) + ) + + # Drop any mutants that fail validation — keep the elite always + valid_mutants, _ = _validate_genomes(mutants) + return [elite, *valid_mutants][:pop_size] + + +async def spawn_variant_gen0( + specialization: str, + dimension: dict, + foundation_genome: SkillGenome | None, + pop_size: int = 2, +) -> list[SkillGenome]: + """Spawn ``pop_size`` focused mini-skill variants for a single dimension. + + Args: + specialization: The parent skill family's specialization string. + dimension: A dict with at minimum ``name`` and ``tier`` keys; may + include ``description`` and ``evaluation_focus``. Matches the + shape of ``TaxonomistOutput.variant_dimensions``. + foundation_genome: For capability variants, the winning foundation + genome to use as grounding context. Pass ``None`` for foundation + variants. + pop_size: How many variants to spawn (default 2 for atomic mode). + + Returns: + A list of ``pop_size`` SkillGenome objects at generation 0. 
Each is + validated against the standard authoring constraints. Invalid + variants are dropped — the caller may receive fewer than + ``pop_size`` if the model produces malformed output, but never more. + + Raises: + ValueError: if no valid variants survive validation after one retry. + """ + if pop_size < 1: + raise ValueError(f"pop_size must be ≥ 1, got {pop_size}") + + template = (GOLDEN_TEMPLATE_DIR / "SKILL.md").read_text() + system_prompt = _build_variant_spawn_prompt( + specialization, dimension, foundation_genome, pop_size, template + ) + + text = await _generate(system_prompt) + _save_debug_response(f"spawn_variant_gen0_{dimension.get('name', 'unknown')}", text) + + try: + raw = extract_json_array(text) + except (ValueError, ParseError): + # One retry with a stricter formatting reminder + retry_prompt = ( + system_prompt + + "\n\nCRITICAL: Your previous response did not contain a valid " + "JSON array. Respond with ONLY a JSON array — no prose, no " + "markdown fences." + ) + text = await _generate(retry_prompt) + raw = extract_json_array(text) + + genomes = _parse_genomes(raw, generation=0) + valid_genomes, invalid = _validate_genomes(genomes) + + if not valid_genomes: + violations = [f"skill {i}: {'; '.join(v)}" for i, v in invalid.items()] + raise ValueError( + "spawn_variant_gen0 produced no valid variants: " + + "; ".join(violations) + ) + + # Stamp dimension metadata into the frontmatter so the Reviewer knows + # how to scope L3/L4 evaluation. Validator doesn't require it but it's + # the right shape for downstream consumers. + for genome in valid_genomes: + genome.frontmatter["dimension"] = dimension.get("name", "") + genome.frontmatter["tier"] = dimension.get("tier", "") + + return valid_genomes[:pop_size]