From a7681c89941637f64613161f7276b64873d1335c Mon Sep 17 00:00:00 2001 From: "Matt (via Claude Code)" Date: Mon, 20 Apr 2026 01:19:56 -0500 Subject: [PATCH 1/4] refactor: split managed_agents.py (620 LOC) into a package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seven-submodule decomposition along SDK-resource-lifecycle seams: managed_agents/__init__.py barrel + full docstring with all the Step-0 smoke test SDK quirks managed_agents/_constants.py (35) beta headers + make_client + the $0.08/hr session rate managed_agents/environments.py (54) create / archive env managed_agents/skills.py (167) upload + 3-step archive dance + archive_skill_safe + name extractor managed_agents/agents.py (57) create / archive competitor agent managed_agents/sessions.py (124) create / archive session + send_user_message + event polling managed_agents/output.py (211) post-run trace introspection — written_files, bash-write parsing, token usage, runtime cost Every public name is re-exported from the package __init__ so 38 call sites keep their ``from skillforge.agents import managed_agents`` + ``managed_agents.upload_skill(...)`` usage unchanged. Tests against two private helpers (_extract_skill_name_from_md, _normalize_output_path) were accessing them on the module directly; those are re-exported through the barrel so test patches continue to resolve. QA: ruff + mypy + 411 pytest (unchanged) all green. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- skillforge/agents/managed_agents.py | 620 ------------------ skillforge/agents/managed_agents/__init__.py | 97 +++ .../agents/managed_agents/_constants.py | 35 + skillforge/agents/managed_agents/agents.py | 59 ++ .../agents/managed_agents/environments.py | 55 ++ skillforge/agents/managed_agents/output.py | 208 ++++++ skillforge/agents/managed_agents/sessions.py | 124 ++++ skillforge/agents/managed_agents/skills.py | 163 +++++ 8 files changed, 741 insertions(+), 620 deletions(-) delete mode 100644 skillforge/agents/managed_agents.py create mode 100644 skillforge/agents/managed_agents/__init__.py create mode 100644 skillforge/agents/managed_agents/_constants.py create mode 100644 skillforge/agents/managed_agents/agents.py create mode 100644 skillforge/agents/managed_agents/environments.py create mode 100644 skillforge/agents/managed_agents/output.py create mode 100644 skillforge/agents/managed_agents/sessions.py create mode 100644 skillforge/agents/managed_agents/skills.py diff --git a/skillforge/agents/managed_agents.py b/skillforge/agents/managed_agents.py deleted file mode 100644 index dd6430f..0000000 --- a/skillforge/agents/managed_agents.py +++ /dev/null @@ -1,620 +0,0 @@ -"""Thin typed wrapper around the Anthropic Managed Agents + Skills beta APIs. - -Hides SDK quirks discovered during the Step 0 smoke test -(``scripts/smoke_skill_upload.py``): - -- Skill uploads must place ``SKILL.md`` inside a top-level folder; a bare - ``SKILL.md`` filename returns 400. -- ``beta.skills.delete()`` does NOT auto-clean versions — the 3-step - dance is required (``versions.list`` → ``versions.delete`` per version - → ``skills.delete``). -- Anthropic ships built-in skills (xlsx/pptx/pdf/docx) with - ``source="anthropic"``. Cleanup must NEVER attempt to delete them — the - guard is enforced here. 
-- ``beta.sessions.events.stream()`` is unusable: the SDK routes it through - the Anthropic Messages API SSE decoder which only recognizes - ``message_start``/``content_block_delta``/etc. and silently filters out - every Managed Agents event type. We poll ``events.list(order="asc")`` - instead. -- Tool name in ``agent_toolset_20260401`` is ``write`` (not - ``write_file``). Input shape: ``{"file_path": str, "content": str}``. - Bash tool input is ``{"command": str}`` only. -- Token usage path: ``event.model_usage.input_tokens`` / - ``.output_tokens`` / ``.cache_creation_input_tokens`` / - ``.cache_read_input_tokens`` on ``span.model_request_end`` events. -- Session runtime cost = (``status_idle.processed_at`` - - ``status_running.processed_at``) hours × $0.08. - -This module is the ONLY place that imports beta resource paths from the -``anthropic`` SDK. ``competitor_managed.py`` and the engine consume only -the wrapper's typed return values. -""" - -from __future__ import annotations - -import asyncio -import contextlib -import re -import time -from collections.abc import AsyncIterator -from datetime import datetime -from typing import Any - -from anthropic import AsyncAnthropic - -from skillforge.config import ANTHROPIC_API_KEY - -# --------------------------------------------------------------------------- -# Beta header constants — pinned per PLAN-V1.2 architectural decision #6. -# Treat any version bump as a plan-edit event, not a silent dependency -# update. Update both the constant and the bump notes in the journal. -# --------------------------------------------------------------------------- - -MANAGED_AGENTS_BETA: str = "managed-agents-2026-04-01" -SKILLS_BETA: str = "skills-2025-10-02" - -# Built-in skill source — never delete. Confirmed via Step 0 inspection of -# the four pre-existing skills (xlsx/pptx/pdf/docx) on the org. -ANTHROPIC_SKILL_SOURCE = "anthropic" - -# $0.08 per session-hour metered while status == running. 
Mirrors the -# constant in skillforge.config; duplicated here so this module can be -# imported standalone without pulling the whole config tree. -SESSION_RUNTIME_USD_PER_HOUR = 0.08 - - -# --------------------------------------------------------------------------- -# Client construction -# --------------------------------------------------------------------------- - - -def make_client(timeout: float = 600.0) -> AsyncAnthropic: - """Construct an AsyncAnthropic client wired to skillforge config. - - The caller is responsible for closing the client (``await client.close()``) - or using it as an async context manager. - """ - return AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=timeout) - - -# --------------------------------------------------------------------------- -# Environment lifecycle -# --------------------------------------------------------------------------- - - -async def create_environment( - client: AsyncAnthropic, - *, - run_id: str, - packages: list[str] | None = None, -) -> str: - """Create a cloud environment with the given pip packages pre-installed. - - Returns the environment id. The caller stores it on the EvolutionRun - and reuses it across all competitor sessions in that run. - """ - pkg_list = packages if packages is not None else ["pytest", "ruff"] - resp = await client.beta.environments.create( - name=f"sf-run-{run_id[:12]}", - config={ - "type": "cloud", - "packages": { - "type": "packages", - "pip": pkg_list, - }, - }, - betas=[MANAGED_AGENTS_BETA], - ) - return resp.id - - -async def archive_environment(client: AsyncAnthropic, environment_id: str) -> None: - """Best-effort environment teardown. Logs and swallows errors. - - Cleanup must never block. The ``leaked_environments`` counterpart - would go here if we needed bookkeeping; for now we accept the - leak — environments are cheap and Anthropic GCs them. 
- """ - with contextlib.suppress(Exception): - await client.beta.environments.archive( - environment_id, - betas=[MANAGED_AGENTS_BETA], - ) - - -# --------------------------------------------------------------------------- -# Skill upload + 3-step delete dance -# --------------------------------------------------------------------------- - - -async def upload_skill( - client: AsyncAnthropic, - *, - name: str, - skill_md: str, -) -> str: - """Upload a SKILL.md as a versioned org-level custom skill. - - Two empirical constraints from Step 0: - - 1. The file must live inside a top-level folder — passing a bare - ``SKILL.md`` filename returns ``400 SKILL.md file must be exactly - in the top-level folder.`` - 2. **The folder name must MATCH the ``name:`` field in the SKILL.md - frontmatter** — surfaced during the live end-to-end smoke. The - ``name`` argument to this function is therefore IGNORED for the - folder/upload — we always extract the actual frontmatter name and - use that. The ``name`` arg is still used as the ``display_title`` - (which can be anything human-readable). - - The Anthropic Skills API hard-requires the payload to start literally - with ``---``. A UTF-8 BOM or stray leading whitespace — which neither - our structural validator nor JSON round-tripping strips — is enough - to earn a ``400 SKILL.md must start with YAML frontmatter (---)``. - We normalize here so the ~1% of model outputs with a leading BOM or - whitespace still upload cleanly instead of falling back to inline. - - Returns the new ``skill_id``. The caller is responsible for archiving it - via :func:`archive_skill` after the session completes. - """ - # Strip leading BOM + whitespace the API is strict about; don't touch - # the rest of the body so checksum/fitness stays stable. 
- normalized = skill_md.lstrip("\ufeff \t\r\n") - if not normalized.startswith("---"): - raise ValueError( - "upload_skill: skill_md does not start with YAML frontmatter (---) " - "after stripping BOM/whitespace — refusing to call the API" - ) - folder = _extract_skill_name_from_md(normalized) or name - resp = await client.beta.skills.create( - display_title=name, - files=[ - ( - f"{folder}/SKILL.md", - normalized.encode("utf-8"), - "text/markdown", - ) - ], - betas=[SKILLS_BETA], - ) - return resp.id - - -_SKILL_NAME_RE = re.compile(r"^name:\s*(?P<name>[^\s\n]+)\s*$", re.MULTILINE) - - -def _extract_skill_name_from_md(skill_md: str) -> str | None: - """Pull the ``name`` field out of a SKILL.md's YAML frontmatter. - - Robust to variations in YAML formatting — uses a simple regex against - the raw text instead of parsing YAML, because the API's matching is - string-literal so we want exactly what's in the file. Returns None - if no name field is found. - """ - if not skill_md.startswith("---"): - return None - try: - _, fm_block, _ = skill_md.split("---", 2) - except ValueError: - return None - match = _SKILL_NAME_RE.search(fm_block) - if not match: - return None - return match.group("name").strip() - - -async def archive_skill(client: AsyncAnthropic, skill_id: str) -> None: - """Tear down a custom skill via the 3-step delete dance. - - Steps: - 1. ``versions.list(skill_id)`` — paginator over version objects - 2. ``versions.delete(version=ver_str, skill_id=skill_id)`` for each - 3. ``skills.delete(skill_id)`` - - **Anthropic built-in skills are protected**: we never list or delete - a skill we did not upload. The caller is responsible for passing - only ``skill_id``s that came from :func:`upload_skill`. As a - belt-and-suspenders, we re-fetch the skill via ``retrieve`` and - refuse to proceed if its ``source`` is ``anthropic``. - - Best-effort: any error in the dance is raised so the caller can log - a leak in the ``leaked_skills`` table. 
Use :func:`archive_skill_safe` - if you want a swallow-and-log variant. - """ - # Built-in guard - try: - existing = await client.beta.skills.retrieve(skill_id, betas=[SKILLS_BETA]) - source = getattr(existing, "source", None) - if source == ANTHROPIC_SKILL_SOURCE: - raise PermissionError( - f"refusing to archive Anthropic built-in skill {skill_id} " - f"(source={source!r})" - ) - except PermissionError: - raise - except Exception: # noqa: BLE001 - # If retrieve fails (skill already gone? auth issue?), proceed — - # the delete dance will surface a clearer error if there's a - # real problem. - pass - - # Step 1+2: enumerate and delete versions - versions_page = await client.beta.skills.versions.list( - skill_id, betas=[SKILLS_BETA] - ) - async for version in versions_page: - ver = getattr(version, "version", None) - if ver is None and hasattr(version, "model_dump"): - ver = version.model_dump().get("version") - if ver is None: - continue - await client.beta.skills.versions.delete( - version=str(ver), - skill_id=skill_id, - betas=[SKILLS_BETA], - ) - - # Step 3: delete the skill itself - await client.beta.skills.delete(skill_id, betas=[SKILLS_BETA]) - - -async def archive_skill_safe( - client: AsyncAnthropic, - skill_id: str, -) -> tuple[bool, str | None]: - """Swallow-and-log variant. Returns ``(success, error_message)``.""" - try: - await archive_skill(client, skill_id) - return True, None - except Exception as exc: # noqa: BLE001 - return False, f"{exc.__class__.__name__}: {str(exc)[:300]}" - - -# --------------------------------------------------------------------------- -# Agent lifecycle -# --------------------------------------------------------------------------- - - -async def create_competitor_agent( - client: AsyncAnthropic, - *, - name: str, - model: str, - system_prompt: str, - skill_id: str | None = None, -) -> str: - """Create a Managed Agent for one competitor run. 
- - The agent is configured with the standard ``agent_toolset_20260401`` - (bash/edit/read/write/glob/grep/web_fetch/web_search) and an optional - custom skill linked via the ``skills`` field. - - The Advisor Strategy (``advisor_20260301``) is intentionally NOT - wired here — Step 0 confirmed it's not yet supported in the SDK or - on our beta access. When it lands, add a second tool entry behind a - ``COMPETITOR_ADVISOR`` flag. - """ - kwargs: dict[str, Any] = { - "name": name, - "model": model, - "system": system_prompt, - "tools": [{"type": "agent_toolset_20260401"}], - "betas": [MANAGED_AGENTS_BETA], - } - if skill_id is not None: - # BetaManagedAgentsCustomSkillParams shape: - # {"skill_id": str, "type": "custom", "version": Optional[str]} - # Empirical errors during the e2e smoke caught two prior shape - # mistakes: type="skill" (must be "custom"), id=... (must be - # skill_id=...). Both surfaced as 400 invalid_request_error. - kwargs["skills"] = [{"skill_id": skill_id, "type": "custom"}] - resp = await client.beta.agents.create(**kwargs) - return resp.id - - -async def archive_agent(client: AsyncAnthropic, agent_id: str) -> None: - """Best-effort agent teardown.""" - with contextlib.suppress(Exception): - await client.beta.agents.archive(agent_id, betas=[MANAGED_AGENTS_BETA]) - - -# --------------------------------------------------------------------------- -# Session lifecycle -# --------------------------------------------------------------------------- - - -async def create_session( - client: AsyncAnthropic, - *, - agent_id: str, - environment_id: str, - title: str | None = None, -) -> str: - """Create a session and return its id.""" - kwargs: dict[str, Any] = { - "agent": agent_id, - "environment_id": environment_id, - "betas": [MANAGED_AGENTS_BETA], - } - if title is not None: - kwargs["title"] = title - resp = await client.beta.sessions.create(**kwargs) - return resp.id - - -async def archive_session(client: AsyncAnthropic, session_id: str) -> None: - 
"""Best-effort session teardown.""" - with contextlib.suppress(Exception): - await client.beta.sessions.archive( - session_id, betas=[MANAGED_AGENTS_BETA] - ) - - -async def send_user_message( - client: AsyncAnthropic, - session_id: str, - text: str, -) -> None: - """Send a single ``user.message`` event into a session.""" - await client.beta.sessions.events.send( - session_id, - events=[ - { - "type": "user.message", - "content": [{"type": "text", "text": text}], - } - ], - betas=[MANAGED_AGENTS_BETA], - ) - - -# --------------------------------------------------------------------------- -# Event polling — replaces the broken events.stream() -# --------------------------------------------------------------------------- - - -async def iter_session_events( - client: AsyncAnthropic, - session_id: str, - *, - deadline_seconds: float = 300.0, - poll_interval: float = 2.0, - page_limit: int = 100, -) -> AsyncIterator[dict]: - """Yield session events as plain dicts until ``session.status_idle`` arrives. - - Polls ``beta.sessions.events.list(order="asc")`` every ``poll_interval`` - seconds. Yields each new event exactly once (deduped by ``id``). - Stops on the first ``session.status_idle`` event OR when - ``deadline_seconds`` elapses. - - Why polling instead of ``events.stream()``: the SDK's stream wrapper - routes through the Anthropic Messages API SSE decoder, which only - recognizes Messages event names and silently filters out every - Managed Agents event type. ``events.list()`` returns structured - ``BetaManagedAgentsSessionEvent`` objects directly. See PLAN-V1.2 - §"Step 0 empirical findings" for the full investigation. 
- """ - deadline = time.monotonic() + deadline_seconds - seen_ids: set[str] = set() - idle_seen = False - - while time.monotonic() < deadline and not idle_seen: - page = await client.beta.sessions.events.list( - session_id, - limit=page_limit, - order="asc", - betas=[MANAGED_AGENTS_BETA], - ) - async for ev in page: - ev_id = getattr(ev, "id", None) - if ev_id is None or ev_id in seen_ids: - continue - seen_ids.add(ev_id) - d = ev.model_dump() if hasattr(ev, "model_dump") else dict(ev) - yield d - if d.get("type") == "session.status_idle": - idle_seen = True - break - - if idle_seen: - return - await asyncio.sleep(poll_interval) - - -# --------------------------------------------------------------------------- -# Event parsing helpers -# --------------------------------------------------------------------------- - - -def extract_written_files(events: list[dict]) -> dict[str, str]: - """Reconstruct ``output_files`` from a session's event stream. - - Strategy: - 1. Walk all ``agent.tool_use`` events with ``name == "write"``. - The ``input.file_path`` and ``input.content`` keys are present - and complete (verified in Step 0). - 2. Walk all ``agent.tool_use`` events with ``name == "bash"``. - Parse the ``input.command`` for common file-write idioms: - heredoc redirects (``cat > path << EOF ... EOF``), simple - redirects (``echo "..." > path``), ``tee path <<<``, - ``printf "..." > path``. Best-effort — bash output is opaque - and the command may use shell expansion that we can't safely - eval. - - All paths are normalized to RELATIVE form: leading slashes are - stripped (the agent typically writes to absolute paths inside its - cloud sandbox, but L1's deterministic runner consumes relative - paths under a temp dir). The smoke test caught this — writing to - ``/output/solution.py`` in the cloud became ``Path('/') / '/output'`` - on the local FS and crashed L1's mkdir with a read-only filesystem - error. 
- - Later writes to the same path overwrite earlier ones (last-write-wins). - Files written via the ``edit`` tool are NOT captured here — that - tool produces a patch event, not a content event. v1.3 follow-up. - """ - out: dict[str, str] = {} - - for ev in events: - if ev.get("type") != "agent.tool_use": - continue - name = ev.get("name", "") - inp = ev.get("input") or {} - if not isinstance(inp, dict): - continue - - if name == "write": - path = inp.get("file_path") - content = inp.get("content") - if isinstance(path, str) and isinstance(content, str): - out[_normalize_output_path(path)] = content - - elif name == "bash": - cmd = inp.get("command") - if not isinstance(cmd, str): - continue - for path, content in _parse_bash_writes(cmd): - out[_normalize_output_path(path)] = content - - return out - - -def _normalize_output_path(path: str) -> str: - """Strip leading slashes so the path is relative for L1 consumption. - - Also collapses ``./`` prefixes and any leading whitespace. The result - is always safe to pass to ``Path(tmp_dir) / normalized_path`` without - accidentally jumping out of the temp dir via an absolute path or a - parent traversal. - """ - p = path.strip().lstrip("/") - while p.startswith("./"): - p = p[2:] - return p - - -_HEREDOC_RE = re.compile( - # cat redirects stdout to a file (`cat > path`); tee takes the path as a - # positional arg (`tee path`). Make the `>` optional so both work. - r"(?:cat|tee)\s*(?:-[a-z]+\s*)*(?:>\s*)?(?P<path>['\"]?\S+['\"]?)\s*" - r"<<\s*['\"]?(?P<delim>\w+)['\"]?\n(?P<body>.*?)\n(?P=delim)\s*$", - re.DOTALL | re.MULTILINE, -) -_SIMPLE_REDIRECT_RE = re.compile( - r"echo\s+(?P<content>['\"][^'\"]*['\"]|\S+)\s*>\s*(?P<path>['\"]?\S+['\"]?)" -) - - -def _parse_bash_writes(command: str) -> list[tuple[str, str]]: - """Best-effort parser for shell file-write idioms in a bash command string. - - Recognizes: - - ``cat > path << EOF ... EOF`` and ``cat > path << 'EOF' ... EOF`` - - ``tee path << EOF ... 
EOF`` - - ``echo "content" > path`` - - Returns a list of ``(path, content)`` tuples. Strips quoting from - paths. Returns an empty list if nothing recognizable matches. - """ - results: list[tuple[str, str]] = [] - - for match in _HEREDOC_RE.finditer(command): - path = match.group("path").strip().strip("'\"") - body = match.group("body") - results.append((path, body)) - - for match in _SIMPLE_REDIRECT_RE.finditer(command): - path = match.group("path").strip().strip("'\"") - content = match.group("content").strip().strip("'\"") - results.append((path, content)) - - return results - - -def compute_token_usage(events: list[dict]) -> dict[str, int]: - """Sum token usage across all ``span.model_request_end`` events. - - Returns a dict with ``input``, ``output``, ``cache_creation_input``, - ``cache_read_input``, and ``n_requests`` keys. Missing fields default - to 0. Field paths verified in Step 0: - ``event.model_usage.{input_tokens, output_tokens, - cache_creation_input_tokens, cache_read_input_tokens}``. - """ - totals = { - "input": 0, - "output": 0, - "cache_creation_input": 0, - "cache_read_input": 0, - "n_requests": 0, - } - for ev in events: - if ev.get("type") != "span.model_request_end": - continue - usage = ev.get("model_usage") or {} - if not isinstance(usage, dict): - continue - totals["input"] += int(usage.get("input_tokens") or 0) - totals["output"] += int(usage.get("output_tokens") or 0) - totals["cache_creation_input"] += int(usage.get("cache_creation_input_tokens") or 0) - totals["cache_read_input"] += int(usage.get("cache_read_input_tokens") or 0) - totals["n_requests"] += 1 - return totals - - -def compute_session_runtime_hours(events: list[dict]) -> float: - """Return ``(idle_time - running_time)`` in hours, or 0.0 if either is missing. - - Used to compute the session-runtime line item in - ``CompetitionResult.cost_breakdown`` — multiply the result by - :data:`SESSION_RUNTIME_USD_PER_HOUR` (``$0.08``) for USD. 
- """ - running_at: datetime | None = None - idle_at: datetime | None = None - - for ev in events: - etype = ev.get("type") - ts_raw = ev.get("processed_at") - if ts_raw is None: - continue - try: - if isinstance(ts_raw, datetime): - ts = ts_raw - else: - ts = datetime.fromisoformat(str(ts_raw).replace("Z", "+00:00")) - except (ValueError, TypeError): - continue - if etype == "session.status_running" and running_at is None: - running_at = ts - elif etype == "session.status_idle": - idle_at = ts - - if running_at is None or idle_at is None: - return 0.0 - delta = (idle_at - running_at).total_seconds() - if delta < 0: - return 0.0 - return delta / 3600.0 - - -def session_was_skill_loaded(events: list[dict], skill_id: str | None) -> bool: - """Return True if any event indicates the agent loaded the custom skill. - - For now, this is a heuristic: if the session was created with a - ``skill_id`` AND the agent emitted at least one tool_use after - ``session.status_running``, we consider the skill "loaded" (the - agent had access and chose to use tools). Refine in v1.3 once - Anthropic exposes a ``skill_load`` or equivalent event. - - Returns False if ``skill_id`` is None (no skill was attached). - """ - if skill_id is None: - return False - seen_running = False - for ev in events: - etype = ev.get("type") - if etype == "session.status_running": - seen_running = True - elif seen_running and etype == "agent.tool_use": - return True - return False diff --git a/skillforge/agents/managed_agents/__init__.py b/skillforge/agents/managed_agents/__init__.py new file mode 100644 index 0000000..ab42d30 --- /dev/null +++ b/skillforge/agents/managed_agents/__init__.py @@ -0,0 +1,97 @@ +"""Thin typed wrapper around the Anthropic Managed Agents + Skills beta APIs. 
+ +Hides SDK quirks discovered during the Step 0 smoke test +(``scripts/smoke_skill_upload.py``): + +- Skill uploads must place ``SKILL.md`` inside a top-level folder that + matches the frontmatter ``name:`` field; a bare filename returns 400. +- ``beta.skills.delete()`` does NOT auto-clean versions — the 3-step + dance is required (``versions.list`` → ``versions.delete`` per + version → ``skills.delete``). +- Anthropic ships built-in skills (xlsx/pptx/pdf/docx) with + ``source="anthropic"``. Cleanup must NEVER attempt to delete them — + the guard is enforced in ``skills.archive_skill``. +- ``beta.sessions.events.stream()`` is unusable: the SDK routes it + through the Messages API SSE decoder which silently filters out every + Managed Agents event type. ``sessions.iter_session_events`` polls + ``events.list(order="asc")`` instead. +- Tool name in ``agent_toolset_20260401`` is ``write`` (not + ``write_file``). Input shape: ``{"file_path": str, "content": str}``. + Bash tool input is ``{"command": str}``. +- Token usage path: ``event.model_usage.input_tokens`` / + ``.output_tokens`` / ``.cache_creation_input_tokens`` / + ``.cache_read_input_tokens`` on ``span.model_request_end`` events. +- Session runtime cost = (``status_idle.processed_at`` - + ``status_running.processed_at``) hours × $0.08. + +This package is the ONLY place that imports beta resource paths from +the ``anthropic`` SDK. ``competitor_managed`` and the engine consume +only the wrapper's typed return values. + +Public surface is re-exported here so import sites keep reading +``from skillforge.agents import managed_agents`` and calling +``managed_agents.upload_skill(...)`` etc. 
+""" + +from __future__ import annotations + +from skillforge.agents.managed_agents._constants import ( + ANTHROPIC_SKILL_SOURCE, + MANAGED_AGENTS_BETA, + SESSION_RUNTIME_USD_PER_HOUR, + SKILLS_BETA, + make_client, +) +from skillforge.agents.managed_agents.agents import archive_agent, create_competitor_agent +from skillforge.agents.managed_agents.environments import ( + archive_environment, + create_environment, +) +from skillforge.agents.managed_agents.output import ( + _normalize_output_path, + compute_session_runtime_hours, + compute_token_usage, + extract_written_files, + session_was_skill_loaded, +) +from skillforge.agents.managed_agents.sessions import ( + archive_session, + create_session, + iter_session_events, + send_user_message, +) +from skillforge.agents.managed_agents.skills import ( + _extract_skill_name_from_md, + archive_skill, + archive_skill_safe, + upload_skill, +) + +__all__ = [ + # Constants + client + "ANTHROPIC_SKILL_SOURCE", + "MANAGED_AGENTS_BETA", + "SESSION_RUNTIME_USD_PER_HOUR", + "SKILLS_BETA", + "make_client", + # Environments + "create_environment", + "archive_environment", + # Skills + "upload_skill", + "archive_skill", + "archive_skill_safe", + # Agents + "create_competitor_agent", + "archive_agent", + # Sessions + "create_session", + "archive_session", + "send_user_message", + "iter_session_events", + # Output introspection + "extract_written_files", + "compute_token_usage", + "compute_session_runtime_hours", + "session_was_skill_loaded", +] diff --git a/skillforge/agents/managed_agents/_constants.py b/skillforge/agents/managed_agents/_constants.py new file mode 100644 index 0000000..23c0b61 --- /dev/null +++ b/skillforge/agents/managed_agents/_constants.py @@ -0,0 +1,35 @@ +"""Pinned beta headers, static constants, and the shared client factory. + +The constants are called out as a plan-edit event — any version bump +to ``MANAGED_AGENTS_BETA`` / ``SKILLS_BETA`` should land with a journal +entry explaining the upgrade. 
+""" + +from __future__ import annotations + +from anthropic import AsyncAnthropic + +from skillforge.config import ANTHROPIC_API_KEY + +# Pinned per PLAN-V1.2 architectural decision #6. Treat any version +# bump as a plan-edit event, not a silent dependency update. +MANAGED_AGENTS_BETA: str = "managed-agents-2026-04-01" +SKILLS_BETA: str = "skills-2025-10-02" + +# Built-in skill source — never delete. Confirmed via Step 0 inspection +# of the four pre-existing Anthropic skills (xlsx/pptx/pdf/docx) on the org. +ANTHROPIC_SKILL_SOURCE = "anthropic" + +# $0.08 per session-hour metered while status == running. Mirrors the +# constant in skillforge.config; duplicated here so this module can be +# imported standalone without pulling the whole config tree. +SESSION_RUNTIME_USD_PER_HOUR = 0.08 + + +def make_client(timeout: float = 600.0) -> AsyncAnthropic: + """Construct an AsyncAnthropic client wired to skillforge config. + + The caller is responsible for closing the client (``await client.close()``) + or using it as an async context manager. 
+ """ + return AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=timeout) diff --git a/skillforge/agents/managed_agents/agents.py b/skillforge/agents/managed_agents/agents.py new file mode 100644 index 0000000..7babc47 --- /dev/null +++ b/skillforge/agents/managed_agents/agents.py @@ -0,0 +1,59 @@ +"""Competitor agent lifecycle — create / archive beta agents.""" + +from __future__ import annotations + +import contextlib +from typing import Any + +from anthropic import AsyncAnthropic + +from skillforge.agents.managed_agents._constants import MANAGED_AGENTS_BETA + +# --------------------------------------------------------------------------- +# Agent lifecycle +# --------------------------------------------------------------------------- + + +async def create_competitor_agent( + client: AsyncAnthropic, + *, + name: str, + model: str, + system_prompt: str, + skill_id: str | None = None, +) -> str: + """Create a Managed Agent for one competitor run. + + The agent is configured with the standard ``agent_toolset_20260401`` + (bash/edit/read/write/glob/grep/web_fetch/web_search) and an optional + custom skill linked via the ``skills`` field. + + The Advisor Strategy (``advisor_20260301``) is intentionally NOT + wired here — Step 0 confirmed it's not yet supported in the SDK or + on our beta access. When it lands, add a second tool entry behind a + ``COMPETITOR_ADVISOR`` flag. + """ + kwargs: dict[str, Any] = { + "name": name, + "model": model, + "system": system_prompt, + "tools": [{"type": "agent_toolset_20260401"}], + "betas": [MANAGED_AGENTS_BETA], + } + if skill_id is not None: + # BetaManagedAgentsCustomSkillParams shape: + # {"skill_id": str, "type": "custom", "version": Optional[str]} + # Empirical errors during the e2e smoke caught two prior shape + # mistakes: type="skill" (must be "custom"), id=... (must be + # skill_id=...). Both surfaced as 400 invalid_request_error. 
+ kwargs["skills"] = [{"skill_id": skill_id, "type": "custom"}] + resp = await client.beta.agents.create(**kwargs) + return resp.id + + +async def archive_agent(client: AsyncAnthropic, agent_id: str) -> None: + """Best-effort agent teardown.""" + with contextlib.suppress(Exception): + await client.beta.agents.archive(agent_id, betas=[MANAGED_AGENTS_BETA]) + + diff --git a/skillforge/agents/managed_agents/environments.py b/skillforge/agents/managed_agents/environments.py new file mode 100644 index 0000000..19b7671 --- /dev/null +++ b/skillforge/agents/managed_agents/environments.py @@ -0,0 +1,55 @@ +"""Environment lifecycle — create / archive per-run Managed Agents environments.""" + +from __future__ import annotations + +import contextlib + +from anthropic import AsyncAnthropic + +from skillforge.agents.managed_agents._constants import MANAGED_AGENTS_BETA + +# --------------------------------------------------------------------------- +# Environment lifecycle +# --------------------------------------------------------------------------- + + +async def create_environment( + client: AsyncAnthropic, + *, + run_id: str, + packages: list[str] | None = None, +) -> str: + """Create a cloud environment with the given pip packages pre-installed. + + Returns the environment id. The caller stores it on the EvolutionRun + and reuses it across all competitor sessions in that run. + """ + pkg_list = packages if packages is not None else ["pytest", "ruff"] + resp = await client.beta.environments.create( + name=f"sf-run-{run_id[:12]}", + config={ + "type": "cloud", + "packages": { + "type": "packages", + "pip": pkg_list, + }, + }, + betas=[MANAGED_AGENTS_BETA], + ) + return resp.id + + +async def archive_environment(client: AsyncAnthropic, environment_id: str) -> None: + """Best-effort environment teardown. Logs and swallows errors. + + Cleanup must never block. 
The ``leaked_environments`` counterpart + would go here if we needed bookkeeping; for now we accept the + leak — environments are cheap and Anthropic GCs them. + """ + with contextlib.suppress(Exception): + await client.beta.environments.archive( + environment_id, + betas=[MANAGED_AGENTS_BETA], + ) + + diff --git a/skillforge/agents/managed_agents/output.py b/skillforge/agents/managed_agents/output.py new file mode 100644 index 0000000..4157cc5 --- /dev/null +++ b/skillforge/agents/managed_agents/output.py @@ -0,0 +1,208 @@ +"""Post-run event-stream introspection. + +Pulls the written-file map out of trace events, parses bash +``cat <<'EOF' > path`` writes, and computes token usage + session +runtime cost. All pure functions — no network, no mutation. +""" + +from __future__ import annotations + +import re +from datetime import datetime + +# --------------------------------------------------------------------------- +# Event parsing helpers +# --------------------------------------------------------------------------- + + +def extract_written_files(events: list[dict]) -> dict[str, str]: + """Reconstruct ``output_files`` from a session's event stream. + + Strategy: + 1. Walk all ``agent.tool_use`` events with ``name == "write"``. + The ``input.file_path`` and ``input.content`` keys are present + and complete (verified in Step 0). + 2. Walk all ``agent.tool_use`` events with ``name == "bash"``. + Parse the ``input.command`` for common file-write idioms: + heredoc redirects (``cat > path << EOF ... EOF``), simple + redirects (``echo "..." > path``), ``tee path <<<``, + ``printf "..." > path``. Best-effort — bash output is opaque + and the command may use shell expansion that we can't safely + eval. + + All paths are normalized to RELATIVE form: leading slashes are + stripped (the agent typically writes to absolute paths inside its + cloud sandbox, but L1's deterministic runner consumes relative + paths under a temp dir). 
The smoke test caught this — writing to + ``/output/solution.py`` in the cloud became ``Path('/') / '/output'`` + on the local FS and crashed L1's mkdir with a read-only filesystem + error. + + Later writes to the same path overwrite earlier ones (last-write-wins). + Files written via the ``edit`` tool are NOT captured here — that + tool produces a patch event, not a content event. v1.3 follow-up. + """ + out: dict[str, str] = {} + + for ev in events: + if ev.get("type") != "agent.tool_use": + continue + name = ev.get("name", "") + inp = ev.get("input") or {} + if not isinstance(inp, dict): + continue + + if name == "write": + path = inp.get("file_path") + content = inp.get("content") + if isinstance(path, str) and isinstance(content, str): + out[_normalize_output_path(path)] = content + + elif name == "bash": + cmd = inp.get("command") + if not isinstance(cmd, str): + continue + for path, content in _parse_bash_writes(cmd): + out[_normalize_output_path(path)] = content + + return out + + +def _normalize_output_path(path: str) -> str: + """Strip leading slashes so the path is relative for L1 consumption. + + Also collapses ``./`` prefixes and any leading whitespace. The result + is always safe to pass to ``Path(tmp_dir) / normalized_path`` without + accidentally jumping out of the temp dir via an absolute path or a + parent traversal. + """ + p = path.strip().lstrip("/") + while p.startswith("./"): + p = p[2:] + return p + + +_HEREDOC_RE = re.compile( + # cat redirects stdout to a file (`cat > path`); tee takes the path as a + # positional arg (`tee path`). Make the `>` optional so both work. 
+    r"(?:cat|tee)\s*(?:-[a-z]+\s*)*(?:>\s*)?(?P<path>['\"]?\S+['\"]?)\s*"
+    r"<<\s*['\"]?(?P<delim>\w+)['\"]?\n(?P<body>.*?)\n(?P=delim)\s*$",
+    re.DOTALL | re.MULTILINE,
+)
+_SIMPLE_REDIRECT_RE = re.compile(
+    r"echo\s+(?P<content>['\"][^'\"]*['\"]|\S+)\s*>\s*(?P<path>['\"]?\S+['\"]?)"
+)
+
+
+def _parse_bash_writes(command: str) -> list[tuple[str, str]]:
+    """Best-effort parser for shell file-write idioms in a bash command string.
+
+    Recognizes:
+    - ``cat > path << EOF ... EOF`` and ``cat > path << 'EOF' ... EOF``
+    - ``tee path << EOF ... EOF``
+    - ``echo "content" > path``
+
+    Returns a list of ``(path, content)`` tuples. Strips quoting from
+    paths. Returns an empty list if nothing recognizable matches.
+    """
+    results: list[tuple[str, str]] = []
+
+    for match in _HEREDOC_RE.finditer(command):
+        path = match.group("path").strip().strip("'\"")
+        body = match.group("body")
+        results.append((path, body))
+
+    for match in _SIMPLE_REDIRECT_RE.finditer(command):
+        path = match.group("path").strip().strip("'\"")
+        content = match.group("content").strip().strip("'\"")
+        results.append((path, content))
+
+    return results
+
+
+def compute_token_usage(events: list[dict]) -> dict[str, int]:
+    """Sum token usage across all ``span.model_request_end`` events.
+
+    Returns a dict with ``input``, ``output``, ``cache_creation_input``,
+    ``cache_read_input``, and ``n_requests`` keys. Missing fields default
+    to 0. Field paths verified in Step 0:
+    ``event.model_usage.{input_tokens, output_tokens,
+    cache_creation_input_tokens, cache_read_input_tokens}``.
+ """ + totals = { + "input": 0, + "output": 0, + "cache_creation_input": 0, + "cache_read_input": 0, + "n_requests": 0, + } + for ev in events: + if ev.get("type") != "span.model_request_end": + continue + usage = ev.get("model_usage") or {} + if not isinstance(usage, dict): + continue + totals["input"] += int(usage.get("input_tokens") or 0) + totals["output"] += int(usage.get("output_tokens") or 0) + totals["cache_creation_input"] += int(usage.get("cache_creation_input_tokens") or 0) + totals["cache_read_input"] += int(usage.get("cache_read_input_tokens") or 0) + totals["n_requests"] += 1 + return totals + + +def compute_session_runtime_hours(events: list[dict]) -> float: + """Return ``(idle_time - running_time)`` in hours, or 0.0 if either is missing. + + Used to compute the session-runtime line item in + ``CompetitionResult.cost_breakdown`` — multiply the result by + :data:`SESSION_RUNTIME_USD_PER_HOUR` (``$0.08``) for USD. + """ + running_at: datetime | None = None + idle_at: datetime | None = None + + for ev in events: + etype = ev.get("type") + ts_raw = ev.get("processed_at") + if ts_raw is None: + continue + try: + if isinstance(ts_raw, datetime): + ts = ts_raw + else: + ts = datetime.fromisoformat(str(ts_raw).replace("Z", "+00:00")) + except (ValueError, TypeError): + continue + if etype == "session.status_running" and running_at is None: + running_at = ts + elif etype == "session.status_idle": + idle_at = ts + + if running_at is None or idle_at is None: + return 0.0 + delta = (idle_at - running_at).total_seconds() + if delta < 0: + return 0.0 + return delta / 3600.0 + + +def session_was_skill_loaded(events: list[dict], skill_id: str | None) -> bool: + """Return True if any event indicates the agent loaded the custom skill. 
+ + For now, this is a heuristic: if the session was created with a + ``skill_id`` AND the agent emitted at least one tool_use after + ``session.status_running``, we consider the skill "loaded" (the + agent had access and chose to use tools). Refine in v1.3 once + Anthropic exposes a ``skill_load`` or equivalent event. + + Returns False if ``skill_id`` is None (no skill was attached). + """ + if skill_id is None: + return False + seen_running = False + for ev in events: + etype = ev.get("type") + if etype == "session.status_running": + seen_running = True + elif seen_running and etype == "agent.tool_use": + return True + return False diff --git a/skillforge/agents/managed_agents/sessions.py b/skillforge/agents/managed_agents/sessions.py new file mode 100644 index 0000000..682c8ae --- /dev/null +++ b/skillforge/agents/managed_agents/sessions.py @@ -0,0 +1,124 @@ +"""Session lifecycle + event iteration + user-message dispatch. + +Poll-based iteration intentionally avoids ``events.stream()`` — the SDK +routes that through the Messages API SSE decoder which silently filters +out every Managed Agents event type. See the package docstring for the +smoke-test findings. 
+""" + +from __future__ import annotations + +import asyncio +import contextlib +import time +from collections.abc import AsyncIterator +from typing import Any + +from anthropic import AsyncAnthropic + +from skillforge.agents.managed_agents._constants import MANAGED_AGENTS_BETA + +# --------------------------------------------------------------------------- +# Session lifecycle +# --------------------------------------------------------------------------- + + +async def create_session( + client: AsyncAnthropic, + *, + agent_id: str, + environment_id: str, + title: str | None = None, +) -> str: + """Create a session and return its id.""" + kwargs: dict[str, Any] = { + "agent": agent_id, + "environment_id": environment_id, + "betas": [MANAGED_AGENTS_BETA], + } + if title is not None: + kwargs["title"] = title + resp = await client.beta.sessions.create(**kwargs) + return resp.id + + +async def archive_session(client: AsyncAnthropic, session_id: str) -> None: + """Best-effort session teardown.""" + with contextlib.suppress(Exception): + await client.beta.sessions.archive( + session_id, betas=[MANAGED_AGENTS_BETA] + ) + + +async def send_user_message( + client: AsyncAnthropic, + session_id: str, + text: str, +) -> None: + """Send a single ``user.message`` event into a session.""" + await client.beta.sessions.events.send( + session_id, + events=[ + { + "type": "user.message", + "content": [{"type": "text", "text": text}], + } + ], + betas=[MANAGED_AGENTS_BETA], + ) + + +# --------------------------------------------------------------------------- +# Event polling — replaces the broken events.stream() +# --------------------------------------------------------------------------- + + +async def iter_session_events( + client: AsyncAnthropic, + session_id: str, + *, + deadline_seconds: float = 300.0, + poll_interval: float = 2.0, + page_limit: int = 100, +) -> AsyncIterator[dict]: + """Yield session events as plain dicts until ``session.status_idle`` arrives. 
+ + Polls ``beta.sessions.events.list(order="asc")`` every ``poll_interval`` + seconds. Yields each new event exactly once (deduped by ``id``). + Stops on the first ``session.status_idle`` event OR when + ``deadline_seconds`` elapses. + + Why polling instead of ``events.stream()``: the SDK's stream wrapper + routes through the Anthropic Messages API SSE decoder, which only + recognizes Messages event names and silently filters out every + Managed Agents event type. ``events.list()`` returns structured + ``BetaManagedAgentsSessionEvent`` objects directly. See PLAN-V1.2 + §"Step 0 empirical findings" for the full investigation. + """ + deadline = time.monotonic() + deadline_seconds + seen_ids: set[str] = set() + idle_seen = False + + while time.monotonic() < deadline and not idle_seen: + page = await client.beta.sessions.events.list( + session_id, + limit=page_limit, + order="asc", + betas=[MANAGED_AGENTS_BETA], + ) + async for ev in page: + ev_id = getattr(ev, "id", None) + if ev_id is None or ev_id in seen_ids: + continue + seen_ids.add(ev_id) + d = ev.model_dump() if hasattr(ev, "model_dump") else dict(ev) + yield d + if d.get("type") == "session.status_idle": + idle_seen = True + break + + if idle_seen: + return + await asyncio.sleep(poll_interval) + + diff --git a/skillforge/agents/managed_agents/skills.py b/skillforge/agents/managed_agents/skills.py new file mode 100644 index 0000000..e9d6d9d --- /dev/null +++ b/skillforge/agents/managed_agents/skills.py @@ -0,0 +1,163 @@ +"""Skill lifecycle — upload, archive, and archive-safe helpers. + +All the SDK quirks called out in the package docstring (folder name +matching frontmatter, 3-step delete dance, never-delete-Anthropic-skills +guard, BOM normalization) live here. 
+""" + +from __future__ import annotations + +import re + +from anthropic import AsyncAnthropic + +from skillforge.agents.managed_agents._constants import ANTHROPIC_SKILL_SOURCE, SKILLS_BETA + +# --------------------------------------------------------------------------- +# Skill upload + 3-step delete dance +# --------------------------------------------------------------------------- + + +async def upload_skill( + client: AsyncAnthropic, + *, + name: str, + skill_md: str, +) -> str: + """Upload a SKILL.md as a versioned org-level custom skill. + + Two empirical constraints from Step 0: + + 1. The file must live inside a top-level folder — passing a bare + ``SKILL.md`` filename returns ``400 SKILL.md file must be exactly + in the top-level folder.`` + 2. **The folder name must MATCH the ``name:`` field in the SKILL.md + frontmatter** — surfaced during the live end-to-end smoke. The + ``name`` argument to this function is therefore IGNORED for the + folder/upload — we always extract the actual frontmatter name and + use that. The ``name`` arg is still used as the ``display_title`` + (which can be anything human-readable). + + The Anthropic Skills API hard-requires the payload to start literally + with ``---``. A UTF-8 BOM or stray leading whitespace — which neither + our structural validator nor JSON round-tripping strips — is enough + to earn a ``400 SKILL.md must start with YAML frontmatter (---)``. + We normalize here so the ~1% of model outputs with a leading BOM or + whitespace still upload cleanly instead of falling back to inline. + + Returns the new ``skill_id``. The caller is responsible for archiving it + via :func:`archive_skill` after the session completes. + """ + # Strip leading BOM + whitespace the API is strict about; don't touch + # the rest of the body so checksum/fitness stays stable. 
+    normalized = skill_md.lstrip("\ufeff \t\r\n")
+    if not normalized.startswith("---"):
+        raise ValueError(
+            "upload_skill: skill_md does not start with YAML frontmatter (---) "
+            "after stripping BOM/whitespace — refusing to call the API"
+        )
+    folder = _extract_skill_name_from_md(normalized) or name
+    resp = await client.beta.skills.create(
+        display_title=name,
+        files=[
+            (
+                f"{folder}/SKILL.md",
+                normalized.encode("utf-8"),
+                "text/markdown",
+            )
+        ],
+        betas=[SKILLS_BETA],
+    )
+    return resp.id
+
+
+_SKILL_NAME_RE = re.compile(r"^name:\s*(?P<name>[^\s\n]+)\s*$", re.MULTILINE)
+
+
+def _extract_skill_name_from_md(skill_md: str) -> str | None:
+    """Pull the ``name`` field out of a SKILL.md's YAML frontmatter.
+
+    Robust to variations in YAML formatting — uses a simple regex against
+    the raw text instead of parsing YAML, because the API's matching is
+    string-literal so we want exactly what's in the file. Returns None
+    if no name field is found.
+    """
+    if not skill_md.startswith("---"):
+        return None
+    try:
+        _, fm_block, _ = skill_md.split("---", 2)
+    except ValueError:
+        return None
+    match = _SKILL_NAME_RE.search(fm_block)
+    if not match:
+        return None
+    return match.group("name").strip()
+
+
+async def archive_skill(client: AsyncAnthropic, skill_id: str) -> None:
+    """Tear down a custom skill via the 3-step delete dance.
+
+    Steps:
+      1. ``versions.list(skill_id)`` — paginator over version objects
+      2. ``versions.delete(version=ver_str, skill_id=skill_id)`` for each
+      3. ``skills.delete(skill_id)``
+
+    **Anthropic built-in skills are protected**: we never list or delete
+    a skill we did not upload. The caller is responsible for passing
+    only ``skill_id``s that came from :func:`upload_skill`. As a
+    belt-and-suspenders, we re-fetch the skill via ``retrieve`` and
+    refuse to proceed if its ``source`` is ``anthropic``.
+
+    Best-effort: any error in the dance is raised so the caller can log
+    a leak in the ``leaked_skills`` table.
Use :func:`archive_skill_safe` + if you want a swallow-and-log variant. + """ + # Built-in guard + try: + existing = await client.beta.skills.retrieve(skill_id, betas=[SKILLS_BETA]) + source = getattr(existing, "source", None) + if source == ANTHROPIC_SKILL_SOURCE: + raise PermissionError( + f"refusing to archive Anthropic built-in skill {skill_id} " + f"(source={source!r})" + ) + except PermissionError: + raise + except Exception: # noqa: BLE001 + # If retrieve fails (skill already gone? auth issue?), proceed — + # the delete dance will surface a clearer error if there's a + # real problem. + pass + + # Step 1+2: enumerate and delete versions + versions_page = await client.beta.skills.versions.list( + skill_id, betas=[SKILLS_BETA] + ) + async for version in versions_page: + ver = getattr(version, "version", None) + if ver is None and hasattr(version, "model_dump"): + ver = version.model_dump().get("version") + if ver is None: + continue + await client.beta.skills.versions.delete( + version=str(ver), + skill_id=skill_id, + betas=[SKILLS_BETA], + ) + + # Step 3: delete the skill itself + await client.beta.skills.delete(skill_id, betas=[SKILLS_BETA]) + + +async def archive_skill_safe( + client: AsyncAnthropic, + skill_id: str, +) -> tuple[bool, str | None]: + """Swallow-and-log variant. 
Returns ``(success, error_message)``.""" + try: + await archive_skill(client, skill_id) + return True, None + except Exception as exc: # noqa: BLE001 + return False, f"{exc.__class__.__name__}: {str(exc)[:300]}" + + From a6edb81572fbfbd537a3f85ac22a12baee79ad07 Mon Sep 17 00:00:00 2001 From: "Matt (via Claude Code)" Date: Mon, 20 Apr 2026 01:26:15 -0500 Subject: [PATCH 2/4] refactor: split variant_evolution.py (620 LOC) into a package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Decomposed by orchestration level — the mini-evolution loop, the assembly step, and the top-level run entry each live in their own file: variant_evolution/__init__.py barrel + re-exports run_variant_evolution variant_evolution/_helpers.py constants + _tier_sort_key + _aggregate_fitness variant_evolution/dimension.py _run_dimension_mini_evolution (challenge -> spawn -> compete -> score -> judge -> breed -> pick winner) variant_evolution/assembly.py _real_assembly (Engineer call + integration check) variant_evolution/main.py run_variant_evolution orchestrator Largest submodule is dimension.py at 345 LOC, under the 500-LOC ceiling in docs/clean-code.md §2. Prior to this split, the monolith held a single 311-LOC function (_run_dimension_mini_evolution) alongside the assembly logic and the main loop — the file was 620 LOC and every refactor touched everything. Test-access surface preserved: tests/test_variant_evolution.py imports _aggregate_fitness and _tier_sort_key directly from the package, so the __init__ re-exports them. Also rolled in: _extract_skill_name_from_md and _normalize_output_path added to the managed_agents package __all__ (they were already re-exported for test access, just needed the __all__ entry to satisfy F401). QA: ruff + mypy + 411 pytest all green. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- skillforge/agents/managed_agents/__init__.py | 3 + .../engine/variant_evolution/__init__.py | 33 ++ .../engine/variant_evolution/_helpers.py | 31 ++ .../engine/variant_evolution/assembly.py | 81 +++++ .../dimension.py} | 297 +----------------- skillforge/engine/variant_evolution/main.py | 174 ++++++++++ 6 files changed, 333 insertions(+), 286 deletions(-) create mode 100644 skillforge/engine/variant_evolution/__init__.py create mode 100644 skillforge/engine/variant_evolution/_helpers.py create mode 100644 skillforge/engine/variant_evolution/assembly.py rename skillforge/engine/{variant_evolution.py => variant_evolution/dimension.py} (53%) create mode 100644 skillforge/engine/variant_evolution/main.py diff --git a/skillforge/agents/managed_agents/__init__.py b/skillforge/agents/managed_agents/__init__.py index ab42d30..258b1d7 100644 --- a/skillforge/agents/managed_agents/__init__.py +++ b/skillforge/agents/managed_agents/__init__.py @@ -94,4 +94,7 @@ "compute_token_usage", "compute_session_runtime_hours", "session_was_skill_loaded", + # Private helpers re-exported for test access + "_extract_skill_name_from_md", + "_normalize_output_path", ] diff --git a/skillforge/engine/variant_evolution/__init__.py b/skillforge/engine/variant_evolution/__init__.py new file mode 100644 index 0000000..f48f9b9 --- /dev/null +++ b/skillforge/engine/variant_evolution/__init__.py @@ -0,0 +1,33 @@ +"""Variant evolution orchestrator (v2.0 Wave 3-1). + +Atomic-mode entry point. When ``run.evolution_mode == "atomic"`` the +parent ``run_evolution`` dispatcher delegates to ``run_variant_evolution`` +(re-exported below). The orchestrator runs one mini-evolution per +variant dimension recorded against the parent run, then calls the +Engineer to assemble the winners into a composite skill. 
+ +Submodule layout: + + _helpers.py shared constants + small pure helpers + dimension.py per-dimension mini-evolution (challenge -> spawn -> + compete -> score -> judge -> breed -> pick winner) + assembly.py composite assembly via the Engineer + main.py top-level run_variant_evolution orchestrator + +The mini-evolutions reuse existing helpers (Spawner, Competitor, +judging pipeline) directly rather than recursing into ``run_evolution`` +itself — recursion would force a second event loop and complicate the +parent run's event stream. +""" + +from __future__ import annotations + +from skillforge.engine.variant_evolution._helpers import _aggregate_fitness, _tier_sort_key +from skillforge.engine.variant_evolution.main import run_variant_evolution + +__all__ = [ + "run_variant_evolution", + # Private helpers re-exported for test access. + "_aggregate_fitness", + "_tier_sort_key", +] diff --git a/skillforge/engine/variant_evolution/_helpers.py b/skillforge/engine/variant_evolution/_helpers.py new file mode 100644 index 0000000..89533f4 --- /dev/null +++ b/skillforge/engine/variant_evolution/_helpers.py @@ -0,0 +1,31 @@ +"""Shared helpers + defaults for variant_evolution orchestration.""" + +from __future__ import annotations + +from skillforge.models import SkillGenome, VariantEvolution + +# Atomic-mode defaults — small populations because the per-dimension +# challenge is narrow. Wave 1 of Phase 3 kept gen=1 (no breeding loop yet); +# post-v2.0 item 4 bumped the default to 2 so the default produces one +# round of breeding after gen 0. Existing VariantEvolution rows with +# ``num_generations=1`` still work — the loop collapses to a single pass. 
+DEFAULT_VARIANT_POP = 2 +DEFAULT_VARIANT_GENS = 2 +DEFAULT_VARIANT_CONCURRENCY = 3 + + +def _tier_sort_key(ve: VariantEvolution) -> tuple[int, str]: + """Sort foundation dimensions before capability dimensions.""" + order = {"foundation": 0, "capability": 1} + return (order.get(ve.tier, 99), ve.dimension) + + +def _aggregate_fitness(skill: SkillGenome) -> float: + """Compute a single fitness number for ranking variants.""" + if skill.pareto_objectives: + vals = list(skill.pareto_objectives.values()) + return sum(vals) / max(1, len(vals)) + if skill.deterministic_scores: + vals = list(skill.deterministic_scores.values()) + return sum(vals) / max(1, len(vals)) + return 0.0 diff --git a/skillforge/engine/variant_evolution/assembly.py b/skillforge/engine/variant_evolution/assembly.py new file mode 100644 index 0000000..6cb343a --- /dev/null +++ b/skillforge/engine/variant_evolution/assembly.py @@ -0,0 +1,81 @@ +"""Composite assembly — call the Engineer to merge winning variants.""" + +from __future__ import annotations + +import logging + +from skillforge.engine.events import emit +from skillforge.engine.variant_evolution._helpers import _aggregate_fitness +from skillforge.models import SkillGenome +from skillforge.models.run import EvolutionRun + +logger = logging.getLogger("skillforge.engine.variant_evolution.assembly") + + +async def _real_assembly( + run: EvolutionRun, + foundation_winner: SkillGenome | None, + capability_winners: list[SkillGenome], +) -> SkillGenome: + """Phase 4 real assembly — invoke the Engineer agent + integration test. + + Falls back to a "use the highest-fitness winner as-is" path when no + foundation variant exists (some atomic decompositions only have + capability dimensions). Otherwise runs the full Engineer flow: + weave → validate → optionally refine → persist composite. 
+ """ + if foundation_winner is None: + if not capability_winners: + raise RuntimeError("assembly: no winners to assemble from") + # Edge case: no foundation tier in this decomposition. Use the + # highest-fitness capability as the de facto skeleton and emit + # a stub assembly_complete. Wave 4 polish will extend the + # Engineer to handle capability-only assemblies. + await emit( + run.id, + "assembly_started", + capability_count=len(capability_winners), + mode="capability_only_fallback", + ) + composite = max(capability_winners, key=_aggregate_fitness) + await emit( + run.id, + "assembly_complete", + composite_skill_id=composite.id, + capability_count=len(capability_winners), + integration_passed=True, + mode="capability_only_fallback", + ) + return composite + + # Resolve the family for the Engineer call + from skillforge.db.queries import get_family + + family = await get_family(run.family_id) if run.family_id else None + if family is None: + # Defensive fallback — synthesize a minimal SkillFamily so the + # Engineer call still has metadata to work with. The orchestrator + # logs a warning but doesn't block. 
+ from skillforge.models import SkillFamily + + logger.warning( + "run=%s atomic assembly: no family found for family_id=%s; " + "using a synthesized SkillFamily for the Engineer call", + run.id[:8], + run.family_id, + ) + family = SkillFamily( + id=run.family_id or "fam_unknown", + slug="composite", + label="Composite", + specialization=run.specialization, + ) + + from skillforge.engine.assembly import assemble_skill + + composite, _report = await assemble_skill( + run, family, foundation_winner, capability_winners + ) + return composite + + diff --git a/skillforge/engine/variant_evolution.py b/skillforge/engine/variant_evolution/dimension.py similarity index 53% rename from skillforge/engine/variant_evolution.py rename to skillforge/engine/variant_evolution/dimension.py index 4972e41..b574e87 100644 --- a/skillforge/engine/variant_evolution.py +++ b/skillforge/engine/variant_evolution/dimension.py @@ -1,42 +1,9 @@ -"""Variant evolution orchestrator (v2.0 Wave 3-1). - -Atomic-mode entry point. When ``run.evolution_mode == "atomic"`` the parent -``run_evolution`` dispatcher delegates here. The orchestrator runs one -mini-evolution per variant dimension recorded against the parent run, then -calls a stub assembly step that returns the winning foundation as the -composite skill (Phase 4 will replace the stub with the real Engineer). - -Per-dimension flow: - - 1. Read all ``variant_evolutions`` rows for ``run.id``, sorted so - foundation dimensions come before capability dimensions. - 2. For each dimension, run a tiny mini-evolution: - a. Mark the row ``status="running"``, emit - ``variant_evolution_started``. - b. Design ONE focused challenge via - ``challenge_designer.design_variant_challenge``. - c. Spawn ``population_size`` variants via - ``spawner.spawn_variant_gen0`` — capability variants receive the - winning foundation as grounding context. - d. Run each spawned variant through the Competitor against the - single focused challenge. - e. 
Run the judging pipeline against the gathered results. - f. Pick the highest-fitness variant as the winner. Persist it as a - ``Variant`` row tied back to the family + the - ``VariantEvolution`` id. - g. Mark the ``VariantEvolution`` row ``status="complete"`` with - ``winner_variant_id`` and ``completed_at``. Emit - ``variant_evolution_complete``. - 3. After every dimension is done, call the assembly stub. - The Phase 4 Engineer will replace this stub. - 4. Set ``run.best_skill`` to the assembled composite, persist, and let - the parent ``run_evolution`` finalize. - -The mini-evolutions reuse existing helpers (Spawner, Competitor, judging -pipeline) directly rather than recursing into ``run_evolution`` itself. -Recursion would force a second event loop and complicate the parent run's -event stream — direct helper calls keep the event order deterministic and -the wall-clock budget bounded. +"""Per-dimension mini-evolution. + +Takes one ``VariantEvolution`` row, runs the full small-scale pipeline +(challenge design → spawn → compete → score → judge → breed → pick +winner), and returns the winning Variant + its genome. Called once per +dimension by ``main.run_variant_evolution``. """ from __future__ import annotations @@ -47,7 +14,6 @@ from datetime import UTC, datetime from skillforge.db.queries import ( - get_variant_evolutions_for_run, get_variants_for_family, save_challenge, save_genome, @@ -56,43 +22,14 @@ save_variant_evolution, ) from skillforge.engine.events import emit -from skillforge.models import ( - Generation, - SkillGenome, - Variant, - VariantEvolution, +from skillforge.engine.variant_evolution._helpers import ( + DEFAULT_VARIANT_CONCURRENCY, + _aggregate_fitness, ) +from skillforge.models import Generation, SkillGenome, Variant, VariantEvolution from skillforge.models.run import EvolutionRun -logger = logging.getLogger("skillforge.engine.variant_evolution") - -# Atomic-mode defaults — small populations because the per-dimension -# challenge is narrow. 
Wave 1 of Phase 3 keeps gen=1 (no breeding loop yet); -# Wave 4 will introduce per-dimension breeding. -DEFAULT_VARIANT_POP = 2 -# Post-v2.0 item 4: multi-generation breeding loops are now supported inside -# _run_dimension_mini_evolution. Bumped to 2 so the default produces one -# round of breeding after gen 0. Existing VariantEvolution rows with -# ``num_generations=1`` still work — the loop collapses to a single pass. -DEFAULT_VARIANT_GENS = 2 -DEFAULT_VARIANT_CONCURRENCY = 3 - - -def _tier_sort_key(ve: VariantEvolution) -> tuple[int, str]: - """Sort foundation dimensions before capability dimensions.""" - order = {"foundation": 0, "capability": 1} - return (order.get(ve.tier, 99), ve.dimension) - - -def _aggregate_fitness(skill: SkillGenome) -> float: - """Compute a single fitness number for ranking variants.""" - if skill.pareto_objectives: - vals = list(skill.pareto_objectives.values()) - return sum(vals) / max(1, len(vals)) - if skill.deterministic_scores: - vals = list(skill.deterministic_scores.values()) - return sum(vals) / max(1, len(vals)) - return 0.0 +logger = logging.getLogger("skillforge.engine.variant_evolution.dimension") async def _run_dimension_mini_evolution( @@ -406,215 +343,3 @@ async def _run_dimension_mini_evolution( return variant, winner_genome -async def _real_assembly( - run: EvolutionRun, - foundation_winner: SkillGenome | None, - capability_winners: list[SkillGenome], -) -> SkillGenome: - """Phase 4 real assembly — invoke the Engineer agent + integration test. - - Falls back to a "use the highest-fitness winner as-is" path when no - foundation variant exists (some atomic decompositions only have - capability dimensions). Otherwise runs the full Engineer flow: - weave → validate → optionally refine → persist composite. - """ - if foundation_winner is None: - if not capability_winners: - raise RuntimeError("assembly: no winners to assemble from") - # Edge case: no foundation tier in this decomposition. 
Use the - # highest-fitness capability as the de facto skeleton and emit - # a stub assembly_complete. Wave 4 polish will extend the - # Engineer to handle capability-only assemblies. - await emit( - run.id, - "assembly_started", - capability_count=len(capability_winners), - mode="capability_only_fallback", - ) - composite = max(capability_winners, key=_aggregate_fitness) - await emit( - run.id, - "assembly_complete", - composite_skill_id=composite.id, - capability_count=len(capability_winners), - integration_passed=True, - mode="capability_only_fallback", - ) - return composite - - # Resolve the family for the Engineer call - from skillforge.db.queries import get_family - - family = await get_family(run.family_id) if run.family_id else None - if family is None: - # Defensive fallback — synthesize a minimal SkillFamily so the - # Engineer call still has metadata to work with. The orchestrator - # logs a warning but doesn't block. - from skillforge.models import SkillFamily - - logger.warning( - "run=%s atomic assembly: no family found for family_id=%s; " - "using a synthesized SkillFamily for the Engineer call", - run.id[:8], - run.family_id, - ) - family = SkillFamily( - id=run.family_id or "fam_unknown", - slug="composite", - label="Composite", - specialization=run.specialization, - ) - - from skillforge.engine.assembly import assemble_skill - - composite, _report = await assemble_skill( - run, family, foundation_winner, capability_winners - ) - return composite - - -async def run_variant_evolution(run: EvolutionRun) -> EvolutionRun: - """Top-level atomic-mode orchestrator. - - Reads the ``variant_evolutions`` rows for ``run.id``, processes each - dimension in tier order, and stamps ``run.best_skill`` with the - assembled composite. Falls back to molecular mode and logs a warning - if no dimensions are recorded against the run (defensive — the - Taxonomist should always create them at submission time for atomic). 
- """ - all_rows = await get_variant_evolutions_for_run(run.id) - - # Filter to rows that actually need work. Rows already in a terminal - # state (complete/failed) from prior runs must NOT be re-processed — - # that was causing 4x API spend on re-runs because the live test's - # hardcoded run_id accumulates stale rows across test invocations. - # "running" is included because a previous crash may have left a row - # stuck mid-processing; we let the orchestrator retry it. - pending = [ - v for v in all_rows if v.status not in {"complete", "failed"} - ] - skipped = len(all_rows) - len(pending) - if skipped: - logger.info( - "run=%s atomic mode: skipping %d terminal variant_evolutions " - "(%d pending)", - run.id[:8], - skipped, - len(pending), - ) - - if not pending: - logger.warning( - "run=%s atomic mode requested but no pending variant_evolutions; " - "falling back to molecular pipeline", - run.id[:8], - ) - # Caller (run_evolution dispatcher) will handle the fallback - run.evolution_mode = "molecular" - return run - - pending.sort(key=_tier_sort_key) - foundation_winner: SkillGenome | None = None - capability_winners: list[SkillGenome] = [] - - # --- Managed Agents environment (shared across all dimensions) --- - from skillforge.config import COMPETITOR_BACKEND - - env_id: str | None = None - managed_client = None - if COMPETITOR_BACKEND == "managed": - from skillforge.agents import managed_agents - - try: - logger.info("run=%s creating managed environment...", run.id[:8]) - managed_client = managed_agents.make_client() - env_id = await managed_agents.create_environment( - managed_client, run_id=run.id - ) - logger.info("run=%s managed environment ready: %s", run.id[:8], env_id) - await emit(run.id, "managed_environment_ready", environment_id=env_id) - except Exception as exc: # noqa: BLE001 — managed-env boundary: any SDK failure must be captured - logger.exception("run=%s managed environment creation failed", run.id[:8]) - run.status = "failed" - 
run.failure_reason = f"managed environment creation failed: {exc}" - await save_run(run) - return run - - logger.info( - "run=%s atomic mode: %d variant_evolutions queued", - run.id[:8], - len(pending), - ) - - try: - for vevo in pending: - # Apply default population size if the row was created without one - if vevo.population_size <= 0: - vevo.population_size = DEFAULT_VARIANT_POP - if vevo.num_generations <= 0: - vevo.num_generations = DEFAULT_VARIANT_GENS - - vevo.status = "running" - await save_variant_evolution(vevo) - await emit( - run.id, - "variant_evolution_started", - variant_evolution_id=vevo.id, - dimension=vevo.dimension, - tier=vevo.tier, - population_size=vevo.population_size, - ) - - try: - _variant, winner_genome = await _run_dimension_mini_evolution( - run=run, - vevo=vevo, - foundation_winner=foundation_winner, - env_id=env_id, - ) - except Exception as exc: # noqa: BLE001 — one bad dimension must not crash the whole atomic run - logger.exception( - "run=%s dimension %s mini-evolution failed", - run.id[:8], - vevo.dimension, - ) - vevo.status = "failed" - await save_variant_evolution(vevo) - await emit( - run.id, - "variant_evolution_complete", - variant_evolution_id=vevo.id, - dimension=vevo.dimension, - tier=vevo.tier, - status="failed", - error=str(exc), - ) - raise - - await emit( - run.id, - "variant_evolution_complete", - variant_evolution_id=vevo.id, - dimension=vevo.dimension, - tier=vevo.tier, - winner_variant_id=vevo.winner_variant_id, - status="complete", - ) - - if vevo.tier == "foundation": - foundation_winner = winner_genome - else: - capability_winners.append(winner_genome) - - composite = await _real_assembly(run, foundation_winner, capability_winners) - run.best_skill = composite - finally: - # Tear down managed environment - if env_id is not None and managed_client is not None: - try: - from skillforge.agents import managed_agents as _ma - await _ma.archive_environment(managed_client, env_id) - logger.info("run=%s managed 
environment archived: %s", run.id[:8], env_id) - except Exception: # noqa: BLE001 - logger.warning("run=%s managed environment cleanup failed", run.id[:8]) - return run diff --git a/skillforge/engine/variant_evolution/main.py b/skillforge/engine/variant_evolution/main.py new file mode 100644 index 0000000..96ddac3 --- /dev/null +++ b/skillforge/engine/variant_evolution/main.py @@ -0,0 +1,174 @@ +"""Top-level atomic-mode orchestrator. + +Reads the run's variant_evolutions rows, runs one mini-evolution per +dimension, then assembles the winners via the Engineer. +""" + +from __future__ import annotations + +import logging + +from skillforge.db.queries import ( + get_variant_evolutions_for_run, + save_run, + save_variant_evolution, +) +from skillforge.engine.events import emit +from skillforge.engine.variant_evolution._helpers import ( + DEFAULT_VARIANT_GENS, + DEFAULT_VARIANT_POP, + _tier_sort_key, +) +from skillforge.engine.variant_evolution.assembly import _real_assembly +from skillforge.engine.variant_evolution.dimension import _run_dimension_mini_evolution +from skillforge.models import SkillGenome +from skillforge.models.run import EvolutionRun + +logger = logging.getLogger("skillforge.engine.variant_evolution") + + +async def run_variant_evolution(run: EvolutionRun) -> EvolutionRun: + """Top-level atomic-mode orchestrator. + + Reads the ``variant_evolutions`` rows for ``run.id``, processes each + dimension in tier order, and stamps ``run.best_skill`` with the + assembled composite. Falls back to molecular mode and logs a warning + if no dimensions are recorded against the run (defensive — the + Taxonomist should always create them at submission time for atomic). + """ + all_rows = await get_variant_evolutions_for_run(run.id) + + # Filter to rows that actually need work. 
Rows already in a terminal + # state (complete/failed) from prior runs must NOT be re-processed — + # that was causing 4x API spend on re-runs because the live test's + # hardcoded run_id accumulates stale rows across test invocations. + # "running" is included because a previous crash may have left a row + # stuck mid-processing; we let the orchestrator retry it. + pending = [ + v for v in all_rows if v.status not in {"complete", "failed"} + ] + skipped = len(all_rows) - len(pending) + if skipped: + logger.info( + "run=%s atomic mode: skipping %d terminal variant_evolutions " + "(%d pending)", + run.id[:8], + skipped, + len(pending), + ) + + if not pending: + logger.warning( + "run=%s atomic mode requested but no pending variant_evolutions; " + "falling back to molecular pipeline", + run.id[:8], + ) + # Caller (run_evolution dispatcher) will handle the fallback + run.evolution_mode = "molecular" + return run + + pending.sort(key=_tier_sort_key) + foundation_winner: SkillGenome | None = None + capability_winners: list[SkillGenome] = [] + + # --- Managed Agents environment (shared across all dimensions) --- + from skillforge.config import COMPETITOR_BACKEND + + env_id: str | None = None + managed_client = None + if COMPETITOR_BACKEND == "managed": + from skillforge.agents import managed_agents + + try: + logger.info("run=%s creating managed environment...", run.id[:8]) + managed_client = managed_agents.make_client() + env_id = await managed_agents.create_environment( + managed_client, run_id=run.id + ) + logger.info("run=%s managed environment ready: %s", run.id[:8], env_id) + await emit(run.id, "managed_environment_ready", environment_id=env_id) + except Exception as exc: # noqa: BLE001 — managed-env boundary: any SDK failure must be captured + logger.exception("run=%s managed environment creation failed", run.id[:8]) + run.status = "failed" + run.failure_reason = f"managed environment creation failed: {exc}" + await save_run(run) + return run + + logger.info( + 
"run=%s atomic mode: %d variant_evolutions queued", + run.id[:8], + len(pending), + ) + + try: + for vevo in pending: + # Apply default population size if the row was created without one + if vevo.population_size <= 0: + vevo.population_size = DEFAULT_VARIANT_POP + if vevo.num_generations <= 0: + vevo.num_generations = DEFAULT_VARIANT_GENS + + vevo.status = "running" + await save_variant_evolution(vevo) + await emit( + run.id, + "variant_evolution_started", + variant_evolution_id=vevo.id, + dimension=vevo.dimension, + tier=vevo.tier, + population_size=vevo.population_size, + ) + + try: + _variant, winner_genome = await _run_dimension_mini_evolution( + run=run, + vevo=vevo, + foundation_winner=foundation_winner, + env_id=env_id, + ) + except Exception as exc: # noqa: BLE001 — one bad dimension must not crash the whole atomic run + logger.exception( + "run=%s dimension %s mini-evolution failed", + run.id[:8], + vevo.dimension, + ) + vevo.status = "failed" + await save_variant_evolution(vevo) + await emit( + run.id, + "variant_evolution_complete", + variant_evolution_id=vevo.id, + dimension=vevo.dimension, + tier=vevo.tier, + status="failed", + error=str(exc), + ) + raise + + await emit( + run.id, + "variant_evolution_complete", + variant_evolution_id=vevo.id, + dimension=vevo.dimension, + tier=vevo.tier, + winner_variant_id=vevo.winner_variant_id, + status="complete", + ) + + if vevo.tier == "foundation": + foundation_winner = winner_genome + else: + capability_winners.append(winner_genome) + + composite = await _real_assembly(run, foundation_winner, capability_winners) + run.best_skill = composite + finally: + # Tear down managed environment + if env_id is not None and managed_client is not None: + try: + from skillforge.agents import managed_agents as _ma + await _ma.archive_environment(managed_client, env_id) + logger.info("run=%s managed environment archived: %s", run.id[:8], env_id) + except Exception: # noqa: BLE001 + logger.warning("run=%s managed environment 
cleanup failed", run.id[:8]) + return run From be09bd985984f9b71c1faa181bcaa5c3da2e8f6b Mon Sep 17 00:00:00 2001 From: "Matt (via Claude Code)" Date: Mon, 20 Apr 2026 01:45:35 -0500 Subject: [PATCH 3/4] refactor: split breeder.py (629 LOC) into a package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Decomposed by responsibility. The six section comments in the monolith ("slot allocation", "ranking", "main breed", "mutation prompts", "lessons + reports", "bible publishing") each correspond to a submodule: breeder/__init__.py barrel — re-exports breed + compute_slots + rank_skills + publish_findings_to_bible, plus legacy aliases (breed_next_gen, spawn_gen0, BIBLE_DIR) that tests patch on the package root breeder/_ranking.py compute_slots + rank_skills + _aggregate_fitness breeder/_prompts.py diagnostic + crossover instruction templates + breeding-context formatter (pure) breeder/_reports.py _extract_lessons_and_report + siblings (LLM calls) breeder/main.py breed() + _carry_elite (orchestrator) breeder/bible.py publish_findings_to_bible (disk I/O) Largest submodule is _reports.py at 213 LOC, under the 500-LOC Python ceiling in docs/clean-code.md §2. Test-patch compatibility ------------------------ Tests patch three functions on the package root: ``breeder.breed_next_gen``, ``breeder.spawn_gen0``, ``breeder._extract_lessons_and_report``. Those patches don't propagate to bindings in submodules, so ``main.breed()`` now resolves each through the package namespace at call time (``_pkg().breed_next_gen`` etc.). BIBLE_DIR follows the same pattern in bible.py. QA: ruff + mypy + 411 pytest (unchanged) all green. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- skillforge/agents/breeder.py | 629 -------------------------- skillforge/agents/breeder/__init__.py | 69 +++ skillforge/agents/breeder/_prompts.py | 89 ++++ skillforge/agents/breeder/_ranking.py | 79 ++++ skillforge/agents/breeder/_reports.py | 213 +++++++++ skillforge/agents/breeder/bible.py | 139 ++++++ skillforge/agents/breeder/main.py | 146 ++++++ 7 files changed, 735 insertions(+), 629 deletions(-) delete mode 100644 skillforge/agents/breeder.py create mode 100644 skillforge/agents/breeder/__init__.py create mode 100644 skillforge/agents/breeder/_prompts.py create mode 100644 skillforge/agents/breeder/_ranking.py create mode 100644 skillforge/agents/breeder/_reports.py create mode 100644 skillforge/agents/breeder/bible.py create mode 100644 skillforge/agents/breeder/main.py diff --git a/skillforge/agents/breeder.py b/skillforge/agents/breeder.py deleted file mode 100644 index e2d0aac..0000000 --- a/skillforge/agents/breeder.py +++ /dev/null @@ -1,629 +0,0 @@ -"""Breeder — reflective mutation, multi-parent crossover, learning log, bible publishing. - -Inspired by GEPA's Actionable Side Information: mutations are diagnostic, not -random. The Breeder reads execution traces and trait attribution from the judging -pipeline, identifies root causes of failures, and proposes targeted fixes. - -Responsibilities: -- Elitism: top N Skills survive unchanged (N scales with population size) -- Reflective crossover: combine traits from 2-3 parents guided by attribution -- Diagnostic mutation: fix specific causes surfaced by trait attribution -- Joint component mutation: frontmatter + body + scripts mutate together -- Wildcard: 1+ slots per generation for fresh Skills -- Learning log maintenance: append new lessons each generation -- Bible publishing: extract generalizable findings to ``bible/findings/`` - -Slot allocation scales with ``target_pop_size`` (never hardcoded; see PLAN.md -§Cross-cutting contracts #11). 
-""" - -from __future__ import annotations - -import json -import logging -import re -from datetime import UTC, datetime - -from anthropic import AsyncAnthropic - -from skillforge.agents._llm import stream_text -from skillforge.agents.spawner import breed_next_gen, spawn_gen0 -from skillforge.config import ( - ANTHROPIC_API_KEY, - BIBLE_DIR, - BREEDER_CALL_MODE, - model_for, -) -from skillforge.models import Generation, SkillGenome - -logger = logging.getLogger("skillforge.agents.breeder") - -# --------------------------------------------------------------------------- -# Slot allocation -# --------------------------------------------------------------------------- - - -def compute_slots(target_pop_size: int) -> dict[str, int]: - """Allocate breeding slots as a function of ``target_pop_size``. - - Formula (from PLAN.md §Step 6e Breeder): - - elitism = max(1, target_pop_size // 5 * 2) ~40% floor 1 - wildcards = max(1, target_pop_size // 10) ~10% floor 1 - remainder = target_pop_size - elitism - wildcards - diagnostic = remainder // 2 - crossover = remainder - diagnostic - - Worked examples: - pop_size=3 → elitism=1, wildcards=1, diagnostic=0, crossover=1 (sum 3) - pop_size=5 → elitism=2, wildcards=1, diagnostic=1, crossover=1 (sum 5) - pop_size=10 → elitism=4, wildcards=1, diagnostic=2, crossover=3 (sum 10) - """ - if target_pop_size < 1: - raise ValueError(f"target_pop_size must be >=1, got {target_pop_size}") - - elitism = max(1, (target_pop_size // 5) * 2) - wildcards = max(1, target_pop_size // 10) - - # Ensure elitism + wildcards doesn't exceed target (pathological tiny sizes) - if elitism + wildcards > target_pop_size: - elitism = max(1, target_pop_size - 1) - wildcards = max(0, target_pop_size - elitism) - - remainder = target_pop_size - elitism - wildcards - diagnostic = remainder // 2 - crossover = remainder - diagnostic - - slots = { - "elitism": elitism, - "wildcards": wildcards, - "diagnostic": diagnostic, - "crossover": crossover, - } - assert 
sum(slots.values()) == target_pop_size, ( - f"slot sum {sum(slots.values())} != target {target_pop_size}: {slots}" - ) - return slots - - -# --------------------------------------------------------------------------- -# Ranking -# --------------------------------------------------------------------------- - - -def _aggregate_fitness(skill: SkillGenome) -> float: - """Scalar aggregate of Pareto objectives for ranking (charts/selection). - - The Pareto front is the real answer; this scalar is a summary for - ordering within the front (and for ranking Skills OFF the front). - """ - if not skill.pareto_objectives: - return 0.0 - return sum(skill.pareto_objectives.values()) / len(skill.pareto_objectives) - - -def rank_skills(generation: Generation) -> list[SkillGenome]: - """Return generation.skills sorted by (is_pareto_optimal desc, fitness desc).""" - return sorted( - generation.skills, - key=lambda s: (s.is_pareto_optimal, _aggregate_fitness(s)), - reverse=True, - ) - - -# --------------------------------------------------------------------------- -# Main breed() entry point -# --------------------------------------------------------------------------- - - -async def breed( - generation: Generation, - learning_log: list[str], - specialization: str, - target_pop_size: int, -) -> tuple[list[SkillGenome], list[str], str]: - """Produce the next generation from a ranked current generation. - - Returns ``(next_gen_skills, new_learning_log_entries, breeding_report)``. - - The slot allocation scales with ``target_pop_size`` — see ``compute_slots``. - The function guarantees ``len(next_gen_skills) == target_pop_size``. 
- """ - slots = compute_slots(target_pop_size) - ranked = rank_skills(generation) - - next_gen: list[SkillGenome] = [] - - # --- Elitism: top-N survive unchanged (but bump generations_survived) --- - elites = ranked[: slots["elitism"]] - for elite in elites: - carried = _carry_elite(elite) - next_gen.append(carried) - - # --- Diagnostic mutation: pick low-scoring Skills, ask LLM for targeted fixes --- - low_scorers = ranked[-slots["diagnostic"] :] if slots["diagnostic"] > 0 else [] - diagnostic_instructions = _build_diagnostic_instructions( - low_scorers, learning_log, slots["diagnostic"] - ) - if slots["diagnostic"] > 0 and low_scorers: - try: - diagnostic_children = await breed_next_gen( - parents=low_scorers, - learning_log=learning_log, - breeding_instructions=diagnostic_instructions, - ) - next_gen.extend(diagnostic_children[: slots["diagnostic"]]) - except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed - # Fall through — wildcard slots below absorb the shortfall. 
- logger.exception("breeder.diagnostic_failed") - - # --- Reflective crossover: combine 2-3 Pareto-optimal parents --- - pareto_parents = [s for s in ranked if s.is_pareto_optimal][:3] - if not pareto_parents: - # Fallback: use top 3 by fitness if nobody is Pareto-optimal - pareto_parents = ranked[:3] - - crossover_instructions = _build_crossover_instructions( - pareto_parents, learning_log, slots["crossover"] - ) - if slots["crossover"] > 0 and pareto_parents: - try: - crossover_children = await breed_next_gen( - parents=pareto_parents, - learning_log=learning_log, - breeding_instructions=crossover_instructions, - ) - next_gen.extend(crossover_children[: slots["crossover"]]) - except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed - logger.exception("breeder.crossover_failed") - - # --- Wildcard: fresh Skills via spawn_gen0 --- - if slots["wildcards"] > 0: - try: - wildcards = await spawn_gen0( - specialization=specialization, - pop_size=slots["wildcards"], - ) - # Mark wildcards as mutations on the next generation - next_gen_num = generation.number + 1 - for w in wildcards: - w.generation = next_gen_num - w.mutations = ["wildcard"] - w.mutation_rationale = "Wildcard slot: fresh spawn to prevent convergence" - next_gen.extend(wildcards) - except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed - logger.exception("breeder.wildcard_spawn_failed") - - # --- Trim or pad to exactly target_pop_size --- - next_gen = next_gen[:target_pop_size] - - # If we fell short (any slot failed), pad with elites cloned forward - while len(next_gen) < target_pop_size and ranked: - next_gen.append(_carry_elite(ranked[0])) - - assert len(next_gen) == target_pop_size, ( - f"breeder produced {len(next_gen)} children, expected {target_pop_size}" - ) - - # --- Stamp generation number on everything --- - next_gen_num = generation.number + 1 - for child in next_gen: - child.generation = next_gen_num - - 
# --- Extract new learning log entries + write breeding report --- - new_lessons, breeding_report = await _extract_lessons_and_report( - generation, learning_log, slots, elites, pareto_parents - ) - - return (next_gen, new_lessons, breeding_report) - - -def _carry_elite(skill: SkillGenome) -> SkillGenome: - """Return an elite skill carried forward with bumped metadata.""" - import copy - - carried = copy.deepcopy(skill) - carried.generations_survived += 1 - carried.mutations = ["elitism"] - carried.mutation_rationale = "Elitism: top-ranked parent carried forward unchanged" - # Bump maturity if the skill is surviving well - if carried.generations_survived >= 3 and carried.maturity == "tested": - carried.maturity = "hardened" - elif carried.generations_survived >= 2 and carried.maturity == "draft": - carried.maturity = "tested" - return carried - - -# --------------------------------------------------------------------------- -# Prompt builders -# --------------------------------------------------------------------------- - - -def _build_diagnostic_instructions( - low_scorers: list[SkillGenome], - learning_log: list[str], - n_children: int, -) -> str: - """Build breeding instructions for diagnostic mutation of low scorers.""" - if not low_scorers or n_children == 0: - return "" - - diagnoses = [] - for skill in low_scorers: - worst_traits = sorted( - skill.trait_attribution.items(), - key=lambda kv: kv[1], - )[:3] - trait_notes = "\n".join( - f" - {t}: contribution {c:.2f} — {skill.trait_diagnostics.get(t, 'no diagnosis')}" - for t, c in worst_traits - ) - diagnoses.append( - f" Skill {skill.id[:8]}:\n" - f" aggregate fitness: {_aggregate_fitness(skill):.2f}\n" - f" worst traits:\n{trait_notes}" - ) - - log_section = "\n".join(f" - {entry}" for entry in learning_log[-10:]) - - return ( - f"Produce exactly {n_children} child Skill(s) by DIAGNOSTIC MUTATION of the " - "low-scoring parent(s) below. 
For each child, identify the root cause of " - "the parent's low fitness (from the trait diagnostics), and make a TARGETED " - "fix — rewrite or remove the underperforming instructions, tighten vague " - "phrasing, add concrete examples for ignored rules, or rescope the trait.\n\n" - "Do NOT make random changes. Every mutation must cite a specific parent " - "trait and explain (in mutation_rationale) how the child addresses it.\n\n" - f"Low-scoring parents:\n{chr(10).join(diagnoses)}\n\n" - f"Recent lessons (learning log):\n{log_section or ' (none yet)'}" - ) - - -def _build_crossover_instructions( - parents: list[SkillGenome], - learning_log: list[str], - n_children: int, -) -> str: - """Build instructions for reflective crossover across 2-3 parents.""" - if not parents or n_children == 0: - return "" - - parent_notes = [] - for p in parents: - best_traits = sorted( - p.trait_attribution.items(), - key=lambda kv: kv[1], - reverse=True, - )[:3] - trait_summary = ", ".join(f"{t}:{c:+.2f}" for t, c in best_traits) or "(no attribution)" - parent_notes.append( - f" Parent {p.id[:8]} (fitness {_aggregate_fitness(p):.2f}): " - f"best traits → {trait_summary}" - ) - - log_section = "\n".join(f" - {entry}" for entry in learning_log[-10:]) - - return ( - f"Produce exactly {n_children} child Skill(s) by REFLECTIVE CROSSOVER of the " - f"Pareto-optimal parents below. Combine the HIGH-CONTRIBUTING traits from " - "each parent into each child, preserving the causal mechanism that made " - "each trait successful (not just the surface phrasing).\n\n" - "Crossover is NOT concatenation. 
For each child, explain (in mutation_rationale) " - "which traits from which parents were combined and WHY those particular " - "traits work together.\n\n" - f"Pareto-optimal parents:\n{chr(10).join(parent_notes)}\n\n" - f"Recent lessons (learning log):\n{log_section or ' (none yet)'}" - ) - - -# --------------------------------------------------------------------------- -# Learning log extraction + breeding report -# --------------------------------------------------------------------------- - - -async def _extract_lessons_and_report( - generation: Generation, - learning_log: list[str], - slots: dict[str, int], - elites: list[SkillGenome], - pareto_parents: list[SkillGenome], -) -> tuple[list[str], str]: - """Ask the LLM for (a) new learning log entries and (b) a breeding report. - - Dispatches on ``config.BREEDER_CALL_MODE``: - - "separate" (default): two LLM calls, one for lessons, one for report - - "consolidated" (Flex-3 cost saver): one structured call returning both - """ - context = _build_breeding_context(generation, slots, elites, pareto_parents) - - if BREEDER_CALL_MODE == "consolidated": - return await _extract_consolidated(context, learning_log) - lessons = await _extract_lessons(context, learning_log) - report = await _extract_breeding_report(context, slots, elites, pareto_parents) - return lessons, report - - -def _build_breeding_context( - generation: Generation, - slots: dict[str, int], - elites: list[SkillGenome], - pareto_parents: list[SkillGenome], -) -> str: - """Summarize this generation's results for the Breeder's LLM prompts.""" - elite_section = "\n".join( - f" - {s.id[:8]} fitness={_aggregate_fitness(s):.2f} traits={s.traits[:3]}" - for s in elites - ) or " (none)" - - pareto_section = "\n".join( - f" - {s.id[:8]} fitness={_aggregate_fitness(s):.2f}" - for s in pareto_parents - ) or " (none)" - - # Top 3 trait contributions across all results - all_traits: dict[str, list[float]] = {} - for r in generation.results: - for trait, contrib in 
r.trait_contribution.items(): - all_traits.setdefault(trait, []).append(contrib) - trait_means = sorted( - [(t, sum(vs) / len(vs)) for t, vs in all_traits.items()], - key=lambda kv: kv[1], - reverse=True, - ) - top_traits = "\n".join( - f" - {t}: {m:+.2f} (from trace attribution)" for t, m in trait_means[:5] - ) or " (no trait data)" - - return ( - f"Generation {generation.number} summary:\n" - f" population: {len(generation.skills)}\n" - f" best_fitness: {generation.best_fitness:.3f}\n" - f" avg_fitness: {generation.avg_fitness:.3f}\n" - f" pareto_front_size: {len(generation.pareto_front)}\n" - f"\n" - f"Slot allocation for next gen: {slots}\n" - f"\n" - f"Elites (carrying forward):\n{elite_section}\n" - f"\n" - f"Pareto-optimal parents selected for crossover:\n{pareto_section}\n" - f"\n" - f"Top-contributing traits this generation:\n{top_traits}\n" - ) - - -async def _extract_lessons(context: str, learning_log: list[str]) -> list[str]: - """Single LLM call extracting generalizable lessons as a JSON array.""" - recent_log = "\n".join(f"- {e}" for e in learning_log[-10:]) - - prompt = ( - "You are the Breeder agent for a population-based evolution of Claude Agent Skills. " - "Based on the generation summary below, identify 1-3 NEW generalizable lessons " - "about Skill authoring that this generation revealed. Do NOT repeat lessons from " - "the existing learning log. Lessons should be actionable for future breeding, " - "generic enough to apply across domains, and grounded in the trait attribution data.\n\n" - f"## Generation summary\n{context}\n\n" - f"## Existing learning log (don't repeat these)\n{recent_log or '(empty)'}\n\n" - "## Response format\n" - 'Respond with ONLY a JSON array of 1-3 strings, like ["lesson 1", "lesson 2"]. ' - "No prose before or after." 
- ) - - try: - client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=300.0) - text = await stream_text( - client, - model=model_for("breeder"), - max_tokens=500, - messages=[{"role": "user", "content": prompt}], - ) - except Exception: - # Degrade gracefully — a breeder that blocks on LLM hiccups would - # stall the whole run. The SDK has many concrete error types across - # versions; catching at the boundary keeps the engine moving. - logger.exception("breeder.lesson_extraction_failed") - return ["(lesson extraction failed)"] - - match = re.search(r"\[.*\]", text, re.DOTALL) - if not match: - return [] - try: - lessons = json.loads(match.group(0)) - except json.JSONDecodeError: - return [] - return [str(lesson) for lesson in lessons if isinstance(lesson, str)][:3] - - -async def _extract_breeding_report( - context: str, - slots: dict[str, int], - elites: list[SkillGenome], - pareto_parents: list[SkillGenome], -) -> str: - """Single LLM call producing a human-readable breeding report.""" - prompt = ( - "You are the Breeder agent for SkillForge. Write a 2-paragraph breeding report " - "explaining the decisions for the next generation. Paragraph 1: what this " - "generation revealed about trait fitness and which skills earned elite/Pareto " - "status. Paragraph 2: the strategy for the next generation's diagnostic " - "mutations and crossovers. Be specific, cite skill IDs by their 8-char prefix, " - "and reference trait contributions when they shaped a decision.\n\n" - f"## Generation summary\n{context}\n\n" - "Respond with ONLY the report prose. No headings." - ) - - try: - client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=300.0) - return await stream_text( - client, - model=model_for("breeder"), - max_tokens=800, - messages=[{"role": "user", "content": prompt}], - ) - except Exception: - # Degrade gracefully — see _extract_lessons for rationale. 
- logger.exception("breeder.report_extraction_failed") - return "(breeding report failed)" - - -async def _extract_consolidated( - context: str, - learning_log: list[str], -) -> tuple[list[str], str]: - """Flex-3 cost saver: one LLM call produces both lessons and report as JSON.""" - recent_log = "\n".join(f"- {e}" for e in learning_log[-10:]) - - prompt = ( - "You are the Breeder agent for SkillForge. Given the generation summary below, " - "produce BOTH: (1) 1-3 NEW generalizable lessons about Skill authoring, and " - "(2) a 2-paragraph breeding report explaining the decisions.\n\n" - f"## Generation summary\n{context}\n\n" - f"## Existing learning log (don't repeat)\n{recent_log or '(empty)'}\n\n" - "## Response format\n" - "Respond with ONLY a JSON object matching:\n" - '{\n' - ' "lessons": ["lesson 1", "lesson 2"],\n' - ' "report": "Paragraph 1...\\n\\nParagraph 2..."\n' - '}\n' - "No prose before or after the JSON." - ) - - try: - client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=300.0) - text = await stream_text( - client, - model=model_for("breeder"), - max_tokens=1200, - messages=[{"role": "user", "content": prompt}], - ) - except Exception: - # Degrade gracefully — see _extract_lessons for rationale. 
- logger.exception("breeder.consolidated_extraction_failed") - return (["(consolidated extraction failed)"], "") - - match = re.search(r"\{.*\}", text, re.DOTALL) - if not match: - return ([], "") - try: - raw = json.loads(match.group(0)) - except json.JSONDecodeError: - return ([], "") - - lessons = [str(entry) for entry in raw.get("lessons", []) if isinstance(entry, str)][:3] - report = str(raw.get("report", "")) - return (lessons, report) - - -# --------------------------------------------------------------------------- -# Bible publishing -# --------------------------------------------------------------------------- - - -def publish_findings_to_bible( - new_entries: list[str], - run_id: str, - generation: int, -) -> None: - """Write new learning-log entries as numbered finding files under bible/findings/. - - Each finding gets its own file following the schema in bible/README.md. - Also appends a summary line to bible/evolution-log.md. - - Failures here are logged but never raised — we don't want a bible write - failure to abort an evolution run. 
- """ - if not new_entries: - return - - findings_dir = BIBLE_DIR / "findings" - try: - findings_dir.mkdir(parents=True, exist_ok=True) - except OSError: - logger.exception("bible.findings_dir_mkdir_failed") - return - - # Determine the next finding number by scanning existing files - existing_nums = [] - for f in findings_dir.glob("*.md"): - match = re.match(r"^(\d{3})-", f.name) - if match: - existing_nums.append(int(match.group(1))) - next_num = (max(existing_nums) + 1) if existing_nums else 1 - - timestamp = datetime.now(UTC).strftime("%Y-%m-%d") - - for entry in new_entries: - if not entry or entry.startswith("("): - # Skip error placeholders - continue - slug = _slugify(entry)[:40] - filename = f"{next_num:03d}-{slug}.md" - content = _finding_markdown( - num=next_num, - title=entry, - body=entry, - run_id=run_id, - generation=generation, - timestamp=timestamp, - ) - try: - (findings_dir / filename).write_text(content) - except OSError: - logger.exception("bible.finding_write_failed", extra={"filename": filename}) - continue - next_num += 1 - - # Append to evolution log - log_path = BIBLE_DIR / "evolution-log.md" - try: - if log_path.exists(): - existing = log_path.read_text() - else: - existing = "# Evolution Log\n\n*Chronological log of all SkillForge evolution runs.*\n\n" - entry_line = f"- **{timestamp}** — run `{run_id[:8]}` gen {generation}: {len(new_entries)} new finding(s)\n" - log_path.write_text(existing + entry_line) - except OSError: - logger.exception("bible.evolution_log_write_failed") - - -def _slugify(text: str) -> str: - """Kebab-case a string for use in a filename.""" - slug = re.sub(r"[^a-z0-9]+", "-", text.lower()).strip("-") - return slug or "untitled" - - -def _finding_markdown( - num: int, - title: str, - body: str, - run_id: str, - generation: int, - timestamp: str, -) -> str: - """Render a finding markdown file per bible/README.md schema.""" - short_title = title.split(".")[0][:60] if "." 
in title else title[:60] - return f"""# Finding {num:03d}: {short_title} - -**Discovered**: {timestamp} -**Evolution Run**: {run_id} -**Generation**: {generation} -**Status**: finding - -## Observation - -{body} - -## Evidence - -Automatically extracted from the generation {generation} trait attribution -and trace analysis by the Breeder agent. See run `{run_id}` in the -SkillForge database for the raw scores and traces. - -## Mechanism - -*To be filled in if this finding replicates across 3+ runs and gets -promoted to a pattern.* - -## Recommendation - -*To be filled in upon promotion.* -""" diff --git a/skillforge/agents/breeder/__init__.py b/skillforge/agents/breeder/__init__.py new file mode 100644 index 0000000..c10ce3d --- /dev/null +++ b/skillforge/agents/breeder/__init__.py @@ -0,0 +1,69 @@ +"""Breeder — reflective mutation, multi-parent crossover, learning log, bible publishing. + +Inspired by GEPA's Actionable Side Information: mutations are diagnostic, +not random. The Breeder reads execution traces and trait attribution +from the judging pipeline, identifies root causes of failures, and +proposes targeted fixes. + +Responsibilities: +- Elitism: top N Skills survive unchanged +- Reflective crossover: combine traits from 2-3 parents guided by attribution +- Diagnostic mutation: fix specific causes surfaced by trait attribution +- Joint component mutation: frontmatter + body + scripts mutate together +- Wildcard: 1+ slots per generation for fresh Skills +- Learning log maintenance: append new lessons each generation +- Bible publishing: extract generalizable findings to ``bible/findings/`` + +Slot allocation scales with ``target_pop_size`` (never hardcoded; see +``_ranking.compute_slots`` for the formula). 
+ +Submodule layout: + + _ranking.py compute_slots + rank_skills + _aggregate_fitness (pure) + _prompts.py _build_diagnostic_instructions + _build_crossover_instructions + + _build_breeding_context (pure string-templating) + _reports.py _extract_lessons_and_report + _extract_lessons + + _extract_breeding_report + _extract_consolidated + (LLM-calling; degrades gracefully on SDK errors) + main.py breed() + _carry_elite (top-level orchestrator) + bible.py publish_findings_to_bible (disk I/O, fire-and-forget) +""" + +from __future__ import annotations + +# Re-expose imports the old breeder.py module aliased so test patches +# targeting ``skillforge.agents.breeder.breed_next_gen`` and +# ``skillforge.agents.breeder.BIBLE_DIR`` continue to resolve. +from skillforge.agents.breeder._ranking import ( + _aggregate_fitness, + compute_slots, + rank_skills, +) +from skillforge.agents.breeder._reports import ( + _extract_breeding_report, + _extract_consolidated, + _extract_lessons, + _extract_lessons_and_report, +) +from skillforge.agents.breeder.bible import publish_findings_to_bible +from skillforge.agents.breeder.main import _carry_elite, breed +from skillforge.agents.spawner import breed_next_gen, spawn_gen0 +from skillforge.config import BIBLE_DIR + +__all__ = [ + "breed", + "compute_slots", + "rank_skills", + "publish_findings_to_bible", + # Re-exports for test-patch stability. + "breed_next_gen", + "spawn_gen0", + "BIBLE_DIR", + # Private helpers re-exported for test access. + "_aggregate_fitness", + "_carry_elite", + "_extract_lessons_and_report", + "_extract_lessons", + "_extract_breeding_report", + "_extract_consolidated", +] diff --git a/skillforge/agents/breeder/_prompts.py b/skillforge/agents/breeder/_prompts.py new file mode 100644 index 0000000..8c0204a --- /dev/null +++ b/skillforge/agents/breeder/_prompts.py @@ -0,0 +1,89 @@ +"""Breeding-instruction prompt builders. + +Pure string-templating functions — no LLM calls, no I/O. 
The actual +breeding happens in ``main.breed()`` which feeds these prompts to the +Spawner. +""" + +from __future__ import annotations + +from skillforge.agents.breeder._ranking import _aggregate_fitness +from skillforge.models import SkillGenome + + +def _build_diagnostic_instructions( + low_scorers: list[SkillGenome], + learning_log: list[str], + n_children: int, +) -> str: + """Build breeding instructions for diagnostic mutation of low scorers.""" + if not low_scorers or n_children == 0: + return "" + + diagnoses = [] + for skill in low_scorers: + worst_traits = sorted( + skill.trait_attribution.items(), + key=lambda kv: kv[1], + )[:3] + trait_notes = "\n".join( + f" - {t}: contribution {c:.2f} — {skill.trait_diagnostics.get(t, 'no diagnosis')}" + for t, c in worst_traits + ) + diagnoses.append( + f" Skill {skill.id[:8]}:\n" + f" aggregate fitness: {_aggregate_fitness(skill):.2f}\n" + f" worst traits:\n{trait_notes}" + ) + + log_section = "\n".join(f" - {entry}" for entry in learning_log[-10:]) + + return ( + f"Produce exactly {n_children} child Skill(s) by DIAGNOSTIC MUTATION of the " + "low-scoring parent(s) below. For each child, identify the root cause of " + "the parent's low fitness (from the trait diagnostics), and make a TARGETED " + "fix — rewrite or remove the underperforming instructions, tighten vague " + "phrasing, add concrete examples for ignored rules, or rescope the trait.\n\n" + "Do NOT make random changes. 
Every mutation must cite a specific parent " + "trait and explain (in mutation_rationale) how the child addresses it.\n\n" + f"Low-scoring parents:\n{chr(10).join(diagnoses)}\n\n" + f"Recent lessons (learning log):\n{log_section or ' (none yet)'}" + ) + + +def _build_crossover_instructions( + parents: list[SkillGenome], + learning_log: list[str], + n_children: int, +) -> str: + """Build instructions for reflective crossover across 2-3 parents.""" + if not parents or n_children == 0: + return "" + + parent_notes = [] + for p in parents: + best_traits = sorted( + p.trait_attribution.items(), + key=lambda kv: kv[1], + reverse=True, + )[:3] + trait_summary = ", ".join(f"{t}:{c:+.2f}" for t, c in best_traits) or "(no attribution)" + parent_notes.append( + f" Parent {p.id[:8]} (fitness {_aggregate_fitness(p):.2f}): " + f"best traits → {trait_summary}" + ) + + log_section = "\n".join(f" - {entry}" for entry in learning_log[-10:]) + + return ( + f"Produce exactly {n_children} child Skill(s) by REFLECTIVE CROSSOVER of the " + f"Pareto-optimal parents below. Combine the HIGH-CONTRIBUTING traits from " + "each parent into each child, preserving the causal mechanism that made " + "each trait successful (not just the surface phrasing).\n\n" + "Crossover is NOT concatenation. For each child, explain (in mutation_rationale) " + "which traits from which parents were combined and WHY those particular " + "traits work together.\n\n" + f"Pareto-optimal parents:\n{chr(10).join(parent_notes)}\n\n" + f"Recent lessons (learning log):\n{log_section or ' (none yet)'}" + ) + diff --git a/skillforge/agents/breeder/_ranking.py b/skillforge/agents/breeder/_ranking.py new file mode 100644 index 0000000..2583e6d --- /dev/null +++ b/skillforge/agents/breeder/_ranking.py @@ -0,0 +1,79 @@ +"""Pure ranking helpers — slot allocation + fitness aggregation + sorting. + +No I/O, no LLM calls. 
Used by the main ``breed()`` orchestrator and by +``_build_breeding_context`` when it needs to format a ranked list. +""" + +from __future__ import annotations + +from skillforge.models import Generation, SkillGenome + + +def compute_slots(target_pop_size: int) -> dict[str, int]: + """Allocate breeding slots as a function of ``target_pop_size``. + + Formula (from PLAN.md §Step 6e Breeder): + + elitism = max(1, target_pop_size // 5 * 2) ~40% floor 1 + wildcards = max(1, target_pop_size // 10) ~10% floor 1 + remainder = target_pop_size - elitism - wildcards + diagnostic = remainder // 2 + crossover = remainder - diagnostic + + Worked examples: + pop_size=3 → elitism=1, wildcards=1, diagnostic=0, crossover=1 (sum 3) + pop_size=5 → elitism=2, wildcards=1, diagnostic=1, crossover=1 (sum 5) + pop_size=10 → elitism=4, wildcards=1, diagnostic=2, crossover=3 (sum 10) + """ + if target_pop_size < 1: + raise ValueError(f"target_pop_size must be >=1, got {target_pop_size}") + + elitism = max(1, (target_pop_size // 5) * 2) + wildcards = max(1, target_pop_size // 10) + + # Ensure elitism + wildcards doesn't exceed target (pathological tiny sizes) + if elitism + wildcards > target_pop_size: + elitism = max(1, target_pop_size - 1) + wildcards = max(0, target_pop_size - elitism) + + remainder = target_pop_size - elitism - wildcards + diagnostic = remainder // 2 + crossover = remainder - diagnostic + + slots = { + "elitism": elitism, + "wildcards": wildcards, + "diagnostic": diagnostic, + "crossover": crossover, + } + assert sum(slots.values()) == target_pop_size, ( + f"slot sum {sum(slots.values())} != target {target_pop_size}: {slots}" + ) + return slots + + +# --------------------------------------------------------------------------- +# Ranking +# --------------------------------------------------------------------------- + + +def _aggregate_fitness(skill: SkillGenome) -> float: + """Scalar aggregate of Pareto objectives for ranking (charts/selection). 
+ + The Pareto front is the real answer; this scalar is a summary for + ordering within the front (and for ranking Skills OFF the front). + """ + if not skill.pareto_objectives: + return 0.0 + return sum(skill.pareto_objectives.values()) / len(skill.pareto_objectives) + + +def rank_skills(generation: Generation) -> list[SkillGenome]: + """Return generation.skills sorted by (is_pareto_optimal desc, fitness desc).""" + return sorted( + generation.skills, + key=lambda s: (s.is_pareto_optimal, _aggregate_fitness(s)), + reverse=True, + ) + + diff --git a/skillforge/agents/breeder/_reports.py b/skillforge/agents/breeder/_reports.py new file mode 100644 index 0000000..1e89442 --- /dev/null +++ b/skillforge/agents/breeder/_reports.py @@ -0,0 +1,213 @@ +"""Breeder's reflection step — post-generation lessons + written report. + +Calls the LLM to distill what this generation revealed about trait +fitness and to write a paragraph explaining the breeding decisions. +Degrades gracefully on SDK errors (see docs/clean-code.md §4). +""" + +from __future__ import annotations + +import json +import logging +import re + +from anthropic import AsyncAnthropic + +from skillforge.agents._llm import stream_text +from skillforge.agents.breeder._ranking import _aggregate_fitness +from skillforge.config import ANTHROPIC_API_KEY, BREEDER_CALL_MODE, model_for +from skillforge.models import Generation, SkillGenome + +logger = logging.getLogger("skillforge.agents.breeder.reports") + + +async def _extract_lessons_and_report( + generation: Generation, + learning_log: list[str], + slots: dict[str, int], + elites: list[SkillGenome], + pareto_parents: list[SkillGenome], +) -> tuple[list[str], str]: + """Ask the LLM for (a) new learning log entries and (b) a breeding report. 
+ + Dispatches on ``config.BREEDER_CALL_MODE``: + - "separate" (default): two LLM calls, one for lessons, one for report + - "consolidated" (Flex-3 cost saver): one structured call returning both + """ + context = _build_breeding_context(generation, slots, elites, pareto_parents) + + if BREEDER_CALL_MODE == "consolidated": + return await _extract_consolidated(context, learning_log) + lessons = await _extract_lessons(context, learning_log) + report = await _extract_breeding_report(context, slots, elites, pareto_parents) + return lessons, report + + +def _build_breeding_context( + generation: Generation, + slots: dict[str, int], + elites: list[SkillGenome], + pareto_parents: list[SkillGenome], +) -> str: + """Summarize this generation's results for the Breeder's LLM prompts.""" + elite_section = "\n".join( + f" - {s.id[:8]} fitness={_aggregate_fitness(s):.2f} traits={s.traits[:3]}" + for s in elites + ) or " (none)" + + pareto_section = "\n".join( + f" - {s.id[:8]} fitness={_aggregate_fitness(s):.2f}" + for s in pareto_parents + ) or " (none)" + + # Top 3 trait contributions across all results + all_traits: dict[str, list[float]] = {} + for r in generation.results: + for trait, contrib in r.trait_contribution.items(): + all_traits.setdefault(trait, []).append(contrib) + trait_means = sorted( + [(t, sum(vs) / len(vs)) for t, vs in all_traits.items()], + key=lambda kv: kv[1], + reverse=True, + ) + top_traits = "\n".join( + f" - {t}: {m:+.2f} (from trace attribution)" for t, m in trait_means[:5] + ) or " (no trait data)" + + return ( + f"Generation {generation.number} summary:\n" + f" population: {len(generation.skills)}\n" + f" best_fitness: {generation.best_fitness:.3f}\n" + f" avg_fitness: {generation.avg_fitness:.3f}\n" + f" pareto_front_size: {len(generation.pareto_front)}\n" + f"\n" + f"Slot allocation for next gen: {slots}\n" + f"\n" + f"Elites (carrying forward):\n{elite_section}\n" + f"\n" + f"Pareto-optimal parents selected for crossover:\n{pareto_section}\n" 
+ f"\n" + f"Top-contributing traits this generation:\n{top_traits}\n" + ) + + +async def _extract_lessons(context: str, learning_log: list[str]) -> list[str]: + """Single LLM call extracting generalizable lessons as a JSON array.""" + recent_log = "\n".join(f"- {e}" for e in learning_log[-10:]) + + prompt = ( + "You are the Breeder agent for a population-based evolution of Claude Agent Skills. " + "Based on the generation summary below, identify 1-3 NEW generalizable lessons " + "about Skill authoring that this generation revealed. Do NOT repeat lessons from " + "the existing learning log. Lessons should be actionable for future breeding, " + "generic enough to apply across domains, and grounded in the trait attribution data.\n\n" + f"## Generation summary\n{context}\n\n" + f"## Existing learning log (don't repeat these)\n{recent_log or '(empty)'}\n\n" + "## Response format\n" + 'Respond with ONLY a JSON array of 1-3 strings, like ["lesson 1", "lesson 2"]. ' + "No prose before or after." + ) + + try: + client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=300.0) + text = await stream_text( + client, + model=model_for("breeder"), + max_tokens=500, + messages=[{"role": "user", "content": prompt}], + ) + except Exception: + # Degrade gracefully — a breeder that blocks on LLM hiccups would + # stall the whole run. The SDK has many concrete error types across + # versions; catching at the boundary keeps the engine moving. 
+ logger.exception("breeder.lesson_extraction_failed") + return ["(lesson extraction failed)"] + + match = re.search(r"\[.*\]", text, re.DOTALL) + if not match: + return [] + try: + lessons = json.loads(match.group(0)) + except json.JSONDecodeError: + return [] + return [str(lesson) for lesson in lessons if isinstance(lesson, str)][:3] + + +async def _extract_breeding_report( + context: str, + slots: dict[str, int], + elites: list[SkillGenome], + pareto_parents: list[SkillGenome], +) -> str: + """Single LLM call producing a human-readable breeding report.""" + prompt = ( + "You are the Breeder agent for SkillForge. Write a 2-paragraph breeding report " + "explaining the decisions for the next generation. Paragraph 1: what this " + "generation revealed about trait fitness and which skills earned elite/Pareto " + "status. Paragraph 2: the strategy for the next generation's diagnostic " + "mutations and crossovers. Be specific, cite skill IDs by their 8-char prefix, " + "and reference trait contributions when they shaped a decision.\n\n" + f"## Generation summary\n{context}\n\n" + "Respond with ONLY the report prose. No headings." + ) + + try: + client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=300.0) + return await stream_text( + client, + model=model_for("breeder"), + max_tokens=800, + messages=[{"role": "user", "content": prompt}], + ) + except Exception: + # Degrade gracefully — see _extract_lessons for rationale. + logger.exception("breeder.report_extraction_failed") + return "(breeding report failed)" + + +async def _extract_consolidated( + context: str, + learning_log: list[str], +) -> tuple[list[str], str]: + """Flex-3 cost saver: one LLM call produces both lessons and report as JSON.""" + recent_log = "\n".join(f"- {e}" for e in learning_log[-10:]) + + prompt = ( + "You are the Breeder agent for SkillForge. 
Given the generation summary below, " + "produce BOTH: (1) 1-3 NEW generalizable lessons about Skill authoring, and " + "(2) a 2-paragraph breeding report explaining the decisions.\n\n" + f"## Generation summary\n{context}\n\n" + f"## Existing learning log (don't repeat)\n{recent_log or '(empty)'}\n\n" + "## Response format\n" + "Respond with ONLY a JSON object matching:\n" + '{\n' + ' "lessons": ["lesson 1", "lesson 2"],\n' + ' "report": "Paragraph 1...\\n\\nParagraph 2..."\n' + '}\n' + "No prose before or after the JSON." + ) + + try: + client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=300.0) + text = await stream_text( + client, + model=model_for("breeder"), + max_tokens=1200, + messages=[{"role": "user", "content": prompt}], + ) + except Exception: + # Degrade gracefully — see _extract_lessons for rationale. + logger.exception("breeder.consolidated_extraction_failed") + return (["(consolidated extraction failed)"], "") + + match = re.search(r"\{.*\}", text, re.DOTALL) + if not match: + return ([], "") + try: + raw = json.loads(match.group(0)) + except json.JSONDecodeError: + return ([], "") + + lessons = [str(entry) for entry in raw.get("lessons", []) if isinstance(entry, str)][:3] + report = str(raw.get("report", "")) + return (lessons, report) diff --git a/skillforge/agents/breeder/bible.py b/skillforge/agents/breeder/bible.py new file mode 100644 index 0000000..5ff5fd1 --- /dev/null +++ b/skillforge/agents/breeder/bible.py @@ -0,0 +1,139 @@ +"""Write learning-log entries out to ``bible/findings/`` on disk. + +All I/O lives here; the caller just passes in the new entries + run +metadata and expects best-effort persistence. Failures are logged, +never raised — a bible write must not abort an evolution run. 
+""" + +from __future__ import annotations + +import logging +import re +from datetime import UTC, datetime + +logger = logging.getLogger("skillforge.agents.breeder.bible") + + +def _resolve_bible_dir(): + """Look up BIBLE_DIR through the breeder package's namespace. + + The test suite patches ``skillforge.agents.breeder.BIBLE_DIR`` to + redirect writes to a tmp_path fixture. Reading the attribute fresh + each call (instead of binding at import time) keeps that patch + observable after the monolithic module was split into a package. + """ + from skillforge.agents import breeder as _pkg + + return _pkg.BIBLE_DIR + + +def publish_findings_to_bible( + new_entries: list[str], + run_id: str, + generation: int, +) -> None: + """Write new learning-log entries as numbered finding files under bible/findings/. + + Each finding gets its own file following the schema in bible/README.md. + Also appends a summary line to bible/evolution-log.md. + + Failures here are logged but never raised — we don't want a bible write + failure to abort an evolution run. 
+ """ + if not new_entries: + return + + bible_dir = _resolve_bible_dir() + findings_dir = bible_dir / "findings" + try: + findings_dir.mkdir(parents=True, exist_ok=True) + except OSError: + logger.exception("bible.findings_dir_mkdir_failed") + return + + # Determine the next finding number by scanning existing files + existing_nums = [] + for f in findings_dir.glob("*.md"): + match = re.match(r"^(\d{3})-", f.name) + if match: + existing_nums.append(int(match.group(1))) + next_num = (max(existing_nums) + 1) if existing_nums else 1 + + timestamp = datetime.now(UTC).strftime("%Y-%m-%d") + + for entry in new_entries: + if not entry or entry.startswith("("): + # Skip error placeholders + continue + slug = _slugify(entry)[:40] + filename = f"{next_num:03d}-{slug}.md" + content = _finding_markdown( + num=next_num, + title=entry, + body=entry, + run_id=run_id, + generation=generation, + timestamp=timestamp, + ) + try: + (findings_dir / filename).write_text(content) + except OSError: + logger.exception("bible.finding_write_failed", extra={"filename": filename}) + continue + next_num += 1 + + # Append to evolution log + log_path = bible_dir / "evolution-log.md" + try: + if log_path.exists(): + existing = log_path.read_text() + else: + existing = "# Evolution Log\n\n*Chronological log of all SkillForge evolution runs.*\n\n" + entry_line = f"- **{timestamp}** — run `{run_id[:8]}` gen {generation}: {len(new_entries)} new finding(s)\n" + log_path.write_text(existing + entry_line) + except OSError: + logger.exception("bible.evolution_log_write_failed") + + +def _slugify(text: str) -> str: + """Kebab-case a string for use in a filename.""" + slug = re.sub(r"[^a-z0-9]+", "-", text.lower()).strip("-") + return slug or "untitled" + + +def _finding_markdown( + num: int, + title: str, + body: str, + run_id: str, + generation: int, + timestamp: str, +) -> str: + """Render a finding markdown file per bible/README.md schema.""" + short_title = title.split(".")[0][:60] if "." 
in title else title[:60] + return f"""# Finding {num:03d}: {short_title} + +**Discovered**: {timestamp} +**Evolution Run**: {run_id} +**Generation**: {generation} +**Status**: finding + +## Observation + +{body} + +## Evidence + +Automatically extracted from the generation {generation} trait attribution +and trace analysis by the Breeder agent. See run `{run_id}` in the +SkillForge database for the raw scores and traces. + +## Mechanism + +*To be filled in if this finding replicates across 3+ runs and gets +promoted to a pattern.* + +## Recommendation + +*To be filled in upon promotion.* +""" diff --git a/skillforge/agents/breeder/main.py b/skillforge/agents/breeder/main.py new file mode 100644 index 0000000..5de2acf --- /dev/null +++ b/skillforge/agents/breeder/main.py @@ -0,0 +1,146 @@ +"""Main breed() orchestrator — allocate slots, run subagents, pad + return. + +``breed_next_gen`` / ``spawn_gen0`` / ``_extract_lessons_and_report`` are +resolved through the package namespace at call time (not bound at import) +so that tests which ``patch("skillforge.agents.breeder.breed_next_gen")`` +still intercept the call after the monolithic module was split. +""" + +from __future__ import annotations + +import logging + +from skillforge.agents.breeder._prompts import ( + _build_crossover_instructions, + _build_diagnostic_instructions, +) +from skillforge.agents.breeder._ranking import compute_slots, rank_skills +from skillforge.models import Generation, SkillGenome + +logger = logging.getLogger("skillforge.agents.breeder") + + +def _pkg(): + """Return the breeder package so attribute lookups honor test patches.""" + from skillforge.agents import breeder as _breeder_pkg + + return _breeder_pkg + + +async def breed( + generation: Generation, + learning_log: list[str], + specialization: str, + target_pop_size: int, +) -> tuple[list[SkillGenome], list[str], str]: + """Produce the next generation from a ranked current generation. 
+ + Returns ``(next_gen_skills, new_learning_log_entries, breeding_report)``. + + The slot allocation scales with ``target_pop_size`` — see ``compute_slots``. + The function guarantees ``len(next_gen_skills) == target_pop_size``. + """ + slots = compute_slots(target_pop_size) + ranked = rank_skills(generation) + + next_gen: list[SkillGenome] = [] + + # --- Elitism: top-N survive unchanged (but bump generations_survived) --- + elites = ranked[: slots["elitism"]] + for elite in elites: + carried = _carry_elite(elite) + next_gen.append(carried) + + # --- Diagnostic mutation: pick low-scoring Skills, ask LLM for targeted fixes --- + low_scorers = ranked[-slots["diagnostic"] :] if slots["diagnostic"] > 0 else [] + diagnostic_instructions = _build_diagnostic_instructions( + low_scorers, learning_log, slots["diagnostic"] + ) + if slots["diagnostic"] > 0 and low_scorers: + try: + diagnostic_children = await _pkg().breed_next_gen( + parents=low_scorers, + learning_log=learning_log, + breeding_instructions=diagnostic_instructions, + ) + next_gen.extend(diagnostic_children[: slots["diagnostic"]]) + except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed + # Fall through — wildcard slots below absorb the shortfall. 
+ logger.exception("breeder.diagnostic_failed") + + # --- Reflective crossover: combine 2-3 Pareto-optimal parents --- + pareto_parents = [s for s in ranked if s.is_pareto_optimal][:3] + if not pareto_parents: + # Fallback: use top 3 by fitness if nobody is Pareto-optimal + pareto_parents = ranked[:3] + + crossover_instructions = _build_crossover_instructions( + pareto_parents, learning_log, slots["crossover"] + ) + if slots["crossover"] > 0 and pareto_parents: + try: + crossover_children = await _pkg().breed_next_gen( + parents=pareto_parents, + learning_log=learning_log, + breeding_instructions=crossover_instructions, + ) + next_gen.extend(crossover_children[: slots["crossover"]]) + except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed + logger.exception("breeder.crossover_failed") + + # --- Wildcard: fresh Skills via spawn_gen0 --- + if slots["wildcards"] > 0: + try: + wildcards = await _pkg().spawn_gen0( + specialization=specialization, + pop_size=slots["wildcards"], + ) + # Mark wildcards as mutations on the next generation + next_gen_num = generation.number + 1 + for w in wildcards: + w.generation = next_gen_num + w.mutations = ["wildcard"] + w.mutation_rationale = "Wildcard slot: fresh spawn to prevent convergence" + next_gen.extend(wildcards) + except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed + logger.exception("breeder.wildcard_spawn_failed") + + # --- Trim or pad to exactly target_pop_size --- + next_gen = next_gen[:target_pop_size] + + # If we fell short (any slot failed), pad with elites cloned forward + while len(next_gen) < target_pop_size and ranked: + next_gen.append(_carry_elite(ranked[0])) + + assert len(next_gen) == target_pop_size, ( + f"breeder produced {len(next_gen)} children, expected {target_pop_size}" + ) + + # --- Stamp generation number on everything --- + next_gen_num = generation.number + 1 + for child in next_gen: + child.generation = 
next_gen_num + + # --- Extract new learning log entries + write breeding report --- + new_lessons, breeding_report = await _pkg()._extract_lessons_and_report( + generation, learning_log, slots, elites, pareto_parents + ) + + return (next_gen, new_lessons, breeding_report) + + +def _carry_elite(skill: SkillGenome) -> SkillGenome: + """Return an elite skill carried forward with bumped metadata.""" + import copy + + carried = copy.deepcopy(skill) + carried.generations_survived += 1 + carried.mutations = ["elitism"] + carried.mutation_rationale = "Elitism: top-ranked parent carried forward unchanged" + # Bump maturity if the skill is surviving well + if carried.generations_survived >= 3 and carried.maturity == "tested": + carried.maturity = "hardened" + elif carried.generations_survived >= 2 and carried.maturity == "draft": + carried.maturity = "tested" + return carried + From e73dcfc896cbdb18b9e28bced5bb3cfee6cc3bf9 Mon Sep 17 00:00:00 2001 From: "Matt (via Claude Code)" Date: Mon, 20 Apr 2026 02:04:21 -0500 Subject: [PATCH 4/4] refactor: split spawner.py (763 LOC) into a package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Decomposed along the pure-planner / thin-I/O-shell seam called out in docs/clean-code.md §7. Four submodules: spawner/__init__.py barrel — re-exports four entry points plus every helper tests patch on the package root (_generate, _read_bible_patterns, BIBLE_DIR, ...) spawner/_helpers.py _generate (LLM streaming) + _parse_genomes + _auto_repair_missing_references + _validate_genomes + _read_bible_patterns spawner/_prompts.py all _build_*_system_prompt string templates + embedded JSON schema constants (pure — no I/O, no LLM calls) spawner/main.py four public entry points: spawn_gen0, breed_next_gen, spawn_from_parent, spawn_variant_gen0 Largest submodule is main.py at 411 LOC, under the 500-LOC ceiling. 
Test-patch compatibility ------------------------ Same pattern as the breeder split: tests patch ``spawner._generate``, ``spawner._read_bible_patterns``, and ``spawner.BIBLE_DIR`` on the package root. Those patches do not propagate to direct imports in submodules, so ``main._generate`` / ``main._read_bible_patterns`` and ``_helpers._read_bible_patterns`` now resolve the reference through the package namespace at call time. Without these shims the test suite made real LLM calls for 11 minutes before first failure — the fix is load-bearing for both test speed and API-cost safety. QA: ruff + mypy (83 files) + 411 pytest all green. Co-Authored-By: Claude Opus 4.7 (1M context) --- skillforge/agents/spawner.py | 763 -------------------------- skillforge/agents/spawner/__init__.py | 54 ++ skillforge/agents/spawner/_helpers.py | 187 +++++++ skillforge/agents/spawner/_prompts.py | 217 ++++++++ skillforge/agents/spawner/main.py | 411 ++++++++++++++ 5 files changed, 869 insertions(+), 763 deletions(-) delete mode 100644 skillforge/agents/spawner.py create mode 100644 skillforge/agents/spawner/__init__.py create mode 100644 skillforge/agents/spawner/_helpers.py create mode 100644 skillforge/agents/spawner/_prompts.py create mode 100644 skillforge/agents/spawner/main.py diff --git a/skillforge/agents/spawner.py b/skillforge/agents/spawner.py deleted file mode 100644 index 096bbc7..0000000 --- a/skillforge/agents/spawner.py +++ /dev/null @@ -1,763 +0,0 @@ -"""Spawner — creates gen 0 populations and breeds next generations. - -Gen 0: reads the golden template from ``config.GOLDEN_TEMPLATE_DIR`` and -``bible/patterns/*.md``, generates ``pop_size`` diverse Skills varying content -while preserving structure. - -Gen 1+: takes parent genomes + breeding instructions from the Breeder and -produces child Skills. The Spawner MUST enforce all authoring constraints -from ``engine.sandbox.validate_skill_structure``. 
- -Uses the Anthropic Messages API directly (NOT the Agent SDK's query()) because -this is a pure generation task with no tool use. The Agent SDK's query() is -for agentic loops with tools and hung the overnight live test. -""" - -from __future__ import annotations - -import re -import uuid - -from anthropic import AsyncAnthropic - -from skillforge.agents._json import extract_json_array -from skillforge.config import ANTHROPIC_API_KEY, BIBLE_DIR, GOLDEN_TEMPLATE_DIR, model_for -from skillforge.engine.sandbox import validate_skill_structure -from skillforge.errors import ParseError -from skillforge.models import SkillGenome - -# JSON schema for spawner responses -_SPAWN_SCHEMA_DESCRIPTION = """[ - { - "name": "kebab-case-name", - "skill_md_content": "---\\nname: ...\\n---\\n\\n# Skill\\n\\n...", - "supporting_files": {"scripts/validate.sh": "#!/bin/bash\\n..."}, - "traits": ["imperative-phrasing", "tests-first"], - "meta_strategy": "plan-first TDD" - } -]""" - -_BREED_SCHEMA_DESCRIPTION = """[ - { - "name": "kebab-case-name", - "skill_md_content": "---\\nname: ...\\n---\\n\\n# Skill\\n\\n...", - "supporting_files": {"scripts/validate.sh": "#!/bin/bash\\n..."}, - "traits": ["imperative-phrasing", "tests-first"], - "meta_strategy": "plan-first TDD", - "parent_ids": ["uuid-1", "uuid-2"], - "mutations": ["changed-meta-strategy", "added-examples"], - "mutation_rationale": "Switched to TDD-first based on parent attribution data" - } -]""" - - -def _read_bible_patterns() -> str: - """Concatenate all .md files under BIBLE_DIR/patterns in sorted order. - - Returns empty string if the directory doesn't exist or is empty. 
- """ - patterns_dir = BIBLE_DIR / "patterns" - if not patterns_dir.exists(): - return "" - - parts: list[str] = [] - for p in sorted(patterns_dir.glob("*.md")): - try: - parts.append(p.read_text()) - except (OSError, UnicodeDecodeError): - continue - - return "\n\n---\n\n".join(parts) - - -def _extract_response_text(response) -> str: - """Extract text from an Anthropic Messages API response. - - The response's ``content`` is a list of content blocks; extract any - that have a ``.text`` attribute. - """ - if not response.content: - return "" - parts: list[str] = [] - for block in response.content: - text = getattr(block, "text", None) - if text: - parts.append(text) - return "\n".join(parts) - - -def _save_debug_response(label: str, text: str) -> None: - """Write the last raw LLM response to /tmp for post-hoc debugging. - - Non-fatal — any write error is silently swallowed. This is for diagnosing - parse failures during live runs; in production the text is ephemeral. - """ - try: - from pathlib import Path - - path = Path("/tmp") / f"sf-{label}.txt" - path.write_text(text) - except OSError: - pass - - -def _parse_genomes( - raw: list[dict], - generation: int, - parent_ids: list[str] | None = None, -) -> list[SkillGenome]: - """Convert raw dicts from Claude's response into SkillGenome objects.""" - genomes: list[SkillGenome] = [] - for item in raw: - genome = SkillGenome( - id=str(uuid.uuid4()), - generation=generation, - skill_md_content=item.get("skill_md_content", ""), - supporting_files=item.get("supporting_files", {}), - traits=item.get("traits", []), - meta_strategy=item.get("meta_strategy", ""), - parent_ids=parent_ids or item.get("parent_ids", []), - mutations=item.get("mutations", []), - mutation_rationale=item.get("mutation_rationale", ""), - maturity="draft", - ) - genomes.append(genome) - return genomes - - -async def _generate(prompt: str) -> str: - """Streaming Anthropic API call. Returns the full assistant text response. 
- - The Spawner generates structured JSON output containing multiple SKILL.md - files (up to ~5KB per skill × pop_size = 25KB+ at pop_size=5). Non-streaming - requests get server-disconnected around the 3-4 minute mark on prompts this - size. Streaming keeps the connection alive via incremental chunks and handles - long generations reliably. - - ``max_tokens`` is set to 32000 to fit a full population of rich SKILL.md - files with supporting scripts. Claude Sonnet 4.6 supports up to 64K output - tokens in streaming mode; 32K is plenty for realistic populations while - keeping a sane ceiling. - """ - client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=600.0) - parts: list[str] = [] - async with client.messages.stream( - model=model_for("spawner"), - max_tokens=32000, - messages=[{"role": "user", "content": prompt}], - ) as stream: - async for text in stream.text_stream: - parts.append(text) - return "".join(parts) - - -# Pulls ${CLAUDE_SKILL_DIR}/ references out of a SKILL.md body. -# Must match the regex in ``engine.sandbox.validate_skill_structure`` rule 8 -# exactly — see that function for the source-of-truth behavior. -_REF_PATH_RE = re.compile(r"\$\{CLAUDE_SKILL_DIR\}/([^\s`)\"']+)") - - -def _auto_repair_missing_references(genome: SkillGenome) -> int: - """Stub out ``${CLAUDE_SKILL_DIR}/`` refs missing from supporting_files. - - Cheap-tier Haiku routinely emits SKILL.md bodies that reference - ``references/*-guide.md`` in prose but forget to include the file in - ``supporting_files``. Validator rule 8 rejects those genomes, which - in atomic mode (pop=2, 1 retry) was killing the whole run 1-of-3 times. - - Rather than burn another LLM call on a retry that often reproduces - the same oversight, we stub each missing reference with a minimal - placeholder. The skill still renders, the reference still resolves - at runtime, and the genome passes validation. 
The Breeder can flesh - out the stubs in later generations if fitness signal suggests they - carry weight. - - Returns the count of paths that were stubbed (0 if everything already - resolved, which is the expected Sonnet-tier case). - """ - stubbed = 0 - for match in _REF_PATH_RE.finditer(genome.skill_md_content): - rel_path = match.group(1).rstrip(".,;:)") - if rel_path in genome.supporting_files: - continue - filename = rel_path.rsplit("/", 1)[-1] - placeholder_title = filename.removesuffix(".md").replace("-", " ").title() - genome.supporting_files[rel_path] = ( - f"# {placeholder_title}\n\n" - f"_Placeholder — stubbed by the spawner's auto-repair pass " - f"because the generating LLM referenced this file but did not " - f"emit its contents. Replace with domain-specific material " - f"during a later generation._\n" - ) - stubbed += 1 - return stubbed - - -def _validate_genomes( - genomes: list[SkillGenome], -) -> tuple[list[SkillGenome], dict[int, list[str]]]: - """Validate each genome; returns (valid_genomes, {idx: violations}). - - Runs the reference-path auto-repair pass before validation so cheap-tier - LLM drift on rule 8 (missing supporting_files entries) doesn't kill a - whole population. The repair only adds files; it never touches the - skill_md body. - """ - valid: list[SkillGenome] = [] - invalid: dict[int, list[str]] = {} - for i, genome in enumerate(genomes): - _auto_repair_missing_references(genome) - violations = validate_skill_structure(genome) - if violations: - invalid[i] = violations - else: - valid.append(genome) - return valid, invalid - - -def _build_spawn_system_prompt( - specialization: str, - pop_size: int, - template: str, - bible_patterns: str, -) -> str: - """Build the system prompt for gen 0 spawn.""" - bible_section = ( - f"\n\n## Validated Patterns (apply these)\n\n{bible_patterns}" - if bible_patterns - else "" - ) - return ( - f"You are a Skill author for the Claude Agent SDK. 
Your task is to generate " - f"{pop_size} DIVERSE candidate Skills for the following specialization:\n\n" - f"SPECIALIZATION: {specialization}\n\n" - "Each Skill must:\n" - "1. Follow the exact YAML frontmatter + markdown structure of the template below\n" - "2. Include 'Use when' in the first 250 chars of the description\n" - "3. Have a name matching the regex ^[a-z0-9]+(-[a-z0-9]+)*$\n" - "4. Contain at least 2 example blocks (**Example or ## Example)\n" - "5. Keep the body under 500 lines\n" - "6. Have a description under 1024 characters\n" - "7. NOT use 'anthropic' or 'claude' in the name\n" - "8. Only reference paths in ${CLAUDE_SKILL_DIR}/... that are included in supporting_files\n\n" - "## Golden Template\n\n" - f"{template}" - f"{bible_section}\n\n" - f"Return ONLY a JSON array of exactly {pop_size} skill objects. " - "No prose before or after — ONLY the JSON array. Use this schema:\n" - f"{_SPAWN_SCHEMA_DESCRIPTION}\n" - "Vary the approach, strategy, instruction style, and examples across all skills " - "while preserving the template structure." 
- ) - - -def _build_breed_system_prompt( - parents: list[SkillGenome], - learning_log: list[str], - breeding_instructions: str, - bible_patterns: str, -) -> str: - """Build the system prompt for next-gen breeding.""" - bible_section = ( - f"\n\n## Validated Patterns\n\n{bible_patterns}" if bible_patterns else "" - ) - - parents_section = "\n\n".join( - f"### Parent {i + 1} (id: {p.id})\n" - f"**Traits**: {p.traits}\n" - f"**Meta-strategy**: {p.meta_strategy}\n" - f"**Trait attribution**: {p.trait_attribution}\n" - f"**Trait diagnostics**: {p.trait_diagnostics}\n\n" - f"**SKILL.md content**:\n```\n{p.skill_md_content}\n```" - for i, p in enumerate(parents) - ) - - learning_section = ( - "\n".join(f"- {entry}" for entry in learning_log) - if learning_log - else "(no entries yet)" - ) - - return ( - "You are a Skill evolutionary breeder for the Claude Agent SDK.\n\n" - "## Breeding Instructions (from Breeder agent)\n\n" - f"{breeding_instructions}\n\n" - "## Parent Skills\n\n" - f"{parents_section}\n\n" - "## Learning Log (failures and lessons from all prior generations)\n\n" - f"{learning_section}" - f"{bible_section}\n\n" - "## Rules for child Skills\n" - "1. Follow YAML frontmatter + markdown structure of the parents\n" - "2. Include 'Use when' in first 250 chars of description\n" - "3. Name must match ^[a-z0-9]+(-[a-z0-9]+)*$ and NOT contain 'anthropic' or 'claude'\n" - "4. At least 2 example blocks (**Example or ## Example)\n" - "5. Body under 500 lines, description under 1024 characters\n" - "6. Only reference ${CLAUDE_SKILL_DIR}/... paths that are in supporting_files\n\n" - "Return ONLY a JSON array of child skill objects. 
Use this schema:\n" - f"{_BREED_SCHEMA_DESCRIPTION}" - ) - - -def _build_repair_prompt( - original_prompt: str, - violations_by_idx: dict[int, list[str]], - genomes: list[SkillGenome], -) -> str: - """Build a reprompt asking Claude to fix specific violations.""" - violation_lines: list[str] = [] - for idx, viols in violations_by_idx.items(): - genome_name = genomes[idx].skill_md_content[:50].replace("\n", " ") - violation_lines.append( - f"Skill index {idx} ({genome_name!r}): {'; '.join(viols)}" - ) - violations_str = "\n".join(violation_lines) - - return ( - "Your previous response contained invalid Skills. " - "Fix the following violations and return a corrected JSON array:\n\n" - f"{violations_str}\n\n" - "Return ONLY the complete corrected JSON array — all skills, not just the fixed ones." - ) - - -async def spawn_gen0(specialization: str, pop_size: int) -> list[SkillGenome]: - """Generate ``pop_size`` diverse gen 0 Skills for the specialization. - - Args: - specialization: Description of the Skill domain. - pop_size: Number of candidate Skills to generate. - - Returns: - A list of ``pop_size`` validated SkillGenome objects at generation 0. - - Raises: - ValueError: if Skills remain invalid after 1 retry. - """ - template = (GOLDEN_TEMPLATE_DIR / "SKILL.md").read_text() - bible_patterns = _read_bible_patterns() - - system_prompt = _build_spawn_system_prompt( - specialization, pop_size, template, bible_patterns - ) - - # Attempt 1 - text = await _generate(system_prompt) - _save_debug_response("spawn_gen0_attempt1", text) - - try: - raw = extract_json_array(text) - genomes = _parse_genomes(raw, generation=0) - valid_genomes, invalid = _validate_genomes(genomes) - first_attempt_failed = False - except (ValueError, ParseError): - # JSON parse failure — treat as if everything was invalid so the - # retry path runs. 
- genomes = [] - valid_genomes = [] - invalid = {} - first_attempt_failed = True - - if not first_attempt_failed and not invalid: - return valid_genomes - - # Attempt 2 — retry. Use the same prompt if JSON parse failed (Claude - # just didn't follow instructions), or a targeted repair prompt if - # the skills parsed but failed validation. - if first_attempt_failed: - retry_prompt = ( - system_prompt - + "\n\nCRITICAL: Your previous response did not contain a valid JSON " - "array. You must respond with ONLY a JSON array — no prose, no " - "markdown before or after the array. The array must start with [ " - "and end with ]. No explanations." - ) - else: - retry_prompt = _build_repair_prompt(system_prompt, invalid, genomes) - - text = await _generate(retry_prompt) - _save_debug_response("spawn_gen0_attempt2", text) - - try: - raw2 = extract_json_array(text) - except (ValueError, ParseError) as exc: - raise ValueError( - f"spawner failed to produce valid JSON on retry: {exc}. " - f"See /tmp/sf-spawn_gen0_attempt2.txt for the raw response." - ) from exc - - genomes2 = _parse_genomes(raw2, generation=0) - valid_genomes2, still_invalid = _validate_genomes(genomes2) - - if still_invalid: - all_violations = [ - f"skill {i}: {'; '.join(v)}" for i, v in still_invalid.items() - ] - raise ValueError( - "spawner produced invalid skills after retry: " - + "; ".join(all_violations) - ) - - return valid_genomes2 - - -async def breed_next_gen( - parents: list[SkillGenome], - learning_log: list[str], - breeding_instructions: str, -) -> list[SkillGenome]: - """Produce a child population from parents + Breeder's instructions. - - Args: - parents: Parent SkillGenome objects (with trait_attribution populated). - learning_log: Accumulated lessons from all prior generations. - breeding_instructions: Free-text directives from the Breeder agent. - - Returns: - A list of validated child SkillGenome objects at generation+1. - - Raises: - ValueError: if children remain invalid after 1 retry. 
- """ - bible_patterns = _read_bible_patterns() - parent_ids = [p.id for p in parents] - next_generation = (parents[0].generation + 1) if parents else 1 - - system_prompt = _build_breed_system_prompt( - parents, learning_log, breeding_instructions, bible_patterns - ) - - # Attempt 1 - text = await _generate(system_prompt) - - try: - raw = extract_json_array(text) - except (ValueError, ParseError) as exc: - raise ValueError( - f"spawner breed_next_gen failed to produce valid JSON: {exc}" - ) from exc - - # Parse with generation and parent_ids from raw (each child should specify its own parent_ids) - children: list[SkillGenome] = [] - for item in raw: - child = SkillGenome( - id=str(uuid.uuid4()), - generation=next_generation, - skill_md_content=item.get("skill_md_content", ""), - supporting_files=item.get("supporting_files", {}), - traits=item.get("traits", []), - meta_strategy=item.get("meta_strategy", ""), - parent_ids=item.get("parent_ids", parent_ids), - mutations=item.get("mutations", []), - mutation_rationale=item.get("mutation_rationale", ""), - maturity="draft", - ) - children.append(child) - - valid_children, invalid = _validate_genomes(children) - - if not invalid: - return valid_children - - # Attempt 2 — repair - repair_prompt = _build_repair_prompt(system_prompt, invalid, children) - text = await _generate(repair_prompt) - - try: - raw2 = extract_json_array(text) - except (ValueError, ParseError) as exc: - raise ValueError( - f"spawner breed_next_gen failed to produce valid JSON on retry: {exc}" - ) from exc - - children2: list[SkillGenome] = [] - for item in raw2: - child = SkillGenome( - id=str(uuid.uuid4()), - generation=next_generation, - skill_md_content=item.get("skill_md_content", ""), - supporting_files=item.get("supporting_files", {}), - traits=item.get("traits", []), - meta_strategy=item.get("meta_strategy", ""), - parent_ids=item.get("parent_ids", parent_ids), - mutations=item.get("mutations", []), - 
mutation_rationale=item.get("mutation_rationale", ""), - maturity="draft", - ) - children2.append(child) - - valid_children2, still_invalid = _validate_genomes(children2) - - if still_invalid: - all_violations = [ - f"skill {i}: {'; '.join(v)}" for i, v in still_invalid.items() - ] - raise ValueError( - "spawner produced invalid skills after retry: " - + "; ".join(all_violations) - ) - - return valid_children2 - - -# --------------------------------------------------------------------------- -# spawn_from_parent — gen 0 from an existing Skill (seed fork or upload) -# --------------------------------------------------------------------------- - -async def spawn_from_parent( - parent: SkillGenome, - pop_size: int, -) -> list[SkillGenome]: - """Generate a gen 0 population using an existing Skill as the seed parent. - - The parent itself is carried forward as the elite (slot 0) and ``pop_size - 1`` - diverse mutations are synthesized around it. Used by the Registry fork-and- - evolve flow and the upload-and-evolve flow — both just hand us an existing - genome to evolve forward instead of spawning from the golden template. - - Args: - parent: The seed SkillGenome to evolve from (untouched in the output). - pop_size: Total population size including the elite parent. - - Returns: - A list of ``pop_size`` SkillGenome objects at generation 0. The first - entry is the parent (re-id'd, elite); the rest are mutations. 
- """ - if pop_size < 1: - raise ValueError(f"pop_size must be ≥ 1, got {pop_size}") - - bible_patterns = _read_bible_patterns() - - # The elite: clone the parent with a fresh id, retain content + traits - elite = SkillGenome( - id=str(uuid.uuid4()), - generation=0, - skill_md_content=parent.skill_md_content, - frontmatter=dict(parent.frontmatter), - supporting_files=dict(parent.supporting_files), - traits=list(parent.traits), - meta_strategy=parent.meta_strategy, - parent_ids=[parent.id], - mutations=["elite-carry"], - mutation_rationale="Seed parent carried forward as elite.", - maturity=parent.maturity or "draft", - ) - - if pop_size == 1: - return [elite] - - num_mutants = pop_size - 1 - system_prompt = f"""You are evolving an existing Claude Agent Skill by producing {num_mutants} diverse mutations. - -The parent Skill is below. Your job is to produce {num_mutants} variant Skills that preserve the parent's core capability but explore different: -- Description phrasing + trigger expansion -- Instruction structure (more/fewer numbered steps, different section ordering) -- Trait emphasis (lean harder into some traits, introduce new ones) -- Example diversity (different I/O pairs) - -Each mutation must still satisfy every constraint in the bible (≤250 char description, "Use when" + "NOT for" clauses, ≤500 line body, 2-3 diverse examples, valid YAML frontmatter, unique name matching `^[a-z0-9]+(-[a-z0-9]+)*$`). - -## Bible patterns (non-negotiable) - -{bible_patterns} - -## Parent Skill - -``` -{parent.skill_md_content} -``` - -Parent traits: {", ".join(parent.traits) if parent.traits else "(none)"} -Parent strategy: {parent.meta_strategy} - -## Output - -Return a JSON array of exactly {num_mutants} skills. Each entry is a JSON object with fields: -- `skill_md_content`: the full SKILL.md (YAML frontmatter + body) -- `traits`: list of trait strings -- `meta_strategy`: 1-2 sentences -- `mutations`: list of mutation-type strings (e.g. 
["description-expansion", "example-swap"]) -- `mutation_rationale`: why these mutations were made - -Do NOT modify the parent. Do NOT return fewer or more than {num_mutants} entries. Each mutation must have a UNIQUE `name` field in its frontmatter. -""" - - text = await _generate(system_prompt) - - try: - raw = extract_json_array(text) - except (ValueError, ParseError): - # If the LLM refused or produced garbage, fall back to elite-only - # (graceful degradation — evolution can still proceed with just the parent) - return [elite] - - mutants: list[SkillGenome] = [] - for item in raw[:num_mutants]: - mutants.append( - SkillGenome( - id=str(uuid.uuid4()), - generation=0, - skill_md_content=item.get("skill_md_content", ""), - supporting_files=item.get("supporting_files", {}), - traits=item.get("traits", []), - meta_strategy=item.get("meta_strategy", ""), - parent_ids=[parent.id], - mutations=item.get("mutations", []), - mutation_rationale=item.get("mutation_rationale", ""), - maturity="draft", - ) - ) - - # Drop any mutants that fail validation — keep the elite always - valid_mutants, _ = _validate_genomes(mutants) - return [elite, *valid_mutants][:pop_size] - - -# --------------------------------------------------------------------------- -# v2.0 — focused per-dimension variant spawner -# --------------------------------------------------------------------------- - - -def _build_variant_spawn_prompt( - specialization: str, - dimension: dict, - foundation_genome: SkillGenome | None, - pop_size: int, - template: str, -) -> str: - """System prompt for spawning N focused mini-SKILL.md variants for one dimension.""" - name = dimension.get("name", "") - tier = dimension.get("tier", "") - description = dimension.get("description", "") - evaluation_focus = dimension.get("evaluation_focus", "") - - foundation_block = "" - if foundation_genome is not None and tier == "capability": - # Capability variants get the winning foundation as grounding so they - # plug into a 
consistent skeleton during Engineer assembly later. - foundation_block = ( - "\n## Foundation context (capability variants must plug into this)\n\n" - "The following foundation variant has already won its tier. Your " - "capability variants will be assembled with it later, so they " - "MUST be compatible with its directory layout, naming, and fixture " - "philosophy. Reference the foundation's scripts and conventions " - "in your workflow steps.\n\n" - "```markdown\n" - f"{foundation_genome.skill_md_content[:2000]}\n" - "```\n" - ) - - return ( - f"## Specialization\n\n{specialization}\n\n" - f"## Variant dimension you are spawning for\n\n" - f"- Name: `{name}`\n" - f"- Tier: {tier}\n" - f"- Description: {description}\n" - f"- Evaluation focus: {evaluation_focus}\n" - f"{foundation_block}\n" - f"## Your job\n\n" - f"Spawn {pop_size} DIVERSE mini-skill packages that each take a " - f"DIFFERENT angle on the dimension above. Gen 0 exists to explore — " - f"do not produce N near-duplicates and do not kitchen-sink one " - f"variant with every approach.\n\n" - "**One dimension, one angle per variant.** Each variant's SKILL.md " - "body must focus on the single dimension named above and avoid " - "drifting into adjacent dimensions.\n\n" - "## Golden template\n\n" - f"```markdown\n{template}\n```\n\n" - "## Hard rules (validator-enforced)\n\n" - "- `name`: kebab-case, matches `^[a-z0-9]+(-[a-z0-9]+)*$`\n" - "- `description`: ≤250 chars, pushy routing pattern\n" - "- Body: ≤500 lines\n" - "- 2-3 diverse I/O examples mandatory\n" - "- The body MUST mention the dimension name somewhere\n" - "- All scripts/references referenced from SKILL.md use the\n" - " `${CLAUDE_SKILL_DIR}/...` path convention\n\n" - "## Output format\n\n" - f"Return ONLY a JSON array of exactly {pop_size} objects. The " - "``skill_md_content`` field MUST contain the FULL SKILL.md — " - "starting with ``---`` (YAML frontmatter), then the body. 
Do NOT " - "separate frontmatter into its own field; it must be embedded in " - "``skill_md_content`` as the validator expects a complete SKILL.md.\n\n" - "Schema:\n" - '```json\n[\n {\n' - ' "name": "kebab-case-name",\n' - ' "skill_md_content": "---\\nname: ...\\ndescription: >-\\n ...\\n---\\n\\n# Display Name\\n\\n## Quick Start\\n...",\n' - ' "supporting_files": {"scripts/score.py": "...", ' - '"scripts/validate.sh": "..."},\n' - ' "traits": ["trait1", "trait2"],\n' - ' "meta_strategy": "one-liner approach description"\n' - " }\n]\n```\n" - "No prose before or after — ONLY the JSON array." - ) - - -async def spawn_variant_gen0( - specialization: str, - dimension: dict, - foundation_genome: SkillGenome | None, - pop_size: int = 2, -) -> list[SkillGenome]: - """Spawn ``pop_size`` focused mini-skill variants for a single dimension. - - Args: - specialization: The parent skill family's specialization string. - dimension: A dict with at minimum ``name`` and ``tier`` keys; may - include ``description`` and ``evaluation_focus``. Matches the - shape of ``TaxonomistOutput.variant_dimensions``. - foundation_genome: For capability variants, the winning foundation - genome to use as grounding context. Pass ``None`` for foundation - variants. - pop_size: How many variants to spawn (default 2 for atomic mode). - - Returns: - A list of ``pop_size`` SkillGenome objects at generation 0. Each is - validated against the standard authoring constraints. Invalid - variants are dropped — the caller may receive fewer than - ``pop_size`` if the model produces malformed output, but never more. - - Raises: - ValueError: if no valid variants survive validation after one retry. 
- """ - if pop_size < 1: - raise ValueError(f"pop_size must be ≥ 1, got {pop_size}") - - template = (GOLDEN_TEMPLATE_DIR / "SKILL.md").read_text() - system_prompt = _build_variant_spawn_prompt( - specialization, dimension, foundation_genome, pop_size, template - ) - - text = await _generate(system_prompt) - _save_debug_response(f"spawn_variant_gen0_{dimension.get('name', 'unknown')}", text) - - try: - raw = extract_json_array(text) - except (ValueError, ParseError): - # One retry with a stricter formatting reminder - retry_prompt = ( - system_prompt - + "\n\nCRITICAL: Your previous response did not contain a valid " - "JSON array. Respond with ONLY a JSON array — no prose, no " - "markdown fences." - ) - text = await _generate(retry_prompt) - raw = extract_json_array(text) - - genomes = _parse_genomes(raw, generation=0) - valid_genomes, invalid = _validate_genomes(genomes) - - if not valid_genomes: - violations = [f"skill {i}: {'; '.join(v)}" for i, v in invalid.items()] - raise ValueError( - "spawn_variant_gen0 produced no valid variants: " - + "; ".join(violations) - ) - - # Stamp dimension metadata into the frontmatter so the Reviewer knows - # how to scope L3/L4 evaluation. Validator doesn't require it but it's - # the right shape for downstream consumers. - for genome in valid_genomes: - genome.frontmatter["dimension"] = dimension.get("name", "") - genome.frontmatter["tier"] = dimension.get("tier", "") - - return valid_genomes[:pop_size] diff --git a/skillforge/agents/spawner/__init__.py b/skillforge/agents/spawner/__init__.py new file mode 100644 index 0000000..ed34c6d --- /dev/null +++ b/skillforge/agents/spawner/__init__.py @@ -0,0 +1,54 @@ +"""Spawner — creates gen 0 populations and breeds next generations. + +Gen 0: reads the golden template from ``config.GOLDEN_TEMPLATE_DIR`` and +``bible/patterns/*.md``, generates ``pop_size`` diverse Skills varying +content while preserving structure. 
+ +Gen 1+: takes parent genomes + breeding instructions from the Breeder +and produces child Skills. The Spawner MUST enforce all authoring +constraints from ``engine.sandbox.validate_skill_structure``. + +Uses the Anthropic Messages API directly (NOT the Agent SDK's query()) +because this is a pure generation task with no tool use. The Agent SDK's +query() is for agentic loops with tools and hung the overnight live test. + +Submodule layout: + + _helpers.py _generate (LLM call) + _parse_genomes + validation + + auto-repair + bible-pattern reader + _prompts.py all _build_*_prompt functions (pure string templating) + main.py four public entry points — spawn_gen0, breed_next_gen, + spawn_from_parent, spawn_variant_gen0 +""" + +from __future__ import annotations + +# Helpers re-exported for tests that patch them on the package root. +from skillforge.agents.spawner._helpers import ( + _auto_repair_missing_references, + _generate, + _parse_genomes, + _read_bible_patterns, + _validate_genomes, +) +from skillforge.agents.spawner.main import ( + breed_next_gen, + spawn_from_parent, + spawn_gen0, + spawn_variant_gen0, +) +from skillforge.config import BIBLE_DIR + +__all__ = [ + "spawn_gen0", + "breed_next_gen", + "spawn_from_parent", + "spawn_variant_gen0", + # Private helpers re-exported for test access. + "_auto_repair_missing_references", + "_generate", + "_parse_genomes", + "_read_bible_patterns", + "_validate_genomes", + "BIBLE_DIR", +] diff --git a/skillforge/agents/spawner/_helpers.py b/skillforge/agents/spawner/_helpers.py new file mode 100644 index 0000000..78ddda3 --- /dev/null +++ b/skillforge/agents/spawner/_helpers.py @@ -0,0 +1,187 @@ +"""Shared Spawner helpers — bible reading, response extraction, debug dumps, +genome parsing, auto-repair, structural validation, and the streaming LLM call. 
+ +Extracted from the monolithic spawner so the per-entry-point modules +(``gen0``, ``breed``, ``from_parent``, ``variant``) share one private +implementation layer without re-declaring helpers. +""" + +from __future__ import annotations + +import re +import uuid + +from anthropic import AsyncAnthropic + +from skillforge.config import ANTHROPIC_API_KEY, model_for +from skillforge.engine.sandbox import validate_skill_structure +from skillforge.models import SkillGenome + +# Pulls ${CLAUDE_SKILL_DIR}/ references out of a SKILL.md body. +# Must match the regex in ``engine.sandbox.validate_skill_structure`` rule 8. +_REF_PATH_RE = re.compile(r"\$\{CLAUDE_SKILL_DIR\}/([^\s`)\"']+)") + + +def _read_bible_patterns() -> str: + """Concatenate all .md files under BIBLE_DIR/patterns in sorted order. + + Returns empty string if the directory doesn't exist or is empty. + Looks up BIBLE_DIR through the package namespace so tests that + monkeypatch ``skillforge.agents.spawner.BIBLE_DIR`` intercept the + lookup. + """ + from skillforge.agents import spawner as _pkg + + patterns_dir = _pkg.BIBLE_DIR / "patterns" + if not patterns_dir.exists(): + return "" + + parts: list[str] = [] + for p in sorted(patterns_dir.glob("*.md")): + try: + parts.append(p.read_text()) + except (OSError, UnicodeDecodeError): + continue + + return "\n\n---\n\n".join(parts) + + +def _extract_response_text(response) -> str: + """Extract text from an Anthropic Messages API response. + + The response's ``content`` is a list of content blocks; extract any + that have a ``.text`` attribute. + """ + if not response.content: + return "" + parts: list[str] = [] + for block in response.content: + text = getattr(block, "text", None) + if text: + parts.append(text) + return "\n".join(parts) + + +def _save_debug_response(label: str, text: str) -> None: + """Write the last raw LLM response to /tmp for post-hoc debugging. + + Non-fatal — any write error is silently swallowed. 
This is for + diagnosing parse failures during live runs; in production the text + is ephemeral. + """ + try: + from pathlib import Path + + path = Path("/tmp") / f"sf-{label}.txt" + path.write_text(text) + except OSError: + pass + + +def _parse_genomes( + raw: list[dict], + generation: int, + parent_ids: list[str] | None = None, +) -> list[SkillGenome]: + """Convert raw dicts from Claude's response into SkillGenome objects.""" + genomes: list[SkillGenome] = [] + for item in raw: + genome = SkillGenome( + id=str(uuid.uuid4()), + generation=generation, + skill_md_content=item.get("skill_md_content", ""), + supporting_files=item.get("supporting_files", {}), + traits=item.get("traits", []), + meta_strategy=item.get("meta_strategy", ""), + parent_ids=parent_ids or item.get("parent_ids", []), + mutations=item.get("mutations", []), + mutation_rationale=item.get("mutation_rationale", ""), + maturity="draft", + ) + genomes.append(genome) + return genomes + + +async def _generate(prompt: str) -> str: + """Streaming Anthropic API call. Returns the full assistant text response. + + The Spawner generates structured JSON output containing multiple + SKILL.md files (up to ~5KB per skill × pop_size = 25KB+ at pop_size=5). + Non-streaming requests get server-disconnected around the 3-4 minute + mark on prompts this size. Streaming keeps the connection alive via + incremental chunks and handles long generations reliably. + + ``max_tokens`` is 32000 to fit a full population of rich SKILL.md + files with supporting scripts. Claude Sonnet 4.6 supports up to 64K + output tokens in streaming mode; 32K is plenty while keeping a sane + ceiling. 
+ """ + client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY, timeout=600.0) + parts: list[str] = [] + async with client.messages.stream( + model=model_for("spawner"), + max_tokens=32000, + messages=[{"role": "user", "content": prompt}], + ) as stream: + async for text in stream.text_stream: + parts.append(text) + return "".join(parts) + + +def _auto_repair_missing_references(genome: SkillGenome) -> int: + """Stub out ``${CLAUDE_SKILL_DIR}/`` refs missing from supporting_files. + + Cheap-tier Haiku routinely emits SKILL.md bodies that reference + ``references/*-guide.md`` in prose but forget to include the file in + ``supporting_files``. Validator rule 8 rejects those genomes, which + in atomic mode (pop=2, 1 retry) was killing the whole run 1-of-3 + times. + + Rather than burn another LLM call on a retry that often reproduces + the same oversight, we stub each missing reference with a minimal + placeholder. The skill still renders, the reference still resolves + at runtime, and the genome passes validation. The Breeder can flesh + out the stubs in later generations if fitness signal suggests they + carry weight. + + Returns the count of paths that were stubbed (0 if everything + already resolved, which is the expected Sonnet-tier case). + """ + stubbed = 0 + for match in _REF_PATH_RE.finditer(genome.skill_md_content): + rel_path = match.group(1).rstrip(".,;:)") + if rel_path in genome.supporting_files: + continue + filename = rel_path.rsplit("/", 1)[-1] + placeholder_title = filename.removesuffix(".md").replace("-", " ").title() + genome.supporting_files[rel_path] = ( + f"# {placeholder_title}\n\n" + f"_Placeholder — stubbed by the spawner's auto-repair pass " + f"because the generating LLM referenced this file but did not " + f"emit its contents. 
Replace with domain-specific material " + f"during a later generation._\n" + ) + stubbed += 1 + return stubbed + + +def _validate_genomes( + genomes: list[SkillGenome], +) -> tuple[list[SkillGenome], dict[int, list[str]]]: + """Validate each genome; returns (valid_genomes, {idx: violations}). + + Runs the reference-path auto-repair pass before validation so + cheap-tier LLM drift on rule 8 (missing supporting_files entries) + doesn't kill a whole population. The repair only adds files; it + never touches the skill_md body. + """ + valid: list[SkillGenome] = [] + invalid: dict[int, list[str]] = {} + for i, genome in enumerate(genomes): + _auto_repair_missing_references(genome) + violations = validate_skill_structure(genome) + if violations: + invalid[i] = violations + else: + valid.append(genome) + return valid, invalid diff --git a/skillforge/agents/spawner/_prompts.py b/skillforge/agents/spawner/_prompts.py new file mode 100644 index 0000000..818d9dc --- /dev/null +++ b/skillforge/agents/spawner/_prompts.py @@ -0,0 +1,217 @@ +"""Spawner prompt-string builders. + +Pure string templating — no I/O, no LLM calls. The four entry points +(``gen0``, ``breed``, ``from_parent``, ``variant``) feed the strings +produced here into ``_helpers._generate``. + +The embedded JSON schema descriptions (``_SPAWN_SCHEMA_DESCRIPTION`` +etc.) double as prompt-documentation for Claude and as the contract the +Spawner validates against on the way back in. 
+""" + +from __future__ import annotations + +from skillforge.models import SkillGenome + +_SPAWN_SCHEMA_DESCRIPTION = """[ + { + "name": "kebab-case-name", + "skill_md_content": "---\\nname: ...\\n---\\n\\n# Skill\\n\\n...", + "supporting_files": {"scripts/validate.sh": "#!/bin/bash\\n..."}, + "traits": ["imperative-phrasing", "tests-first"], + "meta_strategy": "plan-first TDD" + } +]""" + +_BREED_SCHEMA_DESCRIPTION = """[ + { + "name": "kebab-case-name", + "skill_md_content": "---\\nname: ...\\n---\\n\\n# Skill\\n\\n...", + "supporting_files": {"scripts/validate.sh": "#!/bin/bash\\n..."}, + "traits": ["imperative-phrasing", "tests-first"], + "meta_strategy": "plan-first TDD", + "parent_ids": ["uuid-1", "uuid-2"], + "mutations": ["changed-meta-strategy", "added-examples"], + "mutation_rationale": "Switched to TDD-first based on parent attribution data" + } +]""" + + +def _build_spawn_system_prompt( + specialization: str, + pop_size: int, + template: str, + bible_patterns: str, +) -> str: + """Build the system prompt for gen 0 spawn.""" + bible_section = ( + f"\n\n## Validated Patterns (apply these)\n\n{bible_patterns}" + if bible_patterns + else "" + ) + return ( + f"You are a Skill author for the Claude Agent SDK. Your task is to generate " + f"{pop_size} DIVERSE candidate Skills for the following specialization:\n\n" + f"SPECIALIZATION: {specialization}\n\n" + "Each Skill must:\n" + "1. Follow the exact YAML frontmatter + markdown structure of the template below\n" + "2. Include 'Use when' in the first 250 chars of the description\n" + "3. Have a name matching the regex ^[a-z0-9]+(-[a-z0-9]+)*$\n" + "4. Contain at least 2 example blocks (**Example or ## Example)\n" + "5. Keep the body under 500 lines\n" + "6. Have a description under 1024 characters\n" + "7. NOT use 'anthropic' or 'claude' in the name\n" + "8. Only reference paths in ${CLAUDE_SKILL_DIR}/... 
that are included in supporting_files\n\n" + "## Golden Template\n\n" + f"{template}" + f"{bible_section}\n\n" + f"Return ONLY a JSON array of exactly {pop_size} skill objects. " + "No prose before or after — ONLY the JSON array. Use this schema:\n" + f"{_SPAWN_SCHEMA_DESCRIPTION}\n" + "Vary the approach, strategy, instruction style, and examples across all skills " + "while preserving the template structure." + ) + + +def _build_breed_system_prompt( + parents: list[SkillGenome], + learning_log: list[str], + breeding_instructions: str, + bible_patterns: str, +) -> str: + """Build the system prompt for next-gen breeding.""" + bible_section = ( + f"\n\n## Validated Patterns\n\n{bible_patterns}" if bible_patterns else "" + ) + + parents_section = "\n\n".join( + f"### Parent {i + 1} (id: {p.id})\n" + f"**Traits**: {p.traits}\n" + f"**Meta-strategy**: {p.meta_strategy}\n" + f"**Trait attribution**: {p.trait_attribution}\n" + f"**Trait diagnostics**: {p.trait_diagnostics}\n\n" + f"**SKILL.md content**:\n```\n{p.skill_md_content}\n```" + for i, p in enumerate(parents) + ) + + learning_section = ( + "\n".join(f"- {entry}" for entry in learning_log) + if learning_log + else "(no entries yet)" + ) + + return ( + "You are a Skill evolutionary breeder for the Claude Agent SDK.\n\n" + "## Breeding Instructions (from Breeder agent)\n\n" + f"{breeding_instructions}\n\n" + "## Parent Skills\n\n" + f"{parents_section}\n\n" + "## Learning Log (failures and lessons from all prior generations)\n\n" + f"{learning_section}" + f"{bible_section}\n\n" + "## Rules for child Skills\n" + "1. Follow YAML frontmatter + markdown structure of the parents\n" + "2. Include 'Use when' in first 250 chars of description\n" + "3. Name must match ^[a-z0-9]+(-[a-z0-9]+)*$ and NOT contain 'anthropic' or 'claude'\n" + "4. At least 2 example blocks (**Example or ## Example)\n" + "5. Body under 500 lines, description under 1024 characters\n" + "6. Only reference ${CLAUDE_SKILL_DIR}/... 
paths that are in supporting_files\n\n" + "Return ONLY a JSON array of child skill objects. Use this schema:\n" + f"{_BREED_SCHEMA_DESCRIPTION}" + ) + + +def _build_repair_prompt( + original_prompt: str, + violations_by_idx: dict[int, list[str]], + genomes: list[SkillGenome], +) -> str: + """Build a reprompt asking Claude to fix specific violations.""" + violation_lines: list[str] = [] + for idx, viols in violations_by_idx.items(): + genome_name = genomes[idx].skill_md_content[:50].replace("\n", " ") + violation_lines.append( + f"Skill index {idx} ({genome_name!r}): {'; '.join(viols)}" + ) + violations_str = "\n".join(violation_lines) + + return ( + "Your previous response contained invalid Skills. " + "Fix the following violations and return a corrected JSON array:\n\n" + f"{violations_str}\n\n" + "Return ONLY the complete corrected JSON array — all skills, not just the fixed ones." + ) + +def _build_variant_spawn_prompt( + specialization: str, + dimension: dict, + foundation_genome: SkillGenome | None, + pop_size: int, + template: str, +) -> str: + """System prompt for spawning N focused mini-SKILL.md variants for one dimension.""" + name = dimension.get("name", "") + tier = dimension.get("tier", "") + description = dimension.get("description", "") + evaluation_focus = dimension.get("evaluation_focus", "") + + foundation_block = "" + if foundation_genome is not None and tier == "capability": + # Capability variants get the winning foundation as grounding so they + # plug into a consistent skeleton during Engineer assembly later. + foundation_block = ( + "\n## Foundation context (capability variants must plug into this)\n\n" + "The following foundation variant has already won its tier. Your " + "capability variants will be assembled with it later, so they " + "MUST be compatible with its directory layout, naming, and fixture " + "philosophy. 
 Reference the foundation's scripts and conventions "
            "in your workflow steps.\n\n"
            "```markdown\n"
            # Foundation SKILL.md is truncated to 2000 chars to bound prompt size.
            f"{foundation_genome.skill_md_content[:2000]}\n"
            "```\n"
        )

    return (
        f"## Specialization\n\n{specialization}\n\n"
        f"## Variant dimension you are spawning for\n\n"
        f"- Name: `{name}`\n"
        f"- Tier: {tier}\n"
        f"- Description: {description}\n"
        f"- Evaluation focus: {evaluation_focus}\n"
        f"{foundation_block}\n"
        f"## Your job\n\n"
        f"Spawn {pop_size} DIVERSE mini-skill packages that each take a "
        f"DIFFERENT angle on the dimension above. Gen 0 exists to explore — "
        f"do not produce N near-duplicates and do not kitchen-sink one "
        f"variant with every approach.\n\n"
        "**One dimension, one angle per variant.** Each variant's SKILL.md "
        "body must focus on the single dimension named above and avoid "
        "drifting into adjacent dimensions.\n\n"
        "## Golden template\n\n"
        f"```markdown\n{template}\n```\n\n"
        "## Hard rules (validator-enforced)\n\n"
        "- `name`: kebab-case, matches `^[a-z0-9]+(-[a-z0-9]+)*$`\n"
        "- `description`: ≤250 chars, pushy routing pattern\n"
        "- Body: ≤500 lines\n"
        "- 2-3 diverse I/O examples mandatory\n"
        "- The body MUST mention the dimension name somewhere\n"
        "- All scripts/references referenced from SKILL.md use the\n"
        "  `${CLAUDE_SKILL_DIR}/...` path convention\n\n"
        "## Output format\n\n"
        f"Return ONLY a JSON array of exactly {pop_size} objects. The "
        "``skill_md_content`` field MUST contain the FULL SKILL.md — "
        "starting with ``---`` (YAML frontmatter), then the body.
 Do NOT "
        "separate frontmatter into its own field; it must be embedded in "
        "``skill_md_content`` as the validator expects a complete SKILL.md.\n\n"
        "Schema:\n"
        # NOTE(review): intra-literal indentation below was lost in transit
        # and is reproduced as displayed — confirm against the original file.
        '```json\n[\n {\n'
        ' "name": "kebab-case-name",\n'
        ' "skill_md_content": "---\\nname: ...\\ndescription: >-\\n ...\\n---\\n\\n# Display Name\\n\\n## Quick Start\\n...",\n'
        ' "supporting_files": {"scripts/score.py": "...", '
        '"scripts/validate.sh": "..."},\n'
        ' "traits": ["trait1", "trait2"],\n'
        ' "meta_strategy": "one-liner approach description"\n'
        " }\n]\n```\n"
        "No prose before or after — ONLY the JSON array."
    )
diff --git a/skillforge/agents/spawner/main.py b/skillforge/agents/spawner/main.py
new file mode 100644
index 0000000..193ccfd
--- /dev/null
+++ b/skillforge/agents/spawner/main.py
@@ -0,0 +1,411 @@
"""Spawner entry points.

Four top-level coroutines:
- ``spawn_gen0`` fresh population from a specialization string
- ``breed_next_gen`` child skills from ranked parents + instructions
- ``spawn_from_parent`` fork-and-evolve from a single seed genome
- ``spawn_variant_gen0`` per-dimension atomic variants

All four share the same generate/parse/validate/repair loop, differ
only in the prompt they feed the LLM and their retry cadence.
"""

from __future__ import annotations

import uuid

from skillforge.agents._json import extract_json_array
from skillforge.agents.spawner._helpers import (
    _parse_genomes,
    _save_debug_response,
    _validate_genomes,
)
from skillforge.agents.spawner._prompts import (
    _build_breed_system_prompt,
    _build_repair_prompt,
    _build_spawn_system_prompt,
    _build_variant_spawn_prompt,
)
from skillforge.config import GOLDEN_TEMPLATE_DIR
from skillforge.errors import ParseError
from skillforge.models import SkillGenome


async def _generate(prompt: str) -> str:
    """Dispatch to the real ``_generate`` via the package namespace.

    Tests patch ``skillforge.agents.spawner._generate`` to intercept LLM
    calls.
 Binding the helper at import time would shadow that patch;
    this indirection resolves the attribute on the package root at call
    time so the patch takes effect.
    """
    from skillforge.agents import spawner as _pkg

    return await _pkg._generate(prompt)


def _read_bible_patterns() -> str:
    """Same lazy-lookup pattern as ``_generate`` — tests sometimes patch
    ``skillforge.agents.spawner._read_bible_patterns``."""
    from skillforge.agents import spawner as _pkg

    return _pkg._read_bible_patterns()


async def spawn_gen0(specialization: str, pop_size: int) -> list[SkillGenome]:
    """Generate ``pop_size`` diverse gen 0 Skills for the specialization.

    Args:
        specialization: Description of the Skill domain.
        pop_size: Number of candidate Skills to generate.

    Returns:
        A list of ``pop_size`` validated SkillGenome objects at generation 0.

    Raises:
        ValueError: if Skills remain invalid after 1 retry.
    """
    template = (GOLDEN_TEMPLATE_DIR / "SKILL.md").read_text()
    bible_patterns = _read_bible_patterns()

    system_prompt = _build_spawn_system_prompt(
        specialization, pop_size, template, bible_patterns
    )

    # Attempt 1
    text = await _generate(system_prompt)
    _save_debug_response("spawn_gen0_attempt1", text)

    try:
        raw = extract_json_array(text)
        genomes = _parse_genomes(raw, generation=0)
        valid_genomes, invalid = _validate_genomes(genomes)
        first_attempt_failed = False
    except (ValueError, ParseError):
        # JSON parse failure — treat as if everything was invalid so the
        # retry path runs.
        genomes = []
        valid_genomes = []
        invalid = {}
        first_attempt_failed = True

    if not first_attempt_failed and not invalid:
        return valid_genomes

    # Attempt 2 — retry. Use the same prompt if JSON parse failed (Claude
    # just didn't follow instructions), or a targeted repair prompt if
    # the skills parsed but failed validation.
    if first_attempt_failed:
        retry_prompt = (
            system_prompt
            + "\n\nCRITICAL: Your previous response did not contain a valid JSON "
            "array. You must respond with ONLY a JSON array — no prose, no "
            "markdown before or after the array. The array must start with [ "
            "and end with ]. No explanations."
        )
    else:
        retry_prompt = _build_repair_prompt(system_prompt, invalid, genomes)

    text = await _generate(retry_prompt)
    _save_debug_response("spawn_gen0_attempt2", text)

    try:
        raw2 = extract_json_array(text)
    except (ValueError, ParseError) as exc:
        # Second parse failure is terminal; point the operator at the dump
        # written by _save_debug_response above.
        raise ValueError(
            f"spawner failed to produce valid JSON on retry: {exc}. "
            f"See /tmp/sf-spawn_gen0_attempt2.txt for the raw response."
        ) from exc

    genomes2 = _parse_genomes(raw2, generation=0)
    valid_genomes2, still_invalid = _validate_genomes(genomes2)

    if still_invalid:
        all_violations = [
            f"skill {i}: {'; '.join(v)}" for i, v in still_invalid.items()
        ]
        raise ValueError(
            "spawner produced invalid skills after retry: "
            + "; ".join(all_violations)
        )

    return valid_genomes2


async def breed_next_gen(
    parents: list[SkillGenome],
    learning_log: list[str],
    breeding_instructions: str,
) -> list[SkillGenome]:
    """Produce a child population from parents + Breeder's instructions.

    Args:
        parents: Parent SkillGenome objects (with trait_attribution populated).
        learning_log: Accumulated lessons from all prior generations.
        breeding_instructions: Free-text directives from the Breeder agent.

    Returns:
        A list of validated child SkillGenome objects at generation+1.

    Raises:
        ValueError: if children remain invalid after 1 retry.
+ """ + bible_patterns = _read_bible_patterns() + parent_ids = [p.id for p in parents] + next_generation = (parents[0].generation + 1) if parents else 1 + + system_prompt = _build_breed_system_prompt( + parents, learning_log, breeding_instructions, bible_patterns + ) + + # Attempt 1 + text = await _generate(system_prompt) + + try: + raw = extract_json_array(text) + except (ValueError, ParseError) as exc: + raise ValueError( + f"spawner breed_next_gen failed to produce valid JSON: {exc}" + ) from exc + + # Parse with generation and parent_ids from raw (each child should specify its own parent_ids) + children: list[SkillGenome] = [] + for item in raw: + child = SkillGenome( + id=str(uuid.uuid4()), + generation=next_generation, + skill_md_content=item.get("skill_md_content", ""), + supporting_files=item.get("supporting_files", {}), + traits=item.get("traits", []), + meta_strategy=item.get("meta_strategy", ""), + parent_ids=item.get("parent_ids", parent_ids), + mutations=item.get("mutations", []), + mutation_rationale=item.get("mutation_rationale", ""), + maturity="draft", + ) + children.append(child) + + valid_children, invalid = _validate_genomes(children) + + if not invalid: + return valid_children + + # Attempt 2 — repair + repair_prompt = _build_repair_prompt(system_prompt, invalid, children) + text = await _generate(repair_prompt) + + try: + raw2 = extract_json_array(text) + except (ValueError, ParseError) as exc: + raise ValueError( + f"spawner breed_next_gen failed to produce valid JSON on retry: {exc}" + ) from exc + + children2: list[SkillGenome] = [] + for item in raw2: + child = SkillGenome( + id=str(uuid.uuid4()), + generation=next_generation, + skill_md_content=item.get("skill_md_content", ""), + supporting_files=item.get("supporting_files", {}), + traits=item.get("traits", []), + meta_strategy=item.get("meta_strategy", ""), + parent_ids=item.get("parent_ids", parent_ids), + mutations=item.get("mutations", []), + 
            mutation_rationale=item.get("mutation_rationale", ""),
            maturity="draft",
        )
        children2.append(child)

    valid_children2, still_invalid = _validate_genomes(children2)

    if still_invalid:
        all_violations = [
            f"skill {i}: {'; '.join(v)}" for i, v in still_invalid.items()
        ]
        raise ValueError(
            "spawner produced invalid skills after retry: "
            + "; ".join(all_violations)
        )

    return valid_children2


async def spawn_from_parent(
    parent: SkillGenome,
    pop_size: int,
) -> list[SkillGenome]:
    """Generate a gen 0 population using an existing Skill as the seed parent.

    The parent itself is carried forward as the elite (slot 0) and ``pop_size - 1``
    diverse mutations are synthesized around it. Used by the Registry fork-and-
    evolve flow and the upload-and-evolve flow — both just hand us an existing
    genome to evolve forward instead of spawning from the golden template.

    Args:
        parent: The seed SkillGenome to evolve from (untouched in the output).
        pop_size: Total population size including the elite parent.

    Returns:
        A list of ``pop_size`` SkillGenome objects at generation 0. The first
        entry is the parent (re-id'd, elite); the rest are mutations.
+ """ + if pop_size < 1: + raise ValueError(f"pop_size must be ≥ 1, got {pop_size}") + + bible_patterns = _read_bible_patterns() + + # The elite: clone the parent with a fresh id, retain content + traits + elite = SkillGenome( + id=str(uuid.uuid4()), + generation=0, + skill_md_content=parent.skill_md_content, + frontmatter=dict(parent.frontmatter), + supporting_files=dict(parent.supporting_files), + traits=list(parent.traits), + meta_strategy=parent.meta_strategy, + parent_ids=[parent.id], + mutations=["elite-carry"], + mutation_rationale="Seed parent carried forward as elite.", + maturity=parent.maturity or "draft", + ) + + if pop_size == 1: + return [elite] + + num_mutants = pop_size - 1 + system_prompt = f"""You are evolving an existing Claude Agent Skill by producing {num_mutants} diverse mutations. + +The parent Skill is below. Your job is to produce {num_mutants} variant Skills that preserve the parent's core capability but explore different: +- Description phrasing + trigger expansion +- Instruction structure (more/fewer numbered steps, different section ordering) +- Trait emphasis (lean harder into some traits, introduce new ones) +- Example diversity (different I/O pairs) + +Each mutation must still satisfy every constraint in the bible (≤250 char description, "Use when" + "NOT for" clauses, ≤500 line body, 2-3 diverse examples, valid YAML frontmatter, unique name matching `^[a-z0-9]+(-[a-z0-9]+)*$`). + +## Bible patterns (non-negotiable) + +{bible_patterns} + +## Parent Skill + +``` +{parent.skill_md_content} +``` + +Parent traits: {", ".join(parent.traits) if parent.traits else "(none)"} +Parent strategy: {parent.meta_strategy} + +## Output + +Return a JSON array of exactly {num_mutants} skills. Each entry is a JSON object with fields: +- `skill_md_content`: the full SKILL.md (YAML frontmatter + body) +- `traits`: list of trait strings +- `meta_strategy`: 1-2 sentences +- `mutations`: list of mutation-type strings (e.g. 
["description-expansion", "example-swap"]) +- `mutation_rationale`: why these mutations were made + +Do NOT modify the parent. Do NOT return fewer or more than {num_mutants} entries. Each mutation must have a UNIQUE `name` field in its frontmatter. +""" + + text = await _generate(system_prompt) + + try: + raw = extract_json_array(text) + except (ValueError, ParseError): + # If the LLM refused or produced garbage, fall back to elite-only + # (graceful degradation — evolution can still proceed with just the parent) + return [elite] + + mutants: list[SkillGenome] = [] + for item in raw[:num_mutants]: + mutants.append( + SkillGenome( + id=str(uuid.uuid4()), + generation=0, + skill_md_content=item.get("skill_md_content", ""), + supporting_files=item.get("supporting_files", {}), + traits=item.get("traits", []), + meta_strategy=item.get("meta_strategy", ""), + parent_ids=[parent.id], + mutations=item.get("mutations", []), + mutation_rationale=item.get("mutation_rationale", ""), + maturity="draft", + ) + ) + + # Drop any mutants that fail validation — keep the elite always + valid_mutants, _ = _validate_genomes(mutants) + return [elite, *valid_mutants][:pop_size] + + +async def spawn_variant_gen0( + specialization: str, + dimension: dict, + foundation_genome: SkillGenome | None, + pop_size: int = 2, +) -> list[SkillGenome]: + """Spawn ``pop_size`` focused mini-skill variants for a single dimension. + + Args: + specialization: The parent skill family's specialization string. + dimension: A dict with at minimum ``name`` and ``tier`` keys; may + include ``description`` and ``evaluation_focus``. Matches the + shape of ``TaxonomistOutput.variant_dimensions``. + foundation_genome: For capability variants, the winning foundation + genome to use as grounding context. Pass ``None`` for foundation + variants. + pop_size: How many variants to spawn (default 2 for atomic mode). + + Returns: + A list of ``pop_size`` SkillGenome objects at generation 0. 
Each is + validated against the standard authoring constraints. Invalid + variants are dropped — the caller may receive fewer than + ``pop_size`` if the model produces malformed output, but never more. + + Raises: + ValueError: if no valid variants survive validation after one retry. + """ + if pop_size < 1: + raise ValueError(f"pop_size must be ≥ 1, got {pop_size}") + + template = (GOLDEN_TEMPLATE_DIR / "SKILL.md").read_text() + system_prompt = _build_variant_spawn_prompt( + specialization, dimension, foundation_genome, pop_size, template + ) + + text = await _generate(system_prompt) + _save_debug_response(f"spawn_variant_gen0_{dimension.get('name', 'unknown')}", text) + + try: + raw = extract_json_array(text) + except (ValueError, ParseError): + # One retry with a stricter formatting reminder + retry_prompt = ( + system_prompt + + "\n\nCRITICAL: Your previous response did not contain a valid " + "JSON array. Respond with ONLY a JSON array — no prose, no " + "markdown fences." + ) + text = await _generate(retry_prompt) + raw = extract_json_array(text) + + genomes = _parse_genomes(raw, generation=0) + valid_genomes, invalid = _validate_genomes(genomes) + + if not valid_genomes: + violations = [f"skill {i}: {'; '.join(v)}" for i, v in invalid.items()] + raise ValueError( + "spawn_variant_gen0 produced no valid variants: " + + "; ".join(violations) + ) + + # Stamp dimension metadata into the frontmatter so the Reviewer knows + # how to scope L3/L4 evaluation. Validator doesn't require it but it's + # the right shape for downstream consumers. + for genome in valid_genomes: + genome.frontmatter["dimension"] = dimension.get("name", "") + genome.frontmatter["tier"] = dimension.get("tier", "") + + return valid_genomes[:pop_size]