From aaa8395f588996f4cbd51eea42c0029b1864ad64 Mon Sep 17 00:00:00 2001
From: "Matt (via Claude Code)" <matt@skillforge.local>
Date: Sun, 19 Apr 2026 15:38:18 -0500
Subject: [PATCH 1/2] fix: live-test bugs from cheap-Haiku atomic run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes two bugs surfaced by the first live atomic-evolution run on the
cheap Haiku tier (see journal #17).

1. Engineer oversize description (assembly-killer)
-------------------------------------------------
Haiku routinely overshoots the 250-char composite description cap by
10–40%. The previous behavior raised ValueError, which triggered an
~$0.20 LLM retry that often produced the same class of overshoot. For
2-of-3 runs it killed the whole atomic pipeline.

Fix: add `_try_truncate_description` helper that repairs oversize
descriptions at a word boundary when it can do so without clobbering
the "Use when" pushy-pattern marker. `_validate_composite_shape` now
repairs in place before raising; only truly unsalvageable cases
(no "Use when" in the first 250 chars) still raise.

Covered by 3 new unit tests plus the updated existing oversize test.

2. Managed-agents skill upload YAML frontmatter 400 (upload-killer)
-------------------------------------------------------------------
~1% of `managed_agents.upload_skill` calls returned
``400 SKILL.md must start with YAML frontmatter (---)`` — the
Anthropic Skills API is byte-strict about the leading ``---``. Models
occasionally prepend a UTF-8 BOM or stray whitespace that the
structural validator (which uses `startswith("---")` after standard
string handling) happens to tolerate.

Fix: `upload_skill` now `lstrip`s BOM + whitespace from the payload
before calling the API. If the normalized content still doesn't start
with ``---``, raise ValueError with a clear message instead of
round-tripping to a generic 400. Caller (`competitor_managed`) still
falls back to inline for genuine bad content; BOM/whitespace-only
damage now uploads cleanly instead of falling back.

Covered by 2 new unit tests (reject + BOM-strip).

QA
--
  ruff check skillforge     - clean
  mypy skillforge           - 65 files pass
  pytest tests/             - 408 passed (+5), 2 skipped

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 skillforge/agents/engineer.py       | 44 +++++++++++++++++++++--
 skillforge/agents/managed_agents.py | 19 ++++++++--
 tests/test_engineer.py              | 56 +++++++++++++++++++++++++++--
 tests/test_managed_agents.py        | 47 ++++++++++++++++++++----
 4 files changed, 151 insertions(+), 15 deletions(-)

diff --git a/skillforge/agents/engineer.py b/skillforge/agents/engineer.py
index 5ed0037..7a09170 100644
--- a/skillforge/agents/engineer.py
+++ b/skillforge/agents/engineer.py
@@ -240,6 +240,38 @@ def _extract_json_object(text: str) -> dict[str, Any]:
     raise ValueError("unterminated JSON object in Engineer response")
 
 
+def _try_truncate_description(desc: str, cap: int = 250) -> str | None:
+    """Return a truncated description that fits under ``cap`` chars, or None.
+
+    Haiku-tier runs routinely over-shoot the 250-char frontmatter cap by
+    10–40% — the constraint is in the prompt but the model doesn't
+    enforce it reliably. Retrying on length alone costs a full LLM call
+    (~$0.20) to re-generate content that's 99% correct; truncating at a
+    word boundary is free and preserves the semantic payload.
+
+    Returns None if we can't truncate without clobbering the "Use when"
+    pushy-pattern marker — at that point it's a genuine content
+    failure and the caller should retry or fail loudly.
+    """
+    if len(desc) <= cap:
+        return desc
+
+    # Reserve 3 chars for the ellipsis suffix.
+    target = cap - 3
+    cut = desc.rfind(" ", 0, target + 1)
+    if cut == -1 or cut < cap // 2:
+        # No word boundary in the useful range — hard-cut at target.
+        cut = target
+    truncated = desc[:cut].rstrip(" ,.;:") + "..."
+
+    # The composite validator below requires "Use when" in the first
+    # 250 chars; if truncation erased it, the repair is worse than the
+    # original problem.
+    if "use when" not in truncated.lower():
+        return None
+    return truncated
+
+
 def _validate_composite_shape(raw: dict[str, Any]) -> None:
     if not isinstance(raw, dict):
         raise ValueError("Engineer output must be a JSON object")
@@ -253,9 +285,15 @@ def _validate_composite_shape(raw: dict[str, Any]) -> None:
 
     desc = fm.get("description", "")
     if isinstance(desc, str) and len(desc) > 250:
-        raise ValueError(
-            f"Engineer composite description is {len(desc)} chars > 250 limit"
-        )
+        # Try the cheap repair before escalating to an LLM retry.
+        repaired = _try_truncate_description(desc)
+        if repaired is None:
+            raise ValueError(
+                f"Engineer composite description is {len(desc)} chars > 250 "
+                f"limit and cannot be safely truncated (would remove 'Use when')"
+            )
+        fm["description"] = repaired
+        desc = repaired
     # Pushy-pattern requirement — validate_skill_structure rule 5 rejects
     # descriptions that don't contain "Use when" in the first 250 chars.
     # Catch it here so the Engineer gets one retry instead of hitting the
diff --git a/skillforge/agents/managed_agents.py b/skillforge/agents/managed_agents.py
index 7ac3812..dd6430f 100644
--- a/skillforge/agents/managed_agents.py
+++ b/skillforge/agents/managed_agents.py
@@ -147,16 +147,31 @@ async def upload_skill(
          use that. The ``name`` arg is still used as the ``display_title``
          (which can be anything human-readable).
 
+    The Anthropic Skills API hard-requires the payload to start literally
+    with ``---``. A UTF-8 BOM or stray leading whitespace — which neither
+    our structural validator nor JSON round-tripping strips — is enough
+    to earn a ``400 SKILL.md must start with YAML frontmatter (---)``.
+    We normalize here so the ~1% of model outputs with a leading BOM or
+    whitespace still upload cleanly instead of falling back to inline.
+
     Returns the new ``skill_id``. The caller is responsible for archiving it
     via :func:`archive_skill` after the session completes.
     """
-    folder = _extract_skill_name_from_md(skill_md) or name
+    # Strip leading BOM + whitespace the API is strict about; don't touch
+    # the rest of the body so checksum/fitness stays stable.
+    normalized = skill_md.lstrip("\ufeff \t\r\n")
+    if not normalized.startswith("---"):
+        raise ValueError(
+            "upload_skill: skill_md does not start with YAML frontmatter (---) "
+            "after stripping BOM/whitespace — refusing to call the API"
+        )
+    folder = _extract_skill_name_from_md(normalized) or name
     resp = await client.beta.skills.create(
         display_title=name,
         files=[
             (
                 f"{folder}/SKILL.md",
-                skill_md.encode("utf-8"),
+                normalized.encode("utf-8"),
                 "text/markdown",
             )
         ],
diff --git a/tests/test_engineer.py b/tests/test_engineer.py
index c6fc9c8..8a519b1 100644
--- a/tests/test_engineer.py
+++ b/tests/test_engineer.py
@@ -21,6 +21,7 @@
 from skillforge.agents.engineer import (
     IntegrationReport,
     _detect_conflicts,
+    _try_truncate_description,
     _validate_composite_shape,
     assemble_variants,
 )
@@ -132,10 +133,33 @@ def test_validate_composite_shape_rejects_missing_frontmatter():
         _validate_composite_shape(raw)
 
 
-def test_validate_composite_shape_rejects_oversize_description():
+def test_validate_composite_shape_truncates_oversize_description_in_place():
+    """Oversize description with 'Use when' early is truncated, not rejected.
+
+    Pre-wave-4 refactor this test used to assert the length check raised.
+    We now repair in place because Haiku-tier runs routinely overshoot by
+    10–40% and a no-op truncation saves an LLM retry (~$0.20). See also
+    `test_try_truncate_description_*` below for the pure-helper tests.
+    """
+    raw = json.loads(_canned_engineer_response())
+    raw["frontmatter"]["description"] = (
+        "Use when evolved family is ready to assemble into a single composite "
+        "skill that merges foundation traits with capability traits "
+        + "and " * 60  # balloon it past 250 chars
+    )
+    _validate_composite_shape(raw)  # no exception — truncation repairs
+    assert len(raw["frontmatter"]["description"]) <= 250
+    assert "use when" in raw["frontmatter"]["description"].lower()
+
+
+def test_validate_composite_shape_rejects_unsalvageable_oversize_description():
+    """Oversize description where 'Use when' sits past char 250 cannot be
+    salvaged by truncation — the validator should raise."""
     raw = json.loads(_canned_engineer_response())
-    raw["frontmatter"]["description"] = "Use when " + "x" * 300
-    with pytest.raises(ValueError, match="> 250"):
+    raw["frontmatter"]["description"] = (
+        "x" * 260 + " Use when this lands past the truncation budget"
+    )
+    with pytest.raises(ValueError, match="cannot be safely truncated"):
         _validate_composite_shape(raw)
 
 
@@ -146,6 +170,32 @@ def test_validate_composite_shape_rejects_missing_use_when():
         _validate_composite_shape(raw)
 
 
+# ---------------------------------------------------------------------------
+# _try_truncate_description — pure helper
+# ---------------------------------------------------------------------------
+
+
+def test_try_truncate_description_noop_when_under_cap():
+    assert _try_truncate_description("short desc, Use when x") == "short desc, Use when x"
+
+
+def test_try_truncate_description_repairs_oversize_at_word_boundary():
+    desc = "Use when evolved family is ready. " + "word " * 100
+    out = _try_truncate_description(desc)
+    assert out is not None
+    assert len(out) <= 250
+    assert out.endswith("...")
+    assert "use when" in out.lower()
+    # Boundary was a word boundary, not a mid-word cut.
+    body = out.removesuffix("...")
+    assert not body.endswith(" ")
+
+
+def test_try_truncate_description_returns_none_when_use_when_is_past_cap():
+    desc = "x" * 300 + " Use when we care about long prefixes"
+    assert _try_truncate_description(desc) is None
+
+
 # ---------------------------------------------------------------------------
 # assemble_variants — full path with mocked LLM
 # ---------------------------------------------------------------------------
diff --git a/tests/test_managed_agents.py b/tests/test_managed_agents.py
index aaf5127..0ea38e7 100644
--- a/tests/test_managed_agents.py
+++ b/tests/test_managed_agents.py
@@ -394,19 +394,52 @@ async def test_upload_skill_folder_name_matches_frontmatter_name():
 
 
 @pytest.mark.asyncio
-async def test_upload_skill_falls_back_to_arg_name_when_frontmatter_missing():
-    """If SKILL.md has no parseable name, use the ``name`` arg as the folder."""
+async def test_upload_skill_rejects_payload_without_frontmatter():
+    """Content without leading ``---`` is rejected locally, never sent to the API.
+
+    Empirical finding from a live Haiku-tier run: ~1% of managed-backend
+    uploads returned ``400 SKILL.md must start with YAML frontmatter (---)``.
+    Catching it here (instead of after the round-trip) gives a clear error
+    and keeps the fallback-to-inline code path in ``competitor_managed``
+    from being triggered by genuinely bad payloads.
+    """
+    mock_client = MagicMock()
+    mock_client.beta.skills.create = AsyncMock()
+
+    skill_md = "no frontmatter at all, just prose"
+    with pytest.raises(ValueError, match="does not start with YAML frontmatter"):
+        await managed_agents.upload_skill(
+            mock_client, name="fallback-name", skill_md=skill_md
+        )
+    # Never actually called the API
+    mock_client.beta.skills.create.assert_not_called()
+
+
+@pytest.mark.asyncio
+async def test_upload_skill_strips_bom_and_leading_whitespace():
+    """A leading UTF-8 BOM or stray whitespace is stripped before upload.
+
+    The API is byte-strict about the payload starting with ``---`` —
+    models occasionally emit content with a leading BOM or space that the
+    structural validator happens to tolerate. Normalizing here means the
+    upload succeeds instead of falling back to inline over something
+    cosmetic.
+    """
     mock_client = MagicMock()
     mock_response = MagicMock()
-    mock_response.id = "skill_fallback_999"
+    mock_response.id = "skill_normalized"
     mock_client.beta.skills.create = AsyncMock(return_value=mock_response)
 
-    skill_md = "no frontmatter at all, just prose"
-    await managed_agents.upload_skill(
-        mock_client, name="fallback-name", skill_md=skill_md
+    skill_md = "\ufeff  \n---\nname: bom-skill\n---\n# x"
+    result = await managed_agents.upload_skill(
+        mock_client, name="display", skill_md=skill_md
     )
+    assert result == "skill_normalized"
     files = mock_client.beta.skills.create.call_args.kwargs["files"]
-    assert files[0][0] == "fallback-name/SKILL.md"
+    filename, content_bytes, _ = files[0]
+    assert filename == "bom-skill/SKILL.md"
+    # BOM/whitespace stripped; payload starts exactly with ``---``
+    assert content_bytes.startswith(b"---\n")
 
 
 def test_extract_skill_name_from_md_happy_path():

From ff4ee883e87a84f1609f6f1b5c1db740adfc170d Mon Sep 17 00:00:00 2001
From: "Matt (via Claude Code)" <matt@skillforge.local>
Date: Sun, 19 Apr 2026 15:48:13 -0500
Subject: [PATCH 2/2] fix: spawner auto-repairs missing ${CLAUDE_SKILL_DIR}
 reference files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Third LLM-fragility bug from the cheap-Haiku atomic run (same class as
the two fixed in the previous commit): the Spawner routinely emits
SKILL.md bodies that reference references/*-guide.md files in prose
but forgets to include the file in supporting_files. Structural rule 8
rejects those genomes, and in atomic mode (pop=2, 1 retry) this was
killing the whole dimension 1-of-3 times.

Fix: new _auto_repair_missing_references helper runs before
_validate_genomes. For each ${CLAUDE_SKILL_DIR}/<path> reference
missing from supporting_files, stub a placeholder file with a clear
auto-generated marker. The skill renders, the reference resolves at
runtime, validation passes, and the Breeder can flesh out the stub
in later generations if the signal warrants it.

Same defensive-repair pattern as the Engineer description truncation:
cheap LLM produces almost-valid output, we repair at the boundary
instead of burning another ~\$0.20 retry that often reproduces the
same oversight.

Covered by 2 new unit tests (stubs-missing + noop-when-present).

QA: ruff + mypy + 410 pytest (+2 from new tests) — all green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 skillforge/agents/spawner.py | 52 +++++++++++++++++++++++++++++++++++-
 tests/test_agents.py         | 48 +++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/skillforge/agents/spawner.py b/skillforge/agents/spawner.py
index c25c036..096bbc7 100644
--- a/skillforge/agents/spawner.py
+++ b/skillforge/agents/spawner.py
@@ -15,6 +15,7 @@
 
 from __future__ import annotations
 
+import re
 import uuid
 
 from anthropic import AsyncAnthropic
@@ -150,13 +151,62 @@ async def _generate(prompt: str) -> str:
     return "".join(parts)
 
 
+# Pulls ${CLAUDE_SKILL_DIR}/<relative/path> references out of a SKILL.md body.
+# Must match the regex in ``engine.sandbox.validate_skill_structure`` rule 8
+# exactly — see that function for the source-of-truth behavior.
+_REF_PATH_RE = re.compile(r"\$\{CLAUDE_SKILL_DIR\}/([^\s`)\"']+)")
+
+
+def _auto_repair_missing_references(genome: SkillGenome) -> int:
+    """Stub out ``${CLAUDE_SKILL_DIR}/<path>`` refs missing from supporting_files.
+
+    Cheap-tier Haiku routinely emits SKILL.md bodies that reference
+    ``references/*-guide.md`` in prose but forget to include the file in
+    ``supporting_files``. Validator rule 8 rejects those genomes, which
+    in atomic mode (pop=2, 1 retry) was killing the whole run 1-of-3 times.
+
+    Rather than burn another LLM call on a retry that often reproduces
+    the same oversight, we stub each missing reference with a minimal
+    placeholder. The skill still renders, the reference still resolves
+    at runtime, and the genome passes validation. The Breeder can flesh
+    out the stubs in later generations if fitness signal suggests they
+    carry weight.
+
+    Returns the count of paths that were stubbed (0 if everything already
+    resolved, which is the expected Sonnet-tier case).
+    """
+    stubbed = 0
+    for match in _REF_PATH_RE.finditer(genome.skill_md_content):
+        rel_path = match.group(1).rstrip(".,;:)")
+        if rel_path in genome.supporting_files:
+            continue
+        filename = rel_path.rsplit("/", 1)[-1]
+        placeholder_title = filename.removesuffix(".md").replace("-", " ").title()
+        genome.supporting_files[rel_path] = (
+            f"# {placeholder_title}\n\n"
+            f"_Placeholder — stubbed by the spawner's auto-repair pass "
+            f"because the generating LLM referenced this file but did not "
+            f"emit its contents. Replace with domain-specific material "
+            f"during a later generation._\n"
+        )
+        stubbed += 1
+    return stubbed
+
+
 def _validate_genomes(
     genomes: list[SkillGenome],
 ) -> tuple[list[SkillGenome], dict[int, list[str]]]:
-    """Validate each genome; returns (valid_genomes, {idx: violations})."""
+    """Validate each genome; returns (valid_genomes, {idx: violations}).
+
+    Runs the reference-path auto-repair pass before validation so cheap-tier
+    LLM drift on rule 8 (missing supporting_files entries) doesn't kill a
+    whole population. The repair only adds files; it never touches the
+    skill_md body.
+    """
     valid: list[SkillGenome] = []
     invalid: dict[int, list[str]] = {}
     for i, genome in enumerate(genomes):
+        _auto_repair_missing_references(genome)
         violations = validate_skill_structure(genome)
         if violations:
             invalid[i] = violations
diff --git a/tests/test_agents.py b/tests/test_agents.py
index 6299446..1c990e7 100644
--- a/tests/test_agents.py
+++ b/tests/test_agents.py
@@ -375,6 +375,54 @@ async def test_spawn_gen0_raises_on_persistent_invalid():
         await spawn_gen0("test domain", pop_size=1)
 
 
+def test_auto_repair_missing_references_stubs_missing_files():
+    """Rule-8 drift: a body references a file that isn't in supporting_files.
+
+    The repair pass stubs it with a placeholder so the genome passes
+    structural validation instead of killing the whole population on a
+    cosmetic miss.
+    """
+    from skillforge.agents.spawner import _auto_repair_missing_references
+
+    skill_md = (
+        "---\nname: x\ndescription: Use when y.\n---\n\n"
+        "# X\n\nSee ${CLAUDE_SKILL_DIR}/references/style-guide.md for details.\n"
+    )
+    genome = SkillGenome(
+        id="g1",
+        generation=0,
+        skill_md_content=skill_md,
+        supporting_files={},
+        traits=[],
+    )
+    stubbed = _auto_repair_missing_references(genome)
+    assert stubbed == 1
+    assert "references/style-guide.md" in genome.supporting_files
+    assert genome.supporting_files["references/style-guide.md"].startswith(
+        "# Style Guide"
+    )
+
+
+def test_auto_repair_missing_references_noop_when_all_present():
+    """If every reference already resolves, the repair must not touch anything."""
+    from skillforge.agents.spawner import _auto_repair_missing_references
+
+    skill_md = (
+        "---\nname: x\ndescription: Use when y.\n---\n\n"
+        "# X\n\nSee ${CLAUDE_SKILL_DIR}/references/guide.md.\n"
+    )
+    genome = SkillGenome(
+        id="g2",
+        generation=0,
+        skill_md_content=skill_md,
+        supporting_files={"references/guide.md": "# real content"},
+        traits=[],
+    )
+    stubbed = _auto_repair_missing_references(genome)
+    assert stubbed == 0
+    assert genome.supporting_files["references/guide.md"] == "# real content"
+
+
 # ===========================================================================
 # Spawner tests — breed_next_gen
 # ===========================================================================