From aaa8395f588996f4cbd51eea42c0029b1864ad64 Mon Sep 17 00:00:00 2001 From: "Matt (via Claude Code)" Date: Sun, 19 Apr 2026 15:38:18 -0500 Subject: [PATCH 1/2] fix: live-test bugs from cheap-Haiku atomic run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes two bugs surfaced by the first live atomic-evolution run on the cheap Haiku tier (see journal #17). 1. Engineer oversize description (assembly-killer) ------------------------------------------------- Haiku routinely overshoots the 250-char composite description cap by 10–40%. The previous behavior raised ValueError, which triggered an ~$0.20 LLM retry that often produced the same class of overshoot. For 2-of-3 runs it killed the whole atomic pipeline. Fix: add `_try_truncate_description` helper that repairs oversize descriptions at a word boundary when it can do so without clobbering the "Use when" pushy-pattern marker. `_validate_composite_shape` now repairs in place before raising; only truly unsalvageable cases (no "Use when" in the first 250 chars) still raise. Covered by 3 new unit tests plus the updated existing oversize test. 2. Managed-agents skill upload YAML frontmatter 400 (upload-killer) ------------------------------------------------------------------- ~1% of `managed_agents.upload_skill` calls returned ``400 SKILL.md must start with YAML frontmatter (---)`` — the Anthropic Skills API is byte-strict about the leading ``---``. Models occasionally prepend a UTF-8 BOM or stray whitespace that the structural validator (which uses `startswith("---")` after standard string handling) happens to tolerate. Fix: `upload_skill` now `lstrip`s BOM + whitespace from the payload before calling the API. If the normalized content still doesn't start with ``---``, raise ValueError with a clear message instead of round-tripping to a generic 400. Caller (`competitor_managed`) still falls back to inline for genuine bad content; BOM/whitespace-only damage now uploads cleanly instead of falling back. Covered by 2 new unit tests (reject + BOM-strip). QA -- ruff check skillforge - clean mypy skillforge - 65 files pass pytest tests/ - 408 passed (+5), 2 skipped Co-Authored-By: Claude Opus 4.7 (1M context) --- skillforge/agents/engineer.py | 44 +++++++++++++++++++++-- skillforge/agents/managed_agents.py | 19 ++++++++-- tests/test_engineer.py | 56 +++++++++++++++++++++++++++-- tests/test_managed_agents.py | 47 ++++++++++++++++++++---- 4 files changed, 151 insertions(+), 15 deletions(-) diff --git a/skillforge/agents/engineer.py b/skillforge/agents/engineer.py index 5ed0037..7a09170 100644 --- a/skillforge/agents/engineer.py +++ b/skillforge/agents/engineer.py @@ -240,6 +240,38 @@ def _extract_json_object(text: str) -> dict[str, Any]: raise ValueError("unterminated JSON object in Engineer response") +def _try_truncate_description(desc: str, cap: int = 250) -> str | None: + """Return a truncated description that fits under ``cap`` chars, or None. + + Haiku-tier runs routinely over-shoot the 250-char frontmatter cap by + 10–40% — the constraint is in the prompt but the model doesn't + enforce it reliably. Retrying on length alone costs a full LLM call + (~$0.20) to re-generate content that's 99% correct; truncating at a + word boundary is free and preserves the semantic payload. + + Returns None if we can't truncate without clobbering the "Use when" + pushy-pattern marker — at that point it's a genuine content + failure and the caller should retry or fail loudly. + """ + if len(desc) <= cap: + return desc + + # Reserve 3 chars for the ellipsis suffix. + target = cap - 3 + cut = desc.rfind(" ", 0, target + 1) + if cut == -1 or cut < cap // 2: + # No word boundary in the useful range — hard-cut at target. + cut = target + truncated = desc[:cut].rstrip(" ,.;:") + "..." + + # The composite validator below requires "Use when" in the first + # 250 chars; if truncation erased it, the repair is worse than the + # original problem. + if "use when" not in truncated.lower(): + return None + return truncated + + def _validate_composite_shape(raw: dict[str, Any]) -> None: if not isinstance(raw, dict): raise ValueError("Engineer output must be a JSON object") @@ -253,9 +285,15 @@ def _validate_composite_shape(raw: dict[str, Any]) -> None: desc = fm.get("description", "") if isinstance(desc, str) and len(desc) > 250: - raise ValueError( - f"Engineer composite description is {len(desc)} chars > 250 limit" - ) + # Try the cheap repair before escalating to an LLM retry. + repaired = _try_truncate_description(desc) + if repaired is None: + raise ValueError( + f"Engineer composite description is {len(desc)} chars > 250 " + f"limit and cannot be safely truncated (would remove 'Use when')" + ) + fm["description"] = repaired + desc = repaired # Pushy-pattern requirement — validate_skill_structure rule 5 rejects # descriptions that don't contain "Use when" in the first 250 chars. # Catch it here so the Engineer gets one retry instead of hitting the diff --git a/skillforge/agents/managed_agents.py b/skillforge/agents/managed_agents.py index 7ac3812..dd6430f 100644 --- a/skillforge/agents/managed_agents.py +++ b/skillforge/agents/managed_agents.py @@ -147,16 +147,31 @@ async def upload_skill( use that. The ``name`` arg is still used as the ``display_title`` (which can be anything human-readable). + The Anthropic Skills API hard-requires the payload to start literally + with ``---``. A UTF-8 BOM or stray leading whitespace — which neither + our structural validator nor JSON round-tripping strips — is enough + to earn a ``400 SKILL.md must start with YAML frontmatter (---)``. + We normalize here so the ~1% of model outputs with a leading BOM or + whitespace still upload cleanly instead of falling back to inline. + Returns the new ``skill_id``. The caller is responsible for archiving it via :func:`archive_skill` after the session completes. """ - folder = _extract_skill_name_from_md(skill_md) or name + # Strip leading BOM + whitespace the API is strict about; don't touch + # the rest of the body so checksum/fitness stays stable. + normalized = skill_md.lstrip("\ufeff \t\r\n") + if not normalized.startswith("---"): + raise ValueError( + "upload_skill: skill_md does not start with YAML frontmatter (---) " + "after stripping BOM/whitespace — refusing to call the API" + ) + folder = _extract_skill_name_from_md(normalized) or name resp = await client.beta.skills.create( display_title=name, files=[ ( f"{folder}/SKILL.md", - skill_md.encode("utf-8"), + normalized.encode("utf-8"), "text/markdown", ) ], diff --git a/tests/test_engineer.py b/tests/test_engineer.py index c6fc9c8..8a519b1 100644 --- a/tests/test_engineer.py +++ b/tests/test_engineer.py @@ -21,6 +21,7 @@ from skillforge.agents.engineer import ( IntegrationReport, _detect_conflicts, + _try_truncate_description, _validate_composite_shape, assemble_variants, ) @@ -132,10 +133,33 @@ def test_validate_composite_shape_rejects_missing_frontmatter(): _validate_composite_shape(raw) -def test_validate_composite_shape_rejects_oversize_description(): +def test_validate_composite_shape_truncates_oversize_description_in_place(): + """Oversize description with 'Use when' early is truncated, not rejected. + + Pre-wave-4 refactor this test used to assert the length check raised. + We now repair in place because Haiku-tier runs routinely overshoot by + 10–40% and a no-op truncation saves an LLM retry (~$0.20). See also + `test_try_truncate_description_*` below for the pure-helper tests. + """ + raw = json.loads(_canned_engineer_response()) + raw["frontmatter"]["description"] = ( + "Use when evolved family is ready to assemble into a single composite " + "skill that merges foundation traits with capability traits " + + "and " * 60 # balloon it past 250 chars + ) + _validate_composite_shape(raw) # no exception — truncation repairs + assert len(raw["frontmatter"]["description"]) <= 250 + assert "use when" in raw["frontmatter"]["description"].lower() + + +def test_validate_composite_shape_rejects_unsalvageable_oversize_description(): + """Oversize description where 'Use when' sits past char 250 cannot be + salvaged by truncation — the validator should raise.""" raw = json.loads(_canned_engineer_response()) - raw["frontmatter"]["description"] = "Use when " + "x" * 300 - with pytest.raises(ValueError, match="> 250"): + raw["frontmatter"]["description"] = ( + "x" * 260 + " Use when this lands past the truncation budget" + ) + with pytest.raises(ValueError, match="cannot be safely truncated"): _validate_composite_shape(raw) @@ -146,6 +170,32 @@ def test_validate_composite_shape_rejects_missing_use_when(): _validate_composite_shape(raw) +# --------------------------------------------------------------------------- +# _try_truncate_description — pure helper +# --------------------------------------------------------------------------- + + +def test_try_truncate_description_noop_when_under_cap(): + assert _try_truncate_description("short desc, Use when x") == "short desc, Use when x" + + +def test_try_truncate_description_repairs_oversize_at_word_boundary(): + desc = "Use when evolved family is ready. " + "word " * 100 + out = _try_truncate_description(desc) + assert out is not None + assert len(out) <= 250 + assert out.endswith("...") + assert "use when" in out.lower() + # Boundary was a word boundary, not a mid-word cut. + body = out.removesuffix("...") + assert not body.endswith(" ") + + +def test_try_truncate_description_returns_none_when_use_when_is_past_cap(): + desc = "x" * 300 + " Use when we care about long prefixes" + assert _try_truncate_description(desc) is None + + # --------------------------------------------------------------------------- # assemble_variants — full path with mocked LLM # --------------------------------------------------------------------------- diff --git a/tests/test_managed_agents.py b/tests/test_managed_agents.py index aaf5127..0ea38e7 100644 --- a/tests/test_managed_agents.py +++ b/tests/test_managed_agents.py @@ -394,19 +394,52 @@ async def test_upload_skill_folder_name_matches_frontmatter_name(): @pytest.mark.asyncio -async def test_upload_skill_falls_back_to_arg_name_when_frontmatter_missing(): - """If SKILL.md has no parseable name, use the ``name`` arg as the folder.""" +async def test_upload_skill_rejects_payload_without_frontmatter(): + """Content without leading ``---`` is rejected locally, never sent to the API. + + Empirical finding from a live Haiku-tier run: ~1% of managed-backend + uploads returned ``400 SKILL.md must start with YAML frontmatter (---)``. + Catching it here (instead of after the round-trip) gives a clear error + and keeps the fallback-to-inline code path in ``competitor_managed`` + from being triggered by genuinely bad payloads. + """ + mock_client = MagicMock() + mock_client.beta.skills.create = AsyncMock() + + skill_md = "no frontmatter at all, just prose" + with pytest.raises(ValueError, match="does not start with YAML frontmatter"): + await managed_agents.upload_skill( + mock_client, name="fallback-name", skill_md=skill_md + ) + # Never actually called the API + mock_client.beta.skills.create.assert_not_called() + + +@pytest.mark.asyncio +async def test_upload_skill_strips_bom_and_leading_whitespace(): + """A leading UTF-8 BOM or stray whitespace is stripped before upload. + + The API is byte-strict about the payload starting with ``---`` — + models occasionally emit content with a leading BOM or space that the + structural validator happens to tolerate. Normalizing here means the + upload succeeds instead of falling back to inline over something + cosmetic. + """ mock_client = MagicMock() mock_response = MagicMock() - mock_response.id = "skill_fallback_999" + mock_response.id = "skill_normalized" mock_client.beta.skills.create = AsyncMock(return_value=mock_response) - skill_md = "no frontmatter at all, just prose" - await managed_agents.upload_skill( - mock_client, name="fallback-name", skill_md=skill_md + skill_md = "\ufeff \n---\nname: bom-skill\n---\n# x" + result = await managed_agents.upload_skill( + mock_client, name="display", skill_md=skill_md ) + assert result == "skill_normalized" files = mock_client.beta.skills.create.call_args.kwargs["files"] - assert files[0][0] == "fallback-name/SKILL.md" + filename, content_bytes, _ = files[0] + assert filename == "bom-skill/SKILL.md" + # BOM/whitespace stripped; payload starts exactly with ``---`` + assert content_bytes.startswith(b"---\n") def test_extract_skill_name_from_md_happy_path(): From ff4ee883e87a84f1609f6f1b5c1db740adfc170d Mon Sep 17 00:00:00 2001 From: "Matt (via Claude Code)" Date: Sun, 19 Apr 2026 15:48:13 -0500 Subject: [PATCH 2/2] fix: spawner auto-repairs missing ${CLAUDE_SKILL_DIR} reference files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Third LLM-fragility bug from the cheap-Haiku atomic run (same class as the two fixed in the previous commit): the Spawner routinely emits SKILL.md bodies that reference references/*-guide.md files in prose but forgets to include the file in supporting_files. Structural rule 8 rejects those genomes, and in atomic mode (pop=2, 1 retry) this was killing the whole dimension 1-of-3 times. Fix: new _auto_repair_missing_references helper runs before _validate_genomes. For each ${CLAUDE_SKILL_DIR}/ reference missing from supporting_files, stub a placeholder file with a clear auto-generated marker. The skill renders, the reference resolves at runtime, validation passes, and the Breeder can flesh out the stub in later generations if the signal warrants it. Same defensive-repair pattern as the Engineer description truncation: cheap LLM produces almost-valid output, we repair at the boundary instead of burning another ~\$0.20 retry that often reproduces the same oversight. Covered by 2 new unit tests (stubs-missing + noop-when-present). QA: ruff + mypy + 410 pytest (+2 from new tests) — all green. Co-Authored-By: Claude Opus 4.7 (1M context) --- skillforge/agents/spawner.py | 52 +++++++++++++++++++++++++++++++++++- tests/test_agents.py | 48 +++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/skillforge/agents/spawner.py b/skillforge/agents/spawner.py index c25c036..096bbc7 100644 --- a/skillforge/agents/spawner.py +++ b/skillforge/agents/spawner.py @@ -15,6 +15,7 @@ from __future__ import annotations +import re import uuid from anthropic import AsyncAnthropic @@ -150,13 +151,62 @@ async def _generate(prompt: str) -> str: return "".join(parts) +# Pulls ${CLAUDE_SKILL_DIR}/ references out of a SKILL.md body. +# Must match the regex in ``engine.sandbox.validate_skill_structure`` rule 8 +# exactly — see that function for the source-of-truth behavior. +_REF_PATH_RE = re.compile(r"\$\{CLAUDE_SKILL_DIR\}/([^\s`)\"']+)") + + +def _auto_repair_missing_references(genome: SkillGenome) -> int: + """Stub out ``${CLAUDE_SKILL_DIR}/`` refs missing from supporting_files. + + Cheap-tier Haiku routinely emits SKILL.md bodies that reference + ``references/*-guide.md`` in prose but forget to include the file in + ``supporting_files``. Validator rule 8 rejects those genomes, which + in atomic mode (pop=2, 1 retry) was killing the whole run 1-of-3 times. + + Rather than burn another LLM call on a retry that often reproduces + the same oversight, we stub each missing reference with a minimal + placeholder. The skill still renders, the reference still resolves + at runtime, and the genome passes validation. The Breeder can flesh + out the stubs in later generations if fitness signal suggests they + carry weight. + + Returns the count of paths that were stubbed (0 if everything already + resolved, which is the expected Sonnet-tier case). + """ + stubbed = 0 + for match in _REF_PATH_RE.finditer(genome.skill_md_content): + rel_path = match.group(1).rstrip(".,;:)") + if rel_path in genome.supporting_files: + continue + filename = rel_path.rsplit("/", 1)[-1] + placeholder_title = filename.removesuffix(".md").replace("-", " ").title() + genome.supporting_files[rel_path] = ( + f"# {placeholder_title}\n\n" + f"_Placeholder — stubbed by the spawner's auto-repair pass " + f"because the generating LLM referenced this file but did not " + f"emit its contents. Replace with domain-specific material " + f"during a later generation._\n" + ) + stubbed += 1 + return stubbed + + def _validate_genomes( genomes: list[SkillGenome], ) -> tuple[list[SkillGenome], dict[int, list[str]]]: - """Validate each genome; returns (valid_genomes, {idx: violations}).""" + """Validate each genome; returns (valid_genomes, {idx: violations}). + + Runs the reference-path auto-repair pass before validation so cheap-tier + LLM drift on rule 8 (missing supporting_files entries) doesn't kill a + whole population. The repair only adds files; it never touches the + skill_md body. + """ valid: list[SkillGenome] = [] invalid: dict[int, list[str]] = {} for i, genome in enumerate(genomes): + _auto_repair_missing_references(genome) violations = validate_skill_structure(genome) if violations: invalid[i] = violations diff --git a/tests/test_agents.py b/tests/test_agents.py index 6299446..1c990e7 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -375,6 +375,54 @@ async def test_spawn_gen0_raises_on_persistent_invalid(): await spawn_gen0("test domain", pop_size=1) +def test_auto_repair_missing_references_stubs_missing_files(): + """Rule-8 drift: a body references a file that isn't in supporting_files. + + The repair pass stubs it with a placeholder so the genome passes + structural validation instead of killing the whole population on a + cosmetic miss. + """ + from skillforge.agents.spawner import _auto_repair_missing_references + + skill_md = ( + "---\nname: x\ndescription: Use when y.\n---\n\n" + "# X\n\nSee ${CLAUDE_SKILL_DIR}/references/style-guide.md for details.\n" + ) + genome = SkillGenome( + id="g1", + generation=0, + skill_md_content=skill_md, + supporting_files={}, + traits=[], + ) + stubbed = _auto_repair_missing_references(genome) + assert stubbed == 1 + assert "references/style-guide.md" in genome.supporting_files + assert genome.supporting_files["references/style-guide.md"].startswith( + "# Style Guide" + ) + + +def test_auto_repair_missing_references_noop_when_all_present(): + """If every reference already resolves, the repair must not touch anything.""" + from skillforge.agents.spawner import _auto_repair_missing_references + + skill_md = ( + "---\nname: x\ndescription: Use when y.\n---\n\n" + "# X\n\nSee ${CLAUDE_SKILL_DIR}/references/guide.md.\n" + ) + genome = SkillGenome( + id="g2", + generation=0, + skill_md_content=skill_md, + supporting_files={"references/guide.md": "# real content"}, + traits=[], + ) + stubbed = _auto_repair_missing_references(genome) + assert stubbed == 0 + assert genome.supporting_files["references/guide.md"] == "# real content" + + # =========================================================================== # Spawner tests — breed_next_gen # ===========================================================================