Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 41 additions & 3 deletions skillforge/agents/engineer.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,38 @@ def _extract_json_object(text: str) -> dict[str, Any]:
raise ValueError("unterminated JSON object in Engineer response")


def _try_truncate_description(desc: str, cap: int = 250) -> str | None:
"""Return a truncated description that fits under ``cap`` chars, or None.

Haiku-tier runs routinely over-shoot the 250-char frontmatter cap by
10–40% — the constraint is in the prompt but the model doesn't
enforce it reliably. Retrying on length alone costs a full LLM call
(~$0.20) to re-generate content that's 99% correct; truncating at a
word boundary is free and preserves the semantic payload.

Returns None if we can't truncate without clobbering the "Use when"
pushy-pattern marker — at that point it's a genuine content
failure and the caller should retry or fail loudly.
"""
if len(desc) <= cap:
return desc

# Reserve 3 chars for the ellipsis suffix.
target = cap - 3
cut = desc.rfind(" ", 0, target + 1)
if cut == -1 or cut < cap // 2:
# No word boundary in the useful range — hard-cut at target.
cut = target
truncated = desc[:cut].rstrip(" ,.;:") + "..."

# The composite validator below requires "Use when" in the first
# 250 chars; if truncation erased it, the repair is worse than the
# original problem.
if "use when" not in truncated.lower():
return None
return truncated


def _validate_composite_shape(raw: dict[str, Any]) -> None:
if not isinstance(raw, dict):
raise ValueError("Engineer output must be a JSON object")
Expand All @@ -253,9 +285,15 @@ def _validate_composite_shape(raw: dict[str, Any]) -> None:

desc = fm.get("description", "")
if isinstance(desc, str) and len(desc) > 250:
raise ValueError(
f"Engineer composite description is {len(desc)} chars > 250 limit"
)
# Try the cheap repair before escalating to an LLM retry.
repaired = _try_truncate_description(desc)
if repaired is None:
raise ValueError(
f"Engineer composite description is {len(desc)} chars > 250 "
f"limit and cannot be safely truncated (would remove 'Use when')"
)
fm["description"] = repaired
desc = repaired
# Pushy-pattern requirement — validate_skill_structure rule 5 rejects
# descriptions that don't contain "Use when" in the first 250 chars.
# Catch it here so the Engineer gets one retry instead of hitting the
Expand Down
19 changes: 17 additions & 2 deletions skillforge/agents/managed_agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,16 +147,31 @@ async def upload_skill(
use that. The ``name`` arg is still used as the ``display_title``
(which can be anything human-readable).

The Anthropic Skills API hard-requires the payload to start literally
with ``---``. A UTF-8 BOM or stray leading whitespace — which neither
our structural validator nor JSON round-tripping strips — is enough
to earn a ``400 SKILL.md must start with YAML frontmatter (---)``.
We normalize here so the ~1% of model outputs with a leading BOM or
whitespace still upload cleanly instead of falling back to inline.

Returns the new ``skill_id``. The caller is responsible for archiving it
via :func:`archive_skill` after the session completes.
"""
folder = _extract_skill_name_from_md(skill_md) or name
# Strip leading BOM + whitespace the API is strict about; don't touch
# the rest of the body so checksum/fitness stays stable.
normalized = skill_md.lstrip("\ufeff \t\r\n")
if not normalized.startswith("---"):
raise ValueError(
"upload_skill: skill_md does not start with YAML frontmatter (---) "
"after stripping BOM/whitespace — refusing to call the API"
)
folder = _extract_skill_name_from_md(normalized) or name
resp = await client.beta.skills.create(
display_title=name,
files=[
(
f"{folder}/SKILL.md",
skill_md.encode("utf-8"),
normalized.encode("utf-8"),
"text/markdown",
)
],
Expand Down
52 changes: 51 additions & 1 deletion skillforge/agents/spawner.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from __future__ import annotations

import re
import uuid

from anthropic import AsyncAnthropic
Expand Down Expand Up @@ -150,13 +151,62 @@ async def _generate(prompt: str) -> str:
return "".join(parts)


# Pulls ${CLAUDE_SKILL_DIR}/<relative/path> references out of a SKILL.md body.
# Must match the regex in ``engine.sandbox.validate_skill_structure`` rule 8
# exactly — see that function for the source-of-truth behavior.
_REF_PATH_RE = re.compile(r"\$\{CLAUDE_SKILL_DIR\}/([^\s`)\"']+)")


def _auto_repair_missing_references(genome: SkillGenome) -> int:
"""Stub out ``${CLAUDE_SKILL_DIR}/<path>`` refs missing from supporting_files.

Cheap-tier Haiku routinely emits SKILL.md bodies that reference
``references/*-guide.md`` in prose but forget to include the file in
``supporting_files``. Validator rule 8 rejects those genomes, which
in atomic mode (pop=2, 1 retry) was killing the whole run 1-of-3 times.

Rather than burn another LLM call on a retry that often reproduces
the same oversight, we stub each missing reference with a minimal
placeholder. The skill still renders, the reference still resolves
at runtime, and the genome passes validation. The Breeder can flesh
out the stubs in later generations if fitness signal suggests they
carry weight.

Returns the count of paths that were stubbed (0 if everything already
resolved, which is the expected Sonnet-tier case).
"""
stubbed = 0
for match in _REF_PATH_RE.finditer(genome.skill_md_content):
rel_path = match.group(1).rstrip(".,;:)")
if rel_path in genome.supporting_files:
continue
filename = rel_path.rsplit("/", 1)[-1]
placeholder_title = filename.removesuffix(".md").replace("-", " ").title()
genome.supporting_files[rel_path] = (
f"# {placeholder_title}\n\n"
f"_Placeholder — stubbed by the spawner's auto-repair pass "
f"because the generating LLM referenced this file but did not "
f"emit its contents. Replace with domain-specific material "
f"during a later generation._\n"
)
stubbed += 1
return stubbed


def _validate_genomes(
genomes: list[SkillGenome],
) -> tuple[list[SkillGenome], dict[int, list[str]]]:
"""Validate each genome; returns (valid_genomes, {idx: violations})."""
"""Validate each genome; returns (valid_genomes, {idx: violations}).

Runs the reference-path auto-repair pass before validation so cheap-tier
LLM drift on rule 8 (missing supporting_files entries) doesn't kill a
whole population. The repair only adds files; it never touches the
skill_md body.
"""
valid: list[SkillGenome] = []
invalid: dict[int, list[str]] = {}
for i, genome in enumerate(genomes):
_auto_repair_missing_references(genome)
violations = validate_skill_structure(genome)
if violations:
invalid[i] = violations
Expand Down
48 changes: 48 additions & 0 deletions tests/test_agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,54 @@ async def test_spawn_gen0_raises_on_persistent_invalid():
await spawn_gen0("test domain", pop_size=1)


def test_auto_repair_missing_references_stubs_missing_files():
"""Rule-8 drift: a body references a file that isn't in supporting_files.

The repair pass stubs it with a placeholder so the genome passes
structural validation instead of killing the whole population on a
cosmetic miss.
"""
from skillforge.agents.spawner import _auto_repair_missing_references

skill_md = (
"---\nname: x\ndescription: Use when y.\n---\n\n"
"# X\n\nSee ${CLAUDE_SKILL_DIR}/references/style-guide.md for details.\n"
)
genome = SkillGenome(
id="g1",
generation=0,
skill_md_content=skill_md,
supporting_files={},
traits=[],
)
stubbed = _auto_repair_missing_references(genome)
assert stubbed == 1
assert "references/style-guide.md" in genome.supporting_files
assert genome.supporting_files["references/style-guide.md"].startswith(
"# Style Guide"
)


def test_auto_repair_missing_references_noop_when_all_present():
"""If every reference already resolves, the repair must not touch anything."""
from skillforge.agents.spawner import _auto_repair_missing_references

skill_md = (
"---\nname: x\ndescription: Use when y.\n---\n\n"
"# X\n\nSee ${CLAUDE_SKILL_DIR}/references/guide.md.\n"
)
genome = SkillGenome(
id="g2",
generation=0,
skill_md_content=skill_md,
supporting_files={"references/guide.md": "# real content"},
traits=[],
)
stubbed = _auto_repair_missing_references(genome)
assert stubbed == 0
assert genome.supporting_files["references/guide.md"] == "# real content"


# ===========================================================================
# Spawner tests — breed_next_gen
# ===========================================================================
Expand Down
56 changes: 53 additions & 3 deletions tests/test_engineer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from skillforge.agents.engineer import (
IntegrationReport,
_detect_conflicts,
_try_truncate_description,
_validate_composite_shape,
assemble_variants,
)
Expand Down Expand Up @@ -132,10 +133,33 @@ def test_validate_composite_shape_rejects_missing_frontmatter():
_validate_composite_shape(raw)


def test_validate_composite_shape_rejects_oversize_description():
def test_validate_composite_shape_truncates_oversize_description_in_place():
"""Oversize description with 'Use when' early is truncated, not rejected.

Pre-wave-4 refactor this test used to assert the length check raised.
We now repair in place because Haiku-tier runs routinely overshoot by
10–40% and a no-op truncation saves an LLM retry (~$0.20). See also
`test_try_truncate_description_*` below for the pure-helper tests.
"""
raw = json.loads(_canned_engineer_response())
raw["frontmatter"]["description"] = (
"Use when evolved family is ready to assemble into a single composite "
"skill that merges foundation traits with capability traits "
+ "and " * 60 # balloon it past 250 chars
)
_validate_composite_shape(raw) # no exception — truncation repairs
assert len(raw["frontmatter"]["description"]) <= 250
assert "use when" in raw["frontmatter"]["description"].lower()


def test_validate_composite_shape_rejects_unsalvageable_oversize_description():
"""Oversize description where 'Use when' sits past char 250 cannot be
salvaged by truncation — the validator should raise."""
raw = json.loads(_canned_engineer_response())
raw["frontmatter"]["description"] = "Use when " + "x" * 300
with pytest.raises(ValueError, match="> 250"):
raw["frontmatter"]["description"] = (
"x" * 260 + " Use when this lands past the truncation budget"
)
with pytest.raises(ValueError, match="cannot be safely truncated"):
_validate_composite_shape(raw)


Expand All @@ -146,6 +170,32 @@ def test_validate_composite_shape_rejects_missing_use_when():
_validate_composite_shape(raw)


# ---------------------------------------------------------------------------
# _try_truncate_description — pure helper
# ---------------------------------------------------------------------------


def test_try_truncate_description_noop_when_under_cap():
assert _try_truncate_description("short desc, Use when x") == "short desc, Use when x"


def test_try_truncate_description_repairs_oversize_at_word_boundary():
desc = "Use when evolved family is ready. " + "word " * 100
out = _try_truncate_description(desc)
assert out is not None
assert len(out) <= 250
assert out.endswith("...")
assert "use when" in out.lower()
# Boundary was a word boundary, not a mid-word cut.
body = out.removesuffix("...")
assert not body.endswith(" ")


def test_try_truncate_description_returns_none_when_use_when_is_past_cap():
desc = "x" * 300 + " Use when we care about long prefixes"
assert _try_truncate_description(desc) is None


# ---------------------------------------------------------------------------
# assemble_variants — full path with mocked LLM
# ---------------------------------------------------------------------------
Expand Down
47 changes: 40 additions & 7 deletions tests/test_managed_agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,19 +394,52 @@ async def test_upload_skill_folder_name_matches_frontmatter_name():


@pytest.mark.asyncio
async def test_upload_skill_falls_back_to_arg_name_when_frontmatter_missing():
"""If SKILL.md has no parseable name, use the ``name`` arg as the folder."""
async def test_upload_skill_rejects_payload_without_frontmatter():
"""Content without leading ``---`` is rejected locally, never sent to the API.

Empirical finding from a live Haiku-tier run: ~1% of managed-backend
uploads returned ``400 SKILL.md must start with YAML frontmatter (---)``.
Catching it here (instead of after the round-trip) gives a clear error
and keeps the fallback-to-inline code path in ``competitor_managed``
from being triggered by genuinely bad payloads.
"""
mock_client = MagicMock()
mock_client.beta.skills.create = AsyncMock()

skill_md = "no frontmatter at all, just prose"
with pytest.raises(ValueError, match="does not start with YAML frontmatter"):
await managed_agents.upload_skill(
mock_client, name="fallback-name", skill_md=skill_md
)
# Never actually called the API
mock_client.beta.skills.create.assert_not_called()


@pytest.mark.asyncio
async def test_upload_skill_strips_bom_and_leading_whitespace():
"""A leading UTF-8 BOM or stray whitespace is stripped before upload.

The API is byte-strict about the payload starting with ``---`` —
models occasionally emit content with a leading BOM or space that the
structural validator happens to tolerate. Normalizing here means the
upload succeeds instead of falling back to inline over something
cosmetic.
"""
mock_client = MagicMock()
mock_response = MagicMock()
mock_response.id = "skill_fallback_999"
mock_response.id = "skill_normalized"
mock_client.beta.skills.create = AsyncMock(return_value=mock_response)

skill_md = "no frontmatter at all, just prose"
await managed_agents.upload_skill(
mock_client, name="fallback-name", skill_md=skill_md
skill_md = "\ufeff \n---\nname: bom-skill\n---\n# x"
result = await managed_agents.upload_skill(
mock_client, name="display", skill_md=skill_md
)
assert result == "skill_normalized"
files = mock_client.beta.skills.create.call_args.kwargs["files"]
assert files[0][0] == "fallback-name/SKILL.md"
filename, content_bytes, _ = files[0]
assert filename == "bom-skill/SKILL.md"
# BOM/whitespace stripped; payload starts exactly with ``---``
assert content_bytes.startswith(b"---\n")


def test_extract_skill_name_from_md_happy_path():
Expand Down
Loading