diff --git a/pyproject.toml b/pyproject.toml index 0aea578..bc8f111 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,13 +42,23 @@ target-version = "py312" [tool.ruff.lint] # E/F: pycodestyle + pyflakes I: isort UP: pyupgrade B: flake8-bugbear # SIM: flake8-simplify LOG: flake8-logging RET: flake8-return PTH: flake8-use-pathlib -# BLE + TRY (exception hygiene) are added in Wave 2 alongside the bare-except cleanup. -select = ["E", "F", "I", "UP", "B", "SIM", "LOG", "RET", "PTH"] +# BLE: no-bare-except TRY: tryceratops (exception hygiene) +select = ["E", "F", "I", "UP", "B", "SIM", "LOG", "RET", "PTH", "BLE", "TRY"] ignore = [ "E501", # line-too-long (handled by formatter preference) "RET504", # unnecessary assignment before return (noisy) + "TRY003", # long messages in raise — our domain errors benefit from prose + "TRY301", # abstract raise into own block — stylistic, rarely actionable + "TRY300", # consider-else-block after try — stylistic, not buggy + "TRY004", # prefer TypeError for isinstance failures — our LLM-parse + # results are genuine *value* errors (malformed payload), not + # programmer type errors. ValueError is the right semantic. ] +# Tests may use broad excepts to assert "some error occurred" behavior. +[tool.ruff.lint.extend-per-file-ignores] +"tests/*.py" = ["BLE001"] + # Seed data modules are bulk content, not hand-maintained logic. [tool.ruff.lint.per-file-ignores] "skillforge/seeds/batch*.py" = ["E402", "E741", "F401", "F811", "SIM", "B"] diff --git a/skillforge/agents/_json.py b/skillforge/agents/_json.py new file mode 100644 index 0000000..417cc92 --- /dev/null +++ b/skillforge/agents/_json.py @@ -0,0 +1,99 @@ +"""Robust JSON-array extraction from LLM prose. + +Shared by agents that ask an LLM for a list of structured items and must +tolerate responses wrapped in prose, code fences, nested backticks, or +string values that contain ``[``/``]``. See ``docs/clean-code.md`` §1 +(reuse over duplication). +""" + +from __future__ import annotations + +import json +import re + +from skillforge.errors import ParseError + +_FENCE_RE = re.compile(r"```(?:json)?\s*\n?(.*)\n?```", re.DOTALL) + + +def extract_json_array(text: str) -> list[dict]: + """Return the outermost JSON array embedded in ``text``. + + Handles three response shapes: + 1. Raw JSON array — the entire response is a ``[...]``. + 2. Fenced block — response wrapped in ```` ```json ... ``` ```` fences. + Matched greedily so nested fences in string values don't split. + 3. Array embedded in prose — extracted via bracket-depth scanning + that respects JSON string literal state (``[``/``]`` inside + string values don't perturb the depth counter). + + Raises: + ParseError: no balanced JSON array could be located. 
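+
+    Example (illustrative, prose-embedded case):
+        >>> extract_json_array('Here you go: [{"id": 1}, {"id": 2}] and done')
+        [{'id': 1}, {'id': 2}]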
+ """ + candidate = text.strip() + + if candidate.startswith("[") and candidate.endswith("]"): + try: + parsed = json.loads(candidate) + except json.JSONDecodeError: + pass + else: + if isinstance(parsed, list): + return parsed + + fence_match = _FENCE_RE.search(text) + if fence_match: + fenced = fence_match.group(1).strip() + try: + parsed = json.loads(fenced) + except json.JSONDecodeError: + text_to_scan = fenced + else: + if isinstance(parsed, list): + return parsed + text_to_scan = fenced + else: + text_to_scan = text + + array_src = _scan_outermost_array(text_to_scan) + if array_src is not None: + try: + parsed = json.loads(array_src) + except json.JSONDecodeError: + pass + else: + if isinstance(parsed, list): + return parsed + + raise ParseError("no valid JSON array found in response text") + + +def _scan_outermost_array(text: str) -> str | None: + """Return the outermost balanced ``[...]`` substring, or ``None``.""" + start = text.find("[") + if start == -1: + return None + + depth = 0 + in_string = False + escape = False + for i in range(start, len(text)): + ch = text[i] + if escape: + escape = False + continue + if ch == "\\": + escape = True + continue + if ch == '"': + in_string = not in_string + continue + if in_string: + continue + if ch == "[": + depth += 1 + elif ch == "]": + depth -= 1 + if depth == 0: + return text[start : i + 1] + return None diff --git a/skillforge/agents/breeder.py b/skillforge/agents/breeder.py index 13c673d..e2d0aac 100644 --- a/skillforge/agents/breeder.py +++ b/skillforge/agents/breeder.py @@ -20,6 +20,7 @@ from __future__ import annotations import json +import logging import re from datetime import UTC, datetime @@ -35,6 +36,8 @@ ) from skillforge.models import Generation, SkillGenome +logger = logging.getLogger("skillforge.agents.breeder") + # --------------------------------------------------------------------------- # Slot allocation # --------------------------------------------------------------------------- @@ -150,9 +153,9 @@ async def breed( breeding_instructions=diagnostic_instructions, ) next_gen.extend(diagnostic_children[: slots["diagnostic"]]) - except Exception as exc: # noqa: BLE001 - # Fall through — wildcard slots below absorb the shortfall - print(f"breeder: diagnostic mutation failed: {exc}") + except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed + # Fall through — wildcard slots below absorb the shortfall. 
+ logger.exception("breeder.diagnostic_failed") # --- Reflective crossover: combine 2-3 Pareto-optimal parents --- pareto_parents = [s for s in ranked if s.is_pareto_optimal][:3] @@ -171,8 +174,8 @@ async def breed( breeding_instructions=crossover_instructions, ) next_gen.extend(crossover_children[: slots["crossover"]]) - except Exception as exc: # noqa: BLE001 - print(f"breeder: crossover failed: {exc}") + except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed + logger.exception("breeder.crossover_failed") # --- Wildcard: fresh Skills via spawn_gen0 --- if slots["wildcards"] > 0: @@ -182,14 +185,14 @@ async def breed( pop_size=slots["wildcards"], ) # Mark wildcards as mutations on the next generation - next_gen_num = (generation.number + 1) + next_gen_num = generation.number + 1 for w in wildcards: w.generation = next_gen_num w.mutations = ["wildcard"] w.mutation_rationale = "Wildcard slot: fresh spawn to prevent convergence" next_gen.extend(wildcards) - except Exception as exc: # noqa: BLE001 - print(f"breeder: wildcard spawn failed: {exc}") + except Exception: # noqa: BLE001 — subagent boundary: one slot failure must not kill the whole breed + logger.exception("breeder.wildcard_spawn_failed") # --- Trim or pad to exactly target_pop_size --- next_gen = next_gen[:target_pop_size] @@ -413,8 +416,12 @@ async def _extract_lessons(context: str, learning_log: list[str]) -> list[str]: max_tokens=500, messages=[{"role": "user", "content": prompt}], ) - except Exception as exc: # noqa: BLE001 - return [f"(lesson extraction failed: {exc})"] + except Exception: + # Degrade gracefully — a breeder that blocks on LLM hiccups would + # stall the whole run. The SDK has many concrete error types across + # versions; catching at the boundary keeps the engine moving. + logger.exception("breeder.lesson_extraction_failed") + return ["(lesson extraction failed)"] match = re.search(r"\[.*\]", text, re.DOTALL) if not match: @@ -452,8 +459,10 @@ async def _extract_breeding_report( max_tokens=800, messages=[{"role": "user", "content": prompt}], ) - except Exception as exc: # noqa: BLE001 - return f"(breeding report failed: {exc})" + except Exception: + # Degrade gracefully — see _extract_lessons for rationale. + logger.exception("breeder.report_extraction_failed") + return "(breeding report failed)" async def _extract_consolidated( @@ -486,8 +495,10 @@ async def _extract_consolidated( max_tokens=1200, messages=[{"role": "user", "content": prompt}], ) - except Exception as exc: # noqa: BLE001 - return ([f"(consolidated extraction failed: {exc})"], "") + except Exception: + # Degrade gracefully — see _extract_lessons for rationale. 
+ logger.exception("breeder.consolidated_extraction_failed") + return (["(consolidated extraction failed)"], "") match = re.search(r"\{.*\}", text, re.DOTALL) if not match: @@ -526,8 +537,8 @@ def publish_findings_to_bible( findings_dir = BIBLE_DIR / "findings" try: findings_dir.mkdir(parents=True, exist_ok=True) - except OSError as exc: - print(f"bible: failed to create findings dir: {exc}") + except OSError: + logger.exception("bible.findings_dir_mkdir_failed") return # Determine the next finding number by scanning existing files @@ -556,8 +567,8 @@ def publish_findings_to_bible( ) try: (findings_dir / filename).write_text(content) - except OSError as exc: - print(f"bible: failed to write finding {filename}: {exc}") + except OSError: + logger.exception("bible.finding_write_failed", extra={"filename": filename}) continue next_num += 1 @@ -570,8 +581,8 @@ def publish_findings_to_bible( existing = "# Evolution Log\n\n*Chronological log of all SkillForge evolution runs.*\n\n" entry_line = f"- **{timestamp}** — run `{run_id[:8]}` gen {generation}: {len(new_entries)} new finding(s)\n" log_path.write_text(existing + entry_line) - except OSError as exc: - print(f"bible: failed to update evolution log: {exc}") + except OSError: + logger.exception("bible.evolution_log_write_failed") def _slugify(text: str) -> str: diff --git a/skillforge/agents/challenge_designer.py b/skillforge/agents/challenge_designer.py index 5f46e56..45c3819 100644 --- a/skillforge/agents/challenge_designer.py +++ b/skillforge/agents/challenge_designer.py @@ -13,13 +13,13 @@ from __future__ import annotations -import json -import re import uuid from anthropic import AsyncAnthropic +from skillforge.agents._json import extract_json_array from skillforge.config import ANTHROPIC_API_KEY, model_for +from skillforge.errors import ParseError from skillforge.models import Challenge # JSON schema description embedded in prompts @@ -35,90 +35,6 @@ ]""" -def _extract_json_array(text: str) -> list[dict]: - """Extract a JSON array from text. - - Robust against: - 1. Raw JSON array (ideal case) - 2. ``` json ... ``` fences with nested backticks in string values - 3. JSON embedded in prose with `[`/`]` characters in string literals - - Raises: - ValueError: if no valid JSON array can be extracted. - """ - candidate = text.strip() - - # 1. Try the whole text as JSON - if candidate.startswith("[") and candidate.endswith("]"): - try: - result = json.loads(candidate) - if isinstance(result, list): - return result - except json.JSONDecodeError: - pass - - # 2. Strip outer ```json ... ``` fence greedily - fence_match = re.search(r"```(?:json)?\s*\n?(.*)\n?```", text, re.DOTALL) - if fence_match: - fenced = fence_match.group(1).strip() - try: - result = json.loads(fenced) - if isinstance(result, list): - return result - except json.JSONDecodeError: - text_to_scan = fenced - else: - text_to_scan = fenced - else: - text_to_scan = text - - # 3. Bracket-depth scan respecting string literal state - array = _scan_outermost_array(text_to_scan) - if array is not None: - try: - result = json.loads(array) - if isinstance(result, list): - return result - except json.JSONDecodeError: - pass - - raise ValueError("No valid JSON array found in response text") - - -def _scan_outermost_array(text: str) -> str | None: - """Find the outermost JSON array via bracket-depth scanning that - respects string literal state. Returns substring including ``[`` and - ``]``, or ``None`` if no balanced array found. 
- """ - start = text.find("[") - if start == -1: - return None - - depth = 0 - in_string = False - escape = False - for i in range(start, len(text)): - ch = text[i] - if escape: - escape = False - continue - if ch == "\\": - escape = True - continue - if ch == '"': - in_string = not in_string - continue - if in_string: - continue - if ch == "[": - depth += 1 - elif ch == "]": - depth -= 1 - if depth == 0: - return text[start : i + 1] - return None - - _FILE_CONVENTION = """\ ## File convention (STRICT — follow exactly) @@ -261,14 +177,14 @@ async def design_challenges(specialization: str, n: int = 3) -> list[Challenge]: text = await _generate(_build_system_prompt(specialization, n)) try: - raw = _extract_json_array(text) - except ValueError: + raw = extract_json_array(text) + except (ValueError, ParseError): # Attempt 2 — retry with more explicit prompt text = await _generate(_build_retry_prompt(specialization, n)) try: - raw = _extract_json_array(text) - except ValueError as err: - raise ValueError( + raw = extract_json_array(text) + except (ValueError, ParseError) as err: + raise ParseError( "challenge designer failed to produce valid JSON after 2 attempts" ) from err @@ -331,10 +247,10 @@ async def design_variant_challenge( prompt = _build_variant_system_prompt(specialization, dimension) text = await _generate(prompt) try: - raw = _extract_json_array(text) - except ValueError: + raw = extract_json_array(text) + except (ValueError, ParseError): text = await _generate(_build_retry_prompt(specialization, n=1)) - raw = _extract_json_array(text) + raw = extract_json_array(text) challenges = _parse_challenges(raw) if len(challenges) != 1: diff --git a/skillforge/agents/competitor_sdk.py b/skillforge/agents/competitor_sdk.py index b48b9f1..c672c66 100644 --- a/skillforge/agents/competitor_sdk.py +++ b/skillforge/agents/competitor_sdk.py @@ -19,7 +19,7 @@ from __future__ import annotations import asyncio -import sys +import logging from pathlib import Path from claude_agent_sdk import ClaudeAgentOptions, query @@ -28,6 +28,8 @@ from skillforge.engine.sandbox import collect_written_files from skillforge.models import Challenge, CompetitionResult, SkillGenome +logger = logging.getLogger("skillforge.agents.competitor_sdk") + def _message_to_dict(msg) -> dict: """Convert an SDK message to a JSON-safe dict for trace storage.""" @@ -123,8 +125,8 @@ async def run_competitor( trace=trace, judge_reasoning="timeout after 300s", ) - except Exception as exc: - print(f"sdk error in run_competitor: {exc}", file=sys.stderr) + except Exception as exc: # noqa: BLE001 — SDK boundary: degrade gracefully, never crash the run + logger.exception("sdk error in run_competitor") output_files = collect_written_files(sandbox_path / "output") return CompetitionResult( skill_id=skill.id, diff --git a/skillforge/agents/spawner.py b/skillforge/agents/spawner.py index 4921459..c25c036 100644 --- a/skillforge/agents/spawner.py +++ b/skillforge/agents/spawner.py @@ -15,14 +15,14 @@ from __future__ import annotations -import json -import re import uuid from anthropic import AsyncAnthropic +from skillforge.agents._json import extract_json_array from skillforge.config import ANTHROPIC_API_KEY, BIBLE_DIR, GOLDEN_TEMPLATE_DIR, model_for from skillforge.engine.sandbox import validate_skill_structure +from skillforge.errors import ParseError from skillforge.models import SkillGenome # JSON schema for spawner responses @@ -69,98 +69,6 @@ def _read_bible_patterns() -> str: return "\n\n---\n\n".join(parts) -def _extract_json_array(text: 
str) -> list[dict]: - """Extract a JSON array from text. - - Handles three cases robustly: - 1. Whole response is a raw JSON array - 2. Response is wrapped in ``` json ... ``` fences (greedy match of - the outermost fence — SKILL.md content can contain nested fences - that a non-greedy match would trip over) - 3. JSON array embedded in prose, extracted via bracket-depth scanning - that respects string literal state (handles `[` and `]` inside - string values like Python list comp examples) - - Raises: - ValueError: if no valid JSON array can be extracted. - """ - candidate = text.strip() - - # 1. Try the whole text as JSON (ideal case) - if candidate.startswith("[") and candidate.endswith("]"): - try: - result = json.loads(candidate) - if isinstance(result, list): - return result - except json.JSONDecodeError: - pass - - # 2. Strip outer ```json ... ``` fence greedily (matches LAST ```). - # Non-greedy would stop at the first nested ``` inside string values. - fence_match = re.search(r"```(?:json)?\s*\n?(.*)\n?```", text, re.DOTALL) - if fence_match: - fenced = fence_match.group(1).strip() - try: - result = json.loads(fenced) - if isinstance(result, list): - return result - except json.JSONDecodeError: - # Fall through to bracket scanning on the fenced content - text_to_scan = fenced - else: - text_to_scan = fenced - else: - text_to_scan = text - - # 3. Bracket-depth scan that respects JSON string literal state - array = _scan_outermost_array(text_to_scan) - if array is not None: - try: - result = json.loads(array) - if isinstance(result, list): - return result - except json.JSONDecodeError: - pass - - raise ValueError("No valid JSON array found in response text") - - -def _scan_outermost_array(text: str) -> str | None: - """Find the outermost JSON array substring via bracket-depth scanning. - - Properly tracks string literal state so brackets inside string values - don't throw off the depth counter. Returns the substring (including the - outer ``[`` and ``]``), or ``None`` if no balanced array is found. - """ - start = text.find("[") - if start == -1: - return None - - depth = 0 - in_string = False - escape = False - for i in range(start, len(text)): - ch = text[i] - if escape: - escape = False - continue - if ch == "\\": - escape = True - continue - if ch == '"': - in_string = not in_string - continue - if in_string: - continue - if ch == "[": - depth += 1 - elif ch == "]": - depth -= 1 - if depth == 0: - return text[start : i + 1] - return None - - def _extract_response_text(response) -> str: """Extract text from an Anthropic Messages API response. @@ -388,11 +296,11 @@ async def spawn_gen0(specialization: str, pop_size: int) -> list[SkillGenome]: _save_debug_response("spawn_gen0_attempt1", text) try: - raw = _extract_json_array(text) + raw = extract_json_array(text) genomes = _parse_genomes(raw, generation=0) valid_genomes, invalid = _validate_genomes(genomes) first_attempt_failed = False - except ValueError: + except (ValueError, ParseError): # JSON parse failure — treat as if everything was invalid so the # retry path runs. genomes = [] @@ -421,8 +329,8 @@ async def spawn_gen0(specialization: str, pop_size: int) -> list[SkillGenome]: _save_debug_response("spawn_gen0_attempt2", text) try: - raw2 = _extract_json_array(text) - except ValueError as exc: + raw2 = extract_json_array(text) + except (ValueError, ParseError) as exc: raise ValueError( f"spawner failed to produce valid JSON on retry: {exc}. " f"See /tmp/sf-spawn_gen0_attempt2.txt for the raw response." 
@@ -473,8 +381,8 @@ async def breed_next_gen( text = await _generate(system_prompt) try: - raw = _extract_json_array(text) - except ValueError as exc: + raw = extract_json_array(text) + except (ValueError, ParseError) as exc: raise ValueError( f"spawner breed_next_gen failed to produce valid JSON: {exc}" ) from exc @@ -506,8 +414,8 @@ async def breed_next_gen( text = await _generate(repair_prompt) try: - raw2 = _extract_json_array(text) - except ValueError as exc: + raw2 = extract_json_array(text) + except (ValueError, ParseError) as exc: raise ValueError( f"spawner breed_next_gen failed to produce valid JSON on retry: {exc}" ) from exc @@ -627,8 +535,8 @@ async def spawn_from_parent( text = await _generate(system_prompt) try: - raw = _extract_json_array(text) - except ValueError: + raw = extract_json_array(text) + except (ValueError, ParseError): # If the LLM refused or produced garbage, fall back to elite-only # (graceful degradation — evolution can still proceed with just the parent) return [elite] @@ -773,8 +681,8 @@ async def spawn_variant_gen0( _save_debug_response(f"spawn_variant_gen0_{dimension.get('name', 'unknown')}", text) try: - raw = _extract_json_array(text) - except ValueError: + raw = extract_json_array(text) + except (ValueError, ParseError): # One retry with a stricter formatting reminder retry_prompt = ( system_prompt @@ -783,7 +691,7 @@ async def spawn_variant_gen0( "markdown fences." ) text = await _generate(retry_prompt) - raw = _extract_json_array(text) + raw = extract_json_array(text) genomes = _parse_genomes(raw, generation=0) valid_genomes, invalid = _validate_genomes(genomes) diff --git a/skillforge/api/debug.py b/skillforge/api/debug.py index fea8b2a..a0543eb 100644 --- a/skillforge/api/debug.py +++ b/skillforge/api/debug.py @@ -467,17 +467,14 @@ async def debug_status(token: str = "") -> dict: if token != ADMIN_TOKEN: raise HTTPException(status_code=403, detail="invalid admin token") - from skillforge.api.routes import _active_runs from skillforge.db.queries import list_leaked_skills, list_runs + from skillforge.engine.run_registry import registry # Active runs - active = [] - for run_id, task in _active_runs.items(): - active.append({ - "run_id": run_id, - "done": task.done(), - "cancelled": task.cancelled(), - }) + active = [ + {"run_id": run_id, "done": task.done(), "cancelled": task.cancelled()} + for run_id, task in registry.iter_tasks() + ] # Recent failed runs all_runs = await list_runs(limit=20) diff --git a/skillforge/api/journal.py b/skillforge/api/journal.py index fae426c..b5f8af5 100644 --- a/skillforge/api/journal.py +++ b/skillforge/api/journal.py @@ -63,7 +63,9 @@ async def list_journal_entries() -> list[dict]: for path in sorted(JOURNAL_DIR.glob("*.md"), reverse=True): try: entries.append(_parse_entry(path)) - except Exception: + except (OSError, UnicodeDecodeError, ValueError): + # Skip malformed entries — the journal index should never + # crash because a single file is unreadable. 
continue return entries diff --git a/skillforge/api/llms.py b/skillforge/api/llms.py index 970dda6..39d10c0 100644 --- a/skillforge/api/llms.py +++ b/skillforge/api/llms.py @@ -12,6 +12,7 @@ from __future__ import annotations import json +import logging from pathlib import Path from fastapi import APIRouter, HTTPException @@ -20,6 +21,8 @@ from skillforge.config import ROOT_DIR from skillforge.db.queries import _connect +logger = logging.getLogger("skillforge.api.llms") + router = APIRouter(tags=["llms"]) SITE_URL = "https://skld.run" @@ -87,7 +90,7 @@ async def sitemap_xml() -> Response: for (run_id,) in await cursor.fetchall(): urls.append(f"{SITE_URL}/runs/{run_id}") urls.append(f"{SITE_URL}/runs/{run_id}.md") - except Exception: + except Exception: # noqa: BLE001 — llms.txt endpoint must serve partial content, never 500 pass body_lines = ['', @@ -283,7 +286,7 @@ async def bench_md() -> PlainTextResponse: try: data = json.loads(family_json.read_text(encoding="utf-8")) summary = data.get("description") or data.get("summary") or "" - except Exception: + except Exception: # noqa: BLE001 — llms.txt endpoint must serve partial content, never 500 pass lines.append(f"- [{slug}]({SITE_URL}/bench/{slug}.md)" + (f" — {summary}" if summary else "")) @@ -305,7 +308,7 @@ async def bench_family_md(slug: str) -> PlainTextResponse: try: data = json.loads(family_json.read_text(encoding="utf-8")) body += "\n\n## family.json\n\n```json\n" + json.dumps(data, indent=2) + "\n```\n" - except Exception: + except Exception: # noqa: BLE001 — llms.txt endpoint must serve partial content, never 500 pass return _md(body) @@ -330,7 +333,7 @@ async def registry_md() -> PlainTextResponse: "FROM evolution_runs ORDER BY created_at DESC LIMIT 200" ) rows = await cursor.fetchall() - except Exception: + except Exception: # noqa: BLE001 — llms.txt endpoint must serve partial content, never 500 rows = [] if not rows: lines.append("_No runs found._") @@ -348,7 +351,7 @@ async def run_md(run_id: str) -> PlainTextResponse: try: from skillforge.db.queries import get_run run = await get_run(run_id) - except Exception: + except Exception: # noqa: BLE001 — llms.txt endpoint must serve partial content, never 500 run = None if run is None: raise HTTPException(status_code=404, detail=f"run not found: {run_id}") @@ -371,7 +374,7 @@ async def run_md(run_id: str) -> PlainTextResponse: try: fit = max(best.pareto_objectives.values()) lines.append(f"- **Fitness:** {fit:.4f}") - except Exception: + except Exception: # noqa: BLE001 — llms.txt endpoint must serve partial content, never 500 pass lines.append("- **Pareto objectives:**") for k, v in sorted(best.pareto_objectives.items()): @@ -393,6 +396,6 @@ def _first_heading(path: Path) -> str | None: return s.lstrip("#").strip() if s.startswith("# "): return s.lstrip("#").strip() - except Exception: + except Exception: # noqa: BLE001 — llms.txt endpoint must serve partial content, never 500 return None return None diff --git a/skillforge/api/routes.py b/skillforge/api/routes.py index 7c7e2b2..386d89a 100644 --- a/skillforge/api/routes.py +++ b/skillforge/api/routes.py @@ -24,17 +24,15 @@ from skillforge.config import invite_code_valid from skillforge.db.database import init_db from skillforge.db.queries import get_lineage, get_run, list_runs, save_run -from skillforge.engine.evolution import PENDING_PARENTS, run_evolution +from skillforge.engine.evolution import run_evolution from skillforge.engine.export import export_agent_sdk_config, export_skill_md, export_skill_zip +from 
skillforge.engine.run_registry import registry from skillforge.models import EvolutionRun, SkillGenome logger = logging.getLogger("skillforge.api") router = APIRouter(prefix="/api") -# Module-level registry: run_id -> background task -_active_runs: dict[str, asyncio.Task] = {} - async def _classify_run_via_taxonomist( run: EvolutionRun, requested_mode: str | None @@ -193,13 +191,13 @@ async def start_evolution(req: EvolveRequest) -> EvolveResponse: # Spawn background task — store reference so it isn't GC'd task = asyncio.create_task(run_evolution(run)) - _active_runs[run.id] = task + registry.set_task(run.id, task) logger.info("run=%s started: spec=%s pop=%d gens=%d", run.id[:8], run.specialization[:60], run.population_size, run.num_generations) # Cleanup callback removes the task from the registry when it finishes def _cleanup(t: asyncio.Task) -> None: - _active_runs.pop(run.id, None) + registry.clear_task(run.id) exc = t.exception() if not t.cancelled() else None if exc: logger.error("run=%s task failed: %s", run.id[:8], exc) @@ -239,9 +237,10 @@ async def start_evolution_from_parent(req: EvolveFromParentRequest) -> EvolveRes - ``upload``: ``parent_id`` is an upload_id from POST /api/uploads/skill. Resolved via the in-memory upload cache. - The parent is stashed in the ``PENDING_PARENTS`` registry keyed by the new - run's id. The evolution engine picks it up at gen 0 spawn time and routes - through ``spawner.spawn_from_parent()`` instead of ``spawn_gen0()``. + The parent is stashed in the ``RunRegistry`` (see ``engine/run_registry.py``) + keyed by the new run's id. The evolution engine picks it up at gen 0 spawn + time and routes through ``spawner.spawn_from_parent()`` instead of + ``spawn_gen0()``. """ if not invite_code_valid(req.invite_code): raise HTTPException( @@ -316,17 +315,17 @@ async def start_evolution_from_parent(req: EvolveFromParentRequest) -> EvolveRes await save_run(run) # Stash the parent so the engine's gen-0 spawn picks it up - PENDING_PARENTS[run.id] = parent + registry.stash_parent(run.id, parent) # Clear the upload cache so we don't leak memory if req.parent_source == "upload": clear_upload(req.parent_id) task = asyncio.create_task(run_evolution(run)) - _active_runs[run.id] = task + registry.set_task(run.id, task) def _cleanup(t: asyncio.Task) -> None: - _active_runs.pop(run.id, None) + registry.clear_task(run.id) task.add_done_callback(_cleanup) @@ -337,12 +336,12 @@ def _cleanup(t: asyncio.Task) -> None: async def cancel_run(run_id: str) -> dict: """Cancel an in-progress evolution run. - Finds the backing asyncio task in ``_active_runs``, cancels it, and - marks the run status as ``cancelled`` in the DB. The engine catches + Finds the backing asyncio task in the ``RunRegistry``, cancels it, + and marks the run status as ``cancelled`` in the DB. The engine catches ``CancelledError`` in its main loop, emits a ``run_cancelled`` event, and persists the partial state before exiting. 
""" - task = _active_runs.get(run_id) + task = registry.get_task(run_id) if task is None or task.done(): # Maybe already done, maybe never existed — either way nothing to cancel raise HTTPException( diff --git a/skillforge/api/spec_assistant.py b/skillforge/api/spec_assistant.py index 70464e3..ed7a08d 100644 --- a/skillforge/api/spec_assistant.py +++ b/skillforge/api/spec_assistant.py @@ -379,8 +379,8 @@ async def generate_skill(req: GenerateSkillRequest) -> GenerateSkillResponse: traits=[], ) logger.info("auto-saved generated package as candidate seed: %s", name) - except Exception as e: - logger.warning("failed to auto-save candidate seed: %s", e) + except Exception: # noqa: BLE001 — auto-save is a bonus; never fail the response over it + logger.exception("failed to auto-save candidate seed") return GenerateSkillResponse( name=name, diff --git a/skillforge/api/uploads.py b/skillforge/api/uploads.py index 57e5bdf..53179e6 100644 --- a/skillforge/api/uploads.py +++ b/skillforge/api/uploads.py @@ -101,7 +101,7 @@ def _sniff_skill_md(zf: zipfile.ZipFile) -> tuple[str, dict[str, str]]: continue # silently skip disallowed extensions try: content = zf.read(info.filename).decode("utf-8", errors="replace") - except Exception: + except (OSError, zipfile.BadZipFile, RuntimeError): continue if rel == "SKILL.md": skill_md_body = content @@ -170,15 +170,16 @@ async def upload_skill(file: UploadFile = File(...)) -> dict: # noqa: B008 (Fa # Extract frontmatter for the preview response frontmatter_preview: dict = {} if skill_md.startswith("---"): - try: - import yaml + import yaml + try: _, fm_block, _ = skill_md.split("---", 2) parsed = yaml.safe_load(fm_block) or {} - if isinstance(parsed, dict): - frontmatter_preview = parsed - except Exception: - pass + except (ValueError, yaml.YAMLError): + # Malformed frontmatter — preview stays empty, upload still succeeds. 
+ parsed = None + if isinstance(parsed, dict): + frontmatter_preview = parsed _UPLOADS[upload_id] = _UploadRecord(skill=genome, filename=filename) diff --git a/skillforge/api/websocket.py b/skillforge/api/websocket.py index 2e9f2c7..a3d68da 100644 --- a/skillforge/api/websocket.py +++ b/skillforge/api/websocket.py @@ -47,7 +47,7 @@ async def evolution_events(websocket: WebSocket, run_id: str) -> None: except WebSocketDisconnect: logger.info("ws run=%s client disconnected", run_id[:8]) return - except Exception as exc: - logger.error("ws run=%s error: %s", run_id[:8], exc) + except Exception: # noqa: BLE001 — WS handler must always close cleanly, never propagate + logger.exception("ws run=%s error", run_id[:8]) with contextlib.suppress(Exception): await websocket.close(code=1011) diff --git a/skillforge/db/benchmark_seed_loader.py b/skillforge/db/benchmark_seed_loader.py index b97fd3b..bce11e5 100644 --- a/skillforge/db/benchmark_seed_loader.py +++ b/skillforge/db/benchmark_seed_loader.py @@ -59,8 +59,8 @@ async def load_benchmark_results() -> dict: try: records = json.loads(SEED_PATH.read_text(encoding="utf-8")) - except Exception as exc: # noqa: BLE001 - never block boot on a bad seed - logger.exception("Failed to parse benchmark seed: %s", exc) + except (OSError, json.JSONDecodeError): + logger.exception("Failed to parse benchmark seed") return {"loaded": 0, "skipped_reason": "parse-error"} if not isinstance(records, list) or not records: diff --git a/skillforge/engine/events.py b/skillforge/engine/events.py index a138d0f..9effdad 100644 --- a/skillforge/engine/events.py +++ b/skillforge/engine/events.py @@ -55,8 +55,8 @@ async def _persist_event(run_id: str, event_type: str, payload: dict[str, Any]) (run_id, event_type, _json.dumps(payload, default=str), payload.get("timestamp", "")), ) await conn.commit() - except Exception: - pass # best-effort, never block the engine + except Exception: # noqa: BLE001 — event persistence is best-effort; must never block the engine + logger.debug("event persistence failed (swallowed)", exc_info=True) async def emit(run_id: str, event: str, **kwargs: Any) -> None: diff --git a/skillforge/engine/evolution.py b/skillforge/engine/evolution.py index d63546b..fb255b6 100644 --- a/skillforge/engine/evolution.py +++ b/skillforge/engine/evolution.py @@ -36,15 +36,12 @@ from skillforge.db.database import init_db from skillforge.db.queries import save_run from skillforge.engine.events import emit +from skillforge.engine.run_registry import registry from skillforge.engine.sandbox import cleanup_sandbox, create_sandbox -from skillforge.models import EvolutionRun, Generation, SkillGenome +from skillforge.models import EvolutionRun, Generation logger = logging.getLogger("skillforge.engine") -# Module-level registry: run_id -> parent SkillGenome when the run was started -# via fork-and-evolve (seed or upload). Looked up at gen 0 spawn time. -PENDING_PARENTS: dict[str, SkillGenome] = {} - # --- Budget tracking --------------------------------------------------------- # MVP: estimate cost from trace length. Each SDK turn is ~$0.02 for Sonnet 4.6 # (very rough). 
Real tracking would read token counts from message metadata; @@ -231,8 +228,8 @@ async def run_evolution(run: EvolutionRun) -> EvolutionRun: try: run = await run_variant_evolution(run) - except Exception as exc: # noqa: BLE001 - logger.exception("run=%s atomic evolution failed: %s", run.id[:8], exc) + except Exception as exc: # noqa: BLE001 — top-level run boundary must catch all + logger.exception("run=%s atomic evolution failed", run.id[:8]) run.status = "failed" run.failure_reason = f"atomic evolution failed: {exc}" run.completed_at = datetime.now(UTC) @@ -288,8 +285,8 @@ async def _build_report() -> None: "managed_environment_ready", environment_id=env_id, ) - except Exception as exc: # noqa: BLE001 - logger.error("run=%s managed environment creation failed: %s", run.id[:8], exc) + except Exception as exc: # noqa: BLE001 — managed-env boundary: any SDK failure must be captured + logger.exception("run=%s managed environment creation failed", run.id[:8]) run.status = "failed" run.failure_reason = f"managed environment creation failed: {exc}" await emit(run.id, "run_failed", reason="env_create_failed") @@ -319,7 +316,7 @@ async def _build_report() -> None: # --- Spawn or breed --------------------------------------- if gen_num == 0: - seed_parent = PENDING_PARENTS.pop(run.id, None) + seed_parent = registry.take_parent(run.id) if seed_parent is not None: skills = await spawn_from_parent(seed_parent, run.population_size) else: @@ -478,8 +475,8 @@ async def _build_report() -> None: source_skill_id=best.id, ) logger.info("run=%s auto-saved best skill as candidate seed", run.id[:8]) - except Exception as e: - logger.warning("run=%s failed to save candidate seed: %s", run.id[:8], e) + except Exception: # noqa: BLE001 — auto-save is a bonus; never fail the run over it + logger.exception("run=%s failed to save candidate seed", run.id[:8]) await emit( run.id, @@ -558,10 +555,8 @@ async def _persist(run: EvolutionRun) -> None: """ try: await save_run(run) - except Exception as exc: # noqa: BLE001 - # DB persistence failures are non-fatal during a run — the Progress - # Tracker event stream is the primary truth; DB is a durable backup. 
- logger.error("run=%s DB persistence failed: %s", run.id[:8], exc) + except Exception: # noqa: BLE001 — DB is a durable backup; event stream is the primary truth + logger.exception("run=%s DB persistence failed", run.id[:8]) def dump_run_json(run: EvolutionRun) -> Path | None: @@ -582,7 +577,7 @@ def dump_run_json(run: EvolutionRun) -> Path | None: RUN_DUMPS_DIR.mkdir(parents=True, exist_ok=True) path = RUN_DUMPS_DIR / f"{run.id}.json" path.write_text(json.dumps(run.to_dict(), indent=2, default=str)) - return path - except Exception as exc: # noqa: BLE001 - logger.error("run=%s JSON dump failed: %s", run.id[:8], exc) + except (OSError, TypeError): + logger.exception("run=%s JSON dump failed", run.id[:8]) return None + return path diff --git a/skillforge/engine/report.py b/skillforge/engine/report.py index 3a17c36..5a93392 100644 --- a/skillforge/engine/report.py +++ b/skillforge/engine/report.py @@ -206,15 +206,15 @@ async def _build_atomic_genomes_section( (run.id,), ) as cur: rows = await cur.fetchall() - except Exception as exc: # pragma: no cover - defensive - logger.warning("report: failed to fetch atomic genomes: %s", exc) + except Exception: # noqa: BLE001 — defensive: report rendering must not fail on a bad row + logger.exception("report: failed to fetch atomic genomes") return [] out: list[dict[str, Any]] = [] for row in rows: try: g = _row_to_genome(row) - except Exception: + except Exception: # noqa: BLE001 — skip corrupt rows; never crash the whole report continue out.append( { @@ -250,8 +250,8 @@ async def _build_variant_evolutions_section( """ try: evolutions = await get_variant_evolutions_for_run(run.id) - except Exception as exc: # pragma: no cover - defensive - logger.warning("report: failed to fetch variant evolutions: %s", exc) + except Exception: # noqa: BLE001 — defensive: report rendering must not fail on a bad row + logger.exception("report: failed to fetch variant evolutions") return [] out: list[dict[str, Any]] = [] @@ -296,8 +296,8 @@ async def _build_assembly_report(run: EvolutionRun) -> dict[str, Any] | None: try: variants = await get_variants_for_family(family_id) active_variants = [v.to_dict() for v in variants if v.is_active] - except Exception as exc: # pragma: no cover - defensive - logger.warning("report: failed to fetch active variants: %s", exc) + except Exception: # noqa: BLE001 — defensive: report rendering degrades gracefully + logger.exception("report: failed to fetch active variants") return { "family_id": family.id, @@ -474,8 +474,8 @@ async def generate_run_report( """ try: run = await get_run(run_id) - except Exception as exc: # pragma: no cover - defensive - logger.error("report: failed to load run %s: %s", run_id, exc) + except Exception: # noqa: BLE001 — report generation must degrade, never crash + logger.exception("report: failed to load run %s", run_id) return None if run is None: @@ -510,8 +510,8 @@ async def generate_run_report( ), "generated_at": datetime.now(UTC).isoformat(), } - except Exception as exc: # noqa: BLE001 - logger.exception("report: failed to build report for %s: %s", run_id, exc) + except Exception: # noqa: BLE001 — report generation must degrade, never crash + logger.exception("report: failed to build report for %s", run_id) return None target_dir = reports_dir if reports_dir is not None else REPORTS_DIR @@ -535,8 +535,8 @@ async def generate_run_report( logger.info( "report: generated %s (%d bytes)", run_id, size_bytes ) - except Exception as exc: # noqa: BLE001 - logger.exception("report: failed to write report for 
%s: %s", run_id, exc) + except (OSError, TypeError): + logger.exception("report: failed to write report for %s", run_id) return None return report diff --git a/skillforge/engine/run_registry.py b/skillforge/engine/run_registry.py new file mode 100644 index 0000000..dadb042 --- /dev/null +++ b/skillforge/engine/run_registry.py @@ -0,0 +1,71 @@ +"""In-process registry of active evolution runs. + +Centralizes two pieces of runtime state the FastAPI process tracks for +every currently-running evolution: + +- ``pending_parents`` — a ``SkillGenome`` stashed at run-start when the + run was forked from a seed or an uploaded skill. Gen 0 spawn pops this + to seed the initial population instead of spawning fresh. + +- ``tasks`` — the backing ``asyncio.Task`` for each evolution coroutine, + so ``DELETE /api/runs/{id}`` can cancel in-flight work. + +Before this module existed, the same state lived as two mutable module +globals split across ``engine/evolution.py`` and ``api/routes.py``. That +made the state invisible to tests, impossible to reset, and fragile +under concurrent requests. A single registry with explicit accessors +replaces both. See ``docs/clean-code.md`` §5. +""" + +from __future__ import annotations + +import asyncio +from collections.abc import Iterable +from dataclasses import dataclass, field +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from skillforge.models import SkillGenome + + +@dataclass(slots=True) +class RunRegistry: + """Holds pending-parent and active-task state for evolution runs.""" + + _pending_parents: dict[str, SkillGenome] = field(default_factory=dict) + _tasks: dict[str, asyncio.Task] = field(default_factory=dict) + + # --- pending parents (fork-and-evolve) --------------------------------- + + def stash_parent(self, run_id: str, parent: SkillGenome) -> None: + """Record the parent genome a run should fork from.""" + self._pending_parents[run_id] = parent + + def take_parent(self, run_id: str) -> SkillGenome | None: + """Pop the parent genome for ``run_id`` (one-shot).""" + return self._pending_parents.pop(run_id, None) + + # --- active evolution tasks -------------------------------------------- + + def set_task(self, run_id: str, task: asyncio.Task) -> None: + self._tasks[run_id] = task + + def clear_task(self, run_id: str) -> None: + self._tasks.pop(run_id, None) + + def get_task(self, run_id: str) -> asyncio.Task | None: + return self._tasks.get(run_id) + + def active_count(self) -> int: + return len(self._tasks) + + def iter_tasks(self) -> Iterable[tuple[str, asyncio.Task]]: + # Snapshot via list() so callers can cancel tasks without mutating + # the dict during iteration. + return list(self._tasks.items()) + + +# Module-level singleton. Tests that need isolation can construct their +# own ``RunRegistry()`` and inject it explicitly; production code uses +# this shared instance. 
+registry = RunRegistry() diff --git a/skillforge/engine/variant_evolution.py b/skillforge/engine/variant_evolution.py index d90f283..4972e41 100644 --- a/skillforge/engine/variant_evolution.py +++ b/skillforge/engine/variant_evolution.py @@ -533,8 +533,8 @@ async def run_variant_evolution(run: EvolutionRun) -> EvolutionRun: ) logger.info("run=%s managed environment ready: %s", run.id[:8], env_id) await emit(run.id, "managed_environment_ready", environment_id=env_id) - except Exception as exc: # noqa: BLE001 - logger.error("run=%s managed environment creation failed: %s", run.id[:8], exc) + except Exception as exc: # noqa: BLE001 — managed-env boundary: any SDK failure must be captured + logger.exception("run=%s managed environment creation failed", run.id[:8]) run.status = "failed" run.failure_reason = f"managed environment creation failed: {exc}" await save_run(run) @@ -572,12 +572,11 @@ async def run_variant_evolution(run: EvolutionRun) -> EvolutionRun: foundation_winner=foundation_winner, env_id=env_id, ) - except Exception as exc: # noqa: BLE001 + except Exception as exc: # noqa: BLE001 — one bad dimension must not crash the whole atomic run logger.exception( - "run=%s dimension %s mini-evolution failed: %s", + "run=%s dimension %s mini-evolution failed", run.id[:8], vevo.dimension, - exc, ) vevo.status = "failed" await save_variant_evolution(vevo) diff --git a/skillforge/errors.py b/skillforge/errors.py new file mode 100644 index 0000000..d343dc3 --- /dev/null +++ b/skillforge/errors.py @@ -0,0 +1,90 @@ +"""Typed exception hierarchy for the skillforge package. + +Every domain error inherits from ``SkldError`` so boundary handlers can +catch the whole family with one ``except`` clause while inner code can +still raise (and catch) precise types. See ``docs/clean-code.md`` §4. +""" + +from __future__ import annotations + + +class SkldError(Exception): + """Base class for every skillforge domain error.""" + + +# --- Agent pipeline --------------------------------------------------------- + + +class AgentError(SkldError): + """A failure inside an agent role (spawner, breeder, etc.).""" + + +class SpawnError(AgentError): + """Spawner could not produce a valid initial variant population.""" + + +class BreedError(AgentError): + """Breeder could not evolve a next generation.""" + + +class JudgeError(AgentError): + """A judging layer (L1-L5) failed to produce a verdict.""" + + +class EngineerError(AgentError): + """Engineer failed to assemble or refine a composite skill.""" + + +class TaxonomistError(AgentError): + """Taxonomist failed to classify a specialization.""" + + +class ChallengeDesignError(AgentError): + """Challenge designer could not produce a valid challenge.""" + + +# --- External integrations -------------------------------------------------- + + +class AgentSDKError(SkldError): + """The Anthropic / claude-agent SDK returned or raised unexpectedly. + + Distinct from ``AgentError`` — ``AgentSDKError`` is a *transport* + failure (bad creds, rate limit, network), ``AgentError`` is a + *semantic* failure (agent produced garbage output). + """ + + +class ManagedEnvironmentError(SkldError): + """A hosted Managed-Agents environment could not be created, reused, or torn down.""" + + +# --- Data + parsing --------------------------------------------------------- + + +class ParseError(SkldError, ValueError): + """Could not parse a structured response (JSON, YAML, frontmatter, etc.). 
+ + Inherits from ``ValueError`` so ``except ValueError`` continues to match + parse failures in legacy call sites; new code should catch ``ParseError`` + to distinguish parse failures from other value errors. + """ + + +class ValidationError(SkldError, ValueError): + """A parsed value violated a domain invariant.""" + + +# --- Persistence ------------------------------------------------------------ + + +class DBError(SkldError): + """Database operation failed in an unexpected way. + + Catching this is always optional — most call sites should let it + propagate so FastAPI's error middleware returns 500 with context. + """ + + +class StorageError(SkldError): + """Filesystem / blob-storage operation failed (export, upload, etc.).""" diff --git a/skillforge/main.py b/skillforge/main.py index f05afbd..f5bce61 100644 --- a/skillforge/main.py +++ b/skillforge/main.py @@ -88,14 +88,14 @@ async def lifespan(app: FastAPI): await load_seeds() try: await load_mock_runs() - except Exception as exc: # pragma: no cover - fail-soft on seed run load - logger.exception("Seed run loader failed: %s", exc) + except Exception: # noqa: BLE001 — boot-time: never block startup on seed load + logger.exception("Seed run loader failed") try: diag = await load_benchmark_results() if diag.get("loaded"): logger.info("Benchmark seed loaded: %d rows", diag["loaded"]) - except Exception as exc: # pragma: no cover - fail-soft on benchmark seed load - logger.exception("Benchmark seed loader failed: %s", exc) + except Exception: # noqa: BLE001 — boot-time: never block startup on seed load + logger.exception("Benchmark seed loader failed") try: taxonomy_diag = await load_taxonomy() logger.info( @@ -104,8 +104,8 @@ async def lifespan(app: FastAPI): taxonomy_diag.get("families_created", 0), taxonomy_diag.get("families_reused", 0), ) - except Exception as exc: # pragma: no cover - boot-time resiliency - logger.exception("Taxonomy bootstrap failed: %s", exc) + except Exception: # noqa: BLE001 — boot-time: never block startup on bootstrap + logger.exception("Taxonomy bootstrap failed") zombie_count = await mark_zombie_runs() if zombie_count: logger.warning("Marked %d zombie run(s) as failed on startup", zombie_count) @@ -139,11 +139,11 @@ async def lifespan(app: FastAPI): @app.get("/api/health") async def health() -> dict: """Backend health check with active run count.""" - from skillforge.api.routes import _active_runs + from skillforge.engine.run_registry import registry return { "status": "ok", "service": "skillforge", - "active_runs": len(_active_runs), + "active_runs": registry.active_count(), } @@ -395,8 +395,8 @@ def _inject_noscript_content(raw_html: str, full_path: str) -> str: elif base == "bench": content_lines.append("
Controlled evaluation benchmark across 7 Elixir skill families with 6-layer composite scoring (L0 string match 10%, compilation 15%, AST quality 15%, behavioral tests 40%, template quality 10%, brevity 10%). Families: phoenix-liveview, ecto-query-writer, ecto-sandbox-test, ecto-schema-changeset, oban-worker, pattern-match-refactor, security-linter.
") - except Exception: - pass + except Exception: # noqa: BLE001 — SEO augmentation must never break page render + logger.debug("meta_injection: failed to enrich SEO tags", exc_info=True) if not content_lines: return raw_html diff --git a/skillforge/seeds/mock_run_loader.py b/skillforge/seeds/mock_run_loader.py index 4d07954..8cdf05a 100644 --- a/skillforge/seeds/mock_run_loader.py +++ b/skillforge/seeds/mock_run_loader.py @@ -501,5 +501,5 @@ async def load_mock_runs() -> None: for path in files: try: await _load_one(path) - except Exception as e: # noqa: BLE001 - fail soft, never break boot - logger.exception("seed_run_loader: failed to load %s: %s", path.name, e) + except Exception: # noqa: BLE001 - fail soft, never break boot + logger.exception("seed_run_loader: failed to load %s", path.name) diff --git a/tests/test_seeds.py b/tests/test_seeds.py index ad62d15..151eb5e 100644 --- a/tests/test_seeds.py +++ b/tests/test_seeds.py @@ -28,7 +28,7 @@ from fastapi.testclient import TestClient from skillforge.db import seed_loader -from skillforge.engine.evolution import PENDING_PARENTS +from skillforge.engine.run_registry import registry as _run_registry from skillforge.main import app from skillforge.models import EvolutionRun, Generation, SkillGenome @@ -44,10 +44,10 @@ def client() -> TestClient: @pytest.fixture(autouse=True) def _clear_pending_parents(): - """PENDING_PARENTS is module-level state — wipe between tests.""" - PENDING_PARENTS.clear() + """RunRegistry is process-level state — wipe between tests.""" + _run_registry._pending_parents.clear() yield - PENDING_PARENTS.clear() + _run_registry._pending_parents.clear() def _make_seed_skill(skill_id: str = "seed-foo", description: str = "Use when foo. NOT for bar.") -> SkillGenome: @@ -431,8 +431,11 @@ def test_evolve_from_parent_registry_happy_path(client): assert "run_id" in payload assert payload["ws_url"] == f"/ws/evolve/{payload['run_id']}" # Parent was stashed for the engine to pick up - assert payload["run_id"] in PENDING_PARENTS - assert PENDING_PARENTS[payload["run_id"]].id == "seed-registry-1" + parent = _run_registry.take_parent(payload["run_id"]) + assert parent is not None + assert parent.id == "seed-registry-1" + # Restore so the autouse fixture's post-yield clear() stays idempotent + _run_registry.stash_parent(payload["run_id"], parent) # --------------------------------------------------------------------------- diff --git a/tests/test_uploads.py b/tests/test_uploads.py index a1bfab2..9e41673 100644 --- a/tests/test_uploads.py +++ b/tests/test_uploads.py @@ -33,7 +33,7 @@ from fastapi.testclient import TestClient from skillforge.api import uploads as uploads_module -from skillforge.engine.evolution import PENDING_PARENTS +from skillforge.engine.run_registry import registry as _run_registry from skillforge.main import app # --------------------------------------------------------------------------- @@ -48,12 +48,12 @@ def client() -> TestClient: @pytest.fixture(autouse=True) def _wipe_upload_state(): - """Per-test cleanup: in-memory upload cache + PENDING_PARENTS registry.""" + """Per-test cleanup: in-memory upload cache + RunRegistry pending parents.""" uploads_module._UPLOADS.clear() - PENDING_PARENTS.clear() + _run_registry._pending_parents.clear() yield uploads_module._UPLOADS.clear() - PENDING_PARENTS.clear() + _run_registry._pending_parents.clear() # A valid SKILL.md that passes validate_skill_structure: @@ -519,8 +519,9 @@ def test_upload_then_fork_round_trip(client): assert fork_resp.status_code == 200, 
fork_resp.text payload = fork_resp.json() new_run_id = payload["run_id"] - assert new_run_id in PENDING_PARENTS - parent = PENDING_PARENTS[new_run_id] + parent = _run_registry.take_parent(new_run_id) + assert parent is not None assert parent.skill_md_content == VALID_SKILL_MD + _run_registry.stash_parent(new_run_id, parent) # The upload should be cleared from the cache after the fork starts assert uploads_module.get_upload(upload_id) is None