diff --git a/eval/score.py b/eval/score.py
index 40c43a7..c5683d2 100644
--- a/eval/score.py
+++ b/eval/score.py
@@ -1,393 +1,233 @@
 #!/usr/bin/env python3
-"""Eval script for the Remote Factory.
+"""Auto-generated eval script for the Software Factory.
 
-Runs 6 project-specific (hygiene) evaluation dimensions and outputs JSON
-to stdout. The factory's eval runner injects universal growth dimensions
-on top of these, so this script only needs to cover project health.
+This script was generated by `factory discover`. It runs each eval dimension
+as a subprocess and outputs JSON to stdout.
 
 Output format:
     {"results": [{"name": str, "score": float, "weight": float, "passed": bool, "details": str}, ...]}
 
-Each dimension parses real metrics from tool output rather than using
-binary exit-code checks.
+You can edit this file to add custom evals or adjust weights.
+Once edited, it becomes a Tier 1 (explicit) eval — the factory will use it as-is.
 """
 
-import asyncio
 import json
-import os
-import re
 import subprocess
 import sys
 
-# Ensure the project root is on sys.path so factory.* imports work.
-PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-if PROJECT_ROOT not in sys.path:
-    sys.path.insert(0, PROJECT_ROOT)
-
-
-# ── Dimension 1: tests (weight 0.30) ─────────────────────────────
-
-
 def eval_tests() -> dict:
-    """Run test suite and parse pass/fail counts."""
+    """Run test suite: uv run pytest -v"""
     try:
         result = subprocess.run(
-            ["uv", "run", "pytest", "-v"],
+            ['uv', 'run', 'pytest', '-v'],
             capture_output=True,
             text=True,
             timeout=120,
-            cwd=PROJECT_ROOT,
         )
-        output = result.stdout + result.stderr
-
-        # Look for "X passed" and optionally "Y failed"
-        passed_match = re.search(r"(\d+)\s+passed", output)
-        failed_match = re.search(r"(\d+)\s+failed", output)
-
-        passed_count = int(passed_match.group(1)) if passed_match else 0
-        failed_count = int(failed_match.group(1)) if failed_match else 0
-        total = passed_count + failed_count
-
-        if total == 0:
-            score = 0.0
-            ok = False
-            details = "No test results found in output"
+        passed = result.returncode == 0
+        if passed:
+            score = 1.0
         else:
-            score = passed_count / total
-            ok = failed_count == 0
-            details = f"{passed_count} passed, {failed_count} failed"
-
+            # Partial score: count output lines as a rough error metric
+            error_lines = [ln for ln in (result.stdout + result.stderr).splitlines() if ln.strip()]
+            if not error_lines:
+                score = 0.0
+            else:
+                score = max(0.0, 1.0 - len(error_lines) * 0.05)
         return {
-            "name": "tests",
-            "score": round(score, 4),
-            "weight": 0.30,
-            "passed": ok,
-            "details": details,
+            "name": 'tests',
+            "score": score,
+            "weight": 0.41666666666666663,
+            "passed": passed,
+            "details": (result.stdout or result.stderr).strip()[-500:],
         }
     except subprocess.TimeoutExpired:
         return {
-            "name": "tests",
+            "name": 'tests',
             "score": 0.0,
-            "weight": 0.30,
+            "weight": 0.41666666666666663,
             "passed": False,
             "details": "Timed out after 120s",
         }
-    except Exception as exc:
-        return {
-            "name": "tests",
-            "score": 0.0,
-            "weight": 0.30,
-            "passed": False,
-            "details": f"Error: {exc}",
-        }
-
-
-# ── Dimension 2: lint (weight 0.15) ──────────────────────────────
-
 
 def eval_lint() -> dict:
-    """Run ruff and parse error count."""
+    """Run linter: uv run ruff check ."""
     try:
         result = subprocess.run(
-            ["uv", "run", "ruff", "check", "."],
+            ['uv', 'run', 'ruff', 'check', '.'],
             capture_output=True,
             text=True,
             timeout=120,
-            cwd=PROJECT_ROOT,
         )
-        if result.returncode == 0:
-            return {
-                "name": "lint",
-                "score": 1.0,
-                "weight": 0.15,
-                "passed": True,
-                "details": "No lint errors",
-            }
-
-        # Parse "Found X error(s)"
-        output = result.stdout + result.stderr
-        error_match = re.search(r"Found\s+(\d+)\s+error", output)
-        if error_match:
-            error_count = int(error_match.group(1))
-            # Partial credit: lose 0.1 per error, floor at 0
-            score = max(0.0, 1.0 - error_count * 0.1)
-            details = f"Found {error_count} lint error(s)"
+        passed = result.returncode == 0
+        if passed:
+            score = 1.0
         else:
-            score = 0.0
-            details = output.strip()[-500:]
-
+            # Partial score: count output lines as a rough error metric
+            error_lines = [ln for ln in (result.stdout + result.stderr).splitlines() if ln.strip()]
+            if not error_lines:
+                score = 0.0
+            else:
+                score = max(0.0, 1.0 - len(error_lines) * 0.05)
         return {
-            "name": "lint",
-            "score": round(score, 4),
-            "weight": 0.15,
-            "passed": False,
-            "details": details,
+            "name": 'lint',
+            "score": score,
+            "weight": 0.24999999999999994,
+            "passed": passed,
+            "details": (result.stdout or result.stderr).strip()[-500:],
         }
     except subprocess.TimeoutExpired:
         return {
-            "name": "lint",
+            "name": 'lint',
             "score": 0.0,
-            "weight": 0.15,
+            "weight": 0.24999999999999994,
             "passed": False,
             "details": "Timed out after 120s",
         }
-    except Exception as exc:
-        return {
-            "name": "lint",
-            "score": 0.0,
-            "weight": 0.15,
-            "passed": False,
-            "details": f"Error: {exc}",
-        }
-
-
-# ── Dimension 3: type_check (weight 0.10) ────────────────────────
-
 
 def eval_type_check() -> dict:
-    """Run mypy and parse error count."""
+    """Run type checker: uv run mypy factory/"""
     try:
         result = subprocess.run(
-            ["uv", "run", "mypy", "factory/"],
+            ['uv', 'run', 'mypy', 'factory/'],
             capture_output=True,
             text=True,
             timeout=120,
-            cwd=PROJECT_ROOT,
         )
-        if result.returncode == 0:
-            return {
-                "name": "type_check",
-                "score": 1.0,
-                "weight": 0.10,
-                "passed": True,
-                "details": "No type errors",
-            }
-
-        output = result.stdout + result.stderr
-        # mypy prints "Found X error(s)" at the end
-        error_match = re.search(r"Found\s+(\d+)\s+error", output)
-        if error_match:
-            error_count = int(error_match.group(1))
-            score = max(0.0, 1.0 - error_count * 0.05)
-            details = f"Found {error_count} type error(s)"
+        passed = result.returncode == 0
+        if passed:
+            score = 1.0
         else:
-            score = 0.0
-            details = output.strip()[-500:]
-
+            # Partial score: count output lines as a rough error metric
+            error_lines = [ln for ln in (result.stdout + result.stderr).splitlines() if ln.strip()]
+            if not error_lines:
+                score = 0.0
+            else:
+                score = max(0.0, 1.0 - len(error_lines) * 0.05)
         return {
-            "name": "type_check",
-            "score": round(score, 4),
-            "weight": 0.10,
-            "passed": False,
-            "details": details,
+            "name": 'type_check',
+            "score": score,
+            "weight": 0.12499999999999997,
+            "passed": passed,
+            "details": (result.stdout or result.stderr).strip()[-500:],
         }
     except subprocess.TimeoutExpired:
         return {
-            "name": "type_check",
+            "name": 'type_check',
             "score": 0.0,
-            "weight": 0.10,
+            "weight": 0.12499999999999997,
             "passed": False,
             "details": "Timed out after 120s",
         }
-    except Exception as exc:
-        return {
-            "name": "type_check",
-            "score": 0.0,
-            "weight": 0.10,
-            "passed": False,
-            "details": f"Error: {exc}",
-        }
-
-
-# ── Dimension 4: coverage (weight 0.25) ──────────────────────────
-
 
 def eval_coverage() -> dict:
-    """Run pytest with coverage and parse the TOTAL percentage."""
+    """Measure test coverage"""
+    import re as _re
     try:
         result = subprocess.run(
-            ["uv", "run", "pytest", "--cov=factory", "--cov-report=term", "-q"],
+            ['uv', 'run', 'python', '-m', 'pytest', '--cov=factory', '--cov-report=term', '-q'],
             capture_output=True,
             text=True,
-            timeout=120,
-            cwd=PROJECT_ROOT,
+            timeout=180,
         )
-        output = result.stdout + result.stderr
-
-        # Parse TOTAL line: "TOTAL    123    30    75%"
-        total_match = re.search(r"TOTAL\s+\d+\s+\d+\s+(\d+)%", output)
-        if total_match:
-            percentage = int(total_match.group(1))
-            score = percentage / 100.0
-            ok = percentage >= 80
-            details = f"Coverage: {percentage}% (threshold: 80%)"
-        else:
-            score = 0.0
-            ok = False
-            details = "Could not parse coverage from output"
-
-        return {
-            "name": "coverage",
-            "score": round(score, 4),
-            "weight": 0.25,
-            "passed": ok,
-            "details": details,
+        output = (result.stdout + result.stderr).strip()
+        pct = 0.0
+        m = _re.search(r'TOTAL\s+\d+\s+\d+\s+(\d+)%', output)
+        if m:
+            pct = int(m.group(1))
+        score = pct / 100.0
+        passed = score >= 0.70
+        return {
+            "name": 'coverage',
+            "score": round(score, 3),
+            "weight": 0.12499999999999997,
+            "passed": passed,
+            "details": output[-500:],
         }
     except subprocess.TimeoutExpired:
         return {
-            "name": "coverage",
-            "score": 0.0,
-            "weight": 0.25,
-            "passed": False,
-            "details": "Timed out after 120s",
-        }
-    except Exception as exc:
-        return {
-            "name": "coverage",
+            "name": 'coverage',
             "score": 0.0,
-            "weight": 0.25,
+            "weight": 0.12499999999999997,
             "passed": False,
-            "details": f"Error: {exc}",
-        }
-
-
-# ── Dimension 5: guard_patterns (weight 0.10) ────────────────────
-
-
-def eval_guard_patterns() -> dict:
-    """Test that the guard system's glob matching works correctly."""
-    try:
-        from factory.eval.guards import _glob_match
-    except (ImportError, AttributeError) as exc:
-        return {
-            "name": "guard_patterns",
-            "score": 0.0,
-            "weight": 0.10,
-            "passed": False,
-            "details": f"Could not import _glob_match: {exc}",
-        }
-
-    try:
-        test_cases: list[tuple[str, str, bool]] = [
-            ("factory/**/*.py", "factory/eval/runner.py", True),
-            ("factory/**/*.py", "tests/test_guards.py", False),
-            ("tests/**/*.py", "tests/test_guards.py", True),
-            ("templates/**", "templates/factory_config.md", True),
-        ]
-
-        correct = 0
-        results_detail: list[str] = []
-        for pattern, filepath, expected in test_cases:
-            actual = _glob_match(filepath, pattern)
-            if actual == expected:
-                correct += 1
-                results_detail.append(f"OK: {pattern} vs {filepath}")
-            else:
-                results_detail.append(
-                    f"FAIL: {pattern} vs {filepath} — "
-                    f"expected {expected}, got {actual}"
-                )
-
-        total = len(test_cases)
-        score = correct / total
-        ok = correct == total
-
-        return {
-            "name": "guard_patterns",
-            "score": round(score, 4),
-            "weight": 0.10,
-            "passed": ok,
-            "details": "; ".join(results_detail),
-        }
-    except Exception as exc:
-        return {
-            "name": "guard_patterns",
-            "score": 0.0,
-            "weight": 0.10,
-            "passed": False,
-            "details": f"Error running guard pattern tests: {exc}",
-        }
-
-
-# ── Dimension 6: config_parser (weight 0.10) ─────────────────────
-
-
-def eval_config_parser() -> dict:
-    """Test that the factory.md parser extracts fields correctly."""
-    try:
-        from factory.store import ExperimentStore
-    except ImportError as exc:
-        return {
-            "name": "config_parser",
-            "score": 0.0,
-            "weight": 0.10,
-            "passed": False,
-            "details": f"Could not import ExperimentStore: {exc}",
-        }
-
-    try:
-        from pathlib import Path
-
-        store = ExperimentStore(Path(PROJECT_ROOT))
-        config = asyncio.run(store.reparse_config())
-
-        checks: list[tuple[str, bool]] = []
-
-        # goal should be non-empty
-        checks.append(("goal is non-empty", bool(config.goal and len(config.goal) > 0)))
-
-        # scope should contain expected patterns
-        checks.append((
-            "scope contains factory/**/*.py",
-            "factory/**/*.py" in config.scope,
-        ))
-
-        # eval_command should reference eval/score.py
-        checks.append((
-            "eval_command references score.py",
-            "eval/score.py" in config.eval_command,
-        ))
-
-        # eval_threshold should be 0.8
-        checks.append(("eval_threshold is 0.8", config.eval_threshold == 0.8))
-
-        correct = sum(1 for _, ok in checks if ok)
-        total = len(checks)
-        score = correct / total
-        ok = correct == total
-
-        details_parts = [
-            f"{'OK' if passed else 'FAIL'}: {label}"
-            for label, passed in checks
-        ]
-
-        return {
-            "name": "config_parser",
-            "score": round(score, 4),
-            "weight": 0.10,
-            "passed": ok,
-            "details": "; ".join(details_parts),
-        }
-    except Exception as exc:
-        return {
-            "name": "config_parser",
-            "score": 0.0,
-            "weight": 0.10,
-            "passed": False,
-            "details": f"Error running config parser tests: {exc}",
-        }
-
-
-# ── Main ──────────────────────────────────────────────────────────
-
-EVALS = [
-    eval_tests,
-    eval_lint,
-    eval_type_check,
-    eval_coverage,
-    eval_guard_patterns,
-    eval_config_parser,
-]
+            "details": "Timed out after 180s",
+        }
+
+def eval_observability() -> dict:
+    """Analyze observability coverage: logging, structured logging, request tracing."""
+    import ast
+    import re
+    from pathlib import Path
+
+    skip = {
+        "tests", "test", ".venv", "venv", "node_modules", "__pycache__",
+        ".git", ".factory", "eval", "dist", "build", ".mypy_cache",
+    }
+    log_pats = [
+        r"\blogger\.\w+\(",
+        r"\blogging\.\w+\(",
+        r"\blog\.\w+\(",
+        r"\bconsole\.\w+\(",
+    ]
+    struct_pats = [r"\bstructlog\b", r"\bpino\b", r"\bwinston\b",
+                   r"\bslog\.\w+\(", r"\btracing::"]
+    trace_pats = [r"request.id|req.id|trace.id", r"\bcontextvars\b|ContextVar",
+                  r"\bopentelemetry\b", r"trace.context|TraceContext|span"]
+
+    sources = [f for f in Path(".").rglob("*.py")
+               if not any(p in f.parts for p in skip)]
+    total_fn = logged_fn = total_log = 0
+    has_struct = has_trace = False
+
+    for src in sources:
+        try:
+            code = src.read_text(errors="replace")
+        except OSError:
+            continue
+        try:
+            tree = ast.parse(code)
+        except SyntaxError:
+            continue
+        lines = code.splitlines()
+        for node in ast.walk(tree):
+            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                if node.name.startswith("__"):
+                    continue
+                total_fn += 1
+                start = node.lineno - 1
+                end = node.end_lineno or start + 1
+                body = "\n".join(lines[start:end])
+                for pat in log_pats:
+                    if re.search(pat, body):
+                        logged_fn += 1
+                        break
+        for pat in log_pats:
+            total_log += len(re.findall(pat, code))
+        for pat in struct_pats:
+            if re.search(pat, code):
+                has_struct = True
+        for pat in trace_pats:
+            if re.search(pat, code, re.IGNORECASE):
+                has_trace = True
+
+    if total_fn == 0:
+        return {"name": "observability", "score": 0.0, "weight": 0.08333333333333333,
+                "passed": True, "details": "No functions found to analyze"}
+
+    cov = logged_fn / total_fn
+    density = min(1.0, total_log / max(total_fn, 1))
+    score = 0.40 * cov + 0.25 * float(has_struct) + 0.20 * float(has_trace) + 0.15 * density
+
+    details = (f"coverage={cov:.0%} ({logged_fn}/{total_fn}), "
+               f"structured={'yes' if has_struct else 'no'}, "
+               f"tracing={'yes' if has_trace else 'no'}, "
+               f"density={density:.0%}")
+
+    return {"name": "observability", "score": round(score, 3), "weight": 0.08333333333333333,
+            "passed": score >= 0.3, "details": details}
+
+# Register all eval functions here.
+EVALS = [eval_tests, eval_lint, eval_type_check, eval_coverage, eval_observability]
 
 
 def main() -> None:
diff --git a/factory.md b/factory.md
index bc03b7e..8677777 100644
--- a/factory.md
+++ b/factory.md
@@ -4,8 +4,9 @@
 <!-- Fill in each section below. -->
 
 ## Goal
+<!-- A single sentence describing what this project should achieve. -->
 
-Domain-agnostic multi-agent software evolution loop that can auto-discover evals and continuously improve any software project.
+Provide a CLI and agent framework ("Remote Factory") that autonomously evolves software projects through systematic experimentation — detecting project state, discovering eval harnesses, running improvement cycles, and archiving learnings.
 
 ## Scope
 
@@ -14,17 +15,19 @@ Domain-agnostic multi-agent software evolution loop that can auto-discover evals
 <!-- One path per line. Glob patterns are supported. -->
 
 - factory/**/*.py
-- factory/agents/prompts/*.md
 - factory/dashboard/static/*
 - tests/**/*.py
 - templates/**
 - docs/**
+- eval/score.py
 
 ### Read-only
 <!-- Files the factory may read but must never modify. -->
 
 - README.md
 - pyproject.toml
+- CLAUDE.md
+- factory.md
 
 ## Guards
 <!-- Rules the factory must never violate. Checked before every commit. -->
@@ -32,7 +35,6 @@ Domain-agnostic multi-agent software evolution loop that can auto-discover evals
 - Do not delete or overwrite existing tests
 - Do not modify files outside the declared scope
 - Do not introduce secrets or credentials into the repository
-- Do not modify test fixtures that other tests depend on
 
 ## Eval
 
@@ -47,32 +49,27 @@ python eval/score.py
 ### Threshold
 <!-- Minimum composite score (0.0-1.0) required to keep a change. -->
 
-0.8
+0.74
 
 ## Target Branch
+<!-- Branch that experiment PRs target. Default: main -->
+<!-- Set to a different branch (e.g. factory/dev) to stage factory changes before merging to main -->
 
 main
 
 ## Project Eval
-<!-- No project-specific eval dimensions for the factory itself -->
-<!-- The factory uses the standard hygiene + growth eval framework -->
+<!-- No project-specific eval dimensions for this project. -->
 
 ## Eval Weights
-<!-- Using defaults: 50/50 hygiene/growth (no project eval) -->
-
-## Hypothesis Budget
-<!-- Controls how many hypotheses the Strategist generates per cycle. -->
-<!-- These are defaults — override per-run with --min-growth, --min-fix, --max-total -->
-
-- min_growth: 2
-- min_fix: 0
-- max_total: 7
+<!-- Default: hygiene 0.50, growth 0.50 -->
 
 ## Smoke Test
-<!-- Optional e2e smoke test command. Failure = mandatory revert. -->
+<!-- Optional shell command that must pass before any change is kept. -->
+<!-- If configured, this runs as part of `factory precheck` — failure = mandatory revert. -->
+<!-- Use for e2e verification: hit an endpoint, run a CLI command, check a process starts. -->
 
 ```bash
-pytest tests/ -x -q --tb=short
+uv run python -m factory detect . && uv run python -m factory --help
 ```
 
 ## Constraints
@@ -81,3 +78,18 @@ pytest tests/ -x -q --tb=short
 - Prefer small, incremental changes over large rewrites
 - Each change should be accompanied by at least one test
 - Follow the existing code style and conventions
+
+## Research Target
+<!-- Not a research project. -->
+
+## Mutable Surfaces
+<!-- Not used — this is not a research project. -->
+
+## Fixed Surfaces
+<!-- Not used — this is not a research project. -->
+
+## Research Constraints
+<!-- Not used. -->
+
+## Cost Budget
+<!-- Not configured. -->
diff --git a/factory/mcp_server.py b/factory/mcp_server.py
index b3288d8..772f620 100644
--- a/factory/mcp_server.py
+++ b/factory/mcp_server.py
@@ -22,8 +22,10 @@
 async def handle_get_score(project_path: str) -> str:
     """Read .factory/last_eval.json and return its contents as JSON text."""
     p = Path(project_path).resolve()
+    log.debug("handle_get_score", project=str(p))
     last_eval = p / ".factory" / "last_eval.json"
     if not last_eval.exists():
+        log.warning("handle_get_score_not_found", path=str(last_eval))
         return json.dumps({"error": f"No last_eval.json found at {last_eval}"})
     return last_eval.read_text()
 
@@ -33,8 +35,10 @@ async def handle_list_experiments(project_path: str, last_n: int = 10) -> str:
     from factory.store import ExperimentStore
 
     p = Path(project_path).resolve()
+    log.debug("handle_list_experiments", project=str(p), last_n=last_n)
     factory_dir = p / ".factory"
     if not factory_dir.is_dir():
+        log.warning("handle_list_experiments_no_factory_dir", project=str(p))
         return json.dumps({"error": f"No .factory/ directory at {p}"})
 
     store = ExperimentStore(p)
@@ -52,6 +56,7 @@ async def handle_get_status(project_path: str) -> str:
     from factory.state import detect_state
 
     p = Path(project_path).resolve()
+    log.debug("handle_get_status", project=str(p))
     state = detect_state(p)
     result: dict[str, object] = {"project_path": str(p), "state": state.value}
 
@@ -65,7 +70,9 @@ async def handle_get_status(project_path: str) -> str:
 async def handle_list_projects(projects_dir: str) -> str:
     """Scan for subdirectories containing .factory/config.json."""
     d = Path(projects_dir).resolve()
+    log.debug("handle_list_projects", dir=str(d))
     if not d.is_dir():
+        log.warning("handle_list_projects_dir_not_found", dir=str(d))
         return json.dumps({"error": f"Directory not found: {d}"})
 
     projects: list[dict[str, str]] = []
@@ -151,6 +158,7 @@ async def handle_list_projects(projects_dir: str) -> str:
 
 @server.list_tools()
 async def list_tools() -> list[Tool]:
+    log.debug("list_tools", count=len(_TOOLS))
     return _TOOLS
 
 
@@ -167,14 +175,17 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]:
 
     handler = handlers.get(name)
     if handler is None:
+        log.warning("call_tool_unknown", tool=name)
         return [TextContent(type="text", text=json.dumps({"error": f"Unknown tool: {name}"}))]
 
+    log.info("call_tool_dispatch", tool=name)
     result_text = await handler(arguments)
     return [TextContent(type="text", text=result_text)]
 
 
 async def run_server() -> None:
     """Start the MCP stdio server."""
+    log.info("mcp_server_starting")
     async with stdio_server() as (read_stream, write_stream):
         await server.run(
             read_stream,
diff --git a/factory/runners/__init__.py b/factory/runners/__init__.py
index 4c64060..890fe35 100644
--- a/factory/runners/__init__.py
+++ b/factory/runners/__init__.py
@@ -6,6 +6,8 @@
 from pathlib import Path
 from typing import Literal
 
+import structlog
+
 from factory.runners._stream import should_stream, stream_subprocess
 from factory.runners.bob import BobRunner, is_dry_run
 from factory.runners.claude import ClaudeRunner
@@ -22,6 +24,8 @@
     "stream_subprocess",
 ]
 
+log = structlog.get_logger()
+
 RunnerName = Literal["claude", "bob"]
 
 _RUNNERS: dict[str, type[Runner]] = {
@@ -50,8 +54,10 @@ def get_runner(name: str | None = None, project_path: Path | None = None) -> Run
 
     if resolved not in _RUNNERS:
         available = ", ".join(_RUNNERS.keys())
+        log.warning("get_runner_unknown", runner=resolved, available=available)
         raise ValueError(f"Unknown runner '{resolved}'. Available: {available}")
 
+    log.info("get_runner", runner=resolved)
     if resolved == "bob":
         return BobRunner(project_path=project_path)
     return _RUNNERS[resolved]()
@@ -59,4 +65,5 @@ def get_runner(name: str | None = None, project_path: Path | None = None) -> Run
 
 def register_runner(name: str, runner_class: type[Runner]) -> None:
     """Register a runner implementation (used by bob module on import)."""
+    log.debug("register_runner", name=name)
     _RUNNERS[name] = runner_class
diff --git a/factory/runners/_stream.py b/factory/runners/_stream.py
index 29aae3c..c020cfa 100644
--- a/factory/runners/_stream.py
+++ b/factory/runners/_stream.py
@@ -7,6 +7,10 @@
 import sys
 from typing import BinaryIO
 
+import structlog
+
+log = structlog.get_logger()
+
 
 def should_stream() -> bool:
     """Determine if we should stream subprocess output to the terminal.
@@ -16,9 +20,12 @@ def should_stream() -> bool:
     - stdout is not a TTY (e.g., piped to file)
     """
     if os.environ.get("FACTORY_RUNNER_QUIET", "").lower() in ("1", "true", "yes"):
+        log.debug("should_stream", result=False, reason="FACTORY_RUNNER_QUIET")
         return False
     if not sys.stdout.isatty():
+        log.debug("should_stream", result=False, reason="not_tty")
         return False
+    log.debug("should_stream", result=True)
     return True
 
 
@@ -67,6 +74,8 @@ async def stream_subprocess(
     Returns:
         (stdout_bytes, stderr_bytes) tuple with all collected output.
     """
+    log.debug("stream_subprocess_start", stream=stream, prefix=prefix)
+
     stdout_buf: list[bytes] = []
     stderr_buf: list[bytes] = []
 
@@ -94,4 +103,11 @@ async def stream_subprocess(
 
     await proc.wait()
 
+    log.debug(
+        "stream_subprocess_complete",
+        returncode=getattr(proc, "returncode", None),
+        stdout_bytes=sum(len(c) for c in stdout_buf),
+        stderr_bytes=sum(len(c) for c in stderr_buf),
+    )
+
     return b"".join(stdout_buf), b"".join(stderr_buf)
diff --git a/factory/runners/protocol.py b/factory/runners/protocol.py
index 78c30df..753a92c 100644
--- a/factory/runners/protocol.py
+++ b/factory/runners/protocol.py
@@ -5,6 +5,10 @@
 from pathlib import Path
 from typing import NoReturn, Protocol
 
+import structlog
+
+log = structlog.get_logger()
+
 
 class Runner(Protocol):
     """Protocol for CLI backend implementations (claude, bob, etc.)."""