diff --git a/eval/score.py b/eval/score.py index 40c43a7..c5683d2 100644 --- a/eval/score.py +++ b/eval/score.py @@ -1,393 +1,233 @@ #!/usr/bin/env python3 -"""Eval script for the Remote Factory. +"""Auto-generated eval script for the Software Factory. -Runs 6 project-specific (hygiene) evaluation dimensions and outputs JSON -to stdout. The factory's eval runner injects universal growth dimensions -on top of these, so this script only needs to cover project health. +This script was generated by `factory discover`. It runs each eval dimension +as a subprocess and outputs JSON to stdout. Output format: {"results": [{"name": str, "score": float, "weight": float, "passed": bool, "details": str}, ...]} -Each dimension parses real metrics from tool output rather than using -binary exit-code checks. +You can edit this file to add custom evals or adjust weights. +Once edited, it becomes a Tier 1 (explicit) eval — the factory will use it as-is. """ -import asyncio import json -import os -import re import subprocess import sys -# Ensure the project root is on sys.path so factory.* imports work. -PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -if PROJECT_ROOT not in sys.path: - sys.path.insert(0, PROJECT_ROOT) - - -# ── Dimension 1: tests (weight 0.30) ───────────────────────────── - - def eval_tests() -> dict: - """Run test suite and parse pass/fail counts.""" + """Run test suite: uv run pytest -v""" try: result = subprocess.run( - ["uv", "run", "pytest", "-v"], + ['uv', 'run', 'pytest', '-v'], capture_output=True, text=True, timeout=120, - cwd=PROJECT_ROOT, ) - output = result.stdout + result.stderr - - # Look for "X passed" and optionally "Y failed" - passed_match = re.search(r"(\d+)\s+passed", output) - failed_match = re.search(r"(\d+)\s+failed", output) - - passed_count = int(passed_match.group(1)) if passed_match else 0 - failed_count = int(failed_match.group(1)) if failed_match else 0 - total = passed_count + failed_count - - if total == 0: - score = 0.0 - ok = False - details = "No test results found in output" + passed = result.returncode == 0 + if passed: + score = 1.0 else: - score = passed_count / total - ok = failed_count == 0 - details = f"{passed_count} passed, {failed_count} failed" - + # Partial score: count output lines as a rough error metric + error_lines = [ln for ln in (result.stdout + result.stderr).splitlines() if ln.strip()] + if not error_lines: + score = 0.0 + else: + score = max(0.0, 1.0 - len(error_lines) * 0.05) return { - "name": "tests", - "score": round(score, 4), - "weight": 0.30, - "passed": ok, - "details": details, + "name": 'tests', + "score": score, + "weight": 0.41666666666666663, + "passed": passed, + "details": (result.stdout or result.stderr).strip()[-500:], } except subprocess.TimeoutExpired: return { - "name": "tests", + "name": 'tests', "score": 0.0, - "weight": 0.30, + "weight": 0.41666666666666663, "passed": False, "details": "Timed out after 120s", } - except Exception as exc: - return { - "name": "tests", - "score": 0.0, - "weight": 0.30, - "passed": False, - "details": f"Error: {exc}", - } - - -# ── Dimension 2: lint (weight 0.15) ────────────────────────────── - def eval_lint() -> dict: - """Run ruff and parse error count.""" + """Run linter: uv run ruff check .""" try: result = subprocess.run( - ["uv", "run", "ruff", "check", "."], + ['uv', 'run', 'ruff', 'check', '.'], capture_output=True, text=True, timeout=120, - cwd=PROJECT_ROOT, ) - if result.returncode == 0: - return { - "name": "lint", - "score": 1.0, - "weight": 0.15, - "passed": True, - "details": "No lint errors", - } - - # Parse "Found X error(s)" - output = result.stdout + result.stderr - error_match = re.search(r"Found\s+(\d+)\s+error", output) - if error_match: - error_count = int(error_match.group(1)) - # Partial credit: lose 0.1 per error, floor at 0 - score = max(0.0, 1.0 - error_count * 0.1) - details = f"Found {error_count} lint error(s)" + passed = result.returncode == 0 + if passed: + score = 1.0 else: - score = 0.0 - details = output.strip()[-500:] - + # Partial score: count output lines as a rough error metric + error_lines = [ln for ln in (result.stdout + result.stderr).splitlines() if ln.strip()] + if not error_lines: + score = 0.0 + else: + score = max(0.0, 1.0 - len(error_lines) * 0.05) return { - "name": "lint", - "score": round(score, 4), - "weight": 0.15, - "passed": False, - "details": details, + "name": 'lint', + "score": score, + "weight": 0.24999999999999994, + "passed": passed, + "details": (result.stdout or result.stderr).strip()[-500:], } except subprocess.TimeoutExpired: return { - "name": "lint", + "name": 'lint', "score": 0.0, - "weight": 0.15, + "weight": 0.24999999999999994, "passed": False, "details": "Timed out after 120s", } - except Exception as exc: - return { - "name": "lint", - "score": 0.0, - "weight": 0.15, - "passed": False, - "details": f"Error: {exc}", - } - - -# ── Dimension 3: type_check (weight 0.10) ──────────────────────── - def eval_type_check() -> dict: - """Run mypy and parse error count.""" + """Run type checker: uv run mypy factory/""" try: result = subprocess.run( - ["uv", "run", "mypy", "factory/"], + ['uv', 'run', 'mypy', 'factory/'], capture_output=True, text=True, timeout=120, - cwd=PROJECT_ROOT, ) - if result.returncode == 0: - return { - "name": "type_check", - "score": 1.0, - "weight": 0.10, - "passed": True, - "details": "No type errors", - } - - output = result.stdout + result.stderr - # mypy prints "Found X error(s)" at the end - error_match = re.search(r"Found\s+(\d+)\s+error", output) - if error_match: - error_count = int(error_match.group(1)) - score = max(0.0, 1.0 - error_count * 0.05) - details = f"Found {error_count} type error(s)" + passed = result.returncode == 0 + if passed: + score = 1.0 else: - score = 0.0 - details = output.strip()[-500:] - + # Partial score: count output lines as a rough error metric + error_lines = [ln for ln in (result.stdout + result.stderr).splitlines() if ln.strip()] + if not error_lines: + score = 0.0 + else: + score = max(0.0, 1.0 - len(error_lines) * 0.05) return { - "name": "type_check", - "score": round(score, 4), - "weight": 0.10, - "passed": False, - "details": details, + "name": 'type_check', + "score": score, + "weight": 0.12499999999999997, + "passed": passed, + "details": (result.stdout or result.stderr).strip()[-500:], } except subprocess.TimeoutExpired: return { - "name": "type_check", + "name": 'type_check', "score": 0.0, - "weight": 0.10, + "weight": 0.12499999999999997, "passed": False, "details": "Timed out after 120s", } - except Exception as exc: - return { - "name": "type_check", - "score": 0.0, - "weight": 0.10, - "passed": False, - "details": f"Error: {exc}", - } - - -# ── Dimension 4: coverage (weight 0.25) ────────────────────────── - def eval_coverage() -> dict: - """Run pytest with coverage and parse the TOTAL percentage.""" + """Measure test coverage""" + import re as _re try: result = subprocess.run( - ["uv", "run", "pytest", "--cov=factory", "--cov-report=term", "-q"], + ['uv', 'run', 'python', '-m', 'pytest', '--cov=factory', '--cov-report=term', '-q'], capture_output=True, text=True, - timeout=120, - cwd=PROJECT_ROOT, + timeout=180, ) - output = result.stdout + result.stderr - - # Parse TOTAL line: "TOTAL 123 30 75%" - total_match = re.search(r"TOTAL\s+\d+\s+\d+\s+(\d+)%", output) - if total_match: - percentage = int(total_match.group(1)) - score = percentage / 100.0 - ok = percentage >= 80 - details = f"Coverage: {percentage}% (threshold: 80%)" - else: - score = 0.0 - ok = False - details = "Could not parse coverage from output" - - return { - "name": "coverage", - "score": round(score, 4), - "weight": 0.25, - "passed": ok, - "details": details, + output = (result.stdout + result.stderr).strip() + pct = 0.0 + m = _re.search(r'TOTAL\s+\d+\s+\d+\s+(\d+)%', output) + if m: + pct = int(m.group(1)) + score = pct / 100.0 + passed = score >= 0.70 + return { + "name": 'coverage', + "score": round(score, 3), + "weight": 0.12499999999999997, + "passed": passed, + "details": output[-500:], } except subprocess.TimeoutExpired: return { - "name": "coverage", - "score": 0.0, - "weight": 0.25, - "passed": False, - "details": "Timed out after 120s", - } - except Exception as exc: - return { - "name": "coverage", + "name": 'coverage', "score": 0.0, - "weight": 0.25, + "weight": 0.12499999999999997, "passed": False, - "details": f"Error: {exc}", - } - - -# ── Dimension 5: guard_patterns (weight 0.10) ──────────────────── - - -def eval_guard_patterns() -> dict: - """Test that the guard system's glob matching works correctly.""" - try: - from factory.eval.guards import _glob_match - except (ImportError, AttributeError) as exc: - return { - "name": "guard_patterns", - "score": 0.0, - "weight": 0.10, - "passed": False, - "details": f"Could not import _glob_match: {exc}", - } - - try: - test_cases: list[tuple[str, str, bool]] = [ - ("factory/**/*.py", "factory/eval/runner.py", True), - ("factory/**/*.py", "tests/test_guards.py", False), - ("tests/**/*.py", "tests/test_guards.py", True), - ("templates/**", "templates/factory_config.md", True), - ] - - correct = 0 - results_detail: list[str] = [] - for pattern, filepath, expected in test_cases: - actual = _glob_match(filepath, pattern) - if actual == expected: - correct += 1 - results_detail.append(f"OK: {pattern} vs {filepath}") - else: - results_detail.append( - f"FAIL: {pattern} vs {filepath} — " - f"expected {expected}, got {actual}" - ) - - total = len(test_cases) - score = correct / total - ok = correct == total - - return { - "name": "guard_patterns", - "score": round(score, 4), - "weight": 0.10, - "passed": ok, - "details": "; ".join(results_detail), - } - except Exception as exc: - return { - "name": "guard_patterns", - "score": 0.0, - "weight": 0.10, - "passed": False, - "details": f"Error running guard pattern tests: {exc}", - } - - -# ── Dimension 6: config_parser (weight 0.10) ───────────────────── - - -def eval_config_parser() -> dict: - """Test that the factory.md parser extracts fields correctly.""" - try: - from factory.store import ExperimentStore - except ImportError as exc: - return { - "name": "config_parser", - "score": 0.0, - "weight": 0.10, - "passed": False, - "details": f"Could not import ExperimentStore: {exc}", - } - - try: - from pathlib import Path - - store = ExperimentStore(Path(PROJECT_ROOT)) - config = asyncio.run(store.reparse_config()) - - checks: list[tuple[str, bool]] = [] - - # goal should be non-empty - checks.append(("goal is non-empty", bool(config.goal and len(config.goal) > 0))) - - # scope should contain expected patterns - checks.append(( - "scope contains factory/**/*.py", - "factory/**/*.py" in config.scope, - )) - - # eval_command should reference eval/score.py - checks.append(( - "eval_command references score.py", - "eval/score.py" in config.eval_command, - )) - - # eval_threshold should be 0.8 - checks.append(("eval_threshold is 0.8", config.eval_threshold == 0.8)) - - correct = sum(1 for _, ok in checks if ok) - total = len(checks) - score = correct / total - ok = correct == total - - details_parts = [ - f"{'OK' if passed else 'FAIL'}: {label}" - for label, passed in checks - ] - - return { - "name": "config_parser", - "score": round(score, 4), - "weight": 0.10, - "passed": ok, - "details": "; ".join(details_parts), - } - except Exception as exc: - return { - "name": "config_parser", - "score": 0.0, - "weight": 0.10, - "passed": False, - "details": f"Error running config parser tests: {exc}", - } - - -# ── Main ────────────────────────────────────────────────────────── - -EVALS = [ - eval_tests, - eval_lint, - eval_type_check, - eval_coverage, - eval_guard_patterns, - eval_config_parser, -] + "details": "Timed out after 180s", + } + +def eval_observability() -> dict: + """Analyze observability coverage: logging, structured logging, request tracing.""" + import ast + import re + from pathlib import Path + + skip = { + "tests", "test", ".venv", "venv", "node_modules", "__pycache__", + ".git", ".factory", "eval", "dist", "build", ".mypy_cache", + } + log_pats = [ + r"\blogger\.\w+\(", + r"\blogging\.\w+\(", + r"\blog\.\w+\(", + r"\bconsole\.\w+\(", + ] + struct_pats = [r"\bstructlog\b", r"\bpino\b", r"\bwinston\b", + r"\bslog\.\w+\(", r"\btracing::"] + trace_pats = [r"request.id|req.id|trace.id", r"\bcontextvars\b|ContextVar", + r"\bopentelemetry\b", r"trace.context|TraceContext|span"] + + sources = [f for f in Path(".").rglob("*.py") + if not any(p in f.parts for p in skip)] + total_fn = logged_fn = total_log = 0 + has_struct = has_trace = False + + for src in sources: + try: + code = src.read_text(errors="replace") + except OSError: + continue + try: + tree = ast.parse(code) + except SyntaxError: + continue + lines = code.splitlines() + for node in ast.walk(tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + if node.name.startswith("__"): + continue + total_fn += 1 + start = node.lineno - 1 + end = node.end_lineno or start + 1 + body = "\n".join(lines[start:end]) + for pat in log_pats: + if re.search(pat, body): + logged_fn += 1 + break + for pat in log_pats: + total_log += len(re.findall(pat, code)) + for pat in struct_pats: + if re.search(pat, code): + has_struct = True + for pat in trace_pats: + if re.search(pat, code, re.IGNORECASE): + has_trace = True + + if total_fn == 0: + return {"name": "observability", "score": 0.0, "weight": 0.08333333333333333, + "passed": True, "details": "No functions found to analyze"} + + cov = logged_fn / total_fn + density = min(1.0, total_log / max(total_fn, 1)) + score = 0.40 * cov + 0.25 * float(has_struct) + 0.20 * float(has_trace) + 0.15 * density + + details = (f"coverage={cov:.0%} ({logged_fn}/{total_fn}), " + f"structured={'yes' if has_struct else 'no'}, " + f"tracing={'yes' if has_trace else 'no'}, " + f"density={density:.0%}") + + return {"name": "observability", "score": round(score, 3), "weight": 0.08333333333333333, + "passed": score >= 0.3, "details": details} + +# Register all eval functions here. +EVALS = [eval_tests, eval_lint, eval_type_check, eval_coverage, eval_observability] def main() -> None: diff --git a/factory.md b/factory.md index bc03b7e..8677777 100644 --- a/factory.md +++ b/factory.md @@ -4,8 +4,9 @@ ## Goal + -Domain-agnostic multi-agent software evolution loop that can auto-discover evals and continuously improve any software project. +Provide a CLI and agent framework ("Remote Factory") that autonomously evolves software projects through systematic experimentation — detecting project state, discovering eval harnesses, running improvement cycles, and archiving learnings. ## Scope @@ -14,17 +15,19 @@ Domain-agnostic multi-agent software evolution loop that can auto-discover evals - factory/**/*.py -- factory/agents/prompts/*.md - factory/dashboard/static/* - tests/**/*.py - templates/** - docs/** +- eval/score.py ### Read-only - README.md - pyproject.toml +- CLAUDE.md +- factory.md ## Guards @@ -32,7 +35,6 @@ Domain-agnostic multi-agent software evolution loop that can auto-discover evals - Do not delete or overwrite existing tests - Do not modify files outside the declared scope - Do not introduce secrets or credentials into the repository -- Do not modify test fixtures that other tests depend on ## Eval @@ -47,32 +49,27 @@ python eval/score.py ### Threshold -0.8 +0.74 ## Target Branch + + main ## Project Eval - - + ## Eval Weights - - -## Hypothesis Budget - - - -- min_growth: 2 -- min_fix: 0 -- max_total: 7 + ## Smoke Test - + + + ```bash -pytest tests/ -x -q --tb=short +uv run python -m factory detect . && uv run python -m factory --help ``` ## Constraints @@ -81,3 +78,18 @@ pytest tests/ -x -q --tb=short - Prefer small, incremental changes over large rewrites - Each change should be accompanied by at least one test - Follow the existing code style and conventions + +## Research Target + + +## Mutable Surfaces + + +## Fixed Surfaces + + +## Research Constraints + + +## Cost Budget + diff --git a/factory/mcp_server.py b/factory/mcp_server.py index b3288d8..772f620 100644 --- a/factory/mcp_server.py +++ b/factory/mcp_server.py @@ -22,8 +22,10 @@ async def handle_get_score(project_path: str) -> str: """Read .factory/last_eval.json and return its contents as JSON text.""" p = Path(project_path).resolve() + log.debug("handle_get_score", project=str(p)) last_eval = p / ".factory" / "last_eval.json" if not last_eval.exists(): + log.warning("handle_get_score_not_found", path=str(last_eval)) return json.dumps({"error": f"No last_eval.json found at {last_eval}"}) return last_eval.read_text() @@ -33,8 +35,10 @@ async def handle_list_experiments(project_path: str, last_n: int = 10) -> str: from factory.store import ExperimentStore p = Path(project_path).resolve() + log.debug("handle_list_experiments", project=str(p), last_n=last_n) factory_dir = p / ".factory" if not factory_dir.is_dir(): + log.warning("handle_list_experiments_no_factory_dir", project=str(p)) return json.dumps({"error": f"No .factory/ directory at {p}"}) store = ExperimentStore(p) @@ -52,6 +56,7 @@ async def handle_get_status(project_path: str) -> str: from factory.state import detect_state p = Path(project_path).resolve() + log.debug("handle_get_status", project=str(p)) state = detect_state(p) result: dict[str, object] = {"project_path": str(p), "state": state.value} @@ -65,7 +70,9 @@ async def handle_get_status(project_path: str) -> str: async def handle_list_projects(projects_dir: str) -> str: """Scan for subdirectories containing .factory/config.json.""" d = Path(projects_dir).resolve() + log.debug("handle_list_projects", dir=str(d)) if not d.is_dir(): + log.warning("handle_list_projects_dir_not_found", dir=str(d)) return json.dumps({"error": f"Directory not found: {d}"}) projects: list[dict[str, str]] = [] @@ -151,6 +158,7 @@ async def handle_list_projects(projects_dir: str) -> str: @server.list_tools() async def list_tools() -> list[Tool]: + log.debug("list_tools", count=len(_TOOLS)) return _TOOLS @@ -167,14 +175,17 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: handler = handlers.get(name) if handler is None: + log.warning("call_tool_unknown", tool=name) return [TextContent(type="text", text=json.dumps({"error": f"Unknown tool: {name}"}))] + log.info("call_tool_dispatch", tool=name) result_text = await handler(arguments) return [TextContent(type="text", text=result_text)] async def run_server() -> None: """Start the MCP stdio server.""" + log.info("mcp_server_starting") async with stdio_server() as (read_stream, write_stream): await server.run( read_stream, diff --git a/factory/runners/__init__.py b/factory/runners/__init__.py index 4c64060..890fe35 100644 --- a/factory/runners/__init__.py +++ b/factory/runners/__init__.py @@ -6,6 +6,8 @@ from pathlib import Path from typing import Literal +import structlog + from factory.runners._stream import should_stream, stream_subprocess from factory.runners.bob import BobRunner, is_dry_run from factory.runners.claude import ClaudeRunner @@ -22,6 +24,8 @@ "stream_subprocess", ] +log = structlog.get_logger() + RunnerName = Literal["claude", "bob"] _RUNNERS: dict[str, type[Runner]] = { @@ -50,8 +54,10 @@ def get_runner(name: str | None = None, project_path: Path | None = None) -> Run if resolved not in _RUNNERS: available = ", ".join(_RUNNERS.keys()) + log.warning("get_runner_unknown", runner=resolved, available=available) raise ValueError(f"Unknown runner '{resolved}'. Available: {available}") + log.info("get_runner", runner=resolved) if resolved == "bob": return BobRunner(project_path=project_path) return _RUNNERS[resolved]() @@ -59,4 +65,5 @@ def get_runner(name: str | None = None, project_path: Path | None = None) -> Run def register_runner(name: str, runner_class: type[Runner]) -> None: """Register a runner implementation (used by bob module on import).""" + log.debug("register_runner", name=name) _RUNNERS[name] = runner_class diff --git a/factory/runners/_stream.py b/factory/runners/_stream.py index 29aae3c..c020cfa 100644 --- a/factory/runners/_stream.py +++ b/factory/runners/_stream.py @@ -7,6 +7,10 @@ import sys from typing import BinaryIO +import structlog + +log = structlog.get_logger() + def should_stream() -> bool: """Determine if we should stream subprocess output to the terminal. @@ -16,9 +20,12 @@ def should_stream() -> bool: - stdout is not a TTY (e.g., piped to file) """ if os.environ.get("FACTORY_RUNNER_QUIET", "").lower() in ("1", "true", "yes"): + log.debug("should_stream", result=False, reason="FACTORY_RUNNER_QUIET") return False if not sys.stdout.isatty(): + log.debug("should_stream", result=False, reason="not_tty") return False + log.debug("should_stream", result=True) return True @@ -67,6 +74,8 @@ async def stream_subprocess( Returns: (stdout_bytes, stderr_bytes) tuple with all collected output. """ + log.debug("stream_subprocess_start", stream=stream, prefix=prefix) + stdout_buf: list[bytes] = [] stderr_buf: list[bytes] = [] @@ -94,4 +103,11 @@ async def stream_subprocess( await proc.wait() + log.debug( + "stream_subprocess_complete", + returncode=getattr(proc, "returncode", None), + stdout_bytes=sum(len(c) for c in stdout_buf), + stderr_bytes=sum(len(c) for c in stderr_buf), + ) + return b"".join(stdout_buf), b"".join(stderr_buf) diff --git a/factory/runners/protocol.py b/factory/runners/protocol.py index 78c30df..753a92c 100644 --- a/factory/runners/protocol.py +++ b/factory/runners/protocol.py @@ -5,6 +5,10 @@ from pathlib import Path from typing import NoReturn, Protocol +import structlog + +log = structlog.get_logger() + class Runner(Protocol): """Protocol for CLI backend implementations (claude, bob, etc.)."""