factory/eval/hygiene.py (83 changes: 74 additions & 9 deletions)
@@ -1,17 +1,18 @@
"""Universal hygiene eval dimensions applied to every factory-managed project.

-These 6 dimensions are mandatory and cannot be removed. They are computed by
+These 7 dimensions are mandatory and cannot be removed. They are computed by
the factory itself (not by per-project eval/score.py) and auto-detect the
project's tooling. Projects can ADD dimensions via eval/score.py but cannot
remove any of these.

-Together with the 5 growth dimensions in growth.py, these form the 11
+Together with the 5 growth dimensions in growth.py, these form the 12
mandatory eval dimensions that define the factory's quality baseline.

All functions take a project_path and return an EvalResult-compatible dict.
If a tool is not detected for a dimension, score is 0.5 (neutral), not 0.
"""

+import json
import os
import re
import subprocess
@@ -20,12 +21,13 @@
# Relative weights within the hygiene category (sum to 1.0).
# The runner normalizes these so that hygiene gets 50% of the composite.
HYGIENE_WEIGHTS = {
"tests": 0.30,
"lint": 0.15,
"type_check": 0.10,
"coverage": 0.25,
"guard_patterns": 0.10,
"config_parser": 0.10,
"tests": 0.28,
"lint": 0.14,
"type_check": 0.09,
"coverage": 0.23,
"guard_patterns": 0.09,
"config_parser": 0.09,
"security": 0.08,
}


@@ -523,16 +525,79 @@ def eval_config_parser(project_path: Path) -> dict:
    }


+# ── Dimension 7: security (weight 0.08) ─────────────────────────
+
+
+def eval_security(project_path: Path) -> dict:
+    """Run security scanners across detected sub-projects. Partial credit per issue."""
+    sub_projects = _find_sub_projects(project_path)
+    total_issues = 0
+    ran_any = False
+    details_parts: list[str] = []
+
+    for sp in sub_projects:
+        if _detect_python_project(sp):
+            rc, stdout, stderr = _run_cmd(
+                ["python", "-m", "bandit", "-r", ".", "-f", "json", "-q"], sp,
+            )
+            # Skip only the bandit check when the tool is missing, so a
+            # sub-project that is both Python and Node still gets npm audit.
+            if not (rc == 1 and "Command not found" in stderr):
+                try:
+                    data = json.loads(stdout) if stdout.strip() else {}
+                    issues = data.get("results", [])
+                    count = len(issues)
+                except (json.JSONDecodeError, TypeError):
+                    count = 0
+                if rc == 0 and count == 0:
+                    ran_any = True
+                    details_parts.append(f"{sp.name}: clean")
+                elif count > 0:
+                    ran_any = True
+                    total_issues += count
+                    details_parts.append(f"{sp.name}: {count} issues")
+
+        if _detect_node_project(sp):
+            rc, stdout, stderr = _run_cmd(["npm", "audit", "--json"], sp, timeout=180)
+            if rc == 1 and "Command not found" in stderr:
+                continue
+            try:
+                data = json.loads(stdout) if stdout.strip() else {}
+                vulns = data.get("metadata", {}).get("vulnerabilities", {})
+                count = sum(vulns.get(sev, 0) for sev in ("low", "moderate", "high", "critical"))
+            except (json.JSONDecodeError, TypeError):
+                count = 0
+            if count == 0 and rc == 0:
+                ran_any = True
+                details_parts.append(f"{sp.name}(js): clean")
+            elif count > 0:
+                ran_any = True
+                total_issues += count
+                details_parts.append(f"{sp.name}(js): {count} vulnerabilities")
+
+    if not ran_any:
+        return _neutral("security", "no security scanner detected")
+
+    score = max(0.0, 1.0 - total_issues * 0.1)
+    return {
+        "name": "security",
+        "score": round(score, 4),
+        "weight": HYGIENE_WEIGHTS["security"],
+        "passed": total_issues == 0,
+        "details": "; ".join(details_parts),
+    }


# ── Public API ─────────────────────────────────────────────────────


def compute_hygiene_results(project_path: Path) -> list[dict]:
-    """Compute all 6 mandatory hygiene dimensions for a project."""
+    """Compute all 7 mandatory hygiene dimensions for a project."""
    return [
        eval_tests(project_path),
        eval_lint(project_path),
        eval_type_check(project_path),
        eval_coverage(project_path),
        eval_guard_patterns(project_path),
        eval_config_parser(project_path),
+        eval_security(project_path),
    ]
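For reference, here is how the partial-credit curve in `eval_security` plays out at a few issue counts. This is a minimal sketch that just restates the `max(0.0, 1.0 - total_issues * 0.1)` formula above; the `security_score` helper name is illustrative, not part of the PR:

```python
def security_score(total_issues: int) -> float:
    # Each finding costs 0.1 of the score; the floor is 0.0 at 10+ issues.
    return round(max(0.0, 1.0 - total_issues * 0.1), 4)

assert security_score(0) == 1.0   # clean scan: full credit
assert security_score(3) == 0.7   # mirrors test_python_bandit_issues
assert security_score(15) == 0.0  # mirrors test_score_floor_at_zero
```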
tests/eval/test_hygiene.py (87 changes: 83 additions & 4 deletions)
@@ -1,5 +1,8 @@
"""Tests for factory.eval.hygiene — universal hygiene dimensions."""

+import json
+from unittest.mock import patch
+
from factory.eval.hygiene import (
    HYGIENE_WEIGHTS,
    _find_sub_projects,
@@ -8,6 +11,7 @@
    eval_coverage,
    eval_guard_patterns,
    eval_lint,
+    eval_security,
    eval_tests,
    eval_type_check,
)
@@ -18,9 +22,10 @@ def test_weights_sum_to_one(self):
        total = sum(HYGIENE_WEIGHTS.values())
        assert abs(total - 1.0) < 1e-9

-    def test_all_six_dimensions(self):
+    def test_all_seven_dimensions(self):
        assert set(HYGIENE_WEIGHTS.keys()) == {
            "tests", "lint", "type_check", "coverage", "guard_patterns", "config_parser",
+            "security",
        }


@@ -125,11 +130,11 @@ def test_valid_factory_md(self, tmp_path):


class TestComputeHygieneResults:
-    def test_returns_all_six(self, tmp_path):
+    def test_returns_all_seven(self, tmp_path):
        results = compute_hygiene_results(tmp_path)
-        assert len(results) == 6
+        assert len(results) == 7
        names = {r["name"] for r in results}
-        assert names == {"tests", "lint", "type_check", "coverage", "guard_patterns", "config_parser"}
+        assert names == {"tests", "lint", "type_check", "coverage", "guard_patterns", "config_parser", "security"}

    def test_all_have_required_keys(self, tmp_path):
        results = compute_hygiene_results(tmp_path)
@@ -139,3 +144,77 @@ def test_all_have_required_keys(self, tmp_path):
assert "weight" in r
assert "passed" in r
assert "details" in r


+class TestEvalSecurity:
+    def test_no_scanner_returns_neutral(self, tmp_path):
+        result = eval_security(tmp_path)
+        assert result["name"] == "security"
+        assert result["score"] == 0.5
+        assert "Not detected" in result["details"]
+
+    def test_python_bandit_clean(self, tmp_path):
+        (tmp_path / "pyproject.toml").write_text("[project]\n")
+        bandit_output = json.dumps({"results": []})
+        with patch("factory.eval.hygiene._run_cmd") as mock:
+            mock.return_value = (0, bandit_output, "")
+            result = eval_security(tmp_path)
+        assert result["score"] == 1.0
+        assert result["passed"] is True
+        assert "clean" in result["details"]
+
+    def test_python_bandit_issues(self, tmp_path):
+        (tmp_path / "pyproject.toml").write_text("[project]\n")
+        bandit_output = json.dumps({
+            "results": [
+                {"issue_severity": "HIGH", "issue_text": "Use of exec"},
+                {"issue_severity": "MEDIUM", "issue_text": "Hardcoded password"},
+                {"issue_severity": "LOW", "issue_text": "Assert used"},
+            ],
+        })
+        with patch("factory.eval.hygiene._run_cmd") as mock:
+            mock.return_value = (1, bandit_output, "")
+            result = eval_security(tmp_path)
+        assert result["score"] == round(1.0 - 3 * 0.1, 4)
+        assert result["passed"] is False
+        assert "3 issues" in result["details"]
+
+    def test_node_npm_audit_clean(self, tmp_path):
+        (tmp_path / "package.json").write_text("{}\n")
+        audit_output = json.dumps({
+            "metadata": {"vulnerabilities": {"low": 0, "moderate": 0, "high": 0, "critical": 0}},
+        })
+        with patch("factory.eval.hygiene._run_cmd") as mock:
+            mock.return_value = (0, audit_output, "")
+            result = eval_security(tmp_path)
+        assert result["score"] == 1.0
+        assert result["passed"] is True
+        assert "js" in result["details"]
+
+    def test_node_npm_audit_vulnerabilities(self, tmp_path):
+        (tmp_path / "package.json").write_text("{}\n")
+        audit_output = json.dumps({
+            "metadata": {"vulnerabilities": {"low": 2, "moderate": 1, "high": 1, "critical": 0}},
+        })
+        with patch("factory.eval.hygiene._run_cmd") as mock:
+            mock.return_value = (1, audit_output, "")
+            result = eval_security(tmp_path)
+        assert result["passed"] is False
+        assert "4 vulnerabilities" in result["details"]
+
+    def test_bandit_not_installed(self, tmp_path):
+        (tmp_path / "pyproject.toml").write_text("[project]\n")
+        with patch("factory.eval.hygiene._run_cmd") as mock:
+            mock.return_value = (1, "", "Command not found: bandit")
+            result = eval_security(tmp_path)
+        assert result["score"] == 0.5
+        assert "Not detected" in result["details"]
+
+    def test_score_floor_at_zero(self, tmp_path):
+        (tmp_path / "pyproject.toml").write_text("[project]\n")
+        issues = [{"issue_severity": "HIGH", "issue_text": f"issue {i}"} for i in range(15)]
+        bandit_output = json.dumps({"results": issues})
+        with patch("factory.eval.hygiene._run_cmd") as mock:
+            mock.return_value = (1, bandit_output, "")
+            result = eval_security(tmp_path)
+        assert result["score"] == 0.0
tests/eval/test_runner.py (6 changes: 3 additions & 3 deletions)
@@ -62,7 +62,7 @@ async def test_failed_project_eval_still_has_mandatory(self, tmp_path):
result = await run_eval("nonexistent_command_xyz", tmp_path, threshold=0.0)
names = {r.name for r in result.results}
# All 11 mandatory should still be present
assert len(names) >= 11
assert len(names) >= 12
assert "tests" in names
assert "capability_surface" in names

@@ -79,12 +79,12 @@ async def test_timeout_project_eval(self, tmp_path):
        result = await run_eval(f"{sys.executable} {script}", tmp_path, threshold=0.0, timeout=1.0)
        # Mandatory dimensions still computed
        names = {r.name for r in result.results}
-        assert len(names) >= 11
+        assert len(names) >= 12

    async def test_weight_split_is_50_50(self, tmp_path):
        """Hygiene dimensions get 50% total weight, growth gets 50%."""
        result = await run_eval("true", tmp_path, threshold=0.0)
-        hygiene_names = {"tests", "lint", "type_check", "coverage", "guard_patterns", "config_parser"}
+        hygiene_names = {"tests", "lint", "type_check", "coverage", "guard_patterns", "config_parser", "security"}
        growth_names = {
            "capability_surface", "experiment_diversity", "observability",
            "research_grounding", "factory_effectiveness",