From 7af7e51c0214ee72fdc16cf15c6f52087e539912 Mon Sep 17 00:00:00 2001
From: Luke Inglis
Date: Fri, 1 May 2026 14:54:37 -0400
Subject: [PATCH] feat: add security hygiene eval dimension (bandit + npm audit)

Add a 7th mandatory hygiene dimension that runs security scanners on
detected sub-projects:

- Python: runs bandit with JSON output, counts issues
- Node.js: runs npm audit with JSON output, sums vulnerabilities
- Returns a neutral score (0.5) when no scanner is detected

Rebalances HYGIENE_WEIGHTS to maintain a sum of 1.0 with the new
dimension at 0.08 weight. Tests and coverage keep the highest weights.

Includes 7 tests covering clean scans, issue parsing, the tool-not-found
fallback, and the score floor at zero. Updates existing tests for the
new dimension counts (6 -> 7 hygiene, 11 -> 12 total with growth).

Closes #128 (the "Good First Issues" item in contributing.md).

Signed-off-by: Luke Inglis
---
 factory/eval/hygiene.py    | 83 ++++++++++++++++++++++++++++++++----
 tests/eval/test_hygiene.py | 87 ++++++++++++++++++++++++++++++++++++--
 tests/eval/test_runner.py  |  6 +--
 3 files changed, 160 insertions(+), 16 deletions(-)

diff --git a/factory/eval/hygiene.py b/factory/eval/hygiene.py
index 46c3fbe..3ed0165 100644
--- a/factory/eval/hygiene.py
+++ b/factory/eval/hygiene.py
@@ -1,17 +1,18 @@
 """Universal hygiene eval dimensions applied to every factory-managed project.
 
-These 6 dimensions are mandatory and cannot be removed. They are computed by
+These 7 dimensions are mandatory and cannot be removed. They are computed by
 the factory itself (not by per-project eval/score.py) and auto-detect the
 project's tooling. Projects can ADD dimensions via eval/score.py but cannot
 remove any of these.
 
-Together with the 5 growth dimensions in growth.py, these form the 11
+Together with the 5 growth dimensions in growth.py, these form the 12
 mandatory eval dimensions that define the factory's quality baseline.
 
 All functions take a project_path and return an EvalResult-compatible dict.
 If a tool is not detected for a dimension, score is 0.5 (neutral), not 0.
 """
 
+import json
 import os
 import re
 import subprocess
@@ -20,12 +21,13 @@
 # Relative weights within the hygiene category (sum to 1.0).
 # The runner normalizes these so that hygiene gets 50% of the composite.
 HYGIENE_WEIGHTS = {
-    "tests": 0.30,
-    "lint": 0.15,
-    "type_check": 0.10,
-    "coverage": 0.25,
-    "guard_patterns": 0.10,
-    "config_parser": 0.10,
+    "tests": 0.28,
+    "lint": 0.14,
+    "type_check": 0.09,
+    "coverage": 0.23,
+    "guard_patterns": 0.09,
+    "config_parser": 0.09,
+    "security": 0.08,
 }
 
 
@@ -523,11 +525,73 @@ def eval_config_parser(project_path: Path) -> dict:
     }
 
 
+# ── Dimension 7: security (weight 0.08) ─────────────────────────
+
+
+def eval_security(project_path: Path) -> dict:
+    """Run security scanners across detected sub-projects. Partial credit per issue."""
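+    # Scoring: each bandit issue or npm audit vulnerability deducts 0.1 from
+    # a perfect 1.0; if no scanner could run at all, the dimension returns
+    # the neutral 0.5 rather than penalizing the project.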
+    sub_projects = _find_sub_projects(project_path)
+    total_issues = 0
+    ran_any = False
+    details_parts: list[str] = []
+
+    for sp in sub_projects:
+        if _detect_python_project(sp):
+            rc, stdout, stderr = _run_cmd(
+                ["python", "-m", "bandit", "-r", ".", "-f", "json", "-q"], sp,
+            )
+            # No `continue` when bandit is missing: a sub-project can be both
+            # Python and Node, and the npm audit check below should still run.
+            if not (rc == 1 and "Command not found" in stderr):
+                try:
+                    data = json.loads(stdout) if stdout.strip() else {}
+                    issues = data.get("results", [])
+                    count = len(issues)
+                except (json.JSONDecodeError, TypeError):
+                    count = 0
+                if rc == 0 and count == 0:
+                    ran_any = True
+                    details_parts.append(f"{sp.name}: clean")
+                elif count > 0:
+                    ran_any = True
+                    total_issues += count
+                    details_parts.append(f"{sp.name}: {count} issues")
+
+        if _detect_node_project(sp):
+            rc, stdout, stderr = _run_cmd(["npm", "audit", "--json"], sp, timeout=180)
+            if rc == 1 and "Command not found" in stderr:
+                continue
+            try:
+                data = json.loads(stdout) if stdout.strip() else {}
+                vulns = data.get("metadata", {}).get("vulnerabilities", {})
+                count = sum(vulns.get(sev, 0) for sev in ("low", "moderate", "high", "critical"))
+            except (json.JSONDecodeError, TypeError):
+                count = 0
+            if count == 0 and rc == 0:
+                ran_any = True
+                details_parts.append(f"{sp.name}(js): clean")
+            elif count > 0:
+                ran_any = True
+                total_issues += count
+                details_parts.append(f"{sp.name}(js): {count} vulnerabilities")
+
+    if not ran_any:
+        return _neutral("security", "no security scanner detected")
+
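+    # Linear penalty: ten or more combined findings drive the raw score to
+    # the 0.0 floor.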
"guard_patterns", "config_parser", "security"} def test_all_have_required_keys(self, tmp_path): results = compute_hygiene_results(tmp_path) @@ -139,3 +144,77 @@ def test_all_have_required_keys(self, tmp_path): assert "weight" in r assert "passed" in r assert "details" in r + + +class TestEvalSecurity: + def test_no_scanner_returns_neutral(self, tmp_path): + result = eval_security(tmp_path) + assert result["name"] == "security" + assert result["score"] == 0.5 + assert "Not detected" in result["details"] + + def test_python_bandit_clean(self, tmp_path): + (tmp_path / "pyproject.toml").write_text("[project]\n") + bandit_output = json.dumps({"results": []}) + with patch("factory.eval.hygiene._run_cmd") as mock: + mock.return_value = (0, bandit_output, "") + result = eval_security(tmp_path) + assert result["score"] == 1.0 + assert result["passed"] is True + assert "clean" in result["details"] + + def test_python_bandit_issues(self, tmp_path): + (tmp_path / "pyproject.toml").write_text("[project]\n") + bandit_output = json.dumps({ + "results": [ + {"issue_severity": "HIGH", "issue_text": "Use of exec"}, + {"issue_severity": "MEDIUM", "issue_text": "Hardcoded password"}, + {"issue_severity": "LOW", "issue_text": "Assert used"}, + ], + }) + with patch("factory.eval.hygiene._run_cmd") as mock: + mock.return_value = (1, bandit_output, "") + result = eval_security(tmp_path) + assert result["score"] == round(1.0 - 3 * 0.1, 4) + assert result["passed"] is False + assert "3 issues" in result["details"] + + def test_node_npm_audit_clean(self, tmp_path): + (tmp_path / "package.json").write_text("{}\n") + audit_output = json.dumps({ + "metadata": {"vulnerabilities": {"low": 0, "moderate": 0, "high": 0, "critical": 0}}, + }) + with patch("factory.eval.hygiene._run_cmd") as mock: + mock.return_value = (0, audit_output, "") + result = eval_security(tmp_path) + assert result["score"] == 1.0 + assert result["passed"] is True + assert "js" in result["details"] + + def test_node_npm_audit_vulnerabilities(self, tmp_path): + (tmp_path / "package.json").write_text("{}\n") + audit_output = json.dumps({ + "metadata": {"vulnerabilities": {"low": 2, "moderate": 1, "high": 1, "critical": 0}}, + }) + with patch("factory.eval.hygiene._run_cmd") as mock: + mock.return_value = (1, audit_output, "") + result = eval_security(tmp_path) + assert result["passed"] is False + assert "4 vulnerabilities" in result["details"] + + def test_bandit_not_installed(self, tmp_path): + (tmp_path / "pyproject.toml").write_text("[project]\n") + with patch("factory.eval.hygiene._run_cmd") as mock: + mock.return_value = (1, "", "Command not found: bandit") + result = eval_security(tmp_path) + assert result["score"] == 0.5 + assert "Not detected" in result["details"] + + def test_score_floor_at_zero(self, tmp_path): + (tmp_path / "pyproject.toml").write_text("[project]\n") + issues = [{"issue_severity": "HIGH", "issue_text": f"issue {i}"} for i in range(15)] + bandit_output = json.dumps({"results": issues}) + with patch("factory.eval.hygiene._run_cmd") as mock: + mock.return_value = (1, bandit_output, "") + result = eval_security(tmp_path) + assert result["score"] == 0.0 diff --git a/tests/eval/test_runner.py b/tests/eval/test_runner.py index 38a58fc..cde607f 100644 --- a/tests/eval/test_runner.py +++ b/tests/eval/test_runner.py @@ -62,7 +62,7 @@ async def test_failed_project_eval_still_has_mandatory(self, tmp_path): result = await run_eval("nonexistent_command_xyz", tmp_path, threshold=0.0) names = {r.name for r in result.results} # All 11 
+    def test_score_floor_at_zero(self, tmp_path):
+        (tmp_path / "pyproject.toml").write_text("[project]\n")
+        issues = [{"issue_severity": "HIGH", "issue_text": f"issue {i}"} for i in range(15)]
+        bandit_output = json.dumps({"results": issues})
+        with patch("factory.eval.hygiene._run_cmd") as mock:
+            mock.return_value = (1, bandit_output, "")
+            result = eval_security(tmp_path)
+        assert result["score"] == 0.0
diff --git a/tests/eval/test_runner.py b/tests/eval/test_runner.py
index 38a58fc..cde607f 100644
--- a/tests/eval/test_runner.py
+++ b/tests/eval/test_runner.py
@@ -62,7 +62,7 @@ async def test_failed_project_eval_still_has_mandatory(self, tmp_path):
         result = await run_eval("nonexistent_command_xyz", tmp_path, threshold=0.0)
         names = {r.name for r in result.results}
-        # All 11 mandatory should still be present
-        assert len(names) >= 11
+        # All 12 mandatory dimensions should still be present
+        assert len(names) >= 12
         assert "tests" in names
         assert "capability_surface" in names
 
@@ -79,12 +79,12 @@ async def test_timeout_project_eval(self, tmp_path):
         result = await run_eval(f"{sys.executable} {script}", tmp_path, threshold=0.0, timeout=1.0)
         # Mandatory dimensions still computed
         names = {r.name for r in result.results}
-        assert len(names) >= 11
+        assert len(names) >= 12
 
     async def test_weight_split_is_50_50(self, tmp_path):
         """Hygiene dimensions get 50% total weight, growth gets 50%."""
         result = await run_eval("true", tmp_path, threshold=0.0)
-        hygiene_names = {"tests", "lint", "type_check", "coverage", "guard_patterns", "config_parser"}
+        hygiene_names = {"tests", "lint", "type_check", "coverage", "guard_patterns", "config_parser", "security"}
         growth_names = {
             "capability_surface", "experiment_diversity", "observability", "research_grounding", "factory_effectiveness",