diff --git a/benchmark/RESULTS.md b/benchmark/RESULTS.md index 969c743..a060f59 100644 --- a/benchmark/RESULTS.md +++ b/benchmark/RESULTS.md @@ -6,11 +6,11 @@ under `test/fixtures/`. Ground truth (intent, expected finding kinds, expected rating) is fixed by fixture design — the runner scores the audit engine against it. Benign cases include deliberate false-positive traps a naive detector would flag. -- Cases: **31** (24 rogue, 7 benign) across **24** detector kinds +- Cases: **33** (24 rogue, 9 benign) across **24** detector kinds - Detection (any finding): recall **100.0%**, false-positive rate **0.0%**, precision **100.0%** - At a `fail-on: high` CI gate: recall **54.2%**, false-positive rate **0.0%**, precision **100.0%** - Correct primary finding kind identified on **24/24** rogue cases; all expected kinds on **24/24** -- Exact rating match where the label pins one: **21/21** +- Exact rating match where the label pins one: **23/23** ## Confusion matrix by CI gate threshold @@ -18,10 +18,10 @@ A repo is predicted "drift" when its overall rating meets the threshold. `low` = | Gate (`fail-on`) | TP | FP | FN | TN | Precision | Recall | FP rate | F1 | Accuracy | | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | -| low | 24 | 0 | 0 | 7 | 100.0% | 100.0% | 0.0% | 100.0% | 100.0% | -| medium | 22 | 0 | 2 | 7 | 100.0% | 91.7% | 0.0% | 95.7% | 93.5% | -| high | 13 | 0 | 11 | 7 | 100.0% | 54.2% | 0.0% | 70.3% | 64.5% | -| critical | 1 | 0 | 23 | 7 | 100.0% | 4.2% | 0.0% | 8.0% | 25.8% | +| low | 24 | 0 | 0 | 9 | 100.0% | 100.0% | 0.0% | 100.0% | 100.0% | +| medium | 22 | 0 | 2 | 9 | 100.0% | 91.7% | 0.0% | 95.7% | 93.9% | +| high | 13 | 0 | 11 | 9 | 100.0% | 54.2% | 0.0% | 70.3% | 66.7% | +| critical | 1 | 0 | 23 | 9 | 100.0% | 4.2% | 0.0% | 8.0% | 30.3% | ## Detector-kind identification (rogue cases) @@ -31,7 +31,7 @@ A repo is predicted "drift" when its overall rating meets the threshold. `low` = ## Rating agreement -Of the **21** cases whose label pins an exact consolidated rating, the audit matched **21**. +Of the **23** cases whose label pins an exact consolidated rating, the audit matched **23**. ## Results by category @@ -40,7 +40,7 @@ Of the **21** cases whose label pins an exact consolidated rating, the audit mat | baseline | 2 | 2/2 | — | | clean | 3 | — | 3/3 | | exceptions | 2 | 2/2 | — | -| fp-guard | 3 | — | 3/3 | +| fp-guard | 5 | — | 5/5 | | instructions | 1 | 1/1 | — | | mcp-drift | 10 | 10/10 | — | | multi-class | 1 | 1/1 | — | @@ -64,6 +64,8 @@ None. Every rogue case produced all of its expected finding kinds, no benign cas | aider-safe | benign | fp-guard | none | 0 | — | — | yes | | codex-multiline-args | benign | fp-guard | none | 0 | — | — | — | | mcp-command-neutral-flag-equivalence | benign | fp-guard | none | 0 | — | — | — | +| mcp-github-sha-pinned | benign | fp-guard | none | 0 | — | — | yes | +| mcp-absolute-script-path | benign | fp-guard | none | 0 | — | — | yes | | exceptions-active | benign | suppression | none | 0 | — | — | yes | | conflicted | rogue | multi-class | high | 7 | yes | yes | yes | | claude-missing-mcp-server | rogue | mcp-drift | medium | 1 | yes | yes | yes | diff --git a/benchmark/labels.json b/benchmark/labels.json index fe3b699..e263d31 100644 --- a/benchmark/labels.json +++ b/benchmark/labels.json @@ -10,6 +10,8 @@ { "fixture": "aider-safe", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "recursive": false, "note": "dangerously-allow-non-git is false — must not fire." }, { "fixture": "codex-multiline-args", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": null, "recursive": false, "note": "Identical github invocation; Codex writes args across multiple TOML lines. A line-by-line parser would falsely flag mcp_command_mismatch." }, { "fixture": "mcp-command-neutral-flag-equivalence", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": null, "recursive": false, "note": "npx -y vs npx: the neutral -y flag must canonicalize to equal, no mcp_command_mismatch." }, + { "fixture": "mcp-github-sha-pinned", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "recursive": false, "note": "A GitHub URL pinned to a 40-char commit SHA is reproducible; it must NOT be flagged mcp_unpinned the way a branch/@latest URL is." }, + { "fixture": "mcp-absolute-script-path", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "recursive": false, "note": "An absolute local script path (outside the repo) is treated as a system path, not a repo-relative script; must NOT be flagged missing_local_script." }, { "fixture": "exceptions-active", "intent": "benign", "category": "suppression", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "recursive": false, "note": "An active .policymesh-exceptions.json entry suppresses the underlying mismatch." }, { "fixture": "conflicted", "intent": "rogue", "category": "multi-class", "expectFlagged": true, "expectKinds": ["policy_mesh.mcp_command_mismatch", "policy_mesh.mcp_unpinned", "policy_mesh.claude_deny_allow_overlap", "policy_mesh.claude_broad_allow_no_guard", "policy_mesh.codex_network_without_review", "policy_mesh.codex_trusted_with_risky_mcp", "policy_mesh.codex_claude_posture_gap"], "expectRating": "high", "recursive": false, "note": "Seven distinct drift classes in one repo." }, diff --git a/dist/parsers/mcp.js b/dist/parsers/mcp.js index 208fa64..cd24771 100644 --- a/dist/parsers/mcp.js +++ b/dist/parsers/mcp.js @@ -114,8 +114,16 @@ export function isUnpinnedCommand(server) { if (normalized.includes('@latest')) { return true; } - if (/https:\/\/github\.com\/[^ ]+/.test(normalized)) { - return true; + const githubUrl = normalized.match(/https:\/\/github\.com\/[^ ]+/); + if (githubUrl) { + // A GitHub URL is unpinned UNLESS it references an immutable 40-char + // commit SHA (e.g. .../archive/.tar.gz, .../tree/, or + // git+https://...#). Branch / tag / HEAD references are mutable + // and stay flagged. This avoids over-flagging every GitHub URL as + // unpinned when a SHA already makes the install reproducible. + if (!/[0-9a-f]{40}/.test(githubUrl[0])) { + return true; + } } if (/\bcurl\b.+\|\s*(bash|sh)\b/.test(normalized)) { return true; diff --git a/dist/report.js b/dist/report.js index fa1d057..1d3036d 100644 --- a/dist/report.js +++ b/dist/report.js @@ -370,6 +370,13 @@ function shortDescriptionForKind(kind) { 'policy_mesh.codex_trusted_with_risky_mcp': 'Codex project trusted while MCP is unpinned or inconsistent.', 'policy_mesh.codex_claude_posture_gap': 'Codex sandbox posture inconsistent with Claude deny rules.', 'policy_mesh.aider_dangerous_allow_non_git': 'Aider bypasses the git-tracked audit trail.', + 'policy_mesh.instructions_skip_confirmation': 'Instruction file directs the agent to act without asking for confirmation.', + 'policy_mesh.instructions_override_safety': 'Instruction file directs the agent to ignore or bypass safety/deny rules.', + 'policy_mesh.instructions_broad_write': 'Instruction file grants broad read/write access to any file or path.', + 'policy_mesh.instructions_auto_version_control': 'Instruction file directs the agent to commit, merge, or push automatically.', + 'policy_mesh.baseline_rating_drift': 'Audit rating exceeds the team baseline expected rating.', + 'policy_mesh.baseline_version_drift': 'MCP server version drifts from the team baseline pin.', + 'policy_mesh.baseline_parse_error': 'Baseline file could not be parsed.', 'policy_mesh.config_parse_error': 'Agent config file could not be parsed.', 'policy_mesh.exceptions_parse_error': 'Exceptions baseline file could not be parsed.' }; diff --git a/src/parsers/mcp.ts b/src/parsers/mcp.ts index 9c8866d..ef4f737 100644 --- a/src/parsers/mcp.ts +++ b/src/parsers/mcp.ts @@ -163,8 +163,16 @@ export function isUnpinnedCommand(server: McpServerRaw): boolean { return true; } - if (/https:\/\/github\.com\/[^ ]+/.test(normalized)) { - return true; + const githubUrl = normalized.match(/https:\/\/github\.com\/[^ ]+/); + if (githubUrl) { + // A GitHub URL is unpinned UNLESS it references an immutable 40-char + // commit SHA (e.g. .../archive/.tar.gz, .../tree/, or + // git+https://...#). Branch / tag / HEAD references are mutable + // and stay flagged. This avoids over-flagging every GitHub URL as + // unpinned when a SHA already makes the install reproducible. + if (!/[0-9a-f]{40}/.test(githubUrl[0])) { + return true; + } } if (/\bcurl\b.+\|\s*(bash|sh)\b/.test(normalized)) { diff --git a/src/report.ts b/src/report.ts index 5b60b87..1228377 100644 --- a/src/report.ts +++ b/src/report.ts @@ -415,6 +415,13 @@ function shortDescriptionForKind(kind: string): string { 'policy_mesh.codex_trusted_with_risky_mcp': 'Codex project trusted while MCP is unpinned or inconsistent.', 'policy_mesh.codex_claude_posture_gap': 'Codex sandbox posture inconsistent with Claude deny rules.', 'policy_mesh.aider_dangerous_allow_non_git': 'Aider bypasses the git-tracked audit trail.', + 'policy_mesh.instructions_skip_confirmation': 'Instruction file directs the agent to act without asking for confirmation.', + 'policy_mesh.instructions_override_safety': 'Instruction file directs the agent to ignore or bypass safety/deny rules.', + 'policy_mesh.instructions_broad_write': 'Instruction file grants broad read/write access to any file or path.', + 'policy_mesh.instructions_auto_version_control': 'Instruction file directs the agent to commit, merge, or push automatically.', + 'policy_mesh.baseline_rating_drift': 'Audit rating exceeds the team baseline expected rating.', + 'policy_mesh.baseline_version_drift': 'MCP server version drifts from the team baseline pin.', + 'policy_mesh.baseline_parse_error': 'Baseline file could not be parsed.', 'policy_mesh.config_parse_error': 'Agent config file could not be parsed.', 'policy_mesh.exceptions_parse_error': 'Exceptions baseline file could not be parsed.' }; diff --git a/test/cli-output.test.mjs b/test/cli-output.test.mjs index 8aaf565..def2e62 100644 --- a/test/cli-output.test.mjs +++ b/test/cli-output.test.mjs @@ -1670,6 +1670,36 @@ test('CLI render passes through SARIF format from a saved audit JSON', async () } }); +test('CLI SARIF rules carry human descriptions for instruction and baseline kinds', async () => { + async function rulesFor(fixture) { + const { stdout } = await execFileAsync( + process.execPath, + ['dist/index.js', 'audit', '--repo', join(testDir, 'fixtures', fixture), '--format', 'sarif'], + { cwd: packageRoot } + ); + return JSON.parse(stdout).runs[0].tool.driver.rules; + } + + // Instruction kinds previously fell back to the raw kind string in SARIF. + const instructionRules = await rulesFor('instructions-risky'); + for (const kind of [ + 'policy_mesh.instructions_skip_confirmation', + 'policy_mesh.instructions_override_safety', + 'policy_mesh.instructions_broad_write', + 'policy_mesh.instructions_auto_version_control' + ]) { + const rule = instructionRules.find((r) => r.id === kind); + assert.ok(rule, `expected a SARIF rule for ${kind}`); + assert.notEqual(rule.shortDescription.text, kind, `${kind} should have a real description, not the bare kind`); + } + + // Baseline drift kinds too. + const baselineRules = await rulesFor('baseline-version-drift'); + const drift = baselineRules.find((r) => r.id === 'policy_mesh.baseline_version_drift'); + assert.ok(drift); + assert.notEqual(drift.shortDescription.text, 'policy_mesh.baseline_version_drift'); +}); + test('CLI emits severity-aware GitHub annotations', async () => { const repo = join(testDir, 'fixtures', 'conflicted'); diff --git a/test/fixtures/mcp-absolute-script-path/.mcp.json b/test/fixtures/mcp-absolute-script-path/.mcp.json new file mode 100644 index 0000000..38d911a --- /dev/null +++ b/test/fixtures/mcp-absolute-script-path/.mcp.json @@ -0,0 +1,8 @@ +{ + "mcpServers": { + "runner": { + "command": "node", + "args": ["/opt/policymesh/runner.js"] + } + } +} diff --git a/test/fixtures/mcp-github-sha-pinned/.mcp.json b/test/fixtures/mcp-github-sha-pinned/.mcp.json new file mode 100644 index 0000000..2beb575 --- /dev/null +++ b/test/fixtures/mcp-github-sha-pinned/.mcp.json @@ -0,0 +1,8 @@ +{ + "mcpServers": { + "tool": { + "command": "npx", + "args": ["-y", "https://github.com/owner/repo/archive/a1b2c3d4e5f60718293a4b5c6d7e8f9012345678.tar.gz"] + } + } +} diff --git a/test/heuristics.test.mjs b/test/heuristics.test.mjs index a5905b7..4f12b83 100644 --- a/test/heuristics.test.mjs +++ b/test/heuristics.test.mjs @@ -9,6 +9,10 @@ const claudeModule = await import( pathToFileURL(join(testDir, '..', 'dist', 'parsers', 'claude.js')).href ); const { isBroadAllow, isSensitiveDeny } = claudeModule; +const mcpModule = await import( + pathToFileURL(join(testDir, '..', 'dist', 'parsers', 'mcp.js')).href +); +const { isUnpinnedCommand } = mcpModule; const { matchSecret } = await import('agent-gov-core'); const exceptionsModule = await import( pathToFileURL(join(testDir, '..', 'dist', 'exceptions.js')).href @@ -90,6 +94,22 @@ test('isBroadAllow: bare Task spawn is broad; scoped Task is not', () => { assert.equal(isBroadAllow('Task(explore-codebase)'), false); }); +test('isUnpinnedCommand: a GitHub URL pinned to a commit SHA is pinned; a branch URL is not', () => { + const sha = 'a1b2c3d4e5f60718293a4b5c6d7e8f9012345678'; + // Reproducible: the 40-char commit SHA makes the install immutable. + assert.equal( + isUnpinnedCommand({ command: 'npx', args: ['-y', `https://github.com/owner/repo/archive/${sha}.tar.gz`] }), + false + ); + // Mutable: a branch reference can move under you. + assert.equal( + isUnpinnedCommand({ command: 'npx', args: ['-y', 'https://github.com/owner/repo/archive/main.tar.gz'] }), + true + ); + // @latest stays unpinned regardless of host. + assert.equal(isUnpinnedCommand({ command: 'npx', args: ['-y', 'some-pkg@latest'] }), true); +}); + test('isBroadAllow: bare Bash/Read/Write/Edit are broad; scoped forms are not', () => { // A bare tool name in Claude Code allow rules matches every use of the // tool, so it grants unrestricted shell / filesystem access.