Conalh · Conalh · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/benchmark/RESULTS.md b/benchmark/RESULTS.md
@@ -6,22 +6,22 @@ under `test/fixtures/`. Ground truth (intent, expected finding kinds, expected
 rating) is fixed by fixture design — the runner scores the audit engine against
 it. Benign cases include deliberate false-positive traps a naive detector would flag.
 
-- Cases: **31** (24 rogue, 7 benign) across **24** detector kinds
+- Cases: **33** (24 rogue, 9 benign) across **24** detector kinds
 - Detection (any finding): recall **100.0%**, false-positive rate **0.0%**, precision **100.0%**
 - At a `fail-on: high` CI gate: recall **54.2%**, false-positive rate **0.0%**, precision **100.0%**
 - Correct primary finding kind identified on **24/24** rogue cases; all expected kinds on **24/24**
-- Exact rating match where the label pins one: **21/21**
+- Exact rating match where the label pins one: **23/23**
 
 ## Confusion matrix by CI gate threshold
 
 A repo is predicted "drift" when its overall rating meets the threshold. `low` = any finding at all.
 
 | Gate (`fail-on`) | TP | FP | FN | TN | Precision | Recall | FP rate | F1 | Accuracy |
 | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
-| low | 24 | 0 | 0 | 7 | 100.0% | 100.0% | 0.0% | 100.0% | 100.0% |
-| medium | 22 | 0 | 2 | 7 | 100.0% | 91.7% | 0.0% | 95.7% | 93.5% |
-| high | 13 | 0 | 11 | 7 | 100.0% | 54.2% | 0.0% | 70.3% | 64.5% |
-| critical | 1 | 0 | 23 | 7 | 100.0% | 4.2% | 0.0% | 8.0% | 25.8% |
+| low | 24 | 0 | 0 | 9 | 100.0% | 100.0% | 0.0% | 100.0% | 100.0% |
+| medium | 22 | 0 | 2 | 9 | 100.0% | 91.7% | 0.0% | 95.7% | 93.9% |
+| high | 13 | 0 | 11 | 9 | 100.0% | 54.2% | 0.0% | 70.3% | 66.7% |
+| critical | 1 | 0 | 23 | 9 | 100.0% | 4.2% | 0.0% | 8.0% | 30.3% |
 
 ## Detector-kind identification (rogue cases)
 
@@ -31,7 +31,7 @@ A repo is predicted "drift" when its overall rating meets the threshold. `low` =
 
 ## Rating agreement
 
-Of the **21** cases whose label pins an exact consolidated rating, the audit matched **21**.
+Of the **23** cases whose label pins an exact consolidated rating, the audit matched **23**.
 
 ## Results by category
 
@@ -40,7 +40,7 @@ Of the **21** cases whose label pins an exact consolidated rating, the audit mat
 | baseline | 2 | 2/2 | — |
 | clean | 3 | — | 3/3 |
 | exceptions | 2 | 2/2 | — |
-| fp-guard | 3 | — | 3/3 |
+| fp-guard | 5 | — | 5/5 |
 | instructions | 1 | 1/1 | — |
 | mcp-drift | 10 | 10/10 | — |
 | multi-class | 1 | 1/1 | — |
@@ -64,6 +64,8 @@ None. Every rogue case produced all of its expected finding kinds, no benign cas
 | aider-safe | benign | fp-guard | none | 0 | — | — | yes |
 | codex-multiline-args | benign | fp-guard | none | 0 | — | — | — |
 | mcp-command-neutral-flag-equivalence | benign | fp-guard | none | 0 | — | — | — |
+| mcp-github-sha-pinned | benign | fp-guard | none | 0 | — | — | yes |
+| mcp-absolute-script-path | benign | fp-guard | none | 0 | — | — | yes |
 | exceptions-active | benign | suppression | none | 0 | — | — | yes |
 | conflicted | rogue | multi-class | high | 7 | yes | yes | yes |
 | claude-missing-mcp-server | rogue | mcp-drift | medium | 1 | yes | yes | yes |

diff --git a/benchmark/labels.json b/benchmark/labels.json
@@ -10,6 +10,8 @@
     { "fixture": "aider-safe", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "recursive": false, "note": "dangerously-allow-non-git is false — must not fire." },
     { "fixture": "codex-multiline-args", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": null, "recursive": false, "note": "Identical github invocation; Codex writes args across multiple TOML lines. A line-by-line parser would falsely flag mcp_command_mismatch." },
     { "fixture": "mcp-command-neutral-flag-equivalence", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": null, "recursive": false, "note": "npx -y vs npx: the neutral -y flag must canonicalize to equal, no mcp_command_mismatch." },
+    { "fixture": "mcp-github-sha-pinned", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "recursive": false, "note": "A GitHub URL pinned to a 40-char commit SHA is reproducible; it must NOT be flagged mcp_unpinned the way a branch/@latest URL is." },
+    { "fixture": "mcp-absolute-script-path", "intent": "benign", "category": "fp-guard", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "recursive": false, "note": "An absolute local script path (outside the repo) is treated as a system path, not a repo-relative script; must NOT be flagged missing_local_script." },
     { "fixture": "exceptions-active", "intent": "benign", "category": "suppression", "expectFlagged": false, "expectKinds": [], "expectRating": "none", "recursive": false, "note": "An active .policymesh-exceptions.json entry suppresses the underlying mismatch." },
 
     { "fixture": "conflicted", "intent": "rogue", "category": "multi-class", "expectFlagged": true, "expectKinds": ["policy_mesh.mcp_command_mismatch", "policy_mesh.mcp_unpinned", "policy_mesh.claude_deny_allow_overlap", "policy_mesh.claude_broad_allow_no_guard", "policy_mesh.codex_network_without_review", "policy_mesh.codex_trusted_with_risky_mcp", "policy_mesh.codex_claude_posture_gap"], "expectRating": "high", "recursive": false, "note": "Seven distinct drift classes in one repo." },

diff --git a/dist/parsers/mcp.js b/dist/parsers/mcp.js
@@ -114,8 +114,16 @@ export function isUnpinnedCommand(server) {
     if (normalized.includes('@latest')) {
         return true;
     }
-    if (/https:\/\/github\.com\/[^ ]+/.test(normalized)) {
-        return true;
+    const githubUrl = normalized.match(/https:\/\/github\.com\/[^ ]+/);
+    if (githubUrl) {
+        // A GitHub URL is unpinned UNLESS it references an immutable 40-char
+        // commit SHA (e.g. .../archive/<sha>.tar.gz, .../tree/<sha>, or
+        // git+https://...#<sha>). Branch / tag / HEAD references are mutable
+        // and stay flagged. This avoids over-flagging every GitHub URL as
+        // unpinned when a SHA already makes the install reproducible.
+        if (!/[0-9a-f]{40}/.test(githubUrl[0])) {
+            return true;
+        }
     }
     if (/\bcurl\b.+\|\s*(bash|sh)\b/.test(normalized)) {
         return true;

diff --git a/dist/report.js b/dist/report.js
@@ -370,6 +370,13 @@ function shortDescriptionForKind(kind) {
         'policy_mesh.codex_trusted_with_risky_mcp': 'Codex project trusted while MCP is unpinned or inconsistent.',
         'policy_mesh.codex_claude_posture_gap': 'Codex sandbox posture inconsistent with Claude deny rules.',
         'policy_mesh.aider_dangerous_allow_non_git': 'Aider bypasses the git-tracked audit trail.',
+        'policy_mesh.instructions_skip_confirmation': 'Instruction file directs the agent to act without asking for confirmation.',
+        'policy_mesh.instructions_override_safety': 'Instruction file directs the agent to ignore or bypass safety/deny rules.',
+        'policy_mesh.instructions_broad_write': 'Instruction file grants broad read/write access to any file or path.',
+        'policy_mesh.instructions_auto_version_control': 'Instruction file directs the agent to commit, merge, or push automatically.',
+        'policy_mesh.baseline_rating_drift': 'Audit rating exceeds the team baseline expected rating.',
+        'policy_mesh.baseline_version_drift': 'MCP server version drifts from the team baseline pin.',
+        'policy_mesh.baseline_parse_error': 'Baseline file could not be parsed.',
         'policy_mesh.config_parse_error': 'Agent config file could not be parsed.',
         'policy_mesh.exceptions_parse_error': 'Exceptions baseline file could not be parsed.'
     };

diff --git a/src/parsers/mcp.ts b/src/parsers/mcp.ts
@@ -163,8 +163,16 @@ export function isUnpinnedCommand(server: McpServerRaw): boolean {
     return true;
   }
 
-  if (/https:\/\/github\.com\/[^ ]+/.test(normalized)) {
-    return true;
+  const githubUrl = normalized.match(/https:\/\/github\.com\/[^ ]+/);
+  if (githubUrl) {
+    // A GitHub URL is unpinned UNLESS it references an immutable 40-char
+    // commit SHA (e.g. .../archive/<sha>.tar.gz, .../tree/<sha>, or
+    // git+https://...#<sha>). Branch / tag / HEAD references are mutable
+    // and stay flagged. This avoids over-flagging every GitHub URL as
+    // unpinned when a SHA already makes the install reproducible.
+    if (!/[0-9a-f]{40}/.test(githubUrl[0])) {
+      return true;
+    }
   }
 
   if (/\bcurl\b.+\|\s*(bash|sh)\b/.test(normalized)) {

diff --git a/src/report.ts b/src/report.ts
@@ -415,6 +415,13 @@ function shortDescriptionForKind(kind: string): string {
     'policy_mesh.codex_trusted_with_risky_mcp': 'Codex project trusted while MCP is unpinned or inconsistent.',
     'policy_mesh.codex_claude_posture_gap': 'Codex sandbox posture inconsistent with Claude deny rules.',
     'policy_mesh.aider_dangerous_allow_non_git': 'Aider bypasses the git-tracked audit trail.',
+    'policy_mesh.instructions_skip_confirmation': 'Instruction file directs the agent to act without asking for confirmation.',
+    'policy_mesh.instructions_override_safety': 'Instruction file directs the agent to ignore or bypass safety/deny rules.',
+    'policy_mesh.instructions_broad_write': 'Instruction file grants broad read/write access to any file or path.',
+    'policy_mesh.instructions_auto_version_control': 'Instruction file directs the agent to commit, merge, or push automatically.',
+    'policy_mesh.baseline_rating_drift': 'Audit rating exceeds the team baseline expected rating.',
+    'policy_mesh.baseline_version_drift': 'MCP server version drifts from the team baseline pin.',
+    'policy_mesh.baseline_parse_error': 'Baseline file could not be parsed.',
     'policy_mesh.config_parse_error': 'Agent config file could not be parsed.',
     'policy_mesh.exceptions_parse_error': 'Exceptions baseline file could not be parsed.'
   };

diff --git a/test/cli-output.test.mjs b/test/cli-output.test.mjs
@@ -1670,6 +1670,36 @@ test('CLI render passes through SARIF format from a saved audit JSON', async ()
   }
 });
 
+test('CLI SARIF rules carry human descriptions for instruction and baseline kinds', async () => {
+  async function rulesFor(fixture) {
+    const { stdout } = await execFileAsync(
+      process.execPath,
+      ['dist/index.js', 'audit', '--repo', join(testDir, 'fixtures', fixture), '--format', 'sarif'],
+      { cwd: packageRoot }
+    );
+    return JSON.parse(stdout).runs[0].tool.driver.rules;
+  }
+
+  // Instruction kinds previously fell back to the raw kind string in SARIF.
+  const instructionRules = await rulesFor('instructions-risky');
+  for (const kind of [
+    'policy_mesh.instructions_skip_confirmation',
+    'policy_mesh.instructions_override_safety',
+    'policy_mesh.instructions_broad_write',
+    'policy_mesh.instructions_auto_version_control'
+  ]) {
+    const rule = instructionRules.find((r) => r.id === kind);
+    assert.ok(rule, `expected a SARIF rule for ${kind}`);
+    assert.notEqual(rule.shortDescription.text, kind, `${kind} should have a real description, not the bare kind`);
+  }
+
+  // Baseline drift kinds too.
+  const baselineRules = await rulesFor('baseline-version-drift');
+  const drift = baselineRules.find((r) => r.id === 'policy_mesh.baseline_version_drift');
+  assert.ok(drift);
+  assert.notEqual(drift.shortDescription.text, 'policy_mesh.baseline_version_drift');
+});
+
 test('CLI emits severity-aware GitHub annotations', async () => {
   const repo = join(testDir, 'fixtures', 'conflicted');
 

diff --git a/test/fixtures/mcp-absolute-script-path/.mcp.json b/test/fixtures/mcp-absolute-script-path/.mcp.json
@@ -0,0 +1,8 @@
+{
+  "mcpServers": {
+    "runner": {
+      "command": "node",
+      "args": ["/opt/policymesh/runner.js"]
+    }
+  }
+}
diff --git a/test/fixtures/mcp-github-sha-pinned/.mcp.json b/test/fixtures/mcp-github-sha-pinned/.mcp.json
@@ -0,0 +1,8 @@
+{
+  "mcpServers": {
+    "tool": {
+      "command": "npx",
+      "args": ["-y", "https://github.com/owner/repo/archive/a1b2c3d4e5f60718293a4b5c6d7e8f9012345678.tar.gz"]
+    }
+  }
+}
diff --git a/test/heuristics.test.mjs b/test/heuristics.test.mjs
@@ -9,6 +9,10 @@ const claudeModule = await import(
   pathToFileURL(join(testDir, '..', 'dist', 'parsers', 'claude.js')).href
 );
 const { isBroadAllow, isSensitiveDeny } = claudeModule;
+const mcpModule = await import(
+  pathToFileURL(join(testDir, '..', 'dist', 'parsers', 'mcp.js')).href
+);
+const { isUnpinnedCommand } = mcpModule;
 const { matchSecret } = await import('agent-gov-core');
 const exceptionsModule = await import(
   pathToFileURL(join(testDir, '..', 'dist', 'exceptions.js')).href
@@ -90,6 +94,22 @@ test('isBroadAllow: bare Task spawn is broad; scoped Task is not', () => {
   assert.equal(isBroadAllow('Task(explore-codebase)'), false);
 });
 
+test('isUnpinnedCommand: a GitHub URL pinned to a commit SHA is pinned; a branch URL is not', () => {
+  const sha = 'a1b2c3d4e5f60718293a4b5c6d7e8f9012345678';
+  // Reproducible: the 40-char commit SHA makes the install immutable.
+  assert.equal(
+    isUnpinnedCommand({ command: 'npx', args: ['-y', `https://github.com/owner/repo/archive/${sha}.tar.gz`] }),
+    false
+  );
+  // Mutable: a branch reference can move under you.
+  assert.equal(
+    isUnpinnedCommand({ command: 'npx', args: ['-y', 'https://github.com/owner/repo/archive/main.tar.gz'] }),
+    true
+  );
+  // @latest stays unpinned regardless of host.
+  assert.equal(isUnpinnedCommand({ command: 'npx', args: ['-y', 'some-pkg@latest'] }), true);
+});
+
 test('isBroadAllow: bare Bash/Read/Write/Edit are broad; scoped forms are not', () => {
   // A bare tool name in Claude Code allow rules matches every use of the
   // tool, so it grants unrestricted shell / filesystem access.