diff --git a/.crane/scripts/score.go b/.crane/scripts/score.go
index da3d1a4e..ff2494c1 100644
--- a/.crane/scripts/score.go
+++ b/.crane/scripts/score.go
@@ -137,7 +137,7 @@ func computeScore(input scanInput, getenv getenvFunc) (Score, error) {
 	knownExceptions := knownExceptionsFromEnv(getenv("APM_KNOWN_EXCEPTIONS"))
 	pythonReference := BoolGate{}
 	pythonTests := BoolGate{Seen: getenv("APM_PYTHON_TESTS") != "", Passed: getenv("APM_PYTHON_TESTS") == "pass"}
-	benchmarks := BoolGate{Seen: getenv("APM_BENCHMARKS") != "", Passed: getenv("APM_BENCHMARKS") == "pass"}
+	benchmarks := RatioGate{}
 	surface := RatioGate{}
 	help := RatioGate{}
 	functional := RatioGate{}
@@ -224,25 +224,25 @@ func computeScore(input scanInput, getenv getenvFunc) (Score, error) {
 		pythonReference = BoolGate{Seen: true, Passed: testPassed(passed, failed, "TestParityCompletionHardGate") || pythonReferenceReady(getenv("APM_PYTHON_BIN"))}
 	}
 	if !surface.Seen {
-		surface = inferredAnyRatioGate(passed, failed, "TestParityCompletionSurfaceParity", "TestParitySurfaceInventory")
+		surface = missingRatioGate()
 	}
 	if !help.Seen {
-		help = inferredAllRatioGate(passed, failed, "TestParityCompletionCommandMatrix", "TestParityCompletionHelpIdentical")
+		help = missingRatioGate()
 	}
 	if !functional.Seen {
-		functional = inferredAnyRatioGate(passed, failed, "TestParityCompletionFunctionalContracts", "TestParityFunctionalContracts")
+		functional = missingRatioGate()
 	}
 	if !stateDiff.Seen {
-		stateDiff = inferredAnyRatioGate(passed, failed, "TestParityCompletionStateDiffContracts", "TestParityStateDiffContracts")
+		stateDiff = missingRatioGate()
 	}
 	if !behaviorContracts.Seen {
-		behaviorContracts = RatioGate{Seen: true, Passing: 0, Total: 1}
+		behaviorContracts = missingRatioGate()
 	}
 	if !pythonTests.Seen {
 		pythonTests = BoolGate{Seen: true, Passed: testPassed(passed, failed, "TestParityCompletionPythonSuite")}
 	}
 	if !benchmarks.Seen {
-		benchmarks = BoolGate{Seen: true, Passed: testPassed(passed, failed, "TestParityCompletionBenchmarks")}
+		benchmarks = missingRatioGate()
 	}
 
 	goTestsPass := !goTestsFailed && targetTotal > 0 && targetPassing == targetTotal
@@ -346,7 +346,7 @@ func applyGateEvent(
 	behaviorContracts *RatioGate,
 	knownExceptions *int,
 	pythonTests *BoolGate,
-	benchmarks *BoolGate,
+	benchmarks *RatioGate,
 ) {
 	switch gate.Name {
 	case "python_reference":
@@ -366,7 +366,7 @@ func applyGateEvent(
 	case "python_tests":
 		*pythonTests = BoolGate{Seen: true, Passed: gate.Passed}
 	case "benchmarks":
-		*benchmarks = BoolGate{Seen: true, Passed: gate.Passed}
+		*benchmarks = RatioGate{Seen: true, Passing: gate.Passing, Total: gate.Total}
 	}
 }
 
@@ -399,31 +399,8 @@ func testPassed(passed, failed map[string]bool, names ...string) bool {
 	return false
 }
 
-func inferredAnyRatioGate(passed, failed map[string]bool, names ...string) RatioGate {
-	for _, name := range names {
-		if failed[name] {
-			return RatioGate{Seen: true, Passing: 0, Total: 1}
-		}
-	}
-	return RatioGate{Seen: true, Passing: boolToInt(testPassed(passed, failed, names...)), Total: 1}
-}
-
-func inferredAllRatioGate(passed, failed map[string]bool, names ...string) RatioGate {
-	for _, name := range names {
-		if failed[name] {
-			return RatioGate{Seen: true, Passing: 0, Total: 1}
-		}
-	}
-	return RatioGate{Seen: true, Passing: boolToInt(allRequiredTestsPassed(passed, names...)), Total: 1}
-}
-
-func allRequiredTestsPassed(passed map[string]bool, names ...string) bool {
-	for _, name := range names {
-		if !passed[name] {
-			return false
-		}
-	}
-	return true
+func missingRatioGate() RatioGate { // placeholder for a ratio gate that never reported
+	return RatioGate{Seen: true, Passing: 0, Total: 1} // marked seen, scored 0 of 1
 }
 
 func gateResults(gates CutoverGates) []GateResult {
@@ -448,13 +425,6 @@ func passFail(ok bool) string {
 	return "fail"
 }
 
-func boolToInt(ok bool) int {
-	if ok {
-		return 1
-	}
-	return 0
-}
-
 func knownExceptionsFromEnv(raw string) int {
 	if raw == "" {
 		return 0
diff --git a/.github/workflows/migration-ci.yml b/.github/workflows/migration-ci.yml
index 3a3d197e..a8555269 100644
--- a/.github/workflows/migration-ci.yml
+++ b/.github/workflows/migration-ci.yml
@@ -4,6 +4,12 @@ on:
   pull_request:
     branches: [main]
   workflow_dispatch:
+    inputs:
+      enforce_completion:
+        description: "Fail unless migration completion gates are fully satisfied"
+        required: false
+        default: false
+        type: boolean
 
 permissions:
   contents: read
@@ -99,6 +105,18 @@ jobs:
       - name: Run Go parity tests
         shell: bash
         run: |
+          enforce_completion=false  # head.ref is attacker-controlled: test it inside the Actions expression, never paste it into bash
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.enforce_completion == true }}" = "true" ]; then
+            enforce_completion=true
+          elif [ "${{ github.event_name }}" = "pull_request" ] && [ "${{ startsWith(github.event.pull_request.head.ref, 'crane/') }}" = "true" ]; then
+            enforce_completion=true
+          fi
+
+          echo "MIGRATION_COMPLETION_ENFORCED=$enforce_completion" >> "$GITHUB_ENV"
+          if [ "$enforce_completion" = "true" ]; then
+            export APM_ENFORCE_COMPLETION_GATES=1
+          fi
+
           set +e
           go test -json ./... | tee "$RUNNER_TEMP/go-test-events.json"
           status=${PIPESTATUS[0]}
@@ -113,21 +131,38 @@ jobs:
             --coverage tests/parity/python_contract_coverage.yml \
             --allow-intentionally-incomplete \
             --summary "$RUNNER_TEMP/python-contract-coverage.md" || true
-          python - "$RUNNER_TEMP/migration-score.json" <<'PY'
+          python - "$RUNNER_TEMP/migration-score.json" "${MIGRATION_COMPLETION_ENFORCED:-false}" <<'PY'
           import json
           import sys
 
           with open(sys.argv[1], encoding="utf-8") as fh:
               score = json.load(fh)
+          enforce_completion = sys.argv[2].lower() == "true"
 
           print(json.dumps(score, indent=2, sort_keys=True))
+          if not enforce_completion:
+              print(
+                  "::notice::Non-enforcing migration evidence run; "
+                  "completion gates are enforced only for crane/* PRs and "
+                  "manual runs with enforce_completion=true."
+              )
+              raise SystemExit(0)
           if score.get("progress") != 1.0:
               raise SystemExit("progress must be 1.0 for completion parity")
           if score.get("migration_score") == 1.0 and not score.get("deletion_grade_ready"):
               raise SystemExit("migration_score 1.0 requires deletion_grade_ready")
           PY
-          test "${PYTHON_CLI_CONTRACT_STATUS:-1}" = "0"
-          test "${GO_TEST_STATUS:-1}" = "0"
+          if [ "${MIGRATION_COMPLETION_ENFORCED:-false}" = "true" ]; then
+            test "${PYTHON_CLI_CONTRACT_STATUS:-1}" = "0"
+            test "${GO_TEST_STATUS:-1}" = "0"
+          else
+            if [ "${PYTHON_CLI_CONTRACT_STATUS:-1}" != "0" ]; then
+              echo "::notice::Python behavior contract tests are incomplete in collection mode."
+            fi
+            if [ "${GO_TEST_STATUS:-1}" != "0" ]; then
+              echo "::notice::Go parity tests are incomplete in collection mode."
+            fi
+          fi
 
       - name: Upload parity evidence
         if: always()
@@ -171,13 +206,27 @@ jobs:
         run: go build -o "$RUNNER_TEMP/apm-go" ./cmd/apm
 
       - name: Run Python-vs-Go CLI benchmark
+        shell: bash
         run: |
+          enforce_completion=false  # head.ref is attacker-controlled: test it inside the Actions expression, never paste it into bash
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.enforce_completion == true }}" = "true" ]; then
+            enforce_completion=true
+          elif [ "${{ github.event_name }}" = "pull_request" ] && [ "${{ startsWith(github.event.pull_request.head.ref, 'crane/') }}" = "true" ]; then
+            enforce_completion=true
+          fi
+
+          extra_args=()
+          if [ "$enforce_completion" != "true" ]; then
+            extra_args+=(--allow-failures)
+          fi
+
           python scripts/ci/migration_cli_benchmark.py \
             --python-bin "$GITHUB_WORKSPACE/.venv/bin/apm" \
             --go-bin "$RUNNER_TEMP/apm-go" \
             --json-out "$RUNNER_TEMP/migration-cli-benchmark.json" \
             --markdown-out "$RUNNER_TEMP/migration-cli-benchmark.md" \
-            --max-ratio 5.0
+            --max-ratio 5.0 \
+            "${extra_args[@]}"
 
       - name: Run Python scaling guards
         run: uv run pytest tests/benchmarks/test_scaling_guards.py -v
diff --git a/README.md b/README.md
index d24c888c..35d8ba8f 100644
--- a/README.md
+++ b/README.md
@@ -61,10 +61,20 @@ Maintainers can dispatch the migration workflow manually:
 gh workflow run migration-ci.yml --repo githubnext/apm --ref main
 ```
 
+That default manual run collects parity and benchmark evidence without treating
+known migration gaps as a CI failure. To run the deterministic hard completion
+gate, opt in explicitly:
+
+```bash
+gh workflow run migration-ci.yml --repo githubnext/apm --ref main -f enforce_completion=true
+```
+
 After it runs, open the **Migration Benchmarks** job summary for the timing
 table. The same run uploads the `migration-benchmark-evidence` artifact with
 JSON and Markdown copies of the benchmark data. In the benchmark table, the
 `Go/Python` ratio is the Go median duration divided by the Python median
-duration: values below `1.00x` mean Go is faster. Recent smoke benchmark
-evidence for startup/help/init-style commands shows the Go CLI roughly
-`327x`-`370x` faster than the Python CLI.
+duration: values below `1.00x` mean Go is faster. The benchmark includes
+fixture-backed commands that read, write, execute, or fail against realistic APM
+project state: `apm.yml`, `apm.lock.yaml`, installed `apm_modules`, local
+`.apm` primitives, target directories, deployed prompt files, and sample source
+files.
diff --git a/cmd/apm/CUTOVER.md b/cmd/apm/CUTOVER.md
index 1b554aa1..632110e1 100644
--- a/cmd/apm/CUTOVER.md
+++ b/cmd/apm/CUTOVER.md
@@ -15,7 +15,59 @@ The Go CLI currently implements:
 - `apm init [--yes] [PROJECT_NAME]` (functional, creates apm.yml)
 - Per-command `--help` for all 26 commands (golden-file verified)
 
-Remaining commands return a "not yet fully implemented" message.
+Most remaining commands are wired at the CLI surface. That is not enough for
+cutover. A command that prints success without writing the expected files,
+mutating `apm.yml`, updating `apm.lock.yaml`, executing a script, or detecting a
+planted failure is still incomplete.
+
+## Real Criteria
+
+Every completion criterion must be backed by real command execution. The scorer
+does not infer completion from test names for `surface`, `help`, `functional`,
+`state_diff`, `python_behavior_contracts`, or `benchmarks`; each one must emit an
+explicit ratio gate.
+
+Crane must run `go test ./cmd/apm -run TestParityRealFunctionalAndStateDiffContracts -json`.
+That fixture-backed test executes the built Go `apm` binary in temporary
+projects and emits the existing completion gates directly:
+
+```json
+{"crane":"gate","name":"functional","passing":N,"total":N}
+{"crane":"gate","name":"state_diff","passing":N,"total":N}
+```
+
+Crane must also run the migration benchmark test. It executes fixture-backed
+Python-vs-Go benchmark workloads and emits:
+
+```json
+{"crane":"gate","name":"benchmarks","passing":N,"total":N}
+```
+
+A legacy boolean gate such as `{"crane":"gate","name":"benchmarks","passed":true}` is not enough.
+The benchmark report must prove that every benchmarked command produced the
+expected real artifact or output evidence.
+
+The completion criteria are command-specific:
+
+| Command area | Required proof |
+| --- | --- |
+| `init` | Creates a real `apm.yml` manifest. |
+| `install` | Installs a local package, writes `apm.lock.yaml`, and materializes installed content under `apm_modules/` or target paths. |
+| `update` | Mutates the lockfile when a dependency changes and reports a real no-op when nothing changed. |
+| `compile` | Writes target artifacts such as `.github/copilot-instructions.md` from fixture project state. |
+| `pack` / `unpack` | Writes a non-empty distributable bundle and can extract it back into a temp project. |
+| `run` / `preview` / `list` | Reads project scripts, executes or previews the selected script, and reflects the actual manifest contents. |
+| `audit` / `policy` | Fails on planted hidden Unicode, missing lockfile state, or policy violations instead of always reporting success. |
+| `mcp` / `runtime` / `plugin` / `marketplace` | Persist real manifest or config changes, not just status text. |
+| `cache` | Removes cache entries while respecting the configured cache root. |
+| `prune` / `uninstall` | Removes only files owned by stale dependencies and proves the removed paths are gone. |
+| `deps` / `outdated` / `view` / `search` | Read lockfile, marketplace, or registry fixtures and report fixture-derived results. |
+| `self-update` / `experimental` / `config` | Persist or validate real configuration state where the Python command does. |
+
+Each new command implementation should add or extend functional, state-diff, and
+benchmark fixture coverage before Crane can claim it moved the migration
+forward. Shims, dry-runs, mocks, and help-only assertions do not count as command
+completion.
 
 ## Cutover Trigger Conditions
 
@@ -27,9 +79,13 @@ are true:
    `init`, `install`, `update`, `compile`, `pack`, `run`, `audit`,
    `policy`, `mcp`, `runtime`, `targets`, `list`, `view`, `cache`,
    `deps`, `marketplace`, `uninstall`, `prune`
-3. Python-vs-Go parity tests pass for all commands in the matrix
-4. `go build ./cmd/apm` produces a single static binary
-5. CI passes on the crane PR branch (`crane/crane-migration-python-to-go-full-apm-cli-rewrite`)
+3. `TestParityRealFunctionalAndStateDiffContracts` passes every fixture-backed
+   real-command scenario and emits passing `functional` and `state_diff` gates
+4. Python-vs-Go parity tests pass for all commands in the matrix
+5. Migration benchmarks pass real fixture-backed command workloads and emit a
+   passing counted `benchmarks` gate
+6. `go build ./cmd/apm` produces a single static binary
+7. CI passes on the crane PR branch (`crane/crane-migration-python-to-go-full-apm-cli-rewrite`)
 
 ## Cutover Steps
 
diff --git a/cmd/apm/parity_completion_test.go b/cmd/apm/parity_completion_test.go
index 6ad47073..bd3d5ca2 100644
--- a/cmd/apm/parity_completion_test.go
+++ b/cmd/apm/parity_completion_test.go
@@ -10,6 +10,7 @@
 package main
 
 import (
+	"encoding/json"
 	"fmt"
 	"os"
 	"os/exec"
@@ -19,6 +20,23 @@ import (
 	"testing"
 )
 
+func completionGatesEnforced() bool { // true only when APM_ENFORCE_COMPLETION_GATES is exactly "1"
+	return os.Getenv("APM_ENFORCE_COMPLETION_GATES") == "1"
+}
+
+func completionGateFailure(t *testing.T, format string, args ...any) {
+	t.Helper()
+	if !completionGatesEnforced() {
+		t.Logf(format, args...) // collection mode: record the gap without failing the test
+		return
+	}
+	t.Fatalf(format, args...) // enforcing mode: the same message aborts the test
+}
+
+func emitCraneBoolGate(name string, passed bool) { // prints one boolean gate event line on stdout
+	fmt.Fprintf(os.Stdout, `{"crane":"gate","name":%q,"passed":%t}`+"\n", name, passed)
+}
+
 // TestParityCompletionHardGate enforces the Python-vs-Go completion gate.
 // Unlike TestParityHarnessHardGatePythonBin (which just logs), this test
 // FAILS when APM_PYTHON_BIN is not set -- ensuring score.go's correctness_gate
@@ -381,8 +399,11 @@ func TestParityCompletionPythonSuite(t *testing.T) {
 	cmd.Stdout = &outBuf
 	cmd.Stderr = &errBuf
 	if runErr := cmd.Run(); runErr != nil {
-		t.Fatalf("Python suite failed:\n%s\n%s", outBuf.String(), errBuf.String())
+		emitCraneBoolGate("python_tests", false)
+		completionGateFailure(t, "Python suite failed:\n%s\n%s", outBuf.String(), errBuf.String())
+		return
 	}
+	emitCraneBoolGate("python_tests", true)
 	t.Logf("[+] Python suite passed:\n%s", outBuf.String())
 }
 
@@ -428,12 +449,44 @@ func TestParityCompletionBenchmarks(t *testing.T) {
 	cmd.Stdout = &outBuf
 	cmd.Stderr = &errBuf
 	if runErr := cmd.Run(); runErr != nil {
-		t.Fatalf("Benchmark failed (Go CLI exceeds 5x Python latency or script error):\n%s\n%s",
+		passing, total := benchmarkGateCounts(t, jsonOut)
+		emitCraneRatioGate("benchmarks", passing, total)
+		completionGateFailure(t, "Benchmark failed (Go CLI exceeds 5x Python latency or script error):\n%s\n%s",
 			outBuf.String(), errBuf.String())
+		return
+	}
+	passing, total := benchmarkGateCounts(t, jsonOut)
+	emitCraneRatioGate("benchmarks", passing, total)
+	if passing != total {
+		completionGateFailure(t, "Benchmark artifact checks incomplete: %d/%d passed\n%s", passing, total, outBuf.String())
+		return
 	}
 	t.Logf("[+] Benchmarks passed:\n%s", outBuf.String())
 }
 
+// benchmarkGateCounts reads the benchmark JSON report at path and returns how
+// many entries of its "results" array have "passed": true, followed by the
+// number of entries. An unreadable, malformed, or empty report yields 0, 1.
+func benchmarkGateCounts(t *testing.T, path string) (int, int) {
+	t.Helper()
+	var report struct {
+		Results []struct {
+			Passed bool `json:"passed"`
+		} `json:"results"`
+	}
+	raw, readErr := os.ReadFile(path)
+	if readErr != nil || json.Unmarshal(raw, &report) != nil || len(report.Results) == 0 {
+		return 0, 1
+	}
+	passed := 0
+	for i := range report.Results {
+		if report.Results[i].Passed {
+			passed++
+		}
+	}
+	return passed, len(report.Results)
+}
+
 // runPyBin runs the Python apm binary with the given args.
 func runPyBin(t *testing.T, bin string, args ...string) (stdout, stderr string, exitCode int) {
 	t.Helper()
diff --git a/cmd/apm/python_behavior_contracts_test.go b/cmd/apm/python_behavior_contracts_test.go
index 2cc6176c..38303711 100644
--- a/cmd/apm/python_behavior_contracts_test.go
+++ b/cmd/apm/python_behavior_contracts_test.go
@@ -195,7 +195,8 @@ func TestParityCompletionPythonBehaviorContracts(t *testing.T) {
 		extract.Env = append(os.Environ(), "NO_COLOR=1", "COLUMNS=10000")
 		if out, err := extract.CombinedOutput(); err != nil {
 			emitCraneRatioGate("python_behavior_contracts", 0, 1)
-			t.Fatalf("HARD-GATE FAILED: python_behavior_contracts extraction failed: %v\n%s", err, string(out))
+			completionGateFailure(t, "HARD-GATE FAILED: python_behavior_contracts extraction failed: %v\n%s", err, string(out))
+			return
 		}
 	}
 
@@ -213,7 +214,8 @@ func TestParityCompletionPythonBehaviorContracts(t *testing.T) {
 	out, err := check.CombinedOutput()
 	if err != nil {
 		emitCraneRatioGate("python_behavior_contracts", 0, 1)
-		t.Fatalf("HARD-GATE FAILED: python_behavior_contracts coverage incomplete:\n%s", string(out))
+		completionGateFailure(t, "HARD-GATE FAILED: python_behavior_contracts coverage incomplete:\n%s", string(out))
+		return
 	}
 	emitCraneRatioGate("python_behavior_contracts", 1, 1)
 }
diff --git a/cmd/apm/real_behavior_test.go b/cmd/apm/real_behavior_test.go
new file mode 100644
index 00000000..5cd07f86
--- /dev/null
+++ b/cmd/apm/real_behavior_test.go
@@ -0,0 +1,354 @@
+package main
+
+import (
+	"bytes"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+type realBehaviorCase struct { // one CLI scenario: optional fixture setup, args/env for the binary, and a post-run check
+	name   string
+	args   []string
+	env    map[string]string
+	setup  func(t *testing.T, dir string)
+	verify func(t *testing.T, dir, stdout, stderr string, code int) bool
+}
+
+func TestParityRealFunctionalAndStateDiffContracts(t *testing.T) { // runs every case against the Go binary in a fresh temp dir, then emits the functional and state_diff ratio gates
+	cases := []realBehaviorCase{
+		{
+			name: "init creates manifest",
+			args: []string{"init", "--yes"},
+			verify: func(t *testing.T, dir, stdout, stderr string, code int) bool {
+				ok := realBehaviorExpectExit(t, stdout, stderr, code, 0)
+				ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "apm.yml"), "dependencies:") && ok
+				return ok
+			},
+		},
+		{
+			name:  "install local package materializes lock and modules",
+			args:  []string{"install", "./packages/local-tools"},
+			setup: realBehaviorSetupLocalPackage,
+			verify: func(t *testing.T, dir, stdout, stderr string, code int) bool {
+				ok := realBehaviorExpectExit(t, stdout, stderr, code, 0)
+				ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "apm.lock.yaml"), "local-tools") && ok
+				ok = realBehaviorExpectDirHasEntries(t, filepath.Join(dir, "apm_modules")) && ok
+				return ok
+			},
+		},
+		{
+			name:  "compile writes copilot target",
+			args:  []string{"compile", "--target", "copilot"},
+			setup: realBehaviorSetupProject,
+			verify: func(t *testing.T, dir, stdout, stderr string, code int) bool {
+				ok := realBehaviorExpectExit(t, stdout, stderr, code, 0)
+				ok = realBehaviorExpectFileContains(t, filepath.Join(dir, ".github", "copilot-instructions.md"), "real-behavior") && ok
+				return ok
+			},
+		},
+		{
+			name:  "pack writes distributable output",
+			args:  []string{"pack", "--output", "dist"},
+			setup: realBehaviorSetupProjectWithLock,
+			verify: func(t *testing.T, dir, stdout, stderr string, code int) bool {
+				ok := realBehaviorExpectExit(t, stdout, stderr, code, 0)
+				ok = realBehaviorExpectDirHasEntries(t, filepath.Join(dir, "dist")) && ok
+				return ok
+			},
+		},
+		{
+			name:  "run executes project script",
+			args:  []string{"run", "stamp"},
+			setup: realBehaviorSetupRunnableProject,
+			verify: func(t *testing.T, dir, stdout, stderr string, code int) bool {
+				ok := realBehaviorExpectExit(t, stdout, stderr, code, 0)
+				ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "run-stamp.txt"), "real-run") && ok
+				return ok
+			},
+		},
+		{
+			name:  "audit ci fails on planted hidden unicode",
+			args:  []string{"audit", "--ci"},
+			setup: realBehaviorSetupAuditFinding,
+			verify: func(t *testing.T, _ string, stdout, stderr string, code int) bool {
+				if code == 0 {
+					realBehaviorFailure(t, "expected non-zero exit for hidden unicode finding\nstdout: %s\nstderr: %s", stdout, stderr)
+					return false
+				}
+				return true
+			},
+		},
+		{
+			name:  "mcp install persists manifest dependency",
+			args:  []string{"mcp", "install", "example-server"},
+			setup: realBehaviorSetupProject,
+			verify: func(t *testing.T, dir, stdout, stderr string, code int) bool {
+				ok := realBehaviorExpectExit(t, stdout, stderr, code, 0)
+				ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "apm.yml"), "example-server") && ok
+				return ok
+			},
+		},
+		{
+			name: "plugin init writes plugin manifest",
+			args: []string{"plugin", "init"},
+			verify: func(t *testing.T, dir, stdout, stderr string, code int) bool {
+				ok := realBehaviorExpectExit(t, stdout, stderr, code, 0)
+				ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "plugin.json"), "\"name\"") && ok
+				ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "apm.yml"), "plugin") && ok
+				return ok
+			},
+		},
+		{
+			name: "marketplace init writes marketplace block",
+			args: []string{"marketplace", "init"},
+			verify: func(t *testing.T, dir, stdout, stderr string, code int) bool {
+				ok := realBehaviorExpectExit(t, stdout, stderr, code, 0)
+				ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "apm.yml"), "marketplace:") && ok
+				return ok
+			},
+		},
+		{
+			name:  "cache clean removes entries but preserves cache root",
+			args:  []string{"cache", "clean"},
+			env:   map[string]string{"APM_CACHE_DIR": "cache-root"},
+			setup: realBehaviorSetupCacheRoot,
+			verify: func(t *testing.T, dir, stdout, stderr string, code int) bool {
+				ok := realBehaviorExpectExit(t, stdout, stderr, code, 0)
+				cacheRoot := filepath.Join(dir, "cache-root")
+				ok = realBehaviorExpectPathExists(t, cacheRoot) && ok
+				ok = realBehaviorExpectPathMissing(t, filepath.Join(cacheRoot, "http_v1", "old", "body")) && ok
+				return ok
+			},
+		},
+		{
+			name:  "prune removes unreferenced module",
+			args:  []string{"prune"},
+			setup: realBehaviorSetupStaleModule,
+			verify: func(t *testing.T, dir, stdout, stderr string, code int) bool {
+				ok := realBehaviorExpectExit(t, stdout, stderr, code, 0)
+				ok = realBehaviorExpectPathMissing(t, filepath.Join(dir, "apm_modules", "stale-package")) && ok
+				return ok
+			},
+		},
+	}
+
+	functionalPassing := 0
+	stateDiffPassing := 0
+	defer func() { // deferred: both gates are emitted when this function returns, after the subtest loop
+		emitCraneRatioGate("functional", functionalPassing, len(cases))
+		emitCraneRatioGate("state_diff", stateDiffPassing, len(cases))
+	}()
+
+	for _, tc := range cases {
+		tc := tc // per-iteration copy for the closure (required before Go 1.22)
+		t.Run(tc.name, func(t *testing.T) {
+			dir := t.TempDir()
+			if tc.setup != nil {
+				tc.setup(t, dir)
+			}
+			stdout, stderr, code := realBehaviorRunGoInDir(t, dir, tc.env, tc.args...)
+			if tc.verify(t, dir, stdout, stderr, code) { // one verdict feeds both counters, so the two gates always report the same ratio
+				functionalPassing++
+				stateDiffPassing++
+			}
+		})
+	}
+}
+
+func realBehaviorCompletionGatesEnforced() bool { // reports whether failed checks must fail the test
+	return completionGatesEnforced() // reuse the package's single APM_ENFORCE_COMPLETION_GATES check
+}
+
+func realBehaviorFailure(t *testing.T, format string, args ...any) {
+	t.Helper()
+	if !realBehaviorCompletionGatesEnforced() {
+		t.Logf(format, args...) // collection mode: log only
+		return
+	}
+	t.Errorf(format, args...) // enforcing mode: mark the test failed and keep going
+}
+
+func realBehaviorRunGoInDir(t *testing.T, dir string, env map[string]string, args ...string) (string, string, int) {
+	t.Helper()
+	if goBinPath == "" {
+		t.Skip("Go binary not built; skipping")
+	}
+
+	// Start from the ambient environment, then append the per-case entries.
+	environ := os.Environ()
+	for k, v := range env {
+		environ = append(environ, k+"="+v)
+	}
+	var stdout, stderr bytes.Buffer
+	cmd := exec.Command(goBinPath, args...)
+	cmd.Dir, cmd.Env = dir, environ
+	cmd.Stdout, cmd.Stderr = &stdout, &stderr
+
+	exitCode := 0
+	switch runErr := cmd.Run().(type) {
+	case nil:
+	case *exec.ExitError:
+		// The binary ran and exited non-zero; hand the code to the caller.
+		exitCode = runErr.ExitCode()
+	default:
+		t.Fatalf("failed to run apm %s: %v", strings.Join(args, " "), runErr)
+	}
+	return stdout.String(), stderr.String(), exitCode
+}
+
+func realBehaviorSetupProject(t *testing.T, dir string) { // fixture: apm.yml with a copilot target and empty dependency lists, plus one .apm prompt
+	t.Helper()
+	realBehaviorWriteFile(t, filepath.Join(dir, "apm.yml"), `name: real-behavior
+version: 1.0.0
+description: Real behavior fixture
+author: Crane
+targets:
+  - copilot
+dependencies:
+  apm: []
+  mcp: []
+scripts: {}
+`)
+	realBehaviorWriteFile(t, filepath.Join(dir, ".apm", "prompts", "real-behavior.md"), "real-behavior prompt\n")
+}
+
+func realBehaviorSetupProjectWithLock(t *testing.T, dir string) { // fixture: base project plus an apm.lock.yaml that lists the local prompt as deployed
+	t.Helper()
+	realBehaviorSetupProject(t, dir)
+	realBehaviorWriteFile(t, filepath.Join(dir, "apm.lock.yaml"), `lockfile_version: "1"
+dependencies: []
+local_deployed_files:
+  - .apm/prompts/real-behavior.md
+local_deployed_file_hashes: {}
+`)
+}
+
+func realBehaviorSetupRunnableProject(t *testing.T, dir string) { // fixture: apm.yml whose "stamp" script writes real-run to run-stamp.txt
+	t.Helper()
+	realBehaviorWriteFile(t, filepath.Join(dir, "apm.yml"), `name: runnable
+version: 1.0.0
+description: Runnable fixture
+author: Crane
+targets:
+  - copilot
+dependencies:
+  apm: []
+  mcp: []
+scripts:
+  stamp: "printf real-run > run-stamp.txt"
+`)
+}
+
+func realBehaviorSetupLocalPackage(t *testing.T, dir string) { // fixture: base project plus packages/local-tools with its own apm.yml and prompt
+	t.Helper()
+	realBehaviorSetupProject(t, dir)
+	pkgDir := filepath.Join(dir, "packages", "local-tools")
+	realBehaviorWriteFile(t, filepath.Join(pkgDir, "apm.yml"), `name: local-tools
+version: 1.0.0
+description: Local tools package
+author: Crane
+targets:
+  - copilot
+dependencies:
+  apm: []
+  mcp: []
+scripts: {}
+`)
+	realBehaviorWriteFile(t, filepath.Join(pkgDir, ".apm", "prompts", "tool.md"), "local-tools prompt\n")
+}
+
+func realBehaviorSetupAuditFinding(t *testing.T, dir string) { // fixture: a lockfile-tracked module file containing U+202E
+	t.Helper()
+	realBehaviorSetupProject(t, dir) // base project only: this function writes its own apm.lock.yaml below
+	realBehaviorWriteFile(t, filepath.Join(dir, "apm_modules", "unicode-package", "SKILL.md"), "safe text \u202eevil text\n")
+	realBehaviorWriteFile(t, filepath.Join(dir, "apm.lock.yaml"), `lockfile_version: "1"
+dependencies:
+  - repo_url: local/unicode-package
+    resolved_commit: fixture
+    deployed_files:
+      - apm_modules/unicode-package/SKILL.md
+    deployed_file_hashes: {}
+`)
+}
+
+func realBehaviorSetupCacheRoot(t *testing.T, dir string) { // fixture: one cached body file under cache-root/http_v1/old
+	t.Helper()
+	realBehaviorWriteFile(t, filepath.Join(dir, "cache-root", "http_v1", "old", "body"), "cached\n")
+}
+
+func realBehaviorSetupStaleModule(t *testing.T, dir string) { // fixture: locked project with no dependencies plus an apm_modules/stale-package directory
+	t.Helper()
+	realBehaviorSetupProjectWithLock(t, dir)
+	realBehaviorWriteFile(t, filepath.Join(dir, "apm_modules", "stale-package", "README.md"), "stale\n")
+}
+
+func realBehaviorWriteFile(t *testing.T, path, content string) { // writes content to path, creating parent directories; any error aborts the test
+	t.Helper()
+	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+		t.Fatalf("failed to create parent dir for %s: %v", path, err)
+	}
+	if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
+		t.Fatalf("failed to write %s: %v", path, err)
+	}
+}
+
+func realBehaviorExpectExit(t *testing.T, stdout, stderr string, got, want int) bool {
+	t.Helper()
+	matched := got == want // the caller folds this into its overall verdict
+	if !matched {
+		realBehaviorFailure(t, "exit code = %d, want %d\nstdout: %s\nstderr: %s", got, want, stdout, stderr)
+	}
+	return matched
+}
+
+func realBehaviorExpectFileContains(t *testing.T, path, needle string) bool { // true when path is readable and contains needle
+	t.Helper()
+	raw, readErr := os.ReadFile(path)
+	switch {
+	case readErr != nil:
+		realBehaviorFailure(t, "expected file %s to exist: %v", path, readErr)
+		return false
+	case !strings.Contains(string(raw), needle):
+		realBehaviorFailure(t, "expected %s to contain %q, got:\n%s", path, needle, string(raw))
+		return false
+	}
+	return true
+}
+
+func realBehaviorExpectPathExists(t *testing.T, path string) bool { // true when os.Stat succeeds on path
+	t.Helper()
+	_, statErr := os.Stat(path)
+	if statErr != nil {
+		realBehaviorFailure(t, "expected path %s to exist: %v", path, statErr)
+	}
+	return statErr == nil
+}
+
+func realBehaviorExpectPathMissing(t *testing.T, path string) bool { // true only when os.Stat reports that path does not exist
+	t.Helper()
+	_, statErr := os.Stat(path)
+	switch {
+	case statErr == nil:
+		realBehaviorFailure(t, "expected path %s to be removed", path)
+	case !os.IsNotExist(statErr):
+		realBehaviorFailure(t, "expected path %s to be absent, got: %v", path, statErr)
+	}
+	return os.IsNotExist(statErr) // false for a nil error as well as for any other stat failure
+}
+
+func realBehaviorExpectDirHasEntries(t *testing.T, path string) bool { // true when path is a readable directory with at least one entry
+	t.Helper()
+	listing, readErr := os.ReadDir(path)
+	switch {
+	case readErr != nil:
+		realBehaviorFailure(t, "expected directory %s to exist: %v", path, readErr)
+		return false
+	case len(listing) == 0:
+		realBehaviorFailure(t, "expected directory %s to contain at least one entry", path)
+		return false
+	}
+	return true
+}
diff --git a/scripts/ci/migration_cli_benchmark.py b/scripts/ci/migration_cli_benchmark.py
index e1fc970c..ca24e57a 100644
--- a/scripts/ci/migration_cli_benchmark.py
+++ b/scripts/ci/migration_cli_benchmark.py
@@ -1,32 +1,140 @@
 #!/usr/bin/env python3
-"""Compare Python and Go CLI latency for migration smoke commands."""
+"""Compare Python and Go CLI latency for migration benchmark workloads."""
 
 from __future__ import annotations
 
 import argparse
 import json
 import os
+import re
 import statistics
 import subprocess
 import tempfile
 import time
+from dataclasses import dataclass
 from pathlib import Path
 
-COMMANDS: list[tuple[str, list[str], bool]] = [
-    ("help", ["--help"], False),
-    ("version", ["--version"], False),
-    ("compile-help", ["compile", "--help"], False),
-    ("install-help", ["install", "--help"], False),
-    ("pack-help", ["pack", "--help"], False),
-    ("audit-help", ["audit", "--help"], False),
-    ("init-yes", ["init", "--yes"], True),
+FixtureName = str
+
+
+@dataclass(frozen=True)
+class BenchmarkCommand:  # one benchmark workload: CLI arguments, fixture choice, and post-run expectations
+    name: str  # report label; also sanitized into the workspace directory name
+    args: list[str]  # arguments passed to the binary under test
+    fixture: FixtureName  # fixture key that _workspace dispatches on
+    workload: str  # description listed under "Workloads" in the markdown report
+    required_paths: tuple[str, ...] = ()  # paths, relative to the run directory, that must exist afterwards
+    stdout_contains: tuple[str, ...] = ()  # substrings that must appear in the run's stdout
+    file_contains: tuple[tuple[str, str], ...] = ()  # (relative path, substring) pairs checked after the run
+    expect_nonzero: bool = False  # True when the workload is expected to exit nonzero
+
+
+COMMANDS: list[BenchmarkCommand] = [  # workloads measured by main(), one report row each, in this order
+    BenchmarkCommand(
+        name="init scaffold",
+        args=["init", "--yes"],
+        fixture="empty-project",
+        workload="Creates a new apm.yml in an otherwise empty project directory.",
+        required_paths=("apm.yml",),
+        file_contains=(("apm.yml", "dependencies:"),),
+    ),
+    BenchmarkCommand(
+        name="targets json",
+        args=["targets", "--json"],
+        fixture="installed-project",
+        workload="Reads configured project targets from apm.yml and emits machine output.",
+        stdout_contains=("copilot",),
+    ),
+    BenchmarkCommand(
+        name="script list",
+        args=["list"],
+        fixture="installed-project",
+        workload="Reads apm.yml scripts and renders the runnable script inventory.",
+        stdout_contains=("build",),
+    ),
+    BenchmarkCommand(
+        name="deps list",
+        args=["deps", "list"],
+        fixture="installed-project",
+        workload="Scans apm_modules package directories and apm.lock.yaml metadata.",
+        stdout_contains=("microsoft/apm-package-alpha",),
+    ),
+    BenchmarkCommand(
+        name="deps tree",
+        args=["deps", "tree"],
+        fixture="installed-project",
+        workload="Builds a dependency tree from apm.lock.yaml and installed package metadata.",
+        stdout_contains=("agent-toolkit",),
+    ),
+    BenchmarkCommand(
+        name="install local package",
+        args=["install", "--no-policy", "./packages/local-tools"],
+        fixture="local-install-project",
+        workload="Installs a local package and materializes lock/module state.",
+        required_paths=("apm.lock.yaml", "apm_modules"),
+        file_contains=(("apm.lock.yaml", "local-tools"),),
+    ),
+    BenchmarkCommand(
+        name="compile copilot target",
+        args=["compile", "--target", "copilot"],
+        fixture="compilation-project",
+        workload="Discovers local primitives and writes the Copilot target artifact.",
+        required_paths=(".github/copilot-instructions.md",),
+        file_contains=((".github/copilot-instructions.md", "Benchmark Instruction"),),
+    ),
+    BenchmarkCommand(
+        name="pack output",
+        args=["pack", "--output", "dist"],
+        fixture="installed-project",
+        workload="Resolves local package contents and writes a distributable artifact.",
+        required_paths=("dist",),
+    ),
+    BenchmarkCommand(
+        name="run script",
+        args=["run", "stamp"],
+        fixture="runnable-project",
+        workload="Executes a project script and writes the script's side-effect file.",
+        required_paths=("run-stamp.txt",),
+        file_contains=(("run-stamp.txt", "real-run"),),
+    ),
+    BenchmarkCommand(
+        name="audit hidden unicode",
+        args=["audit", "--ci"],
+        fixture="audit-finding-project",
+        workload="Scans a real installed file and fails on planted hidden Unicode.",
+        expect_nonzero=True,
+    ),
 ]
 
 
-def _run_once(binary: str, args: list[str], cwd: Path, env: dict[str, str]) -> dict[str, object]:
+def _check_run(command: BenchmarkCommand, cwd: Path, stdout: str) -> list[str]:
+    """Return one message per unmet post-run expectation of ``command`` (empty when all hold)."""
+    failures: list[str] = []
+    for relpath in command.required_paths:
+        if not (cwd / relpath).exists():
+            failures.append(f"missing expected path: {relpath}")
+    for needle in command.stdout_contains:
+        if needle not in stdout:
+            failures.append(f"stdout missing {needle!r}")
+    for relpath, needle in command.file_contains:
+        path = cwd / relpath
+        # is_file(), not exists(): a directory here would make read_text raise and abort the benchmark.
+        if not path.is_file():
+            failures.append(f"missing expected file: {relpath}")
+        elif needle not in path.read_text(encoding="utf-8", errors="replace"):
+            failures.append(f"{relpath} missing {needle!r}")
+    return failures
+
+
+def _run_once(
+    binary: str,
+    command: BenchmarkCommand,
+    cwd: Path,
+    env: dict[str, str],
+) -> dict[str, object]:
     start = time.perf_counter()
     proc = subprocess.run(  # noqa: S603 -- benchmark intentionally executes supplied CLIs.
-        [binary, *args],
+        [binary, *command.args],
         cwd=cwd,
         env=env,
         text=True,
@@ -35,36 +143,312 @@ def _run_once(binary: str, args: list[str], cwd: Path, env: dict[str, str]) -> d
         check=False,
     )
     elapsed = time.perf_counter() - start
+    check_failures = _check_run(command, cwd, proc.stdout)
     return {
         "elapsed_seconds": elapsed,
         "returncode": proc.returncode,
         "stdout_bytes": len(proc.stdout.encode("utf-8")),
         "stderr_bytes": len(proc.stderr.encode("utf-8")),
+        "checks_passed": not check_failures,
+        "check_failures": check_failures,
     }
 
 
-def _workspace(base: Path, name: str, run_index: int) -> Path:
-    workdir = base / name / str(run_index)
+def _write(path: Path, content: str) -> None:  # write content as UTF-8, creating missing parent directories
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(content, encoding="utf-8")
+
+
+def _safe_name(name: str) -> str:  # turn each run of characters outside [a-zA-Z0-9_.-] into "-", then trim edge dashes
+    return re.compile(r"[^a-zA-Z0-9_.-]+").sub("-", name).strip("-")
+
+
+def _write_empty_project(workdir: Path) -> None:  # this fixture consists of a README.md only
+    _write(workdir / "README.md", "# Benchmark fixture\n")
+
+
+def _write_installed_project(workdir: Path) -> None:  # build the "installed-project" fixture under workdir
+    _write_empty_project(workdir)  # start from the README-only fixture
+    for directory in [
+        ".github",
+        ".claude",
+        ".cursor/rules",
+        ".codex",
+        "src/apm_bench",
+        ".apm/instructions",
+        ".apm/chatmodes",
+        "apm_modules/microsoft/apm-package-alpha/.apm/instructions",
+        "apm_modules/github/agent-toolkit/.apm/instructions",
+    ]:
+        (workdir / directory).mkdir(parents=True, exist_ok=True)
+
+    _write(
+        workdir / "apm.yml",
+        """name: benchmark-project
+version: 1.2.3
+description: Realistic migration benchmark fixture
+author: benchmark
+targets:
+  - copilot
+  - claude
+  - cursor
+dependencies:
+  apm:
+    - microsoft/apm-package-alpha#v1.0.0
+    - github/agent-toolkit#v2.3.4
+  mcp: []
+scripts:
+  build: Build benchmark artifacts
+  test: Run the test suite
+  lint: Run lint checks
+  release: Prepare release artifacts
+includes: auto
+""",
+    )
+    _write(
+        workdir / "apm.lock.yaml",
+        """lockfile_version: "1"
+generated_at: "2026-01-01T00:00:00+00:00"
+apm_version: benchmark
+dependencies:
+  - repo_url: microsoft/apm-package-alpha
+    resolved_ref: v1.0.0
+    resolved_commit: "1111111111111111111111111111111111111111"
+    version: 1.0.0
+    package_type: instructions
+    deployed_files:
+      - .github/copilot-instructions.md
+  - repo_url: github/agent-toolkit
+    resolved_ref: v2.3.4
+    resolved_commit: "2222222222222222222222222222222222222222"
+    version: 2.3.4
+    depth: 2
+    resolved_by: microsoft/apm-package-alpha
+    package_type: instructions
+    deployed_files:
+      - CLAUDE.md
+local_deployed_files:
+  - .github/copilot-instructions.md
+  - CLAUDE.md
+  - .cursor/rules/AGENTS.md
+""",
+    )
+    _write(
+        workdir / ".github/copilot-instructions.md",
+        "# Copilot Benchmark Instructions\n\nUse the local benchmark context.\n",
+    )
+    _write(
+        workdir / "CLAUDE.md",
+        "# Claude Benchmark Instructions\n\nUse the local benchmark context.\n",
+    )
+    _write(
+        workdir / ".cursor/rules/AGENTS.md",
+        "# Cursor Benchmark Instructions\n\nUse the local benchmark context.\n",
+    )
+
+    for index in range(16):  # local instruction primitives bench-00 .. bench-15
+        _write(
+            workdir / f".apm/instructions/bench-{index:02d}.instructions.md",
+            f"""---
+applyTo: "src/**/*.py"
+description: Benchmark instruction {index}
+---
+# Benchmark Instruction {index}
+
+Keep implementation clear and tested.
+
+- Check input boundaries.
+- Prefer small functions.
+- Leave useful diagnostics for failures.
+""",
+        )
+    for index in range(2):  # local chatmode primitives reviewer-0 and reviewer-1
+        _write(
+            workdir / f".apm/chatmodes/reviewer-{index}.chatmode.md",
+            f"""---
+description: Review benchmark fixture {index}
+---
+# Reviewer {index}
+
+Review for correctness, maintainability, and test coverage.
+""",
+        )
+    for index in range(24):  # sample Python source modules module_00 .. module_23
+        _write(
+            workdir / f"src/apm_bench/module_{index:02d}.py",
+            f'"""Benchmark source module {index}."""\n\nVALUE_{index} = {index}\n',
+        )
+
+    packages = [
+        ("microsoft", "apm-package-alpha", "1.0.0"),
+        ("github", "agent-toolkit", "2.3.4"),
+    ]
+    for owner, repo, version in packages:  # a manifest and one instruction file per package under apm_modules
+        package_dir = workdir / "apm_modules" / owner / repo
+        _write(
+            package_dir / "apm.yml",
+            f"""name: {repo}
+version: {version}
+description: Fixture dependency package
+author: benchmark
+dependencies:
+  apm: []
+  mcp: []
+""",
+        )
+        _write(
+            package_dir / f".apm/instructions/{repo}.instructions.md",
+            f"""---
+applyTo: "**/*"
+description: Installed package instruction for {repo}
+---
+# {repo}
+
+Installed dependency instruction used by migration benchmarks.
+""",
+        )
+
+
+def _write_compilation_project(workdir: Path) -> None:  # README, a copilot-only manifest, and one local instruction
+    _write_empty_project(workdir)
+    _write(
+        workdir / "apm.yml",
+        """name: compilation-project
+version: 1.0.0
+description: Compilation benchmark fixture
+author: benchmark
+targets:
+  - copilot
+dependencies:
+  apm: []
+  mcp: []
+scripts: {}
+includes: auto
+""",
+    )
+    _write(
+        workdir / ".apm/instructions/bench.instructions.md",
+        """---
+applyTo: "**/*"
+description: Benchmark Instruction
+---
+# Benchmark Instruction
+
+This content must be compiled into a target artifact.
+""",
+    )
+
+
+def _write_local_install_project(workdir: Path) -> None:  # project manifest plus a package under packages/local-tools
+    _write_empty_project(workdir)
+    _write(
+        workdir / "apm.yml",
+        """name: local-install-project
+version: 1.0.0
+description: Local install benchmark fixture
+author: benchmark
+targets:
+  - copilot
+dependencies:
+  apm: []
+  mcp: []
+scripts: {}
+""",
+    )
+    package_dir = workdir / "packages" / "local-tools"  # same relative path the install benchmark passes as its argument
+    _write(
+        package_dir / "apm.yml",
+        """name: local-tools
+version: 1.0.0
+description: Local tools package
+author: benchmark
+targets:
+  - copilot
+dependencies:
+  apm: []
+  mcp: []
+scripts: {}
+""",
+    )
+    _write(package_dir / ".apm/instructions/tool.instructions.md", "# Local tools\n")
+
+
+def _write_runnable_project(workdir: Path) -> None:  # manifest whose "stamp" script writes real-run to run-stamp.txt
+    _write_empty_project(workdir)
+    _write(
+        workdir / "apm.yml",
+        """name: runnable-project
+version: 1.0.0
+description: Runnable benchmark fixture
+author: benchmark
+targets:
+  - copilot
+dependencies:
+  apm: []
+  mcp: []
+scripts:
+  stamp: "printf real-run > run-stamp.txt"
+""",
+    )
+
+
+def _write_audit_finding_project(workdir: Path) -> None:  # installed-project fixture plus one file with a hidden character
+    _write_installed_project(workdir)
+    _write(
+        workdir / "apm_modules/unicode-package/SKILL.md",
+        "safe text \u202eevil text\n",  # U+202E (RIGHT-TO-LEFT OVERRIDE) is the planted hidden character
+    )
+    _write(  # replaces the apm.lock.yaml that _write_installed_project wrote above
+        workdir / "apm.lock.yaml",
+        """lockfile_version: "1"
+dependencies:
+  - repo_url: local/unicode-package
+    resolved_commit: fixture
+    deployed_files:
+      - apm_modules/unicode-package/SKILL.md
+    deployed_file_hashes: {}
+""",
+    )
+
+
+def _workspace(base: Path, command: BenchmarkCommand, run_index: int) -> Path:
+    """Return the directory for one run, building the command's fixture in a per-run subdirectory."""
+    if command.fixture == "none":
+        return base
+    workdir = base / _safe_name(command.name) / str(run_index)
     workdir.mkdir(parents=True, exist_ok=True)
-    (workdir / "README.md").write_text("# Benchmark fixture\n", encoding="utf-8")
+
+    # Each fixture key maps to the writer that populates the run directory;
+    # an unrecognised key is rejected only after the directory has been created.
+    fixture_writers = {
+        "empty-project": _write_empty_project,
+        "installed-project": _write_installed_project,
+        "compilation-project": _write_compilation_project,
+        "local-install-project": _write_local_install_project,
+        "runnable-project": _write_runnable_project,
+        "audit-finding-project": _write_audit_finding_project,
+    }
+    populate = fixture_writers.get(command.fixture)
+    if populate is None:
+        raise ValueError(f"unknown benchmark fixture: {command.fixture}")
+    populate(workdir)
+
     return workdir
 
 
 def _measure(
     *,
     binary: str,
-    args: list[str],
-    mutates_workspace: bool,
+    command: BenchmarkCommand,
     repeats: int,
     base: Path,
-    label: str,
     env: dict[str, str],
 ) -> dict[str, object]:
     base.mkdir(parents=True, exist_ok=True)
     samples: list[dict[str, object]] = []
     for index in range(repeats):
-        cwd = _workspace(base, label, index) if mutates_workspace else base
-        samples.append(_run_once(binary, args, cwd, env))
+        cwd = _workspace(base, command, index)
+        samples.append(_run_once(binary, command, cwd, env))
 
     elapsed = [float(sample["elapsed_seconds"]) for sample in samples]
     return {
@@ -72,6 +456,12 @@ def _measure(
         "min_seconds": min(elapsed),
         "max_seconds": max(elapsed),
         "returncodes": sorted({int(sample["returncode"]) for sample in samples}),
+        "checks_passed": all(bool(sample["checks_passed"]) for sample in samples),
+        "check_failures": [
+            failure
+            for sample in samples
+            for failure in sample.get("check_failures", [])
+        ],
         "samples": samples,
     }
 
@@ -90,15 +480,25 @@ def _markdown(results: list[dict[str, object]], max_ratio: float) -> str:
     lines = [
         "## Migration CLI Benchmark",
         "",
+        "Includes fixture-backed commands that must read, write, execute, or fail "
+        "against real project state. "
+        "The installed-project fixture contains apm.yml, apm.lock.yaml, "
+        "apm_modules packages, local .apm primitives, target directories, "
+        "deployed prompt files, and sample source files.",
+        "The harness checks return-code parity for each command. Detailed stdout/stderr "
+        "byte counts are kept in the JSON samples, but this is not an output-parity test.",
+        "",
         f"Max allowed Go/Python median ratio: `{max_ratio:.2f}`",
         "",
-        "| Command | Python median | Go median | Go/Python | Result | Return codes |",
-        "|---|---:|---:|---:|---|---|",
+        "| Benchmark | Command | Fixture | Python median | Go median | Go/Python | Result | Return codes |",
+        "|---|---|---|---:|---:|---:|---|---|",
     ]
     for row in results:
         lines.append(
-            "| {command} | {python:.4f}s | {go:.4f}s | {ratio:.2f}x | {result} | {codes} |".format(
+            "| {name} | `{command}` | {fixture} | {python:.4f}s | {go:.4f}s | {ratio:.2f}x | {result} | {codes} |".format(
+                name=row["name"],
                 command=row["command"],
+                fixture=row["fixture"],
                 python=row["python_median_seconds"],
                 go=row["go_median_seconds"],
                 ratio=row["ratio"],
@@ -106,6 +506,9 @@ def _markdown(results: list[dict[str, object]], max_ratio: float) -> str:
                 codes=row["returncodes"],
             )
         )
+    lines.extend(["", "### Workloads", ""])
+    for row in results:
+        lines.append(f"- **{row['name']}**: {row['workload']}")
     lines.append("")
     return "\n".join(lines)
 
@@ -118,6 +521,11 @@ def main() -> int:
     parser.add_argument("--markdown-out", required=True)
     parser.add_argument("--max-ratio", type=float, default=5.0)
     parser.add_argument("--repeats", type=int, default=5)
+    parser.add_argument(
+        "--allow-failures",
+        action="store_true",
+        help="Write benchmark evidence without returning a failing exit code.",
+    )
     args = parser.parse_args()
 
     env = os.environ.copy()
@@ -133,23 +541,19 @@ def main() -> int:
     failures: list[str] = []
     with tempfile.TemporaryDirectory(prefix="apm-migration-bench-") as tmp:
         base = Path(tmp)
-        for command, command_args, mutates_workspace in COMMANDS:
+        for command in COMMANDS:
             python_result = _measure(
                 binary=args.python_bin,
-                args=command_args,
-                mutates_workspace=mutates_workspace,
+                command=command,
                 repeats=args.repeats,
-                base=base / "python" / command,
-                label=command,
+                base=base / "python" / _safe_name(command.name),
                 env=env,
             )
             go_result = _measure(
                 binary=args.go_bin,
-                args=command_args,
-                mutates_workspace=mutates_workspace,
+                command=command,
                 repeats=args.repeats,
-                base=base / "go" / command,
-                label=command,
+                base=base / "go" / _safe_name(command.name),
                 env=env,
             )
 
@@ -160,25 +564,44 @@ def main() -> int:
                 "python": python_result["returncodes"],
                 "go": go_result["returncodes"],
             }
+            row_failures: list[str] = []
+            if python_result["returncodes"] != go_result["returncodes"]:
+                row_failures.append(f"return codes differ: {returncodes}")
+            if ratio > args.max_ratio:
+                row_failures.append(
+                    f"Go median {ratio:.2f}x slower than Python "
+                    f"(limit {args.max_ratio:.2f}x)"
+                )
+            if not python_result["checks_passed"]:
+                row_failures.append(
+                    f"Python artifact checks failed: {python_result['check_failures']}"
+                )
+            if not go_result["checks_passed"]:
+                row_failures.append(f"Go artifact checks failed: {go_result['check_failures']}")
+            if command.expect_nonzero:
+                if all(code == 0 for code in python_result["returncodes"]):
+                    row_failures.append("Python returned success for expected failure workload")
+                if all(code == 0 for code in go_result["returncodes"]):
+                    row_failures.append("Go returned success for expected failure workload")
 
             row = {
-                "command": " ".join(command_args),
+                "name": command.name,
+                "command": " ".join(command.args),
+                "fixture": command.fixture,
+                "workload": command.workload,
                 "python": python_result,
                 "go": go_result,
                 "python_median_seconds": python_median,
                 "go_median_seconds": go_median,
                 "ratio": ratio,
                 "returncodes": returncodes,
+                "passed": not row_failures,
+                "failures": row_failures,
             }
             results.append(row)
 
-            if python_result["returncodes"] != go_result["returncodes"]:
-                failures.append(f"{command}: return codes differ: {returncodes}")
-            if ratio > args.max_ratio:
-                failures.append(
-                    f"{command}: Go median {ratio:.2f}x slower than Python "
-                    f"(limit {args.max_ratio:.2f}x)"
-                )
+            for failure in row_failures:
+                failures.append(f"{command.name}: {failure}")
 
     json_path = Path(args.json_out)
     markdown_path = Path(args.markdown_out)
@@ -189,9 +612,11 @@ def main() -> int:
 
     print(markdown_path.read_text(encoding="utf-8"))
     if failures:
+        annotation = "warning" if args.allow_failures else "error"
         for failure in failures:
-            print(f"::error::{failure}")
-        return 1
+            print(f"::{annotation}::{failure}")
+        if not args.allow_failures:
+            return 1
     return 0
 
 
diff --git a/tests/parity/python_contract_coverage.yml b/tests/parity/python_contract_coverage.yml
index ef69606f..b571f688 100644
--- a/tests/parity/python_contract_coverage.yml
+++ b/tests/parity/python_contract_coverage.yml
@@ -20219,12 +20219,18 @@ python_tests:
   - tests/unit/test_crane_score.py::test_crane_score_reaches_one_with_completion_tests_and_explicit_behavior_gate
   - tests/unit/test_crane_score.py::test_crane_score_does_not_infer_behavior_contracts_from_test_name
   - tests/unit/test_crane_score.py::test_crane_score_blocks_incomplete_behavior_contract_gate
+  - tests/unit/test_crane_score.py::test_crane_score_reaches_one_with_completion_tests_and_explicit_real_gates
+  - tests/unit/test_crane_score.py::test_crane_score_does_not_infer_completion_gates_from_test_names
+  - tests/unit/test_crane_score.py::test_crane_score_blocks_incomplete_real_functional_gate
+  - tests/unit/test_crane_score.py::test_crane_score_blocks_legacy_benchmark_bool_without_real_counts
   - tests/unit/test_crane_score.py::test_crane_score_blocks_known_exceptions
   - tests/unit/test_crane_workflow_prompt.py::test_crane_acceptance_requires_shared_iteration_summary_for_pr_updates
   - tests/unit/test_crane_workflow_prompt.py::test_crane_commit_guidance_provides_structured_summary_fallback
   - tests/unit/test_crane_workflow_prompt.py::test_crane_prompt_blocks_stale_completed_state_from_finishing
   - tests/unit/test_crane_workflow_prompt.py::test_crane_completion_is_two_phase_and_pr_head_gated
   - tests/unit/test_crane_workflow_prompt.py::test_crane_state_template_tracks_completion_candidate_gate
+  - tests/unit/test_migration_ci_workflow.py::test_migration_ci_enforces_completion_for_crane_prs_and_explicit_manual_runs
+  - tests/unit/test_migration_ci_workflow.py::test_migration_ci_collects_incomplete_evidence_for_non_crane_prs
   - tests/unit/test_cursor_mcp.py::TestCursorClientFactory::test_create_cursor_client
   - tests/unit/test_cursor_mcp.py::TestCursorClientFactory::test_create_cursor_client_case_insensitive
   - tests/unit/test_cursor_mcp.py::TestCursorClientAdapter::test_config_path_is_repo_local
diff --git a/tests/unit/test_crane_score.py b/tests/unit/test_crane_score.py
index 55f20713..455d26c8 100644
--- a/tests/unit/test_crane_score.py
+++ b/tests/unit/test_crane_score.py
@@ -76,7 +76,7 @@ def _deletion_gates() -> list[str]:
         '{"crane":"gate","name":"python_behavior_contracts","passing":1,"total":1}',
         '{"crane":"gate","name":"known_exceptions","count":0}',
         '{"crane":"gate","name":"python_tests","passed":true}',
-        '{"crane":"gate","name":"benchmarks","passed":true}',
+        '{"crane":"gate","name":"benchmarks","passing":1,"total":1}',
     ]
 
 
@@ -111,6 +111,22 @@ def _behavior_contract_gate_output(passing: int, total: int) -> str:
     )
 
 
+def _ratio_gate_output(test: str, name: str, passing: int, total: int) -> str:
+    """Return the ``_event("output", test, ...)`` record for a ratio gate.
+
+    The event's output is the JSON gate object (``crane``, ``name``,
+    ``passing``, ``total``) serialized on one line and newline-terminated.
+    """
+    payload = {
+        "crane": "gate",
+        "name": name,
+        "passing": passing,
+        "total": total,
+    }
+    gate_line = json.dumps(payload) + "\n"
+    return _event("output", test, output=gate_line)
+
+
 def _gates(score: dict[str, object]) -> dict[str, dict[str, object]]:
     gates = score["gates"]
     assert isinstance(gates, list)
@@ -183,7 +199,7 @@ def test_crane_score_can_reach_one_with_all_deletion_grade_gates() -> None:
         '{"crane":"gate","name":"python_behavior_contracts","passing":0,"total":1}',
         '{"crane":"gate","name":"known_exceptions","count":1}',
         '{"crane":"gate","name":"python_tests","passed":false}',
-        '{"crane":"gate","name":"benchmarks","passed":false}',
+        '{"crane":"gate","name":"benchmarks","passing":0,"total":1}',
     ],
 )
 def test_crane_score_full_parity_but_bad_deletion_gate_cannot_reach_one(
@@ -236,12 +252,12 @@ def test_crane_score_rejects_empty_event_stream() -> None:
     assert "empty or incomplete" in result.stderr
 
 
-def test_crane_score_reaches_one_with_completion_tests_and_explicit_behavior_gate() -> None:
+def test_crane_score_reaches_one_with_completion_tests_and_explicit_real_gates() -> None:
     score = _run_score(
         [
             *_parity_passes(293),
             *_completion_gate_events(),
-            _behavior_contract_gate_output(1, 1),
+            *_deletion_gates(),
             _package_pass(),
         ]
     )
@@ -252,21 +268,30 @@ def test_crane_score_reaches_one_with_completion_tests_and_explicit_behavior_gat
     assert all(gate["passing"] for gate in _gates(score).values())
 
 
-def test_crane_score_does_not_infer_behavior_contracts_from_test_name() -> None:
+def test_crane_score_does_not_infer_completion_gates_from_test_names() -> None:
     score = _run_score([*_parity_passes(293), *_completion_gate_events(), _package_pass()])
     gates = _gates(score)
 
     assert score["progress"] == 1.0
     assert score["migration_score"] < 1.0
     assert score["deletion_grade_ready"] is False
+    assert gates["functional_contracts"]["passing"] is False
+    assert gates["state_diff_contracts"]["passing"] is False
     assert gates["python_behavior_contracts"]["passing"] is False
+    assert gates["benchmarks_pass"]["passing"] is False
 
 
 def test_crane_score_blocks_incomplete_behavior_contract_gate() -> None:
+    gates = [
+        line
+        for line in _deletion_gates()
+        if json.loads(line)["name"] != "python_behavior_contracts"
+    ]
     score = _run_score(
         [
             *_parity_passes(293),
             *_completion_gate_events(),
+            *gates,
             _behavior_contract_gate_output(0, 1),
             _package_pass(),
         ]
@@ -279,11 +304,55 @@ def test_crane_score_blocks_incomplete_behavior_contract_gate() -> None:
     assert gates["python_behavior_contracts"]["passing"] is False
 
 
+def test_crane_score_blocks_incomplete_real_functional_gate() -> None:
+    """A 0/1 ``functional`` ratio gate must keep the migration score below 1.0."""
+    # Filter out any stock "functional" gate line before adding the failing 0/1 one.
+    other_gates = [line for line in _deletion_gates() if json.loads(line)["name"] != "functional"]
+    failing_functional = _ratio_gate_output(
+        "TestParityRealFunctionalAndStateDiffContracts", "functional", 0, 1
+    )
+    score = _run_score(
+        [
+            *_parity_passes(293),
+            *_completion_gate_events(),
+            *other_gates,
+            failing_functional,
+            _package_pass(),
+        ]
+    )
+    gate_status = _gates(score)
+
+    assert score["progress"] == 1.0
+    assert score["migration_score"] < 1.0
+    assert score["deletion_grade_ready"] is False
+    assert gate_status["functional_contracts"]["passing"] is False
+
+
+def test_crane_score_blocks_legacy_benchmark_bool_without_real_counts() -> None:
+    """A ``benchmarks`` gate given only as ``passed: true`` must not count as passing."""
+    other_gates = [line for line in _deletion_gates() if json.loads(line)["name"] != "benchmarks"]
+    events = [
+        *_parity_passes(293),
+        *_completion_gate_events(),
+        *other_gates,
+        '{"crane":"gate","name":"benchmarks","passed":true}',
+        _package_pass(),
+    ]
+    score = _run_score(events)
+    gate_status = _gates(score)
+
+    assert score["progress"] == 1.0
+    assert score["migration_score"] < 1.0
+    assert score["deletion_grade_ready"] is False
+    assert gate_status["benchmarks_pass"]["passing"] is False
+
+
 def test_crane_score_blocks_known_exceptions() -> None:
     score = _run_score(
         [
             *_parity_passes(293),
             *_completion_gate_events(),
+            *_deletion_gates(),
             _event("output", "TestParityCompletionHelpIdentical", output="APPROVED-EXCEPTION: no"),
             _package_pass(),
         ]
diff --git a/tests/unit/test_mcp_integrator_install_hermetic.py b/tests/unit/test_mcp_integrator_install_hermetic.py
index 0e4f5c18..14da2bab 100644
--- a/tests/unit/test_mcp_integrator_install_hermetic.py
+++ b/tests/unit/test_mcp_integrator_install_hermetic.py
@@ -453,6 +453,10 @@ def test_all_excluded_warns_and_returns_zero(self, tmp_path):
                 "apm_cli.runtime.utils.find_runtime_binary",
                 return_value=None,
             ),
+            patch(
+                "apm_cli.integration.mcp_integrator_install.find_runtime_binary",
+                return_value=None,
+            ),
             patch(
                 "apm_cli.integration.mcp_integrator.MCPIntegrator._gate_project_scoped_runtimes",
                 side_effect=lambda rts, **kw: rts,
diff --git a/tests/unit/test_mcp_integrator_install_phase3w4.py b/tests/unit/test_mcp_integrator_install_phase3w4.py
index 1ccb87d7..c2e3a93a 100644
--- a/tests/unit/test_mcp_integrator_install_phase3w4.py
+++ b/tests/unit/test_mcp_integrator_install_phase3w4.py
@@ -453,6 +453,10 @@ def test_all_excluded_warns_and_returns_zero(self, tmp_path):
                 "apm_cli.runtime.utils.find_runtime_binary",
                 return_value=None,
             ),
+            patch(
+                "apm_cli.integration.mcp_integrator_install.find_runtime_binary",
+                return_value=None,
+            ),
             patch(
                 "apm_cli.integration.mcp_integrator.MCPIntegrator._gate_project_scoped_runtimes",
                 side_effect=lambda rts, **kw: rts,
diff --git a/tests/unit/test_migration_ci_workflow.py b/tests/unit/test_migration_ci_workflow.py
new file mode 100644
index 00000000..9b9fde79
--- /dev/null
+++ b/tests/unit/test_migration_ci_workflow.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[2]
+WORKFLOW = ROOT / ".github" / "workflows" / "migration-ci.yml"
+
+
+def _workflow_text() -> str:  # full UTF-8 text of .github/workflows/migration-ci.yml
+    return WORKFLOW.read_text(encoding="utf-8")
+
+
+def test_migration_ci_enforces_completion_for_crane_prs_and_explicit_manual_runs() -> None:
+    """Check that the workflow text contains each completion-enforcement snippet asserted below."""
+    text = _workflow_text()
+    assert "enforce_completion:" in text
+    assert "MIGRATION_COMPLETION_ENFORCED=$enforce_completion" in text
+    assert "APM_ENFORCE_COMPLETION_GATES=1" in text
+    assert "inputs.enforce_completion == true" in text
+    assert 'github.event.pull_request.head.ref }}" == crane/*' in text
+    assert "manual runs with enforce_completion=true" in text
+
+
+def test_migration_ci_collects_incomplete_evidence_for_non_crane_prs() -> None:
+    """Check that the workflow text contains the collection-mode (non-enforcing) snippets asserted below."""
+    text = _workflow_text()
+    assert "--allow-failures" in text
+    assert "Non-enforcing migration evidence run" in text
+    assert "Python behavior contract tests are incomplete in collection mode." in text
+    assert "Go parity tests are incomplete in collection mode." in text
diff --git a/tests/unit/test_readme_go_cli_migration.py b/tests/unit/test_readme_go_cli_migration.py
index ce828eb7..e1f51c37 100644
--- a/tests/unit/test_readme_go_cli_migration.py
+++ b/tests/unit/test_readme_go_cli_migration.py
@@ -22,7 +22,8 @@ def test_readme_documents_go_cli_migration_usage() -> None:
         "gh workflow run migration-ci.yml --repo githubnext/apm --ref main",
         "`migration-benchmark-evidence`",
         "`Go/Python` ratio is the Go median duration divided by the Python median",
-        "`327x`-`370x` faster",
+        "fixture-backed commands that read, write, execute, or fail against realistic APM",
+        "`apm.yml`, `apm.lock.yaml`, installed `apm_modules`",
     ]
 
     for snippet in required_snippets:
diff --git a/tests/unit/test_runtime_windows.py b/tests/unit/test_runtime_windows.py
index bfcab7e1..98fff4e8 100644
--- a/tests/unit/test_runtime_windows.py
+++ b/tests/unit/test_runtime_windows.py
@@ -261,6 +261,7 @@ def test_execute_runtime_command_uses_shlex_on_unix(self):
 
         with (
             patch("sys.platform", "linux"),
+            patch("apm_cli.core.script_runner.find_runtime_binary", return_value=None),
             patch("subprocess.run", return_value=MagicMock(returncode=0)) as mock_run,
         ):
             runner._execute_runtime_command("codex --quiet", "prompt content", env)