githubnext · mrjf · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/.crane/scripts/score.go b/.crane/scripts/score.go
@@ -137,7 +137,7 @@ func computeScore(input scanInput, getenv getenvFunc) (Score, error) {
 	knownExceptions := knownExceptionsFromEnv(getenv("APM_KNOWN_EXCEPTIONS"))
 	pythonReference := BoolGate{}
 	pythonTests := BoolGate{Seen: getenv("APM_PYTHON_TESTS") != "", Passed: getenv("APM_PYTHON_TESTS") == "pass"}
-	benchmarks := BoolGate{Seen: getenv("APM_BENCHMARKS") != "", Passed: getenv("APM_BENCHMARKS") == "pass"}
+	benchmarks := RatioGate{}
 	surface := RatioGate{}
 	help := RatioGate{}
 	functional := RatioGate{}
@@ -224,25 +224,25 @@ func computeScore(input scanInput, getenv getenvFunc) (Score, error) {
 		pythonReference = BoolGate{Seen: true, Passed: testPassed(passed, failed, "TestParityCompletionHardGate") || pythonReferenceReady(getenv("APM_PYTHON_BIN"))}
 	}
 	if !surface.Seen {
-		surface = inferredAnyRatioGate(passed, failed, "TestParityCompletionSurfaceParity", "TestParitySurfaceInventory")
+		surface = missingRatioGate()
 	}
 	if !help.Seen {
-		help = inferredAllRatioGate(passed, failed, "TestParityCompletionCommandMatrix", "TestParityCompletionHelpIdentical")
+		help = missingRatioGate()
 	}
 	if !functional.Seen {
-		functional = inferredAnyRatioGate(passed, failed, "TestParityCompletionFunctionalContracts", "TestParityFunctionalContracts")
+		functional = missingRatioGate()
 	}
 	if !stateDiff.Seen {
-		stateDiff = inferredAnyRatioGate(passed, failed, "TestParityCompletionStateDiffContracts", "TestParityStateDiffContracts")
+		stateDiff = missingRatioGate()
 	}
 	if !behaviorContracts.Seen {
-		behaviorContracts = RatioGate{Seen: true, Passing: 0, Total: 1}
+		behaviorContracts = missingRatioGate()
 	}
 	if !pythonTests.Seen {
 		pythonTests = BoolGate{Seen: true, Passed: testPassed(passed, failed, "TestParityCompletionPythonSuite")}
 	}
 	if !benchmarks.Seen {
-		benchmarks = BoolGate{Seen: true, Passed: testPassed(passed, failed, "TestParityCompletionBenchmarks")}
+		benchmarks = missingRatioGate()
 	}
 
 	goTestsPass := !goTestsFailed && targetTotal > 0 && targetPassing == targetTotal
@@ -346,7 +346,7 @@ func applyGateEvent(
 	behaviorContracts *RatioGate,
 	knownExceptions *int,
 	pythonTests *BoolGate,
-	benchmarks *BoolGate,
+	benchmarks *RatioGate,
 ) {
 	switch gate.Name {
 	case "python_reference":
@@ -366,7 +366,7 @@ func applyGateEvent(
 	case "python_tests":
 		*pythonTests = BoolGate{Seen: true, Passed: gate.Passed}
 	case "benchmarks":
-		*benchmarks = BoolGate{Seen: true, Passed: gate.Passed}
+		*benchmarks = RatioGate{Seen: true, Passing: gate.Passing, Total: gate.Total}
 	}
 }
 
@@ -399,31 +399,8 @@ func testPassed(passed, failed map[string]bool, names ...string) bool {
 	return false
 }
 
-func inferredAnyRatioGate(passed, failed map[string]bool, names ...string) RatioGate {
-	for _, name := range names {
-		if failed[name] {
-			return RatioGate{Seen: true, Passing: 0, Total: 1}
-		}
-	}
-	return RatioGate{Seen: true, Passing: boolToInt(testPassed(passed, failed, names...)), Total: 1}
-}
-
-func inferredAllRatioGate(passed, failed map[string]bool, names ...string) RatioGate {
-	for _, name := range names {
-		if failed[name] {
-			return RatioGate{Seen: true, Passing: 0, Total: 1}
-		}
-	}
-	return RatioGate{Seen: true, Passing: boolToInt(allRequiredTestsPassed(passed, names...)), Total: 1}
-}
-
-func allRequiredTestsPassed(passed map[string]bool, names ...string) bool {
-	for _, name := range names {
-		if !passed[name] {
-			return false
-		}
-	}
-	return true
+func missingRatioGate() RatioGate {
+	return RatioGate{Seen: true, Passing: 0, Total: 1} // fallback for a gate never marked Seen: 0 of 1 passing
 }
 
 func gateResults(gates CutoverGates) []GateResult {
@@ -448,13 +425,6 @@ func passFail(ok bool) string {
 	return "fail"
 }
 
-func boolToInt(ok bool) int {
-	if ok {
-		return 1
-	}
-	return 0
-}
-
 func knownExceptionsFromEnv(raw string) int {
 	if raw == "" {
 		return 0

diff --git a/.github/workflows/migration-ci.yml b/.github/workflows/migration-ci.yml
@@ -4,6 +4,12 @@ on:
   pull_request:
     branches: [main]
   workflow_dispatch:
+    inputs:
+      enforce_completion:
+        description: "Fail unless migration completion gates are fully satisfied"
+        required: false
+        default: false
+        type: boolean
 
 permissions:
   contents: read
@@ -99,6 +105,18 @@ jobs:
       - name: Run Go parity tests
         shell: bash
         run: |
+          enforce_completion=false  # default: non-enforcing evidence run
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.enforce_completion == true }}" = "true" ]; then
+            enforce_completion=true
+          elif [ "${{ github.event_name }}" = "pull_request" ] && [[ "$GITHUB_HEAD_REF" == crane/* ]]; then
+            enforce_completion=true  # branch name comes from the runner env, never expanded into the script text
+          fi
+
+          echo "MIGRATION_COMPLETION_ENFORCED=$enforce_completion" >> "$GITHUB_ENV"
+          if [ "$enforce_completion" = "true" ]; then
+            export APM_ENFORCE_COMPLETION_GATES=1
+          fi
+
           set +e
           go test -json ./... | tee "$RUNNER_TEMP/go-test-events.json"
           status=${PIPESTATUS[0]}
@@ -113,21 +131,38 @@ jobs:
             --coverage tests/parity/python_contract_coverage.yml \
             --allow-intentionally-incomplete \
             --summary "$RUNNER_TEMP/python-contract-coverage.md" || true
-          python - "$RUNNER_TEMP/migration-score.json" <<'PY'
+          python - "$RUNNER_TEMP/migration-score.json" "${MIGRATION_COMPLETION_ENFORCED:-false}" <<'PY'
           import json
           import sys
 
           with open(sys.argv[1], encoding="utf-8") as fh:
               score = json.load(fh)
+          enforce_completion = sys.argv[2].lower() == "true"
 
           print(json.dumps(score, indent=2, sort_keys=True))
+          if not enforce_completion:
+              print(
+                  "::notice::Non-enforcing migration evidence run; "
+                  "completion gates are enforced only for crane/* PRs and "
+                  "manual runs with enforce_completion=true."
+              )
+              raise SystemExit(0)
           if score.get("progress") != 1.0:
               raise SystemExit("progress must be 1.0 for completion parity")
           if score.get("migration_score") == 1.0 and not score.get("deletion_grade_ready"):
               raise SystemExit("migration_score 1.0 requires deletion_grade_ready")
           PY
-          test "${PYTHON_CLI_CONTRACT_STATUS:-1}" = "0"
-          test "${GO_TEST_STATUS:-1}" = "0"
+          if [ "${MIGRATION_COMPLETION_ENFORCED:-false}" = "true" ]; then
+            test "${PYTHON_CLI_CONTRACT_STATUS:-1}" = "0"
+            test "${GO_TEST_STATUS:-1}" = "0"
+          else
+            if [ "${PYTHON_CLI_CONTRACT_STATUS:-1}" != "0" ]; then
+              echo "::notice::Python behavior contract tests are incomplete in collection mode."
+            fi
+            if [ "${GO_TEST_STATUS:-1}" != "0" ]; then
+              echo "::notice::Go parity tests are incomplete in collection mode."
+            fi
+          fi
 
       - name: Upload parity evidence
         if: always()
@@ -171,13 +206,27 @@ jobs:
         run: go build -o "$RUNNER_TEMP/apm-go" ./cmd/apm
 
       - name: Run Python-vs-Go CLI benchmark
+        shell: bash
         run: |
+          enforce_completion=false  # default: benchmark failures are tolerated
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.enforce_completion == true }}" = "true" ]; then
+            enforce_completion=true
+          elif [ "${{ github.event_name }}" = "pull_request" ] && [[ "$GITHUB_HEAD_REF" == crane/* ]]; then
+            enforce_completion=true  # branch name comes from the runner env, never expanded into the script text
+          fi
+
+          extra_args=()
+          if [ "$enforce_completion" != "true" ]; then
+            extra_args+=(--allow-failures)
+          fi
+
           python scripts/ci/migration_cli_benchmark.py \
             --python-bin "$GITHUB_WORKSPACE/.venv/bin/apm" \
             --go-bin "$RUNNER_TEMP/apm-go" \
             --json-out "$RUNNER_TEMP/migration-cli-benchmark.json" \
             --markdown-out "$RUNNER_TEMP/migration-cli-benchmark.md" \
-            --max-ratio 5.0
+            --max-ratio 5.0 \
+            "${extra_args[@]}"
 
       - name: Run Python scaling guards
         run: uv run pytest tests/benchmarks/test_scaling_guards.py -v

diff --git a/README.md b/README.md
@@ -61,10 +61,20 @@ Maintainers can dispatch the migration workflow manually:
 gh workflow run migration-ci.yml --repo githubnext/apm --ref main
 ```
 
+That default manual run collects parity and benchmark evidence without treating
+known migration gaps as a CI failure. To run the deterministic hard completion
+gate, opt in explicitly:
+
+```bash
+gh workflow run migration-ci.yml --repo githubnext/apm --ref main -f enforce_completion=true
+```
+
 After it runs, open the **Migration Benchmarks** job summary for the timing
 table. The same run uploads the `migration-benchmark-evidence` artifact with
 JSON and Markdown copies of the benchmark data. In the benchmark table, the
 `Go/Python` ratio is the Go median duration divided by the Python median
-duration: values below `1.00x` mean Go is faster. Recent smoke benchmark
-evidence for startup/help/init-style commands shows the Go CLI roughly
-`327x`-`370x` faster than the Python CLI.
+duration: values below `1.00x` mean Go is faster. The benchmark includes
+fixture-backed commands that read, write, execute, or fail against realistic APM
+project state: `apm.yml`, `apm.lock.yaml`, installed `apm_modules`, local
+`.apm` primitives, target directories, deployed prompt files, and sample source
+files.
diff --git a/cmd/apm/CUTOVER.md b/cmd/apm/CUTOVER.md
@@ -15,7 +15,59 @@ The Go CLI currently implements:
 - `apm init [--yes] [PROJECT_NAME]` (functional, creates apm.yml)
 - Per-command `--help` for all 26 commands (golden-file verified)
 
-Remaining commands return a "not yet fully implemented" message.
+Most remaining commands are wired at the CLI surface. That is not enough for
+cutover. A command that prints success without writing the expected files,
+mutating `apm.yml`, updating `apm.lock.yaml`, executing a script, or detecting a
+planted failure is still incomplete.
+
+## Real Criteria
+
+Every completion criterion must be backed by real command execution. The scorer
+does not infer completion from test names for `surface`, `help`, `functional`,
+`state_diff`, `python_behavior_contracts`, or `benchmarks`; each one must emit an
+explicit ratio gate.
+
+Crane must run `go test ./cmd/apm -run TestParityRealFunctionalAndStateDiffContracts -json`.
+That fixture-backed test executes the built Go `apm` binary in temporary
+projects and emits the existing completion gates directly:
+
+```json
+{"crane":"gate","name":"functional","passing":N,"total":N}
+{"crane":"gate","name":"state_diff","passing":N,"total":N}
+```
+
+Crane must also run the migration benchmark test. It executes fixture-backed
+Python-vs-Go benchmark workloads and emits:
+
+```json
+{"crane":"gate","name":"benchmarks","passing":N,"total":N}
+```
+
+A legacy boolean such as `{"name":"benchmarks","passed":true}` is not enough.
+The benchmark report must prove that every benchmarked command produced the
+expected real artifact or output evidence.
+
+The completion criteria are command-specific:
+
+| Command area | Required proof |
+| --- | --- |
+| `init` | Creates a real `apm.yml` manifest. |
+| `install` | Installs a local package, writes `apm.lock.yaml`, and materializes installed content under `apm_modules/` or target paths. |
+| `update` | Mutates the lockfile when a dependency changes and reports a real no-op when nothing changed. |
+| `compile` | Writes target artifacts such as `.github/copilot-instructions.md` from fixture project state. |
+| `pack` / `unpack` | Writes a non-empty distributable bundle and can extract it back into a temp project. |
+| `run` / `preview` / `list` | Reads project scripts, executes or previews the selected script, and reflects the actual manifest contents. |
+| `audit` / `policy` | Fails on planted hidden Unicode, missing lockfile state, or policy violations instead of always reporting success. |
+| `mcp` / `runtime` / `plugin` / `marketplace` | Persist real manifest or config changes, not just status text. |
+| `cache` | Removes cache entries while respecting the configured cache root. |
+| `prune` / `uninstall` | Removes only files owned by stale dependencies and proves the removed paths are gone. |
+| `deps` / `outdated` / `view` / `search` | Read lockfile, marketplace, or registry fixtures and report fixture-derived results. |
+| `self-update` / `experimental` / `config` | Persist or validate real configuration state where the Python command does. |
+
+Each new command implementation should add or extend functional, state-diff, and
+benchmark fixture coverage before Crane can claim it moved the migration
+forward. Shims, dry-runs, mocks, and help-only assertions do not count as command
+completion.
 
 ## Cutover Trigger Conditions
 
@@ -27,9 +79,13 @@ are true:
    `init`, `install`, `update`, `compile`, `pack`, `run`, `audit`,
    `policy`, `mcp`, `runtime`, `targets`, `list`, `view`, `cache`,
    `deps`, `marketplace`, `uninstall`, `prune`
-3. Python-vs-Go parity tests pass for all commands in the matrix
-4. `go build ./cmd/apm` produces a single static binary
-5. CI passes on the crane PR branch (`crane/crane-migration-python-to-go-full-apm-cli-rewrite`)
+3. `TestParityRealFunctionalAndStateDiffContracts` passes every fixture-backed
+   real-command scenario and emits passing `functional` and `state_diff` gates
+4. Python-vs-Go parity tests pass for all commands in the matrix
+5. Migration benchmarks pass real fixture-backed command workloads and emit a
+   passing counted `benchmarks` gate
+6. `go build ./cmd/apm` produces a single static binary
+7. CI passes on the crane PR branch (`crane/crane-migration-python-to-go-full-apm-cli-rewrite`)
 
 ## Cutover Steps