diff --git a/.crane/scripts/score.go b/.crane/scripts/score.go index da3d1a4e..ff2494c1 100644 --- a/.crane/scripts/score.go +++ b/.crane/scripts/score.go @@ -137,7 +137,7 @@ func computeScore(input scanInput, getenv getenvFunc) (Score, error) { knownExceptions := knownExceptionsFromEnv(getenv("APM_KNOWN_EXCEPTIONS")) pythonReference := BoolGate{} pythonTests := BoolGate{Seen: getenv("APM_PYTHON_TESTS") != "", Passed: getenv("APM_PYTHON_TESTS") == "pass"} - benchmarks := BoolGate{Seen: getenv("APM_BENCHMARKS") != "", Passed: getenv("APM_BENCHMARKS") == "pass"} + benchmarks := RatioGate{} surface := RatioGate{} help := RatioGate{} functional := RatioGate{} @@ -224,25 +224,25 @@ func computeScore(input scanInput, getenv getenvFunc) (Score, error) { pythonReference = BoolGate{Seen: true, Passed: testPassed(passed, failed, "TestParityCompletionHardGate") || pythonReferenceReady(getenv("APM_PYTHON_BIN"))} } if !surface.Seen { - surface = inferredAnyRatioGate(passed, failed, "TestParityCompletionSurfaceParity", "TestParitySurfaceInventory") + surface = missingRatioGate() } if !help.Seen { - help = inferredAllRatioGate(passed, failed, "TestParityCompletionCommandMatrix", "TestParityCompletionHelpIdentical") + help = missingRatioGate() } if !functional.Seen { - functional = inferredAnyRatioGate(passed, failed, "TestParityCompletionFunctionalContracts", "TestParityFunctionalContracts") + functional = missingRatioGate() } if !stateDiff.Seen { - stateDiff = inferredAnyRatioGate(passed, failed, "TestParityCompletionStateDiffContracts", "TestParityStateDiffContracts") + stateDiff = missingRatioGate() } if !behaviorContracts.Seen { - behaviorContracts = RatioGate{Seen: true, Passing: 0, Total: 1} + behaviorContracts = missingRatioGate() } if !pythonTests.Seen { pythonTests = BoolGate{Seen: true, Passed: testPassed(passed, failed, "TestParityCompletionPythonSuite")} } if !benchmarks.Seen { - benchmarks = BoolGate{Seen: true, Passed: testPassed(passed, failed, 
"TestParityCompletionBenchmarks")} + benchmarks = missingRatioGate() } goTestsPass := !goTestsFailed && targetTotal > 0 && targetPassing == targetTotal @@ -346,7 +346,7 @@ func applyGateEvent( behaviorContracts *RatioGate, knownExceptions *int, pythonTests *BoolGate, - benchmarks *BoolGate, + benchmarks *RatioGate, ) { switch gate.Name { case "python_reference": @@ -366,7 +366,7 @@ func applyGateEvent( case "python_tests": *pythonTests = BoolGate{Seen: true, Passed: gate.Passed} case "benchmarks": - *benchmarks = BoolGate{Seen: true, Passed: gate.Passed} + *benchmarks = RatioGate{Seen: true, Passing: gate.Passing, Total: gate.Total} } } @@ -399,31 +399,8 @@ func testPassed(passed, failed map[string]bool, names ...string) bool { return false } -func inferredAnyRatioGate(passed, failed map[string]bool, names ...string) RatioGate { - for _, name := range names { - if failed[name] { - return RatioGate{Seen: true, Passing: 0, Total: 1} - } - } - return RatioGate{Seen: true, Passing: boolToInt(testPassed(passed, failed, names...)), Total: 1} -} - -func inferredAllRatioGate(passed, failed map[string]bool, names ...string) RatioGate { - for _, name := range names { - if failed[name] { - return RatioGate{Seen: true, Passing: 0, Total: 1} - } - } - return RatioGate{Seen: true, Passing: boolToInt(allRequiredTestsPassed(passed, names...)), Total: 1} -} - -func allRequiredTestsPassed(passed map[string]bool, names ...string) bool { - for _, name := range names { - if !passed[name] { - return false - } - } - return true +func missingRatioGate() RatioGate { + return RatioGate{Seen: true, Passing: 0, Total: 1} } func gateResults(gates CutoverGates) []GateResult { @@ -448,13 +425,6 @@ func passFail(ok bool) string { return "fail" } -func boolToInt(ok bool) int { - if ok { - return 1 - } - return 0 -} - func knownExceptionsFromEnv(raw string) int { if raw == "" { return 0 diff --git a/.github/workflows/migration-ci.yml b/.github/workflows/migration-ci.yml index 3a3d197e..a8555269 
100644 --- a/.github/workflows/migration-ci.yml +++ b/.github/workflows/migration-ci.yml @@ -4,6 +4,12 @@ on: pull_request: branches: [main] workflow_dispatch: + inputs: + enforce_completion: + description: "Fail unless migration completion gates are fully satisfied" + required: false + default: false + type: boolean permissions: contents: read @@ -99,6 +105,18 @@ jobs: - name: Run Go parity tests shell: bash run: | + enforce_completion=false + if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.enforce_completion == true }}" = "true" ]; then + enforce_completion=true + elif [ "${{ github.event_name }}" = "pull_request" ] && [[ "${GITHUB_HEAD_REF:-}" == crane/* ]]; then # branch name is untrusted: read it from the runner env, never template it into the script + enforce_completion=true + fi + + echo "MIGRATION_COMPLETION_ENFORCED=$enforce_completion" >> "$GITHUB_ENV" + if [ "$enforce_completion" = "true" ]; then + export APM_ENFORCE_COMPLETION_GATES=1 + fi + set +e go test -json ./... | tee "$RUNNER_TEMP/go-test-events.json" status=${PIPESTATUS[0]} @@ -113,21 +131,38 @@ jobs: --coverage tests/parity/python_contract_coverage.yml \ --allow-intentionally-incomplete \ --summary "$RUNNER_TEMP/python-contract-coverage.md" || true - python - "$RUNNER_TEMP/migration-score.json" <<'PY' + python - "$RUNNER_TEMP/migration-score.json" "${MIGRATION_COMPLETION_ENFORCED:-false}" <<'PY' import json import sys with open(sys.argv[1], encoding="utf-8") as fh: score = json.load(fh) + enforce_completion = sys.argv[2].lower() == "true" print(json.dumps(score, indent=2, sort_keys=True)) + if not enforce_completion: + print( + "::notice::Non-enforcing migration evidence run; " + "completion gates are enforced only for crane/* PRs and " + "manual runs with enforce_completion=true."
+ ) + raise SystemExit(0) if score.get("progress") != 1.0: raise SystemExit("progress must be 1.0 for completion parity") if score.get("migration_score") == 1.0 and not score.get("deletion_grade_ready"): raise SystemExit("migration_score 1.0 requires deletion_grade_ready") PY - test "${PYTHON_CLI_CONTRACT_STATUS:-1}" = "0" - test "${GO_TEST_STATUS:-1}" = "0" + if [ "${MIGRATION_COMPLETION_ENFORCED:-false}" = "true" ]; then + test "${PYTHON_CLI_CONTRACT_STATUS:-1}" = "0" + test "${GO_TEST_STATUS:-1}" = "0" + else + if [ "${PYTHON_CLI_CONTRACT_STATUS:-1}" != "0" ]; then + echo "::notice::Python behavior contract tests are incomplete in collection mode." + fi + if [ "${GO_TEST_STATUS:-1}" != "0" ]; then + echo "::notice::Go parity tests are incomplete in collection mode." + fi + fi - name: Upload parity evidence if: always() @@ -171,13 +206,27 @@ jobs: run: go build -o "$RUNNER_TEMP/apm-go" ./cmd/apm - name: Run Python-vs-Go CLI benchmark + shell: bash run: | + enforce_completion=false + if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.enforce_completion == true }}" = "true" ]; then + enforce_completion=true + elif [ "${{ github.event_name }}" = "pull_request" ] && [[ "${GITHUB_HEAD_REF:-}" == crane/* ]]; then # branch name is untrusted: read it from the runner env, never template it into the script + enforce_completion=true + fi + + extra_args=() + if [ "$enforce_completion" != "true" ]; then + extra_args+=(--allow-failures) + fi + python scripts/ci/migration_cli_benchmark.py \ --python-bin "$GITHUB_WORKSPACE/.venv/bin/apm" \ --go-bin "$RUNNER_TEMP/apm-go" \ --json-out "$RUNNER_TEMP/migration-cli-benchmark.json" \ --markdown-out "$RUNNER_TEMP/migration-cli-benchmark.md" \ - --max-ratio 5.0 + --max-ratio 5.0 \ + "${extra_args[@]}" - name: Run Python scaling guards run: uv run pytest tests/benchmarks/test_scaling_guards.py -v diff --git a/README.md b/README.md index d24c888c..35d8ba8f 100644 --- a/README.md +++ b/README.md @@ -61,10 +61,20 @@ Maintainers can dispatch the migration workflow manually: gh workflow
run migration-ci.yml --repo githubnext/apm --ref main ``` +That default manual run collects parity and benchmark evidence without treating +known migration gaps as a CI failure. To run the deterministic hard completion +gate, opt in explicitly: + +```bash +gh workflow run migration-ci.yml --repo githubnext/apm --ref main -f enforce_completion=true +``` + After it runs, open the **Migration Benchmarks** job summary for the timing table. The same run uploads the `migration-benchmark-evidence` artifact with JSON and Markdown copies of the benchmark data. In the benchmark table, the `Go/Python` ratio is the Go median duration divided by the Python median -duration: values below `1.00x` mean Go is faster. Recent smoke benchmark -evidence for startup/help/init-style commands shows the Go CLI roughly -`327x`-`370x` faster than the Python CLI. +duration: values below `1.00x` mean Go is faster. The benchmark includes +fixture-backed commands that read, write, execute, or fail against realistic APM +project state: `apm.yml`, `apm.lock.yaml`, installed `apm_modules`, local +`.apm` primitives, target directories, deployed prompt files, and sample source +files. diff --git a/cmd/apm/CUTOVER.md b/cmd/apm/CUTOVER.md index 1b554aa1..632110e1 100644 --- a/cmd/apm/CUTOVER.md +++ b/cmd/apm/CUTOVER.md @@ -15,7 +15,59 @@ The Go CLI currently implements: - `apm init [--yes] [PROJECT_NAME]` (functional, creates apm.yml) - Per-command `--help` for all 26 commands (golden-file verified) -Remaining commands return a "not yet fully implemented" message. +Most remaining commands are wired at the CLI surface. That is not enough for +cutover. A command that prints success without writing the expected files, +mutating `apm.yml`, updating `apm.lock.yaml`, executing a script, or detecting a +planted failure is still incomplete. + +## Real Criteria + +Every completion criterion must be backed by real command execution. 
The scorer +does not infer completion from test names for `surface`, `help`, `functional`, +`state_diff`, `python_behavior_contracts`, or `benchmarks`; each one must emit an +explicit ratio gate. + +Crane must run `go test ./cmd/apm -run TestParityRealFunctionalAndStateDiffContracts -json`. +That fixture-backed test executes the built Go `apm` binary in temporary +projects and emits the existing completion gates directly: + +```json +{"crane":"gate","name":"functional","passing":N,"total":N} +{"crane":"gate","name":"state_diff","passing":N,"total":N} +``` + +Crane must also run the migration benchmark test. It executes fixture-backed +Python-vs-Go benchmark workloads and emits: + +```json +{"crane":"gate","name":"benchmarks","passing":N,"total":N} +``` + +A legacy boolean such as `{"name":"benchmarks","passed":true}` is not enough. +The benchmark report must prove that every benchmarked command produced the +expected real artifact or output evidence. + +The completion criteria are command-specific: + +| Command area | Required proof | +| --- | --- | +| `init` | Creates a real `apm.yml` manifest. | +| `install` | Installs a local package, writes `apm.lock.yaml`, and materializes installed content under `apm_modules/` or target paths. | +| `update` | Mutates the lockfile when a dependency changes and reports a real no-op when nothing changed. | +| `compile` | Writes target artifacts such as `.github/copilot-instructions.md` from fixture project state. | +| `pack` / `unpack` | Writes a non-empty distributable bundle and can extract it back into a temp project. | +| `run` / `preview` / `list` | Reads project scripts, executes or previews the selected script, and reflects the actual manifest contents. | +| `audit` / `policy` | Fails on planted hidden Unicode, missing lockfile state, or policy violations instead of always reporting success. | +| `mcp` / `runtime` / `plugin` / `marketplace` | Persist real manifest or config changes, not just status text. 
| +| `cache` | Removes cache entries while respecting the configured cache root. | +| `prune` / `uninstall` | Removes only files owned by stale dependencies and proves the removed paths are gone. | +| `deps` / `outdated` / `view` / `search` | Read lockfile, marketplace, or registry fixtures and report fixture-derived results. | +| `self-update` / `experimental` / `config` | Persist or validate real configuration state where the Python command does. | + +Each new command implementation should add or extend functional, state-diff, and +benchmark fixture coverage before Crane can claim it moved the migration +forward. Shims, dry-runs, mocks, and help-only assertions do not count as command +completion. ## Cutover Trigger Conditions @@ -27,9 +79,13 @@ are true: `init`, `install`, `update`, `compile`, `pack`, `run`, `audit`, `policy`, `mcp`, `runtime`, `targets`, `list`, `view`, `cache`, `deps`, `marketplace`, `uninstall`, `prune` -3. Python-vs-Go parity tests pass for all commands in the matrix -4. `go build ./cmd/apm` produces a single static binary -5. CI passes on the crane PR branch (`crane/crane-migration-python-to-go-full-apm-cli-rewrite`) +3. `TestParityRealFunctionalAndStateDiffContracts` passes every fixture-backed + real-command scenario and emits passing `functional` and `state_diff` gates +4. Python-vs-Go parity tests pass for all commands in the matrix +5. Migration benchmarks pass real fixture-backed command workloads and emit a + passing counted `benchmarks` gate +6. `go build ./cmd/apm` produces a single static binary +7. 
CI passes on the crane PR branch (`crane/crane-migration-python-to-go-full-apm-cli-rewrite`) ## Cutover Steps diff --git a/cmd/apm/parity_completion_test.go b/cmd/apm/parity_completion_test.go index 6ad47073..bd3d5ca2 100644 --- a/cmd/apm/parity_completion_test.go +++ b/cmd/apm/parity_completion_test.go @@ -10,6 +10,7 @@ package main import ( + "encoding/json" "fmt" "os" "os/exec" @@ -19,6 +20,23 @@ import ( "testing" ) +func completionGatesEnforced() bool { + return os.Getenv("APM_ENFORCE_COMPLETION_GATES") == "1" +} + +func completionGateFailure(t *testing.T, format string, args ...any) { + t.Helper() + if completionGatesEnforced() { + t.Fatalf(format, args...) + return + } + t.Logf(format, args...) +} + +func emitCraneBoolGate(name string, passed bool) { + fmt.Printf("{\"crane\":\"gate\",\"name\":%q,\"passed\":%t}\n", name, passed) +} + // TestParityCompletionHardGate enforces the Python-vs-Go completion gate. // Unlike TestParityHarnessHardGatePythonBin (which just logs), this test // FAILS when APM_PYTHON_BIN is not set -- ensuring score.go's correctness_gate @@ -381,8 +399,11 @@ func TestParityCompletionPythonSuite(t *testing.T) { cmd.Stdout = &outBuf cmd.Stderr = &errBuf if runErr := cmd.Run(); runErr != nil { - t.Fatalf("Python suite failed:\n%s\n%s", outBuf.String(), errBuf.String()) + emitCraneBoolGate("python_tests", false) + completionGateFailure(t, "Python suite failed:\n%s\n%s", outBuf.String(), errBuf.String()) + return } + emitCraneBoolGate("python_tests", true) t.Logf("[+] Python suite passed:\n%s", outBuf.String()) } @@ -428,12 +449,44 @@ func TestParityCompletionBenchmarks(t *testing.T) { cmd.Stdout = &outBuf cmd.Stderr = &errBuf if runErr := cmd.Run(); runErr != nil { - t.Fatalf("Benchmark failed (Go CLI exceeds 5x Python latency or script error):\n%s\n%s", + passing, total := benchmarkGateCounts(t, jsonOut) + emitCraneRatioGate("benchmarks", passing, total) + completionGateFailure(t, "Benchmark failed (Go CLI exceeds 5x Python latency or script 
error):\n%s\n%s", outBuf.String(), errBuf.String()) + return + } + passing, total := benchmarkGateCounts(t, jsonOut) + emitCraneRatioGate("benchmarks", passing, total) + if passing != total { + completionGateFailure(t, "Benchmark artifact checks incomplete: %d/%d passed\n%s", passing, total, outBuf.String()) + return } t.Logf("[+] Benchmarks passed:\n%s", outBuf.String()) } +func benchmarkGateCounts(t *testing.T, path string) (int, int) { + t.Helper() + data, err := os.ReadFile(path) + if err != nil { + return 0, 1 + } + var report struct { + Results []struct { + Passed bool `json:"passed"` + } `json:"results"` + } + if err := json.Unmarshal(data, &report); err != nil || len(report.Results) == 0 { + return 0, 1 + } + passing := 0 + for _, result := range report.Results { + if result.Passed { + passing++ + } + } + return passing, len(report.Results) +} + // runPyBin runs the Python apm binary with the given args. func runPyBin(t *testing.T, bin string, args ...string) (stdout, stderr string, exitCode int) { t.Helper() diff --git a/cmd/apm/python_behavior_contracts_test.go b/cmd/apm/python_behavior_contracts_test.go index 2cc6176c..38303711 100644 --- a/cmd/apm/python_behavior_contracts_test.go +++ b/cmd/apm/python_behavior_contracts_test.go @@ -195,7 +195,8 @@ func TestParityCompletionPythonBehaviorContracts(t *testing.T) { extract.Env = append(os.Environ(), "NO_COLOR=1", "COLUMNS=10000") if out, err := extract.CombinedOutput(); err != nil { emitCraneRatioGate("python_behavior_contracts", 0, 1) - t.Fatalf("HARD-GATE FAILED: python_behavior_contracts extraction failed: %v\n%s", err, string(out)) + completionGateFailure(t, "HARD-GATE FAILED: python_behavior_contracts extraction failed: %v\n%s", err, string(out)) + return } } @@ -213,7 +214,8 @@ func TestParityCompletionPythonBehaviorContracts(t *testing.T) { out, err := check.CombinedOutput() if err != nil { emitCraneRatioGate("python_behavior_contracts", 0, 1) - t.Fatalf("HARD-GATE FAILED: python_behavior_contracts 
coverage incomplete:\n%s", string(out)) + completionGateFailure(t, "HARD-GATE FAILED: python_behavior_contracts coverage incomplete:\n%s", string(out)) + return } emitCraneRatioGate("python_behavior_contracts", 1, 1) } diff --git a/cmd/apm/real_behavior_test.go b/cmd/apm/real_behavior_test.go new file mode 100644 index 00000000..5cd07f86 --- /dev/null +++ b/cmd/apm/real_behavior_test.go @@ -0,0 +1,354 @@ +package main + +import ( + "bytes" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" +) + +type realBehaviorCase struct { + name string + args []string + env map[string]string + setup func(t *testing.T, dir string) + verify func(t *testing.T, dir, stdout, stderr string, code int) bool +} + +func TestParityRealFunctionalAndStateDiffContracts(t *testing.T) { + cases := []realBehaviorCase{ + { + name: "init creates manifest", + args: []string{"init", "--yes"}, + verify: func(t *testing.T, dir, stdout, stderr string, code int) bool { + ok := realBehaviorExpectExit(t, stdout, stderr, code, 0) + ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "apm.yml"), "dependencies:") && ok + return ok + }, + }, + { + name: "install local package materializes lock and modules", + args: []string{"install", "./packages/local-tools"}, + setup: realBehaviorSetupLocalPackage, + verify: func(t *testing.T, dir, stdout, stderr string, code int) bool { + ok := realBehaviorExpectExit(t, stdout, stderr, code, 0) + ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "apm.lock.yaml"), "local-tools") && ok + ok = realBehaviorExpectDirHasEntries(t, filepath.Join(dir, "apm_modules")) && ok + return ok + }, + }, + { + name: "compile writes copilot target", + args: []string{"compile", "--target", "copilot"}, + setup: realBehaviorSetupProject, + verify: func(t *testing.T, dir, stdout, stderr string, code int) bool { + ok := realBehaviorExpectExit(t, stdout, stderr, code, 0) + ok = realBehaviorExpectFileContains(t, filepath.Join(dir, ".github", "copilot-instructions.md"), 
"real-behavior") && ok + return ok + }, + }, + { + name: "pack writes distributable output", + args: []string{"pack", "--output", "dist"}, + setup: realBehaviorSetupProjectWithLock, + verify: func(t *testing.T, dir, stdout, stderr string, code int) bool { + ok := realBehaviorExpectExit(t, stdout, stderr, code, 0) + ok = realBehaviorExpectDirHasEntries(t, filepath.Join(dir, "dist")) && ok + return ok + }, + }, + { + name: "run executes project script", + args: []string{"run", "stamp"}, + setup: realBehaviorSetupRunnableProject, + verify: func(t *testing.T, dir, stdout, stderr string, code int) bool { + ok := realBehaviorExpectExit(t, stdout, stderr, code, 0) + ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "run-stamp.txt"), "real-run") && ok + return ok + }, + }, + { + name: "audit ci fails on planted hidden unicode", + args: []string{"audit", "--ci"}, + setup: realBehaviorSetupAuditFinding, + verify: func(t *testing.T, _ string, stdout, stderr string, code int) bool { + if code == 0 { + realBehaviorFailure(t, "expected non-zero exit for hidden unicode finding\nstdout: %s\nstderr: %s", stdout, stderr) + return false + } + return true + }, + }, + { + name: "mcp install persists manifest dependency", + args: []string{"mcp", "install", "example-server"}, + setup: realBehaviorSetupProject, + verify: func(t *testing.T, dir, stdout, stderr string, code int) bool { + ok := realBehaviorExpectExit(t, stdout, stderr, code, 0) + ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "apm.yml"), "example-server") && ok + return ok + }, + }, + { + name: "plugin init writes plugin manifest", + args: []string{"plugin", "init"}, + verify: func(t *testing.T, dir, stdout, stderr string, code int) bool { + ok := realBehaviorExpectExit(t, stdout, stderr, code, 0) + ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "plugin.json"), "\"name\"") && ok + ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "apm.yml"), "plugin") && ok + return ok + }, + }, + { + 
name: "marketplace init writes marketplace block", + args: []string{"marketplace", "init"}, + verify: func(t *testing.T, dir, stdout, stderr string, code int) bool { + ok := realBehaviorExpectExit(t, stdout, stderr, code, 0) + ok = realBehaviorExpectFileContains(t, filepath.Join(dir, "apm.yml"), "marketplace:") && ok + return ok + }, + }, + { + name: "cache clean removes entries but preserves cache root", + args: []string{"cache", "clean"}, + env: map[string]string{"APM_CACHE_DIR": "cache-root"}, + setup: realBehaviorSetupCacheRoot, + verify: func(t *testing.T, dir, stdout, stderr string, code int) bool { + ok := realBehaviorExpectExit(t, stdout, stderr, code, 0) + cacheRoot := filepath.Join(dir, "cache-root") + ok = realBehaviorExpectPathExists(t, cacheRoot) && ok + ok = realBehaviorExpectPathMissing(t, filepath.Join(cacheRoot, "http_v1", "old", "body")) && ok + return ok + }, + }, + { + name: "prune removes unreferenced module", + args: []string{"prune"}, + setup: realBehaviorSetupStaleModule, + verify: func(t *testing.T, dir, stdout, stderr string, code int) bool { + ok := realBehaviorExpectExit(t, stdout, stderr, code, 0) + ok = realBehaviorExpectPathMissing(t, filepath.Join(dir, "apm_modules", "stale-package")) && ok + return ok + }, + }, + } + + functionalPassing := 0 + stateDiffPassing := 0 + defer func() { + emitCraneRatioGate("functional", functionalPassing, len(cases)) + emitCraneRatioGate("state_diff", stateDiffPassing, len(cases)) + }() + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + dir := t.TempDir() + if tc.setup != nil { + tc.setup(t, dir) + } + stdout, stderr, code := realBehaviorRunGoInDir(t, dir, tc.env, tc.args...) 
+ if tc.verify(t, dir, stdout, stderr, code) { + functionalPassing++ + stateDiffPassing++ + } + }) + } +} + +func realBehaviorCompletionGatesEnforced() bool { + return os.Getenv("APM_ENFORCE_COMPLETION_GATES") == "1" +} + +func realBehaviorFailure(t *testing.T, format string, args ...any) { + t.Helper() + if realBehaviorCompletionGatesEnforced() { + t.Errorf(format, args...) + return + } + t.Logf(format, args...) +} + +func realBehaviorRunGoInDir(t *testing.T, dir string, env map[string]string, args ...string) (string, string, int) { + t.Helper() + if goBinPath == "" { + t.Skip("Go binary not built; skipping") + } + + var outBuf, errBuf bytes.Buffer + cmd := exec.Command(goBinPath, args...) + cmd.Dir = dir + cmd.Stdout = &outBuf + cmd.Stderr = &errBuf + cmd.Env = os.Environ() + for key, value := range env { + cmd.Env = append(cmd.Env, key+"="+value) + } + + err := cmd.Run() + code := 0 + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + code = exitErr.ExitCode() + } else { + t.Fatalf("failed to run apm %s: %v", strings.Join(args, " "), err) + } + } + return outBuf.String(), errBuf.String(), code +} + +func realBehaviorSetupProject(t *testing.T, dir string) { + t.Helper() + realBehaviorWriteFile(t, filepath.Join(dir, "apm.yml"), `name: real-behavior +version: 1.0.0 +description: Real behavior fixture +author: Crane +targets: + - copilot +dependencies: + apm: [] + mcp: [] +scripts: {} +`) + realBehaviorWriteFile(t, filepath.Join(dir, ".apm", "prompts", "real-behavior.md"), "real-behavior prompt\n") +} + +func realBehaviorSetupProjectWithLock(t *testing.T, dir string) { + t.Helper() + realBehaviorSetupProject(t, dir) + realBehaviorWriteFile(t, filepath.Join(dir, "apm.lock.yaml"), `lockfile_version: "1" +dependencies: [] +local_deployed_files: + - .apm/prompts/real-behavior.md +local_deployed_file_hashes: {} +`) +} + +func realBehaviorSetupRunnableProject(t *testing.T, dir string) { + t.Helper() + realBehaviorWriteFile(t, filepath.Join(dir, "apm.yml"), 
`name: runnable +version: 1.0.0 +description: Runnable fixture +author: Crane +targets: + - copilot +dependencies: + apm: [] + mcp: [] +scripts: + stamp: "printf real-run > run-stamp.txt" +`) +} + +func realBehaviorSetupLocalPackage(t *testing.T, dir string) { + t.Helper() + realBehaviorSetupProject(t, dir) + pkgDir := filepath.Join(dir, "packages", "local-tools") + realBehaviorWriteFile(t, filepath.Join(pkgDir, "apm.yml"), `name: local-tools +version: 1.0.0 +description: Local tools package +author: Crane +targets: + - copilot +dependencies: + apm: [] + mcp: [] +scripts: {} +`) + realBehaviorWriteFile(t, filepath.Join(pkgDir, ".apm", "prompts", "tool.md"), "local-tools prompt\n") +} + +func realBehaviorSetupAuditFinding(t *testing.T, dir string) { + t.Helper() + realBehaviorSetupProjectWithLock(t, dir) + realBehaviorWriteFile(t, filepath.Join(dir, "apm_modules", "unicode-package", "SKILL.md"), "safe text \u202eevil text\n") + realBehaviorWriteFile(t, filepath.Join(dir, "apm.lock.yaml"), `lockfile_version: "1" +dependencies: + - repo_url: local/unicode-package + resolved_commit: fixture + deployed_files: + - apm_modules/unicode-package/SKILL.md + deployed_file_hashes: {} +`) +} + +func realBehaviorSetupCacheRoot(t *testing.T, dir string) { + t.Helper() + realBehaviorWriteFile(t, filepath.Join(dir, "cache-root", "http_v1", "old", "body"), "cached\n") +} + +func realBehaviorSetupStaleModule(t *testing.T, dir string) { + t.Helper() + realBehaviorSetupProjectWithLock(t, dir) + realBehaviorWriteFile(t, filepath.Join(dir, "apm_modules", "stale-package", "README.md"), "stale\n") +} + +func realBehaviorWriteFile(t *testing.T, path, content string) { + t.Helper() + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatalf("failed to create parent dir for %s: %v", path, err) + } + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatalf("failed to write %s: %v", path, err) + } +} + +func realBehaviorExpectExit(t *testing.T, stdout, stderr 
string, got, want int) bool { + t.Helper() + if got != want { + realBehaviorFailure(t, "exit code = %d, want %d\nstdout: %s\nstderr: %s", got, want, stdout, stderr) + return false + } + return true +} + +func realBehaviorExpectFileContains(t *testing.T, path, needle string) bool { + t.Helper() + content, err := os.ReadFile(path) + if err != nil { + realBehaviorFailure(t, "expected file %s to exist: %v", path, err) + return false + } + if !strings.Contains(string(content), needle) { + realBehaviorFailure(t, "expected %s to contain %q, got:\n%s", path, needle, string(content)) + return false + } + return true +} + +func realBehaviorExpectPathExists(t *testing.T, path string) bool { + t.Helper() + if _, err := os.Stat(path); err != nil { + realBehaviorFailure(t, "expected path %s to exist: %v", path, err) + return false + } + return true +} + +func realBehaviorExpectPathMissing(t *testing.T, path string) bool { + t.Helper() + if _, err := os.Stat(path); err == nil { + realBehaviorFailure(t, "expected path %s to be removed", path) + return false + } else if !os.IsNotExist(err) { + realBehaviorFailure(t, "expected path %s to be absent, got: %v", path, err) + return false + } + return true +} + +func realBehaviorExpectDirHasEntries(t *testing.T, path string) bool { + t.Helper() + entries, err := os.ReadDir(path) + if err != nil { + realBehaviorFailure(t, "expected directory %s to exist: %v", path, err) + return false + } + if len(entries) == 0 { + realBehaviorFailure(t, "expected directory %s to contain at least one entry", path) + return false + } + return true +} diff --git a/scripts/ci/migration_cli_benchmark.py b/scripts/ci/migration_cli_benchmark.py index e1fc970c..ca24e57a 100644 --- a/scripts/ci/migration_cli_benchmark.py +++ b/scripts/ci/migration_cli_benchmark.py @@ -1,32 +1,140 @@ #!/usr/bin/env python3 -"""Compare Python and Go CLI latency for migration smoke commands.""" +"""Compare Python and Go CLI latency for migration benchmark workloads.""" from 
__future__ import annotations import argparse import json import os +import re import statistics import subprocess import tempfile import time +from dataclasses import dataclass from pathlib import Path -COMMANDS: list[tuple[str, list[str], bool]] = [ - ("help", ["--help"], False), - ("version", ["--version"], False), - ("compile-help", ["compile", "--help"], False), - ("install-help", ["install", "--help"], False), - ("pack-help", ["pack", "--help"], False), - ("audit-help", ["audit", "--help"], False), - ("init-yes", ["init", "--yes"], True), +FixtureName = str + + +@dataclass(frozen=True) +class BenchmarkCommand: + name: str + args: list[str] + fixture: FixtureName + workload: str + required_paths: tuple[str, ...] = () + stdout_contains: tuple[str, ...] = () + file_contains: tuple[tuple[str, str], ...] = () + expect_nonzero: bool = False + + +COMMANDS: list[BenchmarkCommand] = [ + BenchmarkCommand( + name="init scaffold", + args=["init", "--yes"], + fixture="empty-project", + workload="Creates a new apm.yml in an otherwise empty project directory.", + required_paths=("apm.yml",), + file_contains=(("apm.yml", "dependencies:"),), + ), + BenchmarkCommand( + name="targets json", + args=["targets", "--json"], + fixture="installed-project", + workload="Reads configured project targets from apm.yml and emits machine output.", + stdout_contains=("copilot",), + ), + BenchmarkCommand( + name="script list", + args=["list"], + fixture="installed-project", + workload="Reads apm.yml scripts and renders the runnable script inventory.", + stdout_contains=("build",), + ), + BenchmarkCommand( + name="deps list", + args=["deps", "list"], + fixture="installed-project", + workload="Scans apm_modules package directories and apm.lock.yaml metadata.", + stdout_contains=("microsoft/apm-package-alpha",), + ), + BenchmarkCommand( + name="deps tree", + args=["deps", "tree"], + fixture="installed-project", + workload="Builds a dependency tree from apm.lock.yaml and installed package 
metadata.", + stdout_contains=("agent-toolkit",), + ), + BenchmarkCommand( + name="install local package", + args=["install", "--no-policy", "./packages/local-tools"], + fixture="local-install-project", + workload="Installs a local package and materializes lock/module state.", + required_paths=("apm.lock.yaml", "apm_modules"), + file_contains=(("apm.lock.yaml", "local-tools"),), + ), + BenchmarkCommand( + name="compile copilot target", + args=["compile", "--target", "copilot"], + fixture="compilation-project", + workload="Discovers local primitives and writes the Copilot target artifact.", + required_paths=(".github/copilot-instructions.md",), + file_contains=((".github/copilot-instructions.md", "Benchmark Instruction"),), + ), + BenchmarkCommand( + name="pack output", + args=["pack", "--output", "dist"], + fixture="installed-project", + workload="Resolves local package contents and writes a distributable artifact.", + required_paths=("dist",), + ), + BenchmarkCommand( + name="run script", + args=["run", "stamp"], + fixture="runnable-project", + workload="Executes a project script and writes the script's side-effect file.", + required_paths=("run-stamp.txt",), + file_contains=(("run-stamp.txt", "real-run"),), + ), + BenchmarkCommand( + name="audit hidden unicode", + args=["audit", "--ci"], + fixture="audit-finding-project", + workload="Scans a real installed file and fails on planted hidden Unicode.", + expect_nonzero=True, + ), ] -def _run_once(binary: str, args: list[str], cwd: Path, env: dict[str, str]) -> dict[str, object]: +def _check_run(command: BenchmarkCommand, cwd: Path, stdout: str) -> list[str]: + failures: list[str] = [] + for relpath in command.required_paths: + if not (cwd / relpath).exists(): + failures.append(f"missing expected path: {relpath}") + for needle in command.stdout_contains: + if needle not in stdout: + failures.append(f"stdout missing {needle!r}") + for relpath, needle in command.file_contains: + path = cwd / relpath + if not 
path.exists(): + failures.append(f"missing expected file: {relpath}") + continue + content = path.read_text(encoding="utf-8", errors="replace") + if needle not in content: + failures.append(f"{relpath} missing {needle!r}") + return failures + + +def _run_once( + binary: str, + command: BenchmarkCommand, + cwd: Path, + env: dict[str, str], +) -> dict[str, object]: start = time.perf_counter() proc = subprocess.run( # noqa: S603 -- benchmark intentionally executes supplied CLIs. - [binary, *args], + [binary, *command.args], cwd=cwd, env=env, text=True, @@ -35,36 +143,312 @@ def _run_once(binary: str, args: list[str], cwd: Path, env: dict[str, str]) -> d check=False, ) elapsed = time.perf_counter() - start + check_failures = _check_run(command, cwd, proc.stdout) return { "elapsed_seconds": elapsed, "returncode": proc.returncode, "stdout_bytes": len(proc.stdout.encode("utf-8")), "stderr_bytes": len(proc.stderr.encode("utf-8")), + "checks_passed": not check_failures, + "check_failures": check_failures, } -def _workspace(base: Path, name: str, run_index: int) -> Path: - workdir = base / name / str(run_index) +def _write(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +def _safe_name(name: str) -> str: + return re.sub(r"[^a-zA-Z0-9_.-]+", "-", name).strip("-") + + +def _write_empty_project(workdir: Path) -> None: + _write(workdir / "README.md", "# Benchmark fixture\n") + + +def _write_installed_project(workdir: Path) -> None: + _write_empty_project(workdir) + for directory in [ + ".github", + ".claude", + ".cursor/rules", + ".codex", + "src/apm_bench", + ".apm/instructions", + ".apm/chatmodes", + "apm_modules/microsoft/apm-package-alpha/.apm/instructions", + "apm_modules/github/agent-toolkit/.apm/instructions", + ]: + (workdir / directory).mkdir(parents=True, exist_ok=True) + + _write( + workdir / "apm.yml", + """name: benchmark-project +version: 1.2.3 +description: Realistic migration 
benchmark fixture +author: benchmark +targets: + - copilot + - claude + - cursor +dependencies: + apm: + - microsoft/apm-package-alpha#v1.0.0 + - github/agent-toolkit#v2.3.4 + mcp: [] +scripts: + build: Build benchmark artifacts + test: Run the test suite + lint: Run lint checks + release: Prepare release artifacts +includes: auto +""", + ) + _write( + workdir / "apm.lock.yaml", + """lockfile_version: "1" +generated_at: "2026-01-01T00:00:00+00:00" +apm_version: benchmark +dependencies: + - repo_url: microsoft/apm-package-alpha + resolved_ref: v1.0.0 + resolved_commit: "1111111111111111111111111111111111111111" + version: 1.0.0 + package_type: instructions + deployed_files: + - .github/copilot-instructions.md + - repo_url: github/agent-toolkit + resolved_ref: v2.3.4 + resolved_commit: "2222222222222222222222222222222222222222" + version: 2.3.4 + depth: 2 + resolved_by: microsoft/apm-package-alpha + package_type: instructions + deployed_files: + - CLAUDE.md +local_deployed_files: + - .github/copilot-instructions.md + - CLAUDE.md + - .cursor/rules/AGENTS.md +""", + ) + _write( + workdir / ".github/copilot-instructions.md", + "# Copilot Benchmark Instructions\n\nUse the local benchmark context.\n", + ) + _write( + workdir / "CLAUDE.md", + "# Claude Benchmark Instructions\n\nUse the local benchmark context.\n", + ) + _write( + workdir / ".cursor/rules/AGENTS.md", + "# Cursor Benchmark Instructions\n\nUse the local benchmark context.\n", + ) + + for index in range(16): + _write( + workdir / f".apm/instructions/bench-{index:02d}.instructions.md", + f"""--- +applyTo: "src/**/*.py" +description: Benchmark instruction {index} +--- +# Benchmark Instruction {index} + +Keep implementation clear and tested. + +- Check input boundaries. +- Prefer small functions. +- Leave useful diagnostics for failures. 
+""", + ) + for index in range(2): + _write( + workdir / f".apm/chatmodes/reviewer-{index}.chatmode.md", + f"""--- +description: Review benchmark fixture {index} +--- +# Reviewer {index} + +Review for correctness, maintainability, and test coverage. +""", + ) + for index in range(24): + _write( + workdir / f"src/apm_bench/module_{index:02d}.py", + f'"""Benchmark source module {index}."""\n\nVALUE_{index} = {index}\n', + ) + + packages = [ + ("microsoft", "apm-package-alpha", "1.0.0"), + ("github", "agent-toolkit", "2.3.4"), + ] + for owner, repo, version in packages: + package_dir = workdir / "apm_modules" / owner / repo + _write( + package_dir / "apm.yml", + f"""name: {repo} +version: {version} +description: Fixture dependency package +author: benchmark +dependencies: + apm: [] + mcp: [] +""", + ) + _write( + package_dir / f".apm/instructions/{repo}.instructions.md", + f"""--- +applyTo: "**/*" +description: Installed package instruction for {repo} +--- +# {repo} + +Installed dependency instruction used by migration benchmarks. +""", + ) + + +def _write_compilation_project(workdir: Path) -> None: + _write_empty_project(workdir) + _write( + workdir / "apm.yml", + """name: compilation-project +version: 1.0.0 +description: Compilation benchmark fixture +author: benchmark +targets: + - copilot +dependencies: + apm: [] + mcp: [] +scripts: {} +includes: auto +""", + ) + _write( + workdir / ".apm/instructions/bench.instructions.md", + """--- +applyTo: "**/*" +description: Benchmark Instruction +--- +# Benchmark Instruction + +This content must be compiled into a target artifact. 
+""", + ) + + +def _write_local_install_project(workdir: Path) -> None: + _write_empty_project(workdir) + _write( + workdir / "apm.yml", + """name: local-install-project +version: 1.0.0 +description: Local install benchmark fixture +author: benchmark +targets: + - copilot +dependencies: + apm: [] + mcp: [] +scripts: {} +""", + ) + package_dir = workdir / "packages" / "local-tools" + _write( + package_dir / "apm.yml", + """name: local-tools +version: 1.0.0 +description: Local tools package +author: benchmark +targets: + - copilot +dependencies: + apm: [] + mcp: [] +scripts: {} +""", + ) + _write(package_dir / ".apm/instructions/tool.instructions.md", "# Local tools\n") + + +def _write_runnable_project(workdir: Path) -> None: + _write_empty_project(workdir) + _write( + workdir / "apm.yml", + """name: runnable-project +version: 1.0.0 +description: Runnable benchmark fixture +author: benchmark +targets: + - copilot +dependencies: + apm: [] + mcp: [] +scripts: + stamp: "printf real-run > run-stamp.txt" +""", + ) + + +def _write_audit_finding_project(workdir: Path) -> None: + _write_installed_project(workdir) + _write( + workdir / "apm_modules/unicode-package/SKILL.md", + "safe text \u202eevil text\n", + ) + _write( + workdir / "apm.lock.yaml", + """lockfile_version: "1" +dependencies: + - repo_url: local/unicode-package + resolved_commit: fixture + deployed_files: + - apm_modules/unicode-package/SKILL.md + deployed_file_hashes: {} +""", + ) + + +def _workspace(base: Path, command: BenchmarkCommand, run_index: int) -> Path: + if command.fixture == "none": + return base + + workdir = base / _safe_name(command.name) / str(run_index) workdir.mkdir(parents=True, exist_ok=True) - (workdir / "README.md").write_text("# Benchmark fixture\n", encoding="utf-8") + + if command.fixture == "empty-project": + _write_empty_project(workdir) + elif command.fixture == "installed-project": + _write_installed_project(workdir) + elif command.fixture == "compilation-project": + 
_write_compilation_project(workdir) + elif command.fixture == "local-install-project": + _write_local_install_project(workdir) + elif command.fixture == "runnable-project": + _write_runnable_project(workdir) + elif command.fixture == "audit-finding-project": + _write_audit_finding_project(workdir) + else: + raise ValueError(f"unknown benchmark fixture: {command.fixture}") + return workdir def _measure( *, binary: str, - args: list[str], - mutates_workspace: bool, + command: BenchmarkCommand, repeats: int, base: Path, - label: str, env: dict[str, str], ) -> dict[str, object]: base.mkdir(parents=True, exist_ok=True) samples: list[dict[str, object]] = [] for index in range(repeats): - cwd = _workspace(base, label, index) if mutates_workspace else base - samples.append(_run_once(binary, args, cwd, env)) + cwd = _workspace(base, command, index) + samples.append(_run_once(binary, command, cwd, env)) elapsed = [float(sample["elapsed_seconds"]) for sample in samples] return { @@ -72,6 +456,12 @@ def _measure( "min_seconds": min(elapsed), "max_seconds": max(elapsed), "returncodes": sorted({int(sample["returncode"]) for sample in samples}), + "checks_passed": all(bool(sample["checks_passed"]) for sample in samples), + "check_failures": [ + failure + for sample in samples + for failure in sample.get("check_failures", []) + ], "samples": samples, } @@ -90,15 +480,25 @@ def _markdown(results: list[dict[str, object]], max_ratio: float) -> str: lines = [ "## Migration CLI Benchmark", "", + "Includes fixture-backed commands that must read, write, execute, or fail " + "against real project state. " + "The installed-project fixture contains apm.yml, apm.lock.yaml, " + "apm_modules packages, local .apm primitives, target directories, " + "deployed prompt files, and sample source files.", + "The harness checks return-code parity for each command. 
Detailed stdout/stderr " + "byte counts are kept in the JSON samples, but this is not an output-parity test.", + "", f"Max allowed Go/Python median ratio: `{max_ratio:.2f}`", "", - "| Command | Python median | Go median | Go/Python | Result | Return codes |", - "|---|---:|---:|---:|---|---|", + "| Benchmark | Command | Fixture | Python median | Go median | Go/Python | Result | Return codes |", + "|---|---|---|---:|---:|---:|---|---|", ] for row in results: lines.append( - "| {command} | {python:.4f}s | {go:.4f}s | {ratio:.2f}x | {result} | {codes} |".format( + "| {name} | `{command}` | {fixture} | {python:.4f}s | {go:.4f}s | {ratio:.2f}x | {result} | {codes} |".format( + name=row["name"], command=row["command"], + fixture=row["fixture"], python=row["python_median_seconds"], go=row["go_median_seconds"], ratio=row["ratio"], @@ -106,6 +506,9 @@ def _markdown(results: list[dict[str, object]], max_ratio: float) -> str: codes=row["returncodes"], ) ) + lines.extend(["", "### Workloads", ""]) + for row in results: + lines.append(f"- **{row['name']}**: {row['workload']}") lines.append("") return "\n".join(lines) @@ -118,6 +521,11 @@ def main() -> int: parser.add_argument("--markdown-out", required=True) parser.add_argument("--max-ratio", type=float, default=5.0) parser.add_argument("--repeats", type=int, default=5) + parser.add_argument( + "--allow-failures", + action="store_true", + help="Write benchmark evidence without returning a failing exit code.", + ) args = parser.parse_args() env = os.environ.copy() @@ -133,23 +541,19 @@ def main() -> int: failures: list[str] = [] with tempfile.TemporaryDirectory(prefix="apm-migration-bench-") as tmp: base = Path(tmp) - for command, command_args, mutates_workspace in COMMANDS: + for command in COMMANDS: python_result = _measure( binary=args.python_bin, - args=command_args, - mutates_workspace=mutates_workspace, + command=command, repeats=args.repeats, - base=base / "python" / command, - label=command, + base=base / "python" / 
_safe_name(command.name), env=env, ) go_result = _measure( binary=args.go_bin, - args=command_args, - mutates_workspace=mutates_workspace, + command=command, repeats=args.repeats, - base=base / "go" / command, - label=command, + base=base / "go" / _safe_name(command.name), env=env, ) @@ -160,25 +564,44 @@ def main() -> int: "python": python_result["returncodes"], "go": go_result["returncodes"], } + row_failures: list[str] = [] + if python_result["returncodes"] != go_result["returncodes"]: + row_failures.append(f"return codes differ: {returncodes}") + if ratio > args.max_ratio: + row_failures.append( + f"Go median {ratio:.2f}x slower than Python " + f"(limit {args.max_ratio:.2f}x)" + ) + if not python_result["checks_passed"]: + row_failures.append( + f"Python artifact checks failed: {python_result['check_failures']}" + ) + if not go_result["checks_passed"]: + row_failures.append(f"Go artifact checks failed: {go_result['check_failures']}") + if command.expect_nonzero: + if all(code == 0 for code in python_result["returncodes"]): + row_failures.append("Python returned success for expected failure workload") + if all(code == 0 for code in go_result["returncodes"]): + row_failures.append("Go returned success for expected failure workload") row = { - "command": " ".join(command_args), + "name": command.name, + "command": " ".join(command.args), + "fixture": command.fixture, + "workload": command.workload, "python": python_result, "go": go_result, "python_median_seconds": python_median, "go_median_seconds": go_median, "ratio": ratio, "returncodes": returncodes, + "passed": not row_failures, + "failures": row_failures, } results.append(row) - if python_result["returncodes"] != go_result["returncodes"]: - failures.append(f"{command}: return codes differ: {returncodes}") - if ratio > args.max_ratio: - failures.append( - f"{command}: Go median {ratio:.2f}x slower than Python " - f"(limit {args.max_ratio:.2f}x)" - ) + for failure in row_failures: + 
failures.append(f"{command.name}: {failure}") json_path = Path(args.json_out) markdown_path = Path(args.markdown_out) @@ -189,9 +612,11 @@ def main() -> int: print(markdown_path.read_text(encoding="utf-8")) if failures: + annotation = "warning" if args.allow_failures else "error" for failure in failures: - print(f"::error::{failure}") - return 1 + print(f"::{annotation}::{failure}") + if not args.allow_failures: + return 1 return 0 diff --git a/tests/parity/python_contract_coverage.yml b/tests/parity/python_contract_coverage.yml index ef69606f..b571f688 100644 --- a/tests/parity/python_contract_coverage.yml +++ b/tests/parity/python_contract_coverage.yml @@ -20219,12 +20219,18 @@ python_tests: - tests/unit/test_crane_score.py::test_crane_score_reaches_one_with_completion_tests_and_explicit_behavior_gate - tests/unit/test_crane_score.py::test_crane_score_does_not_infer_behavior_contracts_from_test_name - tests/unit/test_crane_score.py::test_crane_score_blocks_incomplete_behavior_contract_gate + - tests/unit/test_crane_score.py::test_crane_score_reaches_one_with_completion_tests_and_explicit_real_gates + - tests/unit/test_crane_score.py::test_crane_score_does_not_infer_completion_gates_from_test_names + - tests/unit/test_crane_score.py::test_crane_score_blocks_incomplete_real_functional_gate + - tests/unit/test_crane_score.py::test_crane_score_blocks_legacy_benchmark_bool_without_real_counts - tests/unit/test_crane_score.py::test_crane_score_blocks_known_exceptions - tests/unit/test_crane_workflow_prompt.py::test_crane_acceptance_requires_shared_iteration_summary_for_pr_updates - tests/unit/test_crane_workflow_prompt.py::test_crane_commit_guidance_provides_structured_summary_fallback - tests/unit/test_crane_workflow_prompt.py::test_crane_prompt_blocks_stale_completed_state_from_finishing - tests/unit/test_crane_workflow_prompt.py::test_crane_completion_is_two_phase_and_pr_head_gated - 
tests/unit/test_crane_workflow_prompt.py::test_crane_state_template_tracks_completion_candidate_gate + - tests/unit/test_migration_ci_workflow.py::test_migration_ci_enforces_completion_for_crane_prs_and_explicit_manual_runs + - tests/unit/test_migration_ci_workflow.py::test_migration_ci_collects_incomplete_evidence_for_non_crane_prs - tests/unit/test_cursor_mcp.py::TestCursorClientFactory::test_create_cursor_client - tests/unit/test_cursor_mcp.py::TestCursorClientFactory::test_create_cursor_client_case_insensitive - tests/unit/test_cursor_mcp.py::TestCursorClientAdapter::test_config_path_is_repo_local diff --git a/tests/unit/test_crane_score.py b/tests/unit/test_crane_score.py index 55f20713..455d26c8 100644 --- a/tests/unit/test_crane_score.py +++ b/tests/unit/test_crane_score.py @@ -76,7 +76,7 @@ def _deletion_gates() -> list[str]: '{"crane":"gate","name":"python_behavior_contracts","passing":1,"total":1}', '{"crane":"gate","name":"known_exceptions","count":0}', '{"crane":"gate","name":"python_tests","passed":true}', - '{"crane":"gate","name":"benchmarks","passed":true}', + '{"crane":"gate","name":"benchmarks","passing":1,"total":1}', ] @@ -111,6 +111,22 @@ def _behavior_contract_gate_output(passing: int, total: int) -> str: ) +def _ratio_gate_output(test: str, name: str, passing: int, total: int) -> str: + return _event( + "output", + test, + output=json.dumps( + { + "crane": "gate", + "name": name, + "passing": passing, + "total": total, + } + ) + + "\n", + ) + + def _gates(score: dict[str, object]) -> dict[str, dict[str, object]]: gates = score["gates"] assert isinstance(gates, list) @@ -183,7 +199,7 @@ def test_crane_score_can_reach_one_with_all_deletion_grade_gates() -> None: '{"crane":"gate","name":"python_behavior_contracts","passing":0,"total":1}', '{"crane":"gate","name":"known_exceptions","count":1}', '{"crane":"gate","name":"python_tests","passed":false}', - '{"crane":"gate","name":"benchmarks","passed":false}', + 
'{"crane":"gate","name":"benchmarks","passing":0,"total":1}', ], ) def test_crane_score_full_parity_but_bad_deletion_gate_cannot_reach_one( @@ -236,12 +252,12 @@ def test_crane_score_rejects_empty_event_stream() -> None: assert "empty or incomplete" in result.stderr -def test_crane_score_reaches_one_with_completion_tests_and_explicit_behavior_gate() -> None: +def test_crane_score_reaches_one_with_completion_tests_and_explicit_real_gates() -> None: score = _run_score( [ *_parity_passes(293), *_completion_gate_events(), - _behavior_contract_gate_output(1, 1), + *_deletion_gates(), _package_pass(), ] ) @@ -252,21 +268,30 @@ def test_crane_score_reaches_one_with_completion_tests_and_explicit_behavior_gat assert all(gate["passing"] for gate in _gates(score).values()) -def test_crane_score_does_not_infer_behavior_contracts_from_test_name() -> None: +def test_crane_score_does_not_infer_completion_gates_from_test_names() -> None: score = _run_score([*_parity_passes(293), *_completion_gate_events(), _package_pass()]) gates = _gates(score) assert score["progress"] == 1.0 assert score["migration_score"] < 1.0 assert score["deletion_grade_ready"] is False + assert gates["functional_contracts"]["passing"] is False + assert gates["state_diff_contracts"]["passing"] is False assert gates["python_behavior_contracts"]["passing"] is False + assert gates["benchmarks_pass"]["passing"] is False def test_crane_score_blocks_incomplete_behavior_contract_gate() -> None: + gates = [ + line + for line in _deletion_gates() + if json.loads(line)["name"] != "python_behavior_contracts" + ] score = _run_score( [ *_parity_passes(293), *_completion_gate_events(), + *gates, _behavior_contract_gate_output(0, 1), _package_pass(), ] @@ -279,11 +304,55 @@ def test_crane_score_blocks_incomplete_behavior_contract_gate() -> None: assert gates["python_behavior_contracts"]["passing"] is False +def test_crane_score_blocks_incomplete_real_functional_gate() -> None: + gates = [line for line in _deletion_gates() 
if json.loads(line)["name"] != "functional"] + score = _run_score( + [ + *_parity_passes(293), + *_completion_gate_events(), + *gates, + _ratio_gate_output( + "TestParityRealFunctionalAndStateDiffContracts", + "functional", + 0, + 1, + ), + _package_pass(), + ] + ) + gates = _gates(score) + + assert score["progress"] == 1.0 + assert score["migration_score"] < 1.0 + assert score["deletion_grade_ready"] is False + assert gates["functional_contracts"]["passing"] is False + + +def test_crane_score_blocks_legacy_benchmark_bool_without_real_counts() -> None: + gates = [line for line in _deletion_gates() if json.loads(line)["name"] != "benchmarks"] + score = _run_score( + [ + *_parity_passes(293), + *_completion_gate_events(), + *gates, + '{"crane":"gate","name":"benchmarks","passed":true}', + _package_pass(), + ] + ) + gates = _gates(score) + + assert score["progress"] == 1.0 + assert score["migration_score"] < 1.0 + assert score["deletion_grade_ready"] is False + assert gates["benchmarks_pass"]["passing"] is False + + def test_crane_score_blocks_known_exceptions() -> None: score = _run_score( [ *_parity_passes(293), *_completion_gate_events(), + *_deletion_gates(), _event("output", "TestParityCompletionHelpIdentical", output="APPROVED-EXCEPTION: no"), _package_pass(), ] diff --git a/tests/unit/test_mcp_integrator_install_hermetic.py b/tests/unit/test_mcp_integrator_install_hermetic.py index 0e4f5c18..14da2bab 100644 --- a/tests/unit/test_mcp_integrator_install_hermetic.py +++ b/tests/unit/test_mcp_integrator_install_hermetic.py @@ -453,6 +453,10 @@ def test_all_excluded_warns_and_returns_zero(self, tmp_path): "apm_cli.runtime.utils.find_runtime_binary", return_value=None, ), + patch( + "apm_cli.integration.mcp_integrator_install.find_runtime_binary", + return_value=None, + ), patch( "apm_cli.integration.mcp_integrator.MCPIntegrator._gate_project_scoped_runtimes", side_effect=lambda rts, **kw: rts, diff --git a/tests/unit/test_mcp_integrator_install_phase3w4.py 
b/tests/unit/test_mcp_integrator_install_phase3w4.py index 1ccb87d7..c2e3a93a 100644 --- a/tests/unit/test_mcp_integrator_install_phase3w4.py +++ b/tests/unit/test_mcp_integrator_install_phase3w4.py @@ -453,6 +453,10 @@ def test_all_excluded_warns_and_returns_zero(self, tmp_path): "apm_cli.runtime.utils.find_runtime_binary", return_value=None, ), + patch( + "apm_cli.integration.mcp_integrator_install.find_runtime_binary", + return_value=None, + ), patch( "apm_cli.integration.mcp_integrator.MCPIntegrator._gate_project_scoped_runtimes", side_effect=lambda rts, **kw: rts, diff --git a/tests/unit/test_migration_ci_workflow.py b/tests/unit/test_migration_ci_workflow.py new file mode 100644 index 00000000..9b9fde79 --- /dev/null +++ b/tests/unit/test_migration_ci_workflow.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +WORKFLOW = ROOT / ".github" / "workflows" / "migration-ci.yml" + + +def _workflow_text() -> str: + return WORKFLOW.read_text(encoding="utf-8") + + +def test_migration_ci_enforces_completion_for_crane_prs_and_explicit_manual_runs() -> None: + text = _workflow_text() + + assert "enforce_completion:" in text + assert "MIGRATION_COMPLETION_ENFORCED=$enforce_completion" in text + assert "APM_ENFORCE_COMPLETION_GATES=1" in text + assert "inputs.enforce_completion == true" in text + assert 'github.event.pull_request.head.ref }}" == crane/*' in text + assert "manual runs with enforce_completion=true" in text + + +def test_migration_ci_collects_incomplete_evidence_for_non_crane_prs() -> None: + text = _workflow_text() + + assert "--allow-failures" in text + assert "Non-enforcing migration evidence run" in text + assert "Python behavior contract tests are incomplete in collection mode." in text + assert "Go parity tests are incomplete in collection mode." 
in text diff --git a/tests/unit/test_readme_go_cli_migration.py b/tests/unit/test_readme_go_cli_migration.py index ce828eb7..e1f51c37 100644 --- a/tests/unit/test_readme_go_cli_migration.py +++ b/tests/unit/test_readme_go_cli_migration.py @@ -22,7 +22,8 @@ def test_readme_documents_go_cli_migration_usage() -> None: "gh workflow run migration-ci.yml --repo githubnext/apm --ref main", "`migration-benchmark-evidence`", "`Go/Python` ratio is the Go median duration divided by the Python median", - "`327x`-`370x` faster", + "fixture-backed commands that read, write, execute, or fail against realistic APM", + "`apm.yml`, `apm.lock.yaml`, installed `apm_modules`", ] for snippet in required_snippets: diff --git a/tests/unit/test_runtime_windows.py b/tests/unit/test_runtime_windows.py index bfcab7e1..98fff4e8 100644 --- a/tests/unit/test_runtime_windows.py +++ b/tests/unit/test_runtime_windows.py @@ -261,6 +261,7 @@ def test_execute_runtime_command_uses_shlex_on_unix(self): with ( patch("sys.platform", "linux"), + patch("apm_cli.core.script_runner.find_runtime_binary", return_value=None), patch("subprocess.run", return_value=MagicMock(returncode=0)) as mock_run, ): runner._execute_runtime_command("codex --quiet", "prompt content", env)