databricks · radakam · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 24, 2026
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
@@ -370,6 +370,92 @@ jobs:
         run: |
           go tool -modfile=tools/task/go.mod task test-sandbox
 
+  test-fuzz:
+    needs:
+      - cleanups
+
+    # The terraform/direct create-payload parity tests run two real `bundle deploy`
+    # invocations per seed, so they are too slow for every PR and too noisy to gate
+    # the merge queue. Run them on the nightly schedule to catch engine drift; not
+    # part of test-result for that reason.
+    if: ${{ github.event_name == 'schedule' }}
+    name: "task test-fuzz"
+    runs-on:
+      group: databricks-protected-runner-group-large
+      labels: linux-ubuntu-latest-large
+
+    defaults:
+      run:
+        shell: bash
+
+    permissions:
+      id-token: write
+      contents: read
+      # Needed by the failure-reporting step below to open/comment a tracking issue.
+      issues: write
+
+    steps:
+      - name: Checkout repository and submodules
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Setup build environment
+        uses: ./.github/actions/setup-build-environment
+        with:
+          cache-key: test-fuzz
+
+      - name: Run tests
+        env:
+          # Shift the seed window on every nightly run (by the run number) so CI
+          # explores configs it has never tested before instead of re-checking a
+          # fixed set. The window is kept modest (each seed runs two real deploys)
+          # since the exploration comes from rotating the window, not its size;
+          # raise it once nightly timings are known. A divergence prints
+          # FUZZ_SEED=<n> for one-command reproduction.
+          #
+          # offset = GITHUB_RUN_NUMBER * FUZZ_SEEDS. GITHUB_RUN_NUMBER is a
+          # built-in counter that increments per workflow run (re-runs keep it), so
+          # while FUZZ_SEEDS is constant the windows do not overlap (gaps from
+          # non-schedule runs are fine; we only need fresh seeds, not every seed).
+          FUZZ_SEEDS: "25"
+        run: |
+          export FUZZ_SEED_OFFSET=$(( GITHUB_RUN_NUMBER * FUZZ_SEEDS ))
+          go tool -modfile=tools/task/go.mod task test-fuzz
+
+      # This job is intentionally excluded from test-result, so a failure here is
+      # invisible unless someone watches the Actions tab. Surface it as a GitHub
+      # issue instead. Reuse a single open issue (deduped by label) so a recurring
+      # divergence doesn't open one issue per night.
+      - name: Report failure
+        if: ${{ failure() }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+        run: |
+          gh label create fuzz-nightly \
+            --description "Nightly terraform/direct create-payload parity failures" \
+            --color FBCA04 2>/dev/null || true
+
+          body=$(cat <<EOF
+          The nightly terraform/direct create-payload parity job (\`task test-fuzz\`) failed.
+
+          Run: $RUN_URL
+
+          The failing seed(s) are printed in the job log as \`reproduce with: FUZZ_SEED=<n>\`.
+          Reproduce locally with:
+
+          \`\`\`
+          FUZZ_SEED=<seed> go test ./bundle/fuzz -run TestJobCreateParity
+          \`\`\`
+          EOF
+          )
+
+          existing=$(gh issue list --state open --label fuzz-nightly --json number --jq '.[0].number')
+          if [ -n "$existing" ]; then
+            gh issue comment "$existing" --body "$body"
+          else
+            gh issue create --title "Nightly fuzz parity failure" --label fuzz-nightly --body "$body"
+          fi
+
   # This job groups the result of all the above test jobs.
   # It is a required check, so it blocks auto-merge and the merge queue.
   #

diff --git a/.gitignore b/.gitignore
@@ -58,6 +58,10 @@ tools/testmask/testmask
 # Release artifacts
 dist/
 
+# Terraform binary + provider mirror provisioned by acceptance/install_terraform.py
+# for the bundle/fuzz parity tests (see Taskfile `test-fuzz`).
+/build/
+
 # Local development notes, tmp
 /pr-*
 /tmp/

diff --git a/Taskfile.yml b/Taskfile.yml
@@ -678,6 +678,23 @@ tasks:
           --packages ./acceptance/... \
           -- -timeout=${LOCAL_TIMEOUT:-30m} -run "TestAccept/cmd/sandbox"
 
+  test-fuzz:
+    desc: Run terraform/direct create-payload parity fuzz tests (provisions terraform)
+    # No `sources:` fingerprint: the seeds checked are a function of the FUZZ_SEED,
+    # FUZZ_SEEDS, and FUZZ_SEED_OFFSET env vars, which Task can't see. Skipping on
+    # an unchanged source checksum would silently no-op a FUZZ_SEED=<n> repro run
+    # or a shifted nightly window, so always run.
+    cmds:
+      # The parity harness expects terraform + the provider mirror at <repo>/build;
+      # RequireTerraform skips when it's absent, so provision it first.
+      - python3 acceptance/install_terraform.py --targetdir build
+      - |
+        {{.GO_TOOL}} gotestsum \
+          --format ${GOTESTSUM_FORMAT:-pkgname-and-test-fails} \
+          --no-summary=skipped \
+          --packages ./bundle/fuzz/... \
+          -- -timeout=${LOCAL_TIMEOUT:-30m}
+
   # --- Integration tests ---
 
   integration:

diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt
@@ -35,6 +35,7 @@ Exit code: [KILLED]
         {
           "new_cluster": {
             "node_type_id": "[NODE_TYPE_ID]",
+            "num_workers": 0,
             "spark_version": "15.4.x-scala2.12"
           },
           "spark_python_task": {
@@ -73,6 +74,7 @@ Exit code: [KILLED]
         {
           "new_cluster": {
             "node_type_id": "[NODE_TYPE_ID]",
+            "num_workers": 0,
             "spark_version": "15.4.x-scala2.12"
           },
           "spark_python_task": {

diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt
@@ -39,6 +39,7 @@ Exit code: [KILLED]
         {
           "new_cluster": {
             "node_type_id": "[NODE_TYPE_ID]",
+            "num_workers": 0,
             "spark_version": "15.4.x-scala2.12"
           },
           "spark_python_task": {

diff --git a/acceptance/bundle/override/job_tasks/output.txt b/acceptance/bundle/override/job_tasks/output.txt
@@ -18,6 +18,7 @@
     },
     {
       "new_cluster": {
+        "num_workers": 0,
         "spark_version": "13.3.x-scala2.12"
       },
       "spark_python_task": {
@@ -42,6 +43,7 @@ Exit code: 1
   "tasks": [
     {
       "new_cluster": {
+        "num_workers": 0,
         "spark_version": "13.3.x-scala2.12"
       },
       "spark_python_task": {

diff --git a/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json b/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json
@@ -30,7 +30,8 @@
           "new_cluster": {
             "custom_tags": {
               "ResourceClass": "SingleNode"
-            }
+            },
+            "num_workers": 0
           },
           "task_key": "test-task"
         }

diff --git a/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json b/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json
@@ -30,7 +30,8 @@
           "new_cluster": {
             "custom_tags": {
               "ResourceClass": "SingleNode"
-            }
+            },
+            "num_workers": 0
           },
           "task_key": "test-task"
         }

diff --git a/bundle/config/mutator/resourcemutator/cluster_fixups.go b/bundle/config/mutator/resourcemutator/cluster_fixups.go
@@ -94,6 +94,7 @@ func prepareJobSettingsForUpdate(js *jobs.JobSettings) {
 	for _, task := range js.Tasks {
 		if task.NewCluster != nil {
 			ModifyRequestOnInstancePool(task.NewCluster)
+			initializeNumWorkers(task.NewCluster)
 		}
 	}
 	for ind := range js.JobClusters {

diff --git a/bundle/fuzz/capture.go b/bundle/fuzz/capture.go
@@ -0,0 +1,61 @@
+package fuzz
+
+import (
+	"encoding/json"
+	"sync"
+
+	"github.com/databricks/cli/libs/testserver"
+)
+
+// jobsCreatePath is the Jobs API route both engines must hit on create. The
+// direct engine posts here via the SDK, and the terraform provider is expected
+// to as well. The testserver registers only this exact route, so if an engine
+// ever posted to a different API version the deploy would 404 and
+// CaptureJobCreate would fail with "did not POST". An API version skew therefore
+// surfaces as a capture failure, not as a payload diff.
+const jobsCreatePath = "/api/2.2/jobs/create"
+
+// CapturedRequest is a single mutating API request observed by the testserver.
+type CapturedRequest struct {
+	Method string
+	Path   string
+	Body   json.RawMessage // left nil by recorder.callback when the payload is not valid JSON
+}
+
+// recorder collects the requests (method, path, JSON body) sent to a testserver.
+// It is safe for concurrent use because the SDK and terraform may issue requests
+// from multiple goroutines.
+type recorder struct {
+	mu       sync.Mutex // guards requests
+	requests []CapturedRequest
+}
+
+func (r *recorder) callback(req *testserver.Request) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	var body json.RawMessage // stays nil when req.Body is not valid JSON
+	if json.Valid(req.Body) {
+		// Copy: testserver reuses the underlying buffer across requests.
+		body = append(json.RawMessage(nil), req.Body...)
+	}
+
+	r.requests = append(r.requests, CapturedRequest{
+		Method: req.Method,
+		Path:   req.URL.Path,
+		Body:   body,
+	})
+}
+
+// find returns the body (possibly nil) of the first recorded request matching method and path, and whether one matched.
+func (r *recorder) find(method, path string) (json.RawMessage, bool) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	for _, req := range r.requests {
+		if req.Method == method && req.Path == path {
+			return req.Body, true
+		}
+	}
+	return nil, false // no recorded request matched
+}