Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 86 additions & 0 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,92 @@ jobs:
run: |
go tool -modfile=tools/task/go.mod task test-sandbox

# Nightly-only job that runs the terraform/direct create-payload parity fuzz
# tests via `task test-fuzz`.
test-fuzz:
needs:
- cleanups

# The terraform/direct create-payload parity tests run two real `bundle deploy`
# invocations per seed, so they are too slow for every PR and too noisy to gate
# the merge queue. Run them only on the nightly schedule to catch engine drift.
# For the same reasons this job is deliberately not part of test-result.
if: ${{ github.event_name == 'schedule' }}
name: "task test-fuzz"
runs-on:
group: databricks-protected-runner-group-large
labels: linux-ubuntu-latest-large

defaults:
run:
shell: bash

permissions:
# NOTE(review): presumably required for OIDC auth during build setup; confirm.
id-token: write
contents: read
# Needed by the failure-reporting step below to open/comment a tracking issue.
issues: write

steps:
- name: Checkout repository and submodules
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Setup build environment
uses: ./.github/actions/setup-build-environment
with:
cache-key: test-fuzz

- name: Run tests
env:
# Shift the seed window by the run number on every nightly run so CI
# explores configs it has never tested before instead of re-checking a
# fixed set. The window is kept modest (each seed runs two real deploys)
# since the exploration comes from rotating the window, not its size;
# raise it once nightly timings are known. A divergence prints
# FUZZ_SEED=<n> for one-command reproduction.
#
# offset = GITHUB_RUN_NUMBER * FUZZ_SEEDS. GITHUB_RUN_NUMBER is a
# built-in, monotonically increasing, unique-per-run integer, so as long
# as FUZZ_SEEDS is constant the windows are non-overlapping (gaps from
# non-schedule runs are fine; we only need fresh seeds, not every seed).
#
# Re-running an existing workflow run keeps its GITHUB_RUN_NUMBER, so a
# re-run re-checks the same seed window instead of moving to a fresh one.
FUZZ_SEEDS: "25"
run: |
export FUZZ_SEED_OFFSET=$(( GITHUB_RUN_NUMBER * FUZZ_SEEDS ))
go tool -modfile=tools/task/go.mod task test-fuzz

# This job is intentionally excluded from test-result, so a failure here is
# invisible unless someone watches the Actions tab. Surface it as a GitHub
# issue instead. Reuse a single open issue (deduped by label) so a recurring
# divergence doesn't open one issue per night.
#
# failure() is true when any earlier step of this job failed, so this step also
# fires when checkout or build setup fails, not only on a parity divergence.
#
# The script first tries to create the fuzz-nightly label, ignoring any error
# (for example when the label already exists). It then comments on the first
# open issue carrying that label, or opens a new issue when there is none.
- name: Report failure
if: ${{ failure() }}
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
run: |
gh label create fuzz-nightly \
--description "Nightly terraform/direct create-payload parity failures" \
--color FBCA04 2>/dev/null || true

body=$(cat <<EOF
The nightly terraform/direct create-payload parity job (\`task test-fuzz\`) failed.

Run: $RUN_URL

The failing seed(s) are printed in the job log as \`reproduce with: FUZZ_SEED=<n>\`.
Reproduce locally with:

\`\`\`
FUZZ_SEED=<seed> go test ./bundle/fuzz -run TestJobCreateParity
\`\`\`
EOF
)

existing=$(gh issue list --state open --label fuzz-nightly --json number --jq '.[0].number')
if [ -n "$existing" ]; then
gh issue comment "$existing" --body "$body"
else
gh issue create --title "Nightly fuzz parity failure" --label fuzz-nightly --body "$body"
fi

# This job groups the result of all the above test jobs.
# It is a required check, so it blocks auto-merge and the merge queue.
#
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ tools/testmask/testmask
# Release artifacts
dist/

# Terraform binary + provider mirror provisioned by acceptance/install_terraform.py
# for the bundle/fuzz parity tests (see Taskfile `test-fuzz`).
/build/

# Local development notes, tmp
/pr-*
/tmp/
Expand Down
17 changes: 17 additions & 0 deletions Taskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,23 @@ tasks:
--packages ./acceptance/... \
-- -timeout=${LOCAL_TIMEOUT:-30m} -run "TestAccept/cmd/sandbox"

test-fuzz:
desc: Run terraform/direct create-payload parity fuzz tests (provisions terraform)
# No `sources:` fingerprint: the seeds checked are a function of the FUZZ_SEED,
# FUZZ_SEEDS, and FUZZ_SEED_OFFSET env vars, which Task can't see. Skipping on
# an unchanged source checksum would silently no-op a FUZZ_SEED=<n> repro run
# or a shifted nightly window, so always run.
cmds:
# The parity harness expects terraform + the provider mirror at <repo>/build;
# RequireTerraform skips when it's absent, so provision it first.
- python3 acceptance/install_terraform.py --targetdir build
# Run every package under ./bundle/fuzz through gotestsum. GOTESTSUM_FORMAT
# and LOCAL_TIMEOUT override the defaults (pkgname-and-test-fails and 30m).
- |
{{.GO_TOOL}} gotestsum \
--format ${GOTESTSUM_FORMAT:-pkgname-and-test-fails} \
--no-summary=skipped \
--packages ./bundle/fuzz/... \
-- -timeout=${LOCAL_TIMEOUT:-30m}

# --- Integration tests ---

integration:
Expand Down
2 changes: 2 additions & 0 deletions acceptance/bundle/deploy/wal/chain-3-jobs/output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ Exit code: [KILLED]
{
"new_cluster": {
"node_type_id": "[NODE_TYPE_ID]",
"num_workers": 0,
"spark_version": "15.4.x-scala2.12"
},
"spark_python_task": {
Expand Down Expand Up @@ -73,6 +74,7 @@ Exit code: [KILLED]
{
"new_cluster": {
"node_type_id": "[NODE_TYPE_ID]",
"num_workers": 0,
"spark_version": "15.4.x-scala2.12"
},
"spark_python_task": {
Expand Down
1 change: 1 addition & 0 deletions acceptance/bundle/deploy/wal/crash-after-create/output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Exit code: [KILLED]
{
"new_cluster": {
"node_type_id": "[NODE_TYPE_ID]",
"num_workers": 0,
"spark_version": "15.4.x-scala2.12"
},
"spark_python_task": {
Expand Down
2 changes: 2 additions & 0 deletions acceptance/bundle/override/job_tasks/output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
},
{
"new_cluster": {
"num_workers": 0,
"spark_version": "13.3.x-scala2.12"
},
"spark_python_task": {
Expand All @@ -42,6 +43,7 @@ Exit code: 1
"tasks": [
{
"new_cluster": {
"num_workers": 0,
"spark_version": "13.3.x-scala2.12"
},
"spark_python_task": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
"new_cluster": {
"custom_tags": {
"ResourceClass": "SingleNode"
}
},
"num_workers": 0
},
"task_key": "test-task"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
"new_cluster": {
"custom_tags": {
"ResourceClass": "SingleNode"
}
},
"num_workers": 0
},
"task_key": "test-task"
}
Expand Down
1 change: 1 addition & 0 deletions bundle/config/mutator/resourcemutator/cluster_fixups.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ func prepareJobSettingsForUpdate(js *jobs.JobSettings) {
for _, task := range js.Tasks {
if task.NewCluster != nil {
ModifyRequestOnInstancePool(task.NewCluster)
initializeNumWorkers(task.NewCluster)
}
}
for ind := range js.JobClusters {
Expand Down
61 changes: 61 additions & 0 deletions bundle/fuzz/capture.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package fuzz

import (
"encoding/json"
"sync"

"github.com/databricks/cli/libs/testserver"
)

// jobsCreatePath is the Jobs API route both engines must hit on create. The
// direct engine posts here via the SDK, and the terraform provider is expected
// to do the same. The testserver registers only this exact route, so if an
// engine ever posted to a different API version the deploy would 404 and
// CaptureJobCreate would fail with "did not POST". A version skew therefore
// surfaces as a capture failure, not as a payload diff.
const jobsCreatePath = "/api/2.2/jobs/create"

// CapturedRequest is a single mutating API request observed by the testserver.
type CapturedRequest struct {
// Method is the HTTP method of the observed request.
Method string
// Path is the URL path of the observed request (no query string).
Path string
// Body is a private copy of the request body when it is valid JSON, and nil
// otherwise.
Body json.RawMessage
}

// recorder collects request bodies sent to a testserver. It is safe for
// concurrent use because the SDK and terraform may issue requests from multiple
// goroutines.
type recorder struct {
// mu guards requests.
mu sync.Mutex
// requests holds the captured requests in the order they were recorded.
requests []CapturedRequest
}

// callback records one request observed by the testserver. The method and URL
// path are always stored; the body is stored only when it is valid JSON, and is
// left nil otherwise.
func (r *recorder) callback(req *testserver.Request) {
	r.mu.Lock()
	defer r.mu.Unlock()

	entry := CapturedRequest{
		Method: req.Method,
		Path:   req.URL.Path,
	}

	if json.Valid(req.Body) {
		// Take a private copy: testserver reuses the underlying buffer across
		// requests, so keeping a reference to req.Body would not be safe.
		entry.Body = make(json.RawMessage, len(req.Body))
		copy(entry.Body, req.Body)
	}

	r.requests = append(r.requests, entry)
}

// find scans the recorded requests in arrival order and returns the body of the
// first one whose method and path both match. The boolean reports whether a
// match was found; the returned body may be nil for a matched request whose
// body was not valid JSON.
func (r *recorder) find(method, path string) (json.RawMessage, bool) {
	r.mu.Lock()
	defer r.mu.Unlock()

	for i := range r.requests {
		candidate := &r.requests[i]
		if candidate.Method != method || candidate.Path != path {
			continue
		}
		return candidate.Body, true
	}
	return nil, false
}
Loading
Loading