From 05ddd2da0cc8bc27d29d29eace663d84225f8e57 Mon Sep 17 00:00:00 2001 From: Ilya Kuznetsov Date: Mon, 15 Jun 2026 17:45:55 +0200 Subject: [PATCH] configsync: don't sync policy-injected custom_tags and cluster_log_conf Cluster policies inject fixed values for custom_tags and cluster_log_conf when the user omits them, so these fields exist only remotely. config-remote-sync wrote them back into databricks.yml as adds, leaking one environment's policy values into shared config and causing deploys in other environments to be rejected by their own policy. Skip these fields in configsync when they are absent from config, via a new backendDefault marker (skip regardless of remote value when not set locally), while still syncing them for users who manage them explicitly. Add an acceptance test that simulates policy injection via jobs reset and verifies the fields are no longer written back, while a legitimate edit still syncs. --- .../databricks.yml.tmpl | 29 ++++++++++++ .../out.test.toml | 4 ++ .../policy_injected_cluster_fields/output.txt | 38 ++++++++++++++++ .../policy_injected_cluster_fields/script | 44 +++++++++++++++++++ .../policy_injected_cluster_fields/test.toml | 10 +++++ bundle/configsync/defaults.go | 20 +++++++++ 6 files changed, 145 insertions(+) create mode 100644 acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/databricks.yml.tmpl create mode 100644 acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/out.test.toml create mode 100644 acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/output.txt create mode 100644 acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/script create mode 100644 acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/test.toml diff --git a/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/databricks.yml.tmpl b/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/databricks.yml.tmpl new file mode 100644 index 00000000000..62dd2814a7e --- /dev/null +++ b/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/databricks.yml.tmpl @@ -0,0 +1,29 @@ +bundle: + name: test-bundle-$UNIQUE_NAME + +targets: + default: + mode: development + +resources: + jobs: + job1: + max_concurrent_runs: 1 + job_clusters: + - job_cluster_key: shared + new_cluster: + spark_version: $DEFAULT_SPARK_VERSION + node_type_id: $NODE_TYPE_ID + num_workers: 1 + tasks: + - task_key: shared_cluster_task + job_cluster_key: shared + notebook_task: + notebook_path: /Users/{{workspace_user_name}}/notebook + - task_key: own_cluster_task + notebook_task: + notebook_path: /Users/{{workspace_user_name}}/notebook + new_cluster: + spark_version: $DEFAULT_SPARK_VERSION + node_type_id: $NODE_TYPE_ID + num_workers: 1 diff --git a/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/out.test.toml b/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/out.test.toml new file mode 100644 index 00000000000..579b1e4a3c9 --- /dev/null +++ b/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/out.test.toml @@ -0,0 +1,4 @@ +Local = true +Cloud = true +GOOS.windows = false +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct", "terraform"] diff --git a/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/output.txt b/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/output.txt new file mode 100644 index 00000000000..b6d44241592 --- /dev/null +++ b/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/output.txt @@ -0,0 +1,38 @@ +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +=== Simulate a cluster policy injecting custom_tags and cluster_log_conf +These fields exist only remotely (the user never set them in config). +A legitimate edit (max_concurrent_runs) is made too, to show real edits still sync. + +=== Detect and save all changes +Detected changes in 1 resource(s): + +Resource: resources.jobs.job1 + max_concurrent_runs: replace + + + +=== Configuration changes + +>>> diff.py databricks.yml.backup databricks.yml +--- databricks.yml.backup ++++ databricks.yml +@@ -9,5 +9,5 @@ + jobs: + job1: +- max_concurrent_runs: 1 ++ max_concurrent_runs: 5 + job_clusters: + - job_cluster_key: shared + +>>> [CLI] bundle destroy --auto-approve +The following resources will be deleted: + delete resources.jobs.job1 + +All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default + +Deleting files... +Destroy complete! diff --git a/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/script b/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/script new file mode 100644 index 00000000000..bcc9844b03f --- /dev/null +++ b/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/script @@ -0,0 +1,44 @@ +#!/bin/bash + +envsubst < databricks.yml.tmpl > databricks.yml + +cleanup() { + trace $CLI bundle destroy --auto-approve +} +trap cleanup EXIT + +$CLI bundle deploy +job1_id="$(read_id.py job1)" + +title "Simulate a cluster policy injecting custom_tags and cluster_log_conf" +echo +echo "These fields exist only remotely (the user never set them in config)." +echo "A legitimate edit (max_concurrent_runs) is made too, to show real edits still sync." +edit_resource.py jobs $job1_id <<'EOF' +r["max_concurrent_runs"] = 5 + +# A DBFS destination is used instead of S3: the real backend rejects an S3 +# cluster log destination without a per-cluster instance-profile ARN, and the +# field's value is irrelevant here — only that it exists remotely and not in config. +policy_tags = {"CostCenter": "dev-1234", "Team": "finops"} +policy_log_conf = {"dbfs": {"destination": "dbfs:/cluster-logs/dev"}} + +for task in r.get("tasks", []): + if "new_cluster" in task: + task["new_cluster"]["custom_tags"] = dict(policy_tags) + task["new_cluster"]["cluster_log_conf"] = dict(policy_log_conf) + +for jc in r.get("job_clusters", []): + jc["new_cluster"]["custom_tags"] = dict(policy_tags) + jc["new_cluster"]["cluster_log_conf"] = dict(policy_log_conf) +EOF + +title "Detect and save all changes" +echo +cp databricks.yml databricks.yml.backup +$CLI bundle config-remote-sync --save + +title "Configuration changes" +echo +trace diff.py databricks.yml.backup databricks.yml +rm databricks.yml.backup diff --git a/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/test.toml b/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/test.toml new file mode 100644 index 00000000000..1067069b9fa --- /dev/null +++ b/acceptance/bundle/config-remote-sync/policy_injected_cluster_fields/test.toml @@ -0,0 +1,10 @@ +Cloud = true + +RecordRequests = false +Ignore = [".databricks", "databricks.yml", "databricks.yml.backup"] + +[Env] +DATABRICKS_BUNDLE_ENABLE_EXPERIMENTAL_YAML_SYNC = "true" + +[EnvMatrix] +DATABRICKS_BUNDLE_ENGINE = ["direct", "terraform"] diff --git a/bundle/configsync/defaults.go b/bundle/configsync/defaults.go index 9fc59918e1f..cc56b361aff 100644 --- a/bundle/configsync/defaults.go +++ b/bundle/configsync/defaults.go @@ -11,12 +11,18 @@ type ( skipIfEmptyOrDefault struct { defaults map[string]any } + // skipBackendDefault skips a field, regardless of its remote value, when it is + // absent from config. Used for fields the backend or a cluster policy may fill + // in: their remote-only value must not be synced into config. Fields the user + // does manage (present in config) still sync normally. + skipBackendDefault struct{} ) var ( alwaysSkip = skipAlways{} zeroOrNil = skipIfZeroOrNil{} emptyEmailNotifications = skipIfEmptyOrDefault{defaults: map[string]any{"no_alert_for_skipped_runs": false}} + backendDefault = skipBackendDefault{} ) // serverSideDefaults contains all hardcoded server-side defaults. @@ -54,6 +60,13 @@ var serverSideDefaults = map[string]any{ "resources.jobs.*.tasks[*].new_cluster.data_security_mode": "SINGLE_USER", // TODO this field is computed on some workspaces in integration tests, check why and if we can skip it "resources.jobs.*.tasks[*].new_cluster.enable_elastic_disk": alwaysSkip, // deprecated field "resources.jobs.*.tasks[*].new_cluster.single_user_name": alwaysSkip, + // custom_tags and cluster_log_conf are commonly injected by cluster policies + // when the user omits them, so they exist only remotely. Syncing them back leaks + // one environment's policy values into (often shared) config and breaks deploys in + // other environments. TODO: move to backend_defaults in resources.yml once + // configsync filtering is migrated to the direct engine lifecycle metadata. + "resources.jobs.*.tasks[*].new_cluster.custom_tags": backendDefault, + "resources.jobs.*.tasks[*].new_cluster.cluster_log_conf": backendDefault, // Cluster fields (job_clusters) "resources.jobs.*.job_clusters[*].new_cluster.aws_attributes": alwaysSkip, @@ -62,6 +75,8 @@ var serverSideDefaults = map[string]any{ "resources.jobs.*.job_clusters[*].new_cluster.data_security_mode": "SINGLE_USER", // TODO this field is computed on some workspaces in integration tests, check why and if we can skip it "resources.jobs.*.job_clusters[*].new_cluster.enable_elastic_disk": alwaysSkip, // deprecated field "resources.jobs.*.job_clusters[*].new_cluster.single_user_name": alwaysSkip, + "resources.jobs.*.job_clusters[*].new_cluster.custom_tags": backendDefault, // see tasks[*].new_cluster.custom_tags + "resources.jobs.*.job_clusters[*].new_cluster.cluster_log_conf": backendDefault, // see tasks[*].new_cluster.cluster_log_conf // Standalone cluster fields "resources.clusters.*.aws_attributes": alwaysSkip, @@ -71,6 +86,8 @@ var serverSideDefaults = map[string]any{ "resources.clusters.*.driver_node_type_id": alwaysSkip, "resources.clusters.*.enable_elastic_disk": alwaysSkip, "resources.clusters.*.single_user_name": alwaysSkip, + "resources.clusters.*.custom_tags": backendDefault, // see jobs.*.tasks[*].new_cluster.custom_tags + "resources.clusters.*.cluster_log_conf": backendDefault, // see jobs.*.tasks[*].new_cluster.cluster_log_conf // Experiment fields "resources.experiments.*.artifact_location": alwaysSkip, @@ -118,6 +135,9 @@ func shouldSkipField(path string, value any, hasConfigValue bool) bool { if hasConfigValue { return false } + if _, ok := expected.(skipBackendDefault); ok { + return true + } if _, ok := expected.(skipIfZeroOrNil); ok { return value == nil || value == int64(0) }