databricks · riddhibhagwat-db · Jun 11, 2026 · Jun 12, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/Taskfile.yml b/Taskfile.yml
@@ -4,7 +4,7 @@ vars:
   # Absolute path so tasks with `dir:` (lint-go-tools, lint-go-codegen) can use it.
   GO_TOOL: go tool -modfile={{.ROOT_DIR}}/tools/go.mod
   EXE_EXT: '{{if eq OS "windows"}}.exe{{end}}'
-  TEST_PACKAGES: ./acceptance/internal ./libs/... ./internal/... ./cmd/... ./bundle/... ./experimental/ssh/... .
+  TEST_PACKAGES: ./acceptance/internal ./libs/... ./internal/... ./cmd/... ./bundle/... ./experimental/air/... ./experimental/ssh/... .
   ACCEPTANCE_TEST_FILTER: ""
   # Single brace-expansion glob covering every //go:embed target in the repo,
   # computed by grepping `//go:embed` directives. Evaluated lazily by Task so

diff --git a/acceptance/experimental/air/get/out.test.toml b/acceptance/experimental/air/get/out.test.toml
diff --git a/acceptance/experimental/air/get/output.txt b/acceptance/experimental/air/get/output.txt
@@ -0,0 +1,36 @@
+
+=== get (text)
+>>> [CLI] experimental air get 123
+Run ID:       123
+Status:       SUCCESS
+Submitted:    [TIMESTAMP]
+Duration:     12s
+Retries:      0
+Experiment:   my-exp
+User:         user@example.com
+Accelerators: 8x H100
+MLflow:       [DATABRICKS_URL]/ml/experiments/exp1/runs/run1/artifacts/logs/node_0
+Dashboard:    https://my-workspace.cloud.databricks.test/jobs/runs/123
+
+=== get (json)
+>>> [CLI] experimental air get 123 -o json
+{
+  "v": 1,
+  "ts": "[TIMESTAMP]",
+  "data": {
+    "run_id": "123",
+    "status": "SUCCESS",
+    "started_at": "[TIMESTAMP]",
+    "duration_seconds": 12,
+    "attempt_number": 0,
+    "experiment_name": "my-exp",
+    "dashboard_url": "https://my-workspace.cloud.databricks.test/jobs/runs/123",
+    "mlflow_url": "[DATABRICKS_URL]/ml/experiments/exp1/runs/run1/artifacts/logs/node_0"
+  }
+}
+
+=== invalid run id
+>>> [CLI] experimental air get notanumber
+Error: invalid RUN_ID "notanumber": must be a positive integer
+
+Exit code: 1
diff --git a/acceptance/experimental/air/get/script b/acceptance/experimental/air/get/script
@@ -0,0 +1,8 @@
+title "get (text)"
+trace $CLI experimental air get 123
+
+title "get (json)"
+trace $CLI experimental air get 123 -o json
+
+title "invalid run id"
+errcode trace $CLI experimental air get notanumber
diff --git a/acceptance/experimental/air/get/test.toml b/acceptance/experimental/air/get/test.toml
@@ -0,0 +1,40 @@
+# This command does not deploy a bundle, so no engine matrix is needed.
+[EnvMatrix]
+DATABRICKS_BUNDLE_ENGINE = []
+
+# The SDK occasionally probes host reachability with a HEAD request; stub it so
+# the test is deterministic.
+[[Server]]
+Pattern = "HEAD /"
+Response.Body = ''
+
+# A single GenAI-compute run with an experiment, GPUs, and a creator.
+[[Server]]
+Pattern = "GET /api/2.2/jobs/runs/get"
+Response.Body = '''
+{
+  "run_id": 123,
+  "run_page_url": "https://my-workspace.cloud.databricks.test/jobs/runs/123",
+  "creator_user_name": "user@example.com",
+  "start_time": 1700000000000,
+  "end_time": 1700000012000,
+  "state": {"life_cycle_state": "TERMINATED", "result_state": "SUCCESS"},
+  "tasks": [
+    {
+      "task_key": "train",
+      "attempt_number": 0,
+      "gen_ai_compute_task": {
+        "mlflow_experiment_name": "/Users/user@example.com/my-exp",
+        "compute": {"gpu_type": "GPU_8xH100", "num_gpus": 8}
+      }
+    }
+  ]
+}
+'''
+
+# MLflow identifiers for the deep-link (runs/get-output is not modeled by the typed SDK).
+[[Server]]
+Pattern = "GET /api/2.2/jobs/runs/get-output"
+Response.Body = '''
+{"gen_ai_compute_output": {"run_info": {"mlflow_experiment_id": "exp1", "mlflow_run_id": "run1"}}}
+'''
diff --git a/acceptance/experimental/air/help/out.test.toml b/acceptance/experimental/air/help/out.test.toml
diff --git a/acceptance/experimental/air/help/output.txt b/acceptance/experimental/air/help/output.txt
@@ -0,0 +1,29 @@
+
+=== help
+>>> [CLI] experimental air --help
+Run and manage AI runtime training workloads on Databricks serverless GPU compute.
+
+This command set is the Go port of the standalone Python "air" CLI. It is
+experimental and may change in future versions.
+
+Usage:
+  databricks experimental air [command]
+
+Available Commands:
+  cancel         Cancel one or more runs
+  get            Show details for a run
+  list           List recent runs
+  logs           Stream or fetch logs for a run
+  register-image Mirror a Docker image into the workspace registry
+  run            Submit a training workload from a YAML config
+
+Flags:
+  -h, --help   help for air
+
+Global Flags:
+      --debug            enable debug logging
+  -o, --output type      output type: text or json (default text)
+  -p, --profile string   ~/.databrickscfg profile
+  -t, --target string    bundle target to use (if applicable)
+
+Use "databricks experimental air [command] --help" for more information about a command.
diff --git a/acceptance/experimental/air/help/script b/acceptance/experimental/air/help/script
@@ -0,0 +1,5 @@
+# Pin the command tree so any change to a subcommand or its short description
+# shows up as a diff here.
+
+title "help"
+trace $CLI experimental air --help
diff --git a/acceptance/experimental/air/help/test.toml b/acceptance/experimental/air/help/test.toml
@@ -0,0 +1,3 @@
+# --help prints without authenticating, so no server stubs are needed.
+[EnvMatrix]
+DATABRICKS_BUNDLE_ENGINE = []
diff --git a/acceptance/experimental/air/unimplemented/out.test.toml b/acceptance/experimental/air/unimplemented/out.test.toml
diff --git a/acceptance/experimental/air/unimplemented/output.txt b/acceptance/experimental/air/unimplemented/output.txt
@@ -0,0 +1,30 @@
+
+=== run
+>>> [CLI] experimental air run
+Error: `air run` is not implemented yet
+
+Exit code: 1
+
+=== list
+>>> [CLI] experimental air list
+Error: `air list` is not implemented yet
+
+Exit code: 1
+
+=== logs
+>>> [CLI] experimental air logs 123
+Error: `air logs` is not implemented yet
+
+Exit code: 1
+
+=== cancel
+>>> [CLI] experimental air cancel 123
+Error: `air cancel` is not implemented yet
+
+Exit code: 1
+
+=== register-image
+>>> [CLI] experimental air register-image my-image:latest
+Error: `air register-image` is not implemented yet
+
+Exit code: 1
diff --git a/acceptance/experimental/air/unimplemented/script b/acceptance/experimental/air/unimplemented/script
@@ -0,0 +1,16 @@
+# Each stub must fail with "not implemented"; errcode records the exit code.
+
+title "run"
+errcode trace $CLI experimental air run
+
+title "list"
+errcode trace $CLI experimental air list
+
+title "logs"
+errcode trace $CLI experimental air logs 123
+
+title "cancel"
+errcode trace $CLI experimental air cancel 123
+
+title "register-image"
+errcode trace $CLI experimental air register-image my-image:latest
diff --git a/acceptance/experimental/air/unimplemented/test.toml b/acceptance/experimental/air/unimplemented/test.toml
@@ -0,0 +1,3 @@
+# Stubs fail locally before any API call, so no server stubs are needed.
+[EnvMatrix]
+DATABRICKS_BUNDLE_ENGINE = []
diff --git a/cmd/experimental/experimental.go b/cmd/experimental/experimental.go
@@ -1,6 +1,7 @@
 package experimental
 
 import (
+	aircmd "github.com/databricks/cli/experimental/air/cmd"
 	aitoolscmd "github.com/databricks/cli/experimental/aitools/cmd"
 	geniecmd "github.com/databricks/cli/experimental/genie/cmd"
 	postgrescmd "github.com/databricks/cli/experimental/postgres/cmd"
@@ -22,6 +23,7 @@ These commands provide early access to new features that are still under
 development. They may change or be removed in future versions without notice.`,
 	}
 
+	cmd.AddCommand(aircmd.New())
 	cmd.AddCommand(aitoolscmd.NewAitoolsCmd())
 	cmd.AddCommand(geniecmd.NewGenieCmd())
 	cmd.AddCommand(postgrescmd.New())

diff --git a/experimental/air/cmd/air.go b/experimental/air/cmd/air.go
@@ -0,0 +1,36 @@
+package aircmd
+
+import (
+	"fmt"
+
+	"github.com/spf13/cobra"
+)
+
+// New returns the root command for the experimental AI runtime CLI.
+//
+// Milestone 0: scaffolds the command group and registers every subcommand.
+// Subcommands not yet ported from the Python `air` CLI are stubs (see notImplemented).
+func New() *cobra.Command {
+	cmd := &cobra.Command{
+		Use:   "air",
+		Short: "Run and manage AI runtime training workloads",
+		Long: `Run and manage AI runtime training workloads on Databricks serverless GPU compute.
+
+This command set is the Go port of the standalone Python "air" CLI. It is
+experimental and may change in future versions.`,
+	}
+
+	cmd.AddCommand(newRunCommand())
+	cmd.AddCommand(newGetCommand())
+	cmd.AddCommand(newListCommand())
+	cmd.AddCommand(newLogsCommand())
+	cmd.AddCommand(newCancelCommand())
+	cmd.AddCommand(newRegisterImageCommand())
+
+	return cmd
+}
+
+// notImplemented returns the placeholder error for the stub subcommand `air <name>`.
+func notImplemented(name string) error {
+	return fmt.Errorf("`air %s` is not implemented yet", name)
+}
diff --git a/experimental/air/cmd/air_test.go b/experimental/air/cmd/air_test.go
@@ -0,0 +1,22 @@
+package aircmd
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestNewRegistersAllSubcommands asserts that New registers exactly the expected
+// set of subcommands: none missing, and none added without updating this test.
+func TestNewRegistersAllSubcommands(t *testing.T) {
+	registered := make(map[string]bool)
+	for _, c := range New().Commands() {
+		registered[c.Name()] = true
+	}
+
+	want := []string{"run", "get", "list", "logs", "cancel", "register-image"}
+	for _, name := range want {
+		assert.True(t, registered[name], "subcommand %q is not registered", name)
+	}
+	assert.Len(t, registered, len(want), "unexpected number of subcommands")
+}
diff --git a/experimental/air/cmd/cancel.go b/experimental/air/cmd/cancel.go
@@ -0,0 +1,39 @@
+package aircmd
+
+import (
+	"github.com/databricks/cli/cmd/root"
+	"github.com/spf13/cobra"
+)
+
+func newCancelCommand() *cobra.Command {
+	var (
+		all bool
+		yes bool
+	)
+
+	cmd := &cobra.Command{
+		Use:   "cancel [RUN_ID...]",
+		Short: "Cancel one or more runs",
+		Long:  `Cancel one or more runs by ID, or cancel all of your active runs with --all.`,
+		RunE: func(cmd *cobra.Command, args []string) error {
+			return notImplemented("cancel")
+		},
+	}
+
+	cmd.Flags().BoolVar(&all, "all", false, "Cancel all of your active runs")
+	cmd.Flags().BoolVarP(&yes, "yes", "y", false, "Skip the confirmation prompt")
+
+	// Require exactly one of: at least one RUN_ID, or --all. Cobra parses flags
+	// before calling Args, so `all` already reflects the user's input here.
+	cmd.Args = func(cmd *cobra.Command, args []string) error {
+		switch {
+		case all && len(args) > 0:
+			return &root.InvalidArgsError{Command: cmd, Message: "cannot combine RUN_ID arguments with --all"}
+		case !all && len(args) == 0:
+			return &root.InvalidArgsError{Command: cmd, Message: "provide at least one RUN_ID, or use --all"}
+		}
+		return nil
+	}
+
+	return cmd
+}
diff --git a/experimental/air/cmd/compute.go b/experimental/air/cmd/compute.go
@@ -0,0 +1,81 @@
+package aircmd
+
+import (
+	"fmt"
+	"strings"
+)
+
+// gpuType is a wire-facing accelerator type submitted to the training service. The number
+// in the name is the partition count, i.e. GPUs per node (GPU_8xH100 is 8 GPUs).
+type gpuType string
+
+const (
+	gpuType1xA10  gpuType = "GPU_1xA10"
+	gpuType8xH100 gpuType = "GPU_8xH100"
+	gpuType1xH100 gpuType = "GPU_1xH100"
+)
+
+// gpuTypes lists every valid type, for error messages. Keep in sync with parseGPUType.
+var gpuTypes = []gpuType{gpuType1xA10, gpuType1xH100, gpuType8xH100}
+
+func validGPUTypesHint() string {
+	names := make([]string, len(gpuTypes))
+	for i, g := range gpuTypes {
+		names[i] = string(g)
+	}
+	return "valid types are: " + strings.Join(names, ", ")
+}
+
+// parseGPUType resolves a YAML accelerator_type string to a gpuType. The match is
+// exact (the server's lookup is case-sensitive); unknown values return an error.
+func parseGPUType(value string) (gpuType, error) {
+	switch gpuType(value) {
+	case gpuType1xA10, gpuType8xH100, gpuType1xH100:
+		return gpuType(value), nil
+	}
+	return "", fmt.Errorf("invalid GPU type %q: %s", value, validGPUTypesHint())
+}
+
+// gpusPerNode returns the per-node GPU count, which is the partition count from
+// the name (GPU_1xH100 -> 1, GPU_8xH100 -> 8). num_accelerators must be an
+// exact multiple of this since accelerators are allocated in whole nodes.
+func gpusPerNode(g gpuType) (int, error) {
+	switch g {
+	case gpuType1xA10, gpuType1xH100:
+		return 1, nil
+	case gpuType8xH100:
+		return 8, nil
+	}
+	// Unreachable: callers resolve g through parseGPUType first, which rejects
+	// unknown types. Kept as a defensive guard.
+	return 0, fmt.Errorf("invalid GPU type %q", string(g))
+}
+
+// computeConfig is the `compute` block of the run YAML: which accelerators to
+// use and how many.
+type computeConfig struct {
+	NumAccelerators int    `yaml:"num_accelerators"`
+	AcceleratorType string `yaml:"accelerator_type"`
+}
+
+// validate checks that the type is known and the count is a positive multiple of its per-node GPU count.
+func (c computeConfig) validate() error {
+	g, err := parseGPUType(c.AcceleratorType)
+	if err != nil {
+		return fmt.Errorf("compute.accelerator_type: %w", err)
+	}
+
+	if c.NumAccelerators <= 0 {
+		return fmt.Errorf("compute.num_accelerators must be positive, got %d", c.NumAccelerators)
+	}
+
+	perNode, err := gpusPerNode(g)
+	if err != nil {
+		return err
+	}
+	if c.NumAccelerators%perNode != 0 {
+		return fmt.Errorf("compute.num_accelerators for %s must be a multiple of %d, got %d", c.AcceleratorType, perNode, c.NumAccelerators)
+	}
+
+	return nil
+}