Skip to content
Open
2 changes: 1 addition & 1 deletion Taskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ vars:
# Absolute path so tasks with `dir:` (lint-go-tools, lint-go-codegen) can use it.
GO_TOOL: go tool -modfile={{.ROOT_DIR}}/tools/go.mod
EXE_EXT: '{{if eq OS "windows"}}.exe{{end}}'
TEST_PACKAGES: ./acceptance/internal ./libs/... ./internal/... ./cmd/... ./bundle/... ./experimental/ssh/... .
TEST_PACKAGES: ./acceptance/internal ./libs/... ./internal/... ./cmd/... ./bundle/... ./experimental/air/... ./experimental/ssh/... .
ACCEPTANCE_TEST_FILTER: ""
# Single brace-expansion glob covering every //go:embed target in the repo,
# computed by grepping `//go:embed` directives. Evaluated lazily by Task so
Expand Down
3 changes: 3 additions & 0 deletions acceptance/experimental/air/get/out.test.toml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 36 additions & 0 deletions acceptance/experimental/air/get/output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@

=== get (text)
>>> [CLI] experimental air get 123
Run ID: 123
Status: SUCCESS
Submitted: [TIMESTAMP]
Duration: 12s
Retries: 0
Experiment: my-exp
User: user@example.com
Accelerators: 8x H100
MLflow: [DATABRICKS_URL]/ml/experiments/exp1/runs/run1/artifacts/logs/node_0
Dashboard: https://my-workspace.cloud.databricks.test/jobs/runs/123

=== get (json)
>>> [CLI] experimental air get 123 -o json
{
"v": 1,
"ts": "[TIMESTAMP]",
"data": {
"run_id": "123",
"status": "SUCCESS",
"started_at": "[TIMESTAMP]",
"duration_seconds": 12,
"attempt_number": 0,
"experiment_name": "my-exp",
"dashboard_url": "https://my-workspace.cloud.databricks.test/jobs/runs/123",
"mlflow_url": "[DATABRICKS_URL]/ml/experiments/exp1/runs/run1/artifacts/logs/node_0"
}
}

=== invalid run id
>>> [CLI] experimental air get notanumber
Error: invalid RUN_ID "notanumber": must be a positive integer

Exit code: 1
8 changes: 8 additions & 0 deletions acceptance/experimental/air/get/script
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Exercise `air get` in text and JSON output modes, then with a malformed
# RUN_ID. The recorded result of each step is pinned in output.txt.
title "get (text)"
trace $CLI experimental air get 123

title "get (json)"
trace $CLI experimental air get 123 -o json

# errcode records the exit code of the failing command in the output.
title "invalid run id"
errcode trace $CLI experimental air get notanumber
40 changes: 40 additions & 0 deletions acceptance/experimental/air/get/test.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This command does not deploy a bundle, so no engine matrix is needed.
[EnvMatrix]
DATABRICKS_BUNDLE_ENGINE = []

# The SDK occasionally probes host reachability with a HEAD request; stub it so
# the test is deterministic.
[[Server]]
Pattern = "HEAD /"
Response.Body = ''

# A single GenAI-compute run with an experiment, GPUs, and a creator.
[[Server]]
Pattern = "GET /api/2.2/jobs/runs/get"
Response.Body = '''
{
"run_id": 123,
"run_page_url": "https://my-workspace.cloud.databricks.test/jobs/runs/123",
"creator_user_name": "user@example.com",
"start_time": 1700000000000,
"end_time": 1700000012000,
"state": {"life_cycle_state": "TERMINATED", "result_state": "SUCCESS"},
"tasks": [
{
"task_key": "train",
"attempt_number": 0,
"gen_ai_compute_task": {
"mlflow_experiment_name": "/Users/user@example.com/my-exp",
"compute": {"gpu_type": "GPU_8xH100", "num_gpus": 8}
}
}
]
}
'''

# MLflow identifiers for the deep-link (runs/get-output is not modeled by the typed SDK).
[[Server]]
Pattern = "GET /api/2.2/jobs/runs/get-output"
Response.Body = '''
{"gen_ai_compute_output": {"run_info": {"mlflow_experiment_id": "exp1", "mlflow_run_id": "run1"}}}
'''
3 changes: 3 additions & 0 deletions acceptance/experimental/air/help/out.test.toml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions acceptance/experimental/air/help/output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

=== help
>>> [CLI] experimental air --help
Run and manage AI runtime training workloads on Databricks serverless GPU compute.

This command set is the Go port of the standalone Python "air" CLI. It is
experimental and may change in future versions.

Usage:
databricks experimental air [command]

Available Commands:
cancel Cancel one or more runs
get Show details for a run
list List recent runs
logs Stream or fetch logs for a run
register-image Mirror a Docker image into the workspace registry
run Submit a training workload from a YAML config

Flags:
-h, --help help for air

Global Flags:
--debug enable debug logging
-o, --output type output type: text or json (default text)
-p, --profile string ~/.databrickscfg profile
-t, --target string bundle target to use (if applicable)

Use "databricks experimental air [command] --help" for more information about a command.
5 changes: 5 additions & 0 deletions acceptance/experimental/air/help/script
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Pin the command tree so any change to a subcommand or its short description
# shows up as a diff here.
# NOTE(review): the diff is presumably taken against the recorded output.txt
# next to this script; confirm against the acceptance test runner.

title "help"
trace $CLI experimental air --help
3 changes: 3 additions & 0 deletions acceptance/experimental/air/help/test.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# --help prints without authenticating, so no server stubs are needed.
[EnvMatrix]
DATABRICKS_BUNDLE_ENGINE = []
3 changes: 3 additions & 0 deletions acceptance/experimental/air/unimplemented/out.test.toml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions acceptance/experimental/air/unimplemented/output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@

=== run
>>> [CLI] experimental air run
Error: `air run` is not implemented yet

Exit code: 1

=== list
>>> [CLI] experimental air list
Error: `air list` is not implemented yet

Exit code: 1

=== logs
>>> [CLI] experimental air logs 123
Error: `air logs` is not implemented yet

Exit code: 1

=== cancel
>>> [CLI] experimental air cancel 123
Error: `air cancel` is not implemented yet

Exit code: 1

=== register-image
>>> [CLI] experimental air register-image my-image:latest
Error: `air register-image` is not implemented yet

Exit code: 1
16 changes: 16 additions & 0 deletions acceptance/experimental/air/unimplemented/script
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Each stub must fail with "not implemented"; errcode records the exit code.

title "run"
errcode trace $CLI experimental air run

title "list"
errcode trace $CLI experimental air list

title "logs"
errcode trace $CLI experimental air logs 123

# cancel rejects an empty argument list (unless --all is given) before the stub
# runs, so a RUN_ID is passed to reach the "not implemented" error.
title "cancel"
errcode trace $CLI experimental air cancel 123

title "register-image"
errcode trace $CLI experimental air register-image my-image:latest
3 changes: 3 additions & 0 deletions acceptance/experimental/air/unimplemented/test.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Stubs fail locally before any API call, so no server stubs are needed.
[EnvMatrix]
DATABRICKS_BUNDLE_ENGINE = []
2 changes: 2 additions & 0 deletions cmd/experimental/experimental.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package experimental

import (
aircmd "github.com/databricks/cli/experimental/air/cmd"
aitoolscmd "github.com/databricks/cli/experimental/aitools/cmd"
geniecmd "github.com/databricks/cli/experimental/genie/cmd"
postgrescmd "github.com/databricks/cli/experimental/postgres/cmd"
Expand All @@ -22,6 +23,7 @@ These commands provide early access to new features that are still under
development. They may change or be removed in future versions without notice.`,
}

cmd.AddCommand(aircmd.New())
cmd.AddCommand(aitoolscmd.NewAitoolsCmd())
cmd.AddCommand(geniecmd.NewGenieCmd())
cmd.AddCommand(postgrescmd.New())
Expand Down
36 changes: 36 additions & 0 deletions experimental/air/cmd/air.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package aircmd

import (
"fmt"

"github.com/spf13/cobra"
)

// New builds the `air` command group for the experimental AI runtime CLI and
// attaches its subcommands: run, get, list, logs, cancel and register-image.
//
// Milestone 0: the group is scaffolding for the port from the Python `air` CLI;
// subcommands that have not been ported yet fail with a not-implemented error.
func New() *cobra.Command {
	air := &cobra.Command{
		Use:   "air",
		Short: "Run and manage AI runtime training workloads",
		Long: `Run and manage AI runtime training workloads on Databricks serverless GPU compute.

This command set is the Go port of the standalone Python "air" CLI. It is
experimental and may change in future versions.`,
	}

	// AddCommand is variadic; the subcommands are registered in this order.
	air.AddCommand(
		newRunCommand(),
		newGetCommand(),
		newListCommand(),
		newLogsCommand(),
		newCancelCommand(),
		newRegisterImageCommand(),
	)

	return air
}

// notImplemented builds the placeholder error returned by milestone-0 stubs.
// name is the subcommand name as typed after `air`, e.g. "cancel".
func notImplemented(name string) error {
	const format = "`air %s` is not implemented yet"
	return fmt.Errorf(format, name)
}
22 changes: 22 additions & 0 deletions experimental/air/cmd/air_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package aircmd

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestNewRegistersAllSubcommands checks that New wires up exactly the expected
// set of subcommands: every expected name must be present, and no extra
// subcommand may be registered.
func TestNewRegistersAllSubcommands(t *testing.T) {
	expected := []string{"run", "get", "list", "logs", "cancel", "register-image"}

	subcommands := New().Commands()
	seen := make(map[string]bool, len(subcommands))
	for _, sub := range subcommands {
		seen[sub.Name()] = true
	}

	for _, name := range expected {
		assert.True(t, seen[name], "subcommand %q is not registered", name)
	}
	// Same size plus every expected name present means the sets are equal.
	assert.Len(t, seen, len(expected), "unexpected number of subcommands")
}
39 changes: 39 additions & 0 deletions experimental/air/cmd/cancel.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package aircmd

import (
"github.com/databricks/cli/cmd/root"
"github.com/spf13/cobra"
)

// newCancelCommand builds the `cancel` subcommand. Running it is still a stub
// that returns the not-implemented error, but argument validation is already
// in place: the caller must pass either one or more RUN_IDs or --all, never
// both. The --yes flag is registered but not consulted yet.
func newCancelCommand() *cobra.Command {
	var all, yes bool

	cmd := &cobra.Command{
		Use:   "cancel [RUN_ID...]",
		Short: "Cancel one or more runs",
		Long:  `Cancel one or more runs by ID, or cancel all of your active runs with --all.`,
		RunE: func(_ *cobra.Command, _ []string) error {
			return notImplemented("cancel")
		},
	}

	flags := cmd.Flags()
	flags.BoolVar(&all, "all", false, "Cancel all of your active runs")
	flags.BoolVarP(&yes, "yes", "y", false, "Skip the confirmation prompt")

	// Exactly one of "RUN_IDs given" and "--all" must hold. Cobra parses flags
	// before it calls the validator, so `all` already reflects the user's input.
	cmd.Args = func(cmd *cobra.Command, args []string) error {
		hasIDs := len(args) > 0
		if all && hasIDs {
			return &root.InvalidArgsError{Command: cmd, Message: "cannot combine RUN_ID arguments with --all"}
		}
		if !all && !hasIDs {
			return &root.InvalidArgsError{Command: cmd, Message: "provide at least one RUN_ID, or use --all"}
		}
		return nil
	}

	return cmd
}
81 changes: 81 additions & 0 deletions experimental/air/cmd/compute.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package aircmd

import (
"fmt"
"strings"
)

// gpuType names an accelerator type in the wire format submitted to the
// training service. The number in the name is the partition count (for
// example, GPU_8xH100 is 8 GPUs).
type gpuType string

// Valid accelerator types, listed in the same order as gpuTypes.
const (
	gpuType1xA10  gpuType = "GPU_1xA10"
	gpuType1xH100 gpuType = "GPU_1xH100"
	gpuType8xH100 gpuType = "GPU_8xH100"
)

// gpuTypes enumerates every valid gpuType. It is used to build validation
// error messages, so its order is the order users see.
var gpuTypes = []gpuType{
	gpuType1xA10,
	gpuType1xH100,
	gpuType8xH100,
}

// validGPUTypesHint renders the list of accepted GPU types as a single
// comma-separated sentence fragment for use in validation error messages.
func validGPUTypesHint() string {
	var hint strings.Builder
	hint.WriteString("valid types are: ")
	for i, g := range gpuTypes {
		if i > 0 {
			hint.WriteString(", ")
		}
		hint.WriteString(string(g))
	}
	return hint.String()
}

// parseGPUType converts a YAML accelerator_type string into a gpuType.
//
// The comparison is exact (no case folding or trimming) because the server's
// lookup is case-sensitive. An unrecognized value yields an empty gpuType and
// an error that lists the valid types.
func parseGPUType(value string) (gpuType, error) {
	candidate := gpuType(value)
	if candidate == gpuType1xA10 || candidate == gpuType8xH100 || candidate == gpuType1xH100 {
		return candidate, nil
	}
	return "", fmt.Errorf("invalid GPU type %q: %s", value, validGPUTypesHint())
}

// gpusPerNode reports how many GPUs one node of the given type provides. This
// is the partition count from the type name (GPU_1xH100 -> 1, GPU_8xH100 -> 8).
// Accelerators are allocated in whole nodes, so num_accelerators must be a
// round multiple of the returned value.
//
// An unknown type yields 0 and an error.
func gpusPerNode(g gpuType) (int, error) {
	if g == gpuType8xH100 {
		return 8, nil
	}
	if g == gpuType1xA10 || g == gpuType1xH100 {
		return 1, nil
	}
	// Defensive guard only: callers resolve g through parseGPUType first, and
	// that already rejects unknown types.
	return 0, fmt.Errorf("invalid GPU type %q", string(g))
}

// computeConfig mirrors the `compute` block of the run YAML: the accelerator
// type to use and how many accelerators to request.
type computeConfig struct {
	NumAccelerators int    `yaml:"num_accelerators"`
	AcceleratorType string `yaml:"accelerator_type"`
}

// validate checks the compute block against the backend's constraints and
// returns the first violation found, or nil when the block is acceptable.
//
// Checks run in this order: the accelerator type must be a known GPU type,
// the accelerator count must be positive, and the count must be a whole
// multiple of the per-node GPU count for that type.
func (c computeConfig) validate() error {
	accel, parseErr := parseGPUType(c.AcceleratorType)
	if parseErr != nil {
		return fmt.Errorf("compute.accelerator_type: %w", parseErr)
	}

	requested := c.NumAccelerators
	if requested < 1 {
		return fmt.Errorf("compute.num_accelerators must be positive, got %d", requested)
	}

	nodeSize, err := gpusPerNode(accel)
	if err != nil {
		return err
	}

	// Accelerators come in whole nodes, so a partial node is not allowed.
	if remainder := requested % nodeSize; remainder != 0 {
		return fmt.Errorf("compute.num_accelerators for %s must be a multiple of %d, got %d", c.AcceleratorType, nodeSize, requested)
	}

	return nil
}
Loading