Skip to content

Commit 77f3be1

Browse files
peterschmidt85Andrey Cheptsovclaude
authored
[CloudRift] Fix NTP clock skew breaking Docker; handle amd-smi 7.x output format (#3701)
CloudRift VMs boot with an incorrect RTC clock (~1h ahead). When NTP corrects it backwards, Docker discards container exit events, leaving containers stuck as ghosts forever. Add NTP sync wait before launching the shim to prevent this. Also handle both amd-smi output formats (flat array in ROCm 6.x, wrapped {"gpu_data": [...]} in ROCm 7.x) and add a 2-minute timeout to AMD GPU detection to prevent the shim from hanging indefinitely. Co-authored-by: Andrey Cheptsov <andrey.cheptsov@github.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 30e90d3 commit 77f3be1

File tree

3 files changed

+55
-8
lines changed

3 files changed

+55
-8
lines changed

runner/internal/shim/host/gpu.go

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"path/filepath"
1111
"strconv"
1212
"strings"
13+
"time"
1314

1415
execute "github.com/alexellis/go-execute/v2"
1516

@@ -114,6 +115,11 @@ type amdGpu struct {
114115
Bus amdBus `json:"bus"`
115116
}
116117

118+
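// amdSmiOutput is the top-level object emitted by amd-smi in ROCm 7.x, which wraps the GPU array as {"gpu_data": [...]}; ROCm 6.x emits the bare array instead.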
119+
type amdSmiOutput struct {
120+
GpuData []amdGpu `json:"gpu_data"`
121+
}
122+
117123
type amdAsic struct {
118124
Name string `json:"market_name"`
119125
}
@@ -130,9 +136,27 @@ type amdBus struct {
130136
BDF string `json:"bdf"` // PCIe Domain:Bus:Device.Function notation
131137
}
132138

139+
// parseAmdSmiOutput handles both amd-smi output formats:
140+
// ROCm 6.x returns a flat array: [{"gpu": 0, ...}, ...]
141+
// ROCm 7.x wraps it: {"gpu_data": [{"gpu": 0, ...}, ...]}
142+
func parseAmdSmiOutput(data []byte) ([]amdGpu, error) {
143+
var amdGpus []amdGpu
144+
if err := json.Unmarshal(data, &amdGpus); err == nil {
145+
return amdGpus, nil
146+
}
147+
var wrapped amdSmiOutput
148+
if err := json.Unmarshal(data, &wrapped); err != nil {
149+
return nil, err
150+
}
151+
return wrapped.GpuData, nil
152+
}
153+
133154
func getAmdGpuInfo(ctx context.Context) []GpuInfo {
134155
gpus := []GpuInfo{}
135156

157+
ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
158+
defer cancel()
159+
136160
cmd := execute.ExecTask{
137161
Command: "docker",
138162
Args: []string{
@@ -158,8 +182,8 @@ func getAmdGpuInfo(ctx context.Context) []GpuInfo {
158182
return gpus
159183
}
160184

161-
var amdGpus []amdGpu
162-
if err := json.Unmarshal([]byte(res.Stdout), &amdGpus); err != nil {
185+
amdGpus, err := parseAmdSmiOutput([]byte(res.Stdout))
186+
if err != nil {
163187
log.Error(ctx, "cannot read json", "err", err)
164188
return gpus
165189
}

src/dstack/_internal/core/backends/cloudrift/api_client.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,12 +72,16 @@ def get_vm_recipies(self) -> List[Dict]:
7272

7373
return vm_recipes
7474

75-
def get_vm_image_url(self) -> Optional[str]:
75+
def get_vm_image_url(self, gpu_vendor: Optional[str] = None) -> Optional[str]:
7676
recipes = self.get_vm_recipies()
77+
if gpu_vendor == "amd":
78+
driver_tag = "amd-driver"
79+
else:
80+
driver_tag = "nvidia-driver"
81+
7782
ubuntu_images = []
7883
for recipe in recipes:
79-
has_nvidia_driver = "nvidia-driver" in recipe.get("tags", [])
80-
if not has_nvidia_driver:
84+
if driver_tag not in recipe.get("tags", []):
8185
continue
8286

8387
recipe_name = recipe.get("name", "")
@@ -97,9 +101,14 @@ def get_vm_image_url(self) -> Optional[str]:
97101
return None
98102

99103
def deploy_instance(
100-
self, instance_type: str, region: str, ssh_keys: List[str], cmd: str
104+
self,
105+
instance_type: str,
106+
region: str,
107+
ssh_keys: List[str],
108+
cmd: str,
109+
gpu_vendor: Optional[str] = None,
101110
) -> List[str]:
102-
image_url = self.get_vm_image_url()
111+
image_url = self.get_vm_image_url(gpu_vendor=gpu_vendor)
103112
if not image_url:
104113
raise BackendError("No suitable VM image found.")
105114

src/dstack/_internal/core/backends/cloudrift/compute.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,17 +73,31 @@ def create_instance(
7373
instance_config: InstanceConfiguration,
7474
placement_group: Optional[PlacementGroup],
7575
) -> JobProvisioningData:
76-
commands = get_shim_commands()
76+
# TODO: Remove once CloudRift fixes their VM RTC clock.
77+
# Wrong RTC + NTP backward jump breaks Docker container lifecycle.
78+
ntp_sync_commands = [
79+
(
80+
"timeout 60 bash -c '"
81+
"while ! timedatectl show -p NTPSynchronized --value | grep -q yes;"
82+
" do sleep 1; done' || true"
83+
),
84+
]
85+
commands = ntp_sync_commands + get_shim_commands()
7786
startup_script = " ".join([" && ".join(commands)])
7887
logger.debug(
7988
f"Creating instance for offer {instance_offer.instance.name} in region {instance_offer.region} with commands: {startup_script}"
8089
)
8190

91+
gpu_vendor = None
92+
if instance_offer.instance.resources.gpus:
93+
gpu_vendor = instance_offer.instance.resources.gpus[0].vendor.value
94+
8295
instance_ids = self.client.deploy_instance(
8396
instance_type=instance_offer.instance.name,
8497
region=instance_offer.region,
8598
ssh_keys=instance_config.get_public_keys(),
8699
cmd=startup_script,
100+
gpu_vendor=gpu_vendor,
87101
)
88102

89103
if len(instance_ids) == 0:

0 commit comments

Comments
 (0)