Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion src/dstack/_internal/core/backends/base/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,8 @@ def get_shim_commands(
backend_shim_env: Optional[Dict[str, str]] = None,
arch: Optional[str] = None,
) -> List[str]:
commands = get_shim_pre_start_commands(
commands = get_setup_cloud_instance_commands()
commands += get_shim_pre_start_commands(
base_path=base_path,
bin_path=bin_path,
arch=arch,
Expand Down Expand Up @@ -641,6 +642,23 @@ def get_dstack_shim_download_url(arch: Optional[str] = None) -> str:
return url_template.format(version=version, arch=arch)


def get_setup_cloud_instance_commands() -> list[str]:
    """Return shell commands that prepare a cloud instance before the shim is set up.

    At the moment this is a single command implementing a workaround for
    https://github.com/NVIDIA/nvidia-container-toolkit/issues/48: it attempts to add
    ``native.cgroupdriver=cgroupfs`` to ``exec-opts`` in /etc/docker/daemon.json while
    keeping any custom settings the file may have, then restarts Docker.

    The patch is only attempted when daemon.json mentions ``nvidia`` and does not
    already mention ``native.cgroupdriver``. The trailing ``|| true`` makes the
    command succeed regardless of the outcome of the individual steps.

    Returns:
        A new list with one shell command string.
    """
    daemon_json = "/etc/docker/daemon.json"
    patched_json = "/tmp/daemon.json"
    jq_filter = '."exec-opts" = ((."exec-opts" // []) + ["native.cgroupdriver=cgroupfs"])'
    # The script body is single-quoted for `/bin/sh -c`, so the single quotes that
    # delimit the jq filter are spelled '\'' (close quote, escaped quote, reopen quote).
    nested_quote = "'\\''"
    steps = [
        "grep -q nvidia " + daemon_json,
        "! grep -q native.cgroupdriver " + daemon_json,
        "jq " + nested_quote + jq_filter + nested_quote + " " + daemon_json + " > " + patched_json,
        "sudo mv " + patched_json + " " + daemon_json,
        "sudo service docker restart",
    ]
    script = " " + " && ".join(steps) + " || true"
    # Wrap in /bin/sh to avoid interfering with other cloud init commands.
    return ["/bin/sh -c '" + script + "'"]


def get_shim_pre_start_commands(
base_path: Optional[PathLike] = None,
bin_path: Optional[PathLike] = None,
Expand Down
32 changes: 23 additions & 9 deletions src/dstack/_internal/core/backends/cudo/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,13 @@ def create_instance(
public_keys = instance_config.get_public_keys()
memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
commands = get_shim_commands(authorized_keys=public_keys)
gpus_no = len(instance_offer.instance.resources.gpus)
shim_commands = " ".join([" && ".join(commands)])
startup_script = (
shim_commands if gpus_no > 0 else f"{install_docker_script()} && {shim_commands}"
)
if gpus_no > 0:
# we'll need jq for patching /etc/docker/daemon.json, see get_shim_commands()
commands = install_jq_commands()
else:
commands = install_docker_commands()
commands += get_shim_commands(authorized_keys=public_keys)

try:
resp_data = self.api_client.create_virtual_machine(
Expand All @@ -85,7 +86,7 @@ def create_instance(
memory_gib=memory_size,
vcpus=instance_offer.instance.resources.cpus,
vm_id=vm_id,
start_script=startup_script,
start_script=" && ".join(commands),
password=None,
customSshKeys=public_keys,
)
Expand Down Expand Up @@ -151,6 +152,19 @@ def _get_image_id(cuda: bool) -> str:
return image_name


def install_docker_script():
    """Return a single shell command line that installs Docker CE via apt.

    The steps (non-interactive apt frontend, keyring directory, Docker GPG key,
    apt source entry, package index update, package install) are chained with
    ``&&`` so that a failing step prevents the following ones from running.
    """
    steps = (
        'export DEBIAN_FRONTEND="noninteractive"',
        "mkdir -p /etc/apt/keyrings",
        (
            "curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg"
            " | gpg --dearmor -o /etc/apt/keyrings/docker.gpg"
        ),
        (
            'echo "deb [arch=$(dpkg --print-architecture)'
            " signed-by=/etc/apt/keyrings/docker.gpg]"
            ' https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"'
            " | tee /etc/apt/sources.list.d/docker.list > /dev/null"
        ),
        "apt-get update",
        "apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin",
    )
    return " && ".join(steps)
def install_jq_commands():
    """Return the shell commands that install ``jq`` with apt, non-interactively.

    A new list is built on every call, so callers may extend it in place.
    """
    noninteractive_env = "export DEBIAN_FRONTEND=noninteractive"
    install_jq = "apt-get --assume-yes install jq"
    return [noninteractive_env, install_jq]


def install_docker_commands():
    """Return the ordered shell commands that install Docker CE via apt.

    Each list element is one step; the order matters (the apt source must be
    registered before the package index is refreshed and the packages installed).
    A new list is built on every call, so callers may extend it in place.
    """
    keyring_dir = "/etc/apt/keyrings"
    keyring_file = keyring_dir + "/docker.gpg"
    repo_url = "https://download.docker.com/linux/ubuntu"

    noninteractive_env = "export DEBIAN_FRONTEND=noninteractive"
    make_keyring_dir = "mkdir -p " + keyring_dir
    fetch_gpg_key = (
        "curl --max-time 60 -fsSL " + repo_url + "/gpg | gpg --dearmor -o " + keyring_file
    )
    # $(...) substitutions are left for the instance's shell to expand.
    add_apt_source = (
        'echo "deb [arch=$(dpkg --print-architecture) signed-by=' + keyring_file + "] "
        + repo_url + ' $(lsb_release -cs) stable"'
        " | tee /etc/apt/sources.list.d/docker.list > /dev/null"
    )
    refresh_index = "apt-get update"
    install_packages = (
        "apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin"
    )
    return [
        noninteractive_env,
        make_keyring_dir,
        fetch_gpg_key,
        add_apt_source,
        refresh_index,
        install_packages,
    ]
3 changes: 2 additions & 1 deletion src/dstack/_internal/core/backends/lambdalabs/compute.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import hashlib
import shlex
import subprocess
import tempfile
from threading import Thread
Expand Down Expand Up @@ -98,7 +99,7 @@ def update_provisioning_data(
arch=provisioning_data.instance_type.resources.cpu_arch,
)
# shim is assumed to be run under root
launch_command = "sudo sh -c '" + "&& ".join(commands) + "'"
launch_command = "sudo sh -c " + shlex.quote(" && ".join(commands))
thread = Thread(
target=_start_runner,
kwargs={
Expand Down
Loading