# Patch summary (free-text preamble; patch tools ignore everything before the
# first "diff --git" header).
#
# Purpose: work around
# https://github.com/NVIDIA/nvidia-container-toolkit/issues/48 by adding
# "native.cgroupdriver=cgroupfs" to Docker's "exec-opts" before the shim starts.
#
# base/compute.py
#   * get_shim_commands() now starts its command list with
#     get_setup_cloud_instance_commands() and appends the pre-start commands
#     to it.
#   * get_setup_cloud_instance_commands() returns one shell command, wrapped
#     in "/bin/sh -c '...'". Inside the wrapper the steps are chained with &&:
#       1. /etc/docker/daemon.json must mention "nvidia";
#       2. it must NOT already mention "native.cgroupdriver";
#       3. jq appends the option to "exec-opts" (keeping existing entries,
#          defaulting to [] when the key is absent) and writes the result to
#          /tmp/daemon.json;
#       4. the temp file is moved over /etc/docker/daemon.json with sudo;
#       5. the docker service is restarted.
#     The chain ends with "|| true", so the wrapped command exits 0 whether or
#     not the patch was applied. The jq filter is single-quoted inside the
#     already single-quoted sh script using the '\'' idiom.
#
# cudo/compute.py
#   * The start script is now built from a list of commands joined with
#     " && ": GPU offers prepend install_jq_commands(), non-GPU offers prepend
#     install_docker_commands(), then the shim commands follow.
#   * install_docker_script() (one long &&-joined string) is replaced by
#     install_docker_commands(), which returns the same steps as list items.
#
# lambdalabs/compute.py
#   * The launch command is quoted with shlex.quote() instead of hand-written
#     single quotes. The command list now contains single quotes (from the
#     daemon.json step above), which the old '...' wrapper did not escape.
#     The join separator also changes from "&& " to " && ".
#
# NOTE(review): install_jq_commands() runs "apt-get install jq" without a
#   preceding "apt-get update" (install_docker_commands() does run one).
#   Because the Cudo start script is chained with &&, a failed jq install
#   would prevent the shim commands from running at all -- confirm the GPU
#   image ships usable package lists, or make the install best-effort.
# NOTE(review): in this patch jq is installed only on the Cudo GPU path. On
#   any other caller of get_shim_commands(), a missing jq presumably makes
#   the workaround a silent no-op because of the trailing "|| true" -- verify
#   that this is the intended behaviour.
# NOTE(review): /tmp/daemon.json is a fixed, predictable path; consider
#   mktemp if the script can run alongside untrusted users -- TODO confirm.
# NOTE(review): the new function is annotated "-> list[str]" while the
#   neighbouring get_shim_commands() signature uses "List[str]".
diff --git a/src/dstack/_internal/core/backends/base/compute.py b/src/dstack/_internal/core/backends/base/compute.py
index 0125f6066b..53d062567b 100644
--- a/src/dstack/_internal/core/backends/base/compute.py
+++ b/src/dstack/_internal/core/backends/base/compute.py
@@ -559,7 +559,8 @@ def get_shim_commands(
     backend_shim_env: Optional[Dict[str, str]] = None,
     arch: Optional[str] = None,
 ) -> List[str]:
-    commands = get_shim_pre_start_commands(
+    commands = get_setup_cloud_instance_commands()
+    commands += get_shim_pre_start_commands(
         base_path=base_path,
         bin_path=bin_path,
         arch=arch,
@@ -641,6 +642,23 @@ def get_dstack_shim_download_url(arch: Optional[str] = None) -> str:
     return url_template.format(version=version, arch=arch)
 
 
+def get_setup_cloud_instance_commands() -> list[str]:
+    return [
+        # Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
+        # Attempts to patch /etc/docker/daemon.json while keeping any custom settings it may have.
+        (
+            "/bin/sh -c '"  # wrap in /bin/sh to avoid interfering with other cloud init commands
+            " grep -q nvidia /etc/docker/daemon.json"
+            " && ! grep -q native.cgroupdriver /etc/docker/daemon.json"
+            " && jq '\\''.\"exec-opts\" = ((.\"exec-opts\" // []) + [\"native.cgroupdriver=cgroupfs\"])'\\'' /etc/docker/daemon.json > /tmp/daemon.json"
+            " && sudo mv /tmp/daemon.json /etc/docker/daemon.json"
+            " && sudo service docker restart"
+            " || true"
+            "'"
+        ),
+    ]
+
+
 def get_shim_pre_start_commands(
     base_path: Optional[PathLike] = None,
     bin_path: Optional[PathLike] = None,
diff --git a/src/dstack/_internal/core/backends/cudo/compute.py b/src/dstack/_internal/core/backends/cudo/compute.py
index 36d260cdea..4da43b6b2a 100644
--- a/src/dstack/_internal/core/backends/cudo/compute.py
+++ b/src/dstack/_internal/core/backends/cudo/compute.py
@@ -65,12 +65,13 @@ def create_instance(
         public_keys = instance_config.get_public_keys()
         memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
         disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
-        commands = get_shim_commands(authorized_keys=public_keys)
         gpus_no = len(instance_offer.instance.resources.gpus)
-        shim_commands = " ".join([" && ".join(commands)])
-        startup_script = (
-            shim_commands if gpus_no > 0 else f"{install_docker_script()} && {shim_commands}"
-        )
+        if gpus_no > 0:
+            # we'll need jq for patching /etc/docker/daemon.json, see get_shim_commands()
+            commands = install_jq_commands()
+        else:
+            commands = install_docker_commands()
+        commands += get_shim_commands(authorized_keys=public_keys)
 
         try:
             resp_data = self.api_client.create_virtual_machine(
@@ -85,7 +86,7 @@ def create_instance(
                 memory_gib=memory_size,
                 vcpus=instance_offer.instance.resources.cpus,
                 vm_id=vm_id,
-                start_script=startup_script,
+                start_script=" && ".join(commands),
                 password=None,
                 customSshKeys=public_keys,
             )
@@ -151,6 +152,19 @@ def _get_image_id(cuda: bool) -> str:
     return image_name
 
 
-def install_docker_script():
-    commands = 'export DEBIAN_FRONTEND="noninteractive" && mkdir -p /etc/apt/keyrings && curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null && apt-get update && apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin'
-    return commands
+def install_jq_commands():
+    return [
+        "export DEBIAN_FRONTEND=noninteractive",
+        "apt-get --assume-yes install jq",
+    ]
+
+
+def install_docker_commands():
+    return [
+        "export DEBIAN_FRONTEND=noninteractive",
+        "mkdir -p /etc/apt/keyrings",
+        "curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg",
+        'echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null',
+        "apt-get update",
+        "apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin",
+    ]
diff --git a/src/dstack/_internal/core/backends/lambdalabs/compute.py b/src/dstack/_internal/core/backends/lambdalabs/compute.py
index 865b8972a5..15641851ac 100644
--- a/src/dstack/_internal/core/backends/lambdalabs/compute.py
+++ b/src/dstack/_internal/core/backends/lambdalabs/compute.py
@@ -1,4 +1,5 @@
 import hashlib
+import shlex
 import subprocess
 import tempfile
 from threading import Thread
@@ -98,7 +99,7 @@ def update_provisioning_data(
                 arch=provisioning_data.instance_type.resources.cpu_arch,
             )
             # shim is assumed to be run under root
-            launch_command = "sudo sh -c '" + "&& ".join(commands) + "'"
+            launch_command = "sudo sh -c " + shlex.quote(" && ".join(commands))
             thread = Thread(
                 target=_start_runner,
                 kwargs={