diff --git a/.github/workflows/docker-efa.yml b/.github/workflows/docker-efa.yml deleted file mode 100644 index 5f6ac0064f..0000000000 --- a/.github/workflows/docker-efa.yml +++ /dev/null @@ -1,46 +0,0 @@ -name: Build EFA Docker image - -on: - workflow_dispatch: - inputs: - image_name: - description: "Docker image name" - required: true - default: "dstackai/efa" - dstack_revision: - description: "Docker image revision" - required: true - default: 0 - -jobs: - build-efa: - defaults: - run: - working-directory: docker/efa - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Build and upload to DockerHub - run: | - IMAGE_NAME=${{ inputs.image_name }} - BUILD_DATE=$(date --utc --iso-8601=seconds)Z - docker buildx build . \ - --load \ - --provenance=false \ - --platform linux/amd64 \ - --build-arg IMAGE_NAME=${IMAGE_NAME} \ - --build-arg DSTACK_REVISION=${{ inputs.dstack_revision }} \ - --build-arg BUILD_DATE=${BUILD_DATE} \ - --tag ${IMAGE_NAME}:latest - VERSION=$(docker inspect --format '{{ index .Config.Labels "org.opencontainers.image.version" }}' ${IMAGE_NAME}) - docker tag ${IMAGE_NAME}:latest ${IMAGE_NAME}:${VERSION} - docker push ${IMAGE_NAME}:${VERSION} - docker push ${IMAGE_NAME}:latest diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 0beaf582ba..f4c97b5da1 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -51,8 +51,8 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python: ["3.9", "3.10", "3.11", "3.12", "3.13"] - flavor: ["base", "devel"] + flavor: ["base", "devel", "devel-efa"] + ubuntu_version: ["22"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -67,7 +67,21 @@ jobs: uses: docker/setup-qemu-action@v3 - name: Build and upload to DockerHub run: | - docker buildx build --platform linux/amd64 --build-arg FLAVOR=${{ matrix.flavor }} --build-arg PYTHON=${{ matrix.python }} --push --provenance=false --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:py${{ matrix.python }}-${{ inputs.image_version }}-cuda-12.1${{ matrix.flavor == 'devel' && '-devel' || '' }} -f base/Dockerfile . + if [ "${{ matrix.flavor }}" = "base" ]; then + FILE="base/Dockerfile" + elif [ "${{ matrix.flavor }}" = "devel" ]; then + FILE="base/Dockerfile" + else + FILE="base/efa/Dockerfile" + fi + docker buildx build \ + --platform linux/amd64 \ + --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ inputs.image_version }}-${{ matrix.flavor }}-ubuntu${{ matrix.ubuntu_version }}.04 \ + --build-arg FLAVOR=${{ matrix.flavor }} \ + --build-arg UBUNTU_VERSION=${{ matrix.ubuntu_version }} \ + --provenance=false \ + --push \ + -f $FILE . build-aws-images: needs: build-docker diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 2ed2fbd984..a42788984a 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -1,28 +1,79 @@ +# syntax = edrevo/dockerfile-plus +ARG UBUNTU_VERSION + +# Build stage +FROM nvidia/cuda:12.1.1-base-ubuntu${UBUNTU_VERSION}.04 AS builder + +ENV NCCL_HOME=/opt/nccl +ENV CUDA_HOME=/usr/local/cuda +ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi + +# Prerequisites + +RUN export DEBIAN_FRONTEND=noninteractive \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \ + && apt-get update --fix-missing \ + && apt-get upgrade -y \ + && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ + && apt-get install -y tzdata \ + && dpkg-reconfigure --frontend noninteractive tzdata \ + && cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool \ + libopenmpi-dev \ + git \ + curl \ + python3 \ + build-essential + +# NCCL + +ARG NCCL_VERSION=2.26.2-1 + +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} + +# NCCL tests + +RUN cd /opt \ + && git clone https://github.com/NVIDIA/nccl-tests \ + && cd nccl-tests \ + && make -j$(nproc) \ + MPI=1 \ + MPI_HOME=${OPEN_MPI_PATH} \ + CUDA_HOME=${CUDA_HOME} \ + NCCL_HOME=${NCCL_HOME} + +# Final stage + +INCLUDE+ base/Dockerfile.common + +ENV NCCL_HOME=/opt/nccl + +COPY --from=builder ${NCCL_HOME} ${NCCL_HOME} +COPY --from=builder /opt/nccl-tests/build /opt/nccl-tests/build + ARG FLAVOR -FROM nvidia/cuda:12.1.1-${FLAVOR}-ubuntu20.04 - -ARG PYTHON -ARG _UV_HOME="/opt/uv" -ENV UV_PYTHON="${PYTHON}" -ENV UV_INSTALL_DIR="${_UV_HOME}/bin" -ENV UV_PYTHON_INSTALL_DIR="${_UV_HOME}/python" -ENV UV_PYTHON_BIN_DIR="${UV_PYTHON_INSTALL_DIR}/bin" -ENV UV_MANAGED_PYTHON=1 -ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 - -ENV PATH="${UV_INSTALL_DIR}:${UV_PYTHON_BIN_DIR}:${PATH}" - -RUN export DEBIAN_FRONTEND=noninteractive && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ - apt-get update --fix-missing && \ - apt-get upgrade -y && \ - ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \ - apt-get install -y tzdata && \ - dpkg-reconfigure --frontend noninteractive tzdata && \ - apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \ - libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags && \ - sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config && mkdir /run/sshd && \ - mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys && rm /etc/ssh/ssh_host_* - -RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh && \ - uv python install --preview --default + +# MPI, NVCC, and /etc/ld.so.conf.d + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + openmpi-bin \ + && if [ "$FLAVOR" = "devel" ]; then \ + cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev; \ + fi \ + && rm -rf /var/lib/apt/lists/* \ + && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ + && ldconfig diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common new file mode 100644 index 0000000000..ae76f30124 --- /dev/null +++ b/docker/base/Dockerfile.common @@ -0,0 +1,35 @@ +ARG UBUNTU_VERSION + +FROM nvidia/cuda:12.1.1-base-ubuntu${UBUNTU_VERSION}.04 + +ARG _UV_HOME="/opt/uv" + +ENV UV_INSTALL_DIR="${_UV_HOME}/bin" +ENV UV_MANAGED_PYTHON=1 +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 + +ENV PATH="${UV_INSTALL_DIR}:${PATH}" + +ENV OMPI_MCA_pml=^cm,ucx +ENV OMPI_MCA_btl=tcp,self +ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0 +ENV NCCL_SOCKET_IFNAME=^docker,lo + +RUN export DEBIAN_FRONTEND=noninteractive \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \ + && apt-get update --fix-missing \ + && apt-get upgrade -y \ + && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ + && apt-get install -y tzdata \ + && dpkg-reconfigure --frontend noninteractive tzdata \ + && apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \ + libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags \ + && rm -rf /var/lib/apt/lists/* \ + && sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config \ + && mkdir /run/sshd \ + && mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \ + && chmod 600 ~/.ssh/authorized_keys \ + && rm /etc/ssh/ssh_host_* + +RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh \ + && uv python install --preview --default diff --git a/docker/efa/Dockerfile b/docker/base/efa/Dockerfile similarity index 61% rename from docker/efa/Dockerfile rename to docker/base/efa/Dockerfile index 0f8d717b0d..50b6c1c5ef 100644 --- a/docker/efa/Dockerfile +++ b/docker/base/efa/Dockerfile @@ -1,15 +1,15 @@ -ARG BASE_IMAGE=dstackai/base:py3.12-0.7-cuda-12.1 +# syntax = edrevo/dockerfile-plus -FROM ${BASE_IMAGE} +INCLUDE+ base/Dockerfile.common -ENV PREFIX=/usr/local -ENV CUDA_PATH=/usr/local/cuda +ENV NCCL_HOME=/usr/local +ENV CUDA_HOME=/usr/local/cuda ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" -# prerequisites +# Prerequisites RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ && apt-get update \ @@ -19,61 +19,58 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ libhwloc-dev \ autoconf \ automake \ - libtool + libtool \ + && rm -rf /var/lib/apt/lists/* # EFA ARG EFA_VERSION=1.38.1 -RUN cd $HOME \ +RUN cd /tmp \ + && apt-get update \ && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g + && ./efa_installer.sh -y --skip-kmod -g \ + && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* # NCCL ARG NCCL_VERSION=2.26.2-1 -RUN cd $HOME \ +RUN cd /tmp \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${PREFIX} + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ + && rm -rf /tmp/nccl # AWS OFI NCCL ARG OFI_VERSION=1.14.0 -RUN cd $HOME \ +RUN cd /tmp \ && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ && cd aws-ofi-nccl \ && ./autogen.sh \ && ./configure \ - --with-cuda=${CUDA_PATH} \ + --with-cuda=${CUDA_HOME} \ --with-libfabric=${LIBFABRIC_PATH} \ --with-mpi=${OPEN_MPI_PATH} \ - --with-cuda=${CUDA_PATH} \ - --with-nccl=${PREFIX} \ + --with-cuda=${CUDA_HOME} \ + --with-nccl=${NCCL_HOME} \ --disable-tests \ - --prefix=${PREFIX} \ - && make -j$(numproc) \ - && make install + --prefix=${NCCL_HOME} \ + && make -j$(nproc) \ + && make install \ + && rm -rf /tmp/aws-ofi-nccl /var/lib/apt/lists/* # NCCL Tests -RUN cd $HOME \ +RUN cd /opt \ && git clone https://github.com/NVIDIA/nccl-tests \ && cd nccl-tests \ - && make -j$(numproc) \ + && make -j$(nproc) \ MPI=1 \ MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_PATH} \ - NCCL_HOME=${PREFIX} - -ARG BUILD_DATE -ARG IMAGE_NAME -ARG DSTACK_REVISION - -LABEL org.opencontainers.image.title="${IMAGE_NAME}" -LABEL org.opencontainers.image.version="${EFA_VERSION}-${DSTACK_REVISION}" -LABEL org.opencontainers.image.created="${BUILD_DATE}" + CUDA_HOME=${CUDA_HOME} \ + NCCL_HOME=${NCCL_HOME} diff --git a/docker/efa/README.md b/docker/base/efa/README.md similarity index 100% rename from docker/efa/README.md rename to docker/base/efa/README.md diff --git a/examples/clusters/nccl-tests/.dstack.yml b/examples/clusters/nccl-tests/.dstack.yml index 3870731e35..164148b3c7 100644 --- a/examples/clusters/nccl-tests/.dstack.yml +++ b/examples/clusters/nccl-tests/.dstack.yml @@ -5,12 +5,9 @@ nodes: 2 startup_order: workers-first stop_criteria: master-done -# This image comes with MPI and NCCL tests pre-built -image: dstackai/efa env: - NCCL_DEBUG=INFO commands: - - cd /root/nccl-tests/build - | if [ $DSTACK_NODE_RANK -eq 0 ]; then mpirun \ @@ -18,13 +15,12 @@ commands: --hostfile $DSTACK_MPI_HOSTFILE \ -n $DSTACK_GPUS_NUM \ -N $DSTACK_GPUS_PER_NODE \ - --mca btl_tcp_if_exclude lo,docker0 \ --bind-to none \ - ./all_reduce_perf -b 8 -e 8G -f 2 -g 1 + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 else sleep infinity fi resources: - gpu: nvidia:4:16GB + gpu: nvidia:1..8 shm_size: 16GB diff --git a/examples/distributed-training/torchrun/.dstack.yml b/examples/distributed-training/torchrun/.dstack.yml index 062051f365..4eccdb263e 100644 --- a/examples/distributed-training/torchrun/.dstack.yml +++ b/examples/distributed-training/torchrun/.dstack.yml @@ -1,7 +1,6 @@ type: task name: train-distrib -# The size of the cluster nodes: 2 python: 3.12 @@ -21,6 +20,5 @@ commands: multinode.py 50 10 resources: - gpu: 24GB:1..2 - # Uncomment if using multiple GPUs - #shm_size: 24GB + gpu: 1..8 + shm_size: 16GB diff --git a/scripts/packer/aws-image-cuda.json b/scripts/packer/aws-image-cuda.json index 64f4280bb3..f7719304ef 100644 --- a/scripts/packer/aws-image-cuda.json +++ b/scripts/packer/aws-image-cuda.json @@ -81,14 +81,6 @@ { "type": "shell", "script": "provisioners/install-nvidia-container-toolkit.sh" - }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/aws-image.json b/scripts/packer/aws-image.json index c9e3cd5fb8..0327d6f9fb 100644 --- a/scripts/packer/aws-image.json +++ b/scripts/packer/aws-image.json @@ -71,14 +71,6 @@ "cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version {{user `docker_version`}}"] - }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/azure-image-cuda.json b/scripts/packer/azure-image-cuda.json index 4107c1ff65..c191282aec 100644 --- a/scripts/packer/azure-image-cuda.json +++ b/scripts/packer/azure-image-cuda.json @@ -73,14 +73,6 @@ "type": "shell", "script": "provisioners/install-nvidia-container-toolkit.sh" }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" - }, { "type": "shell", "execute_command": "chmod +x {{ .Path }}; {{ .Vars }} sudo -E sh '{{ .Path }}'", diff --git a/scripts/packer/azure-image-grid.json b/scripts/packer/azure-image-grid.json index adef2d4ba9..58239b777a 100644 --- a/scripts/packer/azure-image-grid.json +++ b/scripts/packer/azure-image-grid.json @@ -71,14 +71,6 @@ "type": "shell", "script": "provisioners/install-nvidia-container-toolkit.sh" }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" - }, { "type": "shell", "execute_command": "chmod +x {{ .Path }}; {{ .Vars }} sudo -E sh '{{ .Path }}'", diff --git a/scripts/packer/azure-image.json b/scripts/packer/azure-image.json index 2c5602040a..0b0d378335 100644 --- a/scripts/packer/azure-image.json +++ b/scripts/packer/azure-image.json @@ -63,14 +63,6 @@ "./install-docker.sh --version {{user `docker_version`}}" ] }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" - }, { "type": "shell", "execute_command": "chmod +x {{ .Path }}; {{ .Vars }} sudo -E sh '{{ .Path }}'", diff --git a/scripts/packer/build-cuda-image.pkr.hcl b/scripts/packer/build-cuda-image.pkr.hcl index b50dbada9f..48b1c20024 100644 --- a/scripts/packer/build-cuda-image.pkr.hcl +++ b/scripts/packer/build-cuda-image.pkr.hcl @@ -31,9 +31,4 @@ build { provisioner "shell" { script = "provisioners/install-nvidia-container-toolkit.sh" } - - provisioner "shell" { - environment_vars = ["IMAGE_VERSION=${var.image_version}"] - script = "provisioners/pull-docker-images.sh" - } } diff --git a/scripts/packer/build-image.pkr.hcl b/scripts/packer/build-image.pkr.hcl index 4cb1517dc3..6033ee4b1f 100644 --- a/scripts/packer/build-image.pkr.hcl +++ b/scripts/packer/build-image.pkr.hcl @@ -22,9 +22,4 @@ build { provisioner "shell" { inline = ["cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version ${local.docker_version}"] } - - provisioner "shell" { - environment_vars = ["IMAGE_VERSION=${var.image_version}"] - script = "provisioners/pull-docker-images.sh" - } } diff --git a/scripts/packer/gcp-image-cuda.json b/scripts/packer/gcp-image-cuda.json index 9ebb24e7c9..2d606a2b42 100644 --- a/scripts/packer/gcp-image-cuda.json +++ b/scripts/packer/gcp-image-cuda.json @@ -56,14 +56,6 @@ { "type": "shell", "script": "provisioners/install-nvidia-container-toolkit.sh" - }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/gcp-image.json b/scripts/packer/gcp-image.json index 6e6ba64537..7b9fd2f950 100644 --- a/scripts/packer/gcp-image.json +++ b/scripts/packer/gcp-image.json @@ -46,14 +46,6 @@ "cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version {{user `docker_version`}}"] - }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/oci-image-cuda.json b/scripts/packer/oci-image-cuda.json index 7d7251bc0b..e0406d8738 100644 --- a/scripts/packer/oci-image-cuda.json +++ b/scripts/packer/oci-image-cuda.json @@ -65,14 +65,6 @@ { "type": "shell", "script": "provisioners/install-nvidia-container-toolkit.sh" - }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/oci-image.json b/scripts/packer/oci-image.json index 742a8649a0..370cb4bf2d 100644 --- a/scripts/packer/oci-image.json +++ b/scripts/packer/oci-image.json @@ -55,14 +55,6 @@ "cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version {{user `docker_version`}}"] - }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/provisioners/pull-docker-images.sh b/scripts/packer/provisioners/pull-docker-images.sh deleted file mode 100644 index 4d9d826435..0000000000 --- a/scripts/packer/provisioners/pull-docker-images.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -set -e - -IMAGES=" - dstackai/${IMAGE_REPO}:py3.13-${IMAGE_VERSION}-cuda-12.1 - dstackai/${IMAGE_REPO}:py3.12-${IMAGE_VERSION}-cuda-12.1 - dstackai/${IMAGE_REPO}:py3.11-${IMAGE_VERSION}-cuda-12.1 - dstackai/${IMAGE_REPO}:py3.10-${IMAGE_VERSION}-cuda-12.1 - dstackai/${IMAGE_REPO}:py3.9-${IMAGE_VERSION}-cuda-12.1 -" -echo "START pull image" -for img in $IMAGES; do - docker pull --platform linux/amd64 $img -done -echo "LIST installed images" -docker image ls --all -echo "END " diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py index e05f98fd23..b834db39b9 100644 --- a/src/dstack/_internal/server/background/tasks/process_running_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_running_jobs.py @@ -1,4 +1,5 @@ import asyncio +import re from collections.abc import Iterable from datetime import timedelta, timezone from typing import Dict, List, Optional @@ -7,6 +8,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload +from dstack._internal import settings from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HTTP_PORT from dstack._internal.core.errors import GatewayError from dstack._internal.core.models.backends.base import BackendType @@ -517,14 +519,14 @@ def _process_provisioning_with_shim( cpu = None memory = None network_mode = NetworkMode.HOST - + image_name = _patch_base_image_for_aws_efa(job_spec, job_provisioning_data) if shim_client.is_api_v2_supported(): shim_client.submit_task( task_id=job_model.id, name=job_model.job_name, registry_username=registry_username, registry_password=registry_password, - image_name=job_spec.image_name, + image_name=image_name, container_user=container_user, privileged=job_spec.privileged, gpu=gpu, @@ -545,7 +547,7 @@ def _process_provisioning_with_shim( submitted = shim_client.submit( username=registry_username, password=registry_password, - image_name=job_spec.image_name, + image_name=image_name, privileged=job_spec.privileged, container_name=job_model.job_name, container_user=container_user, @@ -969,3 +971,43 @@ def _get_instance_specific_gpu_devices( GPUDevice(path_on_host="/dev/nvidiactl", path_in_container="/dev/nvidiactl") ) return gpu_devices + + +def _patch_base_image_for_aws_efa( + job_spec: JobSpec, job_provisioning_data: JobProvisioningData +) -> str: + image_name = job_spec.image_name + + if job_provisioning_data.backend != BackendType.AWS: + return image_name + + instance_type = job_provisioning_data.instance_type.name + efa_enabled_patterns = [ + # TODO: p6-b200 isn't supported yet in gpuhunt + r"^p6-b200\.(48xlarge)$", + r"^p5\.(48xlarge)$", + r"^p5e\.(48xlarge)$", + r"^p5en\.(48xlarge)$", + r"^p4d\.(24xlarge)$", + r"^p4de\.(24xlarge)$", + r"^g6\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$", + r"^g6e\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$", + r"^gr6\.8xlarge$", + r"^g5\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$", + r"^g4dn\.(8xlarge|12xlarge|16xlarge|metal)$", + r"^p3dn\.(24xlarge)$", + ] + + is_efa_enabled = any(re.match(pattern, instance_type) for pattern in efa_enabled_patterns) + if not is_efa_enabled: + return image_name + + if not image_name.startswith(f"{settings.DSTACK_BASE_IMAGE}:"): + return image_name + + if image_name.endswith(f"-base-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}"): + return image_name[:-17] + f"-devel-efa-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" + elif image_name.endswith(f"-devel-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}"): + return image_name[:-18] + f"-devel-efa-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" + + return image_name diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py index e3c7b89ee4..6ef0ca7712 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/base.py +++ b/src/dstack/_internal/server/services/jobs/configurators/base.py @@ -50,11 +50,15 @@ def get_default_python_verison() -> str: ) -def get_default_image(python_version: str, nvcc: bool = False) -> str: - suffix = "" - if nvcc: - suffix = "-devel" - return f"{settings.DSTACK_BASE_IMAGE}:py{python_version}-{settings.DSTACK_BASE_IMAGE_VERSION}-cuda-12.1{suffix}" +def get_default_image(nvcc: bool = False) -> str: + """ + Note: May be overridden by dstack (e.g., EFA-enabled version for AWS EFA-capable instances). + See `dstack._internal.server.background.tasks.process_running_jobs._patch_base_image_for_aws_efa` for details. + + Args: + nvcc: If True, returns 'devel' variant, otherwise 'base'. + """ + return f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-{'devel' if nvcc else 'base'}-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" class JobConfigurator(ABC): @@ -173,7 +177,7 @@ def _dstack_image_commands(self) -> List[str]: ): return [] return [ - f"uv venv --prompt workflow --seed {DEFAULT_REPO_DIR}/.venv > /dev/null 2>&1", + f"uv venv --python {self._python()} --prompt workflow --seed {DEFAULT_REPO_DIR}/.venv > /dev/null 2>&1", f"echo 'source {DEFAULT_REPO_DIR}/.venv/bin/activate' >> ~/.bashrc", f"source {DEFAULT_REPO_DIR}/.venv/bin/activate", ] @@ -199,7 +203,7 @@ def _home_dir(self) -> Optional[str]: def _image_name(self) -> str: if self.run_spec.configuration.image is not None: return self.run_spec.configuration.image - return get_default_image(self._python(), nvcc=bool(self.run_spec.configuration.nvcc)) + return get_default_image(nvcc=bool(self.run_spec.configuration.nvcc)) async def _user(self) -> Optional[UnixUser]: user = self.run_spec.configuration.user diff --git a/src/dstack/_internal/settings.py b/src/dstack/_internal/settings.py index 92a7326a0c..2636a3b362 100644 --- a/src/dstack/_internal/settings.py +++ b/src/dstack/_internal/settings.py @@ -14,6 +14,9 @@ DSTACK_BASE_IMAGE = os.getenv("DSTACK_BASE_IMAGE", "dstackai/base") DSTACK_BASE_IMAGE_VERSION = os.getenv("DSTACK_BASE_IMAGE_VERSION", version.base_image) +DSTACK_BASE_IMAGE_UBUNTU_VERSION = os.getenv( + "DSTACK_BASE_IMAGE_UBUNTU_VERSION", version.base_image_ubuntu_version +) class FeatureFlags: diff --git a/src/dstack/version.py b/src/dstack/version.py index c71d1a7665..d4255a2301 100644 --- a/src/dstack/version.py +++ b/src/dstack/version.py @@ -1,3 +1,4 @@ __version__ = "0.0.0" __is_release__ = False -base_image = "0.9" +base_image = "0.10" +base_image_ubuntu_version = "22.04" diff --git a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py index d842441fe9..59a08ddc4d 100644 --- a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py +++ b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py @@ -7,16 +7,21 @@ from freezegun import freeze_time from sqlalchemy.ext.asyncio import AsyncSession +from dstack._internal import settings from dstack._internal.core.errors import SSHError from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.common import NetworkMode from dstack._internal.core.models.configurations import DevEnvironmentConfiguration -from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.instances import InstanceStatus, InstanceType from dstack._internal.core.models.profiles import StartupOrder, UtilizationPolicy +from dstack._internal.core.models.resources import ResourcesSpec from dstack._internal.core.models.runs import ( + JobProvisioningData, JobRuntimeData, + JobSpec, JobStatus, JobTerminationReason, + Requirements, RunStatus, ) from dstack._internal.core.models.volumes import ( @@ -24,8 +29,11 @@ VolumeMountPoint, VolumeStatus, ) -from dstack._internal.server import settings -from dstack._internal.server.background.tasks.process_running_jobs import process_running_jobs +from dstack._internal.server import settings as server_settings +from dstack._internal.server.background.tasks.process_running_jobs import ( + _patch_base_image_for_aws_efa, + process_running_jobs, +) from dstack._internal.server.schemas.runner import ( HealthcheckResponse, JobStateEvent, @@ -221,7 +229,7 @@ async def test_updates_running_job(self, test_db, session: AsyncSession, tmp_pat patch( "dstack._internal.server.services.runner.client.RunnerClient" ) as RunnerClientMock, - patch.object(settings, "SERVER_DIR_PATH", tmp_path), + patch.object(server_settings, "SERVER_DIR_PATH", tmp_path), ): runner_client_mock = RunnerClientMock.return_value runner_client_mock.pull.return_value = PullResponse( @@ -330,7 +338,7 @@ async def test_provisioning_shim_with_volumes( name="test-run-0-0", registry_username="", registry_password="", - image_name="dstackai/base:py3.13-0.9-cuda-12.1", + image_name="dstackai/base:0.10-base-ubuntu22.04", container_user="root", privileged=privileged, gpu=None, @@ -878,3 +886,129 @@ async def test_master_job_waits_for_workers(self, test_db, session: AsyncSession await process_running_jobs() await session.refresh(master_job) assert master_job.status == JobStatus.RUNNING + + +class TestPatchBaseImageForAwsEfa: + @staticmethod + def _create_job_spec(image_name: str) -> "JobSpec": + return JobSpec( + job_num=0, + job_name="test-job", + commands=["echo hello"], + env={}, + image_name=image_name, + requirements=Requirements(resources=ResourcesSpec()), + ) + + @staticmethod + def _create_job_provisioning_data_with_instance_type( + backend: BackendType, instance_type: str + ) -> JobProvisioningData: + job_provisioning_data = get_job_provisioning_data(backend=backend) + job_provisioning_data.instance_type = InstanceType( + name=instance_type, + resources=job_provisioning_data.instance_type.resources, + ) + return job_provisioning_data + + @staticmethod + def _call_patch_base_image_for_aws_efa( + image_name: str, backend: BackendType, instance_type: str + ) -> str: + job_spec = TestPatchBaseImageForAwsEfa._create_job_spec(image_name) + job_provisioning_data = ( + TestPatchBaseImageForAwsEfa._create_job_provisioning_data_with_instance_type( + backend, instance_type + ) + ) + return _patch_base_image_for_aws_efa(job_spec, job_provisioning_data) + + @pytest.mark.parametrize( + "suffix,instance_type", + [ + ("-base", "p6-b200.48xlarge"), + ("-devel", "p5.48xlarge"), + ], + ) + def test_patch_aws_efa_instance_with_suffix(self, suffix: str, instance_type: str): + image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" + result = self._call_patch_base_image_for_aws_efa( + image_name, BackendType.AWS, instance_type + ) + expected = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-devel-efa-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" + assert result == expected + + @pytest.mark.parametrize("suffix", ["-base", "-devel"]) + @pytest.mark.parametrize( + "instance_type", + [ + "p5.48xlarge", + "p5e.48xlarge", + "p4d.24xlarge", + "p4de.24xlarge", + "g6.8xlarge", + "g6e.8xlarge", + ], + ) + def test_patch_all_efa_instance_types(self, instance_type: str, suffix: str): + image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" + result = self._call_patch_base_image_for_aws_efa( + image_name, BackendType.AWS, instance_type + ) + expected = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-devel-efa-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" + assert result == expected + + @pytest.mark.parametrize("suffix", ["-base", "-devel"]) + @pytest.mark.parametrize( + "backend", + [BackendType.GCP, BackendType.AZURE, BackendType.LAMBDA, BackendType.LOCAL], + ) + @pytest.mark.parametrize( + "instance_type", + [ + "standard-4", + "p5.xlarge", + "p6.2xlarge", + "g6.xlarge", + ], # Mix of generic and EFA-named types + ) + def test_no_patch_non_aws_backends( + self, backend: BackendType, suffix: str, instance_type: str + ): + image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" + result = self._call_patch_base_image_for_aws_efa(image_name, backend, instance_type) + assert result == image_name + + @pytest.mark.parametrize("suffix", ["-base", "-devel"]) + @pytest.mark.parametrize( + "instance_type", + ["t3.micro", "m5.large", "c5.xlarge", "r5.2xlarge", "m6i.large", "g6.xlarge"], + ) + def test_no_patch_non_efa_aws_instances(self, instance_type: str, suffix: str): + image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}" + result = self._call_patch_base_image_for_aws_efa( + image_name, BackendType.AWS, instance_type + ) + assert result == image_name + + @pytest.mark.parametrize( + "instance_type", + ["p5.xlarge", "p6.2xlarge", "t3.micro", "m5.large"], # Mix of EFA and non-EFA instances + ) + @pytest.mark.parametrize( + "image_name", + [ + "ubuntu:20.04", + "nvidia/cuda:11.8-runtime-ubuntu20.04", + "python:3.9-slim", + "custom/image:latest", + f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-custom", + f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-devel-efa", + f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}", + ], + ) + def test_no_patch_other_images(self, instance_type: str, image_name: str): + result = self._call_patch_base_image_for_aws_efa( + image_name, BackendType.AWS, instance_type + ) + assert result == image_name diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 4a40c3273f..3c2181a209 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -170,7 +170,7 @@ def get_dev_env_run_plan_dict( "/bin/bash", "-i", "-c", - "uv venv --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" + "uv venv --python 3.13 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc" " && source /workflow/.venv/bin/activate" " && (echo pip install ipykernel... && " @@ -188,7 +188,7 @@ def get_dev_env_run_plan_dict( ], "env": {}, "home_dir": "/root", - "image_name": "dstackai/base:py3.13-0.9-cuda-12.1", + "image_name": "dstackai/base:0.10-base-ubuntu22.04", "user": None, "privileged": privileged, "job_name": f"{run_name}-0-0", @@ -334,7 +334,7 @@ def get_dev_env_run_dict( "/bin/bash", "-i", "-c", - "uv venv --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" + "uv venv --python 3.13 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc" " && source /workflow/.venv/bin/activate" " && (echo pip install ipykernel... && " @@ -352,7 +352,7 @@ def get_dev_env_run_dict( ], "env": {}, "home_dir": "/root", - "image_name": "dstackai/base:py3.13-0.9-cuda-12.1", + "image_name": "dstackai/base:0.10-base-ubuntu22.04", "user": None, "privileged": privileged, "job_name": f"{run_name}-0-0", diff --git a/src/tests/_internal/server/services/jobs/configurators/test_task.py b/src/tests/_internal/server/services/jobs/configurators/test_task.py index 07c8da7328..e954e6a01f 100644 --- a/src/tests/_internal/server/services/jobs/configurators/test_task.py +++ b/src/tests/_internal/server/services/jobs/configurators/test_task.py @@ -88,7 +88,7 @@ async def test_with_commands_and_image(self, shell: Optional[str], expected_shel ], ) async def test_with_commands_no_image(self, shell: Optional[str], expected_shell: str): - configuration = TaskConfiguration(commands=["sleep inf"], shell=shell) + configuration = TaskConfiguration(python="3.12", commands=["sleep inf"], shell=shell) run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) configurator = TaskJobConfigurator(run_spec) @@ -98,7 +98,7 @@ async def test_with_commands_no_image(self, shell: Optional[str], expected_shell expected_shell, "-i", "-c", - "uv venv --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" + "uv venv --python 3.12 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc" " && source /workflow/.venv/bin/activate" " && sleep inf",