From 5290d9893a1bb8b6edb554c74ae410bb152c1353 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Thu, 12 Jun 2025 15:09:58 +0300 Subject: [PATCH 01/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/{ => base}/efa/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename docker/{ => base}/efa/Dockerfile (96%) diff --git a/docker/efa/Dockerfile b/docker/base/efa/Dockerfile similarity index 96% rename from docker/efa/Dockerfile rename to docker/base/efa/Dockerfile index 0f8d717b0d..8948ac1235 100644 --- a/docker/efa/Dockerfile +++ b/docker/base/efa/Dockerfile @@ -1,6 +1,6 @@ -ARG BASE_IMAGE=dstackai/base:py3.12-0.7-cuda-12.1 +# syntax = edrevo/dockerfile-plus -FROM ${BASE_IMAGE} +INCLUDE+ base/Dockerfile ENV PREFIX=/usr/local ENV CUDA_PATH=/usr/local/cuda From 53a717d18ea81673ec3104420d45a00e2c9affa5 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Thu, 12 Jun 2025 15:13:09 +0300 Subject: [PATCH 02/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 0beaf582ba..5c378e2bf0 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -52,7 +52,7 @@ jobs: strategy: matrix: python: ["3.9", "3.10", "3.11", "3.12", "3.13"] - flavor: ["base", "devel"] + flavor: ["base", "devel", "efa"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -67,7 +67,27 @@ jobs: uses: docker/setup-qemu-action@v3 - name: Build and upload to DockerHub run: | - docker buildx build --platform linux/amd64 --build-arg FLAVOR=${{ matrix.flavor }} --build-arg PYTHON=${{ matrix.python }} --push --provenance=false --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:py${{ matrix.python }}-${{ inputs.image_version }}-cuda-12.1${{ matrix.flavor == 'devel' && '-devel' || '' }} -f base/Dockerfile . + if [ "${{ matrix.flavor }}" = "base" ]; then + TAG_SUFFIX="" + FLAVOR="base" + FILE="base/Dockerfile" + elif [ "${{ matrix.flavor }}" = "efa" ]; then + TAG_SUFFIX="-efa" + FLAVOR="devel" + FILE="base/efa/Dockerfile" + else + TAG_SUFFIX="-devel" + FLAVOR="devel" + FILE="base/Dockerfile" + fi + docker buildx build \ + --platform linux/amd64 \ + --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:py${{ matrix.python }}-${{ inputs.image_version }}-cuda-12.1${TAG_SUFFIX} \ + --build-arg FLAVOR=$FLAVOR \ + --build-arg PYTHON=${{ matrix.python }} \ + --provenance=false \ + --push \ + -f $FILE . build-aws-images: needs: build-docker From e412313c98e20a118631b107d988b8ba1a755f8f Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Thu, 12 Jun 2025 22:49:30 +0300 Subject: [PATCH 03/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker-efa.yml | 46 -------------------------------- docker/base/Dockerfile | 32 ++++++++++++++++++++++ docker/base/efa/Dockerfile | 31 +++++++-------------- 3 files changed, 41 insertions(+), 68 deletions(-) delete mode 100644 .github/workflows/docker-efa.yml diff --git a/.github/workflows/docker-efa.yml b/.github/workflows/docker-efa.yml deleted file mode 100644 index 5f6ac0064f..0000000000 --- a/.github/workflows/docker-efa.yml +++ /dev/null @@ -1,46 +0,0 @@ -name: Build EFA Docker image - -on: - workflow_dispatch: - inputs: - image_name: - description: "Docker image name" - required: true - default: "dstackai/efa" - dstack_revision: - description: "Docker image revision" - required: true - default: 0 - -jobs: - build-efa: - defaults: - run: - working-directory: docker/efa - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Build and upload to DockerHub - run: | - IMAGE_NAME=${{ inputs.image_name }} - BUILD_DATE=$(date --utc --iso-8601=seconds)Z - docker buildx build . \ - --load \ - --provenance=false \ - --platform linux/amd64 \ - --build-arg IMAGE_NAME=${IMAGE_NAME} \ - --build-arg DSTACK_REVISION=${{ inputs.dstack_revision }} \ - --build-arg BUILD_DATE=${BUILD_DATE} \ - --tag ${IMAGE_NAME}:latest - VERSION=$(docker inspect --format '{{ index .Config.Labels "org.opencontainers.image.version" }}' ${IMAGE_NAME}) - docker tag ${IMAGE_NAME}:latest ${IMAGE_NAME}:${VERSION} - docker push ${IMAGE_NAME}:${VERSION} - docker push ${IMAGE_NAME}:latest diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 2ed2fbd984..24049288b8 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -1,8 +1,11 @@ ARG FLAVOR FROM nvidia/cuda:12.1.1-${FLAVOR}-ubuntu20.04 +# UV & Python + ARG PYTHON ARG _UV_HOME="/opt/uv" + ENV UV_PYTHON="${PYTHON}" ENV UV_INSTALL_DIR="${_UV_HOME}/bin" ENV UV_PYTHON_INSTALL_DIR="${_UV_HOME}/python" @@ -26,3 +29,32 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh && \ uv python install --preview --default + +# NCCL & NCCL tests + +ARG NCCL_VERSION=2.26.2-1 +ARG FLAVOR + +ENV FLAVOR=${FLAVOR} +ENV NCCL_HOME=/usr/local +ENV CUDA_PATH=/usr/local/cuda +ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" +ENV PATH="${OPEN_MPI_PATH}/bin:${HOME}/nccl-tests/build:${PATH}" + +RUN if [ "${FLAVOR}" = "devel" ]; then \ + apt-get install -y --no-install-recommends \ + libopenmpi-dev \ + && cd $HOME \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ + && cd $HOME \ + && git clone https://github.com/NVIDIA/nccl-tests \ + && cd nccl-tests \ + && make -j$(nproc) \ + MPI=1 \ + MPI_HOME=${OPEN_MPI_PATH} \ + CUDA_HOME=${CUDA_PATH} \ + NCCL_HOME=${NCCL_HOME}; \ + fi diff --git a/docker/base/efa/Dockerfile b/docker/base/efa/Dockerfile index 8948ac1235..1c78cccc5e 100644 --- a/docker/base/efa/Dockerfile +++ b/docker/base/efa/Dockerfile @@ -1,21 +1,16 @@ -# syntax = edrevo/dockerfile-plus +FROM nvidia/cuda:12.1.1-devel-ubuntu20.04 -INCLUDE+ base/Dockerfile - -ENV PREFIX=/usr/local +ENV NCCL_HOME=/usr/local ENV CUDA_PATH=/usr/local/cuda ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi -ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" +ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${HOME}/nccl-tests/build:${PATH}" ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" -# prerequisites +# Prerequisites -RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get update \ +RUN apt-get update \ && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-${cuda_version} \ - cuda-nvcc-${cuda_version} \ libhwloc-dev \ autoconf \ automake \ @@ -38,7 +33,7 @@ ARG NCCL_VERSION=2.26.2-1 RUN cd $HOME \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${PREFIX} + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} # AWS OFI NCCL @@ -53,9 +48,9 @@ RUN cd $HOME \ --with-libfabric=${LIBFABRIC_PATH} \ --with-mpi=${OPEN_MPI_PATH} \ --with-cuda=${CUDA_PATH} \ - --with-nccl=${PREFIX} \ + --with-nccl=${NCCL_HOME} \ --disable-tests \ - --prefix=${PREFIX} \ + --prefix=${NCCL_HOME} \ && make -j$(numproc) \ && make install @@ -68,12 +63,4 @@ RUN cd $HOME \ MPI=1 \ MPI_HOME=${OPEN_MPI_PATH} \ CUDA_HOME=${CUDA_PATH} \ - NCCL_HOME=${PREFIX} - -ARG BUILD_DATE -ARG IMAGE_NAME -ARG DSTACK_REVISION - -LABEL org.opencontainers.image.title="${IMAGE_NAME}" -LABEL org.opencontainers.image.version="${EFA_VERSION}-${DSTACK_REVISION}" -LABEL org.opencontainers.image.created="${BUILD_DATE}" + NCCL_HOME=${NCCL_HOME} From 22b2530f579b256aeaf8cae23c830f911923b540 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Thu, 12 Jun 2025 23:05:15 +0300 Subject: [PATCH 04/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/Dockerfile | 31 ------------------------------- docker/base/devel/Dockerfile | 28 ++++++++++++++++++++++++++++ docker/base/efa/Dockerfile | 4 +++- 3 files changed, 31 insertions(+), 32 deletions(-) create mode 100644 docker/base/devel/Dockerfile diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 24049288b8..5a5228daba 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -1,8 +1,6 @@ ARG FLAVOR FROM nvidia/cuda:12.1.1-${FLAVOR}-ubuntu20.04 -# UV & Python - ARG PYTHON ARG _UV_HOME="/opt/uv" @@ -29,32 +27,3 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh && \ uv python install --preview --default - -# NCCL & NCCL tests - -ARG NCCL_VERSION=2.26.2-1 -ARG FLAVOR - -ENV FLAVOR=${FLAVOR} -ENV NCCL_HOME=/usr/local -ENV CUDA_PATH=/usr/local/cuda -ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi -ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" -ENV PATH="${OPEN_MPI_PATH}/bin:${HOME}/nccl-tests/build:${PATH}" - -RUN if [ "${FLAVOR}" = "devel" ]; then \ - apt-get install -y --no-install-recommends \ - libopenmpi-dev \ - && cd $HOME \ - && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ - && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ - && cd $HOME \ - && git clone https://github.com/NVIDIA/nccl-tests \ - && cd nccl-tests \ - && make -j$(nproc) \ - MPI=1 \ - MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_PATH} \ - NCCL_HOME=${NCCL_HOME}; \ - fi diff --git a/docker/base/devel/Dockerfile b/docker/base/devel/Dockerfile new file mode 100644 index 0000000000..7d1a839b91 --- /dev/null +++ b/docker/base/devel/Dockerfile @@ -0,0 +1,28 @@ +# syntax = edrevo/dockerfile-plus + +INCLUDE+ base/Dockerfile + +# NCCL & NCCL tests + +ARG NCCL_VERSION=2.26.2-1 + +ENV NCCL_HOME=/usr/local +ENV CUDA_PATH=/usr/local/cuda +ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" +ENV PATH="${OPEN_MPI_PATH}/bin:${HOME}/nccl-tests/build:${PATH}" + +RUN apt-get install -y --no-install-recommends \ + libopenmpi-dev \ + && cd $HOME \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ + && cd $HOME \ + && git clone https://github.com/NVIDIA/nccl-tests \ + && cd nccl-tests \ + && make -j$(nproc) \ + MPI=1 \ + MPI_HOME=${OPEN_MPI_PATH} \ + CUDA_HOME=${CUDA_PATH} \ + NCCL_HOME=${NCCL_HOME} diff --git a/docker/base/efa/Dockerfile b/docker/base/efa/Dockerfile index 1c78cccc5e..9abfb0302e 100644 --- a/docker/base/efa/Dockerfile +++ b/docker/base/efa/Dockerfile @@ -1,4 +1,6 @@ -FROM nvidia/cuda:12.1.1-devel-ubuntu20.04 +# syntax = edrevo/dockerfile-plus + +INCLUDE+ base/Dockerfile ENV NCCL_HOME=/usr/local ENV CUDA_PATH=/usr/local/cuda From 2891ccb9d81ef32a3ff9ceadefd7d07a63b4833d Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Thu, 12 Jun 2025 23:14:49 +0300 Subject: [PATCH 05/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 5c378e2bf0..579d8ddb5c 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -78,7 +78,7 @@ jobs: else TAG_SUFFIX="-devel" FLAVOR="devel" - FILE="base/Dockerfile" + FILE="base/devel/Dockerfile" fi docker buildx build \ --platform linux/amd64 \ From 5b0539f6e4901a60106f2081609783c22f4065ea Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 13 Jun 2025 11:23:25 +0300 Subject: [PATCH 06/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 3 --- docker/base/Dockerfile | 3 +-- docker/base/devel/Dockerfile | 26 ++++++++++++++++++-------- docker/base/efa/Dockerfile | 19 ++++++++++++------- 4 files changed, 31 insertions(+), 20 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 579d8ddb5c..84562a23b7 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -69,15 +69,12 @@ jobs: run: | if [ "${{ matrix.flavor }}" = "base" ]; then TAG_SUFFIX="" - FLAVOR="base" FILE="base/Dockerfile" elif [ "${{ matrix.flavor }}" = "efa" ]; then TAG_SUFFIX="-efa" - FLAVOR="devel" FILE="base/efa/Dockerfile" else TAG_SUFFIX="-devel" - FLAVOR="devel" FILE="base/devel/Dockerfile" fi docker buildx build \ diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 5a5228daba..6ecf15b48b 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -1,5 +1,4 @@ -ARG FLAVOR -FROM nvidia/cuda:12.1.1-${FLAVOR}-ubuntu20.04 +FROM nvidia/cuda:12.1.1-base-ubuntu20.04 ARG PYTHON ARG _UV_HOME="/opt/uv" diff --git a/docker/base/devel/Dockerfile b/docker/base/devel/Dockerfile index 7d1a839b91..7d8b1b875a 100644 --- a/docker/base/devel/Dockerfile +++ b/docker/base/devel/Dockerfile @@ -9,20 +9,30 @@ ARG NCCL_VERSION=2.26.2-1 ENV NCCL_HOME=/usr/local ENV CUDA_PATH=/usr/local/cuda ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi +ENV NCCL_TESTS_PATH=/opt/nccl-tests +ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${NCCL_TESTS_PATH}:${PATH}" ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" -ENV PATH="${OPEN_MPI_PATH}/bin:${HOME}/nccl-tests/build:${PATH}" -RUN apt-get install -y --no-install-recommends \ - libopenmpi-dev \ - && cd $HOME \ +RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool \ + libopenmpi-dev + +RUN cd $HOME \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ && cd nccl \ && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ - && cd $HOME \ - && git clone https://github.com/NVIDIA/nccl-tests \ - && cd nccl-tests \ + && git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_PATH} \ + && cd ${NCCL_TESTS_PATH} \ && make -j$(nproc) \ MPI=1 \ MPI_HOME=${OPEN_MPI_PATH} \ CUDA_HOME=${CUDA_PATH} \ - NCCL_HOME=${NCCL_HOME} + NCCL_HOME=${NCCL_HOME} \ + && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ + && ldconfig diff --git a/docker/base/efa/Dockerfile b/docker/base/efa/Dockerfile index 9abfb0302e..67ceae5db6 100644 --- a/docker/base/efa/Dockerfile +++ b/docker/base/efa/Dockerfile @@ -6,13 +6,16 @@ ENV NCCL_HOME=/usr/local ENV CUDA_PATH=/usr/local/cuda ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi -ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${HOME}/nccl-tests/build:${PATH}" -ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" +ENV NCCL_TESTS_PATH=/opt/nccl-tests +ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${NCCL_TESTS_PATH}:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" # Prerequisites -RUN apt-get update \ +RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ libhwloc-dev \ autoconf \ automake \ @@ -58,11 +61,13 @@ RUN cd $HOME \ # NCCL Tests -RUN cd $HOME \ - && git clone https://github.com/NVIDIA/nccl-tests \ - && cd nccl-tests \ +RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_PATH} \ + && cd ${NCCL_TESTS_PATH} \ && make -j$(numproc) \ MPI=1 \ MPI_HOME=${OPEN_MPI_PATH} \ CUDA_HOME=${CUDA_PATH} \ - NCCL_HOME=${NCCL_HOME} + NCCL_HOME=${NCCL_HOME} \ + && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ + && ldconfig + From a4e6d29948e38bd29c44b8d367cd023bb734f23f Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 13 Jun 2025 15:23:15 +0300 Subject: [PATCH 07/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 19 +++------ .../Dockerfile => default/Dockerfile.common} | 0 docker/{base => default}/README.md | 0 docker/{base => default}/efa/Dockerfile | 39 +++++++++++++------ .../{base/devel => default/nvidia}/Dockerfile | 32 +++++++++++---- 5 files changed, 57 insertions(+), 33 deletions(-) rename docker/{base/Dockerfile => default/Dockerfile.common} (100%) rename docker/{base => default}/README.md (100%) rename docker/{base => default}/efa/Dockerfile (63%) rename docker/{base/devel => default/nvidia}/Dockerfile (57%) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 84562a23b7..b97a84d2e6 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -52,7 +52,8 @@ jobs: strategy: matrix: python: ["3.9", "3.10", "3.11", "3.12", "3.13"] - flavor: ["base", "devel", "efa"] + version: ["nvidia", "efa"] + flavor: ["base", "devel"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -67,24 +68,14 @@ jobs: uses: docker/setup-qemu-action@v3 - name: Build and upload to DockerHub run: | - if [ "${{ matrix.flavor }}" = "base" ]; then - TAG_SUFFIX="" - FILE="base/Dockerfile" - elif [ "${{ matrix.flavor }}" = "efa" ]; then - TAG_SUFFIX="-efa" - FILE="base/efa/Dockerfile" - else - TAG_SUFFIX="-devel" - FILE="base/devel/Dockerfile" - fi docker buildx build \ --platform linux/amd64 \ - --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:py${{ matrix.python }}-${{ inputs.image_version }}-cuda-12.1${TAG_SUFFIX} \ - --build-arg FLAVOR=$FLAVOR \ + --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:py${{ matrix.python }}-${{ inputs.image_version }}-${{ matrix.version }}-${{ matrix.flavor }} \ + --build-arg FLAVOR=${{ matrix.flavor }} \ --build-arg PYTHON=${{ matrix.python }} \ --provenance=false \ --push \ - -f $FILE . + -f default/${{ matrix.version }}/Dockerfile . build-aws-images: needs: build-docker diff --git a/docker/base/Dockerfile b/docker/default/Dockerfile.common similarity index 100% rename from docker/base/Dockerfile rename to docker/default/Dockerfile.common diff --git a/docker/base/README.md b/docker/default/README.md similarity index 100% rename from docker/base/README.md rename to docker/default/README.md diff --git a/docker/base/efa/Dockerfile b/docker/default/efa/Dockerfile similarity index 63% rename from docker/base/efa/Dockerfile rename to docker/default/efa/Dockerfile index 67ceae5db6..ad9a317c61 100644 --- a/docker/base/efa/Dockerfile +++ b/docker/default/efa/Dockerfile @@ -3,12 +3,12 @@ INCLUDE+ base/Dockerfile ENV NCCL_HOME=/usr/local -ENV CUDA_PATH=/usr/local/cuda +ENV CUDA_HOME=/usr/local/cuda ENV LIBFABRIC_PATH=/opt/amazon/efa -ENV OPEN_MPI_PATH=/opt/amazon/openmpi -ENV NCCL_TESTS_PATH=/opt/nccl-tests -ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${NCCL_TESTS_PATH}:${PATH}" -ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" +ENV MPI_HOME=/opt/amazon/openmpi +ENV NCCL_TESTS_HOME=/opt/nccl-tests +ENV PATH="${LIBFABRIC_PATH}/bin:${MPI_HOME}/bin:${NCCL_TESTS_HOME}/build:${PATH}" +ENV LD_LIBRARY_PATH="${MPI_HOME}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" # Prerequisites @@ -49,10 +49,9 @@ RUN cd $HOME \ && cd aws-ofi-nccl \ && ./autogen.sh \ && ./configure \ - --with-cuda=${CUDA_PATH} \ + --with-cuda=${CUDA_HOME} \ --with-libfabric=${LIBFABRIC_PATH} \ - --with-mpi=${OPEN_MPI_PATH} \ - --with-cuda=${CUDA_PATH} \ + --with-mpi=${MPI_HOME} \ --with-nccl=${NCCL_HOME} \ --disable-tests \ --prefix=${NCCL_HOME} \ @@ -61,13 +60,29 @@ RUN cd $HOME \ # NCCL Tests -RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_PATH} \ - && cd ${NCCL_TESTS_PATH} \ +RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ + && cd ${NCCL_TESTS_HOME} \ && make -j$(numproc) \ MPI=1 \ - MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_PATH} \ + MPI_HOME=${MPI_HOME} \ + CUDA_HOME=${CUDA_HOME} \ NCCL_HOME=${NCCL_HOME} \ && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ && ldconfig +ARG FLAVOR +ENV FLAVOR=${FLAVOR} + +# If FLAVOR is base, uninstall development packages to reduce image size +RUN if [ "$FLAVOR" = "base" ]; then \ + cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get remove -y \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ +fi diff --git a/docker/base/devel/Dockerfile b/docker/default/nvidia/Dockerfile similarity index 57% rename from docker/base/devel/Dockerfile rename to docker/default/nvidia/Dockerfile index 7d8b1b875a..1ce4d69660 100644 --- a/docker/base/devel/Dockerfile +++ b/docker/default/nvidia/Dockerfile @@ -1,16 +1,16 @@ # syntax = edrevo/dockerfile-plus -INCLUDE+ base/Dockerfile +INCLUDE+ default/Dockerfile.common # NCCL & NCCL tests ARG NCCL_VERSION=2.26.2-1 ENV NCCL_HOME=/usr/local -ENV CUDA_PATH=/usr/local/cuda +ENV CUDA_HOME=/usr/local/cuda ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi -ENV NCCL_TESTS_PATH=/opt/nccl-tests -ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${NCCL_TESTS_PATH}:${PATH}" +ENV NCCL_TESTS_HOME=/opt/nccl-tests +ENV PATH="${NCCL_TESTS_HOME}/build:${PATH}" ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ @@ -27,12 +27,30 @@ RUN cd $HOME \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ && cd nccl \ && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ - && git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_PATH} \ - && cd ${NCCL_TESTS_PATH} \ + && git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ + && cd ${NCCL_TESTS_HOME} \ && make -j$(nproc) \ MPI=1 \ MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_PATH} \ + CUDA_HOME=${CUDA_HOME} \ NCCL_HOME=${NCCL_HOME} \ && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ && ldconfig + +ARG FLAVOR +ENV FLAVOR=${FLAVOR} + +# If FLAVOR is base, uninstall development packages to reduce image size +RUN if [ "$FLAVOR" = "base" ]; then \ + cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get remove -y \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool \ + libopenmpi-dev \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ +fi From b102ddba65219240574b86395ed3db07de50f1dd Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 13 Jun 2025 15:29:21 +0300 Subject: [PATCH 08/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 2 +- docker/base/Dockerfile.common | 28 +++++++++++ docker/base/README.md | 1 + docker/base/efa/Dockerfile | 88 +++++++++++++++++++++++++++++++++++ docker/base/nvidia/Dockerfile | 56 ++++++++++++++++++++++ 5 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 docker/base/Dockerfile.common create mode 100644 docker/base/README.md create mode 100644 docker/base/efa/Dockerfile create mode 100644 docker/base/nvidia/Dockerfile diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index b97a84d2e6..84301752f9 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -75,7 +75,7 @@ jobs: --build-arg PYTHON=${{ matrix.python }} \ --provenance=false \ --push \ - -f default/${{ matrix.version }}/Dockerfile . + -f base/${{ matrix.version }}/Dockerfile . build-aws-images: needs: build-docker diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common new file mode 100644 index 0000000000..6ecf15b48b --- /dev/null +++ b/docker/base/Dockerfile.common @@ -0,0 +1,28 @@ +FROM nvidia/cuda:12.1.1-base-ubuntu20.04 + +ARG PYTHON +ARG _UV_HOME="/opt/uv" + +ENV UV_PYTHON="${PYTHON}" +ENV UV_INSTALL_DIR="${_UV_HOME}/bin" +ENV UV_PYTHON_INSTALL_DIR="${_UV_HOME}/python" +ENV UV_PYTHON_BIN_DIR="${UV_PYTHON_INSTALL_DIR}/bin" +ENV UV_MANAGED_PYTHON=1 +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 + +ENV PATH="${UV_INSTALL_DIR}:${UV_PYTHON_BIN_DIR}:${PATH}" + +RUN export DEBIAN_FRONTEND=noninteractive && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ + apt-get update --fix-missing && \ + apt-get upgrade -y && \ + ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \ + apt-get install -y tzdata && \ + dpkg-reconfigure --frontend noninteractive tzdata && \ + apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \ + libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags && \ + sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config && mkdir /run/sshd && \ + mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys && rm /etc/ssh/ssh_host_* + +RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh && \ + uv python install --preview --default diff --git a/docker/base/README.md b/docker/base/README.md new file mode 100644 index 0000000000..3bbbafa732 --- /dev/null +++ b/docker/base/README.md @@ -0,0 +1 @@ +Image for `dstack` runner instances. diff --git a/docker/base/efa/Dockerfile b/docker/base/efa/Dockerfile new file mode 100644 index 0000000000..ad9a317c61 --- /dev/null +++ b/docker/base/efa/Dockerfile @@ -0,0 +1,88 @@ +# syntax = edrevo/dockerfile-plus + +INCLUDE+ base/Dockerfile + +ENV NCCL_HOME=/usr/local +ENV CUDA_HOME=/usr/local/cuda +ENV LIBFABRIC_PATH=/opt/amazon/efa +ENV MPI_HOME=/opt/amazon/openmpi +ENV NCCL_TESTS_HOME=/opt/nccl-tests +ENV PATH="${LIBFABRIC_PATH}/bin:${MPI_HOME}/bin:${NCCL_TESTS_HOME}/build:${PATH}" +ENV LD_LIBRARY_PATH="${MPI_HOME}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" + +# Prerequisites + +RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool + +# EFA + +ARG EFA_VERSION=1.38.1 + +RUN cd $HOME \ + && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y --skip-kmod -g + +# NCCL + +ARG NCCL_VERSION=2.26.2-1 + +RUN cd $HOME \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} + +# AWS OFI NCCL + +ARG OFI_VERSION=1.14.0 + +RUN cd $HOME \ + && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ + && cd aws-ofi-nccl \ + && ./autogen.sh \ + && ./configure \ + --with-cuda=${CUDA_HOME} \ + --with-libfabric=${LIBFABRIC_PATH} \ + --with-mpi=${MPI_HOME} \ + --with-nccl=${NCCL_HOME} \ + --disable-tests \ + --prefix=${NCCL_HOME} \ + && make -j$(numproc) \ + && make install + +# NCCL Tests + +RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ + && cd ${NCCL_TESTS_HOME} \ + && make -j$(numproc) \ + MPI=1 \ + MPI_HOME=${MPI_HOME} \ + CUDA_HOME=${CUDA_HOME} \ + NCCL_HOME=${NCCL_HOME} \ + && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ + && ldconfig + +ARG FLAVOR +ENV FLAVOR=${FLAVOR} + +# If FLAVOR is base, uninstall development packages to reduce image size +RUN if [ "$FLAVOR" = "base" ]; then \ + cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get remove -y \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ +fi diff --git a/docker/base/nvidia/Dockerfile b/docker/base/nvidia/Dockerfile new file mode 100644 index 0000000000..5cb937ce71 --- /dev/null +++ b/docker/base/nvidia/Dockerfile @@ -0,0 +1,56 @@ +# syntax = edrevo/dockerfile-plus + +INCLUDE+ base/Dockerfile.common + +# NCCL & NCCL tests + +ARG NCCL_VERSION=2.26.2-1 + +ENV NCCL_HOME=/usr/local +ENV CUDA_HOME=/usr/local/cuda +ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi +ENV NCCL_TESTS_HOME=/opt/nccl-tests +ENV PATH="${NCCL_TESTS_HOME}/build:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" + +RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool \ + libopenmpi-dev + +RUN cd $HOME \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ + && git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ + && cd ${NCCL_TESTS_HOME} \ + && make -j$(nproc) \ + MPI=1 \ + MPI_HOME=${OPEN_MPI_PATH} \ + CUDA_HOME=${CUDA_HOME} \ + NCCL_HOME=${NCCL_HOME} \ + && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ + && ldconfig + +ARG FLAVOR +ENV FLAVOR=${FLAVOR} + +# If FLAVOR is base, uninstall development packages to reduce image size +RUN if [ "$FLAVOR" = "base" ]; then \ + cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get remove -y \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool \ + libopenmpi-dev \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ +fi From ee05d464d02bb41d8ced50e42b815be4c8dc3916 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 13 Jun 2025 15:34:58 +0300 Subject: [PATCH 09/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/efa/Dockerfile | 2 +- docker/default/Dockerfile.common | 28 ---------- docker/default/README.md | 1 - docker/default/efa/Dockerfile | 88 -------------------------------- docker/default/nvidia/Dockerfile | 56 -------------------- 5 files changed, 1 insertion(+), 174 deletions(-) delete mode 100644 docker/default/Dockerfile.common delete mode 100644 docker/default/README.md delete mode 100644 docker/default/efa/Dockerfile delete mode 100644 docker/default/nvidia/Dockerfile diff --git a/docker/base/efa/Dockerfile b/docker/base/efa/Dockerfile index ad9a317c61..6edfdfb15f 100644 --- a/docker/base/efa/Dockerfile +++ b/docker/base/efa/Dockerfile @@ -1,6 +1,6 @@ # syntax = edrevo/dockerfile-plus -INCLUDE+ base/Dockerfile +INCLUDE+ base/Dockerfile.common ENV NCCL_HOME=/usr/local ENV CUDA_HOME=/usr/local/cuda diff --git a/docker/default/Dockerfile.common b/docker/default/Dockerfile.common deleted file mode 100644 index 6ecf15b48b..0000000000 --- a/docker/default/Dockerfile.common +++ /dev/null @@ -1,28 +0,0 @@ -FROM nvidia/cuda:12.1.1-base-ubuntu20.04 - -ARG PYTHON -ARG _UV_HOME="/opt/uv" - -ENV UV_PYTHON="${PYTHON}" -ENV UV_INSTALL_DIR="${_UV_HOME}/bin" -ENV UV_PYTHON_INSTALL_DIR="${_UV_HOME}/python" -ENV UV_PYTHON_BIN_DIR="${UV_PYTHON_INSTALL_DIR}/bin" -ENV UV_MANAGED_PYTHON=1 -ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 - -ENV PATH="${UV_INSTALL_DIR}:${UV_PYTHON_BIN_DIR}:${PATH}" - -RUN export DEBIAN_FRONTEND=noninteractive && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ - apt-get update --fix-missing && \ - apt-get upgrade -y && \ - ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \ - apt-get install -y tzdata && \ - dpkg-reconfigure --frontend noninteractive tzdata && \ - apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \ - libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags && \ - sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config && mkdir /run/sshd && \ - mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys && rm /etc/ssh/ssh_host_* - -RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh && \ - uv python install --preview --default diff --git a/docker/default/README.md b/docker/default/README.md deleted file mode 100644 index 3bbbafa732..0000000000 --- a/docker/default/README.md +++ /dev/null @@ -1 +0,0 @@ -Image for `dstack` runner instances. diff --git a/docker/default/efa/Dockerfile b/docker/default/efa/Dockerfile deleted file mode 100644 index ad9a317c61..0000000000 --- a/docker/default/efa/Dockerfile +++ /dev/null @@ -1,88 +0,0 @@ -# syntax = edrevo/dockerfile-plus - -INCLUDE+ base/Dockerfile - -ENV NCCL_HOME=/usr/local -ENV CUDA_HOME=/usr/local/cuda -ENV LIBFABRIC_PATH=/opt/amazon/efa -ENV MPI_HOME=/opt/amazon/openmpi -ENV NCCL_TESTS_HOME=/opt/nccl-tests -ENV PATH="${LIBFABRIC_PATH}/bin:${MPI_HOME}/bin:${NCCL_TESTS_HOME}/build:${PATH}" -ENV LD_LIBRARY_PATH="${MPI_HOME}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" - -# Prerequisites - -RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-${cuda_version} \ - cuda-nvcc-${cuda_version} \ - libhwloc-dev \ - autoconf \ - automake \ - libtool - -# EFA - -ARG EFA_VERSION=1.38.1 - -RUN cd $HOME \ - && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ - && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g - -# NCCL - -ARG NCCL_VERSION=2.26.2-1 - -RUN cd $HOME \ - && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ - && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} - -# AWS OFI NCCL - -ARG OFI_VERSION=1.14.0 - -RUN cd $HOME \ - && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ - && cd aws-ofi-nccl \ - && ./autogen.sh \ - && ./configure \ - --with-cuda=${CUDA_HOME} \ - --with-libfabric=${LIBFABRIC_PATH} \ - --with-mpi=${MPI_HOME} \ - --with-nccl=${NCCL_HOME} \ - --disable-tests \ - --prefix=${NCCL_HOME} \ - && make -j$(numproc) \ - && make install - -# NCCL Tests - -RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ - && cd ${NCCL_TESTS_HOME} \ - && make -j$(numproc) \ - MPI=1 \ - MPI_HOME=${MPI_HOME} \ - CUDA_HOME=${CUDA_HOME} \ - NCCL_HOME=${NCCL_HOME} \ - && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ - && ldconfig - -ARG FLAVOR -ENV FLAVOR=${FLAVOR} - -# If FLAVOR is base, uninstall development packages to reduce image size -RUN if [ "$FLAVOR" = "base" ]; then \ - cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get remove -y \ - cuda-nvcc-${cuda_version} \ - libhwloc-dev \ - autoconf \ - automake \ - libtool \ - && apt-get autoremove -y \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/*; \ -fi diff --git a/docker/default/nvidia/Dockerfile b/docker/default/nvidia/Dockerfile deleted file mode 100644 index 1ce4d69660..0000000000 --- a/docker/default/nvidia/Dockerfile +++ /dev/null @@ -1,56 +0,0 @@ -# syntax = edrevo/dockerfile-plus - -INCLUDE+ default/Dockerfile.common - -# NCCL & NCCL tests - -ARG NCCL_VERSION=2.26.2-1 - -ENV NCCL_HOME=/usr/local -ENV CUDA_HOME=/usr/local/cuda -ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi -ENV NCCL_TESTS_HOME=/opt/nccl-tests -ENV PATH="${NCCL_TESTS_HOME}/build:${PATH}" -ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" - -RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-${cuda_version} \ - cuda-nvcc-${cuda_version} \ - libhwloc-dev \ - autoconf \ - automake \ - libtool \ - libopenmpi-dev - -RUN cd $HOME \ - && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ - && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ - && git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ - && cd ${NCCL_TESTS_HOME} \ - && make -j$(nproc) \ - MPI=1 \ - MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_HOME} \ - NCCL_HOME=${NCCL_HOME} \ - && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ - && ldconfig - -ARG FLAVOR -ENV FLAVOR=${FLAVOR} - -# If FLAVOR is base, uninstall development packages to reduce image size -RUN if [ "$FLAVOR" = "base" ]; then \ - cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get remove -y \ - cuda-nvcc-${cuda_version} \ - libhwloc-dev \ - autoconf \ - automake \ - libtool \ - libopenmpi-dev \ - && apt-get autoremove -y \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/*; \ -fi From eac604a8409494b72609d39a1e591d9a6dc26ba4 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 13 Jun 2025 16:09:28 +0300 Subject: [PATCH 10/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 4 +--- docker/base/Dockerfile.common | 6 +----- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 84301752f9..dc32896979 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -51,7 +51,6 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python: ["3.9", "3.10", "3.11", "3.12", "3.13"] version: ["nvidia", "efa"] flavor: ["base", "devel"] steps: @@ -70,9 +69,8 @@ jobs: run: | docker buildx build \ --platform linux/amd64 \ - --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:py${{ matrix.python }}-${{ inputs.image_version }}-${{ matrix.version }}-${{ matrix.flavor }} \ + --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ matrix.version }}-${{ matrix.flavor }}-${{ inputs.image_version }} \ --build-arg FLAVOR=${{ matrix.flavor }} \ - --build-arg PYTHON=${{ matrix.python }} \ --provenance=false \ --push \ -f base/${{ matrix.version }}/Dockerfile . diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common index 6ecf15b48b..71ebb7c021 100644 --- a/docker/base/Dockerfile.common +++ b/docker/base/Dockerfile.common @@ -1,16 +1,12 @@ FROM nvidia/cuda:12.1.1-base-ubuntu20.04 -ARG PYTHON ARG _UV_HOME="/opt/uv" -ENV UV_PYTHON="${PYTHON}" ENV UV_INSTALL_DIR="${_UV_HOME}/bin" -ENV UV_PYTHON_INSTALL_DIR="${_UV_HOME}/python" -ENV UV_PYTHON_BIN_DIR="${UV_PYTHON_INSTALL_DIR}/bin" ENV UV_MANAGED_PYTHON=1 ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 -ENV PATH="${UV_INSTALL_DIR}:${UV_PYTHON_BIN_DIR}:${PATH}" +ENV PATH="${UV_INSTALL_DIR}:${PATH}" RUN export DEBIAN_FRONTEND=noninteractive && \ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ From 0172ab7e5dfb700dacb598e68cb4f1c7fb8a1ec8 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 13 Jun 2025 18:25:18 +0300 Subject: [PATCH 11/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/efa/Dockerfile | 69 ++++++++++++----------------------- docker/base/nvidia/Dockerfile | 37 ++++++++----------- 2 files changed, 39 insertions(+), 67 deletions(-) diff --git a/docker/base/efa/Dockerfile b/docker/base/efa/Dockerfile index 6edfdfb15f..c03cd6727b 100644 --- a/docker/base/efa/Dockerfile +++ b/docker/base/efa/Dockerfile @@ -10,7 +10,10 @@ ENV NCCL_TESTS_HOME=/opt/nccl-tests ENV PATH="${LIBFABRIC_PATH}/bin:${MPI_HOME}/bin:${NCCL_TESTS_HOME}/build:${PATH}" ENV LD_LIBRARY_PATH="${MPI_HOME}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" -# Prerequisites +ARG EFA_VERSION=1.38.1 +ARG NCCL_VERSION=2.26.2-1 +ARG OFI_VERSION=1.14.0 +ARG FLAVOR RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ && apt-get install -y --no-install-recommends \ @@ -19,32 +22,17 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ libhwloc-dev \ autoconf \ automake \ - libtool - -# EFA - -ARG EFA_VERSION=1.38.1 - -RUN cd $HOME \ + libtool \ + && cd $HOME \ && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g - -# NCCL - -ARG NCCL_VERSION=2.26.2-1 - -RUN cd $HOME \ + && ./efa_installer.sh -y --skip-kmod -g \ + && cd $HOME \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} - -# AWS OFI NCCL - -ARG OFI_VERSION=1.14.0 - -RUN cd $HOME \ + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ + && cd $HOME \ && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ && cd aws-ofi-nccl \ && ./autogen.sh \ @@ -56,11 +44,8 @@ RUN cd $HOME \ --disable-tests \ --prefix=${NCCL_HOME} \ && make -j$(numproc) \ - && make install - -# NCCL Tests - -RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ + && make install \ + && git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ && cd ${NCCL_TESTS_HOME} \ && make -j$(numproc) \ MPI=1 \ @@ -68,21 +53,15 @@ RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ CUDA_HOME=${CUDA_HOME} \ NCCL_HOME=${NCCL_HOME} \ && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ - && ldconfig - -ARG FLAVOR -ENV FLAVOR=${FLAVOR} - -# If FLAVOR is base, uninstall development packages to reduce image size -RUN if [ "$FLAVOR" = "base" ]; then \ - cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get remove -y \ - cuda-nvcc-${cuda_version} \ - libhwloc-dev \ - autoconf \ - automake \ - libtool \ - && apt-get autoremove -y \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/*; \ -fi + && ldconfig \ + && if [ "$FLAVOR" = "base" ]; then \ + apt-get remove -y \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi diff --git a/docker/base/nvidia/Dockerfile b/docker/base/nvidia/Dockerfile index 5cb937ce71..967ca9be31 100644 --- a/docker/base/nvidia/Dockerfile +++ b/docker/base/nvidia/Dockerfile @@ -5,6 +5,7 @@ INCLUDE+ base/Dockerfile.common # NCCL & NCCL tests ARG NCCL_VERSION=2.26.2-1 +ARG FLAVOR ENV NCCL_HOME=/usr/local ENV CUDA_HOME=/usr/local/cuda @@ -21,9 +22,8 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ autoconf \ automake \ libtool \ - libopenmpi-dev - -RUN cd $HOME \ + libopenmpi-dev \ + && cd $HOME \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ && cd nccl \ && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ @@ -35,22 +35,15 @@ RUN cd $HOME \ CUDA_HOME=${CUDA_HOME} \ NCCL_HOME=${NCCL_HOME} \ && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ - && ldconfig - -ARG FLAVOR -ENV FLAVOR=${FLAVOR} - -# If FLAVOR is base, uninstall development packages to reduce image size -RUN if [ "$FLAVOR" = "base" ]; then \ - cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get remove -y \ - cuda-nvcc-${cuda_version} \ - libhwloc-dev \ - autoconf \ - automake \ - libtool \ - libopenmpi-dev \ - && apt-get autoremove -y \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/*; \ -fi + && ldconfig \ + && if [ "$FLAVOR" = "base" ]; then \ + apt-get remove -y \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi From 8418ad7a617e1856a8a42a44dcf20eb5e6dd38a8 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Fri, 13 Jun 2025 18:51:24 +0000 Subject: [PATCH 12/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/nvidia/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/base/nvidia/Dockerfile b/docker/base/nvidia/Dockerfile index 967ca9be31..ca22ec9659 100644 --- a/docker/base/nvidia/Dockerfile +++ b/docker/base/nvidia/Dockerfile @@ -22,6 +22,7 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ autoconf \ automake \ libtool \ + openmpi-bin \ libopenmpi-dev \ && cd $HOME \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ From b5710a01239d9557c44afe12a1ad11ab8e3c7f74 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 13 Jun 2025 23:18:54 +0300 Subject: [PATCH 13/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 17 +++-- docker/base/Dockerfile | 72 +++++++++++++++++++ .../base/{efa/Dockerfile => efa.Dockerfile} | 13 +--- docker/base/nvidia/Dockerfile | 50 ------------- 4 files changed, 86 insertions(+), 66 deletions(-) create mode 100644 docker/base/Dockerfile rename docker/base/{efa/Dockerfile => efa.Dockerfile} (85%) delete mode 100644 docker/base/nvidia/Dockerfile diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index dc32896979..8597b7620e 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -51,8 +51,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - version: ["nvidia", "efa"] - flavor: ["base", "devel"] + flavor: ["base", "devel", "devel-efa"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -67,13 +66,23 @@ jobs: uses: docker/setup-qemu-action@v3 - name: Build and upload to DockerHub run: | + if [ "${{ matrix.flavor }}" = "base" ]; then + FLAVOR="base" + FILE="base/Dockerfile" + elif [ "${{ matrix.flavor }}" = "devel" ]; then + FLAVOR="devel" + FILE="base/Dockerfile" + else + FLAVOR="devel-efa" + FILE="base/efa.Dockerfile" + fi docker buildx build \ --platform linux/amd64 \ - --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ matrix.version }}-${{ matrix.flavor }}-${{ inputs.image_version }} \ + --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ matrix.version }}-${{ matrix.flavor }} \ --build-arg FLAVOR=${{ matrix.flavor }} \ --provenance=false \ --push \ - -f base/${{ matrix.version }}/Dockerfile . + -f $FILE . build-aws-images: needs: build-docker diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile new file mode 100644 index 0000000000..999016cdc5 --- /dev/null +++ b/docker/base/Dockerfile @@ -0,0 +1,72 @@ +# syntax = edrevo/dockerfile-plus + +# Build stage +FROM nvidia/cuda:12.1.1-base-ubuntu20.04 AS builder + +ARG NCCL_VERSION=2.26.2-1 + +ENV NCCL_HOME=/opt/nccl +ENV CUDA_HOME=/usr/local/cuda +ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi +ENV NCCL_TESTS_HOME=/opt/nccl-tests + +# Install build dependencies +RUN export DEBIAN_FRONTEND=noninteractive \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \ + && apt-get update --fix-missing \ + && apt-get upgrade -y \ + && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ + && apt-get install -y tzdata \ + && dpkg-reconfigure --frontend noninteractive tzdata \ + && cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool \ + libopenmpi-dev \ + git \ + curl \ + python3 \ + build-essential + +# Build NCCL +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} + +# Build NCCL tests +RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ + && cd ${NCCL_TESTS_HOME} \ + && make -j$(nproc) \ + MPI=1 \ + MPI_HOME=${OPEN_MPI_PATH} \ + CUDA_HOME=${CUDA_HOME} \ + NCCL_HOME=${NCCL_HOME} + +# Final stage +INCLUDE+ base/Dockerfile.common + +ENV NCCL_HOME=/opt/nccl +ENV NCCL_TESTS_HOME=/opt/nccl-tests + +COPY --from=builder ${NCCL_HOME}/lib ${NCCL_HOME}/lib +COPY --from=builder ${NCCL_HOME}/include ${NCCL_HOME}/include +COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME} + +ARG FLAVOR + +# Configure library paths +RUN apt-get install -y --no-install-recommends openmpi-bin \ + && if [ "$FLAVOR" = "devel" ]; then \ + cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev; \ + fi \ + && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ + && ldconfig diff --git a/docker/base/efa/Dockerfile b/docker/base/efa.Dockerfile similarity index 85% rename from docker/base/efa/Dockerfile rename to docker/base/efa.Dockerfile index c03cd6727b..de2d78d3c4 100644 --- a/docker/base/efa/Dockerfile +++ b/docker/base/efa.Dockerfile @@ -53,15 +53,4 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ CUDA_HOME=${CUDA_HOME} \ NCCL_HOME=${NCCL_HOME} \ && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ - && ldconfig \ - && if [ "$FLAVOR" = "base" ]; then \ - apt-get remove -y \ - cuda-nvcc-${cuda_version} \ - libhwloc-dev \ - autoconf \ - automake \ - libtool \ - && apt-get autoremove -y \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/*; \ - fi + && ldconfig diff --git a/docker/base/nvidia/Dockerfile b/docker/base/nvidia/Dockerfile deleted file mode 100644 index ca22ec9659..0000000000 --- a/docker/base/nvidia/Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -# syntax = edrevo/dockerfile-plus - -INCLUDE+ base/Dockerfile.common - -# NCCL & NCCL tests - -ARG NCCL_VERSION=2.26.2-1 -ARG FLAVOR - -ENV NCCL_HOME=/usr/local -ENV CUDA_HOME=/usr/local/cuda -ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi -ENV NCCL_TESTS_HOME=/opt/nccl-tests -ENV PATH="${NCCL_TESTS_HOME}/build:${PATH}" -ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" - -RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-${cuda_version} \ - cuda-nvcc-${cuda_version} \ - libhwloc-dev \ - autoconf \ - automake \ - libtool \ - openmpi-bin \ - libopenmpi-dev \ - && cd $HOME \ - && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ - && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ - && git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ - && cd ${NCCL_TESTS_HOME} \ - && make -j$(nproc) \ - MPI=1 \ - MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_HOME} \ - NCCL_HOME=${NCCL_HOME} \ - && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ - && ldconfig \ - && if [ "$FLAVOR" = "base" ]; then \ - apt-get remove -y \ - cuda-nvcc-${cuda_version} \ - libhwloc-dev \ - autoconf \ - automake \ - libtool \ - && apt-get autoremove -y \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/*; \ - fi From f46afe8ed26f720468032b99cffc7e13ffe6c755 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 13 Jun 2025 23:23:47 +0300 Subject: [PATCH 14/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 8597b7620e..4979815409 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -78,7 +78,7 @@ jobs: fi docker buildx build \ --platform linux/amd64 \ - --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ matrix.version }}-${{ matrix.flavor }} \ + --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ matrix.flavor }}:${{ inputs.image_version }} \ --build-arg FLAVOR=${{ matrix.flavor }} \ --provenance=false \ --push \ From 59edf646ac126fa48d5116773b267a33f43ff87f Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 13 Jun 2025 23:31:32 +0300 Subject: [PATCH 15/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 4979815409..02f042461a 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -78,7 +78,7 @@ jobs: fi docker buildx build \ --platform linux/amd64 \ - --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ matrix.flavor }}:${{ inputs.image_version }} \ + --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ matrix.flavor }}-${{ inputs.image_version }} \ --build-arg FLAVOR=${{ matrix.flavor }} \ --provenance=false \ --push \ From 76ed3bca67b405ab4e519ec23b0b7f98c6928ed0 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 01:14:03 +0300 Subject: [PATCH 16/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 2 +- docker/base/Dockerfile | 1 + docker/base/Dockerfile.common | 30 +++++++++++++++++------------- docker/base/efa.Dockerfile | 15 ++++++++------- 4 files changed, 27 insertions(+), 21 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 02f042461a..b7086e2066 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -78,7 +78,7 @@ jobs: fi docker buildx build \ --platform linux/amd64 \ - --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ matrix.flavor }}-${{ inputs.image_version }} \ + --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ inputs.image_version }}-${{ matrix.flavor }} \ --build-arg FLAVOR=${{ matrix.flavor }} \ --provenance=false \ --push \ diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 999016cdc5..d6158dceb1 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -68,5 +68,6 @@ RUN apt-get install -y --no-install-recommends openmpi-bin \ cuda-nvcc-${cuda_version} \ libhwloc-dev; \ fi \ + && rm -rf /var/lib/apt/lists/* \ && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ && ldconfig diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common index 71ebb7c021..1ddd2e227c 100644 --- a/docker/base/Dockerfile.common +++ b/docker/base/Dockerfile.common @@ -8,17 +8,21 @@ ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH="${UV_INSTALL_DIR}:${PATH}" -RUN export DEBIAN_FRONTEND=noninteractive && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ - apt-get update --fix-missing && \ - apt-get upgrade -y && \ - ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \ - apt-get install -y tzdata && \ - dpkg-reconfigure --frontend noninteractive tzdata && \ - apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \ - libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags && \ - sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config && mkdir /run/sshd && \ - mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys && rm /etc/ssh/ssh_host_* +RUN export DEBIAN_FRONTEND=noninteractive \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \ + && apt-get update --fix-missing \ + && apt-get upgrade -y \ + && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ + && apt-get install -y tzdata \ + && dpkg-reconfigure --frontend noninteractive tzdata \ + && apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \ + libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags \ + && rm -rf /var/lib/apt/lists/* \ + && sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config \ + && mkdir /run/sshd \ + && mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \ + && chmod 600 ~/.ssh/authorized_keys \ + && rm /etc/ssh/ssh_host_* -RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh && \ - uv python install --preview --default +RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh \ + && uv python install --preview --default diff --git a/docker/base/efa.Dockerfile b/docker/base/efa.Dockerfile index de2d78d3c4..a0116d0b01 100644 --- a/docker/base/efa.Dockerfile +++ b/docker/base/efa.Dockerfile @@ -5,10 +5,9 @@ INCLUDE+ base/Dockerfile.common ENV NCCL_HOME=/usr/local ENV CUDA_HOME=/usr/local/cuda ENV LIBFABRIC_PATH=/opt/amazon/efa -ENV MPI_HOME=/opt/amazon/openmpi +ENV OPEN_MPI_PATH=/opt/amazon/openmpi ENV NCCL_TESTS_HOME=/opt/nccl-tests -ENV PATH="${LIBFABRIC_PATH}/bin:${MPI_HOME}/bin:${NCCL_TESTS_HOME}/build:${PATH}" -ENV LD_LIBRARY_PATH="${MPI_HOME}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}" +ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" ARG EFA_VERSION=1.38.1 ARG NCCL_VERSION=2.26.2-1 @@ -39,18 +38,20 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ && ./configure \ --with-cuda=${CUDA_HOME} \ --with-libfabric=${LIBFABRIC_PATH} \ - --with-mpi=${MPI_HOME} \ + --with-mpi=${OPEN_MPI_PATH} \ --with-nccl=${NCCL_HOME} \ --disable-tests \ --prefix=${NCCL_HOME} \ && make -j$(numproc) \ && make install \ - && git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ - && cd ${NCCL_TESTS_HOME} \ + && git clone https://github.com/NVIDIA/nccl-tests ${HOME}/nccl-tests \ + && cd ${HOME}/nccl-tests \ && make -j$(numproc) \ MPI=1 \ - MPI_HOME=${MPI_HOME} \ + MPI_HOME=${OPEN_MPI_PATH} \ CUDA_HOME=${CUDA_HOME} \ NCCL_HOME=${NCCL_HOME} \ + && ln -s ${HOME}/nccl-tests/build ${NCCL_TESTS_HOME} \ + && echo "${OPEN_MPI_PATH}/lib" >> /etc/ld.so.conf.d/openmpi.conf \ && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ && ldconfig From 0e36c85095f0cfe4e59ab821be010d333a3c1f80 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 01:47:33 +0300 Subject: [PATCH 17/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/Dockerfile | 3 ++- docker/base/efa.Dockerfile | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index d6158dceb1..ccc11d292d 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -60,7 +60,8 @@ COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME} ARG FLAVOR # Configure library paths -RUN apt-get install -y --no-install-recommends openmpi-bin \ +RUN apt-get update \ + && apt-get install -y --no-install-recommends openmpi-bin \ && if [ "$FLAVOR" = "devel" ]; then \ cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ && apt-get install -y --no-install-recommends \ diff --git a/docker/base/efa.Dockerfile b/docker/base/efa.Dockerfile index a0116d0b01..929b7ebfde 100644 --- a/docker/base/efa.Dockerfile +++ b/docker/base/efa.Dockerfile @@ -14,7 +14,8 @@ ARG NCCL_VERSION=2.26.2-1 ARG OFI_VERSION=1.14.0 ARG FLAVOR -RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ +RUN apt-get update \ + && cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ && apt-get install -y --no-install-recommends \ cuda-libraries-dev-${cuda_version} \ cuda-nvcc-${cuda_version} \ From 6225135acbdd577c71ade2cd3aac83cc3ba17148 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 02:34:45 +0300 Subject: [PATCH 18/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/efa.Dockerfile | 67 +++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/docker/base/efa.Dockerfile b/docker/base/efa.Dockerfile index 929b7ebfde..1f91bf618c 100644 --- a/docker/base/efa.Dockerfile +++ b/docker/base/efa.Dockerfile @@ -2,57 +2,70 @@ INCLUDE+ base/Dockerfile.common -ENV NCCL_HOME=/usr/local -ENV CUDA_HOME=/usr/local/cuda +ENV PREFIX=/usr/local +ENV CUDA_PATH=/usr/local/cuda ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi -ENV NCCL_TESTS_HOME=/opt/nccl-tests ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" -ARG EFA_VERSION=1.38.1 -ARG NCCL_VERSION=2.26.2-1 -ARG OFI_VERSION=1.14.0 -ARG FLAVOR +# prerequisites -RUN apt-get update \ - && cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ +RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get update \ && apt-get install -y --no-install-recommends \ cuda-libraries-dev-${cuda_version} \ cuda-nvcc-${cuda_version} \ libhwloc-dev \ autoconf \ automake \ - libtool \ - && cd $HOME \ + libtool + +# EFA + +ARG EFA_VERSION=1.38.1 + +RUN cd $HOME \ && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g \ - && cd $HOME \ + && ./efa_installer.sh -y --skip-kmod -g + +# NCCL + +ARG NCCL_VERSION=2.26.2-1 + +RUN cd $HOME \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ - && cd $HOME \ + && make -j$(nproc) src.build BUILDDIR=${PREFIX} + +# AWS OFI NCCL + +ARG OFI_VERSION=1.14.0 + +RUN cd $HOME \ && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ && cd aws-ofi-nccl \ && ./autogen.sh \ && ./configure \ - --with-cuda=${CUDA_HOME} \ + --with-cuda=${CUDA_PATH} \ --with-libfabric=${LIBFABRIC_PATH} \ --with-mpi=${OPEN_MPI_PATH} \ - --with-nccl=${NCCL_HOME} \ + --with-cuda=${CUDA_PATH} \ + --with-nccl=${PREFIX} \ --disable-tests \ - --prefix=${NCCL_HOME} \ + --prefix=${PREFIX} \ && make -j$(numproc) \ - && make install \ - && git clone https://github.com/NVIDIA/nccl-tests ${HOME}/nccl-tests \ - && cd ${HOME}/nccl-tests \ + && make install + +# NCCL Tests + +RUN cd $HOME \ + && git clone https://github.com/NVIDIA/nccl-tests \ + && cd nccl-tests \ && make -j$(numproc) \ MPI=1 \ MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_HOME} \ - NCCL_HOME=${NCCL_HOME} \ - && ln -s ${HOME}/nccl-tests/build ${NCCL_TESTS_HOME} \ - && echo "${OPEN_MPI_PATH}/lib" >> /etc/ld.so.conf.d/openmpi.conf \ - && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ - && ldconfig + CUDA_HOME=${CUDA_PATH} \ + NCCL_HOME=${PREFIX} From dec318f6135106b83f3821f712d8db9c541918a5 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 11:23:25 +0300 Subject: [PATCH 19/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 8 +-- docker/base/base-efa.Dockerfile | 96 +++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 4 deletions(-) create mode 100644 docker/base/base-efa.Dockerfile diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index b7086e2066..c4f30da70a 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -51,7 +51,8 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - flavor: ["base", "devel", "devel-efa"] + # flavor: ["base", "devel", "devel-efa"] + flavor: ["devel-efa"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -67,13 +68,12 @@ jobs: - name: Build and upload to DockerHub run: | if [ "${{ matrix.flavor }}" = "base" ]; then - FLAVOR="base" FILE="base/Dockerfile" elif [ "${{ matrix.flavor }}" = "devel" ]; then - FLAVOR="devel" FILE="base/Dockerfile" + elif [ "${{ matrix.flavor }}" = "base-efa" ]; then + FILE="base/base-efa.Dockerfile" else - FLAVOR="devel-efa" FILE="base/efa.Dockerfile" fi docker buildx build \ diff --git a/docker/base/base-efa.Dockerfile b/docker/base/base-efa.Dockerfile new file mode 100644 index 0000000000..30dfe8a093 --- /dev/null +++ b/docker/base/base-efa.Dockerfile @@ -0,0 +1,96 @@ +# syntax = edrevo/dockerfile-plus + +# Build stage +FROM nvidia/cuda:12.1.1-base-ubuntu20.04 AS builder + +ENV NCCL_HOME=/opt/nccl +ENV CUDA_PATH=/usr/local/cuda +ENV LIBFABRIC_PATH=/opt/amazon/efa +ENV OPEN_MPI_PATH=/opt/amazon/openmpi +ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" + +# prerequisites + +RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool + +# EFA + +ARG EFA_VERSION=1.38.1 + +RUN cd $HOME \ + && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y --skip-kmod -g + +# NCCL + +ARG NCCL_VERSION=2.26.2-1 + +# Build NCCL tests +RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ + && cd ${NCCL_TESTS_HOME} \ + && make -j$(nproc) \ + MPI=1 \ + MPI_HOME=${OPEN_MPI_PATH} \ + CUDA_HOME=${CUDA_HOME} \ + NCCL_HOME=${NCCL_HOME} + + +# AWS OFI NCCL + +ARG OFI_VERSION=1.14.0 + +RUN cd $HOME \ + && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ + && cd aws-ofi-nccl \ + && ./autogen.sh \ + && ./configure \ + --with-cuda=${CUDA_PATH} \ + --with-libfabric=${LIBFABRIC_PATH} \ + --with-mpi=${OPEN_MPI_PATH} \ + --with-nccl=${NCCL_HOME} \ + --disable-tests \ + --prefix=${NCCL_HOME} \ + && make -j$(numproc) \ + && make install + +# NCCL Tests + +RUN cd $HOME \ + && git clone https://github.com/NVIDIA/nccl-tests \ + && cd nccl-tests \ + && make -j$(numproc) \ + MPI=1 \ + MPI_HOME=${OPEN_MPI_PATH} \ + CUDA_HOME=${CUDA_PATH} \ + NCCL_HOME=${NCCL_HOME} + +# Final stage +INCLUDE+ base/Dockerfile.common + +ENV NCCL_HOME=/opt/nccl +ENV LIBFABRIC_PATH=/opt/amazon/efa +ENV OPEN_MPI_PATH=/opt/amazon/openmpi +ENV NCCL_TESTS_HOME=/opt/nccl-tests +ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" + +COPY --from=builder ${NCCL_HOME} ${NCCL_HOME} +COPY --from=builder ${LIBFABRIC_PATH} ${LIBFABRIC_PATH} +COPY --from=builder ${OPEN_MPI_PATH} ${OPEN_MPI_PATH} +COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME} + +RUN echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ + && echo "${OPEN_MPI_PATH}/lib" >> /etc/ld.so.conf.d/openmpi.conf \ + && echo "${LIBFABRIC_PATH}/lib" >> /etc/ld.so.conf.d/efa.conf \ + && ldconfig From 5a34a9c9b4f4a5800cbeb0959a1d971e7c4745fc Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 11:24:26 +0300 Subject: [PATCH 20/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index c4f30da70a..018bff32dd 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -52,7 +52,7 @@ jobs: strategy: matrix: # flavor: ["base", "devel", "devel-efa"] - flavor: ["devel-efa"] + flavor: ["base-efa"] steps: - name: Checkout repository uses: actions/checkout@v4 From 562266f6103e1cdd142731405feed876a0d41761 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 11:37:40 +0300 Subject: [PATCH 21/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/base-efa.Dockerfile | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/docker/base/base-efa.Dockerfile b/docker/base/base-efa.Dockerfile index 30dfe8a093..529411b4e5 100644 --- a/docker/base/base-efa.Dockerfile +++ b/docker/base/base-efa.Dockerfile @@ -10,17 +10,27 @@ ENV OPEN_MPI_PATH=/opt/amazon/openmpi ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" -# prerequisites - -RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get update \ +# Install build dependencies +RUN export DEBIAN_FRONTEND=noninteractive \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \ + && apt-get update --fix-missing \ + && apt-get upgrade -y \ + && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ + && apt-get install -y tzdata \ + && dpkg-reconfigure --frontend noninteractive tzdata \ + && cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ && apt-get install -y --no-install-recommends \ cuda-libraries-dev-${cuda_version} \ cuda-nvcc-${cuda_version} \ libhwloc-dev \ autoconf \ automake \ - libtool + libtool \ + libopenmpi-dev \ + git \ + curl \ + python3 \ + build-essential # EFA From 2fd1bdf1de850cc0b4d79d8d790e42620eab5cb5 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 11:48:27 +0300 Subject: [PATCH 22/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/base-efa.Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docker/base/base-efa.Dockerfile b/docker/base/base-efa.Dockerfile index 529411b4e5..0ce6619b6b 100644 --- a/docker/base/base-efa.Dockerfile +++ b/docker/base/base-efa.Dockerfile @@ -7,8 +7,7 @@ ENV NCCL_HOME=/opt/nccl ENV CUDA_PATH=/usr/local/cuda ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi -ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" -ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" +ENV NCCL_TESTS_HOME=/opt/nccl-tests # Install build dependencies RUN export DEBIAN_FRONTEND=noninteractive \ From dbb3545593c82a7d3ee57a0661d99c083f1e53eb Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 12:00:46 +0300 Subject: [PATCH 23/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/base-efa.Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/base/base-efa.Dockerfile b/docker/base/base-efa.Dockerfile index 0ce6619b6b..702b6c150b 100644 --- a/docker/base/base-efa.Dockerfile +++ b/docker/base/base-efa.Dockerfile @@ -8,6 +8,8 @@ ENV CUDA_PATH=/usr/local/cuda ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi ENV NCCL_TESTS_HOME=/opt/nccl-tests +ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" # Install build dependencies RUN export DEBIAN_FRONTEND=noninteractive \ From bcfc8567b35c044bc3c91d747a1e74df034883e5 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 12:24:26 +0300 Subject: [PATCH 24/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/base-efa.Dockerfile | 52 ++++++++++++--------------------- 1 file changed, 18 insertions(+), 34 deletions(-) diff --git a/docker/base/base-efa.Dockerfile b/docker/base/base-efa.Dockerfile index 702b6c150b..55af2239e3 100644 --- a/docker/base/base-efa.Dockerfile +++ b/docker/base/base-efa.Dockerfile @@ -3,13 +3,15 @@ # Build stage FROM nvidia/cuda:12.1.1-base-ubuntu20.04 AS builder +ARG NCCL_VERSION=2.26.2-1 +ARG EFA_VERSION=1.38.1 +ARG OFI_VERSION=1.14.0 + ENV NCCL_HOME=/opt/nccl -ENV CUDA_PATH=/usr/local/cuda +ENV CUDA_HOME=/usr/local/cuda ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi ENV NCCL_TESTS_HOME=/opt/nccl-tests -ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" -ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" # Install build dependencies RUN export DEBIAN_FRONTEND=noninteractive \ @@ -33,40 +35,18 @@ RUN export DEBIAN_FRONTEND=noninteractive \ python3 \ build-essential -# EFA - -ARG EFA_VERSION=1.38.1 - -RUN cd $HOME \ +RUN cd /tmp \ && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ && cd aws-efa-installer \ && ./efa_installer.sh -y --skip-kmod -g -# NCCL - -ARG NCCL_VERSION=2.26.2-1 - -# Build NCCL tests -RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ - && cd ${NCCL_TESTS_HOME} \ - && make -j$(nproc) \ - MPI=1 \ - MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_HOME} \ - NCCL_HOME=${NCCL_HOME} - - -# AWS OFI NCCL - -ARG OFI_VERSION=1.14.0 - -RUN cd $HOME \ +RUN cd /tmp \ && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ && cd aws-ofi-nccl \ && ./autogen.sh \ && ./configure \ - --with-cuda=${CUDA_PATH} \ + --with-cuda=${CUDA_HOME} \ --with-libfabric=${LIBFABRIC_PATH} \ --with-mpi=${OPEN_MPI_PATH} \ --with-nccl=${NCCL_HOME} \ @@ -75,15 +55,19 @@ RUN cd $HOME \ && make -j$(numproc) \ && make install -# NCCL Tests +# Build NCCL +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} -RUN cd $HOME \ - && git clone https://github.com/NVIDIA/nccl-tests \ - && cd nccl-tests \ - && make -j$(numproc) \ +# Build NCCL tests +RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ + && cd ${NCCL_TESTS_HOME} \ + && make -j$(nproc) \ MPI=1 \ MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_PATH} \ + CUDA_HOME=${CUDA_HOME} \ NCCL_HOME=${NCCL_HOME} # Final stage From fc5a8ebee3e68595089cd325b2f35d6119bc1bec Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 13:07:49 +0300 Subject: [PATCH 25/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/base-efa.Dockerfile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docker/base/base-efa.Dockerfile b/docker/base/base-efa.Dockerfile index 55af2239e3..af1e1d0606 100644 --- a/docker/base/base-efa.Dockerfile +++ b/docker/base/base-efa.Dockerfile @@ -49,10 +49,9 @@ RUN cd /tmp \ --with-cuda=${CUDA_HOME} \ --with-libfabric=${LIBFABRIC_PATH} \ --with-mpi=${OPEN_MPI_PATH} \ - --with-nccl=${NCCL_HOME} \ --disable-tests \ --prefix=${NCCL_HOME} \ - && make -j$(numproc) \ + && make -j$(nproc) \ && make install # Build NCCL @@ -77,13 +76,17 @@ ENV NCCL_HOME=/opt/nccl ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi ENV NCCL_TESTS_HOME=/opt/nccl-tests + ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" +# TODO: Unsure if this is required, updating ` /etc/ld.so.conf.d` should be enough ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" COPY --from=builder ${NCCL_HOME} ${NCCL_HOME} COPY --from=builder ${LIBFABRIC_PATH} ${LIBFABRIC_PATH} COPY --from=builder ${OPEN_MPI_PATH} ${OPEN_MPI_PATH} COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME} +COPY --from=builder /etc/ld.so.conf.d/000_efa.conf /etc/ld.so.conf.d/000_efa.conf +COPY --from=builder /etc/profile.d/zippy_efa.sh /etc/profile.d/zippy_efa.sh RUN echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ && echo "${OPEN_MPI_PATH}/lib" >> /etc/ld.so.conf.d/openmpi.conf \ From 1902cdea0a79323381db2373212a1fcc2b02562b Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 15:19:16 +0300 Subject: [PATCH 26/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/base-efa.Dockerfile | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/docker/base/base-efa.Dockerfile b/docker/base/base-efa.Dockerfile index af1e1d0606..0087d66ebd 100644 --- a/docker/base/base-efa.Dockerfile +++ b/docker/base/base-efa.Dockerfile @@ -8,6 +8,7 @@ ARG EFA_VERSION=1.38.1 ARG OFI_VERSION=1.14.0 ENV NCCL_HOME=/opt/nccl +ENV OFI_NCCL_HOME=/opt/amazon/ofi-nccl ENV CUDA_HOME=/usr/local/cuda ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi @@ -76,19 +77,24 @@ ENV NCCL_HOME=/opt/nccl ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi ENV NCCL_TESTS_HOME=/opt/nccl-tests - ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" -# TODO: Unsure if this is required, updating ` /etc/ld.so.conf.d` should be enough -ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" COPY --from=builder ${NCCL_HOME} ${NCCL_HOME} -COPY --from=builder ${LIBFABRIC_PATH} ${LIBFABRIC_PATH} -COPY --from=builder ${OPEN_MPI_PATH} ${OPEN_MPI_PATH} +COPY --from=builder ${OFI_NCCL_HOME} ${OFI_NCCL_HOME} +COPY --from=builder /etc/ld.so.conf.d/100_ofinccl.conf /etc/ld.so.conf.d/100_ofinccl.conf COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME} -COPY --from=builder /etc/ld.so.conf.d/000_efa.conf /etc/ld.so.conf.d/000_efa.conf -COPY --from=builder /etc/profile.d/zippy_efa.sh /etc/profile.d/zippy_efa.sh -RUN echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libevent-dev \ + libhwloc-dev \ + && cd /tmp \ + && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y --skip-kmod -g + && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* \ + && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ && echo "${OPEN_MPI_PATH}/lib" >> /etc/ld.so.conf.d/openmpi.conf \ && echo "${LIBFABRIC_PATH}/lib" >> /etc/ld.so.conf.d/efa.conf \ && ldconfig From 709b63c7c50dd4a81a82227caaa78864019fde34 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 15:21:06 +0300 Subject: [PATCH 27/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- examples/clusters/nccl-tests/.dstack.yml | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/examples/clusters/nccl-tests/.dstack.yml b/examples/clusters/nccl-tests/.dstack.yml index 3870731e35..fa2c33dce5 100644 --- a/examples/clusters/nccl-tests/.dstack.yml +++ b/examples/clusters/nccl-tests/.dstack.yml @@ -5,26 +5,13 @@ nodes: 2 startup_order: workers-first stop_criteria: master-done -# This image comes with MPI and NCCL tests pre-built -image: dstackai/efa +image: dstackai/base-stgn:0.11-base-efa env: - NCCL_DEBUG=INFO commands: - - cd /root/nccl-tests/build - | - if [ $DSTACK_NODE_RANK -eq 0 ]; then - mpirun \ - --allow-run-as-root \ - --hostfile $DSTACK_MPI_HOSTFILE \ - -n $DSTACK_GPUS_NUM \ - -N $DSTACK_GPUS_PER_NODE \ - --mca btl_tcp_if_exclude lo,docker0 \ - --bind-to none \ - ./all_reduce_perf -b 8 -e 8G -f 2 -g 1 - else - sleep infinity - fi + sleep infinity resources: - gpu: nvidia:4:16GB + gpu: nvidia:1..8 shm_size: 16GB From 312a9eed0ad5da89b8a4a151c741465473f8ddde Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 15:22:35 +0300 Subject: [PATCH 28/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/base-efa.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/base/base-efa.Dockerfile b/docker/base/base-efa.Dockerfile index 0087d66ebd..4facbf27ea 100644 --- a/docker/base/base-efa.Dockerfile +++ b/docker/base/base-efa.Dockerfile @@ -92,7 +92,7 @@ RUN apt-get update \ && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g + && ./efa_installer.sh -y --skip-kmod -g \ && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* \ && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ && echo "${OPEN_MPI_PATH}/lib" >> /etc/ld.so.conf.d/openmpi.conf \ From 2b58f6771af656f7e8c621daa7188b4f5154a684 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 16:42:03 +0300 Subject: [PATCH 29/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/base-efa.Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/base/base-efa.Dockerfile b/docker/base/base-efa.Dockerfile index 4facbf27ea..6042165306 100644 --- a/docker/base/base-efa.Dockerfile +++ b/docker/base/base-efa.Dockerfile @@ -73,6 +73,8 @@ RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ # Final stage INCLUDE+ base/Dockerfile.common +ARG EFA_VERSION=1.38.1 + ENV NCCL_HOME=/opt/nccl ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi From 8ae1be65a1fdc1407761d3252e5438e0caff943a Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 17:59:15 +0300 Subject: [PATCH 30/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- docker/base/base-efa.Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/base/base-efa.Dockerfile b/docker/base/base-efa.Dockerfile index 6042165306..0331b8f54b 100644 --- a/docker/base/base-efa.Dockerfile +++ b/docker/base/base-efa.Dockerfile @@ -76,6 +76,7 @@ INCLUDE+ base/Dockerfile.common ARG EFA_VERSION=1.38.1 ENV NCCL_HOME=/opt/nccl +ENV OFI_NCCL_HOME=/opt/amazon/ofi-nccl ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi ENV NCCL_TESTS_HOME=/opt/nccl-tests From a24bd1a1292d28c42d7f2c7b9e6fe404cb44846f Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 21:29:30 +0300 Subject: [PATCH 31/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 5 +- docker/base/Dockerfile | 2 +- docker/base/base-efa.Dockerfile | 103 -------------------------------- docker/base/efa.Dockerfile | 32 +++++----- 4 files changed, 19 insertions(+), 123 deletions(-) delete mode 100644 docker/base/base-efa.Dockerfile diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 018bff32dd..af99d8cd28 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -51,8 +51,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - # flavor: ["base", "devel", "devel-efa"] - flavor: ["base-efa"] + flavor: ["base", "devel", "devel-efa"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -71,8 +70,6 @@ jobs: FILE="base/Dockerfile" elif [ "${{ matrix.flavor }}" = "devel" ]; then FILE="base/Dockerfile" - elif [ "${{ matrix.flavor }}" = "base-efa" ]; then - FILE="base/base-efa.Dockerfile" else FILE="base/efa.Dockerfile" fi diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index ccc11d292d..23f9939eb1 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -55,7 +55,7 @@ ENV NCCL_TESTS_HOME=/opt/nccl-tests COPY --from=builder ${NCCL_HOME}/lib ${NCCL_HOME}/lib COPY --from=builder ${NCCL_HOME}/include ${NCCL_HOME}/include -COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME} +COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME}/build ARG FLAVOR diff --git a/docker/base/base-efa.Dockerfile b/docker/base/base-efa.Dockerfile deleted file mode 100644 index 0331b8f54b..0000000000 --- a/docker/base/base-efa.Dockerfile +++ /dev/null @@ -1,103 +0,0 @@ -# syntax = edrevo/dockerfile-plus - -# Build stage -FROM nvidia/cuda:12.1.1-base-ubuntu20.04 AS builder - -ARG NCCL_VERSION=2.26.2-1 -ARG EFA_VERSION=1.38.1 -ARG OFI_VERSION=1.14.0 - -ENV NCCL_HOME=/opt/nccl -ENV OFI_NCCL_HOME=/opt/amazon/ofi-nccl -ENV CUDA_HOME=/usr/local/cuda -ENV LIBFABRIC_PATH=/opt/amazon/efa -ENV OPEN_MPI_PATH=/opt/amazon/openmpi -ENV NCCL_TESTS_HOME=/opt/nccl-tests - -# Install build dependencies -RUN export DEBIAN_FRONTEND=noninteractive \ - && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \ - && apt-get update --fix-missing \ - && apt-get upgrade -y \ - && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ - && apt-get install -y tzdata \ - && dpkg-reconfigure --frontend noninteractive tzdata \ - && cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-${cuda_version} \ - cuda-nvcc-${cuda_version} \ - libhwloc-dev \ - autoconf \ - automake \ - libtool \ - libopenmpi-dev \ - git \ - curl \ - python3 \ - build-essential - -RUN cd /tmp \ - && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ - && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g - -RUN cd /tmp \ - && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ - && cd aws-ofi-nccl \ - && ./autogen.sh \ - && ./configure \ - --with-cuda=${CUDA_HOME} \ - --with-libfabric=${LIBFABRIC_PATH} \ - --with-mpi=${OPEN_MPI_PATH} \ - --disable-tests \ - --prefix=${NCCL_HOME} \ - && make -j$(nproc) \ - && make install - -# Build NCCL -RUN cd /tmp \ - && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ - && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} - -# Build NCCL tests -RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ - && cd ${NCCL_TESTS_HOME} \ - && make -j$(nproc) \ - MPI=1 \ - MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_HOME} \ - NCCL_HOME=${NCCL_HOME} - -# Final stage -INCLUDE+ base/Dockerfile.common - -ARG EFA_VERSION=1.38.1 - -ENV NCCL_HOME=/opt/nccl -ENV OFI_NCCL_HOME=/opt/amazon/ofi-nccl -ENV LIBFABRIC_PATH=/opt/amazon/efa -ENV OPEN_MPI_PATH=/opt/amazon/openmpi -ENV NCCL_TESTS_HOME=/opt/nccl-tests -ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" - -COPY --from=builder ${NCCL_HOME} ${NCCL_HOME} -COPY --from=builder ${OFI_NCCL_HOME} ${OFI_NCCL_HOME} -COPY --from=builder /etc/ld.so.conf.d/100_ofinccl.conf /etc/ld.so.conf.d/100_ofinccl.conf -COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME} - -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - libevent-dev \ - libhwloc-dev \ - && cd /tmp \ - && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ - && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g \ - && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* \ - && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ - && echo "${OPEN_MPI_PATH}/lib" >> /etc/ld.so.conf.d/openmpi.conf \ - && echo "${LIBFABRIC_PATH}/lib" >> /etc/ld.so.conf.d/efa.conf \ - && ldconfig diff --git a/docker/base/efa.Dockerfile b/docker/base/efa.Dockerfile index 1f91bf618c..537c44485f 100644 --- a/docker/base/efa.Dockerfile +++ b/docker/base/efa.Dockerfile @@ -2,14 +2,15 @@ INCLUDE+ base/Dockerfile.common +ENV NCCL_HOME=/opt/nccl ENV PREFIX=/usr/local -ENV CUDA_PATH=/usr/local/cuda +ENV CUDA_HOME=/usr/local/cuda ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" -# prerequisites +# Prerequisites RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ && apt-get update \ @@ -25,47 +26,48 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ ARG EFA_VERSION=1.38.1 -RUN cd $HOME \ +RUN cd /tmp \ && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g + && ./efa_installer.sh -y --skip-kmod -g \ + && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* # NCCL ARG NCCL_VERSION=2.26.2-1 -RUN cd $HOME \ +RUN cd /tmp \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${PREFIX} + && make -j$(nproc) src.build BUILDDIR=${PREFIX} \ + && rm -rf /tmp/nccl # AWS OFI NCCL ARG OFI_VERSION=1.14.0 -RUN cd $HOME \ +RUN cd /tmp \ && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ && cd aws-ofi-nccl \ && ./autogen.sh \ && ./configure \ - --with-cuda=${CUDA_PATH} \ + --with-cuda=${CUDA_HOME} \ --with-libfabric=${LIBFABRIC_PATH} \ --with-mpi=${OPEN_MPI_PATH} \ - --with-cuda=${CUDA_PATH} \ - --with-nccl=${PREFIX} \ --disable-tests \ --prefix=${PREFIX} \ - && make -j$(numproc) \ - && make install + && make -j$(nproc) \ + && make install \ + && rm -rf /tmp/aws-ofi-nccl # NCCL Tests -RUN cd $HOME \ +RUN cd $NCCL_HOME \ && git clone https://github.com/NVIDIA/nccl-tests \ && cd nccl-tests \ - && make -j$(numproc) \ + && make -j$(nproc) \ MPI=1 \ MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_PATH} \ + CUDA_HOME=${CUDA_HOME} \ NCCL_HOME=${PREFIX} From 2946a20c049f0e8a162a60178591ae8bc94e95fa Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 21:47:03 +0300 Subject: [PATCH 32/54] Revert "[UX] Pre-build a EFA version of the default Docker image #2793" This reverts commit a24bd1a1292d28c42d7f2c7b9e6fe404cb44846f. --- .github/workflows/docker.yml | 5 +- docker/base/Dockerfile | 2 +- docker/base/base-efa.Dockerfile | 103 ++++++++++++++++++++++++++++++++ docker/base/efa.Dockerfile | 32 +++++----- 4 files changed, 123 insertions(+), 19 deletions(-) create mode 100644 docker/base/base-efa.Dockerfile diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index af99d8cd28..018bff32dd 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -51,7 +51,8 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - flavor: ["base", "devel", "devel-efa"] + # flavor: ["base", "devel", "devel-efa"] + flavor: ["base-efa"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -70,6 +71,8 @@ jobs: FILE="base/Dockerfile" elif [ "${{ matrix.flavor }}" = "devel" ]; then FILE="base/Dockerfile" + elif [ "${{ matrix.flavor }}" = "base-efa" ]; then + FILE="base/base-efa.Dockerfile" else FILE="base/efa.Dockerfile" fi diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 23f9939eb1..ccc11d292d 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -55,7 +55,7 @@ ENV NCCL_TESTS_HOME=/opt/nccl-tests COPY --from=builder ${NCCL_HOME}/lib ${NCCL_HOME}/lib COPY --from=builder ${NCCL_HOME}/include ${NCCL_HOME}/include -COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME}/build +COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME} ARG FLAVOR diff --git a/docker/base/base-efa.Dockerfile b/docker/base/base-efa.Dockerfile new file mode 100644 index 0000000000..0331b8f54b --- /dev/null +++ b/docker/base/base-efa.Dockerfile @@ -0,0 +1,103 @@ +# syntax = edrevo/dockerfile-plus + +# Build stage +FROM nvidia/cuda:12.1.1-base-ubuntu20.04 AS builder + +ARG NCCL_VERSION=2.26.2-1 +ARG EFA_VERSION=1.38.1 +ARG OFI_VERSION=1.14.0 + +ENV NCCL_HOME=/opt/nccl +ENV OFI_NCCL_HOME=/opt/amazon/ofi-nccl +ENV CUDA_HOME=/usr/local/cuda +ENV LIBFABRIC_PATH=/opt/amazon/efa +ENV OPEN_MPI_PATH=/opt/amazon/openmpi +ENV NCCL_TESTS_HOME=/opt/nccl-tests + +# Install build dependencies +RUN export DEBIAN_FRONTEND=noninteractive \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \ + && apt-get update --fix-missing \ + && apt-get upgrade -y \ + && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ + && apt-get install -y tzdata \ + && dpkg-reconfigure --frontend noninteractive tzdata \ + && cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + autoconf \ + automake \ + libtool \ + libopenmpi-dev \ + git \ + curl \ + python3 \ + build-essential + +RUN cd /tmp \ + && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y --skip-kmod -g + +RUN cd /tmp \ + && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ + && cd aws-ofi-nccl \ + && ./autogen.sh \ + && ./configure \ + --with-cuda=${CUDA_HOME} \ + --with-libfabric=${LIBFABRIC_PATH} \ + --with-mpi=${OPEN_MPI_PATH} \ + --disable-tests \ + --prefix=${NCCL_HOME} \ + && make -j$(nproc) \ + && make install + +# Build NCCL +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} + +# Build NCCL tests +RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ + && cd ${NCCL_TESTS_HOME} \ + && make -j$(nproc) \ + MPI=1 \ + MPI_HOME=${OPEN_MPI_PATH} \ + CUDA_HOME=${CUDA_HOME} \ + NCCL_HOME=${NCCL_HOME} + +# Final stage +INCLUDE+ base/Dockerfile.common + +ARG EFA_VERSION=1.38.1 + +ENV NCCL_HOME=/opt/nccl +ENV OFI_NCCL_HOME=/opt/amazon/ofi-nccl +ENV LIBFABRIC_PATH=/opt/amazon/efa +ENV OPEN_MPI_PATH=/opt/amazon/openmpi +ENV NCCL_TESTS_HOME=/opt/nccl-tests +ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" + +COPY --from=builder ${NCCL_HOME} ${NCCL_HOME} +COPY --from=builder ${OFI_NCCL_HOME} ${OFI_NCCL_HOME} +COPY --from=builder /etc/ld.so.conf.d/100_ofinccl.conf /etc/ld.so.conf.d/100_ofinccl.conf +COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME} + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libevent-dev \ + libhwloc-dev \ + && cd /tmp \ + && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y --skip-kmod -g \ + && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* \ + && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ + && echo "${OPEN_MPI_PATH}/lib" >> /etc/ld.so.conf.d/openmpi.conf \ + && echo "${LIBFABRIC_PATH}/lib" >> /etc/ld.so.conf.d/efa.conf \ + && ldconfig diff --git a/docker/base/efa.Dockerfile b/docker/base/efa.Dockerfile index 537c44485f..1f91bf618c 100644 --- a/docker/base/efa.Dockerfile +++ b/docker/base/efa.Dockerfile @@ -2,15 +2,14 @@ INCLUDE+ base/Dockerfile.common -ENV NCCL_HOME=/opt/nccl ENV PREFIX=/usr/local -ENV CUDA_HOME=/usr/local/cuda +ENV CUDA_PATH=/usr/local/cuda ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" -# Prerequisites +# prerequisites RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ && apt-get update \ @@ -26,48 +25,47 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ ARG EFA_VERSION=1.38.1 -RUN cd /tmp \ +RUN cd $HOME \ && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g \ - && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* + && ./efa_installer.sh -y --skip-kmod -g # NCCL ARG NCCL_VERSION=2.26.2-1 -RUN cd /tmp \ +RUN cd $HOME \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${PREFIX} \ - && rm -rf /tmp/nccl + && make -j$(nproc) src.build BUILDDIR=${PREFIX} # AWS OFI NCCL ARG OFI_VERSION=1.14.0 -RUN cd /tmp \ +RUN cd $HOME \ && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ && cd aws-ofi-nccl \ && ./autogen.sh \ && ./configure \ - --with-cuda=${CUDA_HOME} \ + --with-cuda=${CUDA_PATH} \ --with-libfabric=${LIBFABRIC_PATH} \ --with-mpi=${OPEN_MPI_PATH} \ + --with-cuda=${CUDA_PATH} \ + --with-nccl=${PREFIX} \ --disable-tests \ --prefix=${PREFIX} \ - && make -j$(nproc) \ - && make install \ - && rm -rf /tmp/aws-ofi-nccl + && make -j$(numproc) \ + && make install # NCCL Tests -RUN cd $NCCL_HOME \ +RUN cd $HOME \ && git clone https://github.com/NVIDIA/nccl-tests \ && cd nccl-tests \ - && make -j$(nproc) \ + && make -j$(numproc) \ MPI=1 \ MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_HOME} \ + CUDA_HOME=${CUDA_PATH} \ NCCL_HOME=${PREFIX} From 9903d01025f045da5cf026283f701b16ce1361c8 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 22:11:15 +0300 Subject: [PATCH 33/54] Revert "[UX] Pre-build a EFA version of the default Docker image #2793" --- docker/base/Dockerfile | 32 +++++++++++++++------------- docker/base/efa.Dockerfile | 43 +++++++++++++++++++++----------------- 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index ccc11d292d..94812a242a 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -3,14 +3,12 @@ # Build stage FROM nvidia/cuda:12.1.1-base-ubuntu20.04 AS builder -ARG NCCL_VERSION=2.26.2-1 - ENV NCCL_HOME=/opt/nccl ENV CUDA_HOME=/usr/local/cuda ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi -ENV NCCL_TESTS_HOME=/opt/nccl-tests -# Install build dependencies +# Prerequisites + RUN export DEBIAN_FRONTEND=noninteractive \ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \ && apt-get update --fix-missing \ @@ -32,15 +30,20 @@ RUN export DEBIAN_FRONTEND=noninteractive \ python3 \ build-essential -# Build NCCL +# NCCL + +ARG NCCL_VERSION=2.26.2-1 + RUN cd /tmp \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ && cd nccl \ && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} -# Build NCCL tests -RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ - && cd ${NCCL_TESTS_HOME} \ +# NCCL tests + +RUN cd /opt \ + && git clone https://github.com/NVIDIA/nccl-tests \ + && cd nccl-tests \ && make -j$(nproc) \ MPI=1 \ MPI_HOME=${OPEN_MPI_PATH} \ @@ -48,20 +51,21 @@ RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ NCCL_HOME=${NCCL_HOME} # Final stage + INCLUDE+ base/Dockerfile.common ENV NCCL_HOME=/opt/nccl -ENV NCCL_TESTS_HOME=/opt/nccl-tests -COPY --from=builder ${NCCL_HOME}/lib ${NCCL_HOME}/lib -COPY --from=builder ${NCCL_HOME}/include ${NCCL_HOME}/include -COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME} +COPY --from=builder ${NCCL_HOME} ${NCCL_HOME} +COPY --from=builder /opt/nccl-tests/build /opt/nccl-tests/build ARG FLAVOR -# Configure library paths +# MPI, NVCC, and /etc/ld.so.conf.d + RUN apt-get update \ - && apt-get install -y --no-install-recommends openmpi-bin \ + && apt-get install -y --no-install-recommends \ + openmpi-bin \ && if [ "$FLAVOR" = "devel" ]; then \ cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ && apt-get install -y --no-install-recommends \ diff --git a/docker/base/efa.Dockerfile b/docker/base/efa.Dockerfile index 1f91bf618c..50b6c1c5ef 100644 --- a/docker/base/efa.Dockerfile +++ b/docker/base/efa.Dockerfile @@ -2,14 +2,14 @@ INCLUDE+ base/Dockerfile.common -ENV PREFIX=/usr/local -ENV CUDA_PATH=/usr/local/cuda +ENV NCCL_HOME=/usr/local +ENV CUDA_HOME=/usr/local/cuda ENV LIBFABRIC_PATH=/opt/amazon/efa ENV OPEN_MPI_PATH=/opt/amazon/openmpi ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" -# prerequisites +# Prerequisites RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ && apt-get update \ @@ -19,53 +19,58 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ libhwloc-dev \ autoconf \ automake \ - libtool + libtool \ + && rm -rf /var/lib/apt/lists/* # EFA ARG EFA_VERSION=1.38.1 -RUN cd $HOME \ +RUN cd /tmp \ + && apt-get update \ && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g + && ./efa_installer.sh -y --skip-kmod -g \ + && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* # NCCL ARG NCCL_VERSION=2.26.2-1 -RUN cd $HOME \ +RUN cd /tmp \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${PREFIX} + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ + && rm -rf /tmp/nccl # AWS OFI NCCL ARG OFI_VERSION=1.14.0 -RUN cd $HOME \ +RUN cd /tmp \ && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ && cd aws-ofi-nccl \ && ./autogen.sh \ && ./configure \ - --with-cuda=${CUDA_PATH} \ + --with-cuda=${CUDA_HOME} \ --with-libfabric=${LIBFABRIC_PATH} \ --with-mpi=${OPEN_MPI_PATH} \ - --with-cuda=${CUDA_PATH} \ - --with-nccl=${PREFIX} \ + --with-cuda=${CUDA_HOME} \ + --with-nccl=${NCCL_HOME} \ --disable-tests \ - --prefix=${PREFIX} \ - && make -j$(numproc) \ - && make install + --prefix=${NCCL_HOME} \ + && make -j$(nproc) \ + && make install \ + && rm -rf /tmp/aws-ofi-nccl /var/lib/apt/lists/* # NCCL Tests -RUN cd $HOME \ +RUN cd /opt \ && git clone https://github.com/NVIDIA/nccl-tests \ && cd nccl-tests \ - && make -j$(numproc) \ + && make -j$(nproc) \ MPI=1 \ MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_PATH} \ - NCCL_HOME=${PREFIX} + CUDA_HOME=${CUDA_HOME} \ + NCCL_HOME=${NCCL_HOME} From 8b796bb8e181c734010dab1d9248db8dca53238d Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 22:13:45 +0300 Subject: [PATCH 34/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .github/workflows/docker.yml | 7 +- docker/base/base-efa.Dockerfile | 103 ------------------ .../base/{efa.Dockerfile => efa/Dockerfile} | 0 docker/{ => base}/efa/README.md | 0 4 files changed, 2 insertions(+), 108 deletions(-) delete mode 100644 docker/base/base-efa.Dockerfile rename docker/base/{efa.Dockerfile => efa/Dockerfile} (100%) rename docker/{ => base}/efa/README.md (100%) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 018bff32dd..97be8253e0 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -51,8 +51,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - # flavor: ["base", "devel", "devel-efa"] - flavor: ["base-efa"] + flavor: ["base", "devel", "devel-efa"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -71,10 +70,8 @@ jobs: FILE="base/Dockerfile" elif [ "${{ matrix.flavor }}" = "devel" ]; then FILE="base/Dockerfile" - elif [ "${{ matrix.flavor }}" = "base-efa" ]; then - FILE="base/base-efa.Dockerfile" else - FILE="base/efa.Dockerfile" + FILE="base/efa/Dockerfile" fi docker buildx build \ --platform linux/amd64 \ diff --git a/docker/base/base-efa.Dockerfile b/docker/base/base-efa.Dockerfile deleted file mode 100644 index 0331b8f54b..0000000000 --- a/docker/base/base-efa.Dockerfile +++ /dev/null @@ -1,103 +0,0 @@ -# syntax = edrevo/dockerfile-plus - -# Build stage -FROM nvidia/cuda:12.1.1-base-ubuntu20.04 AS builder - -ARG NCCL_VERSION=2.26.2-1 -ARG EFA_VERSION=1.38.1 -ARG OFI_VERSION=1.14.0 - -ENV NCCL_HOME=/opt/nccl -ENV OFI_NCCL_HOME=/opt/amazon/ofi-nccl -ENV CUDA_HOME=/usr/local/cuda -ENV LIBFABRIC_PATH=/opt/amazon/efa -ENV OPEN_MPI_PATH=/opt/amazon/openmpi -ENV NCCL_TESTS_HOME=/opt/nccl-tests - -# Install build dependencies -RUN export DEBIAN_FRONTEND=noninteractive \ - && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \ - && apt-get update --fix-missing \ - && apt-get upgrade -y \ - && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ - && apt-get install -y tzdata \ - && dpkg-reconfigure --frontend noninteractive tzdata \ - && cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-${cuda_version} \ - cuda-nvcc-${cuda_version} \ - libhwloc-dev \ - autoconf \ - automake \ - libtool \ - libopenmpi-dev \ - git \ - curl \ - python3 \ - build-essential - -RUN cd /tmp \ - && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ - && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g - -RUN cd /tmp \ - && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ - && cd aws-ofi-nccl \ - && ./autogen.sh \ - && ./configure \ - --with-cuda=${CUDA_HOME} \ - --with-libfabric=${LIBFABRIC_PATH} \ - --with-mpi=${OPEN_MPI_PATH} \ - --disable-tests \ - --prefix=${NCCL_HOME} \ - && make -j$(nproc) \ - && make install - -# Build NCCL -RUN cd /tmp \ - && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ - && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} - -# Build NCCL tests -RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ - && cd ${NCCL_TESTS_HOME} \ - && make -j$(nproc) \ - MPI=1 \ - MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_HOME} \ - NCCL_HOME=${NCCL_HOME} - -# Final stage -INCLUDE+ base/Dockerfile.common - -ARG EFA_VERSION=1.38.1 - -ENV NCCL_HOME=/opt/nccl -ENV OFI_NCCL_HOME=/opt/amazon/ofi-nccl -ENV LIBFABRIC_PATH=/opt/amazon/efa -ENV OPEN_MPI_PATH=/opt/amazon/openmpi -ENV NCCL_TESTS_HOME=/opt/nccl-tests -ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" - -COPY --from=builder ${NCCL_HOME} ${NCCL_HOME} -COPY --from=builder ${OFI_NCCL_HOME} ${OFI_NCCL_HOME} -COPY --from=builder /etc/ld.so.conf.d/100_ofinccl.conf /etc/ld.so.conf.d/100_ofinccl.conf -COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME} - -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - libevent-dev \ - libhwloc-dev \ - && cd /tmp \ - && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ - && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g \ - && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* \ - && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ - && echo "${OPEN_MPI_PATH}/lib" >> /etc/ld.so.conf.d/openmpi.conf \ - && echo "${LIBFABRIC_PATH}/lib" >> /etc/ld.so.conf.d/efa.conf \ - && ldconfig diff --git a/docker/base/efa.Dockerfile b/docker/base/efa/Dockerfile similarity index 100% rename from docker/base/efa.Dockerfile rename to docker/base/efa/Dockerfile diff --git a/docker/efa/README.md b/docker/base/efa/README.md similarity index 100% rename from docker/efa/README.md rename to docker/base/efa/README.md From 305e5f17c1a20582556bb80ec5c8a00618a995c3 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 23:54:28 +0300 Subject: [PATCH 35/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- scripts/packer/aws-image-cuda.json | 8 -------- scripts/packer/aws-image.json | 8 -------- scripts/packer/azure-image-cuda.json | 8 -------- scripts/packer/azure-image-grid.json | 8 -------- scripts/packer/azure-image.json | 8 -------- scripts/packer/build-cuda-image.pkr.hcl | 5 ----- scripts/packer/build-image.pkr.hcl | 5 ----- scripts/packer/gcp-image-cuda.json | 8 -------- scripts/packer/gcp-image.json | 8 -------- scripts/packer/oci-image-cuda.json | 8 -------- scripts/packer/oci-image.json | 8 -------- .../packer/provisioners/pull-docker-images.sh | 18 ------------------ .../server/services/jobs/configurators/base.py | 11 ++++------- src/dstack/version.py | 2 +- .../tasks/test_process_running_jobs.py | 2 +- .../_internal/server/routers/test_runs.py | 8 ++++---- .../services/jobs/configurators/test_task.py | 4 ++-- 17 files changed, 12 insertions(+), 115 deletions(-) delete mode 100644 scripts/packer/provisioners/pull-docker-images.sh diff --git a/scripts/packer/aws-image-cuda.json b/scripts/packer/aws-image-cuda.json index 64f4280bb3..f7719304ef 100644 --- a/scripts/packer/aws-image-cuda.json +++ b/scripts/packer/aws-image-cuda.json @@ -81,14 +81,6 @@ { "type": "shell", "script": "provisioners/install-nvidia-container-toolkit.sh" - }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/aws-image.json b/scripts/packer/aws-image.json index c9e3cd5fb8..0327d6f9fb 100644 --- a/scripts/packer/aws-image.json +++ b/scripts/packer/aws-image.json @@ -71,14 +71,6 @@ "cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version {{user `docker_version`}}"] - }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/azure-image-cuda.json b/scripts/packer/azure-image-cuda.json index 4107c1ff65..c191282aec 100644 --- a/scripts/packer/azure-image-cuda.json +++ b/scripts/packer/azure-image-cuda.json @@ -73,14 +73,6 @@ "type": "shell", "script": "provisioners/install-nvidia-container-toolkit.sh" }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" - }, { "type": "shell", "execute_command": "chmod +x {{ .Path }}; {{ .Vars }} sudo -E sh '{{ .Path }}'", diff --git a/scripts/packer/azure-image-grid.json b/scripts/packer/azure-image-grid.json index adef2d4ba9..58239b777a 100644 --- a/scripts/packer/azure-image-grid.json +++ b/scripts/packer/azure-image-grid.json @@ -71,14 +71,6 @@ "type": "shell", "script": "provisioners/install-nvidia-container-toolkit.sh" }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" - }, { "type": "shell", "execute_command": "chmod +x {{ .Path }}; {{ .Vars }} sudo -E sh '{{ .Path }}'", diff --git a/scripts/packer/azure-image.json b/scripts/packer/azure-image.json index 2c5602040a..0b0d378335 100644 --- a/scripts/packer/azure-image.json +++ b/scripts/packer/azure-image.json @@ -63,14 +63,6 @@ "./install-docker.sh --version {{user `docker_version`}}" ] }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" - }, { "type": "shell", "execute_command": "chmod +x {{ .Path }}; {{ .Vars }} sudo -E sh '{{ .Path }}'", diff --git a/scripts/packer/build-cuda-image.pkr.hcl b/scripts/packer/build-cuda-image.pkr.hcl index b50dbada9f..48b1c20024 100644 --- a/scripts/packer/build-cuda-image.pkr.hcl +++ b/scripts/packer/build-cuda-image.pkr.hcl @@ -31,9 +31,4 @@ build { provisioner "shell" { script = "provisioners/install-nvidia-container-toolkit.sh" } - - provisioner "shell" { - environment_vars = ["IMAGE_VERSION=${var.image_version}"] - script = "provisioners/pull-docker-images.sh" - } } diff --git a/scripts/packer/build-image.pkr.hcl b/scripts/packer/build-image.pkr.hcl index 4cb1517dc3..6033ee4b1f 100644 --- a/scripts/packer/build-image.pkr.hcl +++ b/scripts/packer/build-image.pkr.hcl @@ -22,9 +22,4 @@ build { provisioner "shell" { inline = ["cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version ${local.docker_version}"] } - - provisioner "shell" { - environment_vars = ["IMAGE_VERSION=${var.image_version}"] - script = "provisioners/pull-docker-images.sh" - } } diff --git a/scripts/packer/gcp-image-cuda.json b/scripts/packer/gcp-image-cuda.json index 9ebb24e7c9..2d606a2b42 100644 --- a/scripts/packer/gcp-image-cuda.json +++ b/scripts/packer/gcp-image-cuda.json @@ -56,14 +56,6 @@ { "type": "shell", "script": "provisioners/install-nvidia-container-toolkit.sh" - }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/gcp-image.json b/scripts/packer/gcp-image.json index 6e6ba64537..7b9fd2f950 100644 --- a/scripts/packer/gcp-image.json +++ b/scripts/packer/gcp-image.json @@ -46,14 +46,6 @@ "cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version {{user `docker_version`}}"] - }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/oci-image-cuda.json b/scripts/packer/oci-image-cuda.json index 7d7251bc0b..e0406d8738 100644 --- a/scripts/packer/oci-image-cuda.json +++ b/scripts/packer/oci-image-cuda.json @@ -65,14 +65,6 @@ { "type": "shell", "script": "provisioners/install-nvidia-container-toolkit.sh" - }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/oci-image.json b/scripts/packer/oci-image.json index 742a8649a0..370cb4bf2d 100644 --- a/scripts/packer/oci-image.json +++ b/scripts/packer/oci-image.json @@ -55,14 +55,6 @@ "cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version {{user `docker_version`}}"] - }, - { - "type": "shell", - "environment_vars": [ - "IMAGE_REPO={{user `image_repo`}}", - "IMAGE_VERSION={{user `image_version`}}" - ], - "script": "provisioners/pull-docker-images.sh" } ] } diff --git a/scripts/packer/provisioners/pull-docker-images.sh b/scripts/packer/provisioners/pull-docker-images.sh deleted file mode 100644 index 4d9d826435..0000000000 --- a/scripts/packer/provisioners/pull-docker-images.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -set -e - -IMAGES=" - dstackai/${IMAGE_REPO}:py3.13-${IMAGE_VERSION}-cuda-12.1 - dstackai/${IMAGE_REPO}:py3.12-${IMAGE_VERSION}-cuda-12.1 - dstackai/${IMAGE_REPO}:py3.11-${IMAGE_VERSION}-cuda-12.1 - dstackai/${IMAGE_REPO}:py3.10-${IMAGE_VERSION}-cuda-12.1 - dstackai/${IMAGE_REPO}:py3.9-${IMAGE_VERSION}-cuda-12.1 -" -echo "START pull image" -for img in $IMAGES; do - docker pull --platform linux/amd64 $img -done -echo "LIST installed images" -docker image ls --all -echo "END " diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py index e3c7b89ee4..4c6d7ca973 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/base.py +++ b/src/dstack/_internal/server/services/jobs/configurators/base.py @@ -50,11 +50,8 @@ def get_default_python_verison() -> str: ) -def get_default_image(python_version: str, nvcc: bool = False) -> str: - suffix = "" - if nvcc: - suffix = "-devel" - return f"{settings.DSTACK_BASE_IMAGE}:py{python_version}-{settings.DSTACK_BASE_IMAGE_VERSION}-cuda-12.1{suffix}" +def get_default_image(nvcc: bool = False) -> str: + return f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-{'devel' if nvcc else 'base'}" class JobConfigurator(ABC): @@ -173,7 +170,7 @@ def _dstack_image_commands(self) -> List[str]: ): return [] return [ - f"uv venv --prompt workflow --seed {DEFAULT_REPO_DIR}/.venv > /dev/null 2>&1", + f"uv venv --python {self._python()} --prompt workflow --seed {DEFAULT_REPO_DIR}/.venv > /dev/null 2>&1", f"echo 'source {DEFAULT_REPO_DIR}/.venv/bin/activate' >> ~/.bashrc", f"source {DEFAULT_REPO_DIR}/.venv/bin/activate", ] @@ -199,7 +196,7 @@ def _home_dir(self) -> Optional[str]: def _image_name(self) -> str: if self.run_spec.configuration.image is not None: return self.run_spec.configuration.image - return get_default_image(self._python(), nvcc=bool(self.run_spec.configuration.nvcc)) + return get_default_image(nvcc=bool(self.run_spec.configuration.nvcc)) async def _user(self) -> Optional[UnixUser]: user = self.run_spec.configuration.user diff --git a/src/dstack/version.py b/src/dstack/version.py index c71d1a7665..80e1b21d2a 100644 --- a/src/dstack/version.py +++ b/src/dstack/version.py @@ -1,3 +1,3 @@ __version__ = "0.0.0" __is_release__ = False -base_image = "0.9" +base_image = "0.10" diff --git a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py index d842441fe9..a19003842c 100644 --- a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py +++ b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py @@ -330,7 +330,7 @@ async def test_provisioning_shim_with_volumes( name="test-run-0-0", registry_username="", registry_password="", - image_name="dstackai/base:py3.13-0.9-cuda-12.1", + image_name="dstackai/base:0.10-base", container_user="root", privileged=privileged, gpu=None, diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 4a40c3273f..9bedd934f9 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -170,7 +170,7 @@ def get_dev_env_run_plan_dict( "/bin/bash", "-i", "-c", - "uv venv --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" + "uv venv --python 3.13 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc" " && source /workflow/.venv/bin/activate" " && (echo pip install ipykernel... && " @@ -188,7 +188,7 @@ def get_dev_env_run_plan_dict( ], "env": {}, "home_dir": "/root", - "image_name": "dstackai/base:py3.13-0.9-cuda-12.1", + "image_name": "dstackai/base:0.10-base", "user": None, "privileged": privileged, "job_name": f"{run_name}-0-0", @@ -334,7 +334,7 @@ def get_dev_env_run_dict( "/bin/bash", "-i", "-c", - "uv venv --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" + "uv venv --python 3.13 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc" " && source /workflow/.venv/bin/activate" " && (echo pip install ipykernel... && " @@ -352,7 +352,7 @@ def get_dev_env_run_dict( ], "env": {}, "home_dir": "/root", - "image_name": "dstackai/base:py3.13-0.9-cuda-12.1", + "image_name": "dstackai/base:0.10-base", "user": None, "privileged": privileged, "job_name": f"{run_name}-0-0", diff --git a/src/tests/_internal/server/services/jobs/configurators/test_task.py b/src/tests/_internal/server/services/jobs/configurators/test_task.py index 07c8da7328..e954e6a01f 100644 --- a/src/tests/_internal/server/services/jobs/configurators/test_task.py +++ b/src/tests/_internal/server/services/jobs/configurators/test_task.py @@ -88,7 +88,7 @@ async def test_with_commands_and_image(self, shell: Optional[str], expected_shel ], ) async def test_with_commands_no_image(self, shell: Optional[str], expected_shell: str): - configuration = TaskConfiguration(commands=["sleep inf"], shell=shell) + configuration = TaskConfiguration(python="3.12", commands=["sleep inf"], shell=shell) run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) configurator = TaskJobConfigurator(run_spec) @@ -98,7 +98,7 @@ async def test_with_commands_no_image(self, shell: Optional[str], expected_shell expected_shell, "-i", "-c", - "uv venv --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" + "uv venv --python 3.12 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc" " && source /workflow/.venv/bin/activate" " && sleep inf", From 46a7d5189bacd5a8cf397c86501d246fc9bf0515 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 14 Jun 2025 23:59:44 +0300 Subject: [PATCH 36/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- examples/clusters/nccl-tests/.dstack.yml | 10 ++++++++-- examples/distributed-training/torchrun/.dstack.yml | 6 ++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/clusters/nccl-tests/.dstack.yml b/examples/clusters/nccl-tests/.dstack.yml index fa2c33dce5..6a8fd1c5c0 100644 --- a/examples/clusters/nccl-tests/.dstack.yml +++ b/examples/clusters/nccl-tests/.dstack.yml @@ -5,12 +5,18 @@ nodes: 2 startup_order: workers-first stop_criteria: master-done -image: dstackai/base-stgn:0.11-base-efa env: - NCCL_DEBUG=INFO commands: - | - sleep infinity + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --mca btl_tcp_if_exclude lo,docker0 \ + --bind-to none \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 resources: gpu: nvidia:1..8 diff --git a/examples/distributed-training/torchrun/.dstack.yml b/examples/distributed-training/torchrun/.dstack.yml index 062051f365..4eccdb263e 100644 --- a/examples/distributed-training/torchrun/.dstack.yml +++ b/examples/distributed-training/torchrun/.dstack.yml @@ -1,7 +1,6 @@ type: task name: train-distrib -# The size of the cluster nodes: 2 python: 3.12 @@ -21,6 +20,5 @@ commands: multinode.py 50 10 resources: - gpu: 24GB:1..2 - # Uncomment if using multiple GPUs - #shm_size: 24GB + gpu: 1..8 + shm_size: 16GB From 0cbf5b9d0792bdcadfcc9efe77088d687487711a Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sun, 15 Jun 2025 00:00:36 +0300 Subject: [PATCH 37/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- examples/clusters/nccl-tests/.dstack.yml | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/examples/clusters/nccl-tests/.dstack.yml b/examples/clusters/nccl-tests/.dstack.yml index 6a8fd1c5c0..79a7b8ec8f 100644 --- a/examples/clusters/nccl-tests/.dstack.yml +++ b/examples/clusters/nccl-tests/.dstack.yml @@ -9,15 +9,19 @@ env: - NCCL_DEBUG=INFO commands: - | - mpirun \ - --allow-run-as-root \ - --hostfile $DSTACK_MPI_HOSTFILE \ - -n $DSTACK_GPUS_NUM \ - -N $DSTACK_GPUS_PER_NODE \ - --mca btl_tcp_if_exclude lo,docker0 \ - --bind-to none \ - /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 - + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --mca btl_tcp_if_exclude lo,docker0 \ + --bind-to none \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 + else + sleep infinity + fi + resources: gpu: nvidia:1..8 shm_size: 16GB From 0105f7002c2dfe3122822cd9b92d344b76aa5a05 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sun, 15 Jun 2025 00:01:23 +0300 Subject: [PATCH 38/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- examples/clusters/nccl-tests/.dstack.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/clusters/nccl-tests/.dstack.yml b/examples/clusters/nccl-tests/.dstack.yml index 79a7b8ec8f..dc399534f4 100644 --- a/examples/clusters/nccl-tests/.dstack.yml +++ b/examples/clusters/nccl-tests/.dstack.yml @@ -21,7 +21,7 @@ commands: else sleep infinity fi - + resources: gpu: nvidia:1..8 shm_size: 16GB From 31dfd392a36edbce2b4bdb7c5e34f110c98f61dd Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sun, 15 Jun 2025 13:26:42 +0300 Subject: [PATCH 39/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .../background/tasks/process_running_jobs.py | 42 ++++- .../tasks/test_process_running_jobs.py | 149 +++++++++++++++++- 2 files changed, 183 insertions(+), 8 deletions(-) diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py index e05f98fd23..23f151454e 100644 --- a/src/dstack/_internal/server/background/tasks/process_running_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_running_jobs.py @@ -1,4 +1,5 @@ import asyncio +import re from collections.abc import Iterable from datetime import timedelta, timezone from typing import Dict, List, Optional @@ -7,6 +8,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload +from dstack._internal import settings from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HTTP_PORT from dstack._internal.core.errors import GatewayError from dstack._internal.core.models.backends.base import BackendType @@ -441,6 +443,40 @@ def _should_wait_for_other_nodes(run: Run, job: Job, job_model: JobModel) -> boo return False +def _patch_base_image_for_aws_efa( + job_spec: JobSpec, job_provisioning_data: JobProvisioningData +) -> str: + image_name = job_spec.image_name + + if job_provisioning_data.backend != BackendType.AWS: + return image_name + + instance_type = job_provisioning_data.instance_type.name + efa_enabled_patterns = [ + r"^p6\.", + r"^p5\.", + r"^p5e\.", + r"^p4d\.", + r"^p4de\.", + r"^g6\.", + r"^g6e\.", + ] + + is_efa_enabled = any(re.match(pattern, instance_type) for pattern in efa_enabled_patterns) + if not is_efa_enabled: + return image_name + + if not image_name.startswith(f"{settings.DSTACK_BASE_IMAGE}:"): + return image_name + + if image_name.endswith("-base"): + return image_name[:-5] + "-devel-efa" + elif image_name.endswith("-devel"): + return image_name[:-6] + "-devel-efa" + + return image_name + + @runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1) def _process_provisioning_with_shim( ports: Dict[int, int], @@ -517,14 +553,14 @@ def _process_provisioning_with_shim( cpu = None memory = None network_mode = NetworkMode.HOST - + image_name = _patch_base_image_for_aws_efa(job_spec, job_provisioning_data) if shim_client.is_api_v2_supported(): shim_client.submit_task( task_id=job_model.id, name=job_model.job_name, registry_username=registry_username, registry_password=registry_password, - image_name=job_spec.image_name, + image_name=image_name, container_user=container_user, privileged=job_spec.privileged, gpu=gpu, @@ -545,7 +581,7 @@ def _process_provisioning_with_shim( submitted = shim_client.submit( username=registry_username, password=registry_password, - image_name=job_spec.image_name, + image_name=image_name, privileged=job_spec.privileged, container_name=job_model.job_name, container_user=container_user, diff --git a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py index a19003842c..c212785226 100644 --- a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py +++ b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py @@ -1,22 +1,26 @@ from datetime import datetime, timedelta, timezone from pathlib import Path -from typing import Optional +from typing import TYPE_CHECKING, Optional from unittest.mock import MagicMock, Mock, patch import pytest from freezegun import freeze_time from sqlalchemy.ext.asyncio import AsyncSession +from dstack._internal import settings from dstack._internal.core.errors import SSHError from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.common import NetworkMode from dstack._internal.core.models.configurations import DevEnvironmentConfiguration -from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.instances import InstanceStatus, InstanceType from dstack._internal.core.models.profiles import StartupOrder, UtilizationPolicy +from dstack._internal.core.models.resources import ResourcesSpec from dstack._internal.core.models.runs import ( JobRuntimeData, + JobSpec, JobStatus, JobTerminationReason, + Requirements, RunStatus, ) from dstack._internal.core.models.volumes import ( @@ -24,8 +28,11 @@ VolumeMountPoint, VolumeStatus, ) -from dstack._internal.server import settings -from dstack._internal.server.background.tasks.process_running_jobs import process_running_jobs +from dstack._internal.server import settings as server_settings +from dstack._internal.server.background.tasks.process_running_jobs import ( + _patch_base_image_for_aws_efa, + process_running_jobs, +) from dstack._internal.server.schemas.runner import ( HealthcheckResponse, JobStateEvent, @@ -56,6 +63,9 @@ pytestmark = pytest.mark.usefixtures("image_config_mock") +if TYPE_CHECKING: + from dstack._internal.core.models.runs import JobProvisioningData + @pytest.fixture def ssh_tunnel_mock(monkeypatch: pytest.MonkeyPatch) -> Mock: @@ -221,7 +231,7 @@ async def test_updates_running_job(self, test_db, session: AsyncSession, tmp_pat patch( "dstack._internal.server.services.runner.client.RunnerClient" ) as RunnerClientMock, - patch.object(settings, "SERVER_DIR_PATH", tmp_path), + patch.object(server_settings, "SERVER_DIR_PATH", tmp_path), ): runner_client_mock = RunnerClientMock.return_value runner_client_mock.pull.return_value = PullResponse( @@ -878,3 +888,132 @@ async def test_master_job_waits_for_workers(self, test_db, session: AsyncSession await process_running_jobs() await session.refresh(master_job) assert master_job.status == JobStatus.RUNNING + + +class TestPatchBaseImageForAwsEfa: + @staticmethod + def _create_job_spec(image_name: str) -> "JobSpec": + return JobSpec( + job_num=0, + job_name="test-job", + commands=["echo hello"], + env={}, + image_name=image_name, + requirements=Requirements(resources=ResourcesSpec()), + ) + + @staticmethod + def _create_job_provisioning_data_with_instance_type( + backend: BackendType, instance_type: str + ) -> "JobProvisioningData": + job_provisioning_data = get_job_provisioning_data(backend=backend) + job_provisioning_data.instance_type = InstanceType( + name=instance_type, + resources=job_provisioning_data.instance_type.resources, + ) + return job_provisioning_data + + @staticmethod + def _call_patch_base_image_for_aws_efa( + image_name: str, backend: BackendType, instance_type: str + ) -> str: + job_spec = TestPatchBaseImageForAwsEfa._create_job_spec(image_name) + job_provisioning_data = ( + TestPatchBaseImageForAwsEfa._create_job_provisioning_data_with_instance_type( + backend, instance_type + ) + ) + return _patch_base_image_for_aws_efa(job_spec, job_provisioning_data) + + @pytest.mark.parametrize( + "suffix,instance_type", + [ + ("-base", "p6.xlarge"), + ("-devel", "p5.48xlarge"), + ], + ) + def test_patch_aws_efa_instance_with_suffix(self, suffix: str, instance_type: str): + image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}" + result = self._call_patch_base_image_for_aws_efa( + image_name, BackendType.AWS, instance_type + ) + expected = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-devel-efa" + assert result == expected + + @pytest.mark.parametrize("suffix", ["-base", "-devel"]) + @pytest.mark.parametrize( + "instance_type", + [ + "p6.xlarge", + "p6.2xlarge", + "p5.xlarge", + "p5.48xlarge", + "p5e.xlarge", + "p4d.24xlarge", + "p4de.24xlarge", + "g6.xlarge", + "g6e.xlarge", + ], + ) + def test_patch_all_efa_instance_types(self, instance_type: str, suffix: str): + image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}" + result = self._call_patch_base_image_for_aws_efa( + image_name, BackendType.AWS, instance_type + ) + expected = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-devel-efa" + assert result == expected + + @pytest.mark.parametrize("suffix", ["-base", "-devel"]) + @pytest.mark.parametrize( + "backend", + [BackendType.GCP, BackendType.AZURE, BackendType.LAMBDA, BackendType.LOCAL], + ) + @pytest.mark.parametrize( + "instance_type", + [ + "standard-4", + "p5.xlarge", + "p6.2xlarge", + "g6.xlarge", + ], # Mix of generic and EFA-named types + ) + def test_no_patch_non_aws_backends( + self, backend: BackendType, suffix: str, instance_type: str + ): + image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}" + result = self._call_patch_base_image_for_aws_efa(image_name, backend, instance_type) + assert result == image_name + + @pytest.mark.parametrize("suffix", ["-base", "-devel"]) + @pytest.mark.parametrize( + "instance_type", + ["t3.micro", "m5.large", "c5.xlarge", "r5.2xlarge", "m6i.large"], + ) + def test_no_patch_non_efa_aws_instances(self, instance_type: str, suffix: str): + image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}" + result = self._call_patch_base_image_for_aws_efa( + image_name, BackendType.AWS, instance_type + ) + assert result == image_name + + @pytest.mark.parametrize( + "instance_type", + ["p5.xlarge", "p6.2xlarge", "t3.micro", "m5.large"], # Mix of EFA and non-EFA instances + ) + @pytest.mark.parametrize( + "image_name", + [ + "ubuntu:20.04", + "nvidia/cuda:11.8-runtime-ubuntu20.04", + "python:3.9-slim", + "custom/image:latest", + f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-custom", + f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-devel-efa", + f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}", + ], + ) + def test_no_patch_other_images(self, instance_type: str, image_name: str): + result = self._call_patch_base_image_for_aws_efa( + image_name, BackendType.AWS, instance_type + ) + assert result == image_name From 23586834e7dfb12dcef81799bcd745e18dbb8507 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sun, 15 Jun 2025 15:22:31 +0300 Subject: [PATCH 40/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- .../_internal/core/models/configurations.py | 27 +++-- .../services/jobs/configurators/base.py | 17 +-- .../core/models/test_configurations.py | 100 ++++++++++++++++++ .../tasks/test_process_running_jobs.py | 2 +- 4 files changed, 118 insertions(+), 28 deletions(-) diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index bc6ba3235c..0e3750f624 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -4,6 +4,7 @@ from pathlib import PurePosixPath from typing import Any, Dict, List, Optional, Union +from packaging import version from pydantic import Field, ValidationError, conint, constr, root_validator, validator from typing_extensions import Annotated, Literal @@ -35,14 +36,6 @@ class RunConfigurationType(str, Enum): SERVICE = "service" -class PythonVersion(str, Enum): - PY39 = "3.9" - PY310 = "3.10" - PY311 = "3.11" - PY312 = "3.12" - PY313 = "3.13" - - class PortMapping(CoreModel): local_port: Optional[ValidPort] = None container_port: ValidPort @@ -193,8 +186,8 @@ class BaseRunConfiguration(CoreModel): Optional[RegistryAuth], Field(description="Credentials for pulling a private Docker image") ] = None python: Annotated[ - Optional[PythonVersion], - Field(description="The major version of Python. Mutually exclusive with `image`"), + Optional[str], + Field(description="The version of Python. Mutually exclusive with `image`"), ] = None nvcc: Annotated[ Optional[bool], @@ -248,16 +241,22 @@ class BaseRunConfiguration(CoreModel): setup: CommandsList = [] @validator("python", pre=True, always=True) - def convert_python(cls, v, values) -> Optional[PythonVersion]: + def convert_python(cls, v, values) -> Optional[str]: if v is not None and values.get("image"): - raise KeyError("`image` and `python` are mutually exclusive fields") + raise ValueError("`image` and `python` are mutually exclusive fields") + if v is None: + return None if isinstance(v, float): v = str(v) if v == "3.1": v = "3.10" if isinstance(v, str): - return PythonVersion(v) - return v + try: + version.parse(v) + return v + except version.InvalidVersion: + raise ValueError(f"Invalid Python version format: {v}") + raise ValueError(f"Python version must be a string, got {type(v)}") @validator("volumes", each_item=True) def convert_volumes(cls, v) -> MountPoint: diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py index 4c6d7ca973..2f2071d30c 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/base.py +++ b/src/dstack/_internal/server/services/jobs/configurators/base.py @@ -12,7 +12,6 @@ from dstack._internal.core.models.configurations import ( DEFAULT_REPO_DIR, PortMapping, - PythonVersion, RunConfigurationType, ) from dstack._internal.core.models.profiles import ( @@ -38,16 +37,8 @@ from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator -def get_default_python_verison() -> str: - version_info = sys.version_info - python_version_str = f"{version_info.major}.{version_info.minor}" - try: - return PythonVersion(python_version_str).value - except ValueError: - raise ServerClientError( - "Failed to use the system Python version. " - f"Python {python_version_str} is not supported." - ) +def get_default_python_version() -> str: + return sys.version.split()[0] def get_default_image(nvcc: bool = False) -> str: @@ -257,8 +248,8 @@ def _working_dir(self) -> Optional[str]: def _python(self) -> str: if self.run_spec.configuration.python is not None: - return self.run_spec.configuration.python.value - return get_default_python_verison() + return self.run_spec.configuration.python + return get_default_python_version() def _volumes(self, job_num: int) -> List[MountPoint]: return interpolate_job_volumes(self.run_spec.configuration.volumes, job_num) diff --git a/src/tests/_internal/core/models/test_configurations.py b/src/tests/_internal/core/models/test_configurations.py index 0f081f615e..fa25501c52 100644 --- a/src/tests/_internal/core/models/test_configurations.py +++ b/src/tests/_internal/core/models/test_configurations.py @@ -1,3 +1,4 @@ +import re from typing import Any, Optional import pytest @@ -72,6 +73,105 @@ def test_shell_invalid(self): ): parse_run_configuration(conf) + @pytest.mark.parametrize( + "python_version", + [ + # Basic versions + "3.9", + "3.10.5", + # Pre-releases + "3.9a1", + "3.10b2", + "3.11rc1", + # Post releases + "3.9.0post1", + # Development versions + "3.10.dev0", + # Local versions + "3.9+local", + ], + ) + def test_python_version_valid(self, python_version: str): + conf = { + "type": "task", + "python": python_version, + "commands": ["python --version"], + } + parsed_conf = parse_run_configuration(conf) + assert parsed_conf.python == python_version + + @pytest.mark.parametrize( + "python_version", + [ + "python3.9", + "3.9.x", + "", + "latest", + "stable", + ], + ) + def test_python_version_invalid(self, python_version: str): + conf = { + "type": "task", + "python": python_version, + "commands": ["python --version"], + } + with pytest.raises( + ConfigurationError, match=f"Invalid Python version format: {re.escape(python_version)}" + ): + parse_run_configuration(conf) + + def test_python_version_float_conversion(self): + # Test the special case where 3.10 becomes 3.10 + conf = { + "type": "task", + "python": 3.10, + "commands": ["python --version"], + } + parsed_conf = parse_run_configuration(conf) + assert parsed_conf.python == "3.10" + + # Test other float versions + conf = { + "type": "task", + "python": 3.9, + "commands": ["python --version"], + } + parsed_conf = parse_run_configuration(conf) + assert parsed_conf.python == "3.9" + + def test_python_version_mutually_exclusive_with_image(self): + conf = { + "type": "task", + "python": "3.9", + "image": "python:3.9", + "commands": ["python --version"], + } + with pytest.raises( + ConfigurationError, match="`image` and `python` are mutually exclusive fields" + ): + parse_run_configuration(conf) + + def test_python_version_none(self): + conf = { + "type": "task", + "python": None, + "commands": ["echo hello"], + } + parsed_conf = parse_run_configuration(conf) + assert parsed_conf.python is None + + def test_python_version_wrong_type(self): + conf = { + "type": "task", + "python": ["3.9"], # Wrong type - should be string + "commands": ["python --version"], + } + with pytest.raises( + ConfigurationError, match="Python version must be a string, got " + ): + parse_run_configuration(conf) + def test_registry_auth_hashable(): """ diff --git a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py index c212785226..fbc4740e12 100644 --- a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py +++ b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py @@ -319,7 +319,7 @@ async def test_provisioning_shim_with_volumes( job_provisioning_data = get_job_provisioning_data(dockerized=True) with patch( - "dstack._internal.server.services.jobs.configurators.base.get_default_python_verison" + "dstack._internal.server.services.jobs.configurators.base.get_default_python_version" ) as PyVersion: PyVersion.return_value = "3.13" job = await create_job( From 27520ebce6d8016bf1781c5f25b777f487808060 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sun, 15 Jun 2025 19:58:51 +0300 Subject: [PATCH 41/54] [UX] Pre-build a EFA version of the default Docker image #2793 --- examples/clusters/nccl-tests/.dstack.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/clusters/nccl-tests/.dstack.yml b/examples/clusters/nccl-tests/.dstack.yml index dc399534f4..164148b3c7 100644 --- a/examples/clusters/nccl-tests/.dstack.yml +++ b/examples/clusters/nccl-tests/.dstack.yml @@ -15,7 +15,6 @@ commands: --hostfile $DSTACK_MPI_HOSTFILE \ -n $DSTACK_GPUS_NUM \ -N $DSTACK_GPUS_PER_NODE \ - --mca btl_tcp_if_exclude lo,docker0 \ --bind-to none \ /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 else From bdaa059b667270a8e69620ee9dc0d7f5f0153a2a Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 16 Jun 2025 12:17:21 +0300 Subject: [PATCH 42/54] [UX] Pre-build a EFA version of the default Docker image #2793 Roll-backed removing PythonVersion enum; Minor fixes --- .../_internal/core/models/configurations.py | 27 ++--- .../services/jobs/configurators/base.py | 17 ++- .../core/models/test_configurations.py | 100 ------------------ .../tasks/test_process_running_jobs.py | 10 +- 4 files changed, 31 insertions(+), 123 deletions(-) diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 0e3750f624..bc6ba3235c 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -4,7 +4,6 @@ from pathlib import PurePosixPath from typing import Any, Dict, List, Optional, Union -from packaging import version from pydantic import Field, ValidationError, conint, constr, root_validator, validator from typing_extensions import Annotated, Literal @@ -36,6 +35,14 @@ class RunConfigurationType(str, Enum): SERVICE = "service" +class PythonVersion(str, Enum): + PY39 = "3.9" + PY310 = "3.10" + PY311 = "3.11" + PY312 = "3.12" + PY313 = "3.13" + + class PortMapping(CoreModel): local_port: Optional[ValidPort] = None container_port: ValidPort @@ -186,8 +193,8 @@ class BaseRunConfiguration(CoreModel): Optional[RegistryAuth], Field(description="Credentials for pulling a private Docker image") ] = None python: Annotated[ - Optional[str], - Field(description="The version of Python. Mutually exclusive with `image`"), + Optional[PythonVersion], + Field(description="The major version of Python. Mutually exclusive with `image`"), ] = None nvcc: Annotated[ Optional[bool], @@ -241,22 +248,16 @@ class BaseRunConfiguration(CoreModel): setup: CommandsList = [] @validator("python", pre=True, always=True) - def convert_python(cls, v, values) -> Optional[str]: + def convert_python(cls, v, values) -> Optional[PythonVersion]: if v is not None and values.get("image"): - raise ValueError("`image` and `python` are mutually exclusive fields") - if v is None: - return None + raise KeyError("`image` and `python` are mutually exclusive fields") if isinstance(v, float): v = str(v) if v == "3.1": v = "3.10" if isinstance(v, str): - try: - version.parse(v) - return v - except version.InvalidVersion: - raise ValueError(f"Invalid Python version format: {v}") - raise ValueError(f"Python version must be a string, got {type(v)}") + return PythonVersion(v) + return v @validator("volumes", each_item=True) def convert_volumes(cls, v) -> MountPoint: diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py index 2f2071d30c..4c6d7ca973 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/base.py +++ b/src/dstack/_internal/server/services/jobs/configurators/base.py @@ -12,6 +12,7 @@ from dstack._internal.core.models.configurations import ( DEFAULT_REPO_DIR, PortMapping, + PythonVersion, RunConfigurationType, ) from dstack._internal.core.models.profiles import ( @@ -37,8 +38,16 @@ from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator -def get_default_python_version() -> str: - return sys.version.split()[0] +def get_default_python_verison() -> str: + version_info = sys.version_info + python_version_str = f"{version_info.major}.{version_info.minor}" + try: + return PythonVersion(python_version_str).value + except ValueError: + raise ServerClientError( + "Failed to use the system Python version. " + f"Python {python_version_str} is not supported." + ) def get_default_image(nvcc: bool = False) -> str: @@ -248,8 +257,8 @@ def _working_dir(self) -> Optional[str]: def _python(self) -> str: if self.run_spec.configuration.python is not None: - return self.run_spec.configuration.python - return get_default_python_version() + return self.run_spec.configuration.python.value + return get_default_python_verison() def _volumes(self, job_num: int) -> List[MountPoint]: return interpolate_job_volumes(self.run_spec.configuration.volumes, job_num) diff --git a/src/tests/_internal/core/models/test_configurations.py b/src/tests/_internal/core/models/test_configurations.py index fa25501c52..0f081f615e 100644 --- a/src/tests/_internal/core/models/test_configurations.py +++ b/src/tests/_internal/core/models/test_configurations.py @@ -1,4 +1,3 @@ -import re from typing import Any, Optional import pytest @@ -73,105 +72,6 @@ def test_shell_invalid(self): ): parse_run_configuration(conf) - @pytest.mark.parametrize( - "python_version", - [ - # Basic versions - "3.9", - "3.10.5", - # Pre-releases - "3.9a1", - "3.10b2", - "3.11rc1", - # Post releases - "3.9.0post1", - # Development versions - "3.10.dev0", - # Local versions - "3.9+local", - ], - ) - def test_python_version_valid(self, python_version: str): - conf = { - "type": "task", - "python": python_version, - "commands": ["python --version"], - } - parsed_conf = parse_run_configuration(conf) - assert parsed_conf.python == python_version - - @pytest.mark.parametrize( - "python_version", - [ - "python3.9", - "3.9.x", - "", - "latest", - "stable", - ], - ) - def test_python_version_invalid(self, python_version: str): - conf = { - "type": "task", - "python": python_version, - "commands": ["python --version"], - } - with pytest.raises( - ConfigurationError, match=f"Invalid Python version format: {re.escape(python_version)}" - ): - parse_run_configuration(conf) - - def test_python_version_float_conversion(self): - # Test the special case where 3.10 becomes 3.10 - conf = { - "type": "task", - "python": 3.10, - "commands": ["python --version"], - } - parsed_conf = parse_run_configuration(conf) - assert parsed_conf.python == "3.10" - - # Test other float versions - conf = { - "type": "task", - "python": 3.9, - "commands": ["python --version"], - } - parsed_conf = parse_run_configuration(conf) - assert parsed_conf.python == "3.9" - - def test_python_version_mutually_exclusive_with_image(self): - conf = { - "type": "task", - "python": "3.9", - "image": "python:3.9", - "commands": ["python --version"], - } - with pytest.raises( - ConfigurationError, match="`image` and `python` are mutually exclusive fields" - ): - parse_run_configuration(conf) - - def test_python_version_none(self): - conf = { - "type": "task", - "python": None, - "commands": ["echo hello"], - } - parsed_conf = parse_run_configuration(conf) - assert parsed_conf.python is None - - def test_python_version_wrong_type(self): - conf = { - "type": "task", - "python": ["3.9"], # Wrong type - should be string - "commands": ["python --version"], - } - with pytest.raises( - ConfigurationError, match="Python version must be a string, got " - ): - parse_run_configuration(conf) - def test_registry_auth_hashable(): """ diff --git a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py index fbc4740e12..5f73d0fdc3 100644 --- a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py +++ b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py @@ -1,6 +1,6 @@ from datetime import datetime, timedelta, timezone from pathlib import Path -from typing import TYPE_CHECKING, Optional +from typing import Optional from unittest.mock import MagicMock, Mock, patch import pytest @@ -16,6 +16,7 @@ from dstack._internal.core.models.profiles import StartupOrder, UtilizationPolicy from dstack._internal.core.models.resources import ResourcesSpec from dstack._internal.core.models.runs import ( + JobProvisioningData, JobRuntimeData, JobSpec, JobStatus, @@ -63,9 +64,6 @@ pytestmark = pytest.mark.usefixtures("image_config_mock") -if TYPE_CHECKING: - from dstack._internal.core.models.runs import JobProvisioningData - @pytest.fixture def ssh_tunnel_mock(monkeypatch: pytest.MonkeyPatch) -> Mock: @@ -319,7 +317,7 @@ async def test_provisioning_shim_with_volumes( job_provisioning_data = get_job_provisioning_data(dockerized=True) with patch( - "dstack._internal.server.services.jobs.configurators.base.get_default_python_version" + "dstack._internal.server.services.jobs.configurators.base.get_default_python_verison" ) as PyVersion: PyVersion.return_value = "3.13" job = await create_job( @@ -905,7 +903,7 @@ def _create_job_spec(image_name: str) -> "JobSpec": @staticmethod def _create_job_provisioning_data_with_instance_type( backend: BackendType, instance_type: str - ) -> "JobProvisioningData": + ) -> JobProvisioningData: job_provisioning_data = get_job_provisioning_data(backend=backend) job_provisioning_data.instance_type = InstanceType( name=instance_type, From 035ecede924ae79d4b59c65b4caf6f4c51ece418 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 16 Jun 2025 12:51:42 +0300 Subject: [PATCH 43/54] [UX] Pre-build a EFA version of the default Docker image #2793 PR review comments --- .../background/tasks/process_running_jobs.py | 68 +++++++++---------- .../services/jobs/configurators/base.py | 7 ++ 2 files changed, 41 insertions(+), 34 deletions(-) diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py index 23f151454e..71ae2c066f 100644 --- a/src/dstack/_internal/server/background/tasks/process_running_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_running_jobs.py @@ -443,40 +443,6 @@ def _should_wait_for_other_nodes(run: Run, job: Job, job_model: JobModel) -> boo return False -def _patch_base_image_for_aws_efa( - job_spec: JobSpec, job_provisioning_data: JobProvisioningData -) -> str: - image_name = job_spec.image_name - - if job_provisioning_data.backend != BackendType.AWS: - return image_name - - instance_type = job_provisioning_data.instance_type.name - efa_enabled_patterns = [ - r"^p6\.", - r"^p5\.", - r"^p5e\.", - r"^p4d\.", - r"^p4de\.", - r"^g6\.", - r"^g6e\.", - ] - - is_efa_enabled = any(re.match(pattern, instance_type) for pattern in efa_enabled_patterns) - if not is_efa_enabled: - return image_name - - if not image_name.startswith(f"{settings.DSTACK_BASE_IMAGE}:"): - return image_name - - if image_name.endswith("-base"): - return image_name[:-5] + "-devel-efa" - elif image_name.endswith("-devel"): - return image_name[:-6] + "-devel-efa" - - return image_name - - @runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1) def _process_provisioning_with_shim( ports: Dict[int, int], @@ -1005,3 +971,37 @@ def _get_instance_specific_gpu_devices( GPUDevice(path_on_host="/dev/nvidiactl", path_in_container="/dev/nvidiactl") ) return gpu_devices + + +def _patch_base_image_for_aws_efa( + job_spec: JobSpec, job_provisioning_data: JobProvisioningData +) -> str: + image_name = job_spec.image_name + + if job_provisioning_data.backend != BackendType.AWS: + return image_name + + instance_type = job_provisioning_data.instance_type.name + efa_enabled_patterns = [ + r"^p6\.", + r"^p5\.", + r"^p5e\.", + r"^p4d\.", + r"^p4de\.", + r"^g6\.", + r"^g6e\.", + ] + + is_efa_enabled = any(re.match(pattern, instance_type) for pattern in efa_enabled_patterns) + if not is_efa_enabled: + return image_name + + if not image_name.startswith(f"{settings.DSTACK_BASE_IMAGE}:"): + return image_name + + if image_name.endswith("-base"): + return image_name[:-5] + "-devel-efa" + elif image_name.endswith("-devel"): + return image_name[:-6] + "-devel-efa" + + return image_name diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py index 4c6d7ca973..fa3e9114b1 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/base.py +++ b/src/dstack/_internal/server/services/jobs/configurators/base.py @@ -51,6 +51,13 @@ def get_default_python_verison() -> str: def get_default_image(nvcc: bool = False) -> str: + """ + Note: May be overridden by dstack (e.g., EFA-enabled version for AWS EFA-capable instances). + See `dstack._internal.server.background.tasks.process_running_jobs._patch_base_image_for_aws_efa` for details. + + Args: + nvcc: If True, returns 'devel' variant, otherwise 'base'. + """ return f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-{'devel' if nvcc else 'base'}" From 57ab13d9ac5aaa51026e068780c8b80641f81754 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 16 Jun 2025 22:34:54 +0300 Subject: [PATCH 44/54] Experimental: Build default Docker images for both Ubuntu versions 20.04 and 22.04 --- .github/workflows/docker.yml | 3 ++- docker/base/Dockerfile | 5 +++-- docker/base/Dockerfile.common | 6 ++++-- .../server/background/tasks/process_running_jobs.py | 8 ++++---- .../server/services/jobs/configurators/base.py | 2 +- src/dstack/_internal/settings.py | 3 +++ src/dstack/version.py | 1 + .../background/tasks/test_process_running_jobs.py | 10 +++++----- 8 files changed, 23 insertions(+), 15 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 97be8253e0..889970d1ea 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -52,6 +52,7 @@ jobs: strategy: matrix: flavor: ["base", "devel", "devel-efa"] + ubuntu_version: ["20", "22"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -75,7 +76,7 @@ jobs: fi docker buildx build \ --platform linux/amd64 \ - --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ inputs.image_version }}-${{ matrix.flavor }} \ + --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ inputs.image_version }}-${{ matrix.flavor }}-ubuntu${{ matrix.ubuntu_version }}.04 \ --build-arg FLAVOR=${{ matrix.flavor }} \ --provenance=false \ --push \ diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 94812a242a..a42788984a 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -1,7 +1,8 @@ # syntax = edrevo/dockerfile-plus +ARG UBUNTU_VERSION # Build stage -FROM nvidia/cuda:12.1.1-base-ubuntu20.04 AS builder +FROM nvidia/cuda:12.1.1-base-ubuntu${UBUNTU_VERSION}.04 AS builder ENV NCCL_HOME=/opt/nccl ENV CUDA_HOME=/usr/local/cuda @@ -10,7 +11,7 @@ ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi # Prerequisites RUN export DEBIAN_FRONTEND=noninteractive \ - && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \ && apt-get update --fix-missing \ && apt-get upgrade -y \ && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common index 1ddd2e227c..9fbe5d4aff 100644 --- a/docker/base/Dockerfile.common +++ b/docker/base/Dockerfile.common @@ -1,4 +1,6 @@ -FROM nvidia/cuda:12.1.1-base-ubuntu20.04 +ARG UBUNTU_VERSION + +FROM nvidia/cuda:12.1.1-base-ubuntu${UBUNTU_VERSION}.04 ARG _UV_HOME="/opt/uv" @@ -9,7 +11,7 @@ ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH="${UV_INSTALL_DIR}:${PATH}" RUN export DEBIAN_FRONTEND=noninteractive \ - && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \ && apt-get update --fix-missing \ && apt-get upgrade -y \ && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py index 71ae2c066f..6eeffc66f6 100644 --- a/src/dstack/_internal/server/background/tasks/process_running_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_running_jobs.py @@ -999,9 +999,9 @@ def _patch_base_image_for_aws_efa( if not image_name.startswith(f"{settings.DSTACK_BASE_IMAGE}:"): return image_name - if image_name.endswith("-base"): - return image_name[:-5] + "-devel-efa" - elif image_name.endswith("-devel"): - return image_name[:-6] + "-devel-efa" + if image_name.endswith(f"-base-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}"): + return image_name[:-17] + f"-devel-efa-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" + elif image_name.endswith(f"-devel-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}"): + return image_name[:-18] + f"-devel-efa-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" return image_name diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py index fa3e9114b1..6ef0ca7712 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/base.py +++ b/src/dstack/_internal/server/services/jobs/configurators/base.py @@ -58,7 +58,7 @@ def get_default_image(nvcc: bool = False) -> str: Args: nvcc: If True, returns 'devel' variant, otherwise 'base'. """ - return f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-{'devel' if nvcc else 'base'}" + return f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-{'devel' if nvcc else 'base'}-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" class JobConfigurator(ABC): diff --git a/src/dstack/_internal/settings.py b/src/dstack/_internal/settings.py index 92a7326a0c..2636a3b362 100644 --- a/src/dstack/_internal/settings.py +++ b/src/dstack/_internal/settings.py @@ -14,6 +14,9 @@ DSTACK_BASE_IMAGE = os.getenv("DSTACK_BASE_IMAGE", "dstackai/base") DSTACK_BASE_IMAGE_VERSION = os.getenv("DSTACK_BASE_IMAGE_VERSION", version.base_image) +DSTACK_BASE_IMAGE_UBUNTU_VERSION = os.getenv( + "DSTACK_BASE_IMAGE_UBUNTU_VERSION", version.base_image_ubuntu_version +) class FeatureFlags: diff --git a/src/dstack/version.py b/src/dstack/version.py index 80e1b21d2a..b7c06d6962 100644 --- a/src/dstack/version.py +++ b/src/dstack/version.py @@ -1,3 +1,4 @@ __version__ = "0.0.0" __is_release__ = False base_image = "0.10" +base_image_ubuntu_version = "20.04" diff --git a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py index 5f73d0fdc3..a8181f8da4 100644 --- a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py +++ b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py @@ -931,11 +931,11 @@ def _call_patch_base_image_for_aws_efa( ], ) def test_patch_aws_efa_instance_with_suffix(self, suffix: str, instance_type: str): - image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}" + image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" result = self._call_patch_base_image_for_aws_efa( image_name, BackendType.AWS, instance_type ) - expected = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-devel-efa" + expected = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-devel-efa-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" assert result == expected @pytest.mark.parametrize("suffix", ["-base", "-devel"]) @@ -954,11 +954,11 @@ def test_patch_aws_efa_instance_with_suffix(self, suffix: str, instance_type: st ], ) def test_patch_all_efa_instance_types(self, instance_type: str, suffix: str): - image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}" + image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" result = self._call_patch_base_image_for_aws_efa( image_name, BackendType.AWS, instance_type ) - expected = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-devel-efa" + expected = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}-devel-efa-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" assert result == expected @pytest.mark.parametrize("suffix", ["-base", "-devel"]) @@ -978,7 +978,7 @@ def test_patch_all_efa_instance_types(self, instance_type: str, suffix: str): def test_no_patch_non_aws_backends( self, backend: BackendType, suffix: str, instance_type: str ): - image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}" + image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}" result = self._call_patch_base_image_for_aws_efa(image_name, backend, instance_type) assert result == image_name From 8acfff9c96315865e8dee805bb274047c91590a7 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 16 Jun 2025 22:41:33 +0300 Subject: [PATCH 45/54] Experimental: Build default Docker images for both Ubuntu versions 20.04 and 22.04 --- docker/base/Dockerfile | 2 ++ docker/base/efa/Dockerfile | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index a42788984a..e957d33758 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -53,6 +53,8 @@ RUN cd /opt \ # Final stage +ARG UBUNTU_VERSION + INCLUDE+ base/Dockerfile.common ENV NCCL_HOME=/opt/nccl diff --git a/docker/base/efa/Dockerfile b/docker/base/efa/Dockerfile index 50b6c1c5ef..3cc6b8a521 100644 --- a/docker/base/efa/Dockerfile +++ b/docker/base/efa/Dockerfile @@ -1,5 +1,7 @@ # syntax = edrevo/dockerfile-plus +ARG UBUNTU_VERSION + INCLUDE+ base/Dockerfile.common ENV NCCL_HOME=/usr/local From 1b1c02b3afe908520277a88db73c9c66c1238879 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 16 Jun 2025 22:55:11 +0300 Subject: [PATCH 46/54] Experimental: Build default Docker images for both Ubuntu versions 20.04 and 22.04 --- .github/workflows/docker.yml | 1 + docker/base/Dockerfile | 2 -- docker/base/efa/Dockerfile | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 889970d1ea..898e76f0cc 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -78,6 +78,7 @@ jobs: --platform linux/amd64 \ --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ inputs.image_version }}-${{ matrix.flavor }}-ubuntu${{ matrix.ubuntu_version }}.04 \ --build-arg FLAVOR=${{ matrix.flavor }} \ + --build-arg UBUNTU_VERSION=${{ matrix.ubuntu_version }} \ --provenance=false \ --push \ -f $FILE . diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index e957d33758..a42788984a 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -53,8 +53,6 @@ RUN cd /opt \ # Final stage -ARG UBUNTU_VERSION - INCLUDE+ base/Dockerfile.common ENV NCCL_HOME=/opt/nccl diff --git a/docker/base/efa/Dockerfile b/docker/base/efa/Dockerfile index 3cc6b8a521..50b6c1c5ef 100644 --- a/docker/base/efa/Dockerfile +++ b/docker/base/efa/Dockerfile @@ -1,7 +1,5 @@ # syntax = edrevo/dockerfile-plus -ARG UBUNTU_VERSION - INCLUDE+ base/Dockerfile.common ENV NCCL_HOME=/usr/local From 78dc09487ddba0aa3b0c30ad376e4b37bd204bc7 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Tue, 17 Jun 2025 10:41:41 +0300 Subject: [PATCH 47/54] Experimental: Build default Docker images for both Ubuntu versions 20.04 and 22.04 --- src/dstack/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dstack/version.py b/src/dstack/version.py index b7c06d6962..d4255a2301 100644 --- a/src/dstack/version.py +++ b/src/dstack/version.py @@ -1,4 +1,4 @@ __version__ = "0.0.0" __is_release__ = False base_image = "0.10" -base_image_ubuntu_version = "20.04" +base_image_ubuntu_version = "22.04" From 136716707cd39c6bce0fc4a939ecff48919adf66 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Tue, 17 Jun 2025 12:00:43 +0300 Subject: [PATCH 48/54] Added `OMPI_MCA_pml`, `OMPI_MCA_btl`, `OMPI_MCA_btl_tcp_if_exclude`, and `NCCL_SOCKET_IFNAME` to the base images --- docker/base/Dockerfile.common | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common index 9fbe5d4aff..ae76f30124 100644 --- a/docker/base/Dockerfile.common +++ b/docker/base/Dockerfile.common @@ -10,6 +10,11 @@ ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH="${UV_INSTALL_DIR}:${PATH}" +ENV OMPI_MCA_pml=^cm,ucx +ENV OMPI_MCA_btl=tcp,self +ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0 +ENV NCCL_SOCKET_IFNAME=^docker,lo + RUN export DEBIAN_FRONTEND=noninteractive \ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \ && apt-get update --fix-missing \ From 6d5ceb22ae330775d3d099c5a341cb0294b27c7c Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Tue, 17 Jun 2025 15:40:09 +0300 Subject: [PATCH 49/54] - [x] Added `OMPI_MCA_pml`, `OMPI_MCA_btl`, `OMPI_MCA_btl_tcp_if_exclude`, and `NCCL_SOCKET_IFNAME` to base images. - [x] Updated the list of EFA-enabled AWS EC2 instances --- docker/base/Dockerfile.common | 8 ++++---- .../server/background/tasks/process_running_jobs.py | 12 +++++++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common index ae76f30124..914e130613 100644 --- a/docker/base/Dockerfile.common +++ b/docker/base/Dockerfile.common @@ -10,10 +10,10 @@ ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH="${UV_INSTALL_DIR}:${PATH}" -ENV OMPI_MCA_pml=^cm,ucx -ENV OMPI_MCA_btl=tcp,self -ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0 -ENV NCCL_SOCKET_IFNAME=^docker,lo +export OMPI_MCA_pml=^cm,ucx +export OMPI_MCA_btl=tcp,self +export OMPI_MCA_btl_tcp_if_exclude=lo,docker0 +export NCCL_SOCKET_IFNAME=^docker,lo RUN export DEBIAN_FRONTEND=noninteractive \ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \ diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py index 6eeffc66f6..df2cb6cb47 100644 --- a/src/dstack/_internal/server/background/tasks/process_running_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_running_jobs.py @@ -983,13 +983,19 @@ def _patch_base_image_for_aws_efa( instance_type = job_provisioning_data.instance_type.name efa_enabled_patterns = [ - r"^p6\.", + # TODO: p6-b200 isn't supported yet in gpuhunt + r"^p6-b200\.", r"^p5\.", r"^p5e\.", + r"^p5en\.", r"^p4d\.", r"^p4de\.", - r"^g6\.", - r"^g6e\.", + r"^g6\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$", + r"^g6e\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$", + r"^gr6\.8xlarge$", + r"^g5\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$", + r"^g4dn\.(8xlarge|12xlarge|16xlarge|metal)$", + r"^p3dn\.", ] is_efa_enabled = any(re.match(pattern, instance_type) for pattern in efa_enabled_patterns) From 3894ba9715cf98a4f79c0784a5e9577e382342b9 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Tue, 17 Jun 2025 15:43:03 +0300 Subject: [PATCH 50/54] - [x] Added OMPI_MCA_pml, OMPI_MCA_btl, OMPI_MCA_btl_tcp_if_exclude, and NCCL_SOCKET_IFNAME to base images. (bugfix) --- docker/base/Dockerfile.common | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common index 914e130613..ae76f30124 100644 --- a/docker/base/Dockerfile.common +++ b/docker/base/Dockerfile.common @@ -10,10 +10,10 @@ ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH="${UV_INSTALL_DIR}:${PATH}" -export OMPI_MCA_pml=^cm,ucx -export OMPI_MCA_btl=tcp,self -export OMPI_MCA_btl_tcp_if_exclude=lo,docker0 -export NCCL_SOCKET_IFNAME=^docker,lo +ENV OMPI_MCA_pml=^cm,ucx +ENV OMPI_MCA_btl=tcp,self +ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0 +ENV NCCL_SOCKET_IFNAME=^docker,lo RUN export DEBIAN_FRONTEND=noninteractive \ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \ From a646ebcf016d7e2bde12ae6716305426777ec43b Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Tue, 17 Jun 2025 16:01:40 +0300 Subject: [PATCH 51/54] Fixed broken tests --- .../background/tasks/process_running_jobs.py | 14 +++++++------- .../background/tasks/test_process_running_jobs.py | 13 +++++-------- src/tests/_internal/server/routers/test_runs.py | 4 ++-- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py index df2cb6cb47..b834db39b9 100644 --- a/src/dstack/_internal/server/background/tasks/process_running_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_running_jobs.py @@ -984,18 +984,18 @@ def _patch_base_image_for_aws_efa( instance_type = job_provisioning_data.instance_type.name efa_enabled_patterns = [ # TODO: p6-b200 isn't supported yet in gpuhunt - r"^p6-b200\.", - r"^p5\.", - r"^p5e\.", - r"^p5en\.", - r"^p4d\.", - r"^p4de\.", + r"^p6-b200\.(48xlarge)$", + r"^p5\.(48xlarge)$", + r"^p5e\.(48xlarge)$", + r"^p5en\.(48xlarge)$", + r"^p4d\.(24xlarge)$", + r"^p4de\.(24xlarge)$", r"^g6\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$", r"^g6e\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$", r"^gr6\.8xlarge$", r"^g5\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$", r"^g4dn\.(8xlarge|12xlarge|16xlarge|metal)$", - r"^p3dn\.", + r"^p3dn\.(24xlarge)$", ] is_efa_enabled = any(re.match(pattern, instance_type) for pattern in efa_enabled_patterns) diff --git a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py index a8181f8da4..e290b1801f 100644 --- a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py +++ b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py @@ -338,7 +338,7 @@ async def test_provisioning_shim_with_volumes( name="test-run-0-0", registry_username="", registry_password="", - image_name="dstackai/base:0.10-base", + image_name="dstackai/base:0.10-base-ubuntu22.04", container_user="root", privileged=privileged, gpu=None, @@ -926,7 +926,7 @@ def _call_patch_base_image_for_aws_efa( @pytest.mark.parametrize( "suffix,instance_type", [ - ("-base", "p6.xlarge"), + ("-base", "p6-b200.48xlarge"), ("-devel", "p5.48xlarge"), ], ) @@ -942,15 +942,12 @@ def test_patch_aws_efa_instance_with_suffix(self, suffix: str, instance_type: st @pytest.mark.parametrize( "instance_type", [ - "p6.xlarge", - "p6.2xlarge", - "p5.xlarge", "p5.48xlarge", "p5e.xlarge", "p4d.24xlarge", "p4de.24xlarge", - "g6.xlarge", - "g6e.xlarge", + "g6.8xlarge", + "g6e.8xlarge", ], ) def test_patch_all_efa_instance_types(self, instance_type: str, suffix: str): @@ -985,7 +982,7 @@ def test_no_patch_non_aws_backends( @pytest.mark.parametrize("suffix", ["-base", "-devel"]) @pytest.mark.parametrize( "instance_type", - ["t3.micro", "m5.large", "c5.xlarge", "r5.2xlarge", "m6i.large"], + ["t3.micro", "m5.large", "c5.xlarge", "r5.2xlarge", "m6i.large", "g6.xlarge"], ) def test_no_patch_non_efa_aws_instances(self, instance_type: str, suffix: str): image_name = f"{settings.DSTACK_BASE_IMAGE}:{settings.DSTACK_BASE_IMAGE_VERSION}{suffix}" diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 9bedd934f9..3c2181a209 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -188,7 +188,7 @@ def get_dev_env_run_plan_dict( ], "env": {}, "home_dir": "/root", - "image_name": "dstackai/base:0.10-base", + "image_name": "dstackai/base:0.10-base-ubuntu22.04", "user": None, "privileged": privileged, "job_name": f"{run_name}-0-0", @@ -352,7 +352,7 @@ def get_dev_env_run_dict( ], "env": {}, "home_dir": "/root", - "image_name": "dstackai/base:0.10-base", + "image_name": "dstackai/base:0.10-base-ubuntu22.04", "user": None, "privileged": privileged, "job_name": f"{run_name}-0-0", From 48b4b9dd7b177b5c7d51eab13456a503dab9f017 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Tue, 17 Jun 2025 16:02:20 +0300 Subject: [PATCH 52/54] Removed Ubuntu 20.04 from Ci/CD --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 898e76f0cc..f4c97b5da1 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -52,7 +52,7 @@ jobs: strategy: matrix: flavor: ["base", "devel", "devel-efa"] - ubuntu_version: ["20", "22"] + ubuntu_version: ["22"] steps: - name: Checkout repository uses: actions/checkout@v4 From d8d755c225e302948314b858b4275414211bdbc2 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Tue, 17 Jun 2025 16:09:51 +0300 Subject: [PATCH 53/54] Fixed broken tests --- .../server/background/tasks/test_process_running_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py index e290b1801f..9f2310577a 100644 --- a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py +++ b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py @@ -943,7 +943,7 @@ def test_patch_aws_efa_instance_with_suffix(self, suffix: str, instance_type: st "instance_type", [ "p5.48xlarge", - "p5e.xlarge", + "p5e.24xlarge", "p4d.24xlarge", "p4de.24xlarge", "g6.8xlarge", From 49b7cb95288132245db2441e7542418a033c42da Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Tue, 17 Jun 2025 16:57:51 +0300 Subject: [PATCH 54/54] Fixed tests --- .../server/background/tasks/test_process_running_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py index 9f2310577a..59a08ddc4d 100644 --- a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py +++ b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py @@ -943,7 +943,7 @@ def test_patch_aws_efa_instance_with_suffix(self, suffix: str, instance_type: st "instance_type", [ "p5.48xlarge", - "p5e.24xlarge", + "p5e.48xlarge", "p4d.24xlarge", "p4de.24xlarge", "g6.8xlarge",