Skip to content

Commit 948c5e4

Browse files
Merge latest master into PR2
2 parents ae7ad90 + 0b53d25 commit 948c5e4

72 files changed

Lines changed: 1987 additions & 801 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/docker-efa.yml

Lines changed: 0 additions & 46 deletions
This file was deleted.

.github/workflows/docker.yml

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ jobs:
5151
runs-on: ubuntu-latest
5252
strategy:
5353
matrix:
54-
python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
55-
flavor: ["base", "devel"]
54+
flavor: ["base", "devel", "devel-efa"]
55+
ubuntu_version: ["22"]
5656
steps:
5757
- name: Checkout repository
5858
uses: actions/checkout@v4
@@ -67,7 +67,21 @@ jobs:
6767
uses: docker/setup-qemu-action@v3
6868
- name: Build and upload to DockerHub
6969
run: |
70-
docker buildx build --platform linux/amd64 --build-arg FLAVOR=${{ matrix.flavor }} --build-arg PYTHON=${{ matrix.python }} --push --provenance=false --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:py${{ matrix.python }}-${{ inputs.image_version }}-cuda-12.1${{ matrix.flavor == 'devel' && '-devel' || '' }} -f base/Dockerfile .
70+
if [ "${{ matrix.flavor }}" = "base" ]; then
71+
FILE="base/Dockerfile"
72+
elif [ "${{ matrix.flavor }}" = "devel" ]; then
73+
FILE="base/Dockerfile"
74+
else
75+
FILE="base/efa/Dockerfile"
76+
fi
77+
docker buildx build \
78+
--platform linux/amd64 \
79+
--tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ inputs.image_version }}-${{ matrix.flavor }}-ubuntu${{ matrix.ubuntu_version }}.04 \
80+
--build-arg FLAVOR=${{ matrix.flavor }} \
81+
--build-arg UBUNTU_VERSION=${{ matrix.ubuntu_version }} \
82+
--provenance=false \
83+
--push \
84+
-f $FILE .
7185
7286
build-aws-images:
7387
needs: build-docker

.github/workflows/release.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,10 +230,12 @@ jobs:
230230
- name: Upload pip package
231231
run: |
232232
VERSION=${GITHUB_REF#refs/tags/}
233-
BASE_IMAGE=$(cat src/dstack/version.py | grep base_image)
233+
BASE_IMAGE=$(cat src/dstack/version.py | grep "base_image = ")
234+
BASE_IMAGE_UBUNTU_VERSION=$(cat src/dstack/version.py | grep "base_image_ubuntu_version = ")
234235
echo "__version__ = \"$VERSION\"" > src/dstack/version.py
235236
echo "__is_release__ = True" >> src/dstack/version.py
236237
echo $BASE_IMAGE >> src/dstack/version.py
238+
echo $BASE_IMAGE_UBUNTU_VERSION >> src/dstack/version.py
237239
cp README.md src
238240
uv build
239241
uv publish --username ${{ secrets.PYPI_USERNAME }} --password ${{ secrets.PYPI_PASSWORD }}

docker/base/Dockerfile

Lines changed: 78 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,79 @@
1+
# syntax = edrevo/dockerfile-plus
2+
ARG UBUNTU_VERSION
3+
4+
# Build stage
5+
FROM nvidia/cuda:12.1.1-base-ubuntu${UBUNTU_VERSION}.04 AS builder
6+
7+
ENV NCCL_HOME=/opt/nccl
8+
ENV CUDA_HOME=/usr/local/cuda
9+
ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi
10+
11+
# Prerequisites
12+
13+
RUN export DEBIAN_FRONTEND=noninteractive \
14+
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \
15+
&& apt-get update --fix-missing \
16+
&& apt-get upgrade -y \
17+
&& ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \
18+
&& apt-get install -y tzdata \
19+
&& dpkg-reconfigure --frontend noninteractive tzdata \
20+
&& cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
21+
&& apt-get install -y --no-install-recommends \
22+
cuda-libraries-dev-${cuda_version} \
23+
cuda-nvcc-${cuda_version} \
24+
libhwloc-dev \
25+
autoconf \
26+
automake \
27+
libtool \
28+
libopenmpi-dev \
29+
git \
30+
curl \
31+
python3 \
32+
build-essential
33+
34+
# NCCL
35+
36+
ARG NCCL_VERSION=2.26.2-1
37+
38+
RUN cd /tmp \
39+
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
40+
&& cd nccl \
41+
&& make -j$(nproc) src.build BUILDDIR=${NCCL_HOME}
42+
43+
# NCCL tests
44+
45+
RUN cd /opt \
46+
&& git clone https://github.com/NVIDIA/nccl-tests \
47+
&& cd nccl-tests \
48+
&& make -j$(nproc) \
49+
MPI=1 \
50+
MPI_HOME=${OPEN_MPI_PATH} \
51+
CUDA_HOME=${CUDA_HOME} \
52+
NCCL_HOME=${NCCL_HOME}
53+
54+
# Final stage
55+
56+
INCLUDE+ base/Dockerfile.common
57+
58+
ENV NCCL_HOME=/opt/nccl
59+
60+
COPY --from=builder ${NCCL_HOME} ${NCCL_HOME}
61+
COPY --from=builder /opt/nccl-tests/build /opt/nccl-tests/build
62+
163
ARG FLAVOR
2-
FROM nvidia/cuda:12.1.1-${FLAVOR}-ubuntu20.04
3-
4-
ARG PYTHON
5-
ARG _UV_HOME="/opt/uv"
6-
ENV UV_PYTHON="${PYTHON}"
7-
ENV UV_INSTALL_DIR="${_UV_HOME}/bin"
8-
ENV UV_PYTHON_INSTALL_DIR="${_UV_HOME}/python"
9-
ENV UV_PYTHON_BIN_DIR="${UV_PYTHON_INSTALL_DIR}/bin"
10-
ENV UV_MANAGED_PYTHON=1
11-
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
12-
13-
ENV PATH="${UV_INSTALL_DIR}:${UV_PYTHON_BIN_DIR}:${PATH}"
14-
15-
RUN export DEBIAN_FRONTEND=noninteractive && \
16-
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
17-
apt-get update --fix-missing && \
18-
apt-get upgrade -y && \
19-
ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \
20-
apt-get install -y tzdata && \
21-
dpkg-reconfigure --frontend noninteractive tzdata && \
22-
apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \
23-
libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags && \
24-
sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config && mkdir /run/sshd && \
25-
mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys && rm /etc/ssh/ssh_host_*
26-
27-
RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh && \
28-
uv python install --preview --default
64+
65+
# MPI, NVCC, and /etc/ld.so.conf.d
66+
67+
RUN apt-get update \
68+
&& apt-get install -y --no-install-recommends \
69+
openmpi-bin \
70+
&& if [ "$FLAVOR" = "devel" ]; then \
71+
cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
72+
&& apt-get install -y --no-install-recommends \
73+
cuda-libraries-dev-${cuda_version} \
74+
cuda-nvcc-${cuda_version} \
75+
libhwloc-dev; \
76+
fi \
77+
&& rm -rf /var/lib/apt/lists/* \
78+
&& echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \
79+
&& ldconfig

docker/base/Dockerfile.common

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
ARG UBUNTU_VERSION
2+
3+
FROM nvidia/cuda:12.1.1-base-ubuntu${UBUNTU_VERSION}.04
4+
5+
ARG _UV_HOME="/opt/uv"
6+
7+
ENV UV_INSTALL_DIR="${_UV_HOME}/bin"
8+
ENV UV_MANAGED_PYTHON=1
9+
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
10+
11+
ENV PATH="${UV_INSTALL_DIR}:${PATH}"
12+
13+
ENV OMPI_MCA_pml=^cm,ucx
14+
ENV OMPI_MCA_btl=tcp,self
15+
ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0
16+
ENV NCCL_SOCKET_IFNAME=^docker,lo
17+
18+
RUN export DEBIAN_FRONTEND=noninteractive \
19+
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \
20+
&& apt-get update --fix-missing \
21+
&& apt-get upgrade -y \
22+
&& ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \
23+
&& apt-get install -y tzdata \
24+
&& dpkg-reconfigure --frontend noninteractive tzdata \
25+
&& apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \
26+
libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags \
27+
&& rm -rf /var/lib/apt/lists/* \
28+
&& sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config \
29+
&& mkdir /run/sshd \
30+
&& mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \
31+
&& chmod 600 ~/.ssh/authorized_keys \
32+
&& rm /etc/ssh/ssh_host_*
33+
34+
RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh \
35+
&& uv python install --preview --default
Lines changed: 26 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
ARG BASE_IMAGE=dstackai/base:py3.12-0.7-cuda-12.1
1+
# syntax = edrevo/dockerfile-plus
22

3-
FROM ${BASE_IMAGE}
3+
INCLUDE+ base/Dockerfile.common
44

5-
ENV PREFIX=/usr/local
6-
ENV CUDA_PATH=/usr/local/cuda
5+
ENV NCCL_HOME=/usr/local
6+
ENV CUDA_HOME=/usr/local/cuda
77
ENV LIBFABRIC_PATH=/opt/amazon/efa
88
ENV OPEN_MPI_PATH=/opt/amazon/openmpi
99
ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}"
1010
ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}"
1111

12-
# prerequisites
12+
# Prerequisites
1313

1414
RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
1515
&& apt-get update \
@@ -19,61 +19,58 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
1919
libhwloc-dev \
2020
autoconf \
2121
automake \
22-
libtool
22+
libtool \
23+
&& rm -rf /var/lib/apt/lists/*
2324

2425
# EFA
2526

2627
ARG EFA_VERSION=1.38.1
2728

28-
RUN cd $HOME \
29+
RUN cd /tmp \
30+
&& apt-get update \
2931
&& curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \
3032
&& tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
3133
&& cd aws-efa-installer \
32-
&& ./efa_installer.sh -y --skip-kmod -g
34+
&& ./efa_installer.sh -y --skip-kmod -g \
35+
&& rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/*
3336

3437
# NCCL
3538

3639
ARG NCCL_VERSION=2.26.2-1
3740

38-
RUN cd $HOME \
41+
RUN cd /tmp \
3942
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
4043
&& cd nccl \
41-
&& make -j$(nproc) src.build BUILDDIR=${PREFIX}
44+
&& make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \
45+
&& rm -rf /tmp/nccl
4246

4347
# AWS OFI NCCL
4448

4549
ARG OFI_VERSION=1.14.0
4650

47-
RUN cd $HOME \
51+
RUN cd /tmp \
4852
&& git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \
4953
&& cd aws-ofi-nccl \
5054
&& ./autogen.sh \
5155
&& ./configure \
52-
--with-cuda=${CUDA_PATH} \
56+
--with-cuda=${CUDA_HOME} \
5357
--with-libfabric=${LIBFABRIC_PATH} \
5458
--with-mpi=${OPEN_MPI_PATH} \
55-
--with-cuda=${CUDA_PATH} \
56-
--with-nccl=${PREFIX} \
59+
--with-cuda=${CUDA_HOME} \
60+
--with-nccl=${NCCL_HOME} \
5761
--disable-tests \
58-
--prefix=${PREFIX} \
59-
&& make -j$(numproc) \
60-
&& make install
62+
--prefix=${NCCL_HOME} \
63+
&& make -j$(nproc) \
64+
&& make install \
65+
&& rm -rf /tmp/aws-ofi-nccl /var/lib/apt/lists/*
6166

6267
# NCCL Tests
6368

64-
RUN cd $HOME \
69+
RUN cd /opt \
6570
&& git clone https://github.com/NVIDIA/nccl-tests \
6671
&& cd nccl-tests \
67-
&& make -j$(numproc) \
72+
&& make -j$(nproc) \
6873
MPI=1 \
6974
MPI_HOME=${OPEN_MPI_PATH} \
70-
CUDA_HOME=${CUDA_PATH} \
71-
NCCL_HOME=${PREFIX}
72-
73-
ARG BUILD_DATE
74-
ARG IMAGE_NAME
75-
ARG DSTACK_REVISION
76-
77-
LABEL org.opencontainers.image.title="${IMAGE_NAME}"
78-
LABEL org.opencontainers.image.version="${EFA_VERSION}-${DSTACK_REVISION}"
79-
LABEL org.opencontainers.image.created="${BUILD_DATE}"
75+
CUDA_HOME=${CUDA_HOME} \
76+
NCCL_HOME=${NCCL_HOME}
File renamed without changes.
File renamed without changes.

docs/docs/concepts/fleets.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ This ensures all instances are provisioned with optimal inter-node connectivity.
7070
Note that EFA requires `public_ips` to be set to `false` in the `aws` backend configuration.
7171
Otherwise, instances are only connected by the default VPC subnet.
7272

73-
Refer to the [EFA](../../blog/posts/efa.md) example for more details.
73+
Refer to the [EFA](../../examples/clusters/efa/index.md) example for more details.
7474

7575
??? info "GCP"
7676
When you create a cloud fleet with GCP, for the A3 Mega and A3 High instance types, [GPUDirect-TCPXO and GPUDirect-TCPX :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot){:target="_blank"} networking is automatically configured.

docs/docs/guides/clusters.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ For cloud fleets, fast interconnect is currently supported only on the `aws`, `g
2222

2323
!!! info "Backend configuration"
2424
Note that EFA requires `public_ips` to be set to `false` in the `aws` backend configuration.
25-
Refer to the [EFA](../../blog/posts/efa.md) example for more details.
25+
Refer to the [EFA](../../examples/clusters/efa/index.md) example for more details.
2626

2727
=== "GCP"
2828
When you create a cloud fleet with GCP, for the A3 Mega and A3 High instance types, [GPUDirect-TCPXO and GPUDirect-TCPX :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot){:target="_blank"} networking is automatically configured.

0 commit comments

Comments
 (0)