Skip to content

Commit b09844d

Browse files
Rework default Docker images (#2799)
1 parent 5a5b134 commit b09844d

27 files changed

Lines changed: 366 additions & 237 deletions

File tree

.github/workflows/docker-efa.yml

Lines changed: 0 additions & 46 deletions
This file was deleted.

.github/workflows/docker.yml

Lines changed: 17 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -51,8 +51,8 @@ jobs:
5151
runs-on: ubuntu-latest
5252
strategy:
5353
matrix:
54-
python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
55-
flavor: ["base", "devel"]
54+
flavor: ["base", "devel", "devel-efa"]
55+
ubuntu_version: ["22"]
5656
steps:
5757
- name: Checkout repository
5858
uses: actions/checkout@v4
@@ -67,7 +67,21 @@ jobs:
6767
uses: docker/setup-qemu-action@v3
6868
- name: Build and upload to DockerHub
6969
run: |
70-
docker buildx build --platform linux/amd64 --build-arg FLAVOR=${{ matrix.flavor }} --build-arg PYTHON=${{ matrix.python }} --push --provenance=false --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:py${{ matrix.python }}-${{ inputs.image_version }}-cuda-12.1${{ matrix.flavor == 'devel' && '-devel' || '' }} -f base/Dockerfile .
70+
if [ "${{ matrix.flavor }}" = "base" ]; then
71+
FILE="base/Dockerfile"
72+
elif [ "${{ matrix.flavor }}" = "devel" ]; then
73+
FILE="base/Dockerfile"
74+
else
75+
FILE="base/efa/Dockerfile"
76+
fi
77+
docker buildx build \
78+
--platform linux/amd64 \
79+
--tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ inputs.image_version }}-${{ matrix.flavor }}-ubuntu${{ matrix.ubuntu_version }}.04 \
80+
--build-arg FLAVOR=${{ matrix.flavor }} \
81+
--build-arg UBUNTU_VERSION=${{ matrix.ubuntu_version }} \
82+
--provenance=false \
83+
--push \
84+
-f $FILE .
7185
7286
build-aws-images:
7387
needs: build-docker

docker/base/Dockerfile

Lines changed: 78 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1,28 +1,79 @@
1+
# syntax = edrevo/dockerfile-plus
2+
ARG UBUNTU_VERSION
3+
4+
# Build stage
5+
FROM nvidia/cuda:12.1.1-base-ubuntu${UBUNTU_VERSION}.04 AS builder
6+
7+
ENV NCCL_HOME=/opt/nccl
8+
ENV CUDA_HOME=/usr/local/cuda
9+
ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi
10+
11+
# Prerequisites
12+
13+
RUN export DEBIAN_FRONTEND=noninteractive \
14+
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \
15+
&& apt-get update --fix-missing \
16+
&& apt-get upgrade -y \
17+
&& ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \
18+
&& apt-get install -y tzdata \
19+
&& dpkg-reconfigure --frontend noninteractive tzdata \
20+
&& cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
21+
&& apt-get install -y --no-install-recommends \
22+
cuda-libraries-dev-${cuda_version} \
23+
cuda-nvcc-${cuda_version} \
24+
libhwloc-dev \
25+
autoconf \
26+
automake \
27+
libtool \
28+
libopenmpi-dev \
29+
git \
30+
curl \
31+
python3 \
32+
build-essential
33+
34+
# NCCL
35+
36+
ARG NCCL_VERSION=2.26.2-1
37+
38+
RUN cd /tmp \
39+
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
40+
&& cd nccl \
41+
&& make -j$(nproc) src.build BUILDDIR=${NCCL_HOME}
42+
43+
# NCCL tests
44+
45+
RUN cd /opt \
46+
&& git clone https://github.com/NVIDIA/nccl-tests \
47+
&& cd nccl-tests \
48+
&& make -j$(nproc) \
49+
MPI=1 \
50+
MPI_HOME=${OPEN_MPI_PATH} \
51+
CUDA_HOME=${CUDA_HOME} \
52+
NCCL_HOME=${NCCL_HOME}
53+
54+
# Final stage
55+
56+
INCLUDE+ base/Dockerfile.common
57+
58+
ENV NCCL_HOME=/opt/nccl
59+
60+
COPY --from=builder ${NCCL_HOME} ${NCCL_HOME}
61+
COPY --from=builder /opt/nccl-tests/build /opt/nccl-tests/build
62+
163
ARG FLAVOR
2-
FROM nvidia/cuda:12.1.1-${FLAVOR}-ubuntu20.04
3-
4-
ARG PYTHON
5-
ARG _UV_HOME="/opt/uv"
6-
ENV UV_PYTHON="${PYTHON}"
7-
ENV UV_INSTALL_DIR="${_UV_HOME}/bin"
8-
ENV UV_PYTHON_INSTALL_DIR="${_UV_HOME}/python"
9-
ENV UV_PYTHON_BIN_DIR="${UV_PYTHON_INSTALL_DIR}/bin"
10-
ENV UV_MANAGED_PYTHON=1
11-
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
12-
13-
ENV PATH="${UV_INSTALL_DIR}:${UV_PYTHON_BIN_DIR}:${PATH}"
14-
15-
RUN export DEBIAN_FRONTEND=noninteractive && \
16-
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
17-
apt-get update --fix-missing && \
18-
apt-get upgrade -y && \
19-
ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \
20-
apt-get install -y tzdata && \
21-
dpkg-reconfigure --frontend noninteractive tzdata && \
22-
apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \
23-
libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags && \
24-
sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config && mkdir /run/sshd && \
25-
mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys && rm /etc/ssh/ssh_host_*
26-
27-
RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh && \
28-
uv python install --preview --default
64+
65+
# MPI, NVCC, and /etc/ld.so.conf.d
66+
67+
RUN apt-get update \
68+
&& apt-get install -y --no-install-recommends \
69+
openmpi-bin \
70+
&& if [ "$FLAVOR" = "devel" ]; then \
71+
cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
72+
&& apt-get install -y --no-install-recommends \
73+
cuda-libraries-dev-${cuda_version} \
74+
cuda-nvcc-${cuda_version} \
75+
libhwloc-dev; \
76+
fi \
77+
&& rm -rf /var/lib/apt/lists/* \
78+
&& echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \
79+
&& ldconfig

docker/base/Dockerfile.common

Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,35 @@
1+
ARG UBUNTU_VERSION
2+
3+
FROM nvidia/cuda:12.1.1-base-ubuntu${UBUNTU_VERSION}.04
4+
5+
ARG _UV_HOME="/opt/uv"
6+
7+
ENV UV_INSTALL_DIR="${_UV_HOME}/bin"
8+
ENV UV_MANAGED_PYTHON=1
9+
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
10+
11+
ENV PATH="${UV_INSTALL_DIR}:${PATH}"
12+
13+
ENV OMPI_MCA_pml=^cm,ucx
14+
ENV OMPI_MCA_btl=tcp,self
15+
ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0
16+
ENV NCCL_SOCKET_IFNAME=^docker,lo
17+
18+
RUN export DEBIAN_FRONTEND=noninteractive \
19+
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \
20+
&& apt-get update --fix-missing \
21+
&& apt-get upgrade -y \
22+
&& ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \
23+
&& apt-get install -y tzdata \
24+
&& dpkg-reconfigure --frontend noninteractive tzdata \
25+
&& apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \
26+
libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags \
27+
&& rm -rf /var/lib/apt/lists/* \
28+
&& sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config \
29+
&& mkdir /run/sshd \
30+
&& mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \
31+
&& chmod 600 ~/.ssh/authorized_keys \
32+
&& rm /etc/ssh/ssh_host_*
33+
34+
RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh \
35+
&& uv python install --preview --default
docker/base/efa/Dockerfile

Lines changed: 26 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,15 @@
1-
ARG BASE_IMAGE=dstackai/base:py3.12-0.7-cuda-12.1
1+
# syntax = edrevo/dockerfile-plus
22

3-
FROM ${BASE_IMAGE}
3+
INCLUDE+ base/Dockerfile.common
44

5-
ENV PREFIX=/usr/local
6-
ENV CUDA_PATH=/usr/local/cuda
5+
ENV NCCL_HOME=/usr/local
6+
ENV CUDA_HOME=/usr/local/cuda
77
ENV LIBFABRIC_PATH=/opt/amazon/efa
88
ENV OPEN_MPI_PATH=/opt/amazon/openmpi
99
ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}"
1010
ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}"
1111

12-
# prerequisites
12+
# Prerequisites
1313

1414
RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
1515
&& apt-get update \
@@ -19,61 +19,58 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
1919
libhwloc-dev \
2020
autoconf \
2121
automake \
22-
libtool
22+
libtool \
23+
&& rm -rf /var/lib/apt/lists/*
2324

2425
# EFA
2526

2627
ARG EFA_VERSION=1.38.1
2728

28-
RUN cd $HOME \
29+
RUN cd /tmp \
30+
&& apt-get update \
2931
&& curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \
3032
&& tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
3133
&& cd aws-efa-installer \
32-
&& ./efa_installer.sh -y --skip-kmod -g
34+
&& ./efa_installer.sh -y --skip-kmod -g \
35+
&& rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/*
3336

3437
# NCCL
3538

3639
ARG NCCL_VERSION=2.26.2-1
3740

38-
RUN cd $HOME \
41+
RUN cd /tmp \
3942
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
4043
&& cd nccl \
41-
&& make -j$(nproc) src.build BUILDDIR=${PREFIX}
44+
&& make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \
45+
&& rm -rf /tmp/nccl
4246

4347
# AWS OFI NCCL
4448

4549
ARG OFI_VERSION=1.14.0
4650

47-
RUN cd $HOME \
51+
RUN cd /tmp \
4852
&& git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \
4953
&& cd aws-ofi-nccl \
5054
&& ./autogen.sh \
5155
&& ./configure \
52-
--with-cuda=${CUDA_PATH} \
56+
--with-cuda=${CUDA_HOME} \
5357
--with-libfabric=${LIBFABRIC_PATH} \
5458
--with-mpi=${OPEN_MPI_PATH} \
55-
--with-cuda=${CUDA_PATH} \
56-
--with-nccl=${PREFIX} \
59+
--with-cuda=${CUDA_HOME} \
60+
--with-nccl=${NCCL_HOME} \
5761
--disable-tests \
58-
--prefix=${PREFIX} \
59-
&& make -j$(numproc) \
60-
&& make install
62+
--prefix=${NCCL_HOME} \
63+
&& make -j$(nproc) \
64+
&& make install \
65+
&& rm -rf /tmp/aws-ofi-nccl /var/lib/apt/lists/*
6166

6267
# NCCL Tests
6368

64-
RUN cd $HOME \
69+
RUN cd /opt \
6570
&& git clone https://github.com/NVIDIA/nccl-tests \
6671
&& cd nccl-tests \
67-
&& make -j$(numproc) \
72+
&& make -j$(nproc) \
6873
MPI=1 \
6974
MPI_HOME=${OPEN_MPI_PATH} \
70-
CUDA_HOME=${CUDA_PATH} \
71-
NCCL_HOME=${PREFIX}
72-
73-
ARG BUILD_DATE
74-
ARG IMAGE_NAME
75-
ARG DSTACK_REVISION
76-
77-
LABEL org.opencontainers.image.title="${IMAGE_NAME}"
78-
LABEL org.opencontainers.image.version="${EFA_VERSION}-${DSTACK_REVISION}"
79-
LABEL org.opencontainers.image.created="${BUILD_DATE}"
75+
CUDA_HOME=${CUDA_HOME} \
76+
NCCL_HOME=${NCCL_HOME}
File renamed without changes.

examples/clusters/nccl-tests/.dstack.yml

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -5,26 +5,22 @@ nodes: 2
55
startup_order: workers-first
66
stop_criteria: master-done
77

8-
# This image comes with MPI and NCCL tests pre-built
9-
image: dstackai/efa
108
env:
119
- NCCL_DEBUG=INFO
1210
commands:
13-
- cd /root/nccl-tests/build
1411
- |
1512
if [ $DSTACK_NODE_RANK -eq 0 ]; then
1613
mpirun \
1714
--allow-run-as-root \
1815
--hostfile $DSTACK_MPI_HOSTFILE \
1916
-n $DSTACK_GPUS_NUM \
2017
-N $DSTACK_GPUS_PER_NODE \
21-
--mca btl_tcp_if_exclude lo,docker0 \
2218
--bind-to none \
23-
./all_reduce_perf -b 8 -e 8G -f 2 -g 1
19+
/opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
2420
else
2521
sleep infinity
2622
fi
2723
2824
resources:
29-
gpu: nvidia:4:16GB
25+
gpu: nvidia:1..8
3026
shm_size: 16GB

examples/distributed-training/torchrun/.dstack.yml

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
type: task
22
name: train-distrib
33

4-
# The size of the cluster
54
nodes: 2
65

76
python: 3.12
@@ -21,6 +20,5 @@ commands:
2120
multinode.py 50 10
2221
2322
resources:
24-
gpu: 24GB:1..2
25-
# Uncomment if using multiple GPUs
26-
#shm_size: 24GB
23+
gpu: 1..8
24+
shm_size: 16GB

scripts/packer/aws-image-cuda.json

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -81,14 +81,6 @@
8181
{
8282
"type": "shell",
8383
"script": "provisioners/install-nvidia-container-toolkit.sh"
84-
},
85-
{
86-
"type": "shell",
87-
"environment_vars": [
88-
"IMAGE_REPO={{user `image_repo`}}",
89-
"IMAGE_VERSION={{user `image_version`}}"
90-
],
91-
"script": "provisioners/pull-docker-images.sh"
9284
}
9385
]
9486
}

0 commit comments

Comments
 (0)