|
2 | 2 |
|
3 | 3 | INCLUDE+ base/Dockerfile.common |
4 | 4 |
|
5 | | -ENV NCCL_HOME=/usr/local |
6 | | -ENV CUDA_HOME=/usr/local/cuda |
| 5 | +ENV PREFIX=/usr/local |
| 6 | +ENV CUDA_PATH=/usr/local/cuda |
7 | 7 | ENV LIBFABRIC_PATH=/opt/amazon/efa |
8 | 8 | ENV OPEN_MPI_PATH=/opt/amazon/openmpi |
9 | | -ENV NCCL_TESTS_HOME=/opt/nccl-tests |
10 | 9 | ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" |
| 10 | +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" |
11 | 11 |
|
12 | | -ARG EFA_VERSION=1.38.1 |
13 | | -ARG NCCL_VERSION=2.26.2-1 |
14 | | -ARG OFI_VERSION=1.14.0 |
15 | | -ARG FLAVOR |
| 12 | +# prerequisites |
16 | 13 |
|
17 | | -RUN apt-get update \ |
18 | | - && cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ |
| 14 | +RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ |
| 15 | + && apt-get update \ |
19 | 16 | && apt-get install -y --no-install-recommends \ |
20 | 17 | cuda-libraries-dev-${cuda_version} \ |
21 | 18 | cuda-nvcc-${cuda_version} \ |
22 | 19 | libhwloc-dev \ |
23 | 20 | autoconf \ |
24 | 21 | automake \ |
25 | | - libtool \ |
26 | | - && cd $HOME \ |
| 22 | + libtool |
| 23 | + |
| 24 | +# EFA |
| 25 | + |
| 26 | +ARG EFA_VERSION=1.38.1 |
| 27 | + |
| 28 | +RUN cd $HOME \ |
27 | 29 | && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ |
28 | 30 | && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ |
29 | 31 | && cd aws-efa-installer \ |
30 | | - && ./efa_installer.sh -y --skip-kmod -g \ |
31 | | - && cd $HOME \ |
| 32 | + && ./efa_installer.sh -y --skip-kmod -g |
| 33 | + |
| 34 | +# NCCL |
| 35 | + |
| 36 | +ARG NCCL_VERSION=2.26.2-1 |
| 37 | + |
| 38 | +RUN cd $HOME \ |
32 | 39 | && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ |
33 | 40 | && cd nccl \ |
34 | | - && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ |
35 | | - && cd $HOME \ |
| 41 | + && make -j$(nproc) src.build BUILDDIR=${PREFIX} |
| 42 | + |
| 43 | +# AWS OFI NCCL |
| 44 | + |
| 45 | +ARG OFI_VERSION=1.14.0 |
| 46 | + |
| 47 | +RUN cd $HOME \ |
36 | 48 | && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \ |
37 | 49 | && cd aws-ofi-nccl \ |
38 | 50 | && ./autogen.sh \ |
39 | 51 | && ./configure \ |
40 | | - --with-cuda=${CUDA_HOME} \ |
| 52 | + --with-cuda=${CUDA_PATH} \ |
41 | 53 | --with-libfabric=${LIBFABRIC_PATH} \ |
42 | 54 | --with-mpi=${OPEN_MPI_PATH} \ |
43 | | - --with-nccl=${NCCL_HOME} \ |
| 55 | + --with-cuda=${CUDA_PATH} \ |
| 56 | + --with-nccl=${PREFIX} \ |
44 | 57 | --disable-tests \ |
45 | | - --prefix=${NCCL_HOME} \ |
| 58 | + --prefix=${PREFIX} \ |
46 | 59 | && make -j$(numproc) \ |
47 | | - && make install \ |
48 | | - && git clone https://github.com/NVIDIA/nccl-tests ${HOME}/nccl-tests \ |
49 | | - && cd ${HOME}/nccl-tests \ |
| 60 | + && make install |
| 61 | + |
| 62 | +# NCCL Tests |
| 63 | + |
| 64 | +RUN cd $HOME \ |
| 65 | + && git clone https://github.com/NVIDIA/nccl-tests \ |
| 66 | + && cd nccl-tests \ |
50 | 67 | && make -j$(numproc) \ |
51 | 68 | MPI=1 \ |
52 | 69 | MPI_HOME=${OPEN_MPI_PATH} \ |
53 | | - CUDA_HOME=${CUDA_HOME} \ |
54 | | - NCCL_HOME=${NCCL_HOME} \ |
55 | | - && ln -s ${HOME}/nccl-tests/build ${NCCL_TESTS_HOME} \ |
56 | | - && echo "${OPEN_MPI_PATH}/lib" >> /etc/ld.so.conf.d/openmpi.conf \ |
57 | | - && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ |
58 | | - && ldconfig |
| 70 | + CUDA_HOME=${CUDA_PATH} \ |
| 71 | + NCCL_HOME=${PREFIX} |
0 commit comments