forked from Physical-Intelligence/openpi
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathDockerfile
More file actions
51 lines (43 loc) · 2.28 KB
/
Dockerfile
File metadata and controls
51 lines (43 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Dockerfile for PyTorch training (PI0/PI05 models with multi-GPU DDP support).
# Based on the existing serve_policy.Dockerfile pattern.
#
# Build (BuildKit is required because this file uses RUN --mount):
#   docker build -t openpi_train -f Dockerfile .
#
# Run (single GPU):
#   docker run --rm -it --gpus=all -v .:/app openpi_train \
#     python scripts/train_pytorch.py debug --exp_name my_experiment
#
# Run (multi-GPU DDP):
#   docker run --rm -it --gpus=all --ipc=host -v .:/app openpi_train \
#     torchrun --standalone --nnodes=1 --nproc_per_node=2 \
#     scripts/train_pytorch.py pi0_aloha_sim --exp_name my_experiment
# Base image: NVIDIA CUDA 12.2.2 runtime with cuDNN 8 on Ubuntu 22.04. The tag is
# additionally pinned by digest so it cannot silently resolve to different contents.
FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04@sha256:2d913b09e6be8387e1a10976933642c73c840c0b735f0bf3c28d97fc9bc422e0
# Take the uv and uvx binaries from the uv 0.5.1 image and place them in /bin.
COPY --from=ghcr.io/astral-sh/uv:0.5.1 /uv /uvx /bin/
# /app is the path the usage examples at the top of this file mount the project at (-v .:/app).
WORKDIR /app
# System packages: git and git-lfs (git-lfs is needed by LeRobot), plus kernel headers
# and the build-essential/clang toolchains for building native extensions.
# The apt package lists are removed in the same layer so they do not end up in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        clang \
        git \
        git-lfs \
        linux-headers-generic \
    && rm -rf /var/lib/apt/lists/*
# uv settings, kept together in one ENV instruction:
#   UV_LINK_MODE=copy             - copy packages out of the uv cache instead of linking,
#                                   because the cache is a mounted volume.
#   UV_PROJECT_ENVIRONMENT=/.venv - keep the virtual environment outside the project
#                                   directory so it does not leak out of the container
#                                   when the application code is mounted.
ENV UV_LINK_MODE=copy \
    UV_PROJECT_ENVIRONMENT=/.venv
# Create the Python 3.11.9 virtual environment at the location set by UV_PROJECT_ENVIRONMENT.
RUN uv venv --python 3.11.9 "${UV_PROJECT_ENVIRONMENT}"
# Install the locked, non-dev dependencies only; the project itself is not installed.
# The lockfile, pyproject.toml and the openpi-client package are bind-mounted from the
# build context instead of being copied, and uv's cache lives in a build cache mount.
# GIT_LFS_SKIP_SMUDGE=1 applies to this command only, so git checkouts made while
# resolving dependencies do not download LFS file contents.
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    --mount=type=bind,source=packages/openpi-client/pyproject.toml,target=packages/openpi-client/pyproject.toml \
    --mount=type=bind,source=packages/openpi-client/src,target=packages/openpi-client/src \
    GIT_LFS_SKIP_SMUDGE=1 uv sync --frozen --no-install-project --no-dev
# Patch the installed transformers package with the custom model implementations.
#
# The replacement sources are bind-mounted from the build context for this step only,
# so no copy of them is stored in an image layer (a COPY followed by `rm -rf` in a
# later RUN would still keep the files in the COPY layer).
#
# The package directory is resolved with a command substitution chained by `&&`
# instead of a `python | xargs dirname | xargs -I{} cp` pipeline: the default /bin/sh
# has no pipefail, and when `import transformers` fails the final `xargs -I{}` receives
# empty input, runs nothing and exits 0, so the build would succeed without patching.
# Here a failed import, a missing directory or a failed copy each abort the build.
RUN --mount=type=bind,source=src/openpi/models_pytorch/transformers_replace,target=/tmp/transformers_replace \
    TRANSFORMERS_DIR="$("${UV_PROJECT_ENVIRONMENT}/bin/python" -c 'import os, transformers; print(os.path.dirname(transformers.__file__))')" \
    && test -d "${TRANSFORMERS_DIR}" \
    && cp -r /tmp/transformers_replace/* "${TRANSFORMERS_DIR}/"
# Default to an interactive shell. A command given after the image name on `docker run`
# (as in the usage examples at the top of this file) replaces this default.
# Shared memory size is critical for multi-GPU DDP training (use --ipc=host or --shm-size).
CMD ["/bin/bash"]