Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docker/Dockerfile.nmp-automodel-training
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
-e /app/sdk/python/nemo-platform \
-e /app/packages/nemo_platform_plugin \
-e /app/packages/nmp_common \
-e /app/packages/nmp_customization_common \
-e /app/services/automodel

# Re-pin nemo_automodel from the base clone without re-resolving transformers (already in base venv).
Expand Down
74 changes: 61 additions & 13 deletions docker/Dockerfile.nmp-unsloth-training
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,18 @@
# model_entity).
#
# Install steps:
# 1. `uv pip install unsloth --torch-backend=auto`. This is unsloth's
# canonical install command (per their README). It pulls unsloth +
# unsloth_zoo + the entire HF stack (transformers, trl, peft, accelerate,
# datasets, bitsandbytes, xformers, etc.) at the versions unsloth's own
# pyproject.toml has constrained — including explicit !=X.Y.Z blocklists
# for known-broken transformers/trl releases. We deliberately don't
# second-guess these pins; they're tested upstream.
# 1b. flash-attn — optional for unsloth and not installed by step 1. Without
# 1. `uv pip install unsloth --torch-backend=auto` plus explicit
# `transformers==4.57.6` and `huggingface-hub==0.36.2` pins (transformers
# 4.57.x requires hub <1.0; platform glue would otherwise pull hub 1.x).
# Unsloth's resolver still pulls unsloth_zoo
# and the rest of the HF stack (trl, peft, accelerate, datasets,
# bitsandbytes, xformers, etc.). `--overrides preserve_base_torch.txt`
# blocks uv from installing/upgrading torch into the venv so the NGC
# base's PyTorch + CUDA remain the runtime stack.
# 1b. bitsandbytes — compiled from source against NGC CUDA 13.1 (same pattern
# as docker/Dockerfile.nmp-automodel-base). PyPI wheels
# only ship through cuda130; source build replaces the wheel from step 1.
# 1c. flash-attn — optional for unsloth and not installed by step 1. Without
# it Unsloth falls back when xformers is also missing (common on newer CUDA
# stacks), logging "FA2 = False / Xformers = None". Installed immediately
# after unsloth so pip does not re-resolve the HF stack.
Expand Down Expand Up @@ -39,6 +43,7 @@ ENV VIRTUAL_ENV=/opt/venv \
HF_HUB_ENABLE_HF_TRANSFER=1 \
OTEL_PYTHON_EXCLUDED_URLS="health"
ENV PATH="/opt/venv/bin:/root/.local/bin:${PATH}"
ENV UNSLOTH_SKIP_TORCHVISION_CHECK=1

# --system-site-packages lets the venv inherit the NGC base's pre-built torch.
# Without this, `uv pip install unsloth --torch-backend=auto` would download a
Expand All @@ -54,21 +59,53 @@ ARG USERNAME=ubuntu
ARG USER_UID=1000
ARG USER_GID=1000
ARG UNSLOTH_VERSION=2026.6.1
ARG TRANSFORMERS_VERSION=4.57.6
ARG HF_HUB_VERSION=0.36.2
ARG BITSANDBYTES_VERSION=0.49.1
ARG BNB_MAX_JOBS=10

WORKDIR /app

COPY docker/unsloth/preserve_base_torch.txt /opt/docker/preserve_base_torch.txt
COPY docker/unsloth/no_override_requirements.txt /opt/docker/no_override_requirements.txt

RUN mkdir -p /home/${USERNAME}/.cache && \
chown -R ${USER_UID}:${USER_GID} /home/${USERNAME}

# Step 1: install unsloth via its own resolver. --torch-backend=auto tells uv
# Step 1: install unsloth + pinned transformers. --torch-backend=auto tells uv
# to detect the existing torch's CUDA build (from --system-site-packages
# inheritance) and pick the matching xformers / bitsandbytes wheels.
# preserve_base_torch.txt prevents uv from replacing the NGC torch stack.
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache \
--torch-backend=auto \
unsloth==${UNSLOTH_VERSION}
--overrides /opt/docker/preserve_base_torch.txt \
unsloth==${UNSLOTH_VERSION} \
transformers==${TRANSFORMERS_VERSION} \
huggingface-hub==${HF_HUB_VERSION}

# Force transformers and huggingface-hub back onto the ARG-pinned versions.
# preserve_base_torch.txt is passed as an override so torch, torchvision and
# torchaudio stay out of the resolution.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install \
        --python ${VIRTUAL_ENV}/bin/python \
        --no-cache \
        --overrides /opt/docker/preserve_base_torch.txt \
        --reinstall-package huggingface-hub \
        --reinstall-package transformers \
        huggingface-hub==${HF_HUB_VERSION} \
        transformers==${TRANSFORMERS_VERSION}

# TODO: Step 1b: Flash Attention 2 — compiled from source against the NGC 26.02 torch.
# Step 1b: bitsandbytes from source — matches automodel base (CUDA 13.1 nvcc).
# Only the pinned ref (tag, branch, or full commit SHA) is fetched, at depth 1:
# the checkout is deleted at the end of this same layer, so cloning the full
# history would only cost build time and bandwidth.
# scikit-build-core is installed just for the --no-build-isolation build and
# uninstalled again afterwards.
# NOTE(review): the Flash Attention TODO in this file says /usr/local/cuda
# symlinks to an older toolkit — confirm that /usr/local/cuda/bin/nvcc used
# here really is the CUDA 13.1 compiler.
RUN --mount=type=cache,target=/root/.cache/uv \
    git init --quiet /tmp/bitsandbytes && \
    cd /tmp/bitsandbytes && \
    git remote add origin https://github.com/bitsandbytes-foundation/bitsandbytes.git && \
    git fetch --quiet --depth 1 origin "${BITSANDBYTES_VERSION}" && \
    git checkout --quiet FETCH_HEAD && \
    cmake -DCOMPUTE_CAPABILITY="75;80;86;87;89;90;100;103;110;120;121" -DCOMPUTE_BACKEND=cuda -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -S . && \
    make -j"${BNB_MAX_JOBS}" && \
    uv pip install --python "${VIRTUAL_ENV}/bin/python" --no-cache scikit-build-core --no-deps && \
    uv pip install --python "${VIRTUAL_ENV}/bin/python" --no-cache --no-build-isolation --no-deps --force-reinstall . && \
    uv pip uninstall --python "${VIRTUAL_ENV}/bin/python" scikit-build-core && \
    rm -rf /tmp/bitsandbytes

# TODO: Step 1c: Flash Attention 2 — compiled from source against the NGC 26.02 torch.
# /usr/local/cuda symlinks to an older toolkit; use /usr/local/cuda-13.1 instead.
# Cap parallel nvcc/ninja work — default uses all CPUs and OOMs typical build hosts.
# Put flash attention back in when we have a working wheel in a separate image.
Expand All @@ -93,16 +130,27 @@ RUN chown -R ${USER_UID}:${USER_GID} /app
# resolver to re-evaluate the whole HF stack.
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache \
--overrides /opt/docker/no_override_requirements.txt \
-e /app/sdk/python/nemo-platform \
-e /app/packages/nemo_platform_plugin \
-e /app/packages/nmp_common \
-e /app/services/unsloth
-e /app/packages/nmp_customization_common \
-e "/app/services/unsloth[integrations]"

# Re-pin hf-transfer (used by HF_HUB_ENABLE_HF_TRANSFER above).
# Install hf-transfer (HF_HUB_ENABLE_HF_TRANSFER=1 is set in ENV above).
# hf-transfer can pull huggingface-hub 1.x, so hub + transformers are re-pinned
# in the RUN that follows this one.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install \
        --no-cache \
        --python ${VIRTUAL_ENV}/bin/python \
        --overrides /opt/docker/preserve_base_torch.txt \
        "hf-transfer>=0.1.8,<0.2"

# Restore the ARG-pinned huggingface-hub / transformers versions after the
# hf-transfer install above. --reinstall-package forces both packages to be
# reinstalled, and preserve_base_torch.txt keeps torch, torchvision and
# torchaudio excluded from the resolution.
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache \
--overrides /opt/docker/preserve_base_torch.txt \
--reinstall-package huggingface-hub \
--reinstall-package transformers \
huggingface-hub==${HF_HUB_VERSION} \
transformers==${TRANSFORMERS_VERSION}

ENTRYPOINT ["/opt/venv/bin/python"]
CMD ["-m", "nmp.unsloth.tasks.training", "--help"]

Expand Down
4 changes: 4 additions & 0 deletions docker/automodel/Dockerfile.platform-workspace
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
FROM scratch AS platform-workspace
# Use a reduced workspace file for this partial source tree.
COPY docker/automodel/pyproject.workspace.toml pyproject.toml
# uv --overrides file the training Dockerfile reads at /app/docker/automodel/.
# (The docker/ tree moved out of services/automodel/, so it no longer rides in
# via `COPY services/automodel`; copy it explicitly into the workspace slice.)
COPY docker/automodel/no_override_requirements.txt docker/automodel/no_override_requirements.txt
# nemo-platform-sdk hatch build force-includes docs/ from repo root.
# docs/fern/openapi/openapi.yaml is a symlink to ../../../openapi/openapi.yaml.
Expand All @@ -14,6 +17,7 @@ COPY openapi openapi
COPY packages/nmp_build_tools packages/nmp_build_tools
COPY packages/models packages/models
COPY packages/nmp_common packages/nmp_common
COPY packages/nmp_customization_common packages/nmp_customization_common
COPY packages/nemo_platform_plugin packages/nemo_platform_plugin
COPY sdk/python/nemo-platform sdk/python/nemo-platform
COPY services/automodel services/automodel
Expand Down
2 changes: 2 additions & 0 deletions docker/automodel/pyproject.workspace.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ members = [
"sdk/python/nemo-platform",
"packages/nemo_platform_plugin",
"packages/nmp_common",
"packages/nmp_customization_common",
"services/automodel",
"services/core/models",
]
Expand All @@ -26,5 +27,6 @@ models = { workspace = true }
nemo-platform-sdk = { workspace = true }
nemo-platform-plugin = { workspace = true }
nmp-common = { workspace = true }
nmp-customization-common = { workspace = true }
nmp-automodel = { workspace = true }
nmp-models = { workspace = true }
1 change: 1 addition & 0 deletions docker/unsloth/Dockerfile.platform-workspace
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ COPY docs docs
COPY openapi openapi
COPY packages/nmp_build_tools packages/nmp_build_tools
COPY packages/nmp_common packages/nmp_common
COPY packages/nmp_customization_common packages/nmp_customization_common
COPY packages/nemo_platform_plugin packages/nemo_platform_plugin
COPY sdk/python/nemo-platform sdk/python/nemo-platform
COPY services/unsloth services/unsloth
35 changes: 18 additions & 17 deletions docker/unsloth/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,12 @@ docker buildx bake \
The build pulls the NGC PyTorch base, then runs unsloth's canonical install
in two steps:

1. `uv pip install unsloth --torch-backend=auto` — this is the command
straight from unsloth's README. It pulls `unsloth`, `unsloth_zoo`, and the
full HF stack (transformers, trl, peft, accelerate, datasets, bitsandbytes,
xformers) at versions tested upstream — we deliberately don't pin any of
them on our end, because unsloth's pyproject already has precise
`!=X.Y.Z` blocklists for known-broken releases.
1. `uv pip install unsloth --torch-backend=auto transformers==4.57.6 huggingface-hub==0.36.2` with
`preserve_base_torch.txt` overrides so the NGC base's PyTorch + CUDA are not
replaced. Unsloth's resolver still pulls `unsloth_zoo`, trl, peft,
accelerate, datasets, bitsandbytes, and xformers. **transformers is pinned
explicitly** to `4.57.6` (override at build time via
`--build-arg TRANSFORMERS_VERSION=...`).
1b. Flash Attention 2 (Dockerfile step 1c) — source build with
`--no-build-isolation` against the NGC base torch (cached Docker layer).
Parallelism is capped via `MAX_JOBS` (default `2`, override with bake arg
Expand Down Expand Up @@ -275,7 +275,8 @@ nemo models adapters retrieve qwen-unsloth-smoke-out \
| `compile()` errors with "platform.runtime: docker" | Set `platform.runtime: docker` in `~/.nemo/config.yaml` and restart services. |
| `compile()` errors with "Docker daemon unreachable" | Confirm `docker info` works as the user running `nemo services`. |
| First job step errors with `Model 'X' has no fileset attached` | Attach a fileset to the model entity (`nemo models update --fileset ...`). |
| `training` step errors with `bitsandbytes`/CUDA mismatch | Rebuild the image — the base NGC PyTorch tag may have moved. |
| `training` step errors with `bitsandbytes`/CUDA mismatch (`libbitsandbytes_cuda131.so` not found) | Rebuild `nmp-unsloth-training` — the image compiles bitsandbytes from source against NGC CUDA 13.1 (same pattern as `nmp-automodel-base`). Override `BNB_MAX_JOBS` at build time if nvcc OOMs. |
| `WandbCallback requires wandb to be installed` | Rebuild `nmp-unsloth-training` — the image installs `wandb` and `mlflow-skinny` for integrations. |
| `training` step OOMs on a small GPU | Reduce `model.max_seq_length` and / or set `model.load_in_4bit: true`. |
| `model-entity-creation` errors with "Adapter already exists" | Pick a fresh `output.name` (the unsloth compiler is "always create"; no overwrite). |
| Step config not picked up (`NEMO_JOB_STEP_CONFIG_FILE_PATH is not set`) | The container was started outside the Jobs runner — only platform-driven submit populates this. |
Expand All @@ -300,13 +301,13 @@ nemo files filesets delete qwen-unsloth-smoke-out -w default
separate ML stack. If you need both backends on the same cluster, run
both images side by side; jobs from each backend route to their own
`nmp-{backend}-training` image via env-var overrides.
- **Why we don't pin transformers / trl / peft / bitsandbytes** — unsloth's
own pyproject already constrains them tightly (e.g.
`transformers>=4.51.3,!=4.52.0..3,!=4.53.0,!=4.54.0,!=4.55.0..1,!=4.57.0,
!=4.57.4..5,!=5.0.0,!=5.1.0,<=5.5.0`). Our `[unsloth]` extra in
`services/unsloth/pyproject.toml` is just `["unsloth[huggingface]"]` —
delegating everything to upstream so we don't ship our own subtly-wrong
constraints.
- **No CUDA wheels are pre-built** — `bitsandbytes` ships PyPI wheels
(Ampere+; for older arches, swap to a source build or pin a compatible
release).
- **transformers + huggingface-hub pins** — the training image pins `transformers==4.57.6`
and `huggingface-hub==0.36.2` in
`Dockerfile.nmp-unsloth-training` (compatible with unsloth's upstream
blocklists). Other HF deps (trl, peft, bitsandbytes, etc.) still come from
unsloth's resolver. **PyTorch + CUDA** stay on the NGC base stack via
`--system-site-packages` and `preserve_base_torch.txt` / `no_override_requirements.txt`
overrides (same impossible-marker pattern as automodel).
- **bitsandbytes** — compiled from source in the image (v0.49.1, same approach as
`nmp-automodel-base`) because NGC 26.02 is CUDA 13.1 and PyPI only ships
prebuilt libs through cuda130.
11 changes: 11 additions & 0 deletions docker/unsloth/no_override_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# uv override file: passed via `uv pip install --overrides` when the platform
# glue packages are installed in Dockerfile.nmp-unsloth-training.
# Preserve NGC base torch/CUDA and HF stack pins from Dockerfile step 1.
# Impossible markers (sys_platform == 'never') block uv from re-resolving
# these when adding platform glue.
transformers; sys_platform == 'never'
huggingface-hub; sys_platform == 'never'
bitsandbytes; sys_platform == 'never'
torch; sys_platform == 'never'
torchvision; sys_platform == 'never'
torchaudio; sys_platform == 'never'
tokenizers; sys_platform == 'never'
accelerate; sys_platform == 'never'
safetensors; sys_platform == 'never'
7 changes: 7 additions & 0 deletions docker/unsloth/preserve_base_torch.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Block uv from installing or upgrading PyTorch wheels into the venv.
# Passed via `uv pip install --overrides` in Dockerfile.nmp-unsloth-training.
# The NGC base image ships torch + CUDA; the venv uses --system-site-packages
# to inherit that stack. Impossible markers (sys_platform == 'never') are the
# same pattern as docker/automodel/no_override_requirements.txt.
torch; sys_platform == 'never'
torchvision; sys_platform == 'never'
torchaudio; sys_platform == 'never'
2 changes: 2 additions & 0 deletions docker/unsloth/pyproject.workspace.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ members = [
"sdk/python/nemo-platform",
"packages/nemo_platform_plugin",
"packages/nmp_common",
"packages/nmp_customization_common",
"services/unsloth",
]

Expand All @@ -23,4 +24,5 @@ nmp-build-tools = { workspace = true }
nemo-platform-sdk = { workspace = true }
nemo-platform-plugin = { workspace = true }
nmp-common = { workspace = true }
nmp-customization-common = { workspace = true }
nmp-unsloth = { workspace = true }
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Experiment-tracking integration schemas, re-exported for platform plugins.

The names listed in ``__all__`` are imported unchanged from
``nemo_platform_plugin.integrations.schemas``.
"""

from nemo_platform_plugin.integrations.schemas import (
    IntegrationsSpec,
    MlflowIntegration,
    WandbIntegration,
)

__all__ = [
    "IntegrationsSpec",
    "MlflowIntegration",
    "WandbIntegration",
]
Loading
Loading