diff --git a/docs/docs/concepts/dev-environments.md b/docs/docs/concepts/dev-environments.md index fbf8558796..8c1e66c87e 100644 --- a/docs/docs/concepts/dev-environments.md +++ b/docs/docs/concepts/dev-environments.md @@ -16,7 +16,7 @@ name: vscode python: "3.11" # Uncomment to use a custom Docker image -#image: dstackai/base:py3.13-0.7-cuda-12.1 +#image: huggingface/trl-latest-gpu ide: vscode # Uncomment to leverage spot instances @@ -86,13 +86,11 @@ property with a list of commands to run at startup: ```yaml type: dev-environment -# The name is optional, if not specified, generated randomly name: vscode python: "3.11" ide: vscode -# Commands to run on startup init: - pip install wandb ``` @@ -129,23 +127,18 @@ resources: -The `cpu` property also allows you to specify the CPU architecture, `x86` or `arm`. Examples: -`x86:16` (16 x86-64 cores), `arm:8..` (at least 8 ARM64 cores). -If the architecture is not specified, `dstack` tries to infer it from the `gpu` specification -using `x86` as the fallback value. +The `cpu` property lets you set the architecture (`x86` or `arm`) and core count — e.g., `x86:16` (16 x86 cores), `arm:8..` (at least 8 ARM cores). +If not set, `dstack` infers it from the GPU or defaults to `x86`. + +The `gpu` property lets you specify vendor, model, memory, and count — e.g., `nvidia` (one NVIDIA GPU), `A100` (one A100), `A10G,A100` (either), `A100:80GB` (one 80GB A100), `A100:2` (two A100), `24GB..40GB:2` (two GPUs with 24–40GB), `A100:40GB:2` (two 40GB A100s). -The `gpu` property allows specifying not only memory size but also GPU vendor, names -and their quantity. Examples: `nvidia` (one NVIDIA GPU), `A100` (one A100), `A10G,A100` (either A10G or A100), -`A100:80GB` (one A100 of 80GB), `A100:2` (two A100), `24GB..40GB:2` (two GPUs between 24GB and 40GB), -`A100:40GB:2` (two A100 GPUs of 40GB). -If the vendor is not specified, `dstack` tries to infer it from the GPU name using `nvidia` as the fallback value. 
+If vendor is omitted, `dstack` infers it from the model or defaults to `nvidia`. ??? info "Google Cloud TPU" To use TPUs, specify its architecture via the `gpu` property. ```yaml type: dev-environment - # The name is optional, if not specified, generated randomly name: vscode ide: vscode @@ -163,17 +156,50 @@ If the vendor is not specified, `dstack` tries to infer it from the GPU name usi > If you’re unsure which offers (hardware configurations) are available from the configured backends, use the > [`dstack offer`](../reference/cli/dstack/offer.md#list-gpu-offers) command to list them. -### Python version +### Docker + +#### Default image If you don't specify `image`, `dstack` uses its base Docker image pre-configured with -`python`, `pip`, `conda` (Miniforge), and essential CUDA drivers. -The `python` property determines which default Docker image is used. +`uv`, `python`, `pip`, essential CUDA drivers, and NCCL tests (under `/opt/nccl-tests/build`). -??? info "nvcc" - By default, the base Docker image doesn’t include `nvcc`, which is required for building custom CUDA kernels. - If you need `nvcc`, set the [`nvcc`](../reference/dstack.yml/dev-environment.md#nvcc) property to true. +Set the `python` property to pre-install a specific version of Python. -### Docker +
+ +```yaml +type: dev-environment +name: vscode + +python: 3.12 + +ide: vscode +``` + +
+ +#### NVCC + +By default, the base Docker image doesn’t include `nvcc`, which is required for building custom CUDA kernels. +If you need `nvcc`, set the [`nvcc`](../reference/dstack.yml/dev-environment.md#nvcc) property to true. + +
+ +```yaml +type: dev-environment +name: vscode + +python: 3.12 +nvcc: true + +ide: vscode +init: + - uv pip install flash_attn --no-build-isolation +``` + +
+ +#### Custom image If you want, you can specify your own Docker image via `image`. @@ -181,40 +207,64 @@ If you want, you can specify your own Docker image via `image`. ```yaml type: dev-environment -# The name is optional, if not specified, generated randomly name: vscode -# Any custom Docker image -image: ghcr.io/huggingface/text-generation-inference:latest +image: huggingface/trl-latest-gpu ide: vscode ``` -!!! info "Privileged mode" - To enable privileged mode, set [`privileged`](../reference/dstack.yml/dev-environment.md#privileged) to `true`. - This mode allows using [Docker and Docker Compose](../guides/protips.md#docker-and-docker-compose) inside `dstack` runs. +#### Docker in Docker - Not supported with `runpod`, `vastai`, and `kubernetes`. +Set `docker` to `true` to enable the `docker` CLI in your dev environment, e.g., to run or build Docker images, or use Docker Compose. -??? info "Private registry" - Use the [`registry_auth`](../reference/dstack.yml/dev-environment.md#registry_auth) property to provide credentials for a private Docker registry. +
- ```yaml - type: dev-environment - # The name is optional, if not specified, generated randomly - name: vscode +```yaml +type: dev-environment +name: vscode + +docker: true + +ide: vscode +init: + - docker run --gpus all nvidia/cuda:12.3.0-base-ubuntu22.04 nvidia-smi +``` + +
+ +Cannot be used with `python` or `image`. Not supported on `runpod`, `vastai`, or `kubernetes`. - # Any private Docker image - image: ghcr.io/huggingface/text-generation-inference:latest - # Credentials of the private Docker registry - registry_auth: - username: peterschmidt85 - password: ghp_e49HcZ9oYwBzUbcSk2080gXZOU2hiT9AeSR5 +#### Privileged mode + +To enable privileged mode, set [`privileged`](../reference/dstack.yml/dev-environment.md#privileged) to `true`. + +Not supported with `runpod`, `vastai`, and `kubernetes`. + +#### Private registry - ide: vscode - ``` +Use the [`registry_auth`](../reference/dstack.yml/dev-environment.md#registry_auth) property to provide credentials for a private Docker registry. + +
+ +```yaml +type: dev-environment +name: vscode + +env: + - NGC_API_KEY + +image: nvcr.io/nim/deepseek-ai/deepseek-r1-distill-llama-8b +registry_auth: + username: $oauthtoken + password: ${{ env.NGC_API_KEY }} + +ide: vscode +``` + +
### Environment variables @@ -222,10 +272,8 @@ ide: vscode ```yaml type: dev-environment -# The name is optional, if not specified, generated randomly name: vscode -# Environment variables env: - HF_TOKEN - HF_HUB_ENABLE_HF_TRANSFER=1 @@ -282,6 +330,7 @@ to automatically stop the dev environment after a configured period of inactivit ```yaml type: dev-environment name: vscode + ide: vscode # Stop if inactive for 2 hours @@ -296,12 +345,12 @@ If you go offline without stopping anything manually, the dev environment will a within about 3 minutes. If `inactivity_duration` is configured for your dev environment, you can see how long -it has been inactive in `dstack ps --verbose`. +it has been inactive in `dstack ps --verbose` (or `-v`).
```shell -$ dstack ps --verbose +$ dstack ps -v NAME BACKEND RESOURCES PRICE STATUS SUBMITTED vscode cudo 2xCPU, 8GB, $0.0286 running 8 mins ago 100.0GB (disk) (inactive for 2m 34s) diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index 489a0de453..6c7e6b6e36 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -7,7 +7,7 @@ Services allow you to deploy models or web apps as secure and scalable endpoints First, define a service configuration as a YAML file in your project folder. The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` are both acceptable). -
+
```yaml type: service @@ -43,7 +43,7 @@ To run a service, pass the configuration to [`dstack apply`](../reference/cli/ds ```shell $ HF_TOKEN=... -$ dstack apply -f service.dstack.yml +$ dstack apply -f .dstack.yml # BACKEND REGION RESOURCES SPOT PRICE 1 runpod CA-MTL-1 18xCPU, 100GB, A5000:24GB:2 yes $0.22 @@ -125,25 +125,20 @@ You can configure the number of replicas as well as the auto-scaling rules. ```yaml type: service -# The name is optional, if not specified, generated randomly name: llama31-service python: 3.12 -# Required environment variables env: - HF_TOKEN commands: - uv pip install vllm - vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct --max-model-len 4096 -# Expose the port of the service port: 8000 resources: - # Change to what is required gpu: 24GB -# Minimum and maximum number of replicas replicas: 1..4 scaling: # Requests per seconds @@ -178,7 +173,6 @@ This can be disabled by setting `auth` to `false`. ```yaml type: service -# The name is optional, if not specified, generated randomly name: http-server-service # Disable authorization @@ -186,10 +180,8 @@ auth: false python: 3.12 -# Commands of the service commands: - python3 -m http.server -# The port of the service port: 8000 ``` @@ -209,7 +201,6 @@ type: service name: dash gateway: false -# Disable authorization auth: false # Do not strip the path prefix strip_prefix: false @@ -237,7 +228,7 @@ set [`strip_prefix`](../reference/dstack.yml/service.md#strip_prefix) to `false` If your app cannot be configured to work with a path prefix, you can host it on a dedicated domain name by setting up a [gateway](gateways.md). -### Rate Limits { #rate-limits } +### Rate limits { #rate-limits } If you have a [gateway](gateways.md), you can configure rate limits for your service using the [`rate_limits`](../reference/dstack.yml/service.md#rate_limits) property. @@ -260,14 +251,9 @@ rate_limits:
-The limit is specified in requests per second, but requests are tracked with millisecond -granularity. For example, `rps: 4` means at most 1 request every 250 milliseconds. -For most applications, it is recommended to set the `burst` property, which allows -temporary bursts, but keeps the average request rate at the limit specified in `rps`. +The rps limit sets the max requests per second, tracked in milliseconds (e.g., `rps: 4` means 1 request every 250 ms). Use `burst` to allow short spikes while keeping the average within `rps`. -Rate limits are applied to the entire service regardless of the number of replicas. -They are applied to each client separately, as determined by the client's IP address. -If a client violates a limit, it receives an error with status code `429`. +Limits apply to the whole service (all replicas) and per client (by IP). Clients exceeding the limit get a 429 error. ??? info "Partitioning key" Instead of partitioning requests by client IP address, @@ -296,23 +282,23 @@ If a client violates a limit, it receives an error with status code `429`. If you specify memory size, you can either specify an explicit size (e.g. `24GB`) or a range (e.g. `24GB..`, or `24GB..80GB`, or `..80GB`). -
+
```yaml type: service -# The name is optional, if not specified, generated randomly name: llama31-service python: 3.12 - -# Commands of the service +env: + - HF_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct + - MAX_MODEL_LEN=4096 commands: - uv pip install vllm - - python -m vllm.entrypoints.openai.api_server - --model mistralai/Mixtral-8X7B-Instruct-v0.1 - --host 0.0.0.0 - --tensor-parallel-size $DSTACK_GPUS_NUM -# Expose the port of the service + - | + vllm serve $MODEL_ID \ + --max-model-len $MAX_MODEL_LEN \ + --tensor-parallel-size $DSTACK_GPUS_NUM port: 8000 resources: @@ -327,18 +313,14 @@ resources:
-The `cpu` property also allows you to specify the CPU architecture, `x86` or `arm`. Examples: -`x86:16` (16 x86-64 cores), `arm:8..` (at least 8 ARM64 cores). -If the architecture is not specified, `dstack` tries to infer it from the `gpu` specification -using `x86` as the fallback value. +The `cpu` property lets you set the architecture (`x86` or `arm`) and core count — e.g., `x86:16` (16 x86 cores), `arm:8..` (at least 8 ARM cores). +If not set, `dstack` infers it from the GPU or defaults to `x86`. -The `gpu` property allows specifying not only memory size but also GPU vendor, names -and their quantity. Examples: `nvidia` (one NVIDIA GPU), `A100` (one A100), `A10G,A100` (either A10G or A100), -`A100:80GB` (one A100 of 80GB), `A100:2` (two A100), `24GB..40GB:2` (two GPUs between 24GB and 40GB), -`A100:40GB:2` (two A100 GPUs of 40GB). -If the vendor is not specified, `dstack` tries to infer it from the GPU name using `nvidia` as the fallback value. +The `gpu` property lets you specify vendor, model, memory, and count — e.g., `nvidia` (one NVIDIA GPU), `A100` (one A100), `A10G,A100` (either), `A100:80GB` (one 80GB A100), `A100:2` (two A100), `24GB..40GB:2` (two GPUs with 24–40GB), `A100:40GB:2` (two 40GB A100s). -??? info "Google Cloud TPU" +If vendor is omitted, `dstack` infers it from the model or defaults to `nvidia`. + + ??? info "Shared memory" If you are using parallel communicating processes (e.g., dataloaders in PyTorch), you may need to configure @@ -370,131 +352,148 @@ If the vendor is not specified, `dstack` tries to infer it from the GPU name usi > If you’re unsure which offers (hardware configurations) are available from the configured backends, use the > [`dstack offer`](../reference/cli/dstack/offer.md#list-gpu-offers) command to list them. -### Python version + +### Docker + +#### Default image If you don't specify `image`, `dstack` uses its base Docker image pre-configured with -`python`, `pip`, `conda` (Miniforge), and essential CUDA drivers. 
-The `python` property determines which default Docker image is used. +`uv`, `python`, `pip`, essential CUDA drivers, and NCCL tests (under `/opt/nccl-tests/build`). -
+Set the `python` property to pre-install a specific version of Python. + + + +
```yaml type: service -# The name is optional, if not specified, generated randomly name: http-server-service -# If `image` is not specified, dstack uses its base image python: 3.12 -# Commands of the service commands: - python3 -m http.server -# The port of the service port: 8000 ```
-??? info "nvcc" - By default, the base Docker image doesn’t include `nvcc`, which is required for building custom CUDA kernels. - If you need `nvcc`, set the corresponding property to true. +#### NVCC -
+By default, the base Docker image doesn’t include `nvcc`, which is required for building custom CUDA kernels. +If you need `nvcc`, set the [`nvcc`](../reference/dstack.yml/service.md#nvcc) property to true. - ```yaml - type: service - # The name is optional, if not specified, generated randomly - name: http-server-service - - # If `image` is not specified, dstack uses its base image - python: 3.12 - # Ensure nvcc is installed (req. for Flash Attention) - nvcc: true + - # Commands of the service - commands: - - python3 -m http.server - # The port of the service - port: 8000 - ``` - -
+
-### Docker +```yaml +type: service +name: http-server-service + +python: 3.12 +nvcc: true + +commands: + - python3 -m http.server +port: 8000 +``` + +
+ +#### Custom image If you want, you can specify your own Docker image via `image`. -
+
```yaml type: service - # The name is optional, if not specified, generated randomly name: http-server-service - # Any custom Docker image - image: dstackai/base:py3.13-0.7-cuda-12.1 + image: python - # Commands of the service commands: - python3 -m http.server - # The port of the service port: 8000 ```
-!!! info "Privileged mode" - To enable privileged mode, set [`privileged`](../reference/dstack.yml/service.md#privileged) to `true`. - This mode allows using [Docker and Docker Compose](../guides/protips.md#docker-and-docker-compose) inside `dstack` runs. +#### Docker in Docker - Not supported with `runpod`, `vastai`, and `kubernetes`. +Set `docker` to `true` to enable the `docker` CLI in your service, e.g., to run Docker images or use Docker Compose. -??? info "Private registry" - Use the [`registry_auth`](../reference/dstack.yml/service.md#registry_auth) property to provide credentials for a private Docker registry. +
- ```yaml - type: service - # The name is optional, if not specified, generated randomly - name: http-server-service - - # Any private Docker iamge - image: dstackai/base:py3.13-0.7-cuda-12.1 - # Credentials of the private registry - registry_auth: - username: peterschmidt85 - password: ghp_e49HcZ9oYwBzUbcSk2080gXZOU2hiT9AeSR5 +```yaml +type: service +name: chat-ui-task + +auth: false + +docker: true + +working_dir: examples/misc/docker-compose +commands: + - docker compose up +port: 9000 +``` + +
+ +Cannot be used with `python` or `image`. Not supported on `runpod`, `vastai`, or `kubernetes`. + +#### Privileged mode + +To enable privileged mode, set [`privileged`](../reference/dstack.yml/service.md#privileged) to `true`. + +Not supported with `runpod`, `vastai`, and `kubernetes`. + +#### Private registry - # Commands of the service - commands: - - python3 -m http.server - # The port of the service - port: 8000 - ``` +Use the [`registry_auth`](../reference/dstack.yml/service.md#registry_auth) property to provide credentials for a private Docker registry. + +```yaml +type: service +name: serve-distill-deepseek +env: + - NGC_API_KEY + - NIM_MAX_MODEL_LEN=4096 + +image: nvcr.io/nim/deepseek-ai/deepseek-r1-distill-llama-8b +registry_auth: + username: $oauthtoken + password: ${{ env.NGC_API_KEY }} +port: 8000 + +model: deepseek-ai/deepseek-r1-distill-llama-8b + +resources: + gpu: H100:1 +``` + ### Environment variables -
+
```yaml type: service -# The name is optional, if not specified, generated randomly name: llama-2-7b-service python: 3.12 -# Environment variables env: - HF_TOKEN - MODEL=NousResearch/Llama-2-7b-chat-hf -# Commands of the service commands: - uv pip install vllm - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 -# The port of the service port: 8000 resources: - # Required GPU VRAM gpu: 24GB ``` @@ -512,14 +511,17 @@ resources: | `DSTACK_REPO_ID` | The ID of the repo | | `DSTACK_GPUS_NUM` | The total number of GPUs in the run | + + ### Retry policy By default, if `dstack` can't find capacity, or the service exits with an error, or the instance is interrupted, the run will fail. If you'd like `dstack` to automatically retry, configure the [retry](../reference/dstack.yml/service.md#retry) property accordingly: + -
+
```yaml type: service @@ -527,7 +529,6 @@ image: my-app:latest port: 80 retry: - # Retry on specific events on_events: [no-capacity, error, interruption] # Retry for up to 1 hour duration: 1h @@ -550,6 +551,8 @@ Sometimes it’s useful to track whether a service is fully utilizing all GPUs. Below is an example of a service that auto-terminate if any GPU stays below 10% utilization for 1 hour. + +
```yaml diff --git a/docs/docs/concepts/tasks.md b/docs/docs/concepts/tasks.md index bb10c1f749..d35dca31c9 100644 --- a/docs/docs/concepts/tasks.md +++ b/docs/docs/concepts/tasks.md @@ -10,30 +10,33 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` [//]: # (TODO: Make tabs - single machine & distributed tasks & web app) -
+
```yaml type: task # The name is optional, if not specified, generated randomly -name: axolotl-train +name: trl-sft + +python: 3.12 -# Using the official Axolotl's Docker image -image: winglian/axolotl-cloud:main-20240429-py3.11-cu121-2.2.1 +# Uncomment to use a custom Docker image +#image: huggingface/trl-latest-gpu -# Required environment variables env: - - HF_TOKEN - - WANDB_API_KEY -# Commands of the task + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + commands: - - accelerate launch -m axolotl.cli.train examples/single-node-training/axolotl/config.yaml + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE resources: - gpu: - # 24GB or more VRAM - memory: 24GB.. - # Two or more GPU - count: 2.. + # One to two H100 GPUs + gpu: H100:1..2 + shm_size: 24GB ```
@@ -43,16 +46,14 @@ To run a task, pass the configuration to [`dstack apply`](../reference/cli/dstac
```shell -$ HF_TOKEN=... -$ WANDB_API_KEY=... -$ dstack apply -f examples/.dstack.yml +$ dstack apply -f .dstack.yml # BACKEND REGION RESOURCES SPOT PRICE 1 runpod CA-MTL-1 18xCPU, 100GB, A5000:24GB:2 yes $0.22 2 runpod EU-SE-1 18xCPU, 100GB, A5000:24GB:2 yes $0.22 3 gcp us-west4 27xCPU, 150GB, A5000:24GB:3 yes $0.33 -Submit the run axolotl-train? [y/n]: y +Submit the run trl-sft? [y/n]: y -Launching `axolotl-train`... +Launching `trl-sft`... ---> 100% @@ -74,20 +75,17 @@ and runs the commands. A task can configure ports. In this case, if the task is running an application on a port, `dstack apply` will securely allow you to access this port from your local machine through port forwarding. -
+
```yaml type: task -# The name is optional, if not specified, generated randomly name: streamlit-hello python: 3.12 -# Commands of the task commands: - uv pip install streamlit - streamlit hello -# Expose the port to access the web app ports: - 8501 ``` @@ -108,7 +106,6 @@ However, you can run it on a cluster of nodes by specifying `nodes`. type: task name: train-distrib -# The size of the cluster nodes: 2 python: 3.12 @@ -129,8 +126,7 @@ commands: resources: gpu: 24GB:1..2 - # Uncomment if using multiple GPUs - #shm_size: 24GB + shm_size: 24GB ```
@@ -191,13 +187,20 @@ range (e.g. `24GB..`, or `24GB..80GB`, or `..80GB`). ```yaml type: task -# The name is optional, if not specified, generated randomly -name: train +name: trl-sft + +python: 3.12 + +env: + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb -# Commands of the task commands: - - uv pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE resources: # 16 or more x86_64 cores @@ -207,35 +210,31 @@ resources: # 4 GPUs from 40GB to 80GB gpu: 40GB..80GB:4 # Shared memory (required by multi-gpu) - shm_size: 16GB + shm_size: 24GB # Disk size disk: 500GB ```
-The `cpu` property also allows you to specify the CPU architecture, `x86` or `arm`. Examples: -`x86:16` (16 x86-64 cores), `arm:8..` (at least 8 ARM64 cores). -If the architecture is not specified, `dstack` tries to infer it from the `gpu` specification -using `x86` as the fallback value. +The `cpu` property lets you set the architecture (`x86` or `arm`) and core count — e.g., `x86:16` (16 x86 cores), `arm:8..` (at least 8 ARM cores). +If not set, `dstack` infers it from the GPU or defaults to `x86`. + +The `gpu` property lets you specify vendor, model, memory, and count — e.g., `nvidia` (one NVIDIA GPU), `A100` (one A100), `A10G,A100` (either), `A100:80GB` (one 80GB A100), `A100:2` (two A100), `24GB..40GB:2` (two GPUs with 24–40GB), `A100:40GB:2` (two 40GB A100s). -The `gpu` property allows specifying not only memory size but also GPU vendor, names -and their quantity. Examples: `nvidia` (one NVIDIA GPU), `A100` (one A100), `A10G,A100` (either A10G or A100), -`A100:80GB` (one A100 of 80GB), `A100:2` (two A100), `24GB..40GB:2` (two GPUs between 24GB and 40GB), -`A100:40GB:2` (two A100 GPUs of 40GB). -If the vendor is not specified, `dstack` tries to infer it from the GPU name using `nvidia` as the fallback value. +If vendor is omitted, `dstack` infers it from the model or defaults to `nvidia`. -??? info "Google Cloud TPU" + + ```yaml type: task - # The name is optional, if not specified, generated randomly name: train python: 3.12 - # Commands of the task commands: - pip install -r fine-tuning/qlora/requirements.txt - python fine-tuning/qlora/train.py @@ -244,128 +243,201 @@ If the vendor is not specified, `dstack` tries to infer it from the GPU name usi gpu: v2-8 ``` - Currently, only 8 TPU cores can be specified, supporting single TPU device workloads. Multi-TPU support is coming soon. + Currently, only 8 TPU cores can be specified, supporting single TPU device workloads. Multi-TPU support is coming soon. --> ??? 
info "Shared memory" If you are using parallel communicating processes (e.g., dataloaders in PyTorch), you may need to configure - `shm_size`, e.g. set it to `16GB`. + `shm_size`, e.g. set it to `24GB`. > If you’re unsure which offers (hardware configurations) are available from the configured backends, use the > [`dstack offer`](../reference/cli/dstack/offer.md#list-gpu-offers) command to list them. -### Python version + +### Docker + +#### Default image If you don't specify `image`, `dstack` uses its base Docker image pre-configured with -`python`, `pip`, `conda` (Miniforge), and essential CUDA drivers. -The `python` property determines which default Docker image is used. +`uv`, `python`, `pip`, essential CUDA drivers, and NCCL tests (under `/opt/nccl-tests/build`). + +Set the `python` property to pre-install a specific version of Python. -
+
```yaml type: task -# The name is optional, if not specified, generated randomly name: train -# If `image` is not specified, dstack uses its base image python: 3.12 -# Commands of the task +env: + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + +resources: + gpu: H100:1..2 + shm_size: 24GB ```
-??? info "nvcc" - By default, the base Docker image doesn’t include `nvcc`, which is required for building custom CUDA kernels. - If you need `nvcc`, set the corresponding property to true. +#### NVCC +By default, the base Docker image doesn’t include `nvcc`, which is required for building custom CUDA kernels. +If you need `nvcc`, set the [`nvcc`](../reference/dstack.yml/task.md#nvcc) property to true. - ```yaml - type: task - # The name is optional, if not specified, generated randomly - name: train +```yaml +type: task +name: train - # If `image` is not specified, dstack uses its base image - python: 3.12 - # Ensure nvcc is installed (req. for Flash Attention) - nvcc: true - - commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py - ``` +python: 3.12 +nvcc: true -### Docker +env: + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + +commands: + - uv pip install trl + - uv pip install flash_attn --no-build-isolation + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --attn_implementation=flash_attention_2 \ + --num_processes $DSTACK_GPUS_PER_NODE + +resources: + gpu: H100:1 +``` + +#### Custom image If you want, you can specify your own Docker image via `image`. + +
```yaml type: task -# The name is optional, if not specified, generated randomly -name: train +name: trl-sft + +image: huggingface/trl-latest-gpu + +env: + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb -# Any custom Docker image -image: dstackai/base:py3.13-0.7-cuda-12.1 +# if shell is not specified, `sh` is used for custom images +shell: bash -# Commands of the task commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py + - source activate trl + - | + trl sft --model_name_or_path $MODEL \ + --dataset_name $DATASET \ + --output_dir /output \ + --torch_dtype bfloat16 \ + --use_peft true + +resources: + gpu: H100:1 ```
-!!! info "Privileged mode" - To enable privileged mode, set [`privileged`](../reference/dstack.yml/task.md#privileged) to `true`. - This mode allows using [Docker and Docker Compose](../guides/protips.md#docker-and-docker-compose) inside `dstack` runs. +#### Docker in Docker - Not supported with `runpod`, `vastai`, and `kubernetes`. +Set `docker` to `true` to enable the `docker` CLI in your task, e.g., to run or build Docker images, or use Docker Compose. -??? info "Private registry" - Use the [`registry_auth`](../reference/dstack.yml/task.md#registry_auth) property to provide credentials for a private Docker registry. +
- ```yaml - type: dev-environment - # The name is optional, if not specified, generated randomly - name: train - - # Any private Docker image - image: dstackai/base:py3.13-0.7-cuda-12.1 - # Credentials of the private Docker registry - registry_auth: - username: peterschmidt85 - password: ghp_e49HcZ9oYwBzUbcSk2080gXZOU2hiT9AeSR5 +```yaml +type: task +name: docker-nvidia-smi + +docker: true + +commands: + - docker run --gpus all nvidia/cuda:12.3.0-base-ubuntu22.04 nvidia-smi + +resources: + gpu: 1 +``` + +
+ +Cannot be used with `python` or `image`. Not supported on `runpod`, `vastai`, or `kubernetes`. + +#### Privileged mode + +To enable privileged mode, set [`privileged`](../reference/dstack.yml/task.md#privileged) to `true`. + +Not supported with `runpod`, `vastai`, and `kubernetes`. + +#### Private registry - # Commands of the task - commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py - ``` +Use the [`registry_auth`](../reference/dstack.yml/task.md#registry_auth) property to provide credentials for a private Docker registry. + +```yaml +type: task +name: train + +env: + - NGC_API_KEY + +image: nvcr.io/nvidia/pytorch:25.05-py3 +registry_auth: + username: $oauthtoken + password: ${{ env.NGC_API_KEY }} + +commands: + - git clone https://github.com/pytorch/examples.git pytorch-examples + - cd pytorch-examples/distributed/ddp-tutorial-series + - pip install -r requirements.txt + - | + torchrun \ + --nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --nnodes=$DSTACK_NODES_NUM \ + multinode.py 50 10 + +resources: + gpu: H100:1..2 + shm_size: 24GB +``` ### Environment variables -
+
```yaml type: task -# The name is optional, if not specified, generated randomly -name: train +name: trl-sft python: 3.12 -# Environment variables env: - HF_TOKEN - HF_HUB_ENABLE_HF_TRANSFER=1 + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb -# Commands of the task commands: - - pip install -r fine-tuning/qlora/requirements.txt - - python fine-tuning/qlora/train.py + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + +resources: + gpu: H100:1 ```
@@ -397,6 +469,8 @@ the run will fail. If you'd like `dstack` to automatically retry, configure the [retry](../reference/dstack.yml/task.md#retry) property accordingly: + +
```yaml @@ -410,7 +484,6 @@ commands: - python fine-tuning/qlora/train.py retry: - # Retry on specific events on_events: [no-capacity, error, interruption] # Retry for up to 1 hour duration: 1h @@ -427,6 +500,8 @@ Be default, submitted runs are scheduled in the order they were submitted. When compute resources are limited, you may want to prioritize some runs over others. This can be done by specifying the [`priority`](../reference/dstack.yml/task.md) property in the run configuration: + +
```yaml @@ -455,6 +530,8 @@ Sometimes it’s useful to track whether a task is fully utilizing all GPUs. Whi -Below is an example of a task that auto-terminate if any GPU stays below 10% utilization for 1 hour. +Below is an example of a task that auto-terminates if any GPU stays below 10% utilization for 1 hour. + +
```yaml diff --git a/docs/docs/guides/protips.md b/docs/docs/guides/protips.md index a9aed7af2f..f4f35e27a7 100644 --- a/docs/docs/guides/protips.md +++ b/docs/docs/guides/protips.md @@ -111,33 +111,22 @@ utilization_policy:
-## Docker and Docker Compose +## Docker in Docker -All backends except `runpod`, `vastai`, and `kubernetes` allow using Docker and Docker Compose -inside `dstack` runs. To do that, additional configuration steps are required: - -1. Set the `privileged` property to `true`. -2. Set the `image` property to `dstackai/dind` (or another DinD image). -3. For tasks and services, add `start-dockerd` as the first command. For dev environments, add `start-dockerd` as the first command - in the `init` property. - -Note, `start-dockerd` is a part of `dstackai/dind` image, if you use a different DinD image, -replace it with a corresponding command to start Docker daemon. +Set `docker` to `true` to enable the `docker` CLI in your dev environment, e.g., to run or build Docker images, or use Docker Compose. === "Dev environment"
```yaml type: dev-environment - name: vscode-dind + name: vscode - privileged: true - image: dstackai/dind + docker: true ide: vscode - init: - - start-dockerd + - docker run --gpus all nvidia/cuda:12.3.0-base-ubuntu22.04 nvidia-smi ```
@@ -147,14 +136,15 @@ replace it with a corresponding command to start Docker daemon. ```yaml type: task - name: task-dind + name: docker-nvidia-smi - privileged: true - image: dstackai/dind + docker: true commands: - - start-dockerd - - docker compose up + - docker run --gpus all nvidia/cuda:12.3.0-base-ubuntu22.04 nvidia-smi + + resources: + gpu: 1 ```
@@ -162,23 +152,36 @@ replace it with a corresponding command to start Docker daemon. ??? info "Volumes" To persist Docker data between runs (e.g. images, containers, volumes, etc), create a `dstack` [volume](../concepts/volumes.md) - and add attach it in your run configuration: - - ```yaml - type: dev-environment - name: vscode-dind - - privileged: true - image: dstackai/dind - ide: vscode - - init: - - start-dockerd + and attach it in your run configuration. + + === "Network volumes" - volumes: - - name: docker-volume - path: /var/lib/docker - ``` + ```yaml + type: dev-environment + name: vscode + + docker: true + ide: vscode + + volumes: + - name: docker-volume + path: /var/lib/docker + ``` + + === "Instance volumes" + + ```yaml + type: dev-environment + name: vscode + + docker: true + ide: vscode + + volumes: + - name: /docker-volume + path: /var/lib/docker + optional: true + ``` See more Docker examples [here](https://github.com/dstackai/dstack/tree/master/examples/misc/docker-compose). @@ -296,6 +299,8 @@ By default, if `dstack` can't find available capacity, the run will fail. If you'd like `dstack` to automatically retry, configure the [retry](../reference/dstack.yml/task.md#retry) property accordingly: + +
```yaml @@ -358,6 +363,8 @@ The general format is: `::: + Ranges can be: * **Closed** (e.g. `24GB..80GB` or `1..8`) @@ -399,7 +406,7 @@ If you're not sure which offers (hardware configurations) are available with the
```shell -$ dstack offer --gpu H100:1.. --max-offers 10 +$ dstack offer --gpu H100 --max-offers 10 Getting offers... ---> 100% @@ -481,5 +488,3 @@ corresponding service quotas for each type of instance in each region. Note, for AWS, GCP, and Azure, service quota values are measured with the number of CPUs rather than GPUs. [//]: # (TODO: Mention spot policy) - -[//]: # (TODO: Mention retry policy) diff --git a/examples/misc/docker-compose/.dstack.yml b/examples/misc/docker-compose/.dstack.yml index f7ee087a45..2cf006bbf2 100644 --- a/examples/misc/docker-compose/.dstack.yml +++ b/examples/misc/docker-compose/.dstack.yml @@ -1,14 +1,11 @@ type: dev-environment -name: vscode-dind +name: vscode-docker -privileged: true -image: dstackai/dind +docker: true env: - MODEL_ID=meta-llama/Llama-3.2-3B-Instruct - HF_TOKEN ide: vscode -init: - - start-dockerd # Uncomment to leverage spot instances #spot_policy: auto diff --git a/examples/misc/docker-compose/README.md b/examples/misc/docker-compose/README.md index a4e199ad64..dc2b4dec31 100644 --- a/examples/misc/docker-compose/README.md +++ b/examples/misc/docker-compose/README.md @@ -32,23 +32,17 @@ using [Docker Compose :material-arrow-top-right-thin:{ .external }](https://docs type: task name: chat-ui-task - privileged: true - image: dstackai/dind + docker: true env: - MODEL_ID=meta-llama/Llama-3.2-3B-Instruct - HF_TOKEN working_dir: examples/misc/docker-compose commands: - - start-dockerd - docker compose up ports: - 9000 - # Uncomment to leverage spot instances - #spot_policy: auto - resources: - # Required resources gpu: "nvidia:24GB" ``` diff --git a/examples/misc/docker-compose/compose.yaml b/examples/misc/docker-compose/compose.yaml index ef79f8eaa1..c5c843667c 100644 --- a/examples/misc/docker-compose/compose.yaml +++ b/examples/misc/docker-compose/compose.yaml @@ -1,6 +1,6 @@ services: app: - image: ghcr.io/huggingface/chat-ui:sha-c83861a + image: ghcr.io/huggingface/chat-ui-db:0.9.5 environment: HF_TOKEN: 
${HF_TOKEN?} MONGODB_URL: mongodb://db:27017 @@ -16,7 +16,7 @@ services: - db tgi: - image: ghcr.io/huggingface/text-generation-inference:sha-704a58c + image: ghcr.io/huggingface/text-generation-inference:3.3.4 volumes: - tgi_data:/data environment: diff --git a/examples/misc/docker-compose/service.dstack.yml b/examples/misc/docker-compose/service.dstack.yml index 38dd78b109..7234ce1b64 100644 --- a/examples/misc/docker-compose/service.dstack.yml +++ b/examples/misc/docker-compose/service.dstack.yml @@ -1,14 +1,12 @@ type: service name: chat-ui-service -privileged: true -image: dstackai/dind +docker: true env: - MODEL_ID=meta-llama/Llama-3.2-3B-Instruct - HF_TOKEN working_dir: examples/misc/docker-compose commands: - - start-dockerd - docker compose up port: 9000 auth: false @@ -18,9 +16,10 @@ auth: false resources: # Required resources - gpu: "nvidia:24GB" + gpu: 1 -# Uncomment to persist data -#volumes: -# - name: my-dind-volume -# path: /var/lib/docker +# Cache the Docker data +volumes: + - instance_path: /root/.cache/docker-data + path: /var/lib/docker + optional: true diff --git a/examples/misc/docker-compose/task.dstack.yml b/examples/misc/docker-compose/task.dstack.yml index 58004686ab..148b6a11dc 100644 --- a/examples/misc/docker-compose/task.dstack.yml +++ b/examples/misc/docker-compose/task.dstack.yml @@ -1,14 +1,12 @@ type: task name: chat-ui-task -privileged: true -image: dstackai/dind +docker: true env: - MODEL_ID=meta-llama/Llama-3.2-3B-Instruct - HF_TOKEN working_dir: examples/misc/docker-compose commands: - - start-dockerd - docker compose up ports: - 9000 @@ -17,10 +15,10 @@ ports: spot_policy: auto resources: - # Required resources - gpu: "nvidia:24GB" + gpu: 1 -# Uncomment to persist data -#volumes: -# - name: my-dind-volume -# path: /var/lib/docker +# Cache the Docker data +volumes: + - instance_path: /root/.cache/docker-data + path: /var/lib/docker + optional: true diff --git a/src/dstack/_internal/core/compatibility/runs.py 
b/src/dstack/_internal/core/compatibility/runs.py index 385f9bd8fa..97f90c8d2e 100644 --- a/src/dstack/_internal/core/compatibility/runs.py +++ b/src/dstack/_internal/core/compatibility/runs.py @@ -97,6 +97,8 @@ def get_run_spec_excludes(run_spec: RunSpec) -> Optional[Dict]: configuration_excludes["rate_limits"] = True if configuration.shell is None: configuration_excludes["shell"] = True + if configuration.docker is None: + configuration_excludes["docker"] = True if configuration.priority is None: configuration_excludes["priority"] = True if configuration.startup_order is None: diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index bc6ba3235c..92ae999ba0 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -194,12 +194,14 @@ class BaseRunConfiguration(CoreModel): ] = None python: Annotated[ Optional[PythonVersion], - Field(description="The major version of Python. Mutually exclusive with `image`"), + Field( + description="The major version of Python. Mutually exclusive with `image` and `docker`" + ), ] = None nvcc: Annotated[ Optional[bool], Field( - description="Use image with NVIDIA CUDA Compiler (NVCC) included. Mutually exclusive with `image`" + description="Use image with NVIDIA CUDA Compiler (NVCC) included. Mutually exclusive with `image` and `docker`" ), ] = None single_branch: Annotated[ @@ -244,6 +246,12 @@ class BaseRunConfiguration(CoreModel): volumes: Annotated[ List[Union[MountPoint, str]], Field(description="The volumes mount points") ] = [] + docker: Annotated[ + Optional[bool], + Field( + description="Use Docker inside the container. Mutually exclusive with `image`, `python`, and `nvcc`. 
Overrides `privileged`" + ), + ] = None # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init` setup: CommandsList = [] @@ -259,6 +267,18 @@ def convert_python(cls, v, values) -> Optional[PythonVersion]: return PythonVersion(v) return v + @validator("docker", pre=True, always=True) + def _docker(cls, v, values) -> Optional[bool]: + if v is True and values.get("image"): + raise KeyError("`image` and `docker` are mutually exclusive fields") + if v is True and values.get("python"): + raise KeyError("`python` and `docker` are mutually exclusive fields") + if v is True and values.get("nvcc"): + raise KeyError("`nvcc` and `docker` are mutually exclusive fields") + # Ideally, we'd like to also prohibit privileged=False when docker=True, + # but it's not possible to do so without breaking backwards compatibility. + return v + @validator("volumes", each_item=True) def convert_volumes(cls, v) -> MountPoint: if isinstance(v, str): diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py index 6ef0ca7712..465a24fb0d 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/base.py +++ b/src/dstack/_internal/server/services/jobs/configurators/base.py @@ -171,6 +171,8 @@ async def _commands(self) -> List[str]: return result def _dstack_image_commands(self) -> List[str]: + if self.run_spec.configuration.docker is True: + return ["start-dockerd"] if ( self.run_spec.configuration.image is not None or self.run_spec.configuration.entrypoint is not None @@ -201,7 +203,9 @@ def _home_dir(self) -> Optional[str]: return self.run_spec.configuration.home_dir def _image_name(self) -> str: - if self.run_spec.configuration.image is not None: + if self.run_spec.configuration.docker is True: + return settings.DSTACK_DIND_IMAGE + elif self.run_spec.configuration.image is not None: return self.run_spec.configuration.image return 
get_default_image(nvcc=bool(self.run_spec.configuration.nvcc)) @@ -215,6 +219,8 @@ async def _user(self) -> Optional[UnixUser]: return UnixUser.parse(user) def _privileged(self) -> bool: + if self.run_spec.configuration.docker is True: + return True return self.run_spec.configuration.privileged def _single_branch(self) -> bool: diff --git a/src/dstack/_internal/settings.py b/src/dstack/_internal/settings.py index 2636a3b362..52fd008001 100644 --- a/src/dstack/_internal/settings.py +++ b/src/dstack/_internal/settings.py @@ -17,6 +17,7 @@ DSTACK_BASE_IMAGE_UBUNTU_VERSION = os.getenv( "DSTACK_BASE_IMAGE_UBUNTU_VERSION", version.base_image_ubuntu_version ) +DSTACK_DIND_IMAGE = os.getenv("DSTACK_DIND_IMAGE", "dstackai/dind") class FeatureFlags: diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 65c4fcd131..c2e50794a6 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -77,8 +77,52 @@ def get_dev_env_run_plan_dict( action: ApplyAction = ApplyAction.CREATE, current_resource: Optional[Run] = None, privileged: bool = False, + docker: bool = False, volumes: List[MountPoint] = [], ) -> Dict: + # When docker=True, commands should start with start-dockerd + if docker: + commands = [ + "/bin/bash", + "-i", + "-c", + "start-dockerd && (echo pip install ipykernel... && " + "pip install -q --no-cache-dir " + 'ipykernel 2> /dev/null) || echo "no ' + 'pip, ipykernel was not installed" ' + "&& echo '' && echo To open in VS " + "Code Desktop, use link below: && " + "echo '' && echo ' " + "vscode://vscode-remote/ssh-remote+dry-run/workflow' " + "&& echo '' && echo 'To connect via " + "SSH, use: `ssh dry-run`' && echo '' " + "&& echo -n 'To exit, press Ctrl+C.' 
" + "&& tail -f /dev/null", + ] + image_name = "dstackai/dind" + else: + commands = [ + "/bin/bash", + "-i", + "-c", + "uv venv --python 3.13 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" + " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc" + " && source /workflow/.venv/bin/activate" + " && (echo pip install ipykernel... && " + "pip install -q --no-cache-dir " + 'ipykernel 2> /dev/null) || echo "no ' + 'pip, ipykernel was not installed" ' + "&& echo '' && echo To open in VS " + "Code Desktop, use link below: && " + "echo '' && echo ' " + "vscode://vscode-remote/ssh-remote+dry-run/workflow' " + "&& echo '' && echo 'To connect via " + "SSH, use: `ssh dry-run`' && echo '' " + "&& echo -n 'To exit, press Ctrl+C.' " + "&& tail -f /dev/null", + ] + image_name = "dstackai/base:0.10-base-ubuntu22.04" + run_spec = { "configuration": { "entrypoint": None, @@ -90,11 +134,12 @@ def get_dev_env_run_plan_dict( "version": None, "image": None, "user": None, + "docker": docker, "shell": None, "privileged": privileged, "init": [], "ports": [], - "python": "3.13", + "python": "3.13" if not docker else None, "nvcc": None, "registry_auth": None, "setup": [], @@ -166,31 +211,12 @@ def get_dev_env_run_plan_dict( { "job_spec": { "app_specs": [], - "commands": [ - "/bin/bash", - "-i", - "-c", - "uv venv --python 3.13 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" - " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc" - " && source /workflow/.venv/bin/activate" - " && (echo pip install ipykernel... && " - "pip install -q --no-cache-dir " - 'ipykernel 2> /dev/null) || echo "no ' - 'pip, ipykernel was not installed" ' - "&& echo '' && echo To open in VS " - "Code Desktop, use link below: && " - "echo '' && echo ' " - "vscode://vscode-remote/ssh-remote+dry-run/workflow' " - "&& echo '' && echo 'To connect via " - "SSH, use: `ssh dry-run`' && echo '' " - "&& echo -n 'To exit, press Ctrl+C.' 
" - "&& tail -f /dev/null", - ], + "commands": commands, "env": {}, "home_dir": "/root", - "image_name": "dstackai/base:0.10-base-ubuntu22.04", + "image_name": image_name, "user": None, - "privileged": privileged, + "privileged": True if docker else privileged, "job_name": f"{run_name}-0-0", "replica_num": 0, "job_num": 0, @@ -223,7 +249,7 @@ def get_dev_env_run_plan_dict( } ], "current_resource": current_resource.dict() if current_resource else None, - "action": action, + "action": action.value, } @@ -238,8 +264,52 @@ def get_dev_env_run_dict( last_processed_at: str = "2023-01-02T03:04:00+00:00", finished_at: Optional[str] = "2023-01-02T03:04:00+00:00", privileged: bool = False, + docker: Optional[bool] = None, deleted: bool = False, ) -> Dict: + # When docker=True, commands should start with start-dockerd and use dind image + if docker: + commands = [ + "/bin/bash", + "-i", + "-c", + "start-dockerd && (echo pip install ipykernel... && " + "pip install -q --no-cache-dir " + 'ipykernel 2> /dev/null) || echo "no ' + 'pip, ipykernel was not installed" ' + "&& echo '' && echo To open in VS " + "Code Desktop, use link below: && " + "echo '' && echo ' " + "vscode://vscode-remote/ssh-remote+test-run/workflow' " + "&& echo '' && echo 'To connect via " + "SSH, use: `ssh test-run`' && echo '' " + "&& echo -n 'To exit, press Ctrl+C.' " + "&& tail -f /dev/null", + ] + image_name = "dstackai/dind" + else: + commands = [ + "/bin/bash", + "-i", + "-c", + "uv venv --python 3.13 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" + " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc" + " && source /workflow/.venv/bin/activate" + " && (echo pip install ipykernel... 
&& " + "pip install -q --no-cache-dir " + 'ipykernel 2> /dev/null) || echo "no ' + 'pip, ipykernel was not installed" ' + "&& echo '' && echo To open in VS " + "Code Desktop, use link below: && " + "echo '' && echo ' " + "vscode://vscode-remote/ssh-remote+test-run/workflow' " + "&& echo '' && echo 'To connect via " + "SSH, use: `ssh test-run`' && echo '' " + "&& echo -n 'To exit, press Ctrl+C.' " + "&& tail -f /dev/null", + ] + image_name = "dstackai/base:0.10-base-ubuntu22.04" + return { "id": run_id, "project_name": project_name, @@ -259,11 +329,12 @@ def get_dev_env_run_dict( "version": None, "image": None, "user": None, + "docker": docker, "shell": None, "privileged": privileged, "init": [], "ports": [], - "python": "3.13", + "python": "3.13" if not docker else None, "nvcc": None, "registry_auth": None, "setup": [], @@ -330,31 +401,12 @@ def get_dev_env_run_dict( { "job_spec": { "app_specs": [], - "commands": [ - "/bin/bash", - "-i", - "-c", - "uv venv --python 3.13 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1" - " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc" - " && source /workflow/.venv/bin/activate" - " && (echo pip install ipykernel... && " - "pip install -q --no-cache-dir " - 'ipykernel 2> /dev/null) || echo "no ' - 'pip, ipykernel was not installed" ' - "&& echo '' && echo To open in VS " - "Code Desktop, use link below: && " - "echo '' && echo ' " - "vscode://vscode-remote/ssh-remote+test-run/workflow' " - "&& echo '' && echo 'To connect via " - "SSH, use: `ssh test-run`' && echo '' " - "&& echo -n 'To exit, press Ctrl+C.' 
" - "&& tail -f /dev/null", - ], + "commands": commands, "env": {}, "home_dir": "/root", - "image_name": "dstackai/base:0.10-base-ubuntu22.04", + "image_name": image_name, "user": None, - "privileged": privileged, + "privileged": True if docker else privileged, "job_name": f"{run_name}-0-0", "replica_num": 0, "job_num": 0, @@ -740,10 +792,10 @@ async def test_returns_403_if_not_project_member( assert response.status_code == 403 @pytest.mark.asyncio - @pytest.mark.parametrize("privileged", [None, False]) + @pytest.mark.parametrize("privileged", [False]) @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_returns_run_plan_privileged_false( - self, test_db, session: AsyncSession, client: AsyncClient, privileged: Optional[bool] + self, test_db, session: AsyncSession, client: AsyncClient, privileged: bool ): user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user) @@ -778,7 +830,7 @@ async def test_returns_run_plan_privileged_false( offers=[offer_aws, offer_runpod], total_offers=2, max_price=2.0, - privileged=False, + privileged=privileged, ) run_spec = copy.deepcopy(run_plan_dict["run_spec"]) if privileged is None: @@ -864,6 +916,68 @@ async def test_returns_run_plan_privileged_true( assert response.status_code == 200, response.json() assert response.json() == run_plan_dict + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_run_plan_docker_true( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + offer_aws = InstanceOfferWithAvailability( + 
backend=BackendType.AWS, + instance=InstanceType( + name="instance", + resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + ) + offer_runpod = InstanceOfferWithAvailability( + backend=BackendType.RUNPOD, + instance=InstanceType( + name="instance", + resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), + ), + region="us", + price=2.0, + availability=InstanceAvailability.AVAILABLE, + ) + run_plan_dict = get_dev_env_run_plan_dict( + project_name=project.name, + username=user.name, + repo_id=repo.name, + offers=[offer_aws], + total_offers=1, + max_price=1.0, + docker=True, + ) + body = {"run_spec": run_plan_dict["run_spec"]} + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers_cached.return_value = [offer_aws] + backend_mock_runpod = Mock() + backend_mock_runpod.TYPE = BackendType.RUNPOD + backend_mock_runpod.compute.return_value.get_offers_cached.return_value = [ + offer_runpod + ] + m.return_value = [backend_mock_aws, backend_mock_runpod] + response = await client.post( + f"/api/project/{project.name}/runs/get_plan", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + assert response.json() == run_plan_dict + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_returns_run_plan_instance_volumes( @@ -927,7 +1041,6 @@ async def test_returns_run_plan_instance_volumes( assert response.json() == run_plan_dict @pytest.mark.asyncio - @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @pytest.mark.parametrize( ("old_conf", "new_conf", "action"), [ @@ -1204,6 +1317,55 @@ async def test_submits_run( job = res.scalar() assert job is not None + @pytest.mark.asyncio + 
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_submits_run_docker_true( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + run_id = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") + submitted_at = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + submitted_at_formatted = "2023-01-02T03:04:00+00:00" + last_processed_at_formatted = submitted_at_formatted + repo = await create_repo(session=session, project_id=project.id) + run_dict = get_dev_env_run_dict( + run_id=str(run_id), + job_id=str(run_id), + project_name=project.name, + username=user.name, + submitted_at=submitted_at_formatted, + last_processed_at=last_processed_at_formatted, + finished_at=None, + run_name="test-run", + repo_id=repo.name, + docker=True, + privileged=True, # docker=True automatically enables privileged mode + ) + body = {"run_spec": run_dict["run_spec"]} + with ( + patch("uuid.uuid4") as uuid_mock, + patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock, + ): + uuid_mock.return_value = run_id + datetime_mock.return_value = submitted_at + response = await client.post( + f"/api/project/{project.name}/runs/submit", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 200, response.json() + assert response.json() == run_dict + res = await session.execute(select(RunModel)) + run = res.scalar() + assert run is not None + res = await session.execute(select(JobModel)) + job = res.scalar() + assert job is not None + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_submits_run_without_run_name(