+
Ranges can be:
* **Closed** (e.g. `24GB..80GB` or `1..8`)
@@ -399,7 +406,7 @@ If you're not sure which offers (hardware configurations) are available with the
```shell
-$ dstack offer --gpu H100:1.. --max-offers 10
+$ dstack offer --gpu H100 --max-offers 10
Getting offers...
---> 100%
@@ -481,5 +488,3 @@ corresponding service quotas for each type of instance in each region.
Note, for AWS, GCP, and Azure, service quota values are measured with the number of CPUs rather than GPUs.
[//]: # (TODO: Mention spot policy)
-
-[//]: # (TODO: Mention retry policy)
diff --git a/examples/misc/docker-compose/.dstack.yml b/examples/misc/docker-compose/.dstack.yml
index f7ee087a45..2cf006bbf2 100644
--- a/examples/misc/docker-compose/.dstack.yml
+++ b/examples/misc/docker-compose/.dstack.yml
@@ -1,14 +1,11 @@
type: dev-environment
-name: vscode-dind
+name: vscode-docker
-privileged: true
-image: dstackai/dind
+docker: true
env:
- MODEL_ID=meta-llama/Llama-3.2-3B-Instruct
- HF_TOKEN
ide: vscode
-init:
- - start-dockerd
# Uncomment to leverage spot instances
#spot_policy: auto
diff --git a/examples/misc/docker-compose/README.md b/examples/misc/docker-compose/README.md
index a4e199ad64..dc2b4dec31 100644
--- a/examples/misc/docker-compose/README.md
+++ b/examples/misc/docker-compose/README.md
@@ -32,23 +32,17 @@ using [Docker Compose :material-arrow-top-right-thin:{ .external }](https://docs
type: task
name: chat-ui-task
- privileged: true
- image: dstackai/dind
+ docker: true
env:
- MODEL_ID=meta-llama/Llama-3.2-3B-Instruct
- HF_TOKEN
working_dir: examples/misc/docker-compose
commands:
- - start-dockerd
- docker compose up
ports:
- 9000
- # Uncomment to leverage spot instances
- #spot_policy: auto
-
resources:
- # Required resources
gpu: "nvidia:24GB"
```
diff --git a/examples/misc/docker-compose/compose.yaml b/examples/misc/docker-compose/compose.yaml
index ef79f8eaa1..c5c843667c 100644
--- a/examples/misc/docker-compose/compose.yaml
+++ b/examples/misc/docker-compose/compose.yaml
@@ -1,6 +1,6 @@
services:
app:
- image: ghcr.io/huggingface/chat-ui:sha-c83861a
+ image: ghcr.io/huggingface/chat-ui-db:0.9.5
environment:
HF_TOKEN: ${HF_TOKEN?}
MONGODB_URL: mongodb://db:27017
@@ -16,7 +16,7 @@ services:
- db
tgi:
- image: ghcr.io/huggingface/text-generation-inference:sha-704a58c
+ image: ghcr.io/huggingface/text-generation-inference:3.3.4
volumes:
- tgi_data:/data
environment:
diff --git a/examples/misc/docker-compose/service.dstack.yml b/examples/misc/docker-compose/service.dstack.yml
index 38dd78b109..7234ce1b64 100644
--- a/examples/misc/docker-compose/service.dstack.yml
+++ b/examples/misc/docker-compose/service.dstack.yml
@@ -1,14 +1,12 @@
type: service
name: chat-ui-service
-privileged: true
-image: dstackai/dind
+docker: true
env:
- MODEL_ID=meta-llama/Llama-3.2-3B-Instruct
- HF_TOKEN
working_dir: examples/misc/docker-compose
commands:
- - start-dockerd
- docker compose up
port: 9000
auth: false
@@ -18,9 +16,10 @@ auth: false
resources:
# Required resources
- gpu: "nvidia:24GB"
+ gpu: 1
-# Uncomment to persist data
-#volumes:
-# - name: my-dind-volume
-# path: /var/lib/docker
+# Cache the Docker data
+volumes:
+ - instance_path: /root/.cache/docker-data
+ path: /var/lib/docker
+ optional: true
diff --git a/examples/misc/docker-compose/task.dstack.yml b/examples/misc/docker-compose/task.dstack.yml
index 58004686ab..148b6a11dc 100644
--- a/examples/misc/docker-compose/task.dstack.yml
+++ b/examples/misc/docker-compose/task.dstack.yml
@@ -1,14 +1,12 @@
type: task
name: chat-ui-task
-privileged: true
-image: dstackai/dind
+docker: true
env:
- MODEL_ID=meta-llama/Llama-3.2-3B-Instruct
- HF_TOKEN
working_dir: examples/misc/docker-compose
commands:
- - start-dockerd
- docker compose up
ports:
- 9000
@@ -17,10 +15,10 @@ ports:
spot_policy: auto
resources:
- # Required resources
- gpu: "nvidia:24GB"
+ gpu: 1
-# Uncomment to persist data
-#volumes:
-# - name: my-dind-volume
-# path: /var/lib/docker
+# Cache the Docker data
+volumes:
+ - instance_path: /root/.cache/docker-data
+ path: /var/lib/docker
+ optional: true
diff --git a/src/dstack/_internal/core/compatibility/runs.py b/src/dstack/_internal/core/compatibility/runs.py
index 385f9bd8fa..97f90c8d2e 100644
--- a/src/dstack/_internal/core/compatibility/runs.py
+++ b/src/dstack/_internal/core/compatibility/runs.py
@@ -97,6 +97,8 @@ def get_run_spec_excludes(run_spec: RunSpec) -> Optional[Dict]:
configuration_excludes["rate_limits"] = True
if configuration.shell is None:
configuration_excludes["shell"] = True
+ if configuration.docker is None:
+ configuration_excludes["docker"] = True
if configuration.priority is None:
configuration_excludes["priority"] = True
if configuration.startup_order is None:
diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py
index bc6ba3235c..92ae999ba0 100644
--- a/src/dstack/_internal/core/models/configurations.py
+++ b/src/dstack/_internal/core/models/configurations.py
@@ -194,12 +194,14 @@ class BaseRunConfiguration(CoreModel):
] = None
python: Annotated[
Optional[PythonVersion],
- Field(description="The major version of Python. Mutually exclusive with `image`"),
+ Field(
+ description="The major version of Python. Mutually exclusive with `image` and `docker`"
+ ),
] = None
nvcc: Annotated[
Optional[bool],
Field(
- description="Use image with NVIDIA CUDA Compiler (NVCC) included. Mutually exclusive with `image`"
+ description="Use image with NVIDIA CUDA Compiler (NVCC) included. Mutually exclusive with `image` and `docker`"
),
] = None
single_branch: Annotated[
@@ -244,6 +246,12 @@ class BaseRunConfiguration(CoreModel):
volumes: Annotated[
List[Union[MountPoint, str]], Field(description="The volumes mount points")
] = []
+ docker: Annotated[
+ Optional[bool],
+ Field(
+ description="Use Docker inside the container. Mutually exclusive with `image`, `python`, and `nvcc`. Overrides `privileged`"
+ ),
+ ] = None
# deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
setup: CommandsList = []
@@ -259,6 +267,18 @@ def convert_python(cls, v, values) -> Optional[PythonVersion]:
return PythonVersion(v)
return v
+ @validator("docker", pre=True, always=True)
+ def _docker(cls, v, values) -> Optional[bool]:
+ if v is True and values.get("image"):
+ raise KeyError("`image` and `docker` are mutually exclusive fields")
+ if v is True and values.get("python"):
+ raise KeyError("`python` and `docker` are mutually exclusive fields")
+ if v is True and values.get("nvcc"):
+ raise KeyError("`nvcc` and `docker` are mutually exclusive fields")
+ # Ideally, we'd like to also prohibit privileged=False when docker=True,
+ # but it's not possible to do so without breaking backwards compatibility.
+ return v
+
@validator("volumes", each_item=True)
def convert_volumes(cls, v) -> MountPoint:
if isinstance(v, str):
diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py
index 6ef0ca7712..465a24fb0d 100644
--- a/src/dstack/_internal/server/services/jobs/configurators/base.py
+++ b/src/dstack/_internal/server/services/jobs/configurators/base.py
@@ -171,6 +171,8 @@ async def _commands(self) -> List[str]:
return result
def _dstack_image_commands(self) -> List[str]:
+ if self.run_spec.configuration.docker is True:
+ return ["start-dockerd"]
if (
self.run_spec.configuration.image is not None
or self.run_spec.configuration.entrypoint is not None
@@ -201,7 +203,9 @@ def _home_dir(self) -> Optional[str]:
return self.run_spec.configuration.home_dir
def _image_name(self) -> str:
- if self.run_spec.configuration.image is not None:
+ if self.run_spec.configuration.docker is True:
+ return settings.DSTACK_DIND_IMAGE
+ elif self.run_spec.configuration.image is not None:
return self.run_spec.configuration.image
return get_default_image(nvcc=bool(self.run_spec.configuration.nvcc))
@@ -215,6 +219,8 @@ async def _user(self) -> Optional[UnixUser]:
return UnixUser.parse(user)
def _privileged(self) -> bool:
+ if self.run_spec.configuration.docker is True:
+ return True
return self.run_spec.configuration.privileged
def _single_branch(self) -> bool:
diff --git a/src/dstack/_internal/settings.py b/src/dstack/_internal/settings.py
index 2636a3b362..52fd008001 100644
--- a/src/dstack/_internal/settings.py
+++ b/src/dstack/_internal/settings.py
@@ -17,6 +17,7 @@
DSTACK_BASE_IMAGE_UBUNTU_VERSION = os.getenv(
"DSTACK_BASE_IMAGE_UBUNTU_VERSION", version.base_image_ubuntu_version
)
+DSTACK_DIND_IMAGE = os.getenv("DSTACK_DIND_IMAGE", "dstackai/dind")
class FeatureFlags:
diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py
index 65c4fcd131..c2e50794a6 100644
--- a/src/tests/_internal/server/routers/test_runs.py
+++ b/src/tests/_internal/server/routers/test_runs.py
@@ -77,8 +77,52 @@ def get_dev_env_run_plan_dict(
action: ApplyAction = ApplyAction.CREATE,
current_resource: Optional[Run] = None,
privileged: bool = False,
+ docker: bool = False,
volumes: List[MountPoint] = [],
) -> Dict:
+ # When docker=True, commands should start with start-dockerd and use dind image
+ if docker:
+ commands = [
+ "/bin/bash",
+ "-i",
+ "-c",
+ "start-dockerd && (echo pip install ipykernel... && "
+ "pip install -q --no-cache-dir "
+ 'ipykernel 2> /dev/null) || echo "no '
+ 'pip, ipykernel was not installed" '
+ "&& echo '' && echo To open in VS "
+ "Code Desktop, use link below: && "
+ "echo '' && echo ' "
+ "vscode://vscode-remote/ssh-remote+dry-run/workflow' "
+ "&& echo '' && echo 'To connect via "
+ "SSH, use: `ssh dry-run`' && echo '' "
+ "&& echo -n 'To exit, press Ctrl+C.' "
+ "&& tail -f /dev/null",
+ ]
+ image_name = "dstackai/dind"
+ else:
+ commands = [
+ "/bin/bash",
+ "-i",
+ "-c",
+ "uv venv --python 3.13 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1"
+ " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc"
+ " && source /workflow/.venv/bin/activate"
+ " && (echo pip install ipykernel... && "
+ "pip install -q --no-cache-dir "
+ 'ipykernel 2> /dev/null) || echo "no '
+ 'pip, ipykernel was not installed" '
+ "&& echo '' && echo To open in VS "
+ "Code Desktop, use link below: && "
+ "echo '' && echo ' "
+ "vscode://vscode-remote/ssh-remote+dry-run/workflow' "
+ "&& echo '' && echo 'To connect via "
+ "SSH, use: `ssh dry-run`' && echo '' "
+ "&& echo -n 'To exit, press Ctrl+C.' "
+ "&& tail -f /dev/null",
+ ]
+ image_name = "dstackai/base:0.10-base-ubuntu22.04"
+
run_spec = {
"configuration": {
"entrypoint": None,
@@ -90,11 +134,12 @@ def get_dev_env_run_plan_dict(
"version": None,
"image": None,
"user": None,
+ "docker": docker,
"shell": None,
"privileged": privileged,
"init": [],
"ports": [],
- "python": "3.13",
+ "python": "3.13" if not docker else None,
"nvcc": None,
"registry_auth": None,
"setup": [],
@@ -166,31 +211,12 @@ def get_dev_env_run_plan_dict(
{
"job_spec": {
"app_specs": [],
- "commands": [
- "/bin/bash",
- "-i",
- "-c",
- "uv venv --python 3.13 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1"
- " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc"
- " && source /workflow/.venv/bin/activate"
- " && (echo pip install ipykernel... && "
- "pip install -q --no-cache-dir "
- 'ipykernel 2> /dev/null) || echo "no '
- 'pip, ipykernel was not installed" '
- "&& echo '' && echo To open in VS "
- "Code Desktop, use link below: && "
- "echo '' && echo ' "
- "vscode://vscode-remote/ssh-remote+dry-run/workflow' "
- "&& echo '' && echo 'To connect via "
- "SSH, use: `ssh dry-run`' && echo '' "
- "&& echo -n 'To exit, press Ctrl+C.' "
- "&& tail -f /dev/null",
- ],
+ "commands": commands,
"env": {},
"home_dir": "/root",
- "image_name": "dstackai/base:0.10-base-ubuntu22.04",
+ "image_name": image_name,
"user": None,
- "privileged": privileged,
+ "privileged": True if docker else privileged,
"job_name": f"{run_name}-0-0",
"replica_num": 0,
"job_num": 0,
@@ -223,7 +249,7 @@ def get_dev_env_run_plan_dict(
}
],
"current_resource": current_resource.dict() if current_resource else None,
- "action": action,
+ "action": action.value,
}
@@ -238,8 +264,52 @@ def get_dev_env_run_dict(
last_processed_at: str = "2023-01-02T03:04:00+00:00",
finished_at: Optional[str] = "2023-01-02T03:04:00+00:00",
privileged: bool = False,
+ docker: Optional[bool] = None,
deleted: bool = False,
) -> Dict:
+ # When docker=True, commands should start with start-dockerd and use dind image
+ if docker:
+ commands = [
+ "/bin/bash",
+ "-i",
+ "-c",
+ "start-dockerd && (echo pip install ipykernel... && "
+ "pip install -q --no-cache-dir "
+ 'ipykernel 2> /dev/null) || echo "no '
+ 'pip, ipykernel was not installed" '
+ "&& echo '' && echo To open in VS "
+ "Code Desktop, use link below: && "
+ "echo '' && echo ' "
+ "vscode://vscode-remote/ssh-remote+test-run/workflow' "
+ "&& echo '' && echo 'To connect via "
+ "SSH, use: `ssh test-run`' && echo '' "
+ "&& echo -n 'To exit, press Ctrl+C.' "
+ "&& tail -f /dev/null",
+ ]
+ image_name = "dstackai/dind"
+ else:
+ commands = [
+ "/bin/bash",
+ "-i",
+ "-c",
+ "uv venv --python 3.13 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1"
+ " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc"
+ " && source /workflow/.venv/bin/activate"
+ " && (echo pip install ipykernel... && "
+ "pip install -q --no-cache-dir "
+ 'ipykernel 2> /dev/null) || echo "no '
+ 'pip, ipykernel was not installed" '
+ "&& echo '' && echo To open in VS "
+ "Code Desktop, use link below: && "
+ "echo '' && echo ' "
+ "vscode://vscode-remote/ssh-remote+test-run/workflow' "
+ "&& echo '' && echo 'To connect via "
+ "SSH, use: `ssh test-run`' && echo '' "
+ "&& echo -n 'To exit, press Ctrl+C.' "
+ "&& tail -f /dev/null",
+ ]
+ image_name = "dstackai/base:0.10-base-ubuntu22.04"
+
return {
"id": run_id,
"project_name": project_name,
@@ -259,11 +329,12 @@ def get_dev_env_run_dict(
"version": None,
"image": None,
"user": None,
+ "docker": docker,
"shell": None,
"privileged": privileged,
"init": [],
"ports": [],
- "python": "3.13",
+ "python": "3.13" if not docker else None,
"nvcc": None,
"registry_auth": None,
"setup": [],
@@ -330,31 +401,12 @@ def get_dev_env_run_dict(
{
"job_spec": {
"app_specs": [],
- "commands": [
- "/bin/bash",
- "-i",
- "-c",
- "uv venv --python 3.13 --prompt workflow --seed /workflow/.venv > /dev/null 2>&1"
- " && echo 'source /workflow/.venv/bin/activate' >> ~/.bashrc"
- " && source /workflow/.venv/bin/activate"
- " && (echo pip install ipykernel... && "
- "pip install -q --no-cache-dir "
- 'ipykernel 2> /dev/null) || echo "no '
- 'pip, ipykernel was not installed" '
- "&& echo '' && echo To open in VS "
- "Code Desktop, use link below: && "
- "echo '' && echo ' "
- "vscode://vscode-remote/ssh-remote+test-run/workflow' "
- "&& echo '' && echo 'To connect via "
- "SSH, use: `ssh test-run`' && echo '' "
- "&& echo -n 'To exit, press Ctrl+C.' "
- "&& tail -f /dev/null",
- ],
+ "commands": commands,
"env": {},
"home_dir": "/root",
- "image_name": "dstackai/base:0.10-base-ubuntu22.04",
+ "image_name": image_name,
"user": None,
- "privileged": privileged,
+ "privileged": True if docker else privileged,
"job_name": f"{run_name}-0-0",
"replica_num": 0,
"job_num": 0,
@@ -740,10 +792,10 @@ async def test_returns_403_if_not_project_member(
assert response.status_code == 403
@pytest.mark.asyncio
- @pytest.mark.parametrize("privileged", [None, False])
+ @pytest.mark.parametrize("privileged", [False])
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
async def test_returns_run_plan_privileged_false(
- self, test_db, session: AsyncSession, client: AsyncClient, privileged: Optional[bool]
+ self, test_db, session: AsyncSession, client: AsyncClient, privileged: bool
):
user = await create_user(session=session, global_role=GlobalRole.USER)
project = await create_project(session=session, owner=user)
@@ -778,7 +830,7 @@ async def test_returns_run_plan_privileged_false(
offers=[offer_aws, offer_runpod],
total_offers=2,
max_price=2.0,
- privileged=False,
+ privileged=privileged,
)
run_spec = copy.deepcopy(run_plan_dict["run_spec"])
if privileged is None:
@@ -864,6 +916,68 @@ async def test_returns_run_plan_privileged_true(
assert response.status_code == 200, response.json()
assert response.json() == run_plan_dict
+ @pytest.mark.asyncio
+ @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
+ async def test_returns_run_plan_docker_true(
+ self,
+ test_db,
+ session: AsyncSession,
+ client: AsyncClient,
+ ):
+ user = await create_user(session=session, global_role=GlobalRole.USER)
+ project = await create_project(session=session, owner=user)
+ await add_project_member(
+ session=session, project=project, user=user, project_role=ProjectRole.USER
+ )
+ repo = await create_repo(session=session, project_id=project.id)
+ offer_aws = InstanceOfferWithAvailability(
+ backend=BackendType.AWS,
+ instance=InstanceType(
+ name="instance",
+ resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]),
+ ),
+ region="us",
+ price=1.0,
+ availability=InstanceAvailability.AVAILABLE,
+ )
+ offer_runpod = InstanceOfferWithAvailability(
+ backend=BackendType.RUNPOD,
+ instance=InstanceType(
+ name="instance",
+ resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]),
+ ),
+ region="us",
+ price=2.0,
+ availability=InstanceAvailability.AVAILABLE,
+ )
+ run_plan_dict = get_dev_env_run_plan_dict(
+ project_name=project.name,
+ username=user.name,
+ repo_id=repo.name,
+ offers=[offer_aws],
+ total_offers=1,
+ max_price=1.0,
+ docker=True,
+ )
+ body = {"run_spec": run_plan_dict["run_spec"]}
+ with patch("dstack._internal.server.services.backends.get_project_backends") as m:
+ backend_mock_aws = Mock()
+ backend_mock_aws.TYPE = BackendType.AWS
+ backend_mock_aws.compute.return_value.get_offers_cached.return_value = [offer_aws]
+ backend_mock_runpod = Mock()
+ backend_mock_runpod.TYPE = BackendType.RUNPOD
+ backend_mock_runpod.compute.return_value.get_offers_cached.return_value = [
+ offer_runpod
+ ]
+ m.return_value = [backend_mock_aws, backend_mock_runpod]
+ response = await client.post(
+ f"/api/project/{project.name}/runs/get_plan",
+ headers=get_auth_headers(user.token),
+ json=body,
+ )
+ assert response.status_code == 200, response.json()
+ assert response.json() == run_plan_dict
+
@pytest.mark.asyncio
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
async def test_returns_run_plan_instance_volumes(
@@ -927,7 +1041,6 @@ async def test_returns_run_plan_instance_volumes(
assert response.json() == run_plan_dict
@pytest.mark.asyncio
- @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
@pytest.mark.parametrize(
("old_conf", "new_conf", "action"),
[
@@ -1204,6 +1317,55 @@ async def test_submits_run(
job = res.scalar()
assert job is not None
+ @pytest.mark.asyncio
+ @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
+ async def test_submits_run_docker_true(
+ self, test_db, session: AsyncSession, client: AsyncClient
+ ):
+ user = await create_user(session=session, global_role=GlobalRole.USER)
+ project = await create_project(session=session, owner=user)
+ await add_project_member(
+ session=session, project=project, user=user, project_role=ProjectRole.USER
+ )
+ run_id = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e")
+ submitted_at = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)
+ submitted_at_formatted = "2023-01-02T03:04:00+00:00"
+ last_processed_at_formatted = submitted_at_formatted
+ repo = await create_repo(session=session, project_id=project.id)
+ run_dict = get_dev_env_run_dict(
+ run_id=str(run_id),
+ job_id=str(run_id),
+ project_name=project.name,
+ username=user.name,
+ submitted_at=submitted_at_formatted,
+ last_processed_at=last_processed_at_formatted,
+ finished_at=None,
+ run_name="test-run",
+ repo_id=repo.name,
+ docker=True,
+ privileged=True, # server forces privileged=True when docker=True, so the expected dict must match
+ )
+ body = {"run_spec": run_dict["run_spec"]}
+ with (
+ patch("uuid.uuid4") as uuid_mock,
+ patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock,
+ ):
+ uuid_mock.return_value = run_id
+ datetime_mock.return_value = submitted_at
+ response = await client.post(
+ f"/api/project/{project.name}/runs/submit",
+ headers=get_auth_headers(user.token),
+ json=body,
+ )
+ assert response.status_code == 200, response.json()
+ assert response.json() == run_dict
+ res = await session.execute(select(RunModel))
+ run = res.scalar()
+ assert run is not None
+ res = await session.execute(select(JobModel))
+ job = res.scalar()
+ assert job is not None
+
@pytest.mark.asyncio
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
async def test_submits_run_without_run_name(