Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ oci = [
"dstack[server]",
]
nebius = [
"nebius>=0.2.19,<0.3; python_version >= '3.10'",
"nebius>=0.2.40,<0.3; python_version >= '3.10'",
"dstack[server]",
]
all = [
Expand Down
10 changes: 8 additions & 2 deletions src/dstack/_internal/core/backends/nebius/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
SUPPORTED_PLATFORMS = [
"gpu-h100-sxm",
"gpu-h200-sxm",
"gpu-b200-sxm",
"gpu-l40s-a",
"gpu-l40s-d",
"cpu-d3",
Expand Down Expand Up @@ -150,12 +151,16 @@ def create_instance(
)
if backend_data.cluster is not None:
cluster_id = backend_data.cluster.id

gpus = instance_offer.instance.resources.gpus
create_disk_op = resources.create_disk(
sdk=self._sdk,
name=instance_name,
project_id=self._region_to_project_id[instance_offer.region],
size_mib=instance_offer.instance.resources.disk.size_mib,
image_family="ubuntu22.04-cuda12",
image_family="ubuntu24.04-cuda12"
if gpus and gpus[0].name == "B200"
else "ubuntu22.04-cuda12",
)
create_instance_op = None
try:
Expand All @@ -180,6 +185,7 @@ def create_instance(
cluster_id=cluster_id,
disk_id=create_disk_op.resource_id,
subnet_id=self._get_subnet_id(instance_offer.region),
preemptible=instance_offer.instance.resources.spot,
)
_wait_for_instance(self._sdk, create_instance_op)
except BaseException:
Expand Down Expand Up @@ -367,4 +373,4 @@ def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:

def _supported_instances(offer: InstanceOffer) -> bool:
platform, _ = offer.instance.name.split()
return platform in SUPPORTED_PLATFORMS and not offer.instance.resources.spot
return platform in SUPPORTED_PLATFORMS
1 change: 1 addition & 0 deletions src/dstack/_internal/core/backends/nebius/fabrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class InfinibandFabric:
InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
InfinibandFabric("us-central1-a", "gpu-h200-sxm", "us-central1"),
InfinibandFabric("us-central1-b", "gpu-b200-sxm", "us-central1"),
]


Expand Down
9 changes: 9 additions & 0 deletions src/dstack/_internal/core/backends/nebius/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@
GpuClusterSpec,
Instance,
InstanceGpuClusterSpec,
InstanceRecoveryPolicy,
InstanceServiceClient,
InstanceSpec,
IPAddress,
NetworkInterfaceSpec,
PreemptibleSpec,
PublicIPAddress,
ResourcesSpec,
SourceImageFamily,
Expand Down Expand Up @@ -283,6 +285,7 @@ def create_instance(
cluster_id: Optional[str],
disk_id: str,
subnet_id: str,
preemptible: bool,
) -> SDKOperation[Operation]:
client = InstanceServiceClient(sdk)
request = CreateInstanceRequest(
Expand All @@ -306,6 +309,12 @@ def create_instance(
public_ip_address=PublicIPAddress(static=True),
)
],
preemptible=PreemptibleSpec(
priority=1, on_preemption=PreemptibleSpec.PreemptionPolicy.STOP
)
if preemptible
else None,
recovery_policy=InstanceRecoveryPolicy.FAIL if preemptible else None,
),
)
with wrap_capacity_errors():
Expand Down
Loading