diff --git a/pyproject.toml b/pyproject.toml index f56704ab4f..26b5d52d24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -176,7 +176,7 @@ oci = [ "dstack[server]", ] nebius = [ - "nebius>=0.2.19,<0.3; python_version >= '3.10'", + "nebius>=0.2.40,<0.3; python_version >= '3.10'", "dstack[server]", ] all = [ diff --git a/src/dstack/_internal/core/backends/nebius/compute.py b/src/dstack/_internal/core/backends/nebius/compute.py index 4fc4d6b522..48be255aa7 100644 --- a/src/dstack/_internal/core/backends/nebius/compute.py +++ b/src/dstack/_internal/core/backends/nebius/compute.py @@ -74,6 +74,7 @@ SUPPORTED_PLATFORMS = [ "gpu-h100-sxm", "gpu-h200-sxm", + "gpu-b200-sxm", "gpu-l40s-a", "gpu-l40s-d", "cpu-d3", @@ -150,12 +151,16 @@ def create_instance( ) if backend_data.cluster is not None: cluster_id = backend_data.cluster.id + + gpus = instance_offer.instance.resources.gpus create_disk_op = resources.create_disk( sdk=self._sdk, name=instance_name, project_id=self._region_to_project_id[instance_offer.region], size_mib=instance_offer.instance.resources.disk.size_mib, - image_family="ubuntu22.04-cuda12", + image_family="ubuntu24.04-cuda12" + if gpus and gpus[0].name == "B200" + else "ubuntu22.04-cuda12", ) create_instance_op = None try: @@ -180,6 +185,7 @@ def create_instance( cluster_id=cluster_id, disk_id=create_disk_op.resource_id, subnet_id=self._get_subnet_id(instance_offer.region), + preemptible=instance_offer.instance.resources.spot, ) _wait_for_instance(self._sdk, create_instance_op) except BaseException: @@ -367,4 +373,4 @@ def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None: def _supported_instances(offer: InstanceOffer) -> bool: platform, _ = offer.instance.name.split() - return platform in SUPPORTED_PLATFORMS and not offer.instance.resources.spot + return platform in SUPPORTED_PLATFORMS diff --git a/src/dstack/_internal/core/backends/nebius/fabrics.py b/src/dstack/_internal/core/backends/nebius/fabrics.py index 45ccec6f4c..914381a869 100644 --- a/src/dstack/_internal/core/backends/nebius/fabrics.py +++ b/src/dstack/_internal/core/backends/nebius/fabrics.py @@ -21,6 +21,7 @@ class InfinibandFabric: InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"), InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"), InfinibandFabric("us-central1-a", "gpu-h200-sxm", "us-central1"), + InfinibandFabric("us-central1-b", "gpu-b200-sxm", "us-central1"), ] diff --git a/src/dstack/_internal/core/backends/nebius/resources.py b/src/dstack/_internal/core/backends/nebius/resources.py index 6cd682cd9e..9728e835f2 100644 --- a/src/dstack/_internal/core/backends/nebius/resources.py +++ b/src/dstack/_internal/core/backends/nebius/resources.py @@ -28,10 +28,12 @@ GpuClusterSpec, Instance, InstanceGpuClusterSpec, + InstanceRecoveryPolicy, InstanceServiceClient, InstanceSpec, IPAddress, NetworkInterfaceSpec, + PreemptibleSpec, PublicIPAddress, ResourcesSpec, SourceImageFamily, @@ -283,6 +285,7 @@ def create_instance( cluster_id: Optional[str], disk_id: str, subnet_id: str, + preemptible: bool, ) -> SDKOperation[Operation]: client = InstanceServiceClient(sdk) request = CreateInstanceRequest( @@ -306,6 +309,12 @@ def create_instance( public_ip_address=PublicIPAddress(static=True), ) ], + preemptible=PreemptibleSpec( + priority=1, on_preemption=PreemptibleSpec.PreemptionPolicy.STOP + ) + if preemptible + else None, + recovery_policy=InstanceRecoveryPolicy.FAIL if preemptible else None, ), ) with wrap_capacity_errors():