Drop hardcoded Hot Aisle VM specs

jvstme · jvstme · commit 25d0c83e0787 · 2025-10-28T23:38:28.000+01:00
Use the VM spec object that gpuhunt stores in each offer's backend data instead, and pass it to the Hot Aisle API as-is.
This allows newly added instance types with
different CPU, RAM, disk, and GPU count
configurations to automatically become available
in dstack. However, limit the supported GPUs to
MI300X, since other GPUs and CPU-only VMs might
need to be tested by the dstack team before they
become available to users.
diff --git a/pyproject.toml b/pyproject.toml
@@ -32,7 +32,8 @@ dependencies = [
     "python-multipart>=0.0.16",
     "filelock",
     "psutil",
-    "gpuhunt==0.1.11",
+    # TODO: release a new gpuhunt version and pin it here instead of the branch archive
+    "gpuhunt @ https://github.com/dstackai/gpuhunt/archive/refs/heads/hotaisle_store_specs_in_provider_data.zip",
     "argcomplete>=3.5.0",
     "ignore-python>=0.2.0",
     "orjson",
@@ -67,6 +68,9 @@ artifacts = [
     "src/dstack/_internal/server/statics/**",
 ]
 
+[tool.hatch.metadata]
+allow-direct-references = true  # TODO: unset once gpuhunt is pinned to a released version
+
 [tool.hatch.metadata.hooks.fancy-pypi-readme]
 content-type = "text/markdown"
 
diff --git a/src/dstack/_internal/core/backends/hotaisle/compute.py b/src/dstack/_internal/core/backends/hotaisle/compute.py
@@ -2,7 +2,7 @@
 import subprocess
 import tempfile
 from threading import Thread
-from typing import List, Optional
+from typing import Any, List, Optional
 
 import gpuhunt
 from gpuhunt.providers.hotaisle import HotAisleProvider
@@ -22,6 +22,7 @@
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
     InstanceConfiguration,
+    InstanceOffer,
     InstanceOfferWithAvailability,
 )
 from dstack._internal.core.models.placement import PlacementGroup
@@ -31,48 +32,7 @@
 logger = get_logger(__name__)
 
 
-INSTANCE_TYPE_SPECS = {
-    "1x MI300X 8x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "1x MI300X 13x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "2x MI300X 26x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "2x MI300X 26x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "4x MI300X 52x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "4x MI300X 52x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "8x MI300X 104x Xeon Platinum 8470": {
-        "cpu_model": "Xeon Platinum 8470",
-        "cpu_frequency": 2000000000,
-        "cpu_manufacturer": "Intel",
-    },
-    "8x MI300X 104x Xeon Platinum 8462Y+": {
-        "cpu_model": "Xeon Platinum 8462Y+",
-        "cpu_frequency": 2800000000,
-        "cpu_manufacturer": "Intel",
-    },
-}
+SUPPORTED_GPUS = ["MI300X"]
 
 
 class HotAisleCompute(
@@ -95,45 +55,15 @@ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability
             backend=BackendType.HOTAISLE,
             locations=self.config.regions or None,
             catalog=self.catalog,
+            extra_filter=_supported_instances,
         )
-        supported_offers = []
-        for offer in offers:
-            if offer.instance.name in INSTANCE_TYPE_SPECS:
-                supported_offers.append(
-                    InstanceOfferWithAvailability(
-                        **offer.dict(), availability=InstanceAvailability.AVAILABLE
-                    )
-                )
-            else:
-                logger.warning(
-                    f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}"
-                )
-        return supported_offers
-
-    def get_payload_from_offer(self, instance_type) -> dict:
-        instance_type_name = instance_type.name
-        cpu_specs = INSTANCE_TYPE_SPECS[instance_type_name]
-        cpu_cores = instance_type.resources.cpus
-
-        return {
-            "cpu_cores": cpu_cores,
-            "cpus": {
-                "count": 1,
-                "manufacturer": cpu_specs["cpu_manufacturer"],
-                "model": cpu_specs["cpu_model"],
-                "cores": cpu_cores,
-                "frequency": cpu_specs["cpu_frequency"],
-            },
-            "disk_capacity": instance_type.resources.disk.size_mib * 1024**2,
-            "ram_capacity": instance_type.resources.memory_mib * 1024**2,
-            "gpus": [
-                {
-                    "count": len(instance_type.resources.gpus),
-                    "manufacturer": instance_type.resources.gpus[0].vendor,
-                    "model": instance_type.resources.gpus[0].name,
-                }
-            ],
-        }
+        return [
+            InstanceOfferWithAvailability(
+                **offer.dict(),
+                availability=InstanceAvailability.AVAILABLE,
+            )
+            for offer in offers
+        ]
 
     def create_instance(
         self,
@@ -143,8 +73,10 @@ def create_instance(
     ) -> JobProvisioningData:
         project_ssh_key = instance_config.ssh_keys[0]
         self.api_client.upload_ssh_key(project_ssh_key.public)
-        vm_payload = self.get_payload_from_offer(instance_offer.instance)
-        vm_data = self.api_client.create_virtual_machine(vm_payload)
+        offer_backend_data: HotAisleOfferBackendData = (
+            HotAisleOfferBackendData.__response__.parse_obj(instance_offer.backend_data)
+        )
+        vm_data = self.api_client.create_virtual_machine(offer_backend_data.vm_specs)
         return JobProvisioningData(
             backend=instance_offer.backend,
             instance_type=instance_offer.instance,
@@ -240,10 +172,20 @@ def _run_ssh_command(hostname: str, ssh_private_key: str, command: str):
         )
 
 
+def _supported_instances(offer: InstanceOffer) -> bool:
+    return len(offer.instance.resources.gpus) > 0 and all(
+        gpu.name in SUPPORTED_GPUS for gpu in offer.instance.resources.gpus
+    )
+
+
 class HotAisleInstanceBackendData(CoreModel):
     ip_address: str
 
     @classmethod
     def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData":
         assert raw is not None
         return cls.__response__.parse_raw(raw)
+
+
+class HotAisleOfferBackendData(CoreModel):
+    vm_specs: dict[str, Any]