From f5f2f2e14d1ace4699377b762ef17674185416e2 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Tue, 29 Jul 2025 12:10:14 +0545 Subject: [PATCH 1/7] add hotaisle backend --- .../_internal/core/backends/configurators.py | 9 + .../core/backends/hotaisle/__init__.py | 1 + .../core/backends/hotaisle/api_client.py | 104 +++++++++ .../core/backends/hotaisle/backend.py | 16 ++ .../core/backends/hotaisle/compute.py | 212 ++++++++++++++++++ .../core/backends/hotaisle/configurator.py | 62 +++++ .../core/backends/hotaisle/models.py | 45 ++++ src/dstack/_internal/core/backends/models.py | 8 + .../_internal/core/models/backends/base.py | 2 + 9 files changed, 459 insertions(+) create mode 100644 src/dstack/_internal/core/backends/hotaisle/__init__.py create mode 100644 src/dstack/_internal/core/backends/hotaisle/api_client.py create mode 100644 src/dstack/_internal/core/backends/hotaisle/backend.py create mode 100644 src/dstack/_internal/core/backends/hotaisle/compute.py create mode 100644 src/dstack/_internal/core/backends/hotaisle/configurator.py create mode 100644 src/dstack/_internal/core/backends/hotaisle/models.py diff --git a/src/dstack/_internal/core/backends/configurators.py b/src/dstack/_internal/core/backends/configurators.py index 571d010529..f0e198417e 100644 --- a/src/dstack/_internal/core/backends/configurators.py +++ b/src/dstack/_internal/core/backends/configurators.py @@ -54,6 +54,15 @@ except ImportError: pass +try: + from dstack._internal.core.backends.hotaisle.configurator import ( + HotaisleConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(HotaisleConfigurator) +except ImportError: + pass + try: from dstack._internal.core.backends.kubernetes.configurator import ( KubernetesConfigurator, diff --git a/src/dstack/_internal/core/backends/hotaisle/__init__.py b/src/dstack/_internal/core/backends/hotaisle/__init__.py new file mode 100644 index 0000000000..9c665d1498 --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/__init__.py @@ -0,0 +1 @@ +# 
Hotaisle backend for dstack diff --git a/src/dstack/_internal/core/backends/hotaisle/api_client.py b/src/dstack/_internal/core/backends/hotaisle/api_client.py new file mode 100644 index 0000000000..2a346b432b --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/api_client.py @@ -0,0 +1,104 @@ +from typing import Any, Dict, Optional + +import requests + +from dstack._internal.utils.logging import get_logger + +API_URL = "https://admin.hotaisle.app/api" + +logger = get_logger(__name__) + + +class HotaisleAPIClient: + def __init__(self, api_key: str, team_handle: str): + self.api_key = api_key + self.team_handle = team_handle + + def validate_api_key(self) -> bool: + try: + self._validate_user_and_team() + return True + except requests.HTTPError as e: + if e.response.status_code in [401, 403]: + return False + raise e + except ValueError: + return False + + def _validate_user_and_team(self) -> None: + url = f"{API_URL}/user/" + response = self._make_request("GET", url) + + if response.ok: + user_data = response.json() + else: + response.raise_for_status() + + teams = user_data.get("teams", []) + if not teams: + raise ValueError("No Hotaisle teams found for this user") + + available_teams = [team["handle"] for team in teams] + if self.team_handle not in available_teams: + raise ValueError(f"Hotaisle Team '{self.team_handle}' not found.") + + def upload_ssh_key(self, public_key: str) -> bool: + url = f"{API_URL}/user/ssh_keys/" + payload = {"authorized_key": public_key} + + response = self._make_request("POST", url, json=payload) + + if response.status_code == 409: + return True # Key already exists - success + if not response.ok: + response.raise_for_status() + return True + + def create_virtual_machine( + self, vm_payload: Dict[str, Any], instance_name: str + ) -> Dict[str, Any]: + url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/" + response = self._make_request("POST", url, json=vm_payload) + + if not response.ok: + response.raise_for_status() 
+ + vm_data = response.json() + return vm_data + + def get_vm_state(self, vm_name: str) -> str: + url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/state/" + response = self._make_request("GET", url) + + if not response.ok: + response.raise_for_status() + + state_data = response.json() + return state_data["state"] + + def terminate_virtual_machine(self, vm_name: str) -> bool: + url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/" + response = self._make_request("DELETE", url) + + if response.status_code == 204: + return True + else: + response.raise_for_status() + + def _make_request( + self, method: str, url: str, json: Optional[Dict[str, Any]] = None, timeout: int = 30 + ) -> requests.Response: + headers = { + "accept": "application/json", + "Authorization": self.api_key, + } + if json is not None: + headers["Content-Type"] = "application/json" + + return requests.request( + method=method, + url=url, + headers=headers, + json=json, + timeout=timeout, + ) diff --git a/src/dstack/_internal/core/backends/hotaisle/backend.py b/src/dstack/_internal/core/backends/hotaisle/backend.py new file mode 100644 index 0000000000..e8804c7a85 --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.hotaisle.compute import HotaisleCompute +from dstack._internal.core.backends.hotaisle.models import HotaisleConfig +from dstack._internal.core.models.backends.base import BackendType + + +class HotaisleBackend(Backend): + TYPE = BackendType.HOTAISLE + COMPUTE_CLASS = HotaisleCompute + + def __init__(self, config: HotaisleConfig): + self.config = config + self._compute = HotaisleCompute(self.config) + + def compute(self) -> HotaisleCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/hotaisle/compute.py b/src/dstack/_internal/core/backends/hotaisle/compute.py new file mode 100644 index 
0000000000..9a49486ba8 --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/compute.py @@ -0,0 +1,212 @@ +import shlex +import subprocess +import tempfile +from threading import Thread +from typing import List, Optional + +import gpuhunt +from gpuhunt.providers.hotaisle import HotAisleProvider + +from dstack._internal.core.backends.base.compute import ( + Compute, + ComputeWithCreateInstanceSupport, + generate_unique_instance_name, + get_shim_commands, +) +from dstack._internal.core.backends.base.offers import get_catalog_offers +from dstack._internal.core.backends.hotaisle.api_client import HotaisleAPIClient +from dstack._internal.core.backends.hotaisle.models import HotaisleConfig +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceConfiguration, + InstanceOfferWithAvailability, +) +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.runs import JobProvisioningData, Requirements +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +MAX_INSTANCE_NAME_LEN = 60 + + +class HotaisleCompute( + ComputeWithCreateInstanceSupport, + Compute, +): + def __init__(self, config: HotaisleConfig): + super().__init__() + self.config = config + self.api_client = HotaisleAPIClient(config.creds.api_key, config.team_handle) + self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False) + self.catalog.add_provider( + HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle) + ) + + def get_offers( + self, requirements: Optional[Requirements] = None + ) -> List[InstanceOfferWithAvailability]: + offers = get_catalog_offers( + backend=BackendType.HOTAISLE, + locations=self.config.regions or None, + requirements=requirements, + catalog=self.catalog, + ) + offers = [ + InstanceOfferWithAvailability( + **offer.dict(), 
availability=InstanceAvailability.AVAILABLE + ) + for offer in offers + ] + return offers + + def get_payload_from_offer(self, instance_type) -> dict: + # Only two instance types are available in Hotaisle with CPUs: 8-core and 13-core. Other fields are + # not configurable. + cpu_cores = instance_type.resources.cpus + if cpu_cores == 8: + cpu_model = "Xeon Platinum 8462Y+" + frequency = 2800000000 + else: # cpu_cores == 13 + cpu_model = "Xeon Platinum 8470" + frequency = 2000000000 + + return { + "cpu_cores": cpu_cores, + "cpus": { + "count": 1, + "manufacturer": "Intel", + "model": cpu_model, + "cores": cpu_cores, + "frequency": frequency, + }, + "disk_capacity": 13194139533312, + "ram_capacity": 240518168576, + "gpus": [ + { + "count": len(instance_type.resources.gpus), + "manufacturer": "AMD", + "model": "MI300X", + } + ], + } + + def create_instance( + self, + instance_offer: InstanceOfferWithAvailability, + instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + instance_name = generate_unique_instance_name( + instance_config, max_length=MAX_INSTANCE_NAME_LEN + ) + project_ssh_key = instance_config.ssh_keys[0] + self.api_client.upload_ssh_key(project_ssh_key.public) + vm_payload = self.get_payload_from_offer(instance_offer.instance) + vm_data = self.api_client.create_virtual_machine(vm_payload, instance_name) + return JobProvisioningData( + backend=instance_offer.backend, + instance_type=instance_offer.instance, + instance_id=vm_data["name"], + hostname=None, + internal_ip=None, + region=instance_offer.region, + price=instance_offer.price, + username="hotaisle", + ssh_port=22, + dockerized=True, + ssh_proxy=None, + backend_data=vm_data["ip_address"], + ) + + def update_provisioning_data( + self, + provisioning_data: JobProvisioningData, + project_ssh_public_key: str, + project_ssh_private_key: str, + ): + vm_state = self.api_client.get_vm_state(provisioning_data.instance_id) + if vm_state == "running": 
+ if provisioning_data.hostname is None and provisioning_data.backend_data: + provisioning_data.hostname = provisioning_data.backend_data + commands = get_shim_commands( + authorized_keys=[project_ssh_public_key], + arch=provisioning_data.instance_type.resources.cpu_arch, + ) + launch_command = "sudo sh -c " + shlex.quote(" && ".join(commands)) + thread = Thread( + target=_start_runner, + kwargs={ + "hostname": provisioning_data.hostname, + "project_ssh_private_key": project_ssh_private_key, + "launch_command": launch_command, + }, + daemon=True, + ) + thread.start() + + def terminate_instance( + self, instance_id: str, region: str, backend_data: Optional[str] = None + ): + vm_name = instance_id + self.api_client.terminate_virtual_machine(vm_name) + + +def _start_runner( + hostname: str, + project_ssh_private_key: str, + launch_command: str, +): + _setup_instance( + hostname=hostname, + ssh_private_key=project_ssh_private_key, + ) + _launch_runner( + hostname=hostname, + ssh_private_key=project_ssh_private_key, + launch_command=launch_command, + ) + + +def _setup_instance( + hostname: str, + ssh_private_key: str, +): + setup_commands = ("sudo apt-get update",) + _run_ssh_command( + hostname=hostname, ssh_private_key=ssh_private_key, command=" && ".join(setup_commands) + ) + + +def _launch_runner( + hostname: str, + ssh_private_key: str, + launch_command: str, +): + _run_ssh_command( + hostname=hostname, + ssh_private_key=ssh_private_key, + command=launch_command, + ) + + +def _run_ssh_command(hostname: str, ssh_private_key: str, command: str): + with tempfile.NamedTemporaryFile("w+", 0o600) as f: + f.write(ssh_private_key) + f.flush() + subprocess.run( + [ + "ssh", + "-F", + "none", + "-o", + "StrictHostKeyChecking=no", + "-i", + f.name, + f"hotaisle@{hostname}", + command, + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) diff --git a/src/dstack/_internal/core/backends/hotaisle/configurator.py 
b/src/dstack/_internal/core/backends/hotaisle/configurator.py new file mode 100644 index 0000000000..c0f1d6deb6 --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/configurator.py @@ -0,0 +1,62 @@ +import json + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.hotaisle.api_client import HotaisleAPIClient +from dstack._internal.core.backends.hotaisle.backend import HotaisleBackend +from dstack._internal.core.backends.hotaisle.models import ( + AnyHotaisleBackendConfig, + AnyHotaisleCreds, + HotaisleBackendConfig, + HotaisleBackendConfigWithCreds, + HotaisleConfig, + HotaisleCreds, + HotaisleStoredConfig, +) +from dstack._internal.core.models.backends.base import ( + BackendType, +) + + +class HotaisleConfigurator(Configurator): + TYPE = BackendType.HOTAISLE + BACKEND_CLASS = HotaisleBackend + + def validate_config(self, config: HotaisleBackendConfigWithCreds, default_creds_enabled: bool): + self._validate_creds(config.creds, config.team_handle) + + def create_backend( + self, project_name: str, config: HotaisleBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=HotaisleStoredConfig( + **HotaisleBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=HotaisleCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config( + self, record: BackendRecord, include_creds: bool + ) -> AnyHotaisleBackendConfig: + config = self._get_config(record) + if include_creds: + return HotaisleBackendConfigWithCreds.__response__.parse_obj(config) + return HotaisleBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> HotaisleBackend: + config = self._get_config(record) + return HotaisleBackend(config=config) + + def _get_config(self, record: BackendRecord) -> HotaisleConfig: + return HotaisleConfig.__response__( + **json.loads(record.config), + 
creds=HotaisleCreds.parse_raw(record.auth), + ) + + def _validate_creds(self, creds: AnyHotaisleCreds, team_handle: str): + api_client = HotaisleAPIClient(creds.api_key, team_handle) + if not api_client.validate_api_key(): + raise_invalid_credentials_error(fields=[["creds", "api_key"]]) diff --git a/src/dstack/_internal/core/backends/hotaisle/models.py b/src/dstack/_internal/core/backends/hotaisle/models.py new file mode 100644 index 0000000000..e77955d647 --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/models.py @@ -0,0 +1,45 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class HotaisleAPIKeyCreds(CoreModel): + type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + api_key: Annotated[str, Field(description="The Hotaisle API key")] + + +AnyHotaisleCreds = HotaisleAPIKeyCreds +HotaisleCreds = AnyHotaisleCreds + + +class HotaisleBackendConfig(CoreModel): + type: Annotated[ + Literal["hotaisle"], + Field(description="The type of backend"), + ] = "hotaisle" + team_handle: Annotated[str, Field(description="The Hotaisle team handle")] + regions: Annotated[ + Optional[List[str]], + Field(description="The list of Hotaisle regions. 
Omit to use all regions"), + ] = None + + +class HotaisleBackendConfigWithCreds(HotaisleBackendConfig): + creds: Annotated[AnyHotaisleCreds, Field(description="The credentials")] + + +AnyHotaisleBackendConfig = Union[HotaisleBackendConfig, HotaisleBackendConfigWithCreds] + + +class HotaisleBackendFileConfigWithCreds(HotaisleBackendConfig): + creds: Annotated[AnyHotaisleCreds, Field(description="The credentials")] + + +class HotaisleStoredConfig(HotaisleBackendConfig): + pass + + +class HotaisleConfig(HotaisleStoredConfig): + creds: AnyHotaisleCreds diff --git a/src/dstack/_internal/core/backends/models.py b/src/dstack/_internal/core/backends/models.py index 0b5779db78..992424e45e 100644 --- a/src/dstack/_internal/core/backends/models.py +++ b/src/dstack/_internal/core/backends/models.py @@ -29,6 +29,11 @@ GCPBackendConfigWithCreds, GCPBackendFileConfigWithCreds, ) +from dstack._internal.core.backends.hotaisle.models import ( + HotaisleBackendConfig, + HotaisleBackendConfigWithCreds, + HotaisleBackendFileConfigWithCreds, +) from dstack._internal.core.backends.kubernetes.models import ( KubernetesBackendConfig, KubernetesBackendConfigWithCreds, @@ -73,6 +78,7 @@ CudoBackendConfig, DataCrunchBackendConfig, GCPBackendConfig, + HotaisleBackendConfig, KubernetesBackendConfig, LambdaBackendConfig, NebiusBackendConfig, @@ -95,6 +101,7 @@ CudoBackendConfigWithCreds, DataCrunchBackendConfigWithCreds, GCPBackendConfigWithCreds, + HotaisleBackendConfigWithCreds, KubernetesBackendConfigWithCreds, LambdaBackendConfigWithCreds, OCIBackendConfigWithCreds, @@ -116,6 +123,7 @@ CudoBackendConfigWithCreds, DataCrunchBackendConfigWithCreds, GCPBackendFileConfigWithCreds, + HotaisleBackendFileConfigWithCreds, KubernetesBackendFileConfigWithCreds, LambdaBackendConfigWithCreds, OCIBackendConfigWithCreds, diff --git a/src/dstack/_internal/core/models/backends/base.py b/src/dstack/_internal/core/models/backends/base.py index 78aafb142c..04f704c669 100644 --- 
a/src/dstack/_internal/core/models/backends/base.py +++ b/src/dstack/_internal/core/models/backends/base.py @@ -11,6 +11,7 @@ class BackendType(str, enum.Enum): DSTACK (BackendType): dstack Sky GCP (BackendType): Google Cloud Platform DATACRUNCH (BackendType): DataCrunch + HOTAISLE (BackendType): Hotaisle KUBERNETES (BackendType): Kubernetes LAMBDA (BackendType): Lambda Cloud NEBIUS (BackendType): Nebius AI Cloud @@ -28,6 +29,7 @@ class BackendType(str, enum.Enum): DATACRUNCH = "datacrunch" DSTACK = "dstack" GCP = "gcp" + HOTAISLE = "hotaisle" KUBERNETES = "kubernetes" LAMBDA = "lambda" LOCAL = "local" From b9ca0bedb4c7c62b2742fa385f17c3f596f6e3df Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Fri, 1 Aug 2025 18:49:37 +0545 Subject: [PATCH 2/7] Daemonize launch_command to solve dstack restart issue --- src/dstack/_internal/core/backends/hotaisle/compute.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/dstack/_internal/core/backends/hotaisle/compute.py b/src/dstack/_internal/core/backends/hotaisle/compute.py index 9a49486ba8..876f14584e 100644 --- a/src/dstack/_internal/core/backends/hotaisle/compute.py +++ b/src/dstack/_internal/core/backends/hotaisle/compute.py @@ -184,10 +184,11 @@ def _launch_runner( ssh_private_key: str, launch_command: str, ): + daemonized_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown" _run_ssh_command( hostname=hostname, ssh_private_key=ssh_private_key, - command=launch_command, + command=daemonized_command, ) From 9558c29904c8d461fe8bb2dff2d1f9835a2979e5 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Wed, 6 Aug 2025 13:46:42 +0545 Subject: [PATCH 3/7] Update backends.md and config.yml.md --- docs/docs/concepts/backends.md | 22 ++++++++++++++++++++++ docs/docs/reference/server/config.yml.md | 19 ++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index fe9d7df74e..bb3b9d2ca4 100644 --- 
a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -579,6 +579,28 @@ gcloud projects list --format="json(projectId)" Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets. Additionally, [Cloud NAT](https://cloud.google.com/nat/docs/overview) must be configured to provide access to external resources for provisioned instances. +## Hotaisle + +Log in to the SSH TUI as described in the [Hotaisle Quick Start :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/quick-start/). +Create a new team and generate an API key for the member in the team. + +Then, go ahead and configure the backend: + +
+ +```yaml +projects: +- name: main + backends: + - type: hotaisle + team_handle: hotaisle-team-handle + creds: + type: api_key + api_key: 9c27a4bb7a8e472fae12ab34.3f2e3c1db75b9a0187fd2196c6b3e56d2b912e1c439ba08d89e7b6fcd4ef1d3f +``` + +
+ ## Lambda Log into your [Lambda Cloud :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/service/gpu-cloud) account, click API keys in the sidebar, and then click the `Generate API key` diff --git a/docs/docs/reference/server/config.yml.md b/docs/docs/reference/server/config.yml.md index fbe378d8cd..513ffd4b22 100644 --- a/docs/docs/reference/server/config.yml.md +++ b/docs/docs/reference/server/config.yml.md @@ -15,7 +15,7 @@ to configure [backends](../../concepts/backends.md) and other [sever-level setti overrides: show_root_heading: false backends: - type: 'Union[AWSBackendConfigWithCreds, AzureBackendConfigWithCreds, GCPBackendConfigWithCreds, LambdaBackendConfigWithCreds, NebiusBackendConfigWithCreds, RunpodBackendConfigWithCreds, VastAIBackendConfigWithCreds, KubernetesConfig]' + type: 'Union[AWSBackendConfigWithCreds, AzureBackendConfigWithCreds, GCPBackendConfigWithCreds, HotaisleBackendConfigWithCreds, LambdaBackendConfigWithCreds, NebiusBackendConfigWithCreds, RunpodBackendConfigWithCreds, VastAIBackendConfigWithCreds, KubernetesConfig]' #### `projects[n].backends` { #backends data-toc-label="backends" } @@ -126,6 +126,23 @@ to configure [backends](../../concepts/backends.md) and other [sever-level setti type: required: true +##### `projects[n].backends[type=hotaisle]` { #hotaisle data-toc-label="hotaisle" } + +#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotaisleBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: hotaisle- + +###### `projects[n].backends[type=hotaisle].creds` { #hotaisle-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotaisleAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + ##### `projects[n].backends[type=lambda]` { #lambda data-toc-label="lambda" } #SCHEMA# dstack._internal.core.backends.lambdalabs.models.LambdaBackendConfigWithCreds From 1bf72e0defa3032a809a446b65dd2191f4c9ab9d 
Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Thu, 7 Aug 2025 14:49:38 +0545 Subject: [PATCH 4/7] Resolve Review Comments --- docs/docs/concepts/backends.md | 10 ++- docs/docs/reference/server/config.yml.md | 6 +- .../_internal/core/backends/configurators.py | 4 +- .../core/backends/hotaisle/api_client.py | 62 +++++++------ .../core/backends/hotaisle/backend.py | 14 +-- .../core/backends/hotaisle/compute.py | 88 +++++++++++++------ .../core/backends/hotaisle/configurator.py | 56 ++++++------ .../core/backends/hotaisle/models.py | 30 +++---- src/dstack/_internal/core/backends/models.py | 12 +-- .../_internal/core/models/backends/base.py | 2 +- 10 files changed, 161 insertions(+), 123 deletions(-) diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index bb3b9d2ca4..81a6cf48e0 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -579,9 +579,9 @@ gcloud projects list --format="json(projectId)" Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets. Additionally, [Cloud NAT](https://cloud.google.com/nat/docs/overview) must be configured to provide access to external resources for provisioned instances. -## Hotaisle +## Hot Aisle -Log in to the SSH TUI as described in the [Hotaisle Quick Start :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/quick-start/). +Log in to the SSH TUI as described in the [Hot Aisle Quick Start :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/quick-start/). Create a new team and generate an API key for the member in the team. Then, go ahead and configure the backend: @@ -601,6 +601,12 @@ projects: +??? 
info "Required permissions" + The API key must have the following roles assigned: + + * **Owner role for the user** - Required for creating and managing SSH keys + * **Operator role for the team** - Required for managing virtual machines within the team + ## Lambda Log into your [Lambda Cloud :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/service/gpu-cloud) account, click API keys in the sidebar, and then click the `Generate API key` diff --git a/docs/docs/reference/server/config.yml.md b/docs/docs/reference/server/config.yml.md index 513ffd4b22..1c347619cd 100644 --- a/docs/docs/reference/server/config.yml.md +++ b/docs/docs/reference/server/config.yml.md @@ -15,7 +15,7 @@ to configure [backends](../../concepts/backends.md) and other [sever-level setti overrides: show_root_heading: false backends: - type: 'Union[AWSBackendConfigWithCreds, AzureBackendConfigWithCreds, GCPBackendConfigWithCreds, HotaisleBackendConfigWithCreds, LambdaBackendConfigWithCreds, NebiusBackendConfigWithCreds, RunpodBackendConfigWithCreds, VastAIBackendConfigWithCreds, KubernetesConfig]' + type: 'Union[AWSBackendConfigWithCreds, AzureBackendConfigWithCreds, GCPBackendConfigWithCreds, HotAisleBackendConfigWithCreds, LambdaBackendConfigWithCreds, NebiusBackendConfigWithCreds, RunpodBackendConfigWithCreds, VastAIBackendConfigWithCreds, KubernetesConfig]' #### `projects[n].backends` { #backends data-toc-label="backends" } @@ -128,7 +128,7 @@ to configure [backends](../../concepts/backends.md) and other [sever-level setti ##### `projects[n].backends[type=hotaisle]` { #hotaisle data-toc-label="hotaisle" } -#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotaisleBackendConfigWithCreds +#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleBackendConfigWithCreds overrides: show_root_heading: false type: @@ -137,7 +137,7 @@ to configure [backends](../../concepts/backends.md) and other [sever-level setti ###### `projects[n].backends[type=hotaisle].creds` { 
#hotaisle-creds data-toc-label="creds" } -#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotaisleAPIKeyCreds +#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleAPIKeyCreds overrides: show_root_heading: false type: diff --git a/src/dstack/_internal/core/backends/configurators.py b/src/dstack/_internal/core/backends/configurators.py index f0e198417e..a2df6a4e63 100644 --- a/src/dstack/_internal/core/backends/configurators.py +++ b/src/dstack/_internal/core/backends/configurators.py @@ -56,10 +56,10 @@ try: from dstack._internal.core.backends.hotaisle.configurator import ( - HotaisleConfigurator, + HotAisleConfigurator, ) - _CONFIGURATOR_CLASSES.append(HotaisleConfigurator) + _CONFIGURATOR_CLASSES.append(HotAisleConfigurator) except ImportError: pass diff --git a/src/dstack/_internal/core/backends/hotaisle/api_client.py b/src/dstack/_internal/core/backends/hotaisle/api_client.py index 2a346b432b..dd9ced4088 100644 --- a/src/dstack/_internal/core/backends/hotaisle/api_client.py +++ b/src/dstack/_internal/core/backends/hotaisle/api_client.py @@ -2,6 +2,7 @@ import requests +from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error from dstack._internal.utils.logging import get_logger API_URL = "https://admin.hotaisle.app/api" @@ -9,7 +10,7 @@ logger = get_logger(__name__) -class HotaisleAPIClient: +class HotAisleAPIClient: def __init__(self, api_key: str, team_handle: str): self.api_key = api_key self.team_handle = team_handle @@ -19,28 +20,42 @@ def validate_api_key(self) -> bool: self._validate_user_and_team() return True except requests.HTTPError as e: - if e.response.status_code in [401, 403]: - return False + if e.response.status_code == 401: + raise_invalid_credentials_error( + fields=[["creds", "api_key"]], details="Invalid API key" + ) + elif e.response.status_code == 403: + raise_invalid_credentials_error( + fields=[["creds", "api_key"]], + details="Authenticated user does not have required 
permissions", + ) + raise e + except ValueError as e: + error_message = str(e) + if "No Hot Aisle teams found" in error_message: + raise_invalid_credentials_error( + fields=[["creds", "api_key"]], + details="Valid API key but no teams found for this user", + ) + elif "not found" in error_message: + raise_invalid_credentials_error( + fields=[["team_handle"]], details=f"Team handle '{self.team_handle}' not found" + ) raise e - except ValueError: - return False def _validate_user_and_team(self) -> None: url = f"{API_URL}/user/" response = self._make_request("GET", url) - - if response.ok: - user_data = response.json() - else: - response.raise_for_status() + response.raise_for_status() + user_data = response.json() teams = user_data.get("teams", []) if not teams: - raise ValueError("No Hotaisle teams found for this user") + raise ValueError("No Hot Aisle teams found for this user") available_teams = [team["handle"] for team in teams] if self.team_handle not in available_teams: - raise ValueError(f"Hotaisle Team '{self.team_handle}' not found.") + raise ValueError(f"Hot Aisle team '{self.team_handle}' not found.") def upload_ssh_key(self, public_key: str) -> bool: url = f"{API_URL}/user/ssh_keys/" @@ -50,8 +65,7 @@ def upload_ssh_key(self, public_key: str) -> bool: if response.status_code == 409: return True # Key already exists - success - if not response.ok: - response.raise_for_status() + response.raise_for_status() return True def create_virtual_machine( @@ -59,38 +73,28 @@ def create_virtual_machine( ) -> Dict[str, Any]: url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/" response = self._make_request("POST", url, json=vm_payload) - - if not response.ok: - response.raise_for_status() - + response.raise_for_status() vm_data = response.json() return vm_data def get_vm_state(self, vm_name: str) -> str: url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/state/" response = self._make_request("GET", url) - - if not response.ok: - 
response.raise_for_status() - + response.raise_for_status() state_data = response.json() return state_data["state"] - def terminate_virtual_machine(self, vm_name: str) -> bool: + def terminate_virtual_machine(self, vm_name: str) -> None: url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/" response = self._make_request("DELETE", url) - - if response.status_code == 204: - return True - else: - response.raise_for_status() + response.raise_for_status() def _make_request( self, method: str, url: str, json: Optional[Dict[str, Any]] = None, timeout: int = 30 ) -> requests.Response: headers = { "accept": "application/json", - "Authorization": self.api_key, + "Authorization": f"Token {self.api_key}", } if json is not None: headers["Content-Type"] = "application/json" diff --git a/src/dstack/_internal/core/backends/hotaisle/backend.py b/src/dstack/_internal/core/backends/hotaisle/backend.py index e8804c7a85..cb568f5258 100644 --- a/src/dstack/_internal/core/backends/hotaisle/backend.py +++ b/src/dstack/_internal/core/backends/hotaisle/backend.py @@ -1,16 +1,16 @@ from dstack._internal.core.backends.base.backend import Backend -from dstack._internal.core.backends.hotaisle.compute import HotaisleCompute -from dstack._internal.core.backends.hotaisle.models import HotaisleConfig +from dstack._internal.core.backends.hotaisle.compute import HotAisleCompute +from dstack._internal.core.backends.hotaisle.models import HotAisleConfig from dstack._internal.core.models.backends.base import BackendType -class HotaisleBackend(Backend): +class HotAisleBackend(Backend): TYPE = BackendType.HOTAISLE - COMPUTE_CLASS = HotaisleCompute + COMPUTE_CLASS = HotAisleCompute - def __init__(self, config: HotaisleConfig): + def __init__(self, config: HotAisleConfig): self.config = config - self._compute = HotaisleCompute(self.config) + self._compute = HotAisleCompute(self.config) - def compute(self) -> HotaisleCompute: + def compute(self) -> HotAisleCompute: return self._compute diff 
--git a/src/dstack/_internal/core/backends/hotaisle/compute.py b/src/dstack/_internal/core/backends/hotaisle/compute.py index 876f14584e..de91173b56 100644 --- a/src/dstack/_internal/core/backends/hotaisle/compute.py +++ b/src/dstack/_internal/core/backends/hotaisle/compute.py @@ -14,9 +14,10 @@ get_shim_commands, ) from dstack._internal.core.backends.base.offers import get_catalog_offers -from dstack._internal.core.backends.hotaisle.api_client import HotaisleAPIClient -from dstack._internal.core.backends.hotaisle.models import HotaisleConfig +from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient +from dstack._internal.core.backends.hotaisle.models import HotAisleConfig from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreModel from dstack._internal.core.models.instances import ( InstanceAvailability, InstanceConfiguration, @@ -31,14 +32,28 @@ MAX_INSTANCE_NAME_LEN = 60 -class HotaisleCompute( +INSTANCE_TYPE_SPECS = { + "1x MI300X 8x Xeon Platinum 8462Y+": { + "cpu_model": "Xeon Platinum 8462Y+", + "cpu_frequency": 2800000000, + "cpu_manufacturer": "Intel", + }, + "1x MI300X 13x Xeon Platinum 8470": { + "cpu_model": "Xeon Platinum 8470", + "cpu_frequency": 2000000000, + "cpu_manufacturer": "Intel", + }, +} + + +class HotAisleCompute( ComputeWithCreateInstanceSupport, Compute, ): - def __init__(self, config: HotaisleConfig): + def __init__(self, config: HotAisleConfig): super().__init__() self.config = config - self.api_client = HotaisleAPIClient(config.creds.api_key, config.team_handle) + self.api_client = HotAisleAPIClient(config.creds.api_key, config.team_handle) self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False) self.catalog.add_provider( HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle) @@ -53,41 +68,43 @@ def get_offers( requirements=requirements, catalog=self.catalog, ) - offers = [ - InstanceOfferWithAvailability( 
- **offer.dict(), availability=InstanceAvailability.AVAILABLE - ) - for offer in offers - ] - return offers + + supported_offers = [] + for offer in offers: + if offer.instance.name in INSTANCE_TYPE_SPECS: + supported_offers.append( + InstanceOfferWithAvailability( + **offer.dict(), availability=InstanceAvailability.AVAILABLE + ) + ) + else: + logger.warning( + f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}" + ) + + return supported_offers def get_payload_from_offer(self, instance_type) -> dict: - # Only two instance types are available in Hotaisle with CPUs: 8-core and 13-core. Other fields are - # not configurable. + instance_type_name = instance_type.name + cpu_specs = INSTANCE_TYPE_SPECS[instance_type_name] cpu_cores = instance_type.resources.cpus - if cpu_cores == 8: - cpu_model = "Xeon Platinum 8462Y+" - frequency = 2800000000 - else: # cpu_cores == 13 - cpu_model = "Xeon Platinum 8470" - frequency = 2000000000 return { "cpu_cores": cpu_cores, "cpus": { "count": 1, - "manufacturer": "Intel", - "model": cpu_model, + "manufacturer": cpu_specs["cpu_manufacturer"], + "model": cpu_specs["cpu_model"], "cores": cpu_cores, - "frequency": frequency, + "frequency": cpu_specs["cpu_frequency"], }, - "disk_capacity": 13194139533312, - "ram_capacity": 240518168576, + "disk_capacity": instance_type.resources.disk.size_mib * 1024**2, + "ram_capacity": instance_type.resources.memory_mib * 1024**2, "gpus": [ { "count": len(instance_type.resources.gpus), - "manufacturer": "AMD", - "model": "MI300X", + "manufacturer": instance_type.resources.gpus[0].vendor, + "model": instance_type.resources.gpus[0].name, } ], } @@ -117,7 +134,9 @@ def create_instance( ssh_port=22, dockerized=True, ssh_proxy=None, - backend_data=vm_data["ip_address"], + backend_data=HotAisleInstanceBackendData( + ip_address=vm_data["ip_address"], vm_id=vm_data["name"] + ).json(), ) def update_provisioning_data( @@ -129,7 +148,8 @@ def update_provisioning_data( vm_state = 
self.api_client.get_vm_state(provisioning_data.instance_id) if vm_state == "running": if provisioning_data.hostname is None and provisioning_data.backend_data: - provisioning_data.hostname = provisioning_data.backend_data + backend_data = HotAisleInstanceBackendData.load(provisioning_data.backend_data) + provisioning_data.hostname = backend_data.ip_address commands = get_shim_commands( authorized_keys=[project_ssh_public_key], arch=provisioning_data.instance_type.resources.cpu_arch, @@ -211,3 +231,13 @@ def _run_ssh_command(hostname: str, ssh_private_key: str, command: str): stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) + + +class HotAisleInstanceBackendData(CoreModel): + ip_address: str + vm_id: Optional[str] = None + + @classmethod + def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData": + assert raw is not None + return cls.__response__.parse_raw(raw) diff --git a/src/dstack/_internal/core/backends/hotaisle/configurator.py b/src/dstack/_internal/core/backends/hotaisle/configurator.py index c0f1d6deb6..c7a6a6006e 100644 --- a/src/dstack/_internal/core/backends/hotaisle/configurator.py +++ b/src/dstack/_internal/core/backends/hotaisle/configurator.py @@ -3,60 +3,58 @@ from dstack._internal.core.backends.base.configurator import ( BackendRecord, Configurator, - raise_invalid_credentials_error, ) -from dstack._internal.core.backends.hotaisle.api_client import HotaisleAPIClient -from dstack._internal.core.backends.hotaisle.backend import HotaisleBackend +from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient +from dstack._internal.core.backends.hotaisle.backend import HotAisleBackend from dstack._internal.core.backends.hotaisle.models import ( - AnyHotaisleBackendConfig, - AnyHotaisleCreds, - HotaisleBackendConfig, - HotaisleBackendConfigWithCreds, - HotaisleConfig, - HotaisleCreds, - HotaisleStoredConfig, + AnyHotAisleBackendConfig, + AnyHotAisleCreds, + HotAisleBackendConfig, + HotAisleBackendConfigWithCreds, + 
HotAisleConfig, + HotAisleCreds, + HotAisleStoredConfig, ) from dstack._internal.core.models.backends.base import ( BackendType, ) -class HotaisleConfigurator(Configurator): +class HotAisleConfigurator(Configurator): TYPE = BackendType.HOTAISLE - BACKEND_CLASS = HotaisleBackend + BACKEND_CLASS = HotAisleBackend - def validate_config(self, config: HotaisleBackendConfigWithCreds, default_creds_enabled: bool): + def validate_config(self, config: HotAisleBackendConfigWithCreds, default_creds_enabled: bool): self._validate_creds(config.creds, config.team_handle) def create_backend( - self, project_name: str, config: HotaisleBackendConfigWithCreds + self, project_name: str, config: HotAisleBackendConfigWithCreds ) -> BackendRecord: return BackendRecord( - config=HotaisleStoredConfig( - **HotaisleBackendConfig.__response__.parse_obj(config).dict() + config=HotAisleStoredConfig( + **HotAisleBackendConfig.__response__.parse_obj(config).dict() ).json(), - auth=HotaisleCreds.parse_obj(config.creds).json(), + auth=HotAisleCreds.parse_obj(config.creds).json(), ) def get_backend_config( self, record: BackendRecord, include_creds: bool - ) -> AnyHotaisleBackendConfig: + ) -> AnyHotAisleBackendConfig: config = self._get_config(record) if include_creds: - return HotaisleBackendConfigWithCreds.__response__.parse_obj(config) - return HotaisleBackendConfig.__response__.parse_obj(config) + return HotAisleBackendConfigWithCreds.__response__.parse_obj(config) + return HotAisleBackendConfig.__response__.parse_obj(config) - def get_backend(self, record: BackendRecord) -> HotaisleBackend: + def get_backend(self, record: BackendRecord) -> HotAisleBackend: config = self._get_config(record) - return HotaisleBackend(config=config) + return HotAisleBackend(config=config) - def _get_config(self, record: BackendRecord) -> HotaisleConfig: - return HotaisleConfig.__response__( + def _get_config(self, record: BackendRecord) -> HotAisleConfig: + return HotAisleConfig.__response__( 
**json.loads(record.config), - creds=HotaisleCreds.parse_raw(record.auth), + creds=HotAisleCreds.parse_raw(record.auth), ) - def _validate_creds(self, creds: AnyHotaisleCreds, team_handle: str): - api_client = HotaisleAPIClient(creds.api_key, team_handle) - if not api_client.validate_api_key(): - raise_invalid_credentials_error(fields=[["creds", "api_key"]]) + def _validate_creds(self, creds: AnyHotAisleCreds, team_handle: str): + api_client = HotAisleAPIClient(creds.api_key, team_handle) + api_client.validate_api_key() diff --git a/src/dstack/_internal/core/backends/hotaisle/models.py b/src/dstack/_internal/core/backends/hotaisle/models.py index e77955d647..efee6b4e93 100644 --- a/src/dstack/_internal/core/backends/hotaisle/models.py +++ b/src/dstack/_internal/core/backends/hotaisle/models.py @@ -5,41 +5,41 @@ from dstack._internal.core.models.common import CoreModel -class HotaisleAPIKeyCreds(CoreModel): +class HotAisleAPIKeyCreds(CoreModel): type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" - api_key: Annotated[str, Field(description="The Hotaisle API key")] + api_key: Annotated[str, Field(description="The Hot Aisle API key")] -AnyHotaisleCreds = HotaisleAPIKeyCreds -HotaisleCreds = AnyHotaisleCreds +AnyHotAisleCreds = HotAisleAPIKeyCreds +HotAisleCreds = AnyHotAisleCreds -class HotaisleBackendConfig(CoreModel): +class HotAisleBackendConfig(CoreModel): type: Annotated[ Literal["hotaisle"], Field(description="The type of backend"), ] = "hotaisle" - team_handle: Annotated[str, Field(description="The Hotaisle team handle")] + team_handle: Annotated[str, Field(description="The Hot Aisle team handle")] regions: Annotated[ Optional[List[str]], - Field(description="The list of Hotaisle regions. Omit to use all regions"), + Field(description="The list of Hot Aisle regions. 
Omit to use all regions"), ] = None -class HotaisleBackendConfigWithCreds(HotaisleBackendConfig): - creds: Annotated[AnyHotaisleCreds, Field(description="The credentials")] +class HotAisleBackendConfigWithCreds(HotAisleBackendConfig): + creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")] -AnyHotaisleBackendConfig = Union[HotaisleBackendConfig, HotaisleBackendConfigWithCreds] +AnyHotAisleBackendConfig = Union[HotAisleBackendConfig, HotAisleBackendConfigWithCreds] -class HotaisleBackendFileConfigWithCreds(HotaisleBackendConfig): - creds: Annotated[AnyHotaisleCreds, Field(description="The credentials")] +class HotAisleBackendFileConfigWithCreds(HotAisleBackendConfig): + creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")] -class HotaisleStoredConfig(HotaisleBackendConfig): +class HotAisleStoredConfig(HotAisleBackendConfig): pass -class HotaisleConfig(HotaisleStoredConfig): - creds: AnyHotaisleCreds +class HotAisleConfig(HotAisleStoredConfig): + creds: AnyHotAisleCreds diff --git a/src/dstack/_internal/core/backends/models.py b/src/dstack/_internal/core/backends/models.py index 992424e45e..1097459704 100644 --- a/src/dstack/_internal/core/backends/models.py +++ b/src/dstack/_internal/core/backends/models.py @@ -30,9 +30,9 @@ GCPBackendFileConfigWithCreds, ) from dstack._internal.core.backends.hotaisle.models import ( - HotaisleBackendConfig, - HotaisleBackendConfigWithCreds, - HotaisleBackendFileConfigWithCreds, + HotAisleBackendConfig, + HotAisleBackendConfigWithCreds, + HotAisleBackendFileConfigWithCreds, ) from dstack._internal.core.backends.kubernetes.models import ( KubernetesBackendConfig, @@ -78,7 +78,7 @@ CudoBackendConfig, DataCrunchBackendConfig, GCPBackendConfig, - HotaisleBackendConfig, + HotAisleBackendConfig, KubernetesBackendConfig, LambdaBackendConfig, NebiusBackendConfig, @@ -101,7 +101,7 @@ CudoBackendConfigWithCreds, DataCrunchBackendConfigWithCreds, GCPBackendConfigWithCreds, - HotaisleBackendConfigWithCreds, 
+ HotAisleBackendConfigWithCreds, KubernetesBackendConfigWithCreds, LambdaBackendConfigWithCreds, OCIBackendConfigWithCreds, @@ -123,7 +123,7 @@ CudoBackendConfigWithCreds, DataCrunchBackendConfigWithCreds, GCPBackendFileConfigWithCreds, - HotaisleBackendFileConfigWithCreds, + HotAisleBackendFileConfigWithCreds, KubernetesBackendFileConfigWithCreds, LambdaBackendConfigWithCreds, OCIBackendConfigWithCreds, diff --git a/src/dstack/_internal/core/models/backends/base.py b/src/dstack/_internal/core/models/backends/base.py index 04f704c669..067e181f6b 100644 --- a/src/dstack/_internal/core/models/backends/base.py +++ b/src/dstack/_internal/core/models/backends/base.py @@ -11,7 +11,7 @@ class BackendType(str, enum.Enum): DSTACK (BackendType): dstack Sky GCP (BackendType): Google Cloud Platform DATACRUNCH (BackendType): DataCrunch - HOTAISLE (BackendType): Hotaisle + HOTAISLE (BackendType): Hot Aisle KUBERNETES (BackendType): Kubernetes LAMBDA (BackendType): Lambda Cloud NEBIUS (BackendType): Nebius AI Cloud From 16e77e3b6ff4480d9e8ef989708a3f9774fa236b Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Thu, 7 Aug 2025 15:05:06 +0545 Subject: [PATCH 5/7] Bump gpuhunt to 0.1.7 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5ff9c95670..bbc52084c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "python-multipart>=0.0.16", "filelock", "psutil", - "gpuhunt==0.1.6", + "gpuhunt==0.1.7", "argcomplete>=3.5.0", "ignore-python>=0.2.0", "orjson", From d2846aede135a8f42743925bd517573c5a0e0ebd Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Thu, 7 Aug 2025 15:47:12 +0545 Subject: [PATCH 6/7] Resolve Remaining Review Comments --- .../core/backends/hotaisle/api_client.py | 7 ++++--- .../core/backends/hotaisle/compute.py | 20 +------------------ 2 files changed, 5 insertions(+), 22 deletions(-) diff --git a/src/dstack/_internal/core/backends/hotaisle/api_client.py 
b/src/dstack/_internal/core/backends/hotaisle/api_client.py index dd9ced4088..866ff2acbd 100644 --- a/src/dstack/_internal/core/backends/hotaisle/api_client.py +++ b/src/dstack/_internal/core/backends/hotaisle/api_client.py @@ -68,9 +68,7 @@ def upload_ssh_key(self, public_key: str) -> bool: response.raise_for_status() return True - def create_virtual_machine( - self, vm_payload: Dict[str, Any], instance_name: str - ) -> Dict[str, Any]: + def create_virtual_machine(self, vm_payload: Dict[str, Any]) -> Dict[str, Any]: url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/" response = self._make_request("POST", url, json=vm_payload) response.raise_for_status() @@ -87,6 +85,9 @@ def get_vm_state(self, vm_name: str) -> str: def terminate_virtual_machine(self, vm_name: str) -> None: url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/" response = self._make_request("DELETE", url) + if response.status_code == 404: + logger.debug("Hot Aisle virtual machine %s not found", vm_name) + return response.raise_for_status() def _make_request( diff --git a/src/dstack/_internal/core/backends/hotaisle/compute.py b/src/dstack/_internal/core/backends/hotaisle/compute.py index de91173b56..8502f3fe2b 100644 --- a/src/dstack/_internal/core/backends/hotaisle/compute.py +++ b/src/dstack/_internal/core/backends/hotaisle/compute.py @@ -10,7 +10,6 @@ from dstack._internal.core.backends.base.compute import ( Compute, ComputeWithCreateInstanceSupport, - generate_unique_instance_name, get_shim_commands, ) from dstack._internal.core.backends.base.offers import get_catalog_offers @@ -115,13 +114,10 @@ def create_instance( instance_config: InstanceConfiguration, placement_group: Optional[PlacementGroup], ) -> JobProvisioningData: - instance_name = generate_unique_instance_name( - instance_config, max_length=MAX_INSTANCE_NAME_LEN - ) project_ssh_key = instance_config.ssh_keys[0] self.api_client.upload_ssh_key(project_ssh_key.public) vm_payload = 
self.get_payload_from_offer(instance_offer.instance) - vm_data = self.api_client.create_virtual_machine(vm_payload, instance_name) + vm_data = self.api_client.create_virtual_machine(vm_payload) return JobProvisioningData( backend=instance_offer.backend, instance_type=instance_offer.instance, @@ -178,10 +174,6 @@ def _start_runner( project_ssh_private_key: str, launch_command: str, ): - _setup_instance( - hostname=hostname, - ssh_private_key=project_ssh_private_key, - ) _launch_runner( hostname=hostname, ssh_private_key=project_ssh_private_key, @@ -189,16 +181,6 @@ def _start_runner( ) -def _setup_instance( - hostname: str, - ssh_private_key: str, -): - setup_commands = ("sudo apt-get update",) - _run_ssh_command( - hostname=hostname, ssh_private_key=ssh_private_key, command=" && ".join(setup_commands) - ) - - def _launch_runner( hostname: str, ssh_private_key: str, From c19065bb2c4dee2e90ef82803ffe7298217fc9b4 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Thu, 7 Aug 2025 15:55:28 +0545 Subject: [PATCH 7/7] Add hotaisle to TestListBackendTypes --- src/tests/_internal/server/routers/test_backends.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tests/_internal/server/routers/test_backends.py b/src/tests/_internal/server/routers/test_backends.py index 6afe36c0c6..6bac9383a3 100644 --- a/src/tests/_internal/server/routers/test_backends.py +++ b/src/tests/_internal/server/routers/test_backends.py @@ -83,6 +83,7 @@ async def test_returns_backend_types(self, client: AsyncClient): "cudo", "datacrunch", "gcp", + "hotaisle", "kubernetes", "lambda", *(["nebius"] if sys.version_info >= (3, 10) else []),