API_URL = "https://admin.hotaisle.app/api"


class HotaisleAPIClient:
    """Minimal client for the Hotaisle admin HTTP API, bound to one team.

    Every request carries the raw API key in the ``Authorization`` header.
    Team-scoped endpoints are addressed through ``team_handle``.
    """

    def __init__(self, api_key: str, team_handle: str):
        self.api_key = api_key
        self.team_handle = team_handle

    def validate_api_key(self) -> bool:
        """Check that the key is accepted and the configured team is visible.

        Returns:
            ``True`` when the user endpoint answers successfully and lists
            ``team_handle``. ``False`` when the API answers 401/403 or when
            the team check fails.

        Raises:
            requests.HTTPError: For any other HTTP error status.
        """
        try:
            self._validate_user_and_team()
        except ValueError:
            # Raised by the team check: no teams at all, or the handle is missing.
            return False
        except requests.HTTPError as e:
            if e.response.status_code in (401, 403):
                return False
            raise
        return True

    def _validate_user_and_team(self) -> None:
        """Fetch the current user and ensure ``team_handle`` is among their teams.

        Raises:
            requests.HTTPError: If the user endpoint returns an error status.
            ValueError: If the user has no teams or the handle is not listed.
        """
        response = self._make_request("GET", f"{API_URL}/user/")
        # raise_for_status() does nothing on success, so the payload below is
        # always bound (the former if/else left it assigned in one branch only).
        response.raise_for_status()
        user_data = response.json()

        teams = user_data.get("teams", [])
        if not teams:
            raise ValueError("No Hotaisle teams found for this user")

        available_teams = [team["handle"] for team in teams]
        if self.team_handle not in available_teams:
            raise ValueError(f"Hotaisle Team '{self.team_handle}' not found.")

    def upload_ssh_key(self, public_key: str) -> bool:
        """Register ``public_key`` for the user.

        Returns:
            ``True`` when the key was stored or already existed (HTTP 409).

        Raises:
            requests.HTTPError: For any other HTTP error status.
        """
        response = self._make_request(
            "POST", f"{API_URL}/user/ssh_keys/", json={"authorized_key": public_key}
        )
        if response.status_code == 409:
            return True  # Key already exists - success
        response.raise_for_status()
        return True

    def create_virtual_machine(
        self, vm_payload: Dict[str, Any], instance_name: str
    ) -> Dict[str, Any]:
        """Create a VM for the team and return the decoded JSON response.

        ``instance_name`` is accepted for interface compatibility but is not
        sent to the API; only ``vm_payload`` is posted.

        Raises:
            requests.HTTPError: If the API returns an error status.
        """
        url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/"
        response = self._make_request("POST", url, json=vm_payload)
        response.raise_for_status()
        return response.json()

    def get_vm_state(self, vm_name: str) -> str:
        """Return the ``state`` field reported by the API for ``vm_name``.

        Raises:
            requests.HTTPError: If the API returns an error status.
        """
        url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/state/"
        response = self._make_request("GET", url)
        response.raise_for_status()
        return response.json()["state"]

    def terminate_virtual_machine(self, vm_name: str) -> bool:
        """Delete ``vm_name`` and return ``True`` on any successful response.

        Previously only HTTP 204 returned ``True``; other successful statuses
        fell through and returned ``None`` despite the ``bool`` annotation.

        Raises:
            requests.HTTPError: If the API returns an error status.
        """
        url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/"
        response = self._make_request("DELETE", url)
        response.raise_for_status()
        return True

    def _make_request(
        self, method: str, url: str, json: Optional[Dict[str, Any]] = None, timeout: int = 30
    ) -> "requests.Response":
        """Send one authenticated request and return the raw response.

        No status checking happens here; callers decide how to treat errors.
        """
        headers = {
            "accept": "application/json",
            "Authorization": self.api_key,
        }
        if json is not None:
            headers["Content-Type"] = "application/json"

        return requests.request(
            method=method,
            url=url,
            headers=headers,
            json=json,
            timeout=timeout,
        )
class HotaisleBackend(Backend):
    """Backend entry point for Hotaisle.

    Holds the backend config and a single ``HotaisleCompute`` built from it.
    """

    TYPE = BackendType.HOTAISLE
    COMPUTE_CLASS = HotaisleCompute

    def __init__(self, config: HotaisleConfig):
        self.config = config
        # One compute object is created up front and reused for every call.
        self._compute = HotaisleCompute(config)

    def compute(self) -> HotaisleCompute:
        """Return the compute object created in ``__init__``."""
        return self._compute
logger = get_logger(__name__)

MAX_INSTANCE_NAME_LEN = 60

# CPU description sent to the API, keyed by core count: (model, frequency).
# Only the 8-core and 13-core configurations are known.
_CPU_SPECS_BY_CORES = {
    8: ("Xeon Platinum 8462Y+", 2800000000),
    13: ("Xeon Platinum 8470", 2000000000),
}
# Core count whose description is used when an offer reports an unknown count.
_FALLBACK_CPU_CORES = 13


class HotaisleCompute(
    ComputeWithCreateInstanceSupport,
    Compute,
):
    """Compute implementation that provisions Hotaisle VMs through the HTTP API."""

    def __init__(self, config: HotaisleConfig):
        super().__init__()
        self.config = config
        self.api_client = HotaisleAPIClient(config.creds.api_key, config.team_handle)
        # Private catalog with the Hotaisle provider attached, so offers are
        # queried with this backend's own credentials.
        self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False)
        self.catalog.add_provider(
            HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle)
        )

    def get_offers(
        self, requirements: Optional[Requirements] = None
    ) -> List[InstanceOfferWithAvailability]:
        """Return catalog offers matching ``requirements``, all marked AVAILABLE.

        Offers are restricted to ``config.regions`` when that list is non-empty.
        """
        catalog_offers = get_catalog_offers(
            backend=BackendType.HOTAISLE,
            locations=self.config.regions or None,
            requirements=requirements,
            catalog=self.catalog,
        )
        return [
            InstanceOfferWithAvailability(
                **offer.dict(), availability=InstanceAvailability.AVAILABLE
            )
            for offer in catalog_offers
        ]

    def get_payload_from_offer(self, instance_type) -> dict:
        """Build the VM creation payload for ``instance_type``.

        Only two instance types are available in Hotaisle with CPUs: 8-core and
        13-core. Other fields are not configurable. An unexpected core count
        still gets the 13-core CPU description, as before, but is now logged
        instead of passing silently.
        """
        cpu_cores = instance_type.resources.cpus
        spec = _CPU_SPECS_BY_CORES.get(cpu_cores)
        if spec is None:
            logger.warning(
                "Unexpected Hotaisle CPU core count %s; using the %s-core CPU description",
                cpu_cores,
                _FALLBACK_CPU_CORES,
            )
            spec = _CPU_SPECS_BY_CORES[_FALLBACK_CPU_CORES]
        cpu_model, frequency = spec

        return {
            "cpu_cores": cpu_cores,
            "cpus": {
                "count": 1,
                "manufacturer": "Intel",
                "model": cpu_model,
                "cores": cpu_cores,
                "frequency": frequency,
            },
            "disk_capacity": 13194139533312,
            "ram_capacity": 240518168576,
            "gpus": [
                {
                    "count": len(instance_type.resources.gpus),
                    "manufacturer": "AMD",
                    "model": "MI300X",
                }
            ],
        }

    def create_instance(
        self,
        instance_offer: InstanceOfferWithAvailability,
        instance_config: InstanceConfiguration,
        placement_group: Optional[PlacementGroup],
    ) -> JobProvisioningData:
        """Upload the project SSH key, create the VM and return its provisioning data.

        ``hostname`` is left unset here. The VM's ``ip_address`` is stored in
        ``backend_data`` and promoted to ``hostname`` by
        ``update_provisioning_data`` once the VM is running.
        """
        instance_name = generate_unique_instance_name(
            instance_config, max_length=MAX_INSTANCE_NAME_LEN
        )
        project_ssh_key = instance_config.ssh_keys[0]
        self.api_client.upload_ssh_key(project_ssh_key.public)
        vm_payload = self.get_payload_from_offer(instance_offer.instance)
        vm_data = self.api_client.create_virtual_machine(vm_payload, instance_name)
        return JobProvisioningData(
            backend=instance_offer.backend,
            instance_type=instance_offer.instance,
            instance_id=vm_data["name"],
            hostname=None,
            internal_ip=None,
            region=instance_offer.region,
            price=instance_offer.price,
            username="hotaisle",
            ssh_port=22,
            dockerized=True,
            ssh_proxy=None,
            backend_data=vm_data["ip_address"],
        )

    def update_provisioning_data(
        self,
        provisioning_data: JobProvisioningData,
        project_ssh_public_key: str,
        project_ssh_private_key: str,
    ):
        """Assign the hostname and start the shim once the VM is running.

        Does nothing until the API reports the ``running`` state. The runner
        is launched only at the moment the hostname is first assigned, so
        repeated calls do not start it again.
        """
        vm_state = self.api_client.get_vm_state(provisioning_data.instance_id)
        if vm_state != "running":
            return
        if provisioning_data.hostname is not None or not provisioning_data.backend_data:
            return

        provisioning_data.hostname = provisioning_data.backend_data
        commands = get_shim_commands(
            authorized_keys=[project_ssh_public_key],
            arch=provisioning_data.instance_type.resources.cpu_arch,
        )
        launch_command = "sudo sh -c " + shlex.quote(" && ".join(commands))
        # Run the SSH setup in a daemon thread so this call is not blocked by it.
        thread = Thread(
            target=_start_runner,
            kwargs={
                "hostname": provisioning_data.hostname,
                "project_ssh_private_key": project_ssh_private_key,
                "launch_command": launch_command,
            },
            daemon=True,
        )
        thread.start()

    def terminate_instance(
        self, instance_id: str, region: str, backend_data: Optional[str] = None
    ):
        """Delete the VM whose name is ``instance_id``."""
        vm_name = instance_id
        self.api_client.terminate_virtual_machine(vm_name)


def _start_runner(
    hostname: str,
    project_ssh_private_key: str,
    launch_command: str,
):
    """Prepare the instance over SSH, then launch the shim on it."""
    _setup_instance(
        hostname=hostname,
        ssh_private_key=project_ssh_private_key,
    )
    _launch_runner(
        hostname=hostname,
        ssh_private_key=project_ssh_private_key,
        launch_command=launch_command,
    )


def _setup_instance(
    hostname: str,
    ssh_private_key: str,
):
    """Run the one-time setup commands on the instance."""
    setup_commands = ("sudo apt-get update",)
    _run_ssh_command(
        hostname=hostname, ssh_private_key=ssh_private_key, command=" && ".join(setup_commands)
    )


def _launch_runner(
    hostname: str,
    ssh_private_key: str,
    launch_command: str,
):
    """Start ``launch_command`` in the background on the instance.

    Output is redirected to ``/tmp/dstack-shim.log`` and the job is disowned,
    so the SSH session can end while the command keeps running.
    """
    daemonized_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown"
    _run_ssh_command(
        hostname=hostname,
        ssh_private_key=ssh_private_key,
        command=daemonized_command,
    )


def _run_ssh_command(hostname: str, ssh_private_key: str, command: str):
    """Run ``command`` on ``hostname`` as user ``hotaisle`` via the ``ssh`` binary.

    The private key is written to a temporary file that is removed when the
    call finishes. Failures stay non-fatal, but a non-zero exit status is
    logged rather than discarded.
    """
    # The second positional argument of NamedTemporaryFile is ``buffering``,
    # not a permission mode, so the former ``0o600`` only set a buffer size.
    # The temporary file is already created private to the current user.
    with tempfile.NamedTemporaryFile("w+") as f:
        f.write(ssh_private_key)
        f.flush()
        result = subprocess.run(
            [
                "ssh",
                "-F",
                "none",
                "-o",
                "StrictHostKeyChecking=no",
                "-i",
                f.name,
                f"hotaisle@{hostname}",
                command,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    if result.returncode != 0:
        logger.warning(
            "SSH command on Hotaisle instance %s exited with code %s",
            hostname,
            result.returncode,
        )
# --- hotaisle/configurator.py ---


class HotaisleConfigurator(Configurator):
    """Configurator that validates, stores and loads Hotaisle backend configs."""

    TYPE = BackendType.HOTAISLE
    BACKEND_CLASS = HotaisleBackend

    def validate_config(self, config: HotaisleBackendConfigWithCreds, default_creds_enabled: bool):
        """Validate the submitted credentials together with the team handle."""
        self._validate_creds(config.creds, config.team_handle)

    def create_backend(
        self, project_name: str, config: HotaisleBackendConfigWithCreds
    ) -> BackendRecord:
        """Serialize ``config`` into a record: settings in ``config``, creds in ``auth``."""
        # Parsing through HotaisleBackendConfig keeps only the non-credential fields.
        stored_json = HotaisleStoredConfig(
            **HotaisleBackendConfig.__response__.parse_obj(config).dict()
        ).json()
        auth_json = HotaisleCreds.parse_obj(config.creds).json()
        return BackendRecord(config=stored_json, auth=auth_json)

    def get_backend_config(
        self, record: BackendRecord, include_creds: bool
    ) -> AnyHotaisleBackendConfig:
        """Load the config from ``record``, with or without credentials."""
        loaded = self._get_config(record)
        model = HotaisleBackendConfigWithCreds if include_creds else HotaisleBackendConfig
        return model.__response__.parse_obj(loaded)

    def get_backend(self, record: BackendRecord) -> HotaisleBackend:
        """Build a ``HotaisleBackend`` from the stored record."""
        return HotaisleBackend(config=self._get_config(record))

    def _get_config(self, record: BackendRecord) -> HotaisleConfig:
        """Combine the stored JSON config and the stored creds into one config."""
        stored_fields = json.loads(record.config)
        creds = HotaisleCreds.parse_raw(record.auth)
        return HotaisleConfig.__response__(**stored_fields, creds=creds)

    def _validate_creds(self, creds: AnyHotaisleCreds, team_handle: str):
        """Report an invalid ``creds.api_key`` when the API client check fails."""
        api_client = HotaisleAPIClient(creds.api_key, team_handle)
        if api_client.validate_api_key():
            return
        raise_invalid_credentials_error(fields=[["creds", "api_key"]])


# --- hotaisle/models.py ---


# API-key credentials for Hotaisle.
class HotaisleAPIKeyCreds(CoreModel):
    type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"
    api_key: Annotated[str, Field(description="The Hotaisle API key")]


# Only one credential kind exists, so both aliases point at it.
AnyHotaisleCreds = HotaisleAPIKeyCreds
HotaisleCreds = AnyHotaisleCreds


# Backend settings without credentials.
class HotaisleBackendConfig(CoreModel):
    type: Annotated[
        Literal["hotaisle"],
        Field(description="The type of backend"),
    ] = "hotaisle"
    team_handle: Annotated[str, Field(description="The Hotaisle team handle")]
    regions: Annotated[
        Optional[List[str]],
        Field(description="The list of Hotaisle regions. Omit to use all regions"),
    ] = None


# Backend settings plus credentials.
class HotaisleBackendConfigWithCreds(HotaisleBackendConfig):
    creds: Annotated[AnyHotaisleCreds, Field(description="The credentials")]


AnyHotaisleBackendConfig = Union[HotaisleBackendConfig, HotaisleBackendConfigWithCreds]


# Same fields as HotaisleBackendConfigWithCreds, declared as a separate class.
class HotaisleBackendFileConfigWithCreds(HotaisleBackendConfig):
    creds: Annotated[AnyHotaisleCreds, Field(description="The credentials")]


# Shape of the config JSON written by HotaisleConfigurator.create_backend.
class HotaisleStoredConfig(HotaisleBackendConfig):
    pass


# Stored config combined with credentials.
class HotaisleConfig(HotaisleStoredConfig):
    creds: AnyHotaisleCreds
DataCrunchBackendConfigWithCreds, GCPBackendConfigWithCreds, + HotaisleBackendConfigWithCreds, KubernetesBackendConfigWithCreds, LambdaBackendConfigWithCreds, OCIBackendConfigWithCreds, @@ -116,6 +123,7 @@ CudoBackendConfigWithCreds, DataCrunchBackendConfigWithCreds, GCPBackendFileConfigWithCreds, + HotaisleBackendFileConfigWithCreds, KubernetesBackendFileConfigWithCreds, LambdaBackendConfigWithCreds, OCIBackendConfigWithCreds, diff --git a/src/dstack/_internal/core/models/backends/base.py b/src/dstack/_internal/core/models/backends/base.py index 78aafb142c..04f704c669 100644 --- a/src/dstack/_internal/core/models/backends/base.py +++ b/src/dstack/_internal/core/models/backends/base.py @@ -11,6 +11,7 @@ class BackendType(str, enum.Enum): DSTACK (BackendType): dstack Sky GCP (BackendType): Google Cloud Platform DATACRUNCH (BackendType): DataCrunch + HOTAISLE (BackendType): Hotaisle KUBERNETES (BackendType): Kubernetes LAMBDA (BackendType): Lambda Cloud NEBIUS (BackendType): Nebius AI Cloud @@ -28,6 +29,7 @@ class BackendType(str, enum.Enum): DATACRUNCH = "datacrunch" DSTACK = "dstack" GCP = "gcp" + HOTAISLE = "hotaisle" KUBERNETES = "kubernetes" LAMBDA = "lambda" LOCAL = "local"