diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index fe9d7df74e..81a6cf48e0 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -579,6 +579,34 @@ gcloud projects list --format="json(projectId)" Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets. Additionally, [Cloud NAT](https://cloud.google.com/nat/docs/overview) must be configured to provide access to external resources for provisioned instances. +## Hot Aisle + +Log in to the SSH TUI as described in the [Hot Aisle Quick Start :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/quick-start/). +Create a new team and generate an API key for the member in the team. + +Then, go ahead and configure the backend: + +
+ +```yaml +projects: +- name: main + backends: + - type: hotaisle + team_handle: hotaisle-team-handle + creds: + type: api_key + api_key: 9c27a4bb7a8e472fae12ab34.3f2e3c1db75b9a0187fd2196c6b3e56d2b912e1c439ba08d89e7b6fcd4ef1d3f +``` + +
+ +??? info "Required permissions" + The API key must have the following roles assigned: + + * **Owner role for the user** - Required for creating and managing SSH keys + * **Operator role for the team** - Required for managing virtual machines within the team + ## Lambda Log into your [Lambda Cloud :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/service/gpu-cloud) account, click API keys in the sidebar, and then click the `Generate API key` diff --git a/docs/docs/reference/server/config.yml.md b/docs/docs/reference/server/config.yml.md index fbe378d8cd..1c347619cd 100644 --- a/docs/docs/reference/server/config.yml.md +++ b/docs/docs/reference/server/config.yml.md @@ -15,7 +15,7 @@ to configure [backends](../../concepts/backends.md) and other [sever-level setti overrides: show_root_heading: false backends: - type: 'Union[AWSBackendConfigWithCreds, AzureBackendConfigWithCreds, GCPBackendConfigWithCreds, LambdaBackendConfigWithCreds, NebiusBackendConfigWithCreds, RunpodBackendConfigWithCreds, VastAIBackendConfigWithCreds, KubernetesConfig]' + type: 'Union[AWSBackendConfigWithCreds, AzureBackendConfigWithCreds, GCPBackendConfigWithCreds, HotAisleBackendConfigWithCreds, LambdaBackendConfigWithCreds, NebiusBackendConfigWithCreds, RunpodBackendConfigWithCreds, VastAIBackendConfigWithCreds, KubernetesConfig]' #### `projects[n].backends` { #backends data-toc-label="backends" } @@ -126,6 +126,23 @@ to configure [backends](../../concepts/backends.md) and other [sever-level setti type: required: true +##### `projects[n].backends[type=hotaisle]` { #hotaisle data-toc-label="hotaisle" } + +#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: hotaisle- + +###### `projects[n].backends[type=hotaisle].creds` { #hotaisle-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleAPIKeyCreds + overrides: + 
show_root_heading: false + type: + required: true + ##### `projects[n].backends[type=lambda]` { #lambda data-toc-label="lambda" } #SCHEMA# dstack._internal.core.backends.lambdalabs.models.LambdaBackendConfigWithCreds diff --git a/pyproject.toml b/pyproject.toml index 5ff9c95670..bbc52084c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "python-multipart>=0.0.16", "filelock", "psutil", - "gpuhunt==0.1.6", + "gpuhunt==0.1.7", "argcomplete>=3.5.0", "ignore-python>=0.2.0", "orjson", diff --git a/src/dstack/_internal/core/backends/configurators.py b/src/dstack/_internal/core/backends/configurators.py index 571d010529..a2df6a4e63 100644 --- a/src/dstack/_internal/core/backends/configurators.py +++ b/src/dstack/_internal/core/backends/configurators.py @@ -54,6 +54,15 @@ except ImportError: pass +try: + from dstack._internal.core.backends.hotaisle.configurator import ( + HotAisleConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(HotAisleConfigurator) +except ImportError: + pass + try: from dstack._internal.core.backends.kubernetes.configurator import ( KubernetesConfigurator, diff --git a/src/dstack/_internal/core/backends/hotaisle/__init__.py b/src/dstack/_internal/core/backends/hotaisle/__init__.py new file mode 100644 index 0000000000..9c665d1498 --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/__init__.py @@ -0,0 +1 @@ +# Hotaisle backend for dstack diff --git a/src/dstack/_internal/core/backends/hotaisle/api_client.py b/src/dstack/_internal/core/backends/hotaisle/api_client.py new file mode 100644 index 0000000000..866ff2acbd --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/api_client.py @@ -0,0 +1,109 @@ +from typing import Any, Dict, Optional + +import requests + +from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error +from dstack._internal.utils.logging import get_logger + +API_URL = "https://admin.hotaisle.app/api" + +logger = get_logger(__name__) + + +class 
HotAisleAPIClient: + def __init__(self, api_key: str, team_handle: str): + self.api_key = api_key + self.team_handle = team_handle + + def validate_api_key(self) -> bool: + try: + self._validate_user_and_team() + return True + except requests.HTTPError as e: + if e.response.status_code == 401: + raise_invalid_credentials_error( + fields=[["creds", "api_key"]], details="Invalid API key" + ) + elif e.response.status_code == 403: + raise_invalid_credentials_error( + fields=[["creds", "api_key"]], + details="Authenticated user does not have required permissions", + ) + raise e + except ValueError as e: + error_message = str(e) + if "No Hot Aisle teams found" in error_message: + raise_invalid_credentials_error( + fields=[["creds", "api_key"]], + details="Valid API key but no teams found for this user", + ) + elif "not found" in error_message: + raise_invalid_credentials_error( + fields=[["team_handle"]], details=f"Team handle '{self.team_handle}' not found" + ) + raise e + + def _validate_user_and_team(self) -> None: + url = f"{API_URL}/user/" + response = self._make_request("GET", url) + response.raise_for_status() + user_data = response.json() + + teams = user_data.get("teams", []) + if not teams: + raise ValueError("No Hot Aisle teams found for this user") + + available_teams = [team["handle"] for team in teams] + if self.team_handle not in available_teams: + raise ValueError(f"Hot Aisle team '{self.team_handle}' not found.") + + def upload_ssh_key(self, public_key: str) -> bool: + url = f"{API_URL}/user/ssh_keys/" + payload = {"authorized_key": public_key} + + response = self._make_request("POST", url, json=payload) + + if response.status_code == 409: + return True # Key already exists - success + response.raise_for_status() + return True + + def create_virtual_machine(self, vm_payload: Dict[str, Any]) -> Dict[str, Any]: + url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/" + response = self._make_request("POST", url, json=vm_payload) + 
response.raise_for_status() + vm_data = response.json() + return vm_data + + def get_vm_state(self, vm_name: str) -> str: + url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/state/" + response = self._make_request("GET", url) + response.raise_for_status() + state_data = response.json() + return state_data["state"] + + def terminate_virtual_machine(self, vm_name: str) -> None: + url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/" + response = self._make_request("DELETE", url) + if response.status_code == 404: + logger.debug("Hot Aisle virtual machine %s not found", vm_name) + return + response.raise_for_status() + + def _make_request( + self, method: str, url: str, json: Optional[Dict[str, Any]] = None, timeout: int = 30 + ) -> requests.Response: + headers = { + "accept": "application/json", + "Authorization": f"Token {self.api_key}", + } + if json is not None: + headers["Content-Type"] = "application/json" + + return requests.request( + method=method, + url=url, + headers=headers, + json=json, + timeout=timeout, + ) diff --git a/src/dstack/_internal/core/backends/hotaisle/backend.py b/src/dstack/_internal/core/backends/hotaisle/backend.py new file mode 100644 index 0000000000..cb568f5258 --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.hotaisle.compute import HotAisleCompute +from dstack._internal.core.backends.hotaisle.models import HotAisleConfig +from dstack._internal.core.models.backends.base import BackendType + + +class HotAisleBackend(Backend): + TYPE = BackendType.HOTAISLE + COMPUTE_CLASS = HotAisleCompute + + def __init__(self, config: HotAisleConfig): + self.config = config + self._compute = HotAisleCompute(self.config) + + def compute(self) -> HotAisleCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/hotaisle/compute.py 
b/src/dstack/_internal/core/backends/hotaisle/compute.py new file mode 100644 index 0000000000..8502f3fe2b --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/compute.py @@ -0,0 +1,225 @@ +import shlex +import subprocess +import tempfile +from threading import Thread +from typing import List, Optional + +import gpuhunt +from gpuhunt.providers.hotaisle import HotAisleProvider + +from dstack._internal.core.backends.base.compute import ( + Compute, + ComputeWithCreateInstanceSupport, + get_shim_commands, +) +from dstack._internal.core.backends.base.offers import get_catalog_offers +from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient +from dstack._internal.core.backends.hotaisle.models import HotAisleConfig +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceConfiguration, + InstanceOfferWithAvailability, +) +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.runs import JobProvisioningData, Requirements +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +MAX_INSTANCE_NAME_LEN = 60 + + +INSTANCE_TYPE_SPECS = { + "1x MI300X 8x Xeon Platinum 8462Y+": { + "cpu_model": "Xeon Platinum 8462Y+", + "cpu_frequency": 2800000000, + "cpu_manufacturer": "Intel", + }, + "1x MI300X 13x Xeon Platinum 8470": { + "cpu_model": "Xeon Platinum 8470", + "cpu_frequency": 2000000000, + "cpu_manufacturer": "Intel", + }, +} + + +class HotAisleCompute( + ComputeWithCreateInstanceSupport, + Compute, +): + def __init__(self, config: HotAisleConfig): + super().__init__() + self.config = config + self.api_client = HotAisleAPIClient(config.creds.api_key, config.team_handle) + self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False) + self.catalog.add_provider( + 
HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle) + ) + + def get_offers( + self, requirements: Optional[Requirements] = None + ) -> List[InstanceOfferWithAvailability]: + offers = get_catalog_offers( + backend=BackendType.HOTAISLE, + locations=self.config.regions or None, + requirements=requirements, + catalog=self.catalog, + ) + + supported_offers = [] + for offer in offers: + if offer.instance.name in INSTANCE_TYPE_SPECS: + supported_offers.append( + InstanceOfferWithAvailability( + **offer.dict(), availability=InstanceAvailability.AVAILABLE + ) + ) + else: + logger.warning( + f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}" + ) + + return supported_offers + + def get_payload_from_offer(self, instance_type) -> dict: + instance_type_name = instance_type.name + cpu_specs = INSTANCE_TYPE_SPECS[instance_type_name] + cpu_cores = instance_type.resources.cpus + + return { + "cpu_cores": cpu_cores, + "cpus": { + "count": 1, + "manufacturer": cpu_specs["cpu_manufacturer"], + "model": cpu_specs["cpu_model"], + "cores": cpu_cores, + "frequency": cpu_specs["cpu_frequency"], + }, + "disk_capacity": instance_type.resources.disk.size_mib * 1024**2, + "ram_capacity": instance_type.resources.memory_mib * 1024**2, + "gpus": [ + { + "count": len(instance_type.resources.gpus), + "manufacturer": instance_type.resources.gpus[0].vendor, + "model": instance_type.resources.gpus[0].name, + } + ], + } + + def create_instance( + self, + instance_offer: InstanceOfferWithAvailability, + instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + project_ssh_key = instance_config.ssh_keys[0] + self.api_client.upload_ssh_key(project_ssh_key.public) + vm_payload = self.get_payload_from_offer(instance_offer.instance) + vm_data = self.api_client.create_virtual_machine(vm_payload) + return JobProvisioningData( + backend=instance_offer.backend, + instance_type=instance_offer.instance, + 
instance_id=vm_data["name"], + hostname=None, + internal_ip=None, + region=instance_offer.region, + price=instance_offer.price, + username="hotaisle", + ssh_port=22, + dockerized=True, + ssh_proxy=None, + backend_data=HotAisleInstanceBackendData( + ip_address=vm_data["ip_address"], vm_id=vm_data["name"] + ).json(), + ) + + def update_provisioning_data( + self, + provisioning_data: JobProvisioningData, + project_ssh_public_key: str, + project_ssh_private_key: str, + ): + vm_state = self.api_client.get_vm_state(provisioning_data.instance_id) + if vm_state == "running": + if provisioning_data.hostname is None and provisioning_data.backend_data: + backend_data = HotAisleInstanceBackendData.load(provisioning_data.backend_data) + provisioning_data.hostname = backend_data.ip_address + commands = get_shim_commands( + authorized_keys=[project_ssh_public_key], + arch=provisioning_data.instance_type.resources.cpu_arch, + ) + launch_command = "sudo sh -c " + shlex.quote(" && ".join(commands)) + thread = Thread( + target=_start_runner, + kwargs={ + "hostname": provisioning_data.hostname, + "project_ssh_private_key": project_ssh_private_key, + "launch_command": launch_command, + }, + daemon=True, + ) + thread.start() + + def terminate_instance( + self, instance_id: str, region: str, backend_data: Optional[str] = None + ): + vm_name = instance_id + self.api_client.terminate_virtual_machine(vm_name) + + +def _start_runner( + hostname: str, + project_ssh_private_key: str, + launch_command: str, +): + _launch_runner( + hostname=hostname, + ssh_private_key=project_ssh_private_key, + launch_command=launch_command, + ) + + +def _launch_runner( + hostname: str, + ssh_private_key: str, + launch_command: str, +): + daemonized_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown" + _run_ssh_command( + hostname=hostname, + ssh_private_key=ssh_private_key, + command=daemonized_command, + ) + + +def _run_ssh_command(hostname: str, ssh_private_key: str, command: str): + 
with tempfile.NamedTemporaryFile("w+") as f: + f.write(ssh_private_key) + f.flush() + subprocess.run( + [ + "ssh", + "-F", + "none", + "-o", + "StrictHostKeyChecking=no", + "-i", + f.name, + f"hotaisle@{hostname}", + command, + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + + +class HotAisleInstanceBackendData(CoreModel): + ip_address: str + vm_id: Optional[str] = None + + @classmethod + def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData": + assert raw is not None + return cls.__response__.parse_raw(raw) diff --git a/src/dstack/_internal/core/backends/hotaisle/configurator.py b/src/dstack/_internal/core/backends/hotaisle/configurator.py new file mode 100644 index 0000000000..c7a6a6006e --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/configurator.py @@ -0,0 +1,60 @@ +import json + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, +) +from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient +from dstack._internal.core.backends.hotaisle.backend import HotAisleBackend +from dstack._internal.core.backends.hotaisle.models import ( + AnyHotAisleBackendConfig, + AnyHotAisleCreds, + HotAisleBackendConfig, + HotAisleBackendConfigWithCreds, + HotAisleConfig, + HotAisleCreds, + HotAisleStoredConfig, +) +from dstack._internal.core.models.backends.base import ( + BackendType, +) + + +class HotAisleConfigurator(Configurator): + TYPE = BackendType.HOTAISLE + BACKEND_CLASS = HotAisleBackend + + def validate_config(self, config: HotAisleBackendConfigWithCreds, default_creds_enabled: bool): + self._validate_creds(config.creds, config.team_handle) + + def create_backend( + self, project_name: str, config: HotAisleBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=HotAisleStoredConfig( + **HotAisleBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=HotAisleCreds.parse_obj(config.creds).json(), + ) + + def 
get_backend_config( + self, record: BackendRecord, include_creds: bool + ) -> AnyHotAisleBackendConfig: + config = self._get_config(record) + if include_creds: + return HotAisleBackendConfigWithCreds.__response__.parse_obj(config) + return HotAisleBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> HotAisleBackend: + config = self._get_config(record) + return HotAisleBackend(config=config) + + def _get_config(self, record: BackendRecord) -> HotAisleConfig: + return HotAisleConfig.__response__( + **json.loads(record.config), + creds=HotAisleCreds.parse_raw(record.auth), + ) + + def _validate_creds(self, creds: AnyHotAisleCreds, team_handle: str): + api_client = HotAisleAPIClient(creds.api_key, team_handle) + api_client.validate_api_key() diff --git a/src/dstack/_internal/core/backends/hotaisle/models.py b/src/dstack/_internal/core/backends/hotaisle/models.py new file mode 100644 index 0000000000..efee6b4e93 --- /dev/null +++ b/src/dstack/_internal/core/backends/hotaisle/models.py @@ -0,0 +1,45 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class HotAisleAPIKeyCreds(CoreModel): + type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + api_key: Annotated[str, Field(description="The Hot Aisle API key")] + + +AnyHotAisleCreds = HotAisleAPIKeyCreds +HotAisleCreds = AnyHotAisleCreds + + +class HotAisleBackendConfig(CoreModel): + type: Annotated[ + Literal["hotaisle"], + Field(description="The type of backend"), + ] = "hotaisle" + team_handle: Annotated[str, Field(description="The Hot Aisle team handle")] + regions: Annotated[ + Optional[List[str]], + Field(description="The list of Hot Aisle regions. 
Omit to use all regions"), + ] = None + + +class HotAisleBackendConfigWithCreds(HotAisleBackendConfig): + creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")] + + +AnyHotAisleBackendConfig = Union[HotAisleBackendConfig, HotAisleBackendConfigWithCreds] + + +class HotAisleBackendFileConfigWithCreds(HotAisleBackendConfig): + creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")] + + +class HotAisleStoredConfig(HotAisleBackendConfig): + pass + + +class HotAisleConfig(HotAisleStoredConfig): + creds: AnyHotAisleCreds diff --git a/src/dstack/_internal/core/backends/models.py b/src/dstack/_internal/core/backends/models.py index 0b5779db78..1097459704 100644 --- a/src/dstack/_internal/core/backends/models.py +++ b/src/dstack/_internal/core/backends/models.py @@ -29,6 +29,11 @@ GCPBackendConfigWithCreds, GCPBackendFileConfigWithCreds, ) +from dstack._internal.core.backends.hotaisle.models import ( + HotAisleBackendConfig, + HotAisleBackendConfigWithCreds, + HotAisleBackendFileConfigWithCreds, +) from dstack._internal.core.backends.kubernetes.models import ( KubernetesBackendConfig, KubernetesBackendConfigWithCreds, @@ -73,6 +78,7 @@ CudoBackendConfig, DataCrunchBackendConfig, GCPBackendConfig, + HotAisleBackendConfig, KubernetesBackendConfig, LambdaBackendConfig, NebiusBackendConfig, @@ -95,6 +101,7 @@ CudoBackendConfigWithCreds, DataCrunchBackendConfigWithCreds, GCPBackendConfigWithCreds, + HotAisleBackendConfigWithCreds, KubernetesBackendConfigWithCreds, LambdaBackendConfigWithCreds, OCIBackendConfigWithCreds, @@ -116,6 +123,7 @@ CudoBackendConfigWithCreds, DataCrunchBackendConfigWithCreds, GCPBackendFileConfigWithCreds, + HotAisleBackendFileConfigWithCreds, KubernetesBackendFileConfigWithCreds, LambdaBackendConfigWithCreds, OCIBackendConfigWithCreds, diff --git a/src/dstack/_internal/core/models/backends/base.py b/src/dstack/_internal/core/models/backends/base.py index 78aafb142c..067e181f6b 100644 --- 
a/src/dstack/_internal/core/models/backends/base.py +++ b/src/dstack/_internal/core/models/backends/base.py @@ -11,6 +11,7 @@ class BackendType(str, enum.Enum): DSTACK (BackendType): dstack Sky GCP (BackendType): Google Cloud Platform DATACRUNCH (BackendType): DataCrunch + HOTAISLE (BackendType): Hot Aisle KUBERNETES (BackendType): Kubernetes LAMBDA (BackendType): Lambda Cloud NEBIUS (BackendType): Nebius AI Cloud @@ -28,6 +29,7 @@ class BackendType(str, enum.Enum): DATACRUNCH = "datacrunch" DSTACK = "dstack" GCP = "gcp" + HOTAISLE = "hotaisle" KUBERNETES = "kubernetes" LAMBDA = "lambda" LOCAL = "local" diff --git a/src/tests/_internal/server/routers/test_backends.py b/src/tests/_internal/server/routers/test_backends.py index 6afe36c0c6..6bac9383a3 100644 --- a/src/tests/_internal/server/routers/test_backends.py +++ b/src/tests/_internal/server/routers/test_backends.py @@ -83,6 +83,7 @@ async def test_returns_backend_types(self, client: AsyncClient): "cudo", "datacrunch", "gcp", + "hotaisle", "kubernetes", "lambda", *(["nebius"] if sys.version_info >= (3, 10) else []),