diff --git a/src/dstack/_internal/core/backends/tensordock/__init__.py b/src/dstack/_internal/core/backends/tensordock/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/dstack/_internal/core/backends/tensordock/api_client.py b/src/dstack/_internal/core/backends/tensordock/api_client.py deleted file mode 100644 index a45772bf40..0000000000 --- a/src/dstack/_internal/core/backends/tensordock/api_client.py +++ /dev/null @@ -1,104 +0,0 @@ -import uuid - -import requests -import yaml - -from dstack._internal.core.errors import BackendError -from dstack._internal.core.models.instances import InstanceType -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) -REQUEST_TIMEOUT = 20 - - -class TensorDockAPIClient: - def __init__(self, api_key: str, api_token: str): - self.api_url = "https://marketplace.tensordock.com/api/v0".rstrip("/") - self.api_key = api_key - self.api_token = api_token - self.s = requests.Session() - - def auth_test(self) -> bool: - resp = self.s.post( - self._url("/auth/test"), - data={"api_key": self.api_key, "api_token": self.api_token}, - timeout=REQUEST_TIMEOUT, - ) - resp.raise_for_status() - return resp.json()["success"] - - def get_hostnode(self, hostnode_id: str) -> dict: - logger.debug("Fetching hostnode %s", hostnode_id) - resp = self.s.get( - self._url(f"/client/deploy/hostnodes/{hostnode_id}"), timeout=REQUEST_TIMEOUT - ) - resp.raise_for_status() - data = resp.json() - if not data["success"]: - raise requests.HTTPError(data) - return data["hostnode"] - - def deploy_single(self, instance_name: str, instance: InstanceType, cloudinit: dict) -> dict: - hostnode = self.get_hostnode(instance.name) - gpu = instance.resources.gpus[0] - for gpu_model in hostnode["specs"]["gpu"].keys(): - if gpu_model.endswith(f"-{gpu.memory_mib // 1024}gb"): - if gpu.name.lower() in gpu_model.lower(): - break - else: - raise ValueError(f"Can't find GPU on the hostnode: {gpu.name}") - form = { - "api_key": self.api_key, - "api_token": self.api_token, - "password": uuid.uuid4().hex, # we disable the password auth, but it's required - "name": instance_name, - "gpu_count": len(instance.resources.gpus), - "gpu_model": gpu_model, - "vcpus": instance.resources.cpus, - "ram": instance.resources.memory_mib // 1024, - "external_ports": "{%s}" - % max(hostnode["networking"]["ports"]), # it's safer to use a higher port - "internal_ports": "{22}", - "hostnode": instance.name, - "storage": round(instance.resources.disk.size_mib / 1024), - "operating_system": "Ubuntu 22.04 LTS", - "cloudinit_script": yaml.dump(cloudinit).replace("\n", "\\n"), - } - logger.debug( - "Deploying instance hostnode=%s, cpus=%s, memory=%s, gpu=%sx %s", - form["hostnode"], - form["vcpus"], - form["ram"], - form["gpu_count"], - form["gpu_model"], - ) - resp = self.s.post(self._url("/client/deploy/single"), data=form, timeout=REQUEST_TIMEOUT) - resp.raise_for_status() - data = resp.json() - if not data["success"]: - raise requests.HTTPError(data) - data["password"] = form["password"] - return data - - def delete_single_if_exists(self, instance_id: str): - logger.debug("Deleting instance %s", instance_id) - resp = self.s.post( - self._url("/client/delete/single"), - data={ - "api_key": self.api_key, - "api_token": self.api_token, - "server": instance_id, - }, - timeout=REQUEST_TIMEOUT, - ) - try: - data = resp.json() - if "already terminated" in data.get("error", ""): - return - if not data.get("success"): - raise BackendError(data) - except ValueError: # json parsing error - raise BackendError(resp.text) - - def _url(self, path): - return f"{self.api_url}/{path.lstrip('/')}" diff --git a/src/dstack/_internal/core/backends/tensordock/backend.py b/src/dstack/_internal/core/backends/tensordock/backend.py deleted file mode 100644 index f40755bc8b..0000000000 --- a/src/dstack/_internal/core/backends/tensordock/backend.py +++ /dev/null @@ -1,16 +0,0 @@ -from dstack._internal.core.backends.base.backend import Backend -from dstack._internal.core.backends.tensordock.compute import TensorDockCompute -from dstack._internal.core.backends.tensordock.models import TensorDockConfig -from dstack._internal.core.models.backends.base import BackendType - - -class TensorDockBackend(Backend): - TYPE = BackendType.TENSORDOCK - COMPUTE_CLASS = TensorDockCompute - - def __init__(self, config: TensorDockConfig): - self.config = config - self._compute = TensorDockCompute(self.config) - - def compute(self) -> TensorDockCompute: - return self._compute diff --git a/src/dstack/_internal/core/backends/tensordock/compute.py b/src/dstack/_internal/core/backends/tensordock/compute.py deleted file mode 100644 index 700b51a169..0000000000 --- a/src/dstack/_internal/core/backends/tensordock/compute.py +++ /dev/null @@ -1,122 +0,0 @@ -import json -from typing import List, Optional - -import requests - -from dstack._internal.core.backends.base.backend import Compute -from dstack._internal.core.backends.base.compute import ( - ComputeWithCreateInstanceSupport, - generate_unique_instance_name, - get_shim_commands, -) -from dstack._internal.core.backends.base.offers import get_catalog_offers -from dstack._internal.core.backends.tensordock.api_client import TensorDockAPIClient -from dstack._internal.core.backends.tensordock.models import TensorDockConfig -from dstack._internal.core.errors import NoCapacityError -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import ( - InstanceAvailability, - InstanceConfiguration, - InstanceOfferWithAvailability, -) -from dstack._internal.core.models.placement import PlacementGroup -from dstack._internal.core.models.runs import JobProvisioningData, Requirements -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - - -# Undocumented but names of len 60 work -MAX_INSTANCE_NAME_LEN = 60 - - -class TensorDockCompute( - ComputeWithCreateInstanceSupport, - Compute, -): - def __init__(self, config: TensorDockConfig): - super().__init__() - self.config = config - self.api_client = TensorDockAPIClient(config.creds.api_key, config.creds.api_token) - - def get_offers( - self, requirements: Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: - offers = get_catalog_offers( - backend=BackendType.TENSORDOCK, - requirements=requirements, - ) - offers = [ - InstanceOfferWithAvailability( - **offer.dict(), availability=InstanceAvailability.AVAILABLE - ) - for offer in offers - ] - return offers - - def create_instance( - self, - instance_offer: InstanceOfferWithAvailability, - instance_config: InstanceConfiguration, - placement_group: Optional[PlacementGroup], - ) -> JobProvisioningData: - instance_name = generate_unique_instance_name( - instance_config, max_length=MAX_INSTANCE_NAME_LEN - ) - commands = get_shim_commands(authorized_keys=instance_config.get_public_keys()) - try: - resp = self.api_client.deploy_single( - instance_name=instance_name, - instance=instance_offer.instance, - cloudinit={ - "ssh_pwauth": False, # disable password auth - "users": [ - "default", - { - "name": "user", - "ssh_authorized_keys": instance_config.get_public_keys(), - }, - ], - "runcmd": [ - ["sh", "-c", " && ".join(commands)], - ], - "write_files": [ - { - "path": "/etc/docker/daemon.json", - "content": json.dumps( - { - "runtimes": { - "nvidia": { - "path": "nvidia-container-runtime", - "runtimeArgs": [], - } - }, - "exec-opts": ["native.cgroupdriver=cgroupfs"], - } - ), - } - ], - }, - ) - except requests.HTTPError as e: - logger.warning("Got error from tensordock: %s", e) - raise NoCapacityError() - return JobProvisioningData( - backend=instance_offer.backend, - instance_type=instance_offer.instance, - instance_id=resp["server"], - hostname=resp["ip"], - internal_ip=None, - region=instance_offer.region, - price=instance_offer.price, - username="user", - ssh_port={int(v): int(k) for k, v in resp["port_forwards"].items()}[22], - dockerized=True, - ssh_proxy=None, - backend_data=None, - ) - - def terminate_instance( - self, instance_id: str, region: str, backend_data: Optional[str] = None - ): - self.api_client.delete_single_if_exists(instance_id) diff --git a/src/dstack/_internal/core/backends/tensordock/configurator.py b/src/dstack/_internal/core/backends/tensordock/configurator.py deleted file mode 100644 index 0582b63431..0000000000 --- a/src/dstack/_internal/core/backends/tensordock/configurator.py +++ /dev/null @@ -1,74 +0,0 @@ -import json - -from dstack._internal.core.backends.base.configurator import ( - BackendRecord, - Configurator, - raise_invalid_credentials_error, -) -from dstack._internal.core.backends.tensordock import api_client -from dstack._internal.core.backends.tensordock.backend import TensorDockBackend -from dstack._internal.core.backends.tensordock.models import ( - TensorDockBackendConfig, - TensorDockBackendConfigWithCreds, - TensorDockConfig, - TensorDockCreds, - TensorDockStoredConfig, -) -from dstack._internal.core.models.backends.base import ( - BackendType, -) - -# TensorDock regions are dynamic, currently we don't offer any filtering -REGIONS = [] - - -class TensorDockConfigurator( - Configurator[ - TensorDockBackendConfig, - TensorDockBackendConfigWithCreds, - ] -): - TYPE = BackendType.TENSORDOCK - BACKEND_CLASS = TensorDockBackend - - def validate_config( - self, config: TensorDockBackendConfigWithCreds, default_creds_enabled: bool - ): - self._validate_tensordock_creds(config.creds.api_key, config.creds.api_token) - - def create_backend( - self, project_name: str, config: TensorDockBackendConfigWithCreds - ) -> BackendRecord: - if config.regions is None: - config.regions = REGIONS - return BackendRecord( - config=TensorDockStoredConfig( - **TensorDockBackendConfig.__response__.parse_obj(config).dict() - ).json(), - auth=TensorDockCreds.parse_obj(config.creds).json(), - ) - - def get_backend_config_with_creds( - self, record: BackendRecord - ) -> TensorDockBackendConfigWithCreds: - config = self._get_config(record) - return TensorDockBackendConfigWithCreds.__response__.parse_obj(config) - - def get_backend_config_without_creds(self, record: BackendRecord) -> TensorDockBackendConfig: - config = self._get_config(record) - return TensorDockBackendConfig.__response__.parse_obj(config) - - def get_backend(self, record: BackendRecord) -> TensorDockBackend: - config = self._get_config(record) - return TensorDockBackend(config=config) - - def _get_config(self, record: BackendRecord) -> TensorDockConfig: - return TensorDockConfig.__response__( - **json.loads(record.config), - creds=TensorDockCreds.parse_raw(record.auth), - ) - - def _validate_tensordock_creds(self, api_key: str, api_token: str): - client = api_client.TensorDockAPIClient(api_key=api_key, api_token=api_token) - if not client.auth_test(): - raise_invalid_credentials_error(fields=[["creds", "api_key"], ["creds", "api_token"]]) diff --git a/src/dstack/_internal/core/backends/tensordock/models.py b/src/dstack/_internal/core/backends/tensordock/models.py index 171f1edf6d..d031b515ac 100644 --- a/src/dstack/_internal/core/backends/tensordock/models.py +++ b/src/dstack/_internal/core/backends/tensordock/models.py @@ -4,6 +4,8 @@ from dstack._internal.core.models.common import CoreModel +# TODO: TensorDock is deprecated and will be removed in the future + class TensorDockAPIKeyCreds(CoreModel): type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" diff --git a/src/tests/_internal/core/backends/tensordock/test_configurator.py b/src/tests/_internal/core/backends/tensordock/test_configurator.py deleted file mode 100644 index 934308adad..0000000000 --- a/src/tests/_internal/core/backends/tensordock/test_configurator.py +++ /dev/null @@ -1,38 +0,0 @@ -from unittest.mock import patch - -import pytest - -from dstack._internal.core.backends.tensordock.configurator import ( - TensorDockConfigurator, -) -from dstack._internal.core.backends.tensordock.models import ( - TensorDockBackendConfigWithCreds, - TensorDockCreds, -) -from dstack._internal.core.errors import BackendInvalidCredentialsError - - -class TestTensorDockConfigurator: - def test_validate_config_valid(self): - config = TensorDockBackendConfigWithCreds( - creds=TensorDockCreds(api_key="valid", api_token="valid"), - ) - with patch( - "dstack._internal.core.backends.tensordock.api_client.TensorDockAPIClient.auth_test" - ) as auth_test_mock: - auth_test_mock.return_value = True - TensorDockConfigurator().validate_config(config, default_creds_enabled=True) - - def test_validate_config_invalid_creds(self): - config = TensorDockBackendConfigWithCreds( - creds=TensorDockCreds(api_key="invalid", api_token="invalid"), - ) - with ( - patch( - "dstack._internal.core.backends.tensordock.api_client.TensorDockAPIClient.auth_test" - ) as auth_test_mock, - pytest.raises(BackendInvalidCredentialsError) as exc_info, - ): - auth_test_mock.return_value = False - TensorDockConfigurator().validate_config(config, default_creds_enabled=True) - assert exc_info.value.fields == [["creds", "api_key"], ["creds", "api_token"]] diff --git a/src/tests/_internal/server/routers/test_backends.py b/src/tests/_internal/server/routers/test_backends.py index 33dd2147b5..a640dcb84d 100644 --- a/src/tests/_internal/server/routers/test_backends.py +++ b/src/tests/_internal/server/routers/test_backends.py @@ -91,7 +91,6 @@ async def test_returns_backend_types(self, client: AsyncClient): *(["nebius"] if sys.version_info >= (3, 10) else []), "oci", "runpod", - "tensordock", "vastai", "vultr", ]