From 856344383eceab34daf7ca7d070b71eb5f7396db Mon Sep 17 00:00:00 2001 From: API Engineering Date: Wed, 11 Mar 2026 06:34:52 +0000 Subject: [PATCH] [bot] Updated client based on openapi-1bd3add/clientgen --- DO_OPENAPI_COMMIT_SHA.txt | 2 +- src/pydo/_client.py | 6 + src/pydo/aio/_client.py | 6 + src/pydo/aio/operations/__init__.py | 2 + src/pydo/aio/operations/_operations.py | 2847 +++++++++++++++++++++ src/pydo/operations/__init__.py | 2 + src/pydo/operations/_operations.py | 3168 ++++++++++++++++++++++++ 7 files changed, 6032 insertions(+), 1 deletion(-) diff --git a/DO_OPENAPI_COMMIT_SHA.txt b/DO_OPENAPI_COMMIT_SHA.txt index 3e045b5c..dd085343 100644 --- a/DO_OPENAPI_COMMIT_SHA.txt +++ b/DO_OPENAPI_COMMIT_SHA.txt @@ -1 +1 @@ -cf0a60a +1bd3add diff --git a/src/pydo/_client.py b/src/pydo/_client.py index 0fe21400..59a62d9a 100644 --- a/src/pydo/_client.py +++ b/src/pydo/_client.py @@ -26,6 +26,7 @@ CdnOperations, CertificatesOperations, DatabasesOperations, + DedicatedInferencesOperations, DomainsOperations, DropletActionsOperations, DropletsOperations, @@ -597,6 +598,8 @@ class GeneratedClient: # pylint: disable=client-accepts-api-version-keyword,too :vartype billing_insights: pydo.operations.BillingInsightsOperations :ivar databases: DatabasesOperations operations :vartype databases: pydo.operations.DatabasesOperations + :ivar dedicated_inferences: DedicatedInferencesOperations operations + :vartype dedicated_inferences: pydo.operations.DedicatedInferencesOperations :ivar domains: DomainsOperations operations :vartype domains: pydo.operations.DomainsOperations :ivar droplets: DropletsOperations operations @@ -746,6 +749,9 @@ def __init__( self.databases = DatabasesOperations( self._client, self._config, self._serialize, self._deserialize ) + self.dedicated_inferences = DedicatedInferencesOperations( + self._client, self._config, self._serialize, self._deserialize + ) self.domains = DomainsOperations( self._client, self._config, self._serialize, self._deserialize 
) diff --git a/src/pydo/aio/_client.py b/src/pydo/aio/_client.py index 69354835..2316f3d5 100644 --- a/src/pydo/aio/_client.py +++ b/src/pydo/aio/_client.py @@ -26,6 +26,7 @@ CdnOperations, CertificatesOperations, DatabasesOperations, + DedicatedInferencesOperations, DomainsOperations, DropletActionsOperations, DropletsOperations, @@ -597,6 +598,8 @@ class GeneratedClient: # pylint: disable=client-accepts-api-version-keyword,too :vartype billing_insights: pydo.aio.operations.BillingInsightsOperations :ivar databases: DatabasesOperations operations :vartype databases: pydo.aio.operations.DatabasesOperations + :ivar dedicated_inferences: DedicatedInferencesOperations operations + :vartype dedicated_inferences: pydo.aio.operations.DedicatedInferencesOperations :ivar domains: DomainsOperations operations :vartype domains: pydo.aio.operations.DomainsOperations :ivar droplets: DropletsOperations operations @@ -746,6 +749,9 @@ def __init__( self.databases = DatabasesOperations( self._client, self._config, self._serialize, self._deserialize ) + self.dedicated_inferences = DedicatedInferencesOperations( + self._client, self._config, self._serialize, self._deserialize + ) self.domains = DomainsOperations( self._client, self._config, self._serialize, self._deserialize ) diff --git a/src/pydo/aio/operations/__init__.py b/src/pydo/aio/operations/__init__.py index 4a74b7ce..4325fa0f 100644 --- a/src/pydo/aio/operations/__init__.py +++ b/src/pydo/aio/operations/__init__.py @@ -17,6 +17,7 @@ from ._operations import InvoicesOperations from ._operations import BillingInsightsOperations from ._operations import DatabasesOperations +from ._operations import DedicatedInferencesOperations from ._operations import DomainsOperations from ._operations import DropletsOperations from ._operations import DropletActionsOperations @@ -70,6 +71,7 @@ "InvoicesOperations", "BillingInsightsOperations", "DatabasesOperations", + "DedicatedInferencesOperations", "DomainsOperations", 
"DropletsOperations", "DropletActionsOperations", diff --git a/src/pydo/aio/operations/_operations.py b/src/pydo/aio/operations/_operations.py index e68d3123..79e518c0 100644 --- a/src/pydo/aio/operations/_operations.py +++ b/src/pydo/aio/operations/_operations.py @@ -177,6 +177,19 @@ build_databases_update_region_request, build_databases_update_sql_mode_request, build_databases_update_user_request, + build_dedicated_inferences_create_request, + build_dedicated_inferences_create_tokens_request, + build_dedicated_inferences_delete_request, + build_dedicated_inferences_delete_tokens_request, + build_dedicated_inferences_get_accelerator_request, + build_dedicated_inferences_get_ca_request, + build_dedicated_inferences_get_gpu_model_config_request, + build_dedicated_inferences_get_request, + build_dedicated_inferences_list_accelerators_request, + build_dedicated_inferences_list_request, + build_dedicated_inferences_list_sizes_request, + build_dedicated_inferences_list_tokens_request, + build_dedicated_inferences_patch_request, build_domains_create_record_request, build_domains_create_request, build_domains_delete_record_request, @@ -121819,6 +121832,2840 @@ async def delete_opensearch_index( return deserialized # type: ignore +class DedicatedInferencesOperations: + """ + .. warning:: + **DO NOT** instantiate this class directly. + + Instead, you should access the following operations through + :class:`~pydo.aio.GeneratedClient`'s + :attr:`dedicated_inferences` attribute. 
+ """ + + def __init__(self, *args, **kwargs) -> None: + input_args = list(args) + self._client = input_args.pop(0) if input_args else kwargs.pop("client") + self._config = input_args.pop(0) if input_args else kwargs.pop("config") + self._serialize = input_args.pop(0) if input_args else kwargs.pop("serializer") + self._deserialize = ( + input_args.pop(0) if input_args else kwargs.pop("deserializer") + ) + + @distributed_trace_async + async def get(self, dedicated_inference_id: str, **kwargs: Any) -> JSON: + # pylint: disable=line-too-long + """Get a Dedicated Inference. + + Retrieve an existing Dedicated Inference by ID. Send a GET request to + ``/v2/dedicated-inferences/{dedicated_inference_id}``. The status in the response + is one of active, new, provisioning, updating, deleting, or error. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. 
+ "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. 
Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. 
+ } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_get_request( + dedicated_inference_id=dedicated_inference_id, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + await self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200, 404]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 200: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + 
response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @overload + async def patch( + self, + dedicated_inference_id: str, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """Update a Dedicated Inference. + + Update an existing Dedicated Inference. Send a PATCH request to + ``/v2/dedicated-inferences/{dedicated_inference_id}`` with updated ``spec`` and/or + ``access_tokens``. Status will move to updating and return to active when done. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param body: Required. + :type body: JSON + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "access_tokens": { + "hugging_face_token": "str" # Optional. Hugging Face token required + for gated models. + }, + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public LLM + endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of accelerator + instances. Required. + "type": "str", # Accelerator type + (e.g. prefill_decode). Required. + "status": "str" # Optional. Current + state of the Accelerator. Known values are: "new", + "provisioning", and "active". 
+ } + ], + "model_id": "str", # Optional. Used to identify an + existing deployment when updating; empty means create new. + "model_provider": "str", # Optional. Model provider. + "hugging_face" + "model_slug": "str", # Optional. Model identifier + (e.g. Hugging Face slug). + "workload_config": {} # Optional. Workload-specific + configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be unique + within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated Inference + is hosted. Required. Known values are: "atl1", "nyc2", and "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated Inference. + Required. + } + } + } + + # response body for status code(s): 202 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. 
+ "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. 
+ } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + + @overload + async def patch( + self, + dedicated_inference_id: str, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """Update a Dedicated Inference. + + Update an existing Dedicated Inference. Send a PATCH request to + ``/v2/dedicated-inferences/{dedicated_inference_id}`` with updated ``spec`` and/or + ``access_tokens``. Status will move to updating and return to active when done. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param body: Required. + :type body: IO[bytes] + :keyword content_type: Body Parameter content-type. Content type parameter for binary body. + Default value is "application/json". + :paramtype content_type: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. 
code-block:: python + + # response body for status code(s): 202 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. 
DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. 
+ "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + + @distributed_trace_async + async def patch( + self, dedicated_inference_id: str, body: Union[JSON, IO[bytes]], **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """Update a Dedicated Inference. + + Update an existing Dedicated Inference. Send a PATCH request to + ``/v2/dedicated-inferences/{dedicated_inference_id}`` with updated ``spec`` and/or + ``access_tokens``. Status will move to updating and return to active when done. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param body: Is either a JSON type or a IO[bytes] type. Required. + :type body: JSON or IO[bytes] + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "access_tokens": { + "hugging_face_token": "str" # Optional. Hugging Face token required + for gated models. + }, + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public LLM + endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of accelerator + instances. Required. + "type": "str", # Accelerator type + (e.g. prefill_decode). Required. + "status": "str" # Optional. Current + state of the Accelerator. Known values are: "new", + "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to identify an + existing deployment when updating; empty means create new. + "model_provider": "str", # Optional. 
Model provider. + "hugging_face" + "model_slug": "str", # Optional. Model identifier + (e.g. Hugging Face slug). + "workload_config": {} # Optional. Workload-specific + configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be unique + within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated Inference + is hosted. Required. Known values are: "atl1", "nyc2", and "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated Inference. + Required. + } + } + } + + # response body for status code(s): 202 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). 
+ "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. 
Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + + content_type: Optional[str] = kwargs.pop( + "content_type", _headers.pop("Content-Type", None) + ) + cls: ClsType[JSON] = kwargs.pop("cls", None) + + content_type = content_type or "application/json" + _json = None + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _json = body + + _request = build_dedicated_inferences_patch_request( + dedicated_inference_id=dedicated_inference_id, + content_type=content_type, + json=_json, + content=_content, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + await 
self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [202, 404]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 202: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace_async + async def delete( + self, dedicated_inference_id: str, **kwargs: Any + ) -> Optional[JSON]: + # pylint: disable=line-too-long + """Delete a Dedicated Inference. + + Delete an existing Dedicated Inference. Send a DELETE request to + ``/v2/dedicated-inferences/{dedicated_inference_id}``. The response 202 Accepted + indicates the request was accepted for processing. 
+ + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :return: JSON object or None + :rtype: JSON or None + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[Optional[JSON]] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_delete_request( + dedicated_inference_id=dedicated_inference_id, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + await self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [202, 404]: + if _stream: + await response.read() # Load the body in memory and close the socket + 
map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + deserialized = None + response_headers = {} + if response.status_code == 202: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, deserialized, response_headers) # type: ignore + + return deserialized # type: ignore + + @distributed_trace_async + async def list( + self, + *, + per_page: int = 20, + page: int = 1, + region: Optional[str] = None, + **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """List Dedicated Inferences. + + List all Dedicated Inference instances for your team. Send a GET request to + ``/v2/dedicated-inferences``. You may filter by region and use page and per_page + for pagination. + + :keyword per_page: Number of items returned per page. Default value is 20. + :paramtype per_page: int + :keyword page: Which 'page' of paginated results to return. Default value is 1. + :paramtype page: int + :keyword region: Filter by region. Dedicated Inference is only available in nyc2, tor1, and + atl1. Known values are: "nyc2", "tor1", and "atl1". Default value is None. 
+ :paramtype region: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "dedicated_inferences": [ + { + "created_at": "2020-02-20 00:00:00", # Optional. When the + Dedicated Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private + VPC FQDN of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public + FQDN of the Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated + Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. + Pending deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether + to expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": + "str", # DigitalOcean GPU slug. Required. + "scale": 0, # Number + of accelerator instances. Required. + "type": "str", # + Accelerator type (e.g. prefill_decode). Required. + "status": "str" # + Optional. Current state of the Accelerator. Known + values are: "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used + to identify an existing deployment when updating; empty means + create new. + "model_provider": "str", # Optional. + Model provider. "hugging_face" + "model_slug": "str", # Optional. + Model identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated + Inference. Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. + Pending deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. 
+ "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the + Dedicated Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose + a public LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": + "str", # DigitalOcean GPU slug. Required. + "scale": 0, # Number + of accelerator instances. Required. + "type": "str", # + Accelerator type (e.g. prefill_decode). Required. + "status": "str" # + Optional. Current state of the Accelerator. Known + values are: "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used + to identify an existing deployment when updating; empty means + create new. + "model_provider": "str", # Optional. + Model provider. "hugging_face" + "model_slug": "str", # Optional. + Model identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. + Must be unique within the team. Required. + "region": "str", # DigitalOcean region where the + Dedicated Inference is hosted. Required. Known values are: "atl1", + "nyc2", and "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the + Dedicated Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated + Inference. + } + ], + "links": { + "pages": { + "str": "str" # Optional. Pagination links (first, prev, + next, last). + } + }, + "meta": { + "total": 0 # Total number of results. Required. 
+ } + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_list_request( + per_page=per_page, + page=page, + region=region, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + await self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @overload + async def create( + self, body: JSON, *, content_type: str = "application/json", **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + 
"""Create a Dedicated Inference. + + Create a new Dedicated Inference for your team. Send a POST request to + ``/v2/dedicated-inferences`` with a ``spec`` object (version, name, region, vpc, + enable_public_endpoint, model_deployments) and optional ``access_tokens`` (e.g. + hugging_face_token for gated models). The response code 202 Accepted indicates + the request was accepted for processing; it does not indicate success or failure. + The token value is returned only on create; store it securely. + + :param body: Required. + :type body: JSON + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public LLM + endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of accelerator + instances. Required. + "type": "str", # Accelerator type + (e.g. prefill_decode). Required. + "status": "str" # Optional. Current + state of the Accelerator. Known values are: "new", + "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to identify an + existing deployment when updating; empty means create new. + "model_provider": "str", # Optional. Model provider. + "hugging_face" + "model_slug": "str", # Optional. Model identifier + (e.g. Hugging Face slug). + "workload_config": {} # Optional. Workload-specific + configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be unique + within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated Inference + is hosted. Required. 
Known values are: "atl1", "nyc2", and "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated Inference. + Required. + } + }, + "access_tokens": { + "str": "str" # Optional. Key-value pairs for provider tokens (e.g. + Hugging Face). + } + } + + # response body for status code(s): 202 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". 
+ "updated_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + }, + "token": { + "created_at": "2020-02-20 00:00:00", # Optional. 
Access token for + authenticating to Dedicated Inference endpoints. + "id": "str", # Optional. Unique ID of the token. + "name": "str", # Optional. Name of the token. + "value": "str" # Optional. Token value; only returned once on + create. Store securely. + } + } + """ + + @overload + async def create( + self, body: IO[bytes], *, content_type: str = "application/json", **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """Create a Dedicated Inference. + + Create a new Dedicated Inference for your team. Send a POST request to + ``/v2/dedicated-inferences`` with a ``spec`` object (version, name, region, vpc, + enable_public_endpoint, model_deployments) and optional ``access_tokens`` (e.g. + hugging_face_token for gated models). The response code 202 Accepted indicates + the request was accepted for processing; it does not indicate success or failure. + The token value is returned only on create; store it securely. + + :param body: Required. + :type body: IO[bytes] + :keyword content_type: Body Parameter content-type. Content type parameter for binary body. + Default value is "application/json". + :paramtype content_type: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 202 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. 
+ "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. 
"hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + }, + "token": { + "created_at": "2020-02-20 00:00:00", # Optional. Access token for + authenticating to Dedicated Inference endpoints. + "id": "str", # Optional. Unique ID of the token. + "name": "str", # Optional. Name of the token. + "value": "str" # Optional. Token value; only returned once on + create. Store securely. + } + } + """ + + @distributed_trace_async + async def create(self, body: Union[JSON, IO[bytes]], **kwargs: Any) -> JSON: + # pylint: disable=line-too-long + """Create a Dedicated Inference. + + Create a new Dedicated Inference for your team. Send a POST request to + ``/v2/dedicated-inferences`` with a ``spec`` object (version, name, region, vpc, + enable_public_endpoint, model_deployments) and optional ``access_tokens`` (e.g. + hugging_face_token for gated models). The response code 202 Accepted indicates + the request was accepted for processing; it does not indicate success or failure. + The token value is returned only on create; store it securely. + + :param body: Is either a JSON type or a IO[bytes] type. Required. 
+ :type body: JSON or IO[bytes] + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public LLM + endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of accelerator + instances. Required. + "type": "str", # Accelerator type + (e.g. prefill_decode). Required. + "status": "str" # Optional. Current + state of the Accelerator. Known values are: "new", + "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to identify an + existing deployment when updating; empty means create new. + "model_provider": "str", # Optional. Model provider. + "hugging_face" + "model_slug": "str", # Optional. Model identifier + (e.g. Hugging Face slug). + "workload_config": {} # Optional. Workload-specific + configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be unique + within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated Inference + is hosted. Required. Known values are: "atl1", "nyc2", and "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated Inference. + Required. + } + }, + "access_tokens": { + "str": "str" # Optional. Key-value pairs for provider tokens (e.g. + Hugging Face). + } + } + + # response body for status code(s): 202 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. 
+ }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. 
+ Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + }, + "token": { + "created_at": "2020-02-20 00:00:00", # Optional. Access token for + authenticating to Dedicated Inference endpoints. + "id": "str", # Optional. Unique ID of the token. + "name": "str", # Optional. Name of the token. + "value": "str" # Optional. Token value; only returned once on + create. Store securely. 
+ } + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + + content_type: Optional[str] = kwargs.pop( + "content_type", _headers.pop("Content-Type", None) + ) + cls: ClsType[JSON] = kwargs.pop("cls", None) + + content_type = content_type or "application/json" + _json = None + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _json = body + + _request = build_dedicated_inferences_create_request( + content_type=content_type, + json=_json, + content=_content, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + await self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [202]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if 
cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace_async + async def list_accelerators( + self, + dedicated_inference_id: str, + *, + per_page: int = 20, + page: int = 1, + slug: Optional[str] = None, + **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """List Dedicated Inference Accelerators. + + List all accelerators (GPUs) in use by a Dedicated Inference instance. Send a + GET request to ``/v2/dedicated-inferences/{dedicated_inference_id}/accelerators``. + Optionally filter by slug and use page/per_page for pagination. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :keyword per_page: Number of items returned per page. Default value is 20. + :paramtype per_page: int + :keyword page: Which 'page' of paginated results to return. Default value is 1. + :paramtype page: int + :keyword slug: Filter accelerators by GPU slug. Default value is None. + :paramtype slug: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "meta": { + "total": 0 # Optional. Number of objects returned by the request. + }, + "accelerators": [ + { + "created_at": "2020-02-20 00:00:00", # Optional. + "id": "str", # Optional. Unique ID of the accelerator. + "name": "str", # Optional. Name of the accelerator. + "role": "str", # Optional. Role of the accelerator (e.g. + prefill_decode). + "slug": "str", # Optional. DigitalOcean GPU slug. + "status": "str" # Optional. Status of the accelerator. + } + ], + "links": { + "pages": {} + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. 
For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_list_accelerators_request( + dedicated_inference_id=dedicated_inference_id, + per_page=per_page, + page=page, + slug=slug, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + await self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200, 404]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 200: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", 
response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace_async + async def get_accelerator( + self, dedicated_inference_id: str, accelerator_id: str, **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """Get a Dedicated Inference Accelerator. + + Retrieve a single accelerator by ID for a Dedicated Inference instance. Send a + GET request to + ``/v2/dedicated-inferences/{dedicated_inference_id}/accelerators/{accelerator_id}``. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param accelerator_id: A unique identifier for a Dedicated Inference accelerator. Required. + :type accelerator_id: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "created_at": "2020-02-20 00:00:00", # Optional. + "id": "str", # Optional. Unique ID of the accelerator. + "name": "str", # Optional. Name of the accelerator. + "role": "str", # Optional. Role of the accelerator (e.g. prefill_decode). + "slug": "str", # Optional. 
DigitalOcean GPU slug. + "status": "str" # Optional. Status of the accelerator. + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_get_accelerator_request( + dedicated_inference_id=dedicated_inference_id, + accelerator_id=accelerator_id, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + await self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200, 404]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 200: + 
response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace_async + async def get_ca(self, dedicated_inference_id: str, **kwargs: Any) -> JSON: + # pylint: disable=line-too-long + """Get Dedicated Inference CA Certificate. + + Get the CA certificate for a Dedicated Inference instance (base64-encoded). + Required for private endpoint connectivity. Send a GET request to + ``/v2/dedicated-inferences/{dedicated_inference_id}/ca``. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "cert": "str" # Base64-encoded CA certificate. Required. + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. 
For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_get_ca_request( + dedicated_inference_id=dedicated_inference_id, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + await self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200, 404]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 200: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = 
self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace_async + async def list_tokens( + self, + dedicated_inference_id: str, + *, + per_page: int = 20, + page: int = 1, + **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """List Dedicated Inference Tokens. + + List all access tokens for a Dedicated Inference instance. Token values are + not returned; only id, name, and created_at. Send a GET request to + ``/v2/dedicated-inferences/{dedicated_inference_id}/tokens``. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :keyword per_page: Number of items returned per page. Default value is 20. + :paramtype per_page: int + :keyword page: Which 'page' of paginated results to return. Default value is 1. + :paramtype page: int + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "meta": { + "total": 0 # Optional. Number of objects returned by the request. + }, + "links": { + "pages": {} + }, + "tokens": [ + { + "created_at": "2020-02-20 00:00:00", # Optional. + "id": "str", # Optional. 
Unique ID of the token. + "name": "str", # Optional. Name of the token. + "value": "str" # Optional. Token value; only returned once + on create. Store securely. + } + ] + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_list_tokens_request( + dedicated_inference_id=dedicated_inference_id, + per_page=per_page, + page=page, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + await self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200, 404]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise 
HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 200: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @overload + async def create_tokens( + self, + dedicated_inference_id: str, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """Create a Dedicated Inference Token. + + Create a new access token for a Dedicated Inference instance. Send a POST + request to ``/v2/dedicated-inferences/{dedicated_inference_id}/tokens`` with a + ``name``. The token value is returned only once in the response; store it securely. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param body: Required. + :type body: JSON + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". 
+ :paramtype content_type: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "name": "str" # Name for the new token. Required. + } + + # response body for status code(s): 202 + response == { + "token": { + "created_at": "2020-02-20 00:00:00", # Optional. Access token for + authenticating to Dedicated Inference endpoints. + "id": "str", # Optional. Unique ID of the token. + "name": "str", # Optional. Name of the token. + "value": "str" # Optional. Token value; only returned once on + create. Store securely. + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + + @overload + async def create_tokens( + self, + dedicated_inference_id: str, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """Create a Dedicated Inference Token. + + Create a new access token for a Dedicated Inference instance. Send a POST + request to ``/v2/dedicated-inferences/{dedicated_inference_id}/tokens`` with a + ``name``. The token value is returned only once in the response; store it securely. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param body: Required. 
+ :type body: IO[bytes] + :keyword content_type: Body Parameter content-type. Content type parameter for binary body. + Default value is "application/json". + :paramtype content_type: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 202 + response == { + "token": { + "created_at": "2020-02-20 00:00:00", # Optional. Access token for + authenticating to Dedicated Inference endpoints. + "id": "str", # Optional. Unique ID of the token. + "name": "str", # Optional. Name of the token. + "value": "str" # Optional. Token value; only returned once on + create. Store securely. + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + + @distributed_trace_async + async def create_tokens( + self, dedicated_inference_id: str, body: Union[JSON, IO[bytes]], **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """Create a Dedicated Inference Token. + + Create a new access token for a Dedicated Inference instance. Send a POST + request to ``/v2/dedicated-inferences/{dedicated_inference_id}/tokens`` with a + ``name``. The token value is returned only once in the response; store it securely. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param body: Is either a JSON type or a IO[bytes] type. Required. 
+ :type body: JSON or IO[bytes] + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "name": "str" # Name for the new token. Required. + } + + # response body for status code(s): 202 + response == { + "token": { + "created_at": "2020-02-20 00:00:00", # Optional. Access token for + authenticating to Dedicated Inference endpoints. + "id": "str", # Optional. Unique ID of the token. + "name": "str", # Optional. Name of the token. + "value": "str" # Optional. Token value; only returned once on + create. Store securely. + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. 
+ } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + + content_type: Optional[str] = kwargs.pop( + "content_type", _headers.pop("Content-Type", None) + ) + cls: ClsType[JSON] = kwargs.pop("cls", None) + + content_type = content_type or "application/json" + _json = None + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _json = body + + _request = build_dedicated_inferences_create_tokens_request( + dedicated_inference_id=dedicated_inference_id, + content_type=content_type, + json=_json, + content=_content, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + await self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [202, 404]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 202: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + 
if response.content: + deserialized = response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace_async + async def delete_tokens( + self, dedicated_inference_id: str, token_id: str, **kwargs: Any + ) -> Optional[JSON]: + # pylint: disable=line-too-long + """Revoke a Dedicated Inference Token. + + Revoke (delete) an access token for a Dedicated Inference instance. Send a + DELETE request to ``/v2/dedicated-inferences/{dedicated_inference_id}/tokens/{token_id}``. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param token_id: A unique identifier for a Dedicated Inference access token. Required. + :type token_id: str + :return: JSON object or None + :rtype: JSON or None + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. 
Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[Optional[JSON]] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_delete_tokens_request( + dedicated_inference_id=dedicated_inference_id, + token_id=token_id, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + await self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [204, 404]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + deserialized = None + response_headers = {} + if response.status_code == 204: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + 
response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, deserialized, response_headers) # type: ignore + + return deserialized # type: ignore + + @distributed_trace_async + async def list_sizes(self, **kwargs: Any) -> JSON: + """List Dedicated Inference Sizes. + + Get available Dedicated Inference sizes and pricing for supported GPUs. Send a + GET request to ``/v2/dedicated-inferences/sizes``. + + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "enabled_regions": [ + "str" # Optional. Regions where Dedicated Inference is available. + ], + "sizes": [ + { + "currency": "str", # Optional. + "gpu_slug": "str", # Optional. + "price_per_hour": "str", # Optional. + "region": "str" # Optional. 
+ } + ] + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_list_sizes_request( + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + await self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace_async + async def get_gpu_model_config(self, **kwargs: Any) -> JSON: + """Get Dedicated Inference GPU Model Config. 
+ + Get supported GPU and model configurations for Dedicated Inference. Use this to + discover supported GPU slugs and model slugs (e.g. Hugging Face). Send a GET + request to ``/v2/dedicated-inferences/gpu-model-config``. + + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "gpu_model_configs": [ + { + "gpu_slugs": [ + "str" # Optional. + ], + "is_gated_model": bool, # Optional. Whether the model + requires gated access (e.g. Hugging Face token). + "model_name": "str", # Optional. + "model_slug": "str" # Optional. + } + ] + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_get_gpu_model_config_request( + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + await self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + 
response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + class DomainsOperations: """ .. warning:: diff --git a/src/pydo/operations/__init__.py b/src/pydo/operations/__init__.py index 4a74b7ce..4325fa0f 100644 --- a/src/pydo/operations/__init__.py +++ b/src/pydo/operations/__init__.py @@ -17,6 +17,7 @@ from ._operations import InvoicesOperations from ._operations import BillingInsightsOperations from ._operations import DatabasesOperations +from ._operations import DedicatedInferencesOperations from ._operations import DomainsOperations from ._operations import DropletsOperations from ._operations import DropletActionsOperations @@ -70,6 +71,7 @@ "InvoicesOperations", "BillingInsightsOperations", "DatabasesOperations", + "DedicatedInferencesOperations", "DomainsOperations", "DropletsOperations", "DropletActionsOperations", diff --git a/src/pydo/operations/_operations.py b/src/pydo/operations/_operations.py index f06ed50c..757cbda6 100644 --- a/src/pydo/operations/_operations.py +++ b/src/pydo/operations/_operations.py @@ -3581,6 +3581,342 @@ def build_databases_delete_opensearch_index_request( # pylint: disable=name-too return HttpRequest(method="DELETE", url=_url, headers=_headers, **kwargs) +def build_dedicated_inferences_get_request( + dedicated_inference_id: str, **kwargs: Any +) -> HttpRequest: + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = "/v2/dedicated-inferences/{dedicated_inference_id}" + path_format_arguments = { + 
"dedicated_inference_id": _SERIALIZER.url( + "dedicated_inference_id", dedicated_inference_id, "str" + ), + } + + _url: str = _url.format(**path_format_arguments) # type: ignore + + # Construct headers + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="GET", url=_url, headers=_headers, **kwargs) + + +def build_dedicated_inferences_patch_request( + dedicated_inference_id: str, **kwargs: Any +) -> HttpRequest: + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + + content_type: Optional[str] = kwargs.pop( + "content_type", _headers.pop("Content-Type", None) + ) + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = "/v2/dedicated-inferences/{dedicated_inference_id}" + path_format_arguments = { + "dedicated_inference_id": _SERIALIZER.url( + "dedicated_inference_id", dedicated_inference_id, "str" + ), + } + + _url: str = _url.format(**path_format_arguments) # type: ignore + + # Construct headers + if content_type is not None: + _headers["Content-Type"] = _SERIALIZER.header( + "content_type", content_type, "str" + ) + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="PATCH", url=_url, headers=_headers, **kwargs) + + +def build_dedicated_inferences_delete_request( # pylint: disable=name-too-long + dedicated_inference_id: str, **kwargs: Any +) -> HttpRequest: + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = "/v2/dedicated-inferences/{dedicated_inference_id}" + path_format_arguments = { + "dedicated_inference_id": _SERIALIZER.url( + "dedicated_inference_id", dedicated_inference_id, "str" + ), + } + + _url: str = _url.format(**path_format_arguments) # type: ignore + + # Construct headers + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="DELETE", url=_url, headers=_headers, **kwargs) + + 
def build_dedicated_inferences_list_request(
    *, per_page: int = 20, page: int = 1, region: Optional[str] = None, **kwargs: Any
) -> HttpRequest:
    """Build the GET request that lists Dedicated Inferences.

    :keyword per_page: Page size; serialized with bounds 1-200. Default 20.
    :keyword page: 1-based page number; serialized with minimum 1. Default 1.
    :keyword region: Optional region slug filter.
    :return: An ``HttpRequest`` targeting ``/v2/dedicated-inferences``.
    """
    _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})
    _params = case_insensitive_dict(kwargs.pop("params", {}) or {})

    accept = _headers.pop("Accept", "application/json")

    # Construct URL
    _url = "/v2/dedicated-inferences"

    # Construct parameters (bounds are enforced by the serializer).
    if per_page is not None:
        _params["per_page"] = _SERIALIZER.query(
            "per_page", per_page, "int", maximum=200, minimum=1
        )
    if page is not None:
        _params["page"] = _SERIALIZER.query("page", page, "int", minimum=1)
    if region is not None:
        _params["region"] = _SERIALIZER.query("region", region, "str")

    # Construct headers
    _headers["Accept"] = _SERIALIZER.header("accept", accept, "str")

    return HttpRequest(
        method="GET", url=_url, params=_params, headers=_headers, **kwargs
    )


def build_dedicated_inferences_create_request(  # pylint: disable=name-too-long
    **kwargs: Any,
) -> HttpRequest:
    """Build the POST request that creates a Dedicated Inference.

    NOTE(fix): the ``# pylint: disable=name-too-long`` comment was previously
    attached to the ``) -> HttpRequest:`` line; line-scoped pylint disables only
    apply to the line they sit on, and the message is reported on the ``def``
    line, so it is moved there to match every sibling builder.
    """
    _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})

    content_type: Optional[str] = kwargs.pop(
        "content_type", _headers.pop("Content-Type", None)
    )
    accept = _headers.pop("Accept", "application/json")

    # Construct URL
    _url = "/v2/dedicated-inferences"

    # Construct headers (Content-Type only when a body content type was given).
    if content_type is not None:
        _headers["Content-Type"] = _SERIALIZER.header(
            "content_type", content_type, "str"
        )
    _headers["Accept"] = _SERIALIZER.header("accept", accept, "str")

    return HttpRequest(method="POST", url=_url, headers=_headers, **kwargs)


def build_dedicated_inferences_list_accelerators_request(  # pylint: disable=name-too-long
    dedicated_inference_id: str,
    *,
    per_page: int = 20,
    page: int = 1,
    slug: Optional[str] = None,
    **kwargs: Any,
) -> HttpRequest:
    """Build the GET request that lists a Dedicated Inference's accelerators.

    :param dedicated_inference_id: Unique identifier of the Dedicated Inference
        instance; substituted into the URL path.
    :keyword per_page: Page size (1-200). Default 20.
    :keyword page: 1-based page number. Default 1.
    :keyword slug: Optional accelerator slug filter.
    """
    _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})
    _params = case_insensitive_dict(kwargs.pop("params", {}) or {})

    accept = _headers.pop("Accept", "application/json")

    # Construct URL
    _url = "/v2/dedicated-inferences/{dedicated_inference_id}/accelerators"
    path_format_arguments = {
        "dedicated_inference_id": _SERIALIZER.url(
            "dedicated_inference_id", dedicated_inference_id, "str"
        ),
    }

    _url: str = _url.format(**path_format_arguments)  # type: ignore

    # Construct parameters
    if per_page is not None:
        _params["per_page"] = _SERIALIZER.query(
            "per_page", per_page, "int", maximum=200, minimum=1
        )
    if page is not None:
        _params["page"] = _SERIALIZER.query("page", page, "int", minimum=1)
    if slug is not None:
        _params["slug"] = _SERIALIZER.query("slug", slug, "str")

    # Construct headers
    _headers["Accept"] = _SERIALIZER.header("accept", accept, "str")

    return HttpRequest(
        method="GET", url=_url, params=_params, headers=_headers, **kwargs
    )


def build_dedicated_inferences_get_accelerator_request(  # pylint: disable=name-too-long
    dedicated_inference_id: str, accelerator_id: str, **kwargs: Any
) -> HttpRequest:
    """Build the GET request for one accelerator of a Dedicated Inference.

    :param dedicated_inference_id: Unique identifier of the Dedicated Inference
        instance; substituted into the URL path.
    :param accelerator_id: Unique identifier of the accelerator; substituted
        into the URL path.
    """
    _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})

    accept = _headers.pop("Accept", "application/json")

    # Construct URL
    _url = "/v2/dedicated-inferences/{dedicated_inference_id}/accelerators/{accelerator_id}"
    path_format_arguments = {
        "dedicated_inference_id": _SERIALIZER.url(
            "dedicated_inference_id", dedicated_inference_id, "str"
        ),
        "accelerator_id": _SERIALIZER.url("accelerator_id", accelerator_id, "str"),
    }

    _url: str = _url.format(**path_format_arguments)  # type: ignore

    # Construct headers
    _headers["Accept"] = _SERIALIZER.header("accept", accept, "str")

    return HttpRequest(method="GET", url=_url, headers=_headers, **kwargs)


def build_dedicated_inferences_get_ca_request(  # pylint: disable=name-too-long
    dedicated_inference_id: str, **kwargs: Any
) -> HttpRequest:
    _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})

    accept = _headers.pop("Accept", "application/json")
+ + # Construct URL + _url = "/v2/dedicated-inferences/{dedicated_inference_id}/ca" + path_format_arguments = { + "dedicated_inference_id": _SERIALIZER.url( + "dedicated_inference_id", dedicated_inference_id, "str" + ), + } + + _url: str = _url.format(**path_format_arguments) # type: ignore + + # Construct headers + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="GET", url=_url, headers=_headers, **kwargs) + + +def build_dedicated_inferences_list_tokens_request( # pylint: disable=name-too-long + dedicated_inference_id: str, *, per_page: int = 20, page: int = 1, **kwargs: Any +) -> HttpRequest: + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = case_insensitive_dict(kwargs.pop("params", {}) or {}) + + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = "/v2/dedicated-inferences/{dedicated_inference_id}/tokens" + path_format_arguments = { + "dedicated_inference_id": _SERIALIZER.url( + "dedicated_inference_id", dedicated_inference_id, "str" + ), + } + + _url: str = _url.format(**path_format_arguments) # type: ignore + + # Construct parameters + if per_page is not None: + _params["per_page"] = _SERIALIZER.query( + "per_page", per_page, "int", maximum=200, minimum=1 + ) + if page is not None: + _params["page"] = _SERIALIZER.query("page", page, "int", minimum=1) + + # Construct headers + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest( + method="GET", url=_url, params=_params, headers=_headers, **kwargs + ) + + +def build_dedicated_inferences_create_tokens_request( # pylint: disable=name-too-long + dedicated_inference_id: str, **kwargs: Any +) -> HttpRequest: + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + + content_type: Optional[str] = kwargs.pop( + "content_type", _headers.pop("Content-Type", None) + ) + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = 
"/v2/dedicated-inferences/{dedicated_inference_id}/tokens" + path_format_arguments = { + "dedicated_inference_id": _SERIALIZER.url( + "dedicated_inference_id", dedicated_inference_id, "str" + ), + } + + _url: str = _url.format(**path_format_arguments) # type: ignore + + # Construct headers + if content_type is not None: + _headers["Content-Type"] = _SERIALIZER.header( + "content_type", content_type, "str" + ) + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="POST", url=_url, headers=_headers, **kwargs) + + +def build_dedicated_inferences_delete_tokens_request( # pylint: disable=name-too-long + dedicated_inference_id: str, token_id: str, **kwargs: Any +) -> HttpRequest: + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = "/v2/dedicated-inferences/{dedicated_inference_id}/tokens/{token_id}" + path_format_arguments = { + "dedicated_inference_id": _SERIALIZER.url( + "dedicated_inference_id", dedicated_inference_id, "str" + ), + "token_id": _SERIALIZER.url("token_id", token_id, "str"), + } + + _url: str = _url.format(**path_format_arguments) # type: ignore + + # Construct headers + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="DELETE", url=_url, headers=_headers, **kwargs) + + +def build_dedicated_inferences_list_sizes_request( + **kwargs: Any, +) -> HttpRequest: # pylint: disable=name-too-long + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = "/v2/dedicated-inferences/sizes" + + # Construct headers + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="GET", url=_url, headers=_headers, **kwargs) + + +def build_dedicated_inferences_get_gpu_model_config_request( # pylint: disable=name-too-long + **kwargs: Any, +) -> HttpRequest: + 
_headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = "/v2/dedicated-inferences/gpu-model-config" + + # Construct headers + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="GET", url=_url, headers=_headers, **kwargs) + + def build_domains_list_request( *, per_page: int = 20, page: int = 1, **kwargs: Any ) -> HttpRequest: @@ -135007,6 +135343,2838 @@ def delete_opensearch_index( return deserialized # type: ignore +class DedicatedInferencesOperations: + """ + .. warning:: + **DO NOT** instantiate this class directly. + + Instead, you should access the following operations through + :class:`~pydo.GeneratedClient`'s + :attr:`dedicated_inferences` attribute. + """ + + def __init__(self, *args, **kwargs): + input_args = list(args) + self._client = input_args.pop(0) if input_args else kwargs.pop("client") + self._config = input_args.pop(0) if input_args else kwargs.pop("config") + self._serialize = input_args.pop(0) if input_args else kwargs.pop("serializer") + self._deserialize = ( + input_args.pop(0) if input_args else kwargs.pop("deserializer") + ) + + @distributed_trace + def get(self, dedicated_inference_id: str, **kwargs: Any) -> JSON: + # pylint: disable=line-too-long + """Get a Dedicated Inference. + + Retrieve an existing Dedicated Inference by ID. Send a GET request to + ``/v2/dedicated-inferences/{dedicated_inference_id}``. The status in the response + is one of active, new, provisioning, updating, deleting, or error. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. 
When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. 
+ "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. 
Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_get_request( + dedicated_inference_id=dedicated_inference_id, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200, 404]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 200: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", 
response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @overload + def patch( + self, + dedicated_inference_id: str, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> JSON: + # pylint: disable=line-too-long + """Update a Dedicated Inference. + + Update an existing Dedicated Inference. Send a PATCH request to + ``/v2/dedicated-inferences/{dedicated_inference_id}`` with updated ``spec`` and/or + ``access_tokens``. Status will move to updating and return to active when done. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param body: Required. + :type body: JSON + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "access_tokens": { + "hugging_face_token": "str" # Optional. Hugging Face token required + for gated models. + }, + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public LLM + endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of accelerator + instances. Required. + "type": "str", # Accelerator type + (e.g. prefill_decode). 
Required. + "status": "str" # Optional. Current + state of the Accelerator. Known values are: "new", + "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to identify an + existing deployment when updating; empty means create new. + "model_provider": "str", # Optional. Model provider. + "hugging_face" + "model_slug": "str", # Optional. Model identifier + (e.g. Hugging Face slug). + "workload_config": {} # Optional. Workload-specific + configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be unique + within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated Inference + is hosted. Required. Known values are: "atl1", "nyc2", and "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated Inference. + Required. + } + } + } + + # response body for status code(s): 202 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". 
+ } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". 
+ "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + + @overload + def patch( + self, + dedicated_inference_id: str, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> JSON: + # pylint: disable=line-too-long + """Update a Dedicated Inference. + + Update an existing Dedicated Inference. Send a PATCH request to + ``/v2/dedicated-inferences/{dedicated_inference_id}`` with updated ``spec`` and/or + ``access_tokens``. Status will move to updating and return to active when done. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param body: Required. + :type body: IO[bytes] + :keyword content_type: Body Parameter content-type. Content type parameter for binary body. + Default value is "application/json". + :paramtype content_type: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. 
code-block:: python + + # response body for status code(s): 202 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. 
DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. 
+ "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + + @distributed_trace + def patch( + self, dedicated_inference_id: str, body: Union[JSON, IO[bytes]], **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """Update a Dedicated Inference. + + Update an existing Dedicated Inference. Send a PATCH request to + ``/v2/dedicated-inferences/{dedicated_inference_id}`` with updated ``spec`` and/or + ``access_tokens``. Status will move to updating and return to active when done. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param body: Is either a JSON type or a IO[bytes] type. Required. + :type body: JSON or IO[bytes] + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "access_tokens": { + "hugging_face_token": "str" # Optional. Hugging Face token required + for gated models. + }, + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public LLM + endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of accelerator + instances. Required. + "type": "str", # Accelerator type + (e.g. prefill_decode). Required. + "status": "str" # Optional. Current + state of the Accelerator. Known values are: "new", + "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to identify an + existing deployment when updating; empty means create new. + "model_provider": "str", # Optional. Model provider. 
+ "hugging_face" + "model_slug": "str", # Optional. Model identifier + (e.g. Hugging Face slug). + "workload_config": {} # Optional. Workload-specific + configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be unique + within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated Inference + is hosted. Required. Known values are: "atl1", "nyc2", and "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated Inference. + Required. + } + } + } + + # response body for status code(s): 202 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). 
+ "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. 
Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + + content_type: Optional[str] = kwargs.pop( + "content_type", _headers.pop("Content-Type", None) + ) + cls: ClsType[JSON] = kwargs.pop("cls", None) + + content_type = content_type or "application/json" + _json = None + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _json = body + + _request = build_dedicated_inferences_patch_request( + dedicated_inference_id=dedicated_inference_id, + content_type=content_type, + json=_json, + content=_content, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + 
self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [202, 404]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 202: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace + def delete(self, dedicated_inference_id: str, **kwargs: Any) -> Optional[JSON]: + # pylint: disable=line-too-long + """Delete a Dedicated Inference. + + Delete an existing Dedicated Inference. Send a DELETE request to + ``/v2/dedicated-inferences/{dedicated_inference_id}``. The response 202 Accepted + indicates the request was accepted for processing. 
+ + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :return: JSON object or None + :rtype: JSON or None + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[Optional[JSON]] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_delete_request( + dedicated_inference_id=dedicated_inference_id, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [202, 404]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, 
response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + deserialized = None + response_headers = {} + if response.status_code == 202: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, deserialized, response_headers) # type: ignore + + return deserialized # type: ignore + + @distributed_trace + def list( + self, + *, + per_page: int = 20, + page: int = 1, + region: Optional[str] = None, + **kwargs: Any, + ) -> JSON: + # pylint: disable=line-too-long + """List Dedicated Inferences. + + List all Dedicated Inference instances for your team. Send a GET request to + ``/v2/dedicated-inferences``. You may filter by region and use page and per_page + for pagination. + + :keyword per_page: Number of items returned per page. Default value is 20. + :paramtype per_page: int + :keyword page: Which 'page' of paginated results to return. Default value is 1. + :paramtype page: int + :keyword region: Filter by region. Dedicated Inference is only available in nyc2, tor1, and + atl1. Known values are: "nyc2", "tor1", and "atl1". Default value is None. 
+ :paramtype region: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "dedicated_inferences": [ + { + "created_at": "2020-02-20 00:00:00", # Optional. When the + Dedicated Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private + VPC FQDN of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public + FQDN of the Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated + Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. + Pending deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether + to expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": + "str", # DigitalOcean GPU slug. Required. + "scale": 0, # Number + of accelerator instances. Required. + "type": "str", # + Accelerator type (e.g. prefill_decode). Required. + "status": "str" # + Optional. Current state of the Accelerator. Known + values are: "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used + to identify an existing deployment when updating; empty means + create new. + "model_provider": "str", # Optional. + Model provider. "hugging_face" + "model_slug": "str", # Optional. + Model identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated + Inference. Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. + Pending deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. 
+ "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the + Dedicated Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose + a public LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": + "str", # DigitalOcean GPU slug. Required. + "scale": 0, # Number + of accelerator instances. Required. + "type": "str", # + Accelerator type (e.g. prefill_decode). Required. + "status": "str" # + Optional. Current state of the Accelerator. Known + values are: "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used + to identify an existing deployment when updating; empty means + create new. + "model_provider": "str", # Optional. + Model provider. "hugging_face" + "model_slug": "str", # Optional. + Model identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. + Must be unique within the team. Required. + "region": "str", # DigitalOcean region where the + Dedicated Inference is hosted. Required. Known values are: "atl1", + "nyc2", and "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the + Dedicated Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated + Inference. + } + ], + "links": { + "pages": { + "str": "str" # Optional. Pagination links (first, prev, + next, last). + } + }, + "meta": { + "total": 0 # Total number of results. Required. 
+ } + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_list_request( + per_page=per_page, + page=page, + region=region, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @overload + def create( + self, body: JSON, *, content_type: str = "application/json", **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """Create a 
Dedicated Inference. + + Create a new Dedicated Inference for your team. Send a POST request to + ``/v2/dedicated-inferences`` with a ``spec`` object (version, name, region, vpc, + enable_public_endpoint, model_deployments) and optional ``access_tokens`` (e.g. + hugging_face_token for gated models). The response code 202 Accepted indicates + the request was accepted for processing; it does not indicate success or failure. + The token value is returned only on create; store it securely. + + :param body: Required. + :type body: JSON + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public LLM + endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of accelerator + instances. Required. + "type": "str", # Accelerator type + (e.g. prefill_decode). Required. + "status": "str" # Optional. Current + state of the Accelerator. Known values are: "new", + "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to identify an + existing deployment when updating; empty means create new. + "model_provider": "str", # Optional. Model provider. + "hugging_face" + "model_slug": "str", # Optional. Model identifier + (e.g. Hugging Face slug). + "workload_config": {} # Optional. Workload-specific + configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be unique + within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated Inference + is hosted. Required. Known values are: "atl1", "nyc2", and "tor1". 
+ "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated Inference. + Required. + } + }, + "access_tokens": { + "str": "str" # Optional. Key-value pairs for provider tokens (e.g. + Hugging Face). + } + } + + # response body for status code(s): 202 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. 
Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + }, + "token": { + "created_at": "2020-02-20 00:00:00", # Optional. Access token for + authenticating to Dedicated Inference endpoints. 
+ "id": "str", # Optional. Unique ID of the token. + "name": "str", # Optional. Name of the token. + "value": "str" # Optional. Token value; only returned once on + create. Store securely. + } + } + """ + + @overload + def create( + self, body: IO[bytes], *, content_type: str = "application/json", **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """Create a Dedicated Inference. + + Create a new Dedicated Inference for your team. Send a POST request to + ``/v2/dedicated-inferences`` with a ``spec`` object (version, name, region, vpc, + enable_public_endpoint, model_deployments) and optional ``access_tokens`` (e.g. + hugging_face_token for gated models). The response code 202 Accepted indicates + the request was accepted for processing; it does not indicate success or failure. + The token value is returned only on create; store it securely. + + :param body: Required. + :type body: IO[bytes] + :keyword content_type: Body Parameter content-type. Content type parameter for binary body. + Default value is "application/json". + :paramtype content_type: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 202 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. + }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. 
+ "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. 
Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + }, + "token": { + "created_at": "2020-02-20 00:00:00", # Optional. Access token for + authenticating to Dedicated Inference endpoints. + "id": "str", # Optional. Unique ID of the token. + "name": "str", # Optional. Name of the token. + "value": "str" # Optional. Token value; only returned once on + create. Store securely. + } + } + """ + + @distributed_trace + def create(self, body: Union[JSON, IO[bytes]], **kwargs: Any) -> JSON: + # pylint: disable=line-too-long + """Create a Dedicated Inference. + + Create a new Dedicated Inference for your team. Send a POST request to + ``/v2/dedicated-inferences`` with a ``spec`` object (version, name, region, vpc, + enable_public_endpoint, model_deployments) and optional ``access_tokens`` (e.g. + hugging_face_token for gated models). The response code 202 Accepted indicates + the request was accepted for processing; it does not indicate success or failure. + The token value is returned only on create; store it securely. + + :param body: Is either a JSON type or a IO[bytes] type. Required. 
+ :type body: JSON or IO[bytes] + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public LLM + endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of accelerator + instances. Required. + "type": "str", # Accelerator type + (e.g. prefill_decode). Required. + "status": "str" # Optional. Current + state of the Accelerator. Known values are: "new", + "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to identify an + existing deployment when updating; empty means create new. + "model_provider": "str", # Optional. Model provider. + "hugging_face" + "model_slug": "str", # Optional. Model identifier + (e.g. Hugging Face slug). + "workload_config": {} # Optional. Workload-specific + configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be unique + within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated Inference + is hosted. Required. Known values are: "atl1", "nyc2", and "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated Inference. + Required. + } + }, + "access_tokens": { + "str": "str" # Optional. Key-value pairs for provider tokens (e.g. + Hugging Face). + } + } + + # response body for status code(s): 202 + response == { + "dedicated_inference": { + "created_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was created. + "endpoints": { + "private_endpoint_fqdn": "str", # Optional. Private VPC FQDN + of the Dedicated Inference instance. + "public_endpoint_fqdn": "str" # Optional. Public FQDN of the + Dedicated Inference instance. 
+ }, + "id": "str", # Optional. Unique ID of the Dedicated Inference. + "pending_deployment_spec": { + "created_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "enable_public_endpoint": bool, # Optional. Whether to + expose a public LLM endpoint. + "id": "str", # Optional. Deployment UUID. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. + Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Optional. Name of the Dedicated Inference. + Must be unique within the team. + "status": "str", # Optional. Known values are: + "provisioning" and "updating". + "updated_at": "2020-02-20 00:00:00", # Optional. Pending + deployment when status is provisioning or updating. + "version": 0, # Optional. Spec version. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "region": "str", # Optional. DigitalOcean region where the Dedicated + Inference is hosted. + "spec": { + "enable_public_endpoint": bool, # Whether to expose a public + LLM endpoint. Required. + "model_deployments": [ + { + "accelerators": [ + { + "accelerator_slug": "str", # + DigitalOcean GPU slug. Required. + "scale": 0, # Number of + accelerator instances. Required. + "type": "str", # Accelerator + type (e.g. prefill_decode). Required. + "status": "str" # Optional. 
+ Current state of the Accelerator. Known values are: + "new", "provisioning", and "active". + } + ], + "model_id": "str", # Optional. Used to + identify an existing deployment when updating; empty means create + new. + "model_provider": "str", # Optional. Model + provider. "hugging_face" + "model_slug": "str", # Optional. Model + identifier (e.g. Hugging Face slug). + "workload_config": {} # Optional. + Workload-specific configuration (e.g. ISL/OSL in future). + } + ], + "name": "str", # Name of the Dedicated Inference. Must be + unique within the team. Required. + "region": "str", # DigitalOcean region where the Dedicated + Inference is hosted. Required. Known values are: "atl1", "nyc2", and + "tor1". + "version": 0, # Spec version. Required. + "vpc": { + "uuid": "str" # VPC UUID for the Dedicated + Inference. Required. + } + }, + "status": "str", # Optional. Current state of the Dedicated + Inference. Known values are: "active", "new", "provisioning", "updating", + "deleting", and "error". + "updated_at": "2020-02-20 00:00:00", # Optional. When the Dedicated + Inference was last updated. + "vpc_uuid": "str" # Optional. VPC UUID of the Dedicated Inference. + }, + "token": { + "created_at": "2020-02-20 00:00:00", # Optional. Access token for + authenticating to Dedicated Inference endpoints. + "id": "str", # Optional. Unique ID of the token. + "name": "str", # Optional. Name of the token. + "value": "str" # Optional. Token value; only returned once on + create. Store securely. 
+ } + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + + content_type: Optional[str] = kwargs.pop( + "content_type", _headers.pop("Content-Type", None) + ) + cls: ClsType[JSON] = kwargs.pop("cls", None) + + content_type = content_type or "application/json" + _json = None + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _json = body + + _request = build_dedicated_inferences_create_request( + content_type=content_type, + json=_json, + content=_content, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [202]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + 
return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace + def list_accelerators( + self, + dedicated_inference_id: str, + *, + per_page: int = 20, + page: int = 1, + slug: Optional[str] = None, + **kwargs: Any, + ) -> JSON: + # pylint: disable=line-too-long + """List Dedicated Inference Accelerators. + + List all accelerators (GPUs) in use by a Dedicated Inference instance. Send a + GET request to ``/v2/dedicated-inferences/{dedicated_inference_id}/accelerators``. + Optionally filter by slug and use page/per_page for pagination. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :keyword per_page: Number of items returned per page. Default value is 20. + :paramtype per_page: int + :keyword page: Which 'page' of paginated results to return. Default value is 1. + :paramtype page: int + :keyword slug: Filter accelerators by GPU slug. Default value is None. + :paramtype slug: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "meta": { + "total": 0 # Optional. Number of objects returned by the request. + }, + "accelerators": [ + { + "created_at": "2020-02-20 00:00:00", # Optional. + "id": "str", # Optional. Unique ID of the accelerator. + "name": "str", # Optional. Name of the accelerator. + "role": "str", # Optional. Role of the accelerator (e.g. + prefill_decode). + "slug": "str", # Optional. DigitalOcean GPU slug. + "status": "str" # Optional. Status of the accelerator. + } + ], + "links": { + "pages": {} + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. 
For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_list_accelerators_request( + dedicated_inference_id=dedicated_inference_id, + per_page=per_page, + page=page, + slug=slug, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200, 404]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 200: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) 
+ response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace + def get_accelerator( + self, dedicated_inference_id: str, accelerator_id: str, **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """Get a Dedicated Inference Accelerator. + + Retrieve a single accelerator by ID for a Dedicated Inference instance. Send a + GET request to + ``/v2/dedicated-inferences/{dedicated_inference_id}/accelerators/{accelerator_id}``. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param accelerator_id: A unique identifier for a Dedicated Inference accelerator. Required. + :type accelerator_id: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "created_at": "2020-02-20 00:00:00", # Optional. + "id": "str", # Optional. Unique ID of the accelerator. + "name": "str", # Optional. Name of the accelerator. + "role": "str", # Optional. Role of the accelerator (e.g. prefill_decode). + "slug": "str", # Optional. DigitalOcean GPU slug. + "status": "str" # Optional. 
Status of the accelerator. + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_get_accelerator_request( + dedicated_inference_id=dedicated_inference_id, + accelerator_id=accelerator_id, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200, 404]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 200: + response_headers["ratelimit-limit"] = self._deserialize( + "int", 
response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace + def get_ca(self, dedicated_inference_id: str, **kwargs: Any) -> JSON: + # pylint: disable=line-too-long + """Get Dedicated Inference CA Certificate. + + Get the CA certificate for a Dedicated Inference instance (base64-encoded). + Required for private endpoint connectivity. Send a GET request to + ``/v2/dedicated-inferences/{dedicated_inference_id}/ca``. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "cert": "str" # Base64-encoded CA certificate. Required. + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. 
+ "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_get_ca_request( + dedicated_inference_id=dedicated_inference_id, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200, 404]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 200: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = 
response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace + def list_tokens( + self, + dedicated_inference_id: str, + *, + per_page: int = 20, + page: int = 1, + **kwargs: Any, + ) -> JSON: + # pylint: disable=line-too-long + """List Dedicated Inference Tokens. + + List all access tokens for a Dedicated Inference instance. Token values are + not returned; only id, name, and created_at. Send a GET request to + ``/v2/dedicated-inferences/{dedicated_inference_id}/tokens``. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :keyword per_page: Number of items returned per page. Default value is 20. + :paramtype per_page: int + :keyword page: Which 'page' of paginated results to return. Default value is 1. + :paramtype page: int + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "meta": { + "total": 0 # Optional. Number of objects returned by the request. + }, + "links": { + "pages": {} + }, + "tokens": [ + { + "created_at": "2020-02-20 00:00:00", # Optional. + "id": "str", # Optional. Unique ID of the token. + "name": "str", # Optional. Name of the token. + "value": "str" # Optional. 
Token value; only returned once + on create. Store securely. + } + ] + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_list_tokens_request( + dedicated_inference_id=dedicated_inference_id, + per_page=per_page, + page=page, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200, 404]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 200: + 
response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @overload + def create_tokens( + self, + dedicated_inference_id: str, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> JSON: + # pylint: disable=line-too-long + """Create a Dedicated Inference Token. + + Create a new access token for a Dedicated Inference instance. Send a POST + request to ``/v2/dedicated-inferences/{dedicated_inference_id}/tokens`` with a + ``name``. The token value is returned only once in the response; store it securely. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param body: Required. + :type body: JSON + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. 
code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "name": "str" # Name for the new token. Required. + } + + # response body for status code(s): 202 + response == { + "token": { + "created_at": "2020-02-20 00:00:00", # Optional. Access token for + authenticating to Dedicated Inference endpoints. + "id": "str", # Optional. Unique ID of the token. + "name": "str", # Optional. Name of the token. + "value": "str" # Optional. Token value; only returned once on + create. Store securely. + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + + @overload + def create_tokens( + self, + dedicated_inference_id: str, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> JSON: + # pylint: disable=line-too-long + """Create a Dedicated Inference Token. + + Create a new access token for a Dedicated Inference instance. Send a POST + request to ``/v2/dedicated-inferences/{dedicated_inference_id}/tokens`` with a + ``name``. The token value is returned only once in the response; store it securely. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param body: Required. + :type body: IO[bytes] + :keyword content_type: Body Parameter content-type. Content type parameter for binary body. + Default value is "application/json". 
+ :paramtype content_type: str + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 202 + response == { + "token": { + "created_at": "2020-02-20 00:00:00", # Optional. Access token for + authenticating to Dedicated Inference endpoints. + "id": "str", # Optional. Unique ID of the token. + "name": "str", # Optional. Name of the token. + "value": "str" # Optional. Token value; only returned once on + create. Store securely. + } + } + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + + @distributed_trace + def create_tokens( + self, dedicated_inference_id: str, body: Union[JSON, IO[bytes]], **kwargs: Any + ) -> JSON: + # pylint: disable=line-too-long + """Create a Dedicated Inference Token. + + Create a new access token for a Dedicated Inference instance. Send a POST + request to ``/v2/dedicated-inferences/{dedicated_inference_id}/tokens`` with a + ``name``. The token value is returned only once in the response; store it securely. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param body: Is either a JSON type or a IO[bytes] type. Required. + :type body: JSON or IO[bytes] + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. 
code-block:: python
+
+                # JSON input template you can fill out and use as your body input.
+                body = {
+                    "name": "str"  # Name for the new token. Required.
+                }
+
+                # response body for status code(s): 202
+                response == {
+                    "token": {
+                        "created_at": "2020-02-20 00:00:00",  # Optional. When the token
+                          was created.
+                        "id": "str",  # Optional. Unique ID of the token.
+                        "name": "str",  # Optional. Name of the token.
+                        "value": "str"  # Optional. Token value; only returned once on
+                          create. Store securely.
+                    }
+                }
+                # response body for status code(s): 404
+                response == {
+                    "id": "str",  # A short identifier corresponding to the HTTP status code
+                      returned. For example, the ID for a response returning a 404 status code would
+                      be "not_found.". Required.
+                    "message": "str",  # A message providing additional information about the
+                      error, including details to help resolve it when possible. Required.
+                    "request_id": "str"  # Optional. Optionally, some endpoints may include a
+                      request ID that should be provided when reporting bugs or opening support
+                      tickets to help identify the issue.
+ } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + + content_type: Optional[str] = kwargs.pop( + "content_type", _headers.pop("Content-Type", None) + ) + cls: ClsType[JSON] = kwargs.pop("cls", None) + + content_type = content_type or "application/json" + _json = None + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _json = body + + _request = build_dedicated_inferences_create_tokens_request( + dedicated_inference_id=dedicated_inference_id, + content_type=content_type, + json=_json, + content=_content, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [202, 404]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + if response.status_code == 202: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if 
response.content: + deserialized = response.json() + else: + deserialized = None + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace + def delete_tokens( + self, dedicated_inference_id: str, token_id: str, **kwargs: Any + ) -> Optional[JSON]: + # pylint: disable=line-too-long + """Revoke a Dedicated Inference Token. + + Revoke (delete) an access token for a Dedicated Inference instance. Send a + DELETE request to ``/v2/dedicated-inferences/{dedicated_inference_id}/tokens/{token_id}``. + + :param dedicated_inference_id: A unique identifier for a Dedicated Inference instance. + Required. + :type dedicated_inference_id: str + :param token_id: A unique identifier for a Dedicated Inference access token. Required. + :type token_id: str + :return: JSON object or None + :rtype: JSON or None + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 404 + response == { + "id": "str", # A short identifier corresponding to the HTTP status code + returned. For example, the ID for a response returning a 404 status code would + be "not_found.". Required. + "message": "str", # A message providing additional information about the + error, including details to help resolve it when possible. Required. + "request_id": "str" # Optional. 
Optionally, some endpoints may include a + request ID that should be provided when reporting bugs or opening support + tickets to help identify the issue. + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[Optional[JSON]] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_delete_tokens_request( + dedicated_inference_id=dedicated_inference_id, + token_id=token_id, + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [204, 404]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + deserialized = None + response_headers = {} + if response.status_code == 204: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.status_code == 404: + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + 
response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, deserialized, response_headers) # type: ignore + + return deserialized # type: ignore + + @distributed_trace + def list_sizes(self, **kwargs: Any) -> JSON: + """List Dedicated Inference Sizes. + + Get available Dedicated Inference sizes and pricing for supported GPUs. Send a + GET request to ``/v2/dedicated-inferences/sizes``. + + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "enabled_regions": [ + "str" # Optional. Regions where Dedicated Inference is available. + ], + "sizes": [ + { + "currency": "str", # Optional. + "gpu_slug": "str", # Optional. + "price_per_hour": "str", # Optional. + "region": "str" # Optional. 
+ } + ] + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_list_sizes_request( + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + @distributed_trace + def get_gpu_model_config(self, **kwargs: Any) -> JSON: + """Get Dedicated Inference GPU Model Config. + + Get supported GPU and model configurations for Dedicated Inference. 
Use this to + discover supported GPU slugs and model slugs (e.g. Hugging Face). Send a GET + request to ``/v2/dedicated-inferences/gpu-model-config``. + + :return: JSON object + :rtype: JSON + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "gpu_model_configs": [ + { + "gpu_slugs": [ + "str" # Optional. + ], + "is_gated_model": bool, # Optional. Whether the model + requires gated access (e.g. Hugging Face token). + "model_name": "str", # Optional. + "model_slug": "str" # Optional. + } + ] + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + 401: cast( + Type[HttpResponseError], + lambda response: ClientAuthenticationError(response=response), + ), + 429: HttpResponseError, + 500: HttpResponseError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[JSON] = kwargs.pop("cls", None) + + _request = build_dedicated_inferences_get_gpu_model_config_request( + headers=_headers, + params=_params, + ) + _request.url = self._client.format_url(_request.url) + + _stream = False + pipeline_response: PipelineResponse = ( + self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) # type: ignore + raise HttpResponseError(response=response) + + response_headers = {} + response_headers["ratelimit-limit"] = self._deserialize( + "int", response.headers.get("ratelimit-limit") + ) + response_headers["ratelimit-remaining"] = self._deserialize( + "int", 
response.headers.get("ratelimit-remaining") + ) + response_headers["ratelimit-reset"] = self._deserialize( + "int", response.headers.get("ratelimit-reset") + ) + + if response.content: + deserialized = response.json() + else: + deserialized = None + + if cls: + return cls(pipeline_response, cast(JSON, deserialized), response_headers) # type: ignore + + return cast(JSON, deserialized) # type: ignore + + class DomainsOperations: """ .. warning::