Skip to content

Commit 08aad9f

Browse files
committed
Switch to nebius sdk 0.3
1 parent bd9fe78 commit 08aad9f

3 files changed

Lines changed: 27 additions & 25 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -189,8 +189,7 @@ oci = [
189189
"dstack[server]",
190190
]
191191
nebius = [
192-
# 0.2.73 breaks sdk backward compatibility: https://github.com/dstackai/dstack/issues/3171
193-
"nebius>=0.2.40,<=0.2.72; python_version >= '3.10'",
192+
"nebius>=0.3.0,<0.4; python_version >= '3.10'",
194193
"dstack[server]",
195194
]
196195
all = [

src/dstack/_internal/core/backends/nebius/compute.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -379,9 +379,7 @@ def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
379379
op.status(),
380380
)
381381
time.sleep(WAIT_FOR_INSTANCE_UPDATE_INTERVAL)
382-
resources.LOOP.await_(
383-
op.update(per_retry_timeout=resources.REQUEST_TIMEOUT, metadata=resources.REQUEST_MD)
384-
)
382+
resources.LOOP.await_(op.update(per_retry_timeout=resources.REQUEST_TIMEOUT))
385383

386384

387385
def _supported_instances(offer: InstanceOffer) -> bool:

src/dstack/_internal/core/backends/nebius/resources.py

Lines changed: 25 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,6 @@
88
from tempfile import NamedTemporaryFile
99
from typing import Dict, Optional
1010

11-
from nebius.aio.authorization.options import options_to_metadata
1211
from nebius.aio.operation import Operation as SDKOperation
1312
from nebius.aio.service_error import RequestError, StatusCode
1413
from nebius.aio.token.renewable import OPTION_RENEW_REQUEST_TIMEOUT, OPTION_RENEW_SYNCHRONOUS
@@ -66,13 +65,11 @@
6665
LOOP = DaemonEventLoop()
6766
# Pass a timeout to all methods to avoid infinite waiting
6867
REQUEST_TIMEOUT = 10
69-
# Pass REQUEST_MD to all methods to avoid infinite retries in case of invalid credentials
70-
REQUEST_MD = options_to_metadata(
71-
{
72-
OPTION_RENEW_SYNCHRONOUS: "true",
73-
OPTION_RENEW_REQUEST_TIMEOUT: "5",
74-
}
75-
)
68+
# Pass REQUEST_AUTH_OPTIONS to all methods to avoid infinite retries in case of invalid credentials
69+
REQUEST_AUTH_OPTIONS = {
70+
OPTION_RENEW_SYNCHRONOUS: "true",
71+
OPTION_RENEW_REQUEST_TIMEOUT: "5",
72+
}
7673

7774
# disables log messages about errors such as invalid creds or expired timeouts
7875
logging.getLogger("nebius").setLevel(logging.CRITICAL)
@@ -120,7 +117,7 @@ def wait_for_operation(
120117
if time.monotonic() + interval > deadline:
121118
raise TimeoutError(f"Operation {op.id} wait timeout")
122119
time.sleep(interval)
123-
LOOP.await_(op.update(per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
120+
LOOP.await_(op.update(per_retry_timeout=REQUEST_TIMEOUT))
124121

125122

126123
def get_region_to_project_id_map(
@@ -156,7 +153,9 @@ def validate_regions(configured: set[str], available: set[str]) -> None:
156153
def list_tenant_projects(sdk: SDK) -> Sequence[Container]:
157154
tenants = LOOP.await_(
158155
TenantServiceClient(sdk).list(
159-
ListTenantsRequest(), per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
156+
ListTenantsRequest(),
157+
per_retry_timeout=REQUEST_TIMEOUT,
158+
auth_options=REQUEST_AUTH_OPTIONS,
160159
)
161160
)
162161
if len(tenants.items) != 1:
@@ -166,7 +165,7 @@ def list_tenant_projects(sdk: SDK) -> Sequence[Container]:
166165
ProjectServiceClient(sdk).list(
167166
ListProjectsRequest(parent_id=tenant_id, page_size=999),
168167
per_retry_timeout=REQUEST_TIMEOUT,
169-
metadata=REQUEST_MD,
168+
auth_options=REQUEST_AUTH_OPTIONS,
170169
)
171170
)
172171
return projects.items
@@ -240,7 +239,7 @@ def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
240239
SubnetServiceClient(sdk).list(
241240
ListSubnetsRequest(parent_id=project_id, page_size=999),
242241
per_retry_timeout=REQUEST_TIMEOUT,
243-
metadata=REQUEST_MD,
242+
auth_options=REQUEST_AUTH_OPTIONS,
244243
)
245244
)
246245
for subnet in subnets.items:
@@ -267,14 +266,18 @@ def create_disk(
267266
)
268267
with wrap_capacity_errors():
269268
return LOOP.await_(
270-
client.create(request, per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD)
269+
client.create(
270+
request, per_retry_timeout=REQUEST_TIMEOUT, auth_options=REQUEST_AUTH_OPTIONS
271+
)
271272
)
272273

273274

274275
def delete_disk(sdk: SDK, disk_id: str) -> None:
275276
LOOP.await_(
276277
DiskServiceClient(sdk).delete(
277-
DeleteDiskRequest(id=disk_id), per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
278+
DeleteDiskRequest(id=disk_id),
279+
per_retry_timeout=REQUEST_TIMEOUT,
280+
auth_options=REQUEST_AUTH_OPTIONS,
278281
)
279282
)
280283

@@ -325,7 +328,9 @@ def create_instance(
325328
)
326329
with wrap_capacity_errors():
327330
return LOOP.await_(
328-
client.create(request, per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD)
331+
client.create(
332+
request, per_retry_timeout=REQUEST_TIMEOUT, auth_options=REQUEST_AUTH_OPTIONS
333+
)
329334
)
330335

331336

@@ -334,7 +339,7 @@ def get_instance(sdk: SDK, instance_id: str) -> Instance:
334339
InstanceServiceClient(sdk).get(
335340
GetInstanceRequest(id=instance_id),
336341
per_retry_timeout=REQUEST_TIMEOUT,
337-
metadata=REQUEST_MD,
342+
auth_options=REQUEST_AUTH_OPTIONS,
338343
)
339344
)
340345

@@ -344,7 +349,7 @@ def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]:
344349
InstanceServiceClient(sdk).delete(
345350
DeleteInstanceRequest(id=instance_id),
346351
per_retry_timeout=REQUEST_TIMEOUT,
347-
metadata=REQUEST_MD,
352+
auth_options=REQUEST_AUTH_OPTIONS,
348353
)
349354
)
350355

@@ -358,17 +363,17 @@ def create_cluster(sdk: SDK, name: str, project_id: str, fabric: str) -> SDKOper
358363
spec=GpuClusterSpec(infiniband_fabric=fabric),
359364
),
360365
per_retry_timeout=REQUEST_TIMEOUT,
361-
metadata=REQUEST_MD,
366+
auth_options=REQUEST_AUTH_OPTIONS,
362367
)
363368
)
364369

365370

366371
def delete_cluster(sdk: SDK, cluster_id: str) -> None:
367-
return LOOP.await_(
372+
LOOP.await_(
368373
GpuClusterServiceClient(sdk).delete(
369374
DeleteGpuClusterRequest(id=cluster_id),
370375
per_retry_timeout=REQUEST_TIMEOUT,
371-
metadata=REQUEST_MD,
376+
auth_options=REQUEST_AUTH_OPTIONS,
372377
)
373378
)
374379

0 commit comments

Comments
 (0)