Skip to content

Commit e343a90

Browse files
committed
Drop hardcoded Nebius InfiniBand fabrics
Use the fabrics from gpuhunt offers instead. The list of fabrics was moved from dstack to gpuhunt so that new fabrics can be added without a dstack release.
1 parent 1c56561 commit e343a90

6 files changed

Lines changed: 101 additions & 89 deletions

File tree

src/dstack/_internal/core/backends/nebius/compute.py

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,11 @@
2828
get_offers_disk_modifier,
2929
)
3030
from dstack._internal.core.backends.nebius import resources
31-
from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
32-
from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
31+
from dstack._internal.core.backends.nebius.models import (
32+
NebiusConfig,
33+
NebiusOfferBackendData,
34+
NebiusServiceAccountCreds,
35+
)
3336
from dstack._internal.core.errors import (
3437
BackendError,
3538
NotYetTerminated,
@@ -281,23 +284,30 @@ def create_placement_group(
281284
master_instance_offer: InstanceOffer,
282285
) -> PlacementGroupProvisioningData:
283286
assert placement_group.configuration.placement_strategy == PlacementStrategy.CLUSTER
284-
backend_data = NebiusPlacementGroupBackendData(cluster=None)
287+
master_instance_offer_backend_data: NebiusOfferBackendData = (
288+
NebiusOfferBackendData.__response__.parse_obj(master_instance_offer.backend_data)
289+
)
290+
fabrics = list(master_instance_offer_backend_data.fabrics)
291+
if self.config.fabrics is not None:
292+
fabrics = [f for f in fabrics if f in self.config.fabrics]
293+
placement_group_backend_data = NebiusPlacementGroupBackendData(cluster=None)
285294
# Only create a Nebius cluster if the instance supports it.
286295
# For other instances, return dummy PlacementGroupProvisioningData.
287-
if fabrics := get_suitable_infiniband_fabrics(
288-
master_instance_offer, allowed_fabrics=self.config.fabrics
289-
):
296+
if fabrics:
290297
fabric = random.choice(fabrics)
291298
op = resources.create_cluster(
292299
self._sdk,
293300
name=placement_group.name,
294301
project_id=self._region_to_project_id[placement_group.configuration.region],
295302
fabric=fabric,
296303
)
297-
backend_data.cluster = NebiusClusterBackendData(id=op.resource_id, fabric=fabric)
304+
placement_group_backend_data.cluster = NebiusClusterBackendData(
305+
id=op.resource_id,
306+
fabric=fabric,
307+
)
298308
return PlacementGroupProvisioningData(
299309
backend=BackendType.NEBIUS,
300-
backend_data=backend_data.json(),
310+
backend_data=placement_group_backend_data.json(),
301311
)
302312

303313
def delete_placement_group(self, placement_group: PlacementGroup) -> None:
@@ -317,16 +327,15 @@ def is_suitable_placement_group(
317327
if placement_group.configuration.region != instance_offer.region:
318328
return False
319329
assert placement_group.provisioning_data is not None
320-
backend_data = NebiusPlacementGroupBackendData.load(
330+
placement_group_backend_data = NebiusPlacementGroupBackendData.load(
321331
placement_group.provisioning_data.backend_data
322332
)
333+
instance_offer_backend_data: NebiusOfferBackendData = (
334+
NebiusOfferBackendData.__response__.parse_obj(instance_offer.backend_data)
335+
)
323336
return (
324-
backend_data.cluster is None
325-
or backend_data.cluster.fabric
326-
in get_suitable_infiniband_fabrics(
327-
instance_offer,
328-
allowed_fabrics=None, # enforced at cluster creation time, no need to enforce here
329-
)
337+
placement_group_backend_data.cluster is None
338+
or placement_group_backend_data.cluster.fabric in instance_offer_backend_data.fabrics
330339
)
331340

332341

src/dstack/_internal/core/backends/nebius/configurator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
)
1111
from dstack._internal.core.backends.nebius import resources
1212
from dstack._internal.core.backends.nebius.backend import NebiusBackend
13-
from dstack._internal.core.backends.nebius.fabrics import get_all_infiniband_fabrics
1413
from dstack._internal.core.backends.nebius.models import (
1514
NebiusBackendConfig,
1615
NebiusBackendConfigWithCreds,
@@ -19,6 +18,7 @@
1918
NebiusServiceAccountCreds,
2019
NebiusStoredConfig,
2120
)
21+
from dstack._internal.core.backends.nebius.resources import get_all_infiniband_fabrics
2222
from dstack._internal.core.errors import BackendError, ServerClientError
2323
from dstack._internal.core.models.backends.base import BackendType
2424

src/dstack/_internal/core/backends/nebius/fabrics.py

Lines changed: 0 additions & 49 deletions
This file was deleted.

src/dstack/_internal/core/backends/nebius/models.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,3 +179,7 @@ class NebiusConfig(NebiusStoredConfig):
179179
"""
180180

181181
creds: AnyNebiusCreds
182+
183+
184+
class NebiusOfferBackendData(CoreModel):
185+
fabrics: set[str] = set()

src/dstack/_internal/core/backends/nebius/resources.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,14 @@
5050
from nebius.sdk import SDK
5151

5252
from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error
53+
from dstack._internal.core.backends.base.offers import get_catalog_offers
5354
from dstack._internal.core.backends.nebius.models import (
5455
DEFAULT_PROJECT_NAME_PREFIX,
56+
NebiusOfferBackendData,
5557
NebiusServiceAccountCreds,
5658
)
5759
from dstack._internal.core.errors import BackendError, NoCapacityError
60+
from dstack._internal.core.models.backends.base import BackendType
5861
from dstack._internal.utils.event_loop import DaemonEventLoop
5962
from dstack._internal.utils.logging import get_logger
6063

@@ -249,6 +252,17 @@ def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
249252
raise BackendError(f"Could not find default subnet in project {project_id}")
250253

251254

255+
def get_all_infiniband_fabrics() -> set[str]:
    """Return the union of InfiniBand fabrics listed across Nebius catalog offers.

    Every offer returned by ``get_catalog_offers`` for the Nebius backend has
    its ``backend_data`` parsed as ``NebiusOfferBackendData``; the ``fabrics``
    sets of all offers are merged into one set. The result is empty when the
    catalog has no offers or none of them lists a fabric.
    """
    fabrics_per_offer = (
        NebiusOfferBackendData.__response__.parse_obj(catalog_offer.backend_data).fabrics
        for catalog_offer in get_catalog_offers(backend=BackendType.NEBIUS)
    )
    all_fabrics: set[str] = set().union(*fabrics_per_offer)
    return all_fabrics
264+
265+
252266
def create_disk(
253267
sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str, labels: Dict[str, str]
254268
) -> SDKOperation[Operation]:

src/tests/_internal/server/routers/test_backends.py

Lines changed: 58 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,13 @@
1313

1414
from dstack._internal.core.backends.oci import region as oci_region
1515
from dstack._internal.core.models.backends.base import BackendType
16-
from dstack._internal.core.models.instances import InstanceStatus
16+
from dstack._internal.core.models.instances import (
17+
Gpu,
18+
InstanceOffer,
19+
InstanceStatus,
20+
InstanceType,
21+
Resources,
22+
)
1723
from dstack._internal.core.models.users import GlobalRole, ProjectRole
1824
from dstack._internal.core.models.volumes import VolumeStatus
1925
from dstack._internal.server.models import BackendModel
@@ -212,6 +218,30 @@ async def test_creates_lambda_backend(
212218
@pytest.mark.skipif(sys.version_info < (3, 10), reason="Nebius requires Python 3.10")
213219
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
214220
class TestNebius:
221+
@pytest.fixture(autouse=True)
222+
def patch_catalog(self):
223+
with patch(
224+
"dstack._internal.core.backends.nebius.resources.get_catalog_offers"
225+
) as get_catalog_offers_mock:
226+
get_catalog_offers_mock.return_value = [
227+
InstanceOffer(
228+
backend=BackendType.NEBIUS,
229+
instance=InstanceType(
230+
name="gpu-h100-sxm 8gpu-128vcpu-1600gb",
231+
resources=Resources(
232+
cpus=128,
233+
memory_mib=1600 * 1024,
234+
gpus=[Gpu(name="H100", memory_mib=80 * 1024)] * 8,
235+
spot=False,
236+
),
237+
),
238+
region="eu-north1",
239+
price=23.6,
240+
backend_data={"fabrics": ["fabric-2", "fabric-3"]},
241+
)
242+
]
243+
yield
244+
215245
async def test_not_creates_with_invalid_creds(
216246
self, test_db, session: AsyncSession, client: AsyncClient
217247
):
@@ -238,18 +268,16 @@ async def test_not_creates_with_invalid_creds(
238268
assert len(res.scalars().all()) == 0
239269

240270
@pytest.mark.parametrize(
241-
("config_regions", "config_projects", "mocked_projects", "error"),
271+
("config_extra", "mocked_projects", "error"),
242272
[
243273
pytest.param(
244-
None,
245-
None,
274+
{},
246275
[_nebius_project()],
247276
None,
248277
id="default",
249278
),
250279
pytest.param(
251-
["eu-north1"],
252-
None,
280+
{"regions": ["eu-north1"]},
253281
[
254282
_nebius_project(
255283
"project-e00test", "default-project-eu-north1", "eu-north1"
@@ -260,15 +288,13 @@ async def test_not_creates_with_invalid_creds(
260288
id="with-regions",
261289
),
262290
pytest.param(
263-
["xx-xxxx1"],
264-
None,
291+
{"regions": ["xx-xxxx1"]},
265292
[_nebius_project()],
266293
"do not exist in this Nebius tenancy",
267294
id="error-invalid-regions",
268295
),
269296
pytest.param(
270-
["eu-north1"],
271-
None,
297+
{"regions": ["eu-north1"]},
272298
[
273299
_nebius_project(
274300
"project-e00test0", "default-project-eu-north1", "eu-north1"
@@ -279,8 +305,7 @@ async def test_not_creates_with_invalid_creds(
279305
id="finds-default-project-among-many",
280306
),
281307
pytest.param(
282-
["eu-north1"],
283-
None,
308+
{"regions": ["eu-north1"]},
284309
[
285310
_nebius_project("project-e00test0", "non-default-project-0", "eu-north1"),
286311
_nebius_project("project-e00test1", "non-default-project-1", "eu-north1"),
@@ -289,8 +314,7 @@ async def test_not_creates_with_invalid_creds(
289314
id="error-no-default-project",
290315
),
291316
pytest.param(
292-
None,
293-
["project-e00test0"],
317+
{"projects": ["project-e00test0"]},
294318
[
295319
_nebius_project("project-e00test0", "non-default-project-0", "eu-north1"),
296320
_nebius_project("project-e00test1", "non-default-project-1", "eu-north1"),
@@ -299,15 +323,13 @@ async def test_not_creates_with_invalid_creds(
299323
id="with-projects",
300324
),
301325
pytest.param(
302-
None,
303-
["project-e00xxxx"],
326+
{"projects": ["project-e00xxxx"]},
304327
[_nebius_project()],
305328
"not found in this Nebius tenancy",
306329
id="error-invalid-projects",
307330
),
308331
pytest.param(
309-
None,
310-
["project-e00test0", "project-e00test1"],
332+
{"projects": ["project-e00test0", "project-e00test1"]},
311333
[
312334
_nebius_project("project-e00test0", "non-default-project-0", "eu-north1"),
313335
_nebius_project("project-e00test1", "non-default-project-1", "eu-north1"),
@@ -316,8 +338,10 @@ async def test_not_creates_with_invalid_creds(
316338
id="error-multiple-projects-in-same-region",
317339
),
318340
pytest.param(
319-
["eu-north1"],
320-
["project-e00test"],
341+
{
342+
"regions": ["eu-north1"],
343+
"projects": ["project-e00test"],
344+
},
321345
[
322346
_nebius_project(
323347
"project-e00test", "default-project-eu-north1", "eu-north1"
@@ -327,15 +351,26 @@ async def test_not_creates_with_invalid_creds(
327351
None,
328352
id="with-regions-and-projects",
329353
),
354+
pytest.param(
355+
{"fabrics": ["fabric-2", "fabric-3"]},
356+
[_nebius_project()],
357+
None,
358+
id="with-valid-fabrics",
359+
),
360+
pytest.param(
361+
{"fabrics": ["fabric-2", "fabric-invalid"]},
362+
[_nebius_project()],
363+
"InfiniBand fabrics do not exist",
364+
id="with-invalid-fabrics",
365+
),
330366
],
331367
)
332368
async def test_create(
333369
self,
334370
test_db,
335371
session: AsyncSession,
336372
client: AsyncClient,
337-
config_regions: Optional[list[str]],
338-
config_projects: Optional[list[str]],
373+
config_extra: dict[str, Any],
339374
mocked_projects: Sequence[Any],
340375
error: Optional[str],
341376
):
@@ -347,8 +382,7 @@ async def test_create(
347382
body = {
348383
"type": "nebius",
349384
"creds": FAKE_NEBIUS_SERVICE_ACCOUNT_CREDS,
350-
"regions": config_regions,
351-
"projects": config_projects,
385+
**config_extra,
352386
}
353387
with patch(
354388
"dstack._internal.core.backends.nebius.resources.list_tenant_projects"

0 commit comments

Comments
 (0)