From 0ff98efc20b604f539a5a23b7bff84050f0b746d Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 15 Aug 2025 23:46:15 +0200 Subject: [PATCH 01/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 (WIP) --- src/dstack/_internal/cli/commands/list.py | 296 +++++++++++++++++++ src/dstack/_internal/cli/main.py | 2 + src/dstack/_internal/server/routers/runs.py | 14 + src/dstack/_internal/server/schemas/runs.py | 59 +++- src/dstack/_internal/server/services/runs.py | 252 +++++++++++++++- src/dstack/api/server/_runs.py | 15 + 6 files changed, 635 insertions(+), 3 deletions(-) create mode 100644 src/dstack/_internal/cli/commands/list.py diff --git a/src/dstack/_internal/cli/commands/list.py b/src/dstack/_internal/cli/commands/list.py new file mode 100644 index 0000000000..549f489ff6 --- /dev/null +++ b/src/dstack/_internal/cli/commands/list.py @@ -0,0 +1,296 @@ +import argparse +import contextlib +import json +import shutil +from pathlib import Path + +from rich.table import Table + +from dstack._internal.cli.commands import APIBaseCommand +from dstack._internal.cli.services.configurators.run import BaseRunConfigurator +from dstack._internal.cli.utils.common import console +from dstack._internal.core.models.configurations import ( + ApplyConfigurationType, + TaskConfiguration, +) +from dstack._internal.core.models.profiles import SpotPolicy +from dstack._internal.core.models.runs import Requirements, RunSpec, get_policy_map +from dstack.api.utils import load_profile + + +class ListGpuConfigurator(BaseRunConfigurator): + TYPE = ApplyConfigurationType.TASK + + @classmethod + def register_args( + cls, + parser: argparse.ArgumentParser, + ): + super().register_args(parser, default_max_offers=50) + parser.add_argument( + "--group-by", + action="append", + choices=["backend", "region"], + help="Group GPUs by backend and/or region. 
Can be specified multiple times.", + ) + + +class ListGpuCommand(APIBaseCommand): + NAME = "gpu" + DESCRIPTION = "List available GPUs" + + def _register(self): + super()._register() + self._parser.add_argument( + "--format", + choices=["plain", "json"], + default="plain", + help="Output format (default: plain)", + ) + self._parser.add_argument( + "--json", + action="store_const", + const="json", + dest="format", + help="Output in JSON format (equivalent to --format json)", + ) + ListGpuConfigurator.register_args(self._parser) + + def _command(self, args: argparse.Namespace): + super()._command(args) + conf = TaskConfiguration(commands=[":"]) + + configurator = ListGpuConfigurator(api_client=self.api) + configurator.apply_args(conf, args, []) + profile = load_profile(Path.cwd(), profile_name=args.profile) + + run_spec = RunSpec( + configuration=conf, + ssh_key_pub="(dummy)", + profile=profile, + ) + + if args.format == "plain": + status = console.status("Getting GPU information...") + else: + status = contextlib.nullcontext() + + with status: + gpu_response = self.api.client.runs.get_gpus( + self.api.project, + run_spec, + group_by=args.group_by, + ) + + if args.format == "json": + req = Requirements( + resources=run_spec.configuration.resources, + max_price=run_spec.merged_profile.max_price, + spot=get_policy_map(run_spec.merged_profile.spot_policy, default=SpotPolicy.AUTO), + reservation=run_spec.merged_profile.reservation, + ) + + if req.spot is None: + spot_policy = "auto" + elif req.spot: + spot_policy = "spot" + else: + spot_policy = "on-demand" + + output = { + "project": self.api.project, + "user": "admin", # TODO: Get actual user name + "resources": req.resources.dict(), + "spot_policy": spot_policy, + "max_price": req.max_price, + "reservation": run_spec.configuration.reservation, + "group_by": args.group_by, + "gpus": [], + } + + for gpu_group in gpu_response.gpus: + gpu_data = { + "name": gpu_group.name, + "memory_mib": gpu_group.memory_mib, + "vendor": 
gpu_group.vendor.value, + "availability": [av.value for av in gpu_group.availability], + "spot": gpu_group.spot, + "count": {"min": gpu_group.count.min, "max": gpu_group.count.max}, + "price": {"min": gpu_group.price.min, "max": gpu_group.price.max}, + } + + if gpu_group.backend: + gpu_data["backend"] = gpu_group.backend.value + if gpu_group.backends: + gpu_data["backends"] = [b.value for b in gpu_group.backends] + if gpu_group.region: + gpu_data["region"] = gpu_group.region + if gpu_group.regions: + gpu_data["regions"] = gpu_group.regions + + output["gpus"].append(gpu_data) + + print(json.dumps(output, indent=2)) + return + else: + self._print_gpu_table(gpu_response, run_spec, args.group_by) + + def _print_gpu_table(self, gpu_response, run_spec, group_by): + self._print_filter_info(run_spec, group_by) + + has_single_backend = any(gpu_group.backend for gpu_group in gpu_response.gpus) + has_single_region = any(gpu_group.region for gpu_group in gpu_response.gpus) + has_multiple_regions = any(gpu_group.regions for gpu_group in gpu_response.gpus) + + if has_single_backend and has_single_region: + backend_column = "BACKEND" + region_column = "REGION" + elif has_single_backend and has_multiple_regions: + backend_column = "BACKEND" + region_column = "REGIONS" + else: + backend_column = "BACKENDS" + region_column = None + + table = Table(box=None, expand=shutil.get_terminal_size(fallback=(120, 40)).columns <= 110) + table.add_column("#") + table.add_column("GPU", no_wrap=True, ratio=2) + table.add_column("SPOT", style="grey58", ratio=1) + table.add_column("PRICE", style="grey58", ratio=1) + table.add_column(backend_column, style="grey58", ratio=2) + if region_column: + table.add_column(region_column, style="grey58", ratio=2) + table.add_column() + + for i, gpu_group in enumerate(gpu_response.gpus, start=1): + backend_text = "" + if gpu_group.backend: + backend_text = gpu_group.backend.value + elif gpu_group.backends: + backend_text = ", ".join(b.value for b in 
gpu_group.backends) + + region_text = "" + if gpu_group.region: + region_text = gpu_group.region + elif gpu_group.regions: + if len(gpu_group.regions) <= 3: + region_text = ", ".join(gpu_group.regions) + else: + region_text = f"{len(gpu_group.regions)} regions" + + if not region_column: + if gpu_group.regions and len(gpu_group.regions) > 3: + shortened_region_text = f"{len(gpu_group.regions)} regions" + backends_display = ( + f"{backend_text} ({shortened_region_text})" + if shortened_region_text + else backend_text + ) + else: + backends_display = ( + f"{backend_text} ({region_text})" if region_text else backend_text + ) + else: + backends_display = backend_text + + memory_gb = f"{gpu_group.memory_mib // 1024}GB" + if gpu_group.count.min == gpu_group.count.max: + count_range = str(gpu_group.count.min) + else: + count_range = f"{gpu_group.count.min}..{gpu_group.count.max}" + gpu_spec = f"{gpu_group.name}:{memory_gb}:{count_range}" + + spot_types = [] + if "spot" in gpu_group.spot: + spot_types.append("spot") + if "on-demand" in gpu_group.spot: + spot_types.append("on-demand") + spot_display = ", ".join(spot_types) + + if gpu_group.price.min == gpu_group.price.max: + price_display = f"{gpu_group.price.min:.4f}".rstrip("0").rstrip(".") + else: + min_formatted = f"{gpu_group.price.min:.4f}".rstrip("0").rstrip(".") + max_formatted = f"{gpu_group.price.max:.4f}".rstrip("0").rstrip(".") + price_display = f"{min_formatted}..{max_formatted}" + + availability = "" + has_available = any(av.is_available() for av in gpu_group.availability) + has_unavailable = any(not av.is_available() for av in gpu_group.availability) + + if has_unavailable and not has_available: + for av in gpu_group.availability: + if av.value in {"not_available", "no_quota", "idle", "busy"}: + availability = av.value.replace("_", " ").lower() + break + + secondary_style = "grey58" + row_data = [ + f"[{secondary_style}]{i}[/]", + gpu_spec, + f"[{secondary_style}]{spot_display}[/]", + 
f"[{secondary_style}]{price_display}[/]", + f"[{secondary_style}]{backends_display}[/]", + ] + if region_column: + row_data.append(f"[{secondary_style}]{region_text}[/]") + row_data.append(f"[{secondary_style}]{availability}[/]") + + table.add_row(*row_data) + + console.print(table) + + def _print_filter_info(self, run_spec, group_by): + props = Table(box=None, show_header=False) + props.add_column(no_wrap=True) + props.add_column() + + req = Requirements( + resources=run_spec.configuration.resources, + max_price=run_spec.merged_profile.max_price, + spot=get_policy_map(run_spec.merged_profile.spot_policy, default=SpotPolicy.AUTO), + reservation=run_spec.merged_profile.reservation, + ) + + pretty_req = req.pretty_format(resources_only=True) + max_price = f"${req.max_price:3f}".rstrip("0").rstrip(".") if req.max_price else "-" + + if req.spot is None: + spot_policy = "auto" + elif req.spot: + spot_policy = "spot" + else: + spot_policy = "on-demand" + + def th(s: str) -> str: + return f"[bold]{s}[/bold]" + + props.add_row(th("Project"), self.api.project) + props.add_row(th("User"), "admin") # TODO: Get actual user name + props.add_row(th("Resources"), pretty_req) + props.add_row(th("Spot policy"), spot_policy) + props.add_row(th("Max price"), max_price) + props.add_row(th("Reservation"), run_spec.configuration.reservation or "-") + if group_by: + props.add_row(th("Group by"), ", ".join(group_by)) + + console.print(props) + console.print() + + +class ListCommand(APIBaseCommand): + NAME = "list" + DESCRIPTION = "List various resources" + + def _register(self): + super()._register() + subparsers = self._parser.add_subparsers(dest="subcommand", help="Available subcommands") + + gpu_parser = subparsers.add_parser("gpu", help="List available GPUs") + gpu_cmd = ListGpuCommand(gpu_parser) + gpu_cmd._register() + gpu_parser.set_defaults(func=gpu_cmd._command) + + def _command(self, args: argparse.Namespace): + if not hasattr(args, "subcommand") or args.subcommand is None: + 
self._parser.print_help() diff --git a/src/dstack/_internal/cli/main.py b/src/dstack/_internal/cli/main.py index c91d0f2feb..735430c1ea 100644 --- a/src/dstack/_internal/cli/main.py +++ b/src/dstack/_internal/cli/main.py @@ -12,6 +12,7 @@ from dstack._internal.cli.commands.fleet import FleetCommand from dstack._internal.cli.commands.gateway import GatewayCommand from dstack._internal.cli.commands.init import InitCommand +from dstack._internal.cli.commands.list import ListCommand from dstack._internal.cli.commands.logs import LogsCommand from dstack._internal.cli.commands.metrics import MetricsCommand from dstack._internal.cli.commands.offer import OfferCommand @@ -68,6 +69,7 @@ def main(): FleetCommand.register(subparsers) GatewayCommand.register(subparsers) InitCommand.register(subparsers) + ListCommand.register(subparsers) OfferCommand.register(subparsers) LogsCommand.register(subparsers) MetricsCommand.register(subparsers) diff --git a/src/dstack/_internal/server/routers/runs.py b/src/dstack/_internal/server/routers/runs.py index 8f3909503c..b08edddad8 100644 --- a/src/dstack/_internal/server/routers/runs.py +++ b/src/dstack/_internal/server/routers/runs.py @@ -10,9 +10,11 @@ from dstack._internal.server.schemas.runs import ( ApplyRunPlanRequest, DeleteRunsRequest, + GetRunGpusRequest, GetRunPlanRequest, GetRunRequest, ListRunsRequest, + RunGpusResponse, StopRunsRequest, SubmitRunRequest, ) @@ -179,6 +181,18 @@ async def delete_runs( await runs.delete_runs(session=session, project=project, runs_names=body.runs_names) +@project_router.post("/gpus", response_model=RunGpusResponse, response_model_exclude_none=True) +async def get_run_gpus( + body: GetRunGpusRequest, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), +) -> RunGpusResponse: + _, project = user_project + return await runs.get_run_gpus_grouped( + session=session, project=project, run_spec=body.run_spec, group_by=body.group_by + ) + 
+ # apply_plan replaces submit_run since it can create new runs. @project_router.post("/submit", deprecated=True) async def submit_run( diff --git a/src/dstack/_internal/server/schemas/runs.py b/src/dstack/_internal/server/schemas/runs.py index 8447243715..cb7937edc5 100644 --- a/src/dstack/_internal/server/schemas/runs.py +++ b/src/dstack/_internal/server/schemas/runs.py @@ -1,10 +1,14 @@ from datetime import datetime -from typing import Annotated, List, Optional +from typing import Annotated, List, Literal, Optional from uuid import UUID +import gpuhunt from pydantic import Field +from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.instances import InstanceAvailability +from dstack._internal.core.models.resources import Range from dstack._internal.core.models.runs import ApplyRunPlanInput, RunSpec @@ -64,3 +68,56 @@ class StopRunsRequest(CoreModel): class DeleteRunsRequest(CoreModel): runs_names: List[str] + + +class BackendGpu(CoreModel): + """GPU specification from a backend offer.""" + + name: str + memory_mib: int + vendor: gpuhunt.AcceleratorVendor + availability: InstanceAvailability + spot: bool + count: int + price: float + + +class BackendGpus(CoreModel): + """Backend GPU specifications.""" + + backend_type: BackendType + gpus: List[BackendGpu] + regions: List[str] + + +class GetRunGpusRequest(CoreModel): + """Request for getting run GPUs with optional grouping.""" + + run_spec: RunSpec + group_by: Optional[List[Literal["backend", "region"]]] = Field( + default=None, description="List of fields to group by. 
Valid values: 'backend', 'region'" + ) + + +class GpuGroup(CoreModel): + """GPU group that can handle all grouping scenarios.""" + + name: str + memory_mib: int + vendor: gpuhunt.AcceleratorVendor + availability: List[InstanceAvailability] + spot: List[Literal["spot", "on-demand"]] + count: Range[int] + price: Range[float] + backends: Optional[List[BackendType]] = None + backend: Optional[BackendType] = None + regions: Optional[List[str]] = None + region: Optional[str] = None + + +class RunGpusResponse(CoreModel): + """Response containing GPU specifications.""" + + gpus: List[GpuGroup] = Field( + description="List of GPU specifications, grouped according to the group_by parameter" + ) diff --git a/src/dstack/_internal/server/services/runs.py b/src/dstack/_internal/server/services/runs.py index 81d34a2ae3..af0e788231 100644 --- a/src/dstack/_internal/server/services/runs.py +++ b/src/dstack/_internal/server/services/runs.py @@ -3,7 +3,7 @@ import uuid from collections.abc import Iterable from datetime import datetime, timezone -from typing import List, Optional +from typing import Dict, List, Literal, Optional, Tuple import pydantic from apscheduler.triggers.cron import CronTrigger @@ -12,6 +12,7 @@ from sqlalchemy.orm import joinedload, selectinload import dstack._internal.utils.common as common_utils +from dstack._internal.core.backends.base.backend import Backend from dstack._internal.core.errors import ( RepoDoesNotExistError, ResourceNotExistsError, @@ -31,8 +32,10 @@ from dstack._internal.core.models.profiles import ( CreationPolicy, RetryEvent, + SpotPolicy, ) from dstack._internal.core.models.repos.virtual import DEFAULT_VIRTUAL_REPO_ID, VirtualRunRepoData +from dstack._internal.core.models.resources import Range from dstack._internal.core.models.runs import ( ApplyRunPlanInput, Job, @@ -42,12 +45,14 @@ JobSubmission, JobTerminationReason, ProbeSpec, + Requirements, Run, RunPlan, RunSpec, RunStatus, RunTerminationReason, ServiceSpec, + get_policy_map, ) from 
dstack._internal.core.models.volumes import ( InstanceMountPoint, @@ -65,6 +70,12 @@ RunModel, UserModel, ) +from dstack._internal.server.schemas.runs import ( + BackendGpu, + BackendGpus, + GpuGroup, + RunGpusResponse, +) from dstack._internal.server.services import repos as repos_services from dstack._internal.server.services import services from dstack._internal.server.services.docker import is_valid_docker_volume_target @@ -98,6 +109,9 @@ logger = get_logger(__name__) +# Cache for run specs with 5-minute TTL +# Key: (project_id, backend_config_hash), Value: (RunGpusResponse, timestamp) + JOB_TERMINATION_REASONS_TO_RETRY = { JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, @@ -1285,7 +1299,7 @@ def is_replica_registered(jobs: list[JobModel]) -> bool: return jobs[0].registered -def _remove_job_spec_sensitive_info(spec: JobSpec): +def _remove_job_spec_sensitive_info(spec: JobSpec) -> None: spec.ssh_key = None @@ -1303,3 +1317,237 @@ def _get_next_triggered_at(run_spec: RunSpec) -> Optional[datetime]: ) ) return min(fire_times) + + +async def _get_gpu_offers( + session: AsyncSession, project: ProjectModel, run_spec: RunSpec +) -> List[Tuple[Backend, InstanceOfferWithAvailability]]: + """Fetches all available instance offers that match the run spec's GPU requirements.""" + profile = run_spec.merged_profile + requirements = Requirements( + resources=run_spec.configuration.resources, + max_price=profile.max_price, + spot=get_policy_map(profile.spot_policy, default=SpotPolicy.AUTO), + reservation=profile.reservation, + ) + + return await get_offers_by_requirements( + project=project, + profile=profile, + requirements=requirements, + exclude_not_available=False, + multinode=False, + volumes=None, + privileged=False, + instance_mounts=False, + ) + + +def _process_offers_into_backend_gpus( + offers: List[Tuple[Backend, InstanceOfferWithAvailability]], +) -> List[BackendGpus]: + """Transforms raw offers into a structured list of BackendGpus, aggregating GPU info.""" + 
backend_data: Dict[str, Dict] = {} + + for backend, offer in offers: + backend_type = backend.TYPE + if backend_type not in backend_data: + backend_data[backend_type] = {"gpus": {}, "regions": set()} + + backend_data[backend_type]["regions"].add(offer.region) + + if not offer.instance.resources.gpus: + continue + + gpu_types_in_offer = {} + for gpu in offer.instance.resources.gpus: + gpu_type_key = (gpu.name, gpu.memory_mib, gpu.vendor) + if gpu_type_key not in gpu_types_in_offer: + gpu_types_in_offer[gpu_type_key] = 0 + gpu_types_in_offer[gpu_type_key] += 1 + + for (gpu_name, gpu_memory_mib, gpu_vendor), gpu_count_in_offer in gpu_types_in_offer.items(): + instance_config_key = ( + gpu_name, + gpu_memory_mib, + gpu_vendor, + gpu_count_in_offer, + offer.instance.resources.spot, + offer.region, + ) + + if instance_config_key not in backend_data[backend_type]["gpus"]: + backend_data[backend_type]["gpus"][instance_config_key] = BackendGpu( + name=gpu_name, + memory_mib=gpu_memory_mib, + vendor=gpu_vendor, + availability=offer.availability, + spot=offer.instance.resources.spot, + count=gpu_count_in_offer, + price=offer.price, + ) + + backend_gpus_list = [] + for backend_type, data in backend_data.items(): + gpus_list = sorted( + list(data["gpus"].values()), + key=lambda g: ( + not g.availability.is_available(), + g.vendor.value, + g.name, + g.memory_mib, + ), + ) + backend_gpus_list.append( + BackendGpus( + backend_type=backend_type, + gpus=gpus_list, + regions=sorted(list(data["regions"])), + ) + ) + return backend_gpus_list + + +def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str): + """Updates an existing GpuGroup with new data from another GPU offer.""" + spot_type: Literal["spot", "on-demand"] = "spot" if gpu.spot else "on-demand" + + if gpu.availability not in row.availability: + row.availability.append(gpu.availability) + if spot_type not in row.spot: + row.spot.append(spot_type) + if row.backends and backend_type not in row.backends: + 
row.backends.append(backend_type) + + row.count.min = min(row.count.min, gpu.count) + row.count.max = max(row.count.max, gpu.count) + row.price.min = min(row.price.min, gpu.price) + row.price.max = max(row.price.max, gpu.price) + + +def _get_gpus_with_no_grouping(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs into a flat list, without any grouping.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor) + if key not in gpu_rows: + price_range = Range[float](min=gpu.price, max=gpu.price) + + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=price_range, + backends=[backend.backend_type], + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + result = sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.name, + g.memory_mib, + ), + ) + + return result + + +def _get_gpus_grouped_by_backend(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by backend.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type) + if key not in gpu_rows: + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=gpu.price, max=gpu.price), + backend=backend.backend_type, + regions=backend.regions.copy(), + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + 
not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.backend.value, + g.name, + g.memory_mib, + ), + ) + + +def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by both backend and region.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for region in backend.regions: + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, region) + if key not in gpu_rows: + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=gpu.price, max=gpu.price), + backend=backend.backend_type, + region=region, + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.backend.value, + g.region, + g.name, + g.memory_mib, + ), + ) + + +async def get_run_gpus_grouped( + session: AsyncSession, + project: ProjectModel, + run_spec: RunSpec, + group_by: Optional[List[Literal["backend", "region"]]] = None, +) -> RunGpusResponse: + """Retrieves available GPU specifications based on a run spec, with optional grouping.""" + offers = await _get_gpu_offers(session, project, run_spec) + backend_gpus = _process_offers_into_backend_gpus(offers) + + group_by_set = set(group_by) if group_by else set() + + if "backend" in group_by_set and "region" in group_by_set: + gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus) + elif "backend" in group_by_set: + gpus = _get_gpus_grouped_by_backend(backend_gpus) + else: + gpus = _get_gpus_with_no_grouping(backend_gpus) + + return RunGpusResponse(gpus=gpus) diff --git a/src/dstack/api/server/_runs.py 
b/src/dstack/api/server/_runs.py index 745ce9c782..882cc5034c 100644 --- a/src/dstack/api/server/_runs.py +++ b/src/dstack/api/server/_runs.py @@ -18,9 +18,11 @@ from dstack._internal.server.schemas.runs import ( ApplyRunPlanRequest, DeleteRunsRequest, + GetRunGpusRequest, GetRunPlanRequest, GetRunRequest, ListRunsRequest, + RunGpusResponse, StopRunsRequest, ) from dstack.api.server._group import APIClientGroup @@ -94,3 +96,16 @@ def stop(self, project_name: str, runs_names: List[str], abort: bool): def delete(self, project_name: str, runs_names: List[str]): body = DeleteRunsRequest(runs_names=runs_names) self._request(f"/api/project/{project_name}/runs/delete", body=body.json()) + + def get_gpus( + self, + project_name: str, + run_spec: RunSpec, + group_by: Optional[List[str]] = None, + ) -> RunGpusResponse: + body = GetRunGpusRequest(run_spec=run_spec, group_by=group_by) + resp = self._request( + f"/api/project/{project_name}/runs/gpus", + body=body.json(), + ) + return parse_obj_as(RunGpusResponse, resp.json()) From dbb9c1a283cdc4febb4882e41b970b6b111f0aa0 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 15 Aug 2025 23:48:41 +0200 Subject: [PATCH 02/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 (WIP) Fix linter --- src/dstack/_internal/server/services/runs.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/dstack/_internal/server/services/runs.py b/src/dstack/_internal/server/services/runs.py index af0e788231..aae3b60769 100644 --- a/src/dstack/_internal/server/services/runs.py +++ b/src/dstack/_internal/server/services/runs.py @@ -1366,7 +1366,11 @@ def _process_offers_into_backend_gpus( gpu_types_in_offer[gpu_type_key] = 0 gpu_types_in_offer[gpu_type_key] += 1 - for (gpu_name, gpu_memory_mib, gpu_vendor), gpu_count_in_offer in gpu_types_in_offer.items(): + for ( + gpu_name, + gpu_memory_mib, + gpu_vendor, + ), gpu_count_in_offer in gpu_types_in_offer.items(): 
instance_config_key = ( gpu_name, gpu_memory_mib, @@ -1542,7 +1546,7 @@ async def get_run_gpus_grouped( backend_gpus = _process_offers_into_backend_gpus(offers) group_by_set = set(group_by) if group_by else set() - + if "backend" in group_by_set and "region" in group_by_set: gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus) elif "backend" in group_by_set: From 41f4edbaac4b53eb3537bcdde7850e5b405fbce0 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 16 Aug 2025 11:14:31 +0200 Subject: [PATCH 03/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 (WIP) Added grouping by count --- src/dstack/_internal/cli/commands/list.py | 6 +- src/dstack/_internal/server/schemas/runs.py | 5 +- src/dstack/_internal/server/services/runs.py | 130 ++++++++++++++++++- 3 files changed, 134 insertions(+), 7 deletions(-) diff --git a/src/dstack/_internal/cli/commands/list.py b/src/dstack/_internal/cli/commands/list.py index 549f489ff6..2e2cff21c3 100644 --- a/src/dstack/_internal/cli/commands/list.py +++ b/src/dstack/_internal/cli/commands/list.py @@ -30,8 +30,8 @@ def register_args( parser.add_argument( "--group-by", action="append", - choices=["backend", "region"], - help="Group GPUs by backend and/or region. Can be specified multiple times.", + choices=["backend", "region", "count"], + help="Group GPUs by backend, region, and/or count. 
Can be specified multiple times.", ) @@ -198,6 +198,8 @@ def _print_gpu_table(self, gpu_response, run_spec, group_by): count_range = str(gpu_group.count.min) else: count_range = f"{gpu_group.count.min}..{gpu_group.count.max}" + + # Always include count in GPU spec format: :: gpu_spec = f"{gpu_group.name}:{memory_gb}:{count_range}" spot_types = [] diff --git a/src/dstack/_internal/server/schemas/runs.py b/src/dstack/_internal/server/schemas/runs.py index cb7937edc5..940b26314c 100644 --- a/src/dstack/_internal/server/schemas/runs.py +++ b/src/dstack/_internal/server/schemas/runs.py @@ -94,8 +94,9 @@ class GetRunGpusRequest(CoreModel): """Request for getting run GPUs with optional grouping.""" run_spec: RunSpec - group_by: Optional[List[Literal["backend", "region"]]] = Field( - default=None, description="List of fields to group by. Valid values: 'backend', 'region'" + group_by: Optional[List[Literal["backend", "region", "count"]]] = Field( + default=None, + description="List of fields to group by. 
Valid values: 'backend', 'region', 'count'", ) diff --git a/src/dstack/_internal/server/services/runs.py b/src/dstack/_internal/server/services/runs.py index aae3b60769..e242be6384 100644 --- a/src/dstack/_internal/server/services/runs.py +++ b/src/dstack/_internal/server/services/runs.py @@ -1535,11 +1535,125 @@ def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> ) +def _get_gpus_grouped_by_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by GPU count.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor, gpu.count) + if key not in gpu_rows: + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=gpu.price, max=gpu.price), + backends=[backend.backend_type], + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.count.min, + g.name, + g.memory_mib, + ), + ) + + +def _get_gpus_grouped_by_backend_and_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by backend and GPU count.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.count) + if key not in gpu_rows: + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=gpu.price, max=gpu.price), + backend=backend.backend_type, + 
regions=backend.regions.copy(), + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.backend.value, + g.count.min, + g.name, + g.memory_mib, + ), + ) + + +def _get_gpus_grouped_by_backend_region_and_count( + backend_gpus: List[BackendGpus], +) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by backend, region, and GPU count.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for region in backend.regions: + for gpu in backend.gpus: + key = ( + gpu.name, + gpu.memory_mib, + gpu.vendor, + backend.backend_type, + region, + gpu.count, + ) + if key not in gpu_rows: + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=gpu.price, max=gpu.price), + backend=backend.backend_type, + region=region, + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.backend.value, + g.region, + g.count.min, + g.name, + g.memory_mib, + ), + ) + + async def get_run_gpus_grouped( session: AsyncSession, project: ProjectModel, run_spec: RunSpec, - group_by: Optional[List[Literal["backend", "region"]]] = None, + group_by: Optional[List[Literal["backend", "region", "count"]]] = None, ) -> RunGpusResponse: """Retrieves available GPU specifications based on a run spec, with optional grouping.""" offers = await _get_gpu_offers(session, project, run_spec) @@ -1547,10 +1661,20 @@ async def get_run_gpus_grouped( group_by_set = set(group_by) if group_by else set() - if "backend" in group_by_set and "region" in group_by_set: + # 
Determine grouping strategy based on combination + has_backend = "backend" in group_by_set + has_region = "region" in group_by_set + has_count = "count" in group_by_set + if has_backend and has_region and has_count: + gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus) + elif has_backend and has_count: + gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus) + elif has_backend and has_region: gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus) - elif "backend" in group_by_set: + elif has_backend: gpus = _get_gpus_grouped_by_backend(backend_gpus) + elif has_count: + gpus = _get_gpus_grouped_by_count(backend_gpus) else: gpus = _get_gpus_with_no_grouping(backend_gpus) From 67132712f43b102a27c0ecbf5044b53d9eb576a5 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 16 Aug 2025 11:46:48 +0200 Subject: [PATCH 04/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 (WIP) Made price per GPU --- src/dstack/_internal/cli/commands/list.py | 2 +- src/dstack/_internal/server/services/runs.py | 23 +++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/dstack/_internal/cli/commands/list.py b/src/dstack/_internal/cli/commands/list.py index 2e2cff21c3..d9413c28ac 100644 --- a/src/dstack/_internal/cli/commands/list.py +++ b/src/dstack/_internal/cli/commands/list.py @@ -156,7 +156,7 @@ def _print_gpu_table(self, gpu_response, run_spec, group_by): table.add_column("#") table.add_column("GPU", no_wrap=True, ratio=2) table.add_column("SPOT", style="grey58", ratio=1) - table.add_column("PRICE", style="grey58", ratio=1) + table.add_column("$/GPU", style="grey58", ratio=1) table.add_column(backend_column, style="grey58", ratio=2) if region_column: table.add_column(region_column, style="grey58", ratio=2) diff --git a/src/dstack/_internal/server/services/runs.py b/src/dstack/_internal/server/services/runs.py index e242be6384..eda8fcbe1b 100644 --- 
a/src/dstack/_internal/server/services/runs.py +++ b/src/dstack/_internal/server/services/runs.py @@ -1425,8 +1425,9 @@ def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str): row.count.min = min(row.count.min, gpu.count) row.count.max = max(row.count.max, gpu.count) - row.price.min = min(row.price.min, gpu.price) - row.price.max = max(row.price.max, gpu.price) + per_gpu_price = gpu.price / gpu.count + row.price.min = min(row.price.min, per_gpu_price) + row.price.max = max(row.price.max, per_gpu_price) def _get_gpus_with_no_grouping(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: @@ -1436,7 +1437,8 @@ def _get_gpus_with_no_grouping(backend_gpus: List[BackendGpus]) -> List[GpuGroup for gpu in backend.gpus: key = (gpu.name, gpu.memory_mib, gpu.vendor) if key not in gpu_rows: - price_range = Range[float](min=gpu.price, max=gpu.price) + per_gpu_price = gpu.price / gpu.count + price_range = Range[float](min=per_gpu_price, max=per_gpu_price) gpu_rows[key] = GpuGroup( name=gpu.name, @@ -1472,6 +1474,7 @@ def _get_gpus_grouped_by_backend(backend_gpus: List[BackendGpus]) -> List[GpuGro for gpu in backend.gpus: key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type) if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count gpu_rows[key] = GpuGroup( name=gpu.name, memory_mib=gpu.memory_mib, @@ -1479,7 +1482,7 @@ def _get_gpus_grouped_by_backend(backend_gpus: List[BackendGpus]) -> List[GpuGro availability=[gpu.availability], spot=["spot" if gpu.spot else "on-demand"], count=Range[int](min=gpu.count, max=gpu.count), - price=Range[float](min=gpu.price, max=gpu.price), + price=Range[float](min=per_gpu_price, max=per_gpu_price), backend=backend.backend_type, regions=backend.regions.copy(), ) @@ -1507,6 +1510,7 @@ def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> for gpu in backend.gpus: key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, region) if key not in gpu_rows: + per_gpu_price = gpu.price / 
gpu.count gpu_rows[key] = GpuGroup( name=gpu.name, memory_mib=gpu.memory_mib, @@ -1514,7 +1518,7 @@ def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> availability=[gpu.availability], spot=["spot" if gpu.spot else "on-demand"], count=Range[int](min=gpu.count, max=gpu.count), - price=Range[float](min=gpu.price, max=gpu.price), + price=Range[float](min=per_gpu_price, max=per_gpu_price), backend=backend.backend_type, region=region, ) @@ -1542,6 +1546,7 @@ def _get_gpus_grouped_by_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup for gpu in backend.gpus: key = (gpu.name, gpu.memory_mib, gpu.vendor, gpu.count) if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count gpu_rows[key] = GpuGroup( name=gpu.name, memory_mib=gpu.memory_mib, @@ -1549,7 +1554,7 @@ def _get_gpus_grouped_by_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup availability=[gpu.availability], spot=["spot" if gpu.spot else "on-demand"], count=Range[int](min=gpu.count, max=gpu.count), - price=Range[float](min=gpu.price, max=gpu.price), + price=Range[float](min=per_gpu_price, max=per_gpu_price), backends=[backend.backend_type], ) else: @@ -1575,6 +1580,7 @@ def _get_gpus_grouped_by_backend_and_count(backend_gpus: List[BackendGpus]) -> L for gpu in backend.gpus: key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.count) if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count gpu_rows[key] = GpuGroup( name=gpu.name, memory_mib=gpu.memory_mib, @@ -1582,7 +1588,7 @@ def _get_gpus_grouped_by_backend_and_count(backend_gpus: List[BackendGpus]) -> L availability=[gpu.availability], spot=["spot" if gpu.spot else "on-demand"], count=Range[int](min=gpu.count, max=gpu.count), - price=Range[float](min=gpu.price, max=gpu.price), + price=Range[float](min=per_gpu_price, max=per_gpu_price), backend=backend.backend_type, regions=backend.regions.copy(), ) @@ -1620,6 +1626,7 @@ def _get_gpus_grouped_by_backend_region_and_count( gpu.count, ) if key not 
in gpu_rows: + per_gpu_price = gpu.price / gpu.count gpu_rows[key] = GpuGroup( name=gpu.name, memory_mib=gpu.memory_mib, @@ -1627,7 +1634,7 @@ def _get_gpus_grouped_by_backend_region_and_count( availability=[gpu.availability], spot=["spot" if gpu.spot else "on-demand"], count=Range[int](min=gpu.count, max=gpu.count), - price=Range[float](min=gpu.price, max=gpu.price), + price=Range[float](min=per_gpu_price, max=per_gpu_price), backend=backend.backend_type, region=region, ) From e04c2904f40d5dd0bdbc353ba7224865d41b32da Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 18 Aug 2025 10:20:12 +0200 Subject: [PATCH 05/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 (WIP) Removed outdated comment --- src/dstack/_internal/server/services/runs.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/dstack/_internal/server/services/runs.py b/src/dstack/_internal/server/services/runs.py index eda8fcbe1b..4b95440140 100644 --- a/src/dstack/_internal/server/services/runs.py +++ b/src/dstack/_internal/server/services/runs.py @@ -109,9 +109,6 @@ logger = get_logger(__name__) -# Cache for run specs with 5-minute TTL -# Key: (project_id, backend_config_hash), Value: (RunGpusResponse, timestamp) - JOB_TERMINATION_REASONS_TO_RETRY = { JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, From ed80c2e3f6ad6c52e51e2d38c6bfa167c97376c2 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 18 Aug 2025 12:44:46 +0200 Subject: [PATCH 06/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 (WIP) Added tests --- .../_internal/server/routers/test_runs.py | 400 ++++++++++++++++++ 1 file changed, 400 insertions(+) diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 945e039495..37b6b89b5e 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -5,6 +5,7 @@ from unittest.mock 
import AsyncMock, Mock, patch from uuid import UUID +import gpuhunt import pytest from fastapi.testclient import TestClient from freezegun import freeze_time @@ -23,6 +24,7 @@ ) from dstack._internal.core.models.gateways import GatewayStatus from dstack._internal.core.models.instances import ( + Gpu, InstanceAvailability, InstanceOfferWithAvailability, InstanceStatus, @@ -1978,3 +1980,401 @@ async def test_return_error_if_specified_gateway_is_true( ) assert response.status_code == 422 assert "must be a string or boolean `false`, not boolean `true`" in response.text + + +# GPU Test Fixtures and Helpers + +async def gpu_test_setup(session: AsyncSession): + """Common setup for GPU tests: user, project, repo, run_spec.""" + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + return user, project, repo, run_spec + + +def create_gpu_offer( + backend: BackendType, + gpu_name: str, + gpu_memory_mib: int, + price: float, + spot: bool = False, + region: str = "us-west-2", + availability: InstanceAvailability = InstanceAvailability.AVAILABLE, + gpu_count: int = 1, + instance_name: Optional[str] = None, + vendor: gpuhunt.AcceleratorVendor = gpuhunt.AcceleratorVendor.NVIDIA, +) -> InstanceOfferWithAvailability: + """Helper to create GPU offers with sensible defaults.""" + if instance_name is None: + instance_name = f"{gpu_name.lower()}-instance" + + gpus = [Gpu(name=gpu_name, memory_mib=gpu_memory_mib, vendor=vendor) for _ in range(gpu_count)] + cpus = max(4, gpu_count * 4) + memory_mib = max(16384, gpu_count * 16384) + + return InstanceOfferWithAvailability( + backend=backend, + instance=InstanceType( + name=instance_name, + resources=Resources( + cpus=cpus, 
+ memory_mib=memory_mib, + spot=spot, + gpus=gpus + ), + ), + region=region, + price=price, + availability=availability, + ) + + +def create_mock_backends_with_offers( + offers_by_backend: Dict[BackendType, List[InstanceOfferWithAvailability]] +) -> List[Mock]: + """Helper to create mocked backends with specific offers.""" + mocked_backends = [] + + for backend_type, offers in offers_by_backend.items(): + backend_mock = Mock() + backend_mock.TYPE = backend_type + backend_mock.compute.return_value.get_offers_cached.return_value = offers + mocked_backends.append(backend_mock) + + return mocked_backends + + +async def call_gpus_api( + client: AsyncClient, + project_name: str, + user_token: str, + run_spec: RunSpec, + group_by: Optional[List[str]] = None +): + """Helper to call the GPUs API with standard parameters.""" + json_data = {"run_spec": run_spec.dict()} + if group_by is not None: + json_data["group_by"] = group_by + + return await client.post( + f"/api/project/{project_name}/runs/gpus", + headers=get_auth_headers(user_token), + json=json_data, + ) + + +class TestGetRunGpus: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + run_spec = get_run_spec(run_name="test-run", repo_id="test-repo") + response = await call_gpus_api(client, project.name, user.token, run_spec) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_gpus_without_group_by( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user, project, repo, run_spec = await gpu_test_setup(session) + + offer_aws = create_gpu_offer(BackendType.AWS, "T4", 16384, 0.50, spot=False) + 
offer_runpod = create_gpu_offer(BackendType.RUNPOD, "RTX4090", 24576, 0.35, spot=True, region="us-east-1") + offers_by_backend = {BackendType.AWS: [offer_aws], BackendType.RUNPOD: [offer_runpod]} + mocked_backends = create_mock_backends_with_offers(offers_by_backend) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = mocked_backends + response = await call_gpus_api(client, project.name, user.token, run_spec) + + assert response.status_code == 200 + response_data = response.json() + assert "gpus" in response_data + assert isinstance(response_data["gpus"], list) + assert len(response_data["gpus"]) >= 1 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_empty_gpus_when_no_offers( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers_cached.return_value = [] + m.return_value = [backend_mock_aws] + + response = await client.post( + f"/api/project/{project.name}/runs/gpus", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict()}, + ) + + assert response.status_code == 200 + response_data = response.json() + assert "gpus" in response_data + assert isinstance(response_data["gpus"], list) + assert len(response_data["gpus"]) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def 
test_invalid_group_by_rejected( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test that invalid group_by values are properly rejected.""" + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + + response = await client.post( + f"/api/project/{project.name}/runs/gpus", + headers=get_auth_headers(user.token), + json={ + "run_spec": run_spec.dict(), + "group_by": ["invalid_field"] + }, + ) + assert response.status_code == 422 + assert "validation error" in response.text.lower() or "invalid" in response.text.lower() + + + + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_exact_aggregation_values( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test exact aggregation values with precise validation (no >= or <=).""" + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + + offer_t4_spot = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="g4dn.xlarge", + resources=Resources( + cpus=4, + memory_mib=16384, + spot=True, + gpus=[Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA)] + ), + ), + region="us-west-2", + price=0.30, + availability=InstanceAvailability.AVAILABLE, + ) + offer_t4_ondemand = InstanceOfferWithAvailability( + backend=BackendType.AWS, + 
instance=InstanceType( + name="g4dn.2xlarge", + resources=Resources( + cpus=8, + memory_mib=32768, + spot=False, + gpus=[Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA)] + ), + ), + region="us-west-2", + price=0.60, + availability=InstanceAvailability.AVAILABLE, + ) + offer_t4_quota = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="g4dn.4xlarge", + resources=Resources( + cpus=16, + memory_mib=65536, + spot=True, + gpus=[Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA)] + ), + ), + region="us-east-1", + price=0.45, + availability=InstanceAvailability.NO_QUOTA + ) + offer_t4_multi = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="g4dn.12xlarge", + resources=Resources( + cpus=48, + memory_mib=196608, + spot=False, + gpus=[ + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), + ] + ), + ), + region="us-west-2", + price=2.40, + availability=InstanceAvailability.AVAILABLE, + ) + + offer_runpod_rtx_east = create_gpu_offer( + BackendType.RUNPOD, "RTX4090", 24576, 0.75, spot=True, region="us-east-1" + ) + offer_runpod_rtx_eu = create_gpu_offer( + BackendType.RUNPOD, "RTX4090", 24576, 0.65, spot=False, region="eu-west-1" + ) + offer_runpod_t4_east = create_gpu_offer( + BackendType.RUNPOD, "T4", 16384, 0.25, spot=True, region="us-east-1" + ) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers_cached.return_value = [ + offer_t4_spot, offer_t4_ondemand, offer_t4_quota, offer_t4_multi + ] + + backend_mock_runpod = Mock() + 
backend_mock_runpod.TYPE = BackendType.RUNPOD + backend_mock_runpod.compute.return_value.get_offers_cached.return_value = [ + offer_runpod_rtx_east, offer_runpod_rtx_eu, offer_runpod_t4_east + ] + + m.return_value = [backend_mock_aws, backend_mock_runpod] + + response = await client.post( + f"/api/project/{project.name}/runs/gpus", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict()}, + ) + assert response.status_code == 200 + data = response.json() + + assert len(data["gpus"]) == 2 + + t4_gpu = next((gpu for gpu in data["gpus"] if gpu["name"] == "T4"), None) + rtx_gpu = next((gpu for gpu in data["gpus"] if gpu["name"] == "RTX4090"), None) + + assert t4_gpu is not None + assert rtx_gpu is not None + + assert t4_gpu["price"]["min"] == 0.25 + assert t4_gpu["price"]["max"] == 0.60 + assert set(t4_gpu["backends"]) == {"aws", "runpod"} + + assert rtx_gpu["price"]["min"] == 0.65 + assert rtx_gpu["price"]["max"] == 0.75 + assert set(rtx_gpu["backends"]) == {"runpod"} + + response_count_grouped = await client.post( + f"/api/project/{project.name}/runs/gpus", + headers=get_auth_headers(user.token), + json={ + "run_spec": run_spec.dict(), + "group_by": ["count"] + }, + ) + assert response_count_grouped.status_code == 200 + count_grouped_data = response_count_grouped.json() + + assert len(count_grouped_data["gpus"]) == 3 + + t4_single_group = None + t4_multi_group = None + rtx_single_group = None + + for gpu in count_grouped_data["gpus"]: + if gpu["name"] == "T4" and gpu["count"]["min"] == 1 and gpu["count"]["max"] == 1: + t4_single_group = gpu + elif gpu["name"] == "T4" and gpu["count"]["min"] == 4 and gpu["count"]["max"] == 4: + t4_multi_group = gpu + elif gpu["name"] == "RTX4090" and gpu["count"]["min"] == 1 and gpu["count"]["max"] == 1: + rtx_single_group = gpu + + assert t4_single_group is not None + assert t4_multi_group is not None + assert rtx_single_group is not None + + assert t4_single_group["price"]["min"] == 0.25 + assert 
t4_single_group["price"]["max"] == 0.60 + assert t4_multi_group["price"]["min"] == 0.60 + assert t4_multi_group["price"]["max"] == 0.60 + assert rtx_single_group["price"]["min"] == 0.65 + assert rtx_single_group["price"]["max"] == 0.75 + + assert set(t4_single_group["backends"]) == {"aws", "runpod"} + assert set(t4_multi_group["backends"]) == {"aws"} + + response_backend = await client.post( + f"/api/project/{project.name}/runs/gpus", + headers=get_auth_headers(user.token), + json={ + "run_spec": run_spec.dict(), + "group_by": ["backend"] + }, + ) + assert response_backend.status_code == 200 + backend_data = response_backend.json() + + assert len(backend_data["gpus"]) == 3 + + t4_runpod = next((gpu for gpu in backend_data["gpus"] if gpu["name"] == "T4" and gpu.get("backend") == "runpod"), None) + t4_aws = next((gpu for gpu in backend_data["gpus"] if gpu["name"] == "T4" and gpu.get("backend") == "aws"), None) + rtx_runpod = next((gpu for gpu in backend_data["gpus"] if gpu["name"] == "RTX4090" and gpu.get("backend") == "runpod"), None) + + assert t4_runpod is not None + assert t4_aws is not None + assert rtx_runpod is not None + + assert t4_aws["price"] == {"min": 0.30, "max": 0.60} + assert t4_aws["count"] == {"min": 1, "max": 4} + assert t4_runpod["price"] == {"min": 0.25, "max": 0.25} + assert rtx_runpod["price"] == {"min": 0.65, "max": 0.75} + + # Test region grouping to validate multi-region, multi-backend setup + response_region = await client.post( + f"/api/project/{project.name}/runs/gpus", + headers=get_auth_headers(user.token), + json={ + "run_spec": run_spec.dict(), + "group_by": ["region"] + }, + ) + assert response_region.status_code == 200 + region_data = response_region.json() + + assert len(region_data["gpus"]) == 2 + + t4_region_group = next((gpu for gpu in region_data["gpus"] if gpu["name"] == "T4"), None) + rtx_region_group = next((gpu for gpu in region_data["gpus"] if gpu["name"] == "RTX4090"), None) + + assert t4_region_group is not None + assert 
rtx_region_group is not None + + assert set(t4_region_group["backends"]) == {"aws", "runpod"} + assert set(rtx_region_group["backends"]) == {"runpod"} + assert t4_region_group["price"] == {"min": 0.25, "max": 0.60} + assert rtx_region_group["price"] == {"min": 0.65, "max": 0.75} From 0a757c91810ed9c13c4d98c0c5e4243e0935bd26 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 18 Aug 2025 13:10:21 +0200 Subject: [PATCH 07/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 (WIP) Fixed linter --- .../_internal/server/routers/test_runs.py | 161 ++++++++++-------- 1 file changed, 92 insertions(+), 69 deletions(-) diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 37b6b89b5e..f0ebd53b30 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -1984,6 +1984,7 @@ async def test_return_error_if_specified_gateway_is_true( # GPU Test Fixtures and Helpers + async def gpu_test_setup(session: AsyncSession): """Common setup for GPU tests: user, project, repo, run_spec.""" user = await create_user(session=session, global_role=GlobalRole.USER) @@ -2011,21 +2012,16 @@ def create_gpu_offer( """Helper to create GPU offers with sensible defaults.""" if instance_name is None: instance_name = f"{gpu_name.lower()}-instance" - + gpus = [Gpu(name=gpu_name, memory_mib=gpu_memory_mib, vendor=vendor) for _ in range(gpu_count)] cpus = max(4, gpu_count * 4) memory_mib = max(16384, gpu_count * 16384) - + return InstanceOfferWithAvailability( backend=backend, instance=InstanceType( name=instance_name, - resources=Resources( - cpus=cpus, - memory_mib=memory_mib, - spot=spot, - gpus=gpus - ), + resources=Resources(cpus=cpus, memory_mib=memory_mib, spot=spot, gpus=gpus), ), region=region, price=price, @@ -2034,17 +2030,17 @@ def create_gpu_offer( def create_mock_backends_with_offers( - offers_by_backend: Dict[BackendType, 
List[InstanceOfferWithAvailability]] + offers_by_backend: Dict[BackendType, List[InstanceOfferWithAvailability]], ) -> List[Mock]: """Helper to create mocked backends with specific offers.""" mocked_backends = [] - + for backend_type, offers in offers_by_backend.items(): backend_mock = Mock() backend_mock.TYPE = backend_type backend_mock.compute.return_value.get_offers_cached.return_value = offers mocked_backends.append(backend_mock) - + return mocked_backends @@ -2053,13 +2049,13 @@ async def call_gpus_api( project_name: str, user_token: str, run_spec: RunSpec, - group_by: Optional[List[str]] = None + group_by: Optional[List[str]] = None, ): """Helper to call the GPUs API with standard parameters.""" json_data = {"run_spec": run_spec.dict()} if group_by is not None: json_data["group_by"] = group_by - + return await client.post( f"/api/project/{project_name}/runs/gpus", headers=get_auth_headers(user_token), @@ -2087,7 +2083,9 @@ async def test_returns_gpus_without_group_by( user, project, repo, run_spec = await gpu_test_setup(session) offer_aws = create_gpu_offer(BackendType.AWS, "T4", 16384, 0.50, spot=False) - offer_runpod = create_gpu_offer(BackendType.RUNPOD, "RTX4090", 24576, 0.35, spot=True, region="us-east-1") + offer_runpod = create_gpu_offer( + BackendType.RUNPOD, "RTX4090", 24576, 0.35, spot=True, region="us-east-1" + ) offers_by_backend = {BackendType.AWS: [offer_aws], BackendType.RUNPOD: [offer_runpod]} mocked_backends = create_mock_backends_with_offers(offers_by_backend) @@ -2149,17 +2147,11 @@ async def test_invalid_group_by_rejected( response = await client.post( f"/api/project/{project.name}/runs/gpus", headers=get_auth_headers(user.token), - json={ - "run_spec": run_spec.dict(), - "group_by": ["invalid_field"] - }, + json={"run_spec": run_spec.dict(), "group_by": ["invalid_field"]}, ) assert response.status_code == 422 assert "validation error" in response.text.lower() or "invalid" in response.text.lower() - - - @pytest.mark.asyncio 
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_exact_aggregation_values( @@ -2182,7 +2174,9 @@ async def test_exact_aggregation_values( cpus=4, memory_mib=16384, spot=True, - gpus=[Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA)] + gpus=[ + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) + ], ), ), region="us-west-2", @@ -2197,7 +2191,9 @@ async def test_exact_aggregation_values( cpus=8, memory_mib=32768, spot=False, - gpus=[Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA)] + gpus=[ + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) + ], ), ), region="us-west-2", @@ -2212,12 +2208,14 @@ async def test_exact_aggregation_values( cpus=16, memory_mib=65536, spot=True, - gpus=[Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA)] + gpus=[ + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) + ], ), ), region="us-east-1", price=0.45, - availability=InstanceAvailability.NO_QUOTA + availability=InstanceAvailability.NO_QUOTA, ) offer_t4_multi = InstanceOfferWithAvailability( backend=BackendType.AWS, @@ -2232,14 +2230,14 @@ async def test_exact_aggregation_values( Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), - ] + ], ), ), region="us-west-2", price=2.40, availability=InstanceAvailability.AVAILABLE, ) - + offer_runpod_rtx_east = create_gpu_offer( BackendType.RUNPOD, "RTX4090", 24576, 0.75, spot=True, region="us-east-1" ) @@ -2254,15 +2252,20 @@ async def test_exact_aggregation_values( backend_mock_aws = Mock() backend_mock_aws.TYPE = BackendType.AWS backend_mock_aws.compute.return_value.get_offers_cached.return_value = [ - offer_t4_spot, offer_t4_ondemand, offer_t4_quota, offer_t4_multi + offer_t4_spot, + 
offer_t4_ondemand, + offer_t4_quota, + offer_t4_multi, ] - + backend_mock_runpod = Mock() backend_mock_runpod.TYPE = BackendType.RUNPOD backend_mock_runpod.compute.return_value.get_offers_cached.return_value = [ - offer_runpod_rtx_east, offer_runpod_rtx_eu, offer_runpod_t4_east + offer_runpod_rtx_east, + offer_runpod_rtx_eu, + offer_runpod_t4_east, ] - + m.return_value = [backend_mock_aws, backend_mock_runpod] response = await client.post( @@ -2272,19 +2275,19 @@ async def test_exact_aggregation_values( ) assert response.status_code == 200 data = response.json() - + assert len(data["gpus"]) == 2 - + t4_gpu = next((gpu for gpu in data["gpus"] if gpu["name"] == "T4"), None) rtx_gpu = next((gpu for gpu in data["gpus"] if gpu["name"] == "RTX4090"), None) - + assert t4_gpu is not None assert rtx_gpu is not None - + assert t4_gpu["price"]["min"] == 0.25 assert t4_gpu["price"]["max"] == 0.60 assert set(t4_gpu["backends"]) == {"aws", "runpod"} - + assert rtx_gpu["price"]["min"] == 0.65 assert rtx_gpu["price"]["max"] == 0.75 assert set(rtx_gpu["backends"]) == {"runpod"} @@ -2292,88 +2295,108 @@ async def test_exact_aggregation_values( response_count_grouped = await client.post( f"/api/project/{project.name}/runs/gpus", headers=get_auth_headers(user.token), - json={ - "run_spec": run_spec.dict(), - "group_by": ["count"] - }, + json={"run_spec": run_spec.dict(), "group_by": ["count"]}, ) assert response_count_grouped.status_code == 200 count_grouped_data = response_count_grouped.json() - + assert len(count_grouped_data["gpus"]) == 3 - + t4_single_group = None t4_multi_group = None rtx_single_group = None - + for gpu in count_grouped_data["gpus"]: if gpu["name"] == "T4" and gpu["count"]["min"] == 1 and gpu["count"]["max"] == 1: t4_single_group = gpu elif gpu["name"] == "T4" and gpu["count"]["min"] == 4 and gpu["count"]["max"] == 4: t4_multi_group = gpu - elif gpu["name"] == "RTX4090" and gpu["count"]["min"] == 1 and gpu["count"]["max"] == 1: + elif ( + gpu["name"] == "RTX4090" 
+ and gpu["count"]["min"] == 1 + and gpu["count"]["max"] == 1 + ): rtx_single_group = gpu - + assert t4_single_group is not None assert t4_multi_group is not None assert rtx_single_group is not None - + assert t4_single_group["price"]["min"] == 0.25 assert t4_single_group["price"]["max"] == 0.60 assert t4_multi_group["price"]["min"] == 0.60 assert t4_multi_group["price"]["max"] == 0.60 assert rtx_single_group["price"]["min"] == 0.65 assert rtx_single_group["price"]["max"] == 0.75 - + assert set(t4_single_group["backends"]) == {"aws", "runpod"} assert set(t4_multi_group["backends"]) == {"aws"} response_backend = await client.post( f"/api/project/{project.name}/runs/gpus", headers=get_auth_headers(user.token), - json={ - "run_spec": run_spec.dict(), - "group_by": ["backend"] - }, + json={"run_spec": run_spec.dict(), "group_by": ["backend"]}, ) assert response_backend.status_code == 200 backend_data = response_backend.json() - + assert len(backend_data["gpus"]) == 3 - - t4_runpod = next((gpu for gpu in backend_data["gpus"] if gpu["name"] == "T4" and gpu.get("backend") == "runpod"), None) - t4_aws = next((gpu for gpu in backend_data["gpus"] if gpu["name"] == "T4" and gpu.get("backend") == "aws"), None) - rtx_runpod = next((gpu for gpu in backend_data["gpus"] if gpu["name"] == "RTX4090" and gpu.get("backend") == "runpod"), None) - + + t4_runpod = next( + ( + gpu + for gpu in backend_data["gpus"] + if gpu["name"] == "T4" and gpu.get("backend") == "runpod" + ), + None, + ) + t4_aws = next( + ( + gpu + for gpu in backend_data["gpus"] + if gpu["name"] == "T4" and gpu.get("backend") == "aws" + ), + None, + ) + rtx_runpod = next( + ( + gpu + for gpu in backend_data["gpus"] + if gpu["name"] == "RTX4090" and gpu.get("backend") == "runpod" + ), + None, + ) + assert t4_runpod is not None assert t4_aws is not None assert rtx_runpod is not None - + assert t4_aws["price"] == {"min": 0.30, "max": 0.60} assert t4_aws["count"] == {"min": 1, "max": 4} assert t4_runpod["price"] == 
{"min": 0.25, "max": 0.25} assert rtx_runpod["price"] == {"min": 0.65, "max": 0.75} - + # Test region grouping to validate multi-region, multi-backend setup response_region = await client.post( f"/api/project/{project.name}/runs/gpus", headers=get_auth_headers(user.token), - json={ - "run_spec": run_spec.dict(), - "group_by": ["region"] - }, + json={"run_spec": run_spec.dict(), "group_by": ["region"]}, ) assert response_region.status_code == 200 region_data = response_region.json() - + assert len(region_data["gpus"]) == 2 - - t4_region_group = next((gpu for gpu in region_data["gpus"] if gpu["name"] == "T4"), None) - rtx_region_group = next((gpu for gpu in region_data["gpus"] if gpu["name"] == "RTX4090"), None) - + + t4_region_group = next( + (gpu for gpu in region_data["gpus"] if gpu["name"] == "T4"), None + ) + rtx_region_group = next( + (gpu for gpu in region_data["gpus"] if gpu["name"] == "RTX4090"), None + ) + assert t4_region_group is not None assert rtx_region_group is not None - + assert set(t4_region_group["backends"]) == {"aws", "runpod"} assert set(rtx_region_group["backends"]) == {"runpod"} assert t4_region_group["price"] == {"min": 0.25, "max": 0.60} From 5cf1052a2c927257888fd7719df870baa824235a Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 18 Aug 2025 13:11:46 +0200 Subject: [PATCH 08/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 (WIP) Renamed `dstack list gpu` to `dstack gpu` --- .../cli/commands/{list.py => gpu.py} | 26 +++---------------- src/dstack/_internal/cli/main.py | 4 +-- 2 files changed, 6 insertions(+), 24 deletions(-) rename src/dstack/_internal/cli/commands/{list.py => gpu.py} (92%) diff --git a/src/dstack/_internal/cli/commands/list.py b/src/dstack/_internal/cli/commands/gpu.py similarity index 92% rename from src/dstack/_internal/cli/commands/list.py rename to src/dstack/_internal/cli/commands/gpu.py index d9413c28ac..892c0c8195 100644 --- 
a/src/dstack/_internal/cli/commands/list.py +++ b/src/dstack/_internal/cli/commands/gpu.py @@ -18,7 +18,7 @@ from dstack.api.utils import load_profile -class ListGpuConfigurator(BaseRunConfigurator): +class GpuConfigurator(BaseRunConfigurator): TYPE = ApplyConfigurationType.TASK @classmethod @@ -35,7 +35,7 @@ def register_args( ) -class ListGpuCommand(APIBaseCommand): +class GpuCommand(APIBaseCommand): NAME = "gpu" DESCRIPTION = "List available GPUs" @@ -54,13 +54,13 @@ def _register(self): dest="format", help="Output in JSON format (equivalent to --format json)", ) - ListGpuConfigurator.register_args(self._parser) + GpuConfigurator.register_args(self._parser) def _command(self, args: argparse.Namespace): super()._command(args) conf = TaskConfiguration(commands=[":"]) - configurator = ListGpuConfigurator(api_client=self.api) + configurator = GpuConfigurator(api_client=self.api) configurator.apply_args(conf, args, []) profile = load_profile(Path.cwd(), profile_name=args.profile) @@ -278,21 +278,3 @@ def th(s: str) -> str: console.print(props) console.print() - - -class ListCommand(APIBaseCommand): - NAME = "list" - DESCRIPTION = "List various resources" - - def _register(self): - super()._register() - subparsers = self._parser.add_subparsers(dest="subcommand", help="Available subcommands") - - gpu_parser = subparsers.add_parser("gpu", help="List available GPUs") - gpu_cmd = ListGpuCommand(gpu_parser) - gpu_cmd._register() - gpu_parser.set_defaults(func=gpu_cmd._command) - - def _command(self, args: argparse.Namespace): - if not hasattr(args, "subcommand") or args.subcommand is None: - self._parser.print_help() diff --git a/src/dstack/_internal/cli/main.py b/src/dstack/_internal/cli/main.py index 735430c1ea..c98d5526b1 100644 --- a/src/dstack/_internal/cli/main.py +++ b/src/dstack/_internal/cli/main.py @@ -11,8 +11,8 @@ from dstack._internal.cli.commands.delete import DeleteCommand from dstack._internal.cli.commands.fleet import FleetCommand from 
dstack._internal.cli.commands.gateway import GatewayCommand +from dstack._internal.cli.commands.gpu import GpuCommand from dstack._internal.cli.commands.init import InitCommand -from dstack._internal.cli.commands.list import ListCommand from dstack._internal.cli.commands.logs import LogsCommand from dstack._internal.cli.commands.metrics import MetricsCommand from dstack._internal.cli.commands.offer import OfferCommand @@ -69,7 +69,7 @@ def main(): FleetCommand.register(subparsers) GatewayCommand.register(subparsers) InitCommand.register(subparsers) - ListCommand.register(subparsers) + GpuCommand.register(subparsers) OfferCommand.register(subparsers) LogsCommand.register(subparsers) MetricsCommand.register(subparsers) From c8d7bdd4ab3f4dd97a02afe059ea001c880ff64d Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 18 Aug 2025 14:08:57 +0200 Subject: [PATCH 09/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 (WIP) Moved the /gpus API to a separate routing; also moved GPU-related logic from runs. 
--- src/dstack/_internal/cli/commands/gpu.py | 2 +- src/dstack/_internal/server/app.py | 2 + src/dstack/_internal/server/routers/gpus.py | 29 ++ src/dstack/_internal/server/routers/runs.py | 14 - src/dstack/_internal/server/schemas/gpus.py | 64 +++ src/dstack/_internal/server/schemas/runs.py | 60 +-- src/dstack/_internal/server/services/gpus.py | 386 +++++++++++++++ src/dstack/_internal/server/services/runs.py | 382 +-------------- src/dstack/api/server/__init__.py | 6 + src/dstack/api/server/_gpus.py | 22 + src/dstack/api/server/_runs.py | 15 - .../_internal/server/routers/test_gpus.py | 449 ++++++++++++++++++ .../_internal/server/routers/test_runs.py | 423 ----------------- 13 files changed, 961 insertions(+), 893 deletions(-) create mode 100644 src/dstack/_internal/server/routers/gpus.py create mode 100644 src/dstack/_internal/server/schemas/gpus.py create mode 100644 src/dstack/_internal/server/services/gpus.py create mode 100644 src/dstack/api/server/_gpus.py create mode 100644 src/tests/_internal/server/routers/test_gpus.py diff --git a/src/dstack/_internal/cli/commands/gpu.py b/src/dstack/_internal/cli/commands/gpu.py index 892c0c8195..f6f36cfa2c 100644 --- a/src/dstack/_internal/cli/commands/gpu.py +++ b/src/dstack/_internal/cli/commands/gpu.py @@ -76,7 +76,7 @@ def _command(self, args: argparse.Namespace): status = contextlib.nullcontext() with status: - gpu_response = self.api.client.runs.get_gpus( + gpu_response = self.api.client.gpus.get_gpus( self.api.project, run_spec, group_by=args.group_by, diff --git a/src/dstack/_internal/server/app.py b/src/dstack/_internal/server/app.py index 8e65897710..bbb666ac12 100644 --- a/src/dstack/_internal/server/app.py +++ b/src/dstack/_internal/server/app.py @@ -29,6 +29,7 @@ files, fleets, gateways, + gpus, instances, logs, metrics, @@ -204,6 +205,7 @@ def register_routes(app: FastAPI, ui: bool = True): app.include_router(repos.router) app.include_router(runs.root_router) app.include_router(runs.project_router) + 
app.include_router(gpus.project_router) app.include_router(metrics.router) app.include_router(logs.router) app.include_router(secrets.router) diff --git a/src/dstack/_internal/server/routers/gpus.py b/src/dstack/_internal/server/routers/gpus.py new file mode 100644 index 0000000000..3d1508c899 --- /dev/null +++ b/src/dstack/_internal/server/routers/gpus.py @@ -0,0 +1,29 @@ +from typing import Tuple + +from fastapi import APIRouter, Depends +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.server.db import get_session +from dstack._internal.server.models import ProjectModel, UserModel +from dstack._internal.server.schemas.gpus import GetRunGpusRequest, RunGpusResponse +from dstack._internal.server.security.permissions import ProjectMember +from dstack._internal.server.services.gpus import get_run_gpus_grouped +from dstack._internal.server.utils.routers import get_base_api_additional_responses + +project_router = APIRouter( + prefix="/api/project/{project_name}/gpus", + tags=["gpus"], + responses=get_base_api_additional_responses(), +) + + +@project_router.post("/list", response_model=RunGpusResponse, response_model_exclude_none=True) +async def get_run_gpus( + body: GetRunGpusRequest, + session: AsyncSession = Depends(get_session), + user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), +) -> RunGpusResponse: + _, project = user_project + return await get_run_gpus_grouped( + session=session, project=project, run_spec=body.run_spec, group_by=body.group_by + ) diff --git a/src/dstack/_internal/server/routers/runs.py b/src/dstack/_internal/server/routers/runs.py index b08edddad8..8f3909503c 100644 --- a/src/dstack/_internal/server/routers/runs.py +++ b/src/dstack/_internal/server/routers/runs.py @@ -10,11 +10,9 @@ from dstack._internal.server.schemas.runs import ( ApplyRunPlanRequest, DeleteRunsRequest, - GetRunGpusRequest, GetRunPlanRequest, GetRunRequest, ListRunsRequest, - RunGpusResponse, StopRunsRequest, SubmitRunRequest, 
) @@ -181,18 +179,6 @@ async def delete_runs( await runs.delete_runs(session=session, project=project, runs_names=body.runs_names) -@project_router.post("/gpus", response_model=RunGpusResponse, response_model_exclude_none=True) -async def get_run_gpus( - body: GetRunGpusRequest, - session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> RunGpusResponse: - _, project = user_project - return await runs.get_run_gpus_grouped( - session=session, project=project, run_spec=body.run_spec, group_by=body.group_by - ) - - # apply_plan replaces submit_run since it can create new runs. @project_router.post("/submit", deprecated=True) async def submit_run( diff --git a/src/dstack/_internal/server/schemas/gpus.py b/src/dstack/_internal/server/schemas/gpus.py new file mode 100644 index 0000000000..ea173f25fe --- /dev/null +++ b/src/dstack/_internal/server/schemas/gpus.py @@ -0,0 +1,64 @@ +from typing import List, Literal, Optional + +import gpuhunt +from pydantic import Field + +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.instances import InstanceAvailability +from dstack._internal.core.models.resources import Range +from dstack._internal.core.models.runs import RunSpec + + +class BackendGpu(CoreModel): + """GPU specification from a backend offer.""" + + name: str + memory_mib: int + vendor: gpuhunt.AcceleratorVendor + availability: InstanceAvailability + spot: bool + count: int + price: float + + +class BackendGpus(CoreModel): + """Backend GPU specifications.""" + + backend_type: BackendType + gpus: List[BackendGpu] + regions: List[str] + + +class GetRunGpusRequest(CoreModel): + """Request for getting run GPUs with optional grouping.""" + + run_spec: RunSpec + group_by: Optional[List[Literal["backend", "region", "count"]]] = Field( + default=None, + description="List of fields to group by. 
Valid values: 'backend', 'region', 'count'", + ) + + +class GpuGroup(CoreModel): + """GPU group that can handle all grouping scenarios.""" + + name: str + memory_mib: int + vendor: gpuhunt.AcceleratorVendor + availability: List[InstanceAvailability] + spot: List[Literal["spot", "on-demand"]] + count: Range[int] + price: Range[float] + backends: Optional[List[BackendType]] = None + backend: Optional[BackendType] = None + regions: Optional[List[str]] = None + region: Optional[str] = None + + +class RunGpusResponse(CoreModel): + """Response containing GPU specifications.""" + + gpus: List[GpuGroup] = Field( + description="List of GPU specifications, grouped according to the group_by parameter" + ) diff --git a/src/dstack/_internal/server/schemas/runs.py b/src/dstack/_internal/server/schemas/runs.py index 940b26314c..8447243715 100644 --- a/src/dstack/_internal/server/schemas/runs.py +++ b/src/dstack/_internal/server/schemas/runs.py @@ -1,14 +1,10 @@ from datetime import datetime -from typing import Annotated, List, Literal, Optional +from typing import Annotated, List, Optional from uuid import UUID -import gpuhunt from pydantic import Field -from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.common import CoreModel -from dstack._internal.core.models.instances import InstanceAvailability -from dstack._internal.core.models.resources import Range from dstack._internal.core.models.runs import ApplyRunPlanInput, RunSpec @@ -68,57 +64,3 @@ class StopRunsRequest(CoreModel): class DeleteRunsRequest(CoreModel): runs_names: List[str] - - -class BackendGpu(CoreModel): - """GPU specification from a backend offer.""" - - name: str - memory_mib: int - vendor: gpuhunt.AcceleratorVendor - availability: InstanceAvailability - spot: bool - count: int - price: float - - -class BackendGpus(CoreModel): - """Backend GPU specifications.""" - - backend_type: BackendType - gpus: List[BackendGpu] - regions: List[str] - - -class 
GetRunGpusRequest(CoreModel): - """Request for getting run GPUs with optional grouping.""" - - run_spec: RunSpec - group_by: Optional[List[Literal["backend", "region", "count"]]] = Field( - default=None, - description="List of fields to group by. Valid values: 'backend', 'region', 'count'", - ) - - -class GpuGroup(CoreModel): - """GPU group that can handle all grouping scenarios.""" - - name: str - memory_mib: int - vendor: gpuhunt.AcceleratorVendor - availability: List[InstanceAvailability] - spot: List[Literal["spot", "on-demand"]] - count: Range[int] - price: Range[float] - backends: Optional[List[BackendType]] = None - backend: Optional[BackendType] = None - regions: Optional[List[str]] = None - region: Optional[str] = None - - -class RunGpusResponse(CoreModel): - """Response containing GPU specifications.""" - - gpus: List[GpuGroup] = Field( - description="List of GPU specifications, grouped according to the group_by parameter" - ) diff --git a/src/dstack/_internal/server/services/gpus.py b/src/dstack/_internal/server/services/gpus.py new file mode 100644 index 0000000000..4d7bd6ba4d --- /dev/null +++ b/src/dstack/_internal/server/services/gpus.py @@ -0,0 +1,386 @@ +from typing import Dict, List, Literal, Optional, Tuple + +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.models.instances import InstanceOfferWithAvailability +from dstack._internal.core.models.profiles import SpotPolicy +from dstack._internal.core.models.resources import Range +from dstack._internal.core.models.runs import Requirements, RunSpec, get_policy_map +from dstack._internal.server.models import ProjectModel +from dstack._internal.server.schemas.gpus import ( + BackendGpu, + BackendGpus, + GpuGroup, + RunGpusResponse, +) +from dstack._internal.server.services.offers import get_offers_by_requirements + + +async def _get_gpu_offers( + session: AsyncSession, project: ProjectModel, run_spec: 
RunSpec +) -> List[Tuple[Backend, InstanceOfferWithAvailability]]: + """Fetches all available instance offers that match the run spec's GPU requirements.""" + profile = run_spec.merged_profile + requirements = Requirements( + resources=run_spec.configuration.resources, + max_price=profile.max_price, + spot=get_policy_map(profile.spot_policy, default=SpotPolicy.AUTO), + reservation=profile.reservation, + ) + + return await get_offers_by_requirements( + project=project, + profile=profile, + requirements=requirements, + exclude_not_available=False, + multinode=False, + volumes=None, + privileged=False, + instance_mounts=False, + ) + + +def _process_offers_into_backend_gpus( + offers: List[Tuple[Backend, InstanceOfferWithAvailability]], +) -> List[BackendGpus]: + """Transforms raw offers into a structured list of BackendGpus, aggregating GPU info.""" + backend_data: Dict[str, Dict] = {} + + for backend, offer in offers: + backend_type = backend.TYPE + if backend_type not in backend_data: + backend_data[backend_type] = {"gpus": {}, "regions": set()} + + backend_data[backend_type]["regions"].add(offer.region) + + if not offer.instance.resources.gpus: + continue + + gpu_types_in_offer = {} + for gpu in offer.instance.resources.gpus: + gpu_type_key = (gpu.name, gpu.memory_mib, gpu.vendor) + if gpu_type_key not in gpu_types_in_offer: + gpu_types_in_offer[gpu_type_key] = 0 + gpu_types_in_offer[gpu_type_key] += 1 + + for ( + gpu_name, + gpu_memory_mib, + gpu_vendor, + ), gpu_count_in_offer in gpu_types_in_offer.items(): + instance_config_key = ( + gpu_name, + gpu_memory_mib, + gpu_vendor, + gpu_count_in_offer, + offer.instance.resources.spot, + offer.region, + ) + + if instance_config_key not in backend_data[backend_type]["gpus"]: + backend_data[backend_type]["gpus"][instance_config_key] = BackendGpu( + name=gpu_name, + memory_mib=gpu_memory_mib, + vendor=gpu_vendor, + availability=offer.availability, + spot=offer.instance.resources.spot, + count=gpu_count_in_offer, + 
price=offer.price, + ) + + backend_gpus_list = [] + for backend_type, data in backend_data.items(): + gpus_list = sorted( + list(data["gpus"].values()), + key=lambda g: ( + not g.availability.is_available(), + g.vendor.value, + g.name, + g.memory_mib, + ), + ) + backend_gpus_list.append( + BackendGpus( + backend_type=backend_type, + gpus=gpus_list, + regions=sorted(list(data["regions"])), + ) + ) + return backend_gpus_list + + +def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str): + """Updates an existing GpuGroup with new data from another GPU offer.""" + spot_type: Literal["spot", "on-demand"] = "spot" if gpu.spot else "on-demand" + + if gpu.availability not in row.availability: + row.availability.append(gpu.availability) + if spot_type not in row.spot: + row.spot.append(spot_type) + if row.backends and backend_type not in row.backends: + row.backends.append(backend_type) + + row.count.min = min(row.count.min, gpu.count) + row.count.max = max(row.count.max, gpu.count) + per_gpu_price = gpu.price / gpu.count + row.price.min = min(row.price.min, per_gpu_price) + row.price.max = max(row.price.max, per_gpu_price) + + +def _get_gpus_with_no_grouping(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs into a flat list, without any grouping.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor) + if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count + price_range = Range[float](min=per_gpu_price, max=per_gpu_price) + + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=price_range, + backends=[backend.backend_type], + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + result = sorted( + list(gpu_rows.values()), + 
key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.name, + g.memory_mib, + ), + ) + + return result + + +def _get_gpus_grouped_by_backend(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by backend.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type) + if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=per_gpu_price, max=per_gpu_price), + backend=backend.backend_type, + regions=backend.regions.copy(), + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.backend.value, + g.name, + g.memory_mib, + ), + ) + + +def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by both backend and region.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for region in backend.regions: + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, region) + if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=per_gpu_price, max=per_gpu_price), + backend=backend.backend_type, + region=region, + ) + else: + _update_gpu_group(gpu_rows[key], gpu, 
backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.backend.value, + g.region, + g.name, + g.memory_mib, + ), + ) + + +def _get_gpus_grouped_by_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by GPU count.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor, gpu.count) + if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=per_gpu_price, max=per_gpu_price), + backends=[backend.backend_type], + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.count.min, + g.name, + g.memory_mib, + ), + ) + + +def _get_gpus_grouped_by_backend_and_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by backend and GPU count.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.count) + if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=per_gpu_price, max=per_gpu_price), + backend=backend.backend_type, + regions=backend.regions.copy(), + ) + else: + 
_update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.backend.value, + g.count.min, + g.name, + g.memory_mib, + ), + ) + + +def _get_gpus_grouped_by_backend_region_and_count( + backend_gpus: List[BackendGpus], +) -> List[GpuGroup]: + """Aggregates GPU specs, grouping them by backend, region, and GPU count.""" + gpu_rows: Dict[Tuple, GpuGroup] = {} + for backend in backend_gpus: + for region in backend.regions: + for gpu in backend.gpus: + key = ( + gpu.name, + gpu.memory_mib, + gpu.vendor, + backend.backend_type, + region, + gpu.count, + ) + if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=per_gpu_price, max=per_gpu_price), + backend=backend.backend_type, + region=region, + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + + return sorted( + list(gpu_rows.values()), + key=lambda g: ( + not any(av.is_available() for av in g.availability), + g.price.min, + g.price.max, + g.backend.value, + g.region, + g.count.min, + g.name, + g.memory_mib, + ), + ) + + +async def get_run_gpus_grouped( + session: AsyncSession, + project: ProjectModel, + run_spec: RunSpec, + group_by: Optional[List[Literal["backend", "region", "count"]]] = None, +) -> RunGpusResponse: + """Retrieves available GPU specifications based on a run spec, with optional grouping.""" + offers = await _get_gpu_offers(session, project, run_spec) + backend_gpus = _process_offers_into_backend_gpus(offers) + + group_by_set = set(group_by) if group_by else set() + + # Determine grouping strategy based on combination + has_backend = "backend" in group_by_set + has_region = 
"region" in group_by_set + has_count = "count" in group_by_set + if has_backend and has_region and has_count: + gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus) + elif has_backend and has_count: + gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus) + elif has_backend and has_region: + gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus) + elif has_backend: + gpus = _get_gpus_grouped_by_backend(backend_gpus) + elif has_count: + gpus = _get_gpus_grouped_by_count(backend_gpus) + else: + gpus = _get_gpus_with_no_grouping(backend_gpus) + + return RunGpusResponse(gpus=gpus) diff --git a/src/dstack/_internal/server/services/runs.py b/src/dstack/_internal/server/services/runs.py index 4b95440140..9f24d6fcb3 100644 --- a/src/dstack/_internal/server/services/runs.py +++ b/src/dstack/_internal/server/services/runs.py @@ -3,7 +3,7 @@ import uuid from collections.abc import Iterable from datetime import datetime, timezone -from typing import Dict, List, Literal, Optional, Tuple +from typing import List, Optional import pydantic from apscheduler.triggers.cron import CronTrigger @@ -12,7 +12,6 @@ from sqlalchemy.orm import joinedload, selectinload import dstack._internal.utils.common as common_utils -from dstack._internal.core.backends.base.backend import Backend from dstack._internal.core.errors import ( RepoDoesNotExistError, ResourceNotExistsError, @@ -32,10 +31,8 @@ from dstack._internal.core.models.profiles import ( CreationPolicy, RetryEvent, - SpotPolicy, ) from dstack._internal.core.models.repos.virtual import DEFAULT_VIRTUAL_REPO_ID, VirtualRunRepoData -from dstack._internal.core.models.resources import Range from dstack._internal.core.models.runs import ( ApplyRunPlanInput, Job, @@ -45,14 +42,12 @@ JobSubmission, JobTerminationReason, ProbeSpec, - Requirements, Run, RunPlan, RunSpec, RunStatus, RunTerminationReason, ServiceSpec, - get_policy_map, ) from dstack._internal.core.models.volumes import ( InstanceMountPoint, @@ -70,12 +65,6 @@ 
RunModel, UserModel, ) -from dstack._internal.server.schemas.runs import ( - BackendGpu, - BackendGpus, - GpuGroup, - RunGpusResponse, -) from dstack._internal.server.services import repos as repos_services from dstack._internal.server.services import services from dstack._internal.server.services.docker import is_valid_docker_volume_target @@ -1314,372 +1303,3 @@ def _get_next_triggered_at(run_spec: RunSpec) -> Optional[datetime]: ) ) return min(fire_times) - - -async def _get_gpu_offers( - session: AsyncSession, project: ProjectModel, run_spec: RunSpec -) -> List[Tuple[Backend, InstanceOfferWithAvailability]]: - """Fetches all available instance offers that match the run spec's GPU requirements.""" - profile = run_spec.merged_profile - requirements = Requirements( - resources=run_spec.configuration.resources, - max_price=profile.max_price, - spot=get_policy_map(profile.spot_policy, default=SpotPolicy.AUTO), - reservation=profile.reservation, - ) - - return await get_offers_by_requirements( - project=project, - profile=profile, - requirements=requirements, - exclude_not_available=False, - multinode=False, - volumes=None, - privileged=False, - instance_mounts=False, - ) - - -def _process_offers_into_backend_gpus( - offers: List[Tuple[Backend, InstanceOfferWithAvailability]], -) -> List[BackendGpus]: - """Transforms raw offers into a structured list of BackendGpus, aggregating GPU info.""" - backend_data: Dict[str, Dict] = {} - - for backend, offer in offers: - backend_type = backend.TYPE - if backend_type not in backend_data: - backend_data[backend_type] = {"gpus": {}, "regions": set()} - - backend_data[backend_type]["regions"].add(offer.region) - - if not offer.instance.resources.gpus: - continue - - gpu_types_in_offer = {} - for gpu in offer.instance.resources.gpus: - gpu_type_key = (gpu.name, gpu.memory_mib, gpu.vendor) - if gpu_type_key not in gpu_types_in_offer: - gpu_types_in_offer[gpu_type_key] = 0 - gpu_types_in_offer[gpu_type_key] += 1 - - for ( - 
gpu_name, - gpu_memory_mib, - gpu_vendor, - ), gpu_count_in_offer in gpu_types_in_offer.items(): - instance_config_key = ( - gpu_name, - gpu_memory_mib, - gpu_vendor, - gpu_count_in_offer, - offer.instance.resources.spot, - offer.region, - ) - - if instance_config_key not in backend_data[backend_type]["gpus"]: - backend_data[backend_type]["gpus"][instance_config_key] = BackendGpu( - name=gpu_name, - memory_mib=gpu_memory_mib, - vendor=gpu_vendor, - availability=offer.availability, - spot=offer.instance.resources.spot, - count=gpu_count_in_offer, - price=offer.price, - ) - - backend_gpus_list = [] - for backend_type, data in backend_data.items(): - gpus_list = sorted( - list(data["gpus"].values()), - key=lambda g: ( - not g.availability.is_available(), - g.vendor.value, - g.name, - g.memory_mib, - ), - ) - backend_gpus_list.append( - BackendGpus( - backend_type=backend_type, - gpus=gpus_list, - regions=sorted(list(data["regions"])), - ) - ) - return backend_gpus_list - - -def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str): - """Updates an existing GpuGroup with new data from another GPU offer.""" - spot_type: Literal["spot", "on-demand"] = "spot" if gpu.spot else "on-demand" - - if gpu.availability not in row.availability: - row.availability.append(gpu.availability) - if spot_type not in row.spot: - row.spot.append(spot_type) - if row.backends and backend_type not in row.backends: - row.backends.append(backend_type) - - row.count.min = min(row.count.min, gpu.count) - row.count.max = max(row.count.max, gpu.count) - per_gpu_price = gpu.price / gpu.count - row.price.min = min(row.price.min, per_gpu_price) - row.price.max = max(row.price.max, per_gpu_price) - - -def _get_gpus_with_no_grouping(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: - """Aggregates GPU specs into a flat list, without any grouping.""" - gpu_rows: Dict[Tuple, GpuGroup] = {} - for backend in backend_gpus: - for gpu in backend.gpus: - key = (gpu.name, gpu.memory_mib, 
gpu.vendor) - if key not in gpu_rows: - per_gpu_price = gpu.price / gpu.count - price_range = Range[float](min=per_gpu_price, max=per_gpu_price) - - gpu_rows[key] = GpuGroup( - name=gpu.name, - memory_mib=gpu.memory_mib, - vendor=gpu.vendor, - availability=[gpu.availability], - spot=["spot" if gpu.spot else "on-demand"], - count=Range[int](min=gpu.count, max=gpu.count), - price=price_range, - backends=[backend.backend_type], - ) - else: - _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) - - result = sorted( - list(gpu_rows.values()), - key=lambda g: ( - not any(av.is_available() for av in g.availability), - g.price.min, - g.price.max, - g.name, - g.memory_mib, - ), - ) - - return result - - -def _get_gpus_grouped_by_backend(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: - """Aggregates GPU specs, grouping them by backend.""" - gpu_rows: Dict[Tuple, GpuGroup] = {} - for backend in backend_gpus: - for gpu in backend.gpus: - key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type) - if key not in gpu_rows: - per_gpu_price = gpu.price / gpu.count - gpu_rows[key] = GpuGroup( - name=gpu.name, - memory_mib=gpu.memory_mib, - vendor=gpu.vendor, - availability=[gpu.availability], - spot=["spot" if gpu.spot else "on-demand"], - count=Range[int](min=gpu.count, max=gpu.count), - price=Range[float](min=per_gpu_price, max=per_gpu_price), - backend=backend.backend_type, - regions=backend.regions.copy(), - ) - else: - _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) - - return sorted( - list(gpu_rows.values()), - key=lambda g: ( - not any(av.is_available() for av in g.availability), - g.price.min, - g.price.max, - g.backend.value, - g.name, - g.memory_mib, - ), - ) - - -def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: - """Aggregates GPU specs, grouping them by both backend and region.""" - gpu_rows: Dict[Tuple, GpuGroup] = {} - for backend in backend_gpus: - for region in backend.regions: - for gpu 
in backend.gpus: - key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, region) - if key not in gpu_rows: - per_gpu_price = gpu.price / gpu.count - gpu_rows[key] = GpuGroup( - name=gpu.name, - memory_mib=gpu.memory_mib, - vendor=gpu.vendor, - availability=[gpu.availability], - spot=["spot" if gpu.spot else "on-demand"], - count=Range[int](min=gpu.count, max=gpu.count), - price=Range[float](min=per_gpu_price, max=per_gpu_price), - backend=backend.backend_type, - region=region, - ) - else: - _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) - - return sorted( - list(gpu_rows.values()), - key=lambda g: ( - not any(av.is_available() for av in g.availability), - g.price.min, - g.price.max, - g.backend.value, - g.region, - g.name, - g.memory_mib, - ), - ) - - -def _get_gpus_grouped_by_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: - """Aggregates GPU specs, grouping them by GPU count.""" - gpu_rows: Dict[Tuple, GpuGroup] = {} - for backend in backend_gpus: - for gpu in backend.gpus: - key = (gpu.name, gpu.memory_mib, gpu.vendor, gpu.count) - if key not in gpu_rows: - per_gpu_price = gpu.price / gpu.count - gpu_rows[key] = GpuGroup( - name=gpu.name, - memory_mib=gpu.memory_mib, - vendor=gpu.vendor, - availability=[gpu.availability], - spot=["spot" if gpu.spot else "on-demand"], - count=Range[int](min=gpu.count, max=gpu.count), - price=Range[float](min=per_gpu_price, max=per_gpu_price), - backends=[backend.backend_type], - ) - else: - _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) - - return sorted( - list(gpu_rows.values()), - key=lambda g: ( - not any(av.is_available() for av in g.availability), - g.price.min, - g.price.max, - g.count.min, - g.name, - g.memory_mib, - ), - ) - - -def _get_gpus_grouped_by_backend_and_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]: - """Aggregates GPU specs, grouping them by backend and GPU count.""" - gpu_rows: Dict[Tuple, GpuGroup] = {} - for backend in backend_gpus: - for gpu in 
backend.gpus: - key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.count) - if key not in gpu_rows: - per_gpu_price = gpu.price / gpu.count - gpu_rows[key] = GpuGroup( - name=gpu.name, - memory_mib=gpu.memory_mib, - vendor=gpu.vendor, - availability=[gpu.availability], - spot=["spot" if gpu.spot else "on-demand"], - count=Range[int](min=gpu.count, max=gpu.count), - price=Range[float](min=per_gpu_price, max=per_gpu_price), - backend=backend.backend_type, - regions=backend.regions.copy(), - ) - else: - _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) - - return sorted( - list(gpu_rows.values()), - key=lambda g: ( - not any(av.is_available() for av in g.availability), - g.price.min, - g.price.max, - g.backend.value, - g.count.min, - g.name, - g.memory_mib, - ), - ) - - -def _get_gpus_grouped_by_backend_region_and_count( - backend_gpus: List[BackendGpus], -) -> List[GpuGroup]: - """Aggregates GPU specs, grouping them by backend, region, and GPU count.""" - gpu_rows: Dict[Tuple, GpuGroup] = {} - for backend in backend_gpus: - for region in backend.regions: - for gpu in backend.gpus: - key = ( - gpu.name, - gpu.memory_mib, - gpu.vendor, - backend.backend_type, - region, - gpu.count, - ) - if key not in gpu_rows: - per_gpu_price = gpu.price / gpu.count - gpu_rows[key] = GpuGroup( - name=gpu.name, - memory_mib=gpu.memory_mib, - vendor=gpu.vendor, - availability=[gpu.availability], - spot=["spot" if gpu.spot else "on-demand"], - count=Range[int](min=gpu.count, max=gpu.count), - price=Range[float](min=per_gpu_price, max=per_gpu_price), - backend=backend.backend_type, - region=region, - ) - else: - _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) - - return sorted( - list(gpu_rows.values()), - key=lambda g: ( - not any(av.is_available() for av in g.availability), - g.price.min, - g.price.max, - g.backend.value, - g.region, - g.count.min, - g.name, - g.memory_mib, - ), - ) - - -async def get_run_gpus_grouped( - session: AsyncSession, - 
project: ProjectModel, - run_spec: RunSpec, - group_by: Optional[List[Literal["backend", "region", "count"]]] = None, -) -> RunGpusResponse: - """Retrieves available GPU specifications based on a run spec, with optional grouping.""" - offers = await _get_gpu_offers(session, project, run_spec) - backend_gpus = _process_offers_into_backend_gpus(offers) - - group_by_set = set(group_by) if group_by else set() - - # Determine grouping strategy based on combination - has_backend = "backend" in group_by_set - has_region = "region" in group_by_set - has_count = "count" in group_by_set - if has_backend and has_region and has_count: - gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus) - elif has_backend and has_count: - gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus) - elif has_backend and has_region: - gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus) - elif has_backend: - gpus = _get_gpus_grouped_by_backend(backend_gpus) - elif has_count: - gpus = _get_gpus_grouped_by_count(backend_gpus) - else: - gpus = _get_gpus_with_no_grouping(backend_gpus) - - return RunGpusResponse(gpus=gpus) diff --git a/src/dstack/api/server/__init__.py b/src/dstack/api/server/__init__.py index 5608d90bba..1cdbd5e7cb 100644 --- a/src/dstack/api/server/__init__.py +++ b/src/dstack/api/server/__init__.py @@ -17,6 +17,7 @@ from dstack.api.server._files import FilesAPIClient from dstack.api.server._fleets import FleetsAPIClient from dstack.api.server._gateways import GatewaysAPIClient +from dstack.api.server._gpus import GpusAPIClient from dstack.api.server._logs import LogsAPIClient from dstack.api.server._metrics import MetricsAPIClient from dstack.api.server._projects import ProjectsAPIClient @@ -44,6 +45,7 @@ class APIClient: backends: operations with backends fleets: operations with fleets runs: operations with runs + gpus: operations with GPUs metrics: operations with metrics logs: operations with logs gateways: operations with gateways @@ -93,6 +95,10 @@ 
def repos(self) -> ReposAPIClient: def runs(self) -> RunsAPIClient: return RunsAPIClient(self._request) + @property + def gpus(self) -> GpusAPIClient: + return GpusAPIClient(self._request) + @property def metrics(self) -> MetricsAPIClient: return MetricsAPIClient(self._request) diff --git a/src/dstack/api/server/_gpus.py b/src/dstack/api/server/_gpus.py new file mode 100644 index 0000000000..fc8c92f117 --- /dev/null +++ b/src/dstack/api/server/_gpus.py @@ -0,0 +1,22 @@ +from typing import List, Optional + +from pydantic import parse_obj_as + +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.server.schemas.gpus import GetRunGpusRequest, RunGpusResponse +from dstack.api.server._group import APIClientGroup + + +class GpusAPIClient(APIClientGroup): + def get_gpus( + self, + project_name: str, + run_spec: RunSpec, + group_by: Optional[List[str]] = None, + ) -> RunGpusResponse: + body = GetRunGpusRequest(run_spec=run_spec, group_by=group_by) + resp = self._request( + f"/api/project/{project_name}/gpus/list", + body=body.json(), + ) + return parse_obj_as(RunGpusResponse, resp.json()) diff --git a/src/dstack/api/server/_runs.py b/src/dstack/api/server/_runs.py index 882cc5034c..745ce9c782 100644 --- a/src/dstack/api/server/_runs.py +++ b/src/dstack/api/server/_runs.py @@ -18,11 +18,9 @@ from dstack._internal.server.schemas.runs import ( ApplyRunPlanRequest, DeleteRunsRequest, - GetRunGpusRequest, GetRunPlanRequest, GetRunRequest, ListRunsRequest, - RunGpusResponse, StopRunsRequest, ) from dstack.api.server._group import APIClientGroup @@ -96,16 +94,3 @@ def stop(self, project_name: str, runs_names: List[str], abort: bool): def delete(self, project_name: str, runs_names: List[str]): body = DeleteRunsRequest(runs_names=runs_names) self._request(f"/api/project/{project_name}/runs/delete", body=body.json()) - - def get_gpus( - self, - project_name: str, - run_spec: RunSpec, - group_by: Optional[List[str]] = None, - ) -> RunGpusResponse: - body = 
GetRunGpusRequest(run_spec=run_spec, group_by=group_by) - resp = self._request( - f"/api/project/{project_name}/runs/gpus", - body=body.json(), - ) - return parse_obj_as(RunGpusResponse, resp.json()) diff --git a/src/tests/_internal/server/routers/test_gpus.py b/src/tests/_internal/server/routers/test_gpus.py new file mode 100644 index 0000000000..e8a0c07467 --- /dev/null +++ b/src/tests/_internal/server/routers/test_gpus.py @@ -0,0 +1,449 @@ +from typing import Dict, List, Optional +from unittest.mock import Mock, patch + +import gpuhunt +import pytest +from httpx import AsyncClient +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + Gpu, + InstanceAvailability, + InstanceOfferWithAvailability, + InstanceType, + Resources, +) +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_project, + create_repo, + create_user, + get_auth_headers, + get_run_spec, +) + +pytestmark = pytest.mark.usefixtures("image_config_mock") + + +# GPU Test Fixtures and Helpers + + +async def gpu_test_setup(session: AsyncSession): + """Common setup for GPU tests: user, project, repo, run_spec.""" + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + return user, project, repo, run_spec + + +def create_gpu_offer( + backend: BackendType, + gpu_name: str, + gpu_memory_mib: int, + price: float, + spot: bool = False, + region: str = "us-west-2", + 
availability: InstanceAvailability = InstanceAvailability.AVAILABLE, + gpu_count: int = 1, + instance_name: Optional[str] = None, + vendor: gpuhunt.AcceleratorVendor = gpuhunt.AcceleratorVendor.NVIDIA, +) -> InstanceOfferWithAvailability: + """Helper to create GPU offers with sensible defaults.""" + if instance_name is None: + instance_name = f"{gpu_name.lower()}-instance" + + gpus = [Gpu(name=gpu_name, memory_mib=gpu_memory_mib, vendor=vendor) for _ in range(gpu_count)] + cpus = max(4, gpu_count * 4) + memory_mib = max(16384, gpu_count * 16384) + + return InstanceOfferWithAvailability( + backend=backend, + instance=InstanceType( + name=instance_name, + resources=Resources(cpus=cpus, memory_mib=memory_mib, spot=spot, gpus=gpus), + ), + region=region, + price=price, + availability=availability, + ) + + +def create_mock_backends_with_offers( + offers_by_backend: Dict[BackendType, List[InstanceOfferWithAvailability]], +) -> List[Mock]: + """Helper to create mocked backends with specific offers.""" + mocked_backends = [] + + for backend_type, offers in offers_by_backend.items(): + backend_mock = Mock() + backend_mock.TYPE = backend_type + backend_mock.compute.return_value.get_offers_cached.return_value = offers + mocked_backends.append(backend_mock) + + return mocked_backends + + +async def call_gpus_api( + client: AsyncClient, + project_name: str, + user_token: str, + run_spec: RunSpec, + group_by: Optional[List[str]] = None, +): + """Helper to call the GPUs API with standard parameters.""" + json_data = {"run_spec": run_spec.dict()} + if group_by is not None: + json_data["group_by"] = group_by + + return await client.post( + f"/api/project/{project_name}/gpus/list", + headers=get_auth_headers(user_token), + json=json_data, + ) + + +class TestGetRunGpus: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_403_if_not_project_member( + self, test_db, session: AsyncSession, client: AsyncClient + ): 
+ user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + run_spec = get_run_spec(run_name="test-run", repo_id="test-repo") + response = await call_gpus_api(client, project.name, user.token, run_spec) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_gpus_without_group_by( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user, project, repo, run_spec = await gpu_test_setup(session) + + offer_aws = create_gpu_offer(BackendType.AWS, "T4", 16384, 0.50, spot=False) + offer_runpod = create_gpu_offer( + BackendType.RUNPOD, "RTX4090", 24576, 0.35, spot=True, region="us-east-1" + ) + offers_by_backend = {BackendType.AWS: [offer_aws], BackendType.RUNPOD: [offer_runpod]} + mocked_backends = create_mock_backends_with_offers(offers_by_backend) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = mocked_backends + response = await call_gpus_api(client, project.name, user.token, run_spec) + + assert response.status_code == 200 + response_data = response.json() + assert "gpus" in response_data + assert isinstance(response_data["gpus"], list) + assert len(response_data["gpus"]) >= 1 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_empty_gpus_when_no_offers( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + + with 
patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers_cached.return_value = [] + m.return_value = [backend_mock_aws] + + response = await client.post( + f"/api/project/{project.name}/gpus/list", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict()}, + ) + + assert response.status_code == 200 + response_data = response.json() + assert "gpus" in response_data + assert isinstance(response_data["gpus"], list) + assert len(response_data["gpus"]) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_invalid_group_by_rejected( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test that invalid group_by values are properly rejected.""" + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + + response = await client.post( + f"/api/project/{project.name}/gpus/list", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict(), "group_by": ["invalid_field"]}, + ) + assert response.status_code == 422 + assert "validation error" in response.text.lower() or "invalid" in response.text.lower() + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_exact_aggregation_values( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test exact aggregation values with precise validation (no >= or <=).""" + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, 
owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + + offer_t4_spot = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="g4dn.xlarge", + resources=Resources( + cpus=4, + memory_mib=16384, + spot=True, + gpus=[ + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) + ], + ), + ), + region="us-west-2", + price=0.30, + availability=InstanceAvailability.AVAILABLE, + ) + offer_t4_ondemand = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="g4dn.2xlarge", + resources=Resources( + cpus=8, + memory_mib=32768, + spot=False, + gpus=[ + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) + ], + ), + ), + region="us-west-2", + price=0.60, + availability=InstanceAvailability.AVAILABLE, + ) + offer_t4_quota = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="g4dn.4xlarge", + resources=Resources( + cpus=16, + memory_mib=65536, + spot=True, + gpus=[ + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) + ], + ), + ), + region="us-east-1", + price=0.45, + availability=InstanceAvailability.NO_QUOTA, + ) + offer_t4_multi = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="g4dn.12xlarge", + resources=Resources( + cpus=48, + memory_mib=196608, + spot=False, + gpus=[ + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), + Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), + ], + ), + ), + region="us-west-2", + price=2.40, + 
availability=InstanceAvailability.AVAILABLE, + ) + + offer_runpod_rtx_east = create_gpu_offer( + BackendType.RUNPOD, "RTX4090", 24576, 0.75, spot=True, region="us-east-1" + ) + offer_runpod_rtx_eu = create_gpu_offer( + BackendType.RUNPOD, "RTX4090", 24576, 0.65, spot=False, region="eu-west-1" + ) + offer_runpod_t4_east = create_gpu_offer( + BackendType.RUNPOD, "T4", 16384, 0.25, spot=True, region="us-east-1" + ) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock_aws = Mock() + backend_mock_aws.TYPE = BackendType.AWS + backend_mock_aws.compute.return_value.get_offers_cached.return_value = [ + offer_t4_spot, + offer_t4_ondemand, + offer_t4_quota, + offer_t4_multi, + ] + + backend_mock_runpod = Mock() + backend_mock_runpod.TYPE = BackendType.RUNPOD + backend_mock_runpod.compute.return_value.get_offers_cached.return_value = [ + offer_runpod_rtx_east, + offer_runpod_rtx_eu, + offer_runpod_t4_east, + ] + + m.return_value = [backend_mock_aws, backend_mock_runpod] + + response = await client.post( + f"/api/project/{project.name}/gpus/list", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict()}, + ) + assert response.status_code == 200 + data = response.json() + + assert len(data["gpus"]) == 2 + + t4_gpu = next((gpu for gpu in data["gpus"] if gpu["name"] == "T4"), None) + rtx_gpu = next((gpu for gpu in data["gpus"] if gpu["name"] == "RTX4090"), None) + + assert t4_gpu is not None + assert rtx_gpu is not None + + assert t4_gpu["price"]["min"] == 0.25 + assert t4_gpu["price"]["max"] == 0.60 + assert set(t4_gpu["backends"]) == {"aws", "runpod"} + + assert rtx_gpu["price"]["min"] == 0.65 + assert rtx_gpu["price"]["max"] == 0.75 + assert set(rtx_gpu["backends"]) == {"runpod"} + + response_count_grouped = await client.post( + f"/api/project/{project.name}/gpus/list", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict(), "group_by": ["count"]}, + ) + assert 
response_count_grouped.status_code == 200 + count_grouped_data = response_count_grouped.json() + + assert len(count_grouped_data["gpus"]) == 3 + + t4_single_group = None + t4_multi_group = None + rtx_single_group = None + + for gpu in count_grouped_data["gpus"]: + if gpu["name"] == "T4" and gpu["count"]["min"] == 1 and gpu["count"]["max"] == 1: + t4_single_group = gpu + elif gpu["name"] == "T4" and gpu["count"]["min"] == 4 and gpu["count"]["max"] == 4: + t4_multi_group = gpu + elif ( + gpu["name"] == "RTX4090" + and gpu["count"]["min"] == 1 + and gpu["count"]["max"] == 1 + ): + rtx_single_group = gpu + + assert t4_single_group is not None + assert t4_multi_group is not None + assert rtx_single_group is not None + + assert t4_single_group["price"]["min"] == 0.25 + assert t4_single_group["price"]["max"] == 0.60 + assert t4_multi_group["price"]["min"] == 0.60 + assert t4_multi_group["price"]["max"] == 0.60 + assert rtx_single_group["price"]["min"] == 0.65 + assert rtx_single_group["price"]["max"] == 0.75 + + assert set(t4_single_group["backends"]) == {"aws", "runpod"} + assert set(t4_multi_group["backends"]) == {"aws"} + + response_backend = await client.post( + f"/api/project/{project.name}/gpus/list", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict(), "group_by": ["backend"]}, + ) + assert response_backend.status_code == 200 + backend_data = response_backend.json() + + assert len(backend_data["gpus"]) == 3 + + t4_runpod = next( + ( + gpu + for gpu in backend_data["gpus"] + if gpu["name"] == "T4" and gpu.get("backend") == "runpod" + ), + None, + ) + t4_aws = next( + ( + gpu + for gpu in backend_data["gpus"] + if gpu["name"] == "T4" and gpu.get("backend") == "aws" + ), + None, + ) + rtx_runpod = next( + ( + gpu + for gpu in backend_data["gpus"] + if gpu["name"] == "RTX4090" and gpu.get("backend") == "runpod" + ), + None, + ) + + assert t4_runpod is not None + assert t4_aws is not None + assert rtx_runpod is not None + + assert t4_aws["price"] 
== {"min": 0.30, "max": 0.60} + assert t4_aws["count"] == {"min": 1, "max": 4} + assert t4_runpod["price"] == {"min": 0.25, "max": 0.25} + assert rtx_runpod["price"] == {"min": 0.65, "max": 0.75} + + # Test region grouping to validate multi-region, multi-backend setup + response_region = await client.post( + f"/api/project/{project.name}/gpus/list", + headers=get_auth_headers(user.token), + json={"run_spec": run_spec.dict(), "group_by": ["region"]}, + ) + assert response_region.status_code == 200 + region_data = response_region.json() + + assert len(region_data["gpus"]) == 2 + + t4_region_group = next( + (gpu for gpu in region_data["gpus"] if gpu["name"] == "T4"), None + ) + rtx_region_group = next( + (gpu for gpu in region_data["gpus"] if gpu["name"] == "RTX4090"), None + ) + + assert t4_region_group is not None + assert rtx_region_group is not None + + assert set(t4_region_group["backends"]) == {"aws", "runpod"} + assert set(rtx_region_group["backends"]) == {"runpod"} + assert t4_region_group["price"] == {"min": 0.25, "max": 0.60} + assert rtx_region_group["price"] == {"min": 0.65, "max": 0.75} diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index f0ebd53b30..945e039495 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -5,7 +5,6 @@ from unittest.mock import AsyncMock, Mock, patch from uuid import UUID -import gpuhunt import pytest from fastapi.testclient import TestClient from freezegun import freeze_time @@ -24,7 +23,6 @@ ) from dstack._internal.core.models.gateways import GatewayStatus from dstack._internal.core.models.instances import ( - Gpu, InstanceAvailability, InstanceOfferWithAvailability, InstanceStatus, @@ -1980,424 +1978,3 @@ async def test_return_error_if_specified_gateway_is_true( ) assert response.status_code == 422 assert "must be a string or boolean `false`, not boolean `true`" in response.text - - -# GPU Test 
Fixtures and Helpers - - -async def gpu_test_setup(session: AsyncSession): - """Common setup for GPU tests: user, project, repo, run_spec.""" - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.USER - ) - repo = await create_repo(session=session, project_id=project.id) - run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) - return user, project, repo, run_spec - - -def create_gpu_offer( - backend: BackendType, - gpu_name: str, - gpu_memory_mib: int, - price: float, - spot: bool = False, - region: str = "us-west-2", - availability: InstanceAvailability = InstanceAvailability.AVAILABLE, - gpu_count: int = 1, - instance_name: Optional[str] = None, - vendor: gpuhunt.AcceleratorVendor = gpuhunt.AcceleratorVendor.NVIDIA, -) -> InstanceOfferWithAvailability: - """Helper to create GPU offers with sensible defaults.""" - if instance_name is None: - instance_name = f"{gpu_name.lower()}-instance" - - gpus = [Gpu(name=gpu_name, memory_mib=gpu_memory_mib, vendor=vendor) for _ in range(gpu_count)] - cpus = max(4, gpu_count * 4) - memory_mib = max(16384, gpu_count * 16384) - - return InstanceOfferWithAvailability( - backend=backend, - instance=InstanceType( - name=instance_name, - resources=Resources(cpus=cpus, memory_mib=memory_mib, spot=spot, gpus=gpus), - ), - region=region, - price=price, - availability=availability, - ) - - -def create_mock_backends_with_offers( - offers_by_backend: Dict[BackendType, List[InstanceOfferWithAvailability]], -) -> List[Mock]: - """Helper to create mocked backends with specific offers.""" - mocked_backends = [] - - for backend_type, offers in offers_by_backend.items(): - backend_mock = Mock() - backend_mock.TYPE = backend_type - backend_mock.compute.return_value.get_offers_cached.return_value = offers - mocked_backends.append(backend_mock) - - return 
mocked_backends - - -async def call_gpus_api( - client: AsyncClient, - project_name: str, - user_token: str, - run_spec: RunSpec, - group_by: Optional[List[str]] = None, -): - """Helper to call the GPUs API with standard parameters.""" - json_data = {"run_spec": run_spec.dict()} - if group_by is not None: - json_data["group_by"] = group_by - - return await client.post( - f"/api/project/{project_name}/runs/gpus", - headers=get_auth_headers(user_token), - json=json_data, - ) - - -class TestGetRunGpus: - @pytest.mark.asyncio - @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) - async def test_returns_403_if_not_project_member( - self, test_db, session: AsyncSession, client: AsyncClient - ): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - run_spec = get_run_spec(run_name="test-run", repo_id="test-repo") - response = await call_gpus_api(client, project.name, user.token, run_spec) - assert response.status_code == 403 - - @pytest.mark.asyncio - @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) - async def test_returns_gpus_without_group_by( - self, test_db, session: AsyncSession, client: AsyncClient - ): - user, project, repo, run_spec = await gpu_test_setup(session) - - offer_aws = create_gpu_offer(BackendType.AWS, "T4", 16384, 0.50, spot=False) - offer_runpod = create_gpu_offer( - BackendType.RUNPOD, "RTX4090", 24576, 0.35, spot=True, region="us-east-1" - ) - offers_by_backend = {BackendType.AWS: [offer_aws], BackendType.RUNPOD: [offer_runpod]} - mocked_backends = create_mock_backends_with_offers(offers_by_backend) - - with patch("dstack._internal.server.services.backends.get_project_backends") as m: - m.return_value = mocked_backends - response = await call_gpus_api(client, project.name, user.token, run_spec) - - assert response.status_code == 200 - response_data = response.json() - assert "gpus" in response_data - assert 
isinstance(response_data["gpus"], list) - assert len(response_data["gpus"]) >= 1 - - @pytest.mark.asyncio - @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) - async def test_returns_empty_gpus_when_no_offers( - self, test_db, session: AsyncSession, client: AsyncClient - ): - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.USER - ) - repo = await create_repo(session=session, project_id=project.id) - run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) - - with patch("dstack._internal.server.services.backends.get_project_backends") as m: - backend_mock_aws = Mock() - backend_mock_aws.TYPE = BackendType.AWS - backend_mock_aws.compute.return_value.get_offers_cached.return_value = [] - m.return_value = [backend_mock_aws] - - response = await client.post( - f"/api/project/{project.name}/runs/gpus", - headers=get_auth_headers(user.token), - json={"run_spec": run_spec.dict()}, - ) - - assert response.status_code == 200 - response_data = response.json() - assert "gpus" in response_data - assert isinstance(response_data["gpus"], list) - assert len(response_data["gpus"]) == 0 - - @pytest.mark.asyncio - @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) - async def test_invalid_group_by_rejected( - self, test_db, session: AsyncSession, client: AsyncClient - ): - """Test that invalid group_by values are properly rejected.""" - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.USER - ) - repo = await create_repo(session=session, project_id=project.id) - run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) - - response = await client.post( - 
f"/api/project/{project.name}/runs/gpus", - headers=get_auth_headers(user.token), - json={"run_spec": run_spec.dict(), "group_by": ["invalid_field"]}, - ) - assert response.status_code == 422 - assert "validation error" in response.text.lower() or "invalid" in response.text.lower() - - @pytest.mark.asyncio - @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) - async def test_exact_aggregation_values( - self, test_db, session: AsyncSession, client: AsyncClient - ): - """Test exact aggregation values with precise validation (no >= or <=).""" - user = await create_user(session=session, global_role=GlobalRole.USER) - project = await create_project(session=session, owner=user) - await add_project_member( - session=session, project=project, user=user, project_role=ProjectRole.USER - ) - repo = await create_repo(session=session, project_id=project.id) - run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) - - offer_t4_spot = InstanceOfferWithAvailability( - backend=BackendType.AWS, - instance=InstanceType( - name="g4dn.xlarge", - resources=Resources( - cpus=4, - memory_mib=16384, - spot=True, - gpus=[ - Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) - ], - ), - ), - region="us-west-2", - price=0.30, - availability=InstanceAvailability.AVAILABLE, - ) - offer_t4_ondemand = InstanceOfferWithAvailability( - backend=BackendType.AWS, - instance=InstanceType( - name="g4dn.2xlarge", - resources=Resources( - cpus=8, - memory_mib=32768, - spot=False, - gpus=[ - Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) - ], - ), - ), - region="us-west-2", - price=0.60, - availability=InstanceAvailability.AVAILABLE, - ) - offer_t4_quota = InstanceOfferWithAvailability( - backend=BackendType.AWS, - instance=InstanceType( - name="g4dn.4xlarge", - resources=Resources( - cpus=16, - memory_mib=65536, - spot=True, - gpus=[ - Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) - ], - ), - ), 
- region="us-east-1", - price=0.45, - availability=InstanceAvailability.NO_QUOTA, - ) - offer_t4_multi = InstanceOfferWithAvailability( - backend=BackendType.AWS, - instance=InstanceType( - name="g4dn.12xlarge", - resources=Resources( - cpus=48, - memory_mib=196608, - spot=False, - gpus=[ - Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), - Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), - Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), - Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA), - ], - ), - ), - region="us-west-2", - price=2.40, - availability=InstanceAvailability.AVAILABLE, - ) - - offer_runpod_rtx_east = create_gpu_offer( - BackendType.RUNPOD, "RTX4090", 24576, 0.75, spot=True, region="us-east-1" - ) - offer_runpod_rtx_eu = create_gpu_offer( - BackendType.RUNPOD, "RTX4090", 24576, 0.65, spot=False, region="eu-west-1" - ) - offer_runpod_t4_east = create_gpu_offer( - BackendType.RUNPOD, "T4", 16384, 0.25, spot=True, region="us-east-1" - ) - - with patch("dstack._internal.server.services.backends.get_project_backends") as m: - backend_mock_aws = Mock() - backend_mock_aws.TYPE = BackendType.AWS - backend_mock_aws.compute.return_value.get_offers_cached.return_value = [ - offer_t4_spot, - offer_t4_ondemand, - offer_t4_quota, - offer_t4_multi, - ] - - backend_mock_runpod = Mock() - backend_mock_runpod.TYPE = BackendType.RUNPOD - backend_mock_runpod.compute.return_value.get_offers_cached.return_value = [ - offer_runpod_rtx_east, - offer_runpod_rtx_eu, - offer_runpod_t4_east, - ] - - m.return_value = [backend_mock_aws, backend_mock_runpod] - - response = await client.post( - f"/api/project/{project.name}/runs/gpus", - headers=get_auth_headers(user.token), - json={"run_spec": run_spec.dict()}, - ) - assert response.status_code == 200 - data = response.json() - - assert len(data["gpus"]) == 2 - - t4_gpu = next((gpu for gpu in data["gpus"] if gpu["name"] == 
"T4"), None) - rtx_gpu = next((gpu for gpu in data["gpus"] if gpu["name"] == "RTX4090"), None) - - assert t4_gpu is not None - assert rtx_gpu is not None - - assert t4_gpu["price"]["min"] == 0.25 - assert t4_gpu["price"]["max"] == 0.60 - assert set(t4_gpu["backends"]) == {"aws", "runpod"} - - assert rtx_gpu["price"]["min"] == 0.65 - assert rtx_gpu["price"]["max"] == 0.75 - assert set(rtx_gpu["backends"]) == {"runpod"} - - response_count_grouped = await client.post( - f"/api/project/{project.name}/runs/gpus", - headers=get_auth_headers(user.token), - json={"run_spec": run_spec.dict(), "group_by": ["count"]}, - ) - assert response_count_grouped.status_code == 200 - count_grouped_data = response_count_grouped.json() - - assert len(count_grouped_data["gpus"]) == 3 - - t4_single_group = None - t4_multi_group = None - rtx_single_group = None - - for gpu in count_grouped_data["gpus"]: - if gpu["name"] == "T4" and gpu["count"]["min"] == 1 and gpu["count"]["max"] == 1: - t4_single_group = gpu - elif gpu["name"] == "T4" and gpu["count"]["min"] == 4 and gpu["count"]["max"] == 4: - t4_multi_group = gpu - elif ( - gpu["name"] == "RTX4090" - and gpu["count"]["min"] == 1 - and gpu["count"]["max"] == 1 - ): - rtx_single_group = gpu - - assert t4_single_group is not None - assert t4_multi_group is not None - assert rtx_single_group is not None - - assert t4_single_group["price"]["min"] == 0.25 - assert t4_single_group["price"]["max"] == 0.60 - assert t4_multi_group["price"]["min"] == 0.60 - assert t4_multi_group["price"]["max"] == 0.60 - assert rtx_single_group["price"]["min"] == 0.65 - assert rtx_single_group["price"]["max"] == 0.75 - - assert set(t4_single_group["backends"]) == {"aws", "runpod"} - assert set(t4_multi_group["backends"]) == {"aws"} - - response_backend = await client.post( - f"/api/project/{project.name}/runs/gpus", - headers=get_auth_headers(user.token), - json={"run_spec": run_spec.dict(), "group_by": ["backend"]}, - ) - assert response_backend.status_code == 200 
- backend_data = response_backend.json() - - assert len(backend_data["gpus"]) == 3 - - t4_runpod = next( - ( - gpu - for gpu in backend_data["gpus"] - if gpu["name"] == "T4" and gpu.get("backend") == "runpod" - ), - None, - ) - t4_aws = next( - ( - gpu - for gpu in backend_data["gpus"] - if gpu["name"] == "T4" and gpu.get("backend") == "aws" - ), - None, - ) - rtx_runpod = next( - ( - gpu - for gpu in backend_data["gpus"] - if gpu["name"] == "RTX4090" and gpu.get("backend") == "runpod" - ), - None, - ) - - assert t4_runpod is not None - assert t4_aws is not None - assert rtx_runpod is not None - - assert t4_aws["price"] == {"min": 0.30, "max": 0.60} - assert t4_aws["count"] == {"min": 1, "max": 4} - assert t4_runpod["price"] == {"min": 0.25, "max": 0.25} - assert rtx_runpod["price"] == {"min": 0.65, "max": 0.75} - - # Test region grouping to validate multi-region, multi-backend setup - response_region = await client.post( - f"/api/project/{project.name}/runs/gpus", - headers=get_auth_headers(user.token), - json={"run_spec": run_spec.dict(), "group_by": ["region"]}, - ) - assert response_region.status_code == 200 - region_data = response_region.json() - - assert len(region_data["gpus"]) == 2 - - t4_region_group = next( - (gpu for gpu in region_data["gpus"] if gpu["name"] == "T4"), None - ) - rtx_region_group = next( - (gpu for gpu in region_data["gpus"] if gpu["name"] == "RTX4090"), None - ) - - assert t4_region_group is not None - assert rtx_region_group is not None - - assert set(t4_region_group["backends"]) == {"aws", "runpod"} - assert set(rtx_region_group["backends"]) == {"runpod"} - assert t4_region_group["price"] == {"min": 0.25, "max": 0.60} - assert rtx_region_group["price"] == {"min": 0.65, "max": 0.75} From 5725924d1d9145ef6e380e97d372bc812f5120a3 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 18 Aug 2025 14:21:04 +0200 Subject: [PATCH 10/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 (WIP) Minor 
refactoring --- src/dstack/_internal/cli/commands/gpu.py | 2 +- src/dstack/_internal/server/routers/gpus.py | 14 +++++++------- src/dstack/_internal/server/schemas/gpus.py | 6 +++--- src/dstack/_internal/server/services/gpus.py | 8 ++++---- src/dstack/api/server/_gpus.py | 10 +++++----- src/tests/_internal/server/routers/test_gpus.py | 2 +- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/dstack/_internal/cli/commands/gpu.py b/src/dstack/_internal/cli/commands/gpu.py index f6f36cfa2c..3001780602 100644 --- a/src/dstack/_internal/cli/commands/gpu.py +++ b/src/dstack/_internal/cli/commands/gpu.py @@ -76,7 +76,7 @@ def _command(self, args: argparse.Namespace): status = contextlib.nullcontext() with status: - gpu_response = self.api.client.gpus.get_gpus( + gpu_response = self.api.client.gpus.list_gpus( self.api.project, run_spec, group_by=args.group_by, diff --git a/src/dstack/_internal/server/routers/gpus.py b/src/dstack/_internal/server/routers/gpus.py index 3d1508c899..521ace1594 100644 --- a/src/dstack/_internal/server/routers/gpus.py +++ b/src/dstack/_internal/server/routers/gpus.py @@ -5,9 +5,9 @@ from dstack._internal.server.db import get_session from dstack._internal.server.models import ProjectModel, UserModel -from dstack._internal.server.schemas.gpus import GetRunGpusRequest, RunGpusResponse +from dstack._internal.server.schemas.gpus import ListGpusRequest, ListGpusResponse from dstack._internal.server.security.permissions import ProjectMember -from dstack._internal.server.services.gpus import get_run_gpus_grouped +from dstack._internal.server.services.gpus import list_gpus_grouped from dstack._internal.server.utils.routers import get_base_api_additional_responses project_router = APIRouter( @@ -17,13 +17,13 @@ ) -@project_router.post("/list", response_model=RunGpusResponse, response_model_exclude_none=True) -async def get_run_gpus( - body: GetRunGpusRequest, +@project_router.post("/list", response_model=ListGpusResponse, 
response_model_exclude_none=True) +async def list_gpus( + body: ListGpusRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), -) -> RunGpusResponse: +) -> ListGpusResponse: _, project = user_project - return await get_run_gpus_grouped( + return await list_gpus_grouped( session=session, project=project, run_spec=body.run_spec, group_by=body.group_by ) diff --git a/src/dstack/_internal/server/schemas/gpus.py b/src/dstack/_internal/server/schemas/gpus.py index ea173f25fe..c0438fca16 100644 --- a/src/dstack/_internal/server/schemas/gpus.py +++ b/src/dstack/_internal/server/schemas/gpus.py @@ -30,8 +30,8 @@ class BackendGpus(CoreModel): regions: List[str] -class GetRunGpusRequest(CoreModel): - """Request for getting run GPUs with optional grouping.""" +class ListGpusRequest(CoreModel): + """Request for listing GPUs with optional grouping.""" run_spec: RunSpec group_by: Optional[List[Literal["backend", "region", "count"]]] = Field( @@ -56,7 +56,7 @@ class GpuGroup(CoreModel): region: Optional[str] = None -class RunGpusResponse(CoreModel): +class ListGpusResponse(CoreModel): """Response containing GPU specifications.""" gpus: List[GpuGroup] = Field( diff --git a/src/dstack/_internal/server/services/gpus.py b/src/dstack/_internal/server/services/gpus.py index 4d7bd6ba4d..3f0d1df577 100644 --- a/src/dstack/_internal/server/services/gpus.py +++ b/src/dstack/_internal/server/services/gpus.py @@ -12,7 +12,7 @@ BackendGpu, BackendGpus, GpuGroup, - RunGpusResponse, + ListGpusResponse, ) from dstack._internal.server.services.offers import get_offers_by_requirements @@ -354,12 +354,12 @@ def _get_gpus_grouped_by_backend_region_and_count( ) -async def get_run_gpus_grouped( +async def list_gpus_grouped( session: AsyncSession, project: ProjectModel, run_spec: RunSpec, group_by: Optional[List[Literal["backend", "region", "count"]]] = None, -) -> RunGpusResponse: +) -> ListGpusResponse: """Retrieves available GPU 
specifications based on a run spec, with optional grouping.""" offers = await _get_gpu_offers(session, project, run_spec) backend_gpus = _process_offers_into_backend_gpus(offers) @@ -383,4 +383,4 @@ async def get_run_gpus_grouped( else: gpus = _get_gpus_with_no_grouping(backend_gpus) - return RunGpusResponse(gpus=gpus) + return ListGpusResponse(gpus=gpus) diff --git a/src/dstack/api/server/_gpus.py b/src/dstack/api/server/_gpus.py index fc8c92f117..884c0feb3c 100644 --- a/src/dstack/api/server/_gpus.py +++ b/src/dstack/api/server/_gpus.py @@ -3,20 +3,20 @@ from pydantic import parse_obj_as from dstack._internal.core.models.runs import RunSpec -from dstack._internal.server.schemas.gpus import GetRunGpusRequest, RunGpusResponse +from dstack._internal.server.schemas.gpus import ListGpusRequest, ListGpusResponse from dstack.api.server._group import APIClientGroup class GpusAPIClient(APIClientGroup): - def get_gpus( + def list_gpus( self, project_name: str, run_spec: RunSpec, group_by: Optional[List[str]] = None, - ) -> RunGpusResponse: - body = GetRunGpusRequest(run_spec=run_spec, group_by=group_by) + ) -> ListGpusResponse: + body = ListGpusRequest(run_spec=run_spec, group_by=group_by) resp = self._request( f"/api/project/{project_name}/gpus/list", body=body.json(), ) - return parse_obj_as(RunGpusResponse, resp.json()) + return parse_obj_as(ListGpusResponse, resp.json()) diff --git a/src/tests/_internal/server/routers/test_gpus.py b/src/tests/_internal/server/routers/test_gpus.py index e8a0c07467..643cb4981e 100644 --- a/src/tests/_internal/server/routers/test_gpus.py +++ b/src/tests/_internal/server/routers/test_gpus.py @@ -109,7 +109,7 @@ async def call_gpus_api( ) -class TestGetRunGpus: +class TestListGpus: @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_returns_403_if_not_project_member( From f9744d4489db186a8fa8f8d1ca5647d0c5704688 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 18 Aug 2025 
14:54:47 +0200 Subject: [PATCH 11/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 Fixed group_by bug; also added validation (disallow grouping by region without backend) --- src/dstack/_internal/cli/commands/gpu.py | 3 +- src/dstack/_internal/server/schemas/gpus.py | 4 +- src/dstack/_internal/server/services/gpus.py | 88 +++++++++--------- .../_internal/server/routers/test_gpus.py | 90 ++++++++++++++++--- 4 files changed, 128 insertions(+), 57 deletions(-) diff --git a/src/dstack/_internal/cli/commands/gpu.py b/src/dstack/_internal/cli/commands/gpu.py index 3001780602..4519cf79ee 100644 --- a/src/dstack/_internal/cli/commands/gpu.py +++ b/src/dstack/_internal/cli/commands/gpu.py @@ -31,7 +31,8 @@ def register_args( "--group-by", action="append", choices=["backend", "region", "count"], - help="Group GPUs by backend, region, and/or count. Can be specified multiple times.", + help="Group GPUs by backend, region, and/or count. Can be specified multiple times. " + "Note: 'region' can only be used together with 'backend'.", ) diff --git a/src/dstack/_internal/server/schemas/gpus.py b/src/dstack/_internal/server/schemas/gpus.py index c0438fca16..beda7e43ec 100644 --- a/src/dstack/_internal/server/schemas/gpus.py +++ b/src/dstack/_internal/server/schemas/gpus.py @@ -20,6 +20,7 @@ class BackendGpu(CoreModel): spot: bool count: int price: float + region: str class BackendGpus(CoreModel): @@ -36,7 +37,8 @@ class ListGpusRequest(CoreModel): run_spec: RunSpec group_by: Optional[List[Literal["backend", "region", "count"]]] = Field( default=None, - description="List of fields to group by. Valid values: 'backend', 'region', 'count'", + description="List of fields to group by. Valid values: 'backend', 'region', 'count'. 
" + "Note: 'region' can only be used together with 'backend'.", ) diff --git a/src/dstack/_internal/server/services/gpus.py b/src/dstack/_internal/server/services/gpus.py index 3f0d1df577..0ec347be00 100644 --- a/src/dstack/_internal/server/services/gpus.py +++ b/src/dstack/_internal/server/services/gpus.py @@ -87,6 +87,7 @@ def _process_offers_into_backend_gpus( spot=offer.instance.resources.spot, count=gpu_count_in_offer, price=offer.price, + region=offer.region, ) backend_gpus_list = [] @@ -204,24 +205,23 @@ def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> """Aggregates GPU specs, grouping them by both backend and region.""" gpu_rows: Dict[Tuple, GpuGroup] = {} for backend in backend_gpus: - for region in backend.regions: - for gpu in backend.gpus: - key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, region) - if key not in gpu_rows: - per_gpu_price = gpu.price / gpu.count - gpu_rows[key] = GpuGroup( - name=gpu.name, - memory_mib=gpu.memory_mib, - vendor=gpu.vendor, - availability=[gpu.availability], - spot=["spot" if gpu.spot else "on-demand"], - count=Range[int](min=gpu.count, max=gpu.count), - price=Range[float](min=per_gpu_price, max=per_gpu_price), - backend=backend.backend_type, - region=region, - ) - else: - _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + for gpu in backend.gpus: + key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.region) + if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=per_gpu_price, max=per_gpu_price), + backend=backend.backend_type, + region=gpu.region, + ) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) return sorted( list(gpu_rows.values()), @@ -313,31 +313,30 @@ def 
_get_gpus_grouped_by_backend_region_and_count( """Aggregates GPU specs, grouping them by backend, region, and GPU count.""" gpu_rows: Dict[Tuple, GpuGroup] = {} for backend in backend_gpus: - for region in backend.regions: - for gpu in backend.gpus: - key = ( - gpu.name, - gpu.memory_mib, - gpu.vendor, - backend.backend_type, - region, - gpu.count, + for gpu in backend.gpus: + key = ( + gpu.name, + gpu.memory_mib, + gpu.vendor, + backend.backend_type, + gpu.region, + gpu.count, + ) + if key not in gpu_rows: + per_gpu_price = gpu.price / gpu.count + gpu_rows[key] = GpuGroup( + name=gpu.name, + memory_mib=gpu.memory_mib, + vendor=gpu.vendor, + availability=[gpu.availability], + spot=["spot" if gpu.spot else "on-demand"], + count=Range[int](min=gpu.count, max=gpu.count), + price=Range[float](min=per_gpu_price, max=per_gpu_price), + backend=backend.backend_type, + region=gpu.region, ) - if key not in gpu_rows: - per_gpu_price = gpu.price / gpu.count - gpu_rows[key] = GpuGroup( - name=gpu.name, - memory_mib=gpu.memory_mib, - vendor=gpu.vendor, - availability=[gpu.availability], - spot=["spot" if gpu.spot else "on-demand"], - count=Range[int](min=gpu.count, max=gpu.count), - price=Range[float](min=per_gpu_price, max=per_gpu_price), - backend=backend.backend_type, - region=region, - ) - else: - _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) + else: + _update_gpu_group(gpu_rows[key], gpu, backend.backend_type) return sorted( list(gpu_rows.values()), @@ -366,6 +365,11 @@ async def list_gpus_grouped( group_by_set = set(group_by) if group_by else set() + if "region" in group_by_set and "backend" not in group_by_set: + from dstack._internal.core.errors import ServerClientError + + raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'") + # Determine grouping strategy based on combination has_backend = "backend" in group_by_set has_region = "region" in group_by_set diff --git a/src/tests/_internal/server/routers/test_gpus.py 
b/src/tests/_internal/server/routers/test_gpus.py index 643cb4981e..8116e2ceba 100644 --- a/src/tests/_internal/server/routers/test_gpus.py +++ b/src/tests/_internal/server/routers/test_gpus.py @@ -198,6 +198,19 @@ async def test_invalid_group_by_rejected( assert response.status_code == 422 assert "validation error" in response.text.lower() or "invalid" in response.text.lower() + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_region_without_backend_rejected( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user, project, repo, run_spec = await gpu_test_setup(session) + + response = await call_gpus_api( + client, project.name, user.token, run_spec, group_by=["region"] + ) + + assert response.status_code == 400 + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_exact_aggregation_values( @@ -422,28 +435,79 @@ async def test_exact_aggregation_values( assert t4_runpod["price"] == {"min": 0.25, "max": 0.25} assert rtx_runpod["price"] == {"min": 0.65, "max": 0.75} - # Test region grouping to validate multi-region, multi-backend setup response_region = await client.post( f"/api/project/{project.name}/gpus/list", headers=get_auth_headers(user.token), - json={"run_spec": run_spec.dict(), "group_by": ["region"]}, + json={"run_spec": run_spec.dict(), "group_by": ["backend", "region"]}, ) assert response_region.status_code == 200 region_data = response_region.json() - assert len(region_data["gpus"]) == 2 + assert len(region_data["gpus"]) == 5 - t4_region_group = next( - (gpu for gpu in region_data["gpus"] if gpu["name"] == "T4"), None + t4_aws_uswest2 = next( + ( + gpu + for gpu in region_data["gpus"] + if gpu["name"] == "T4" + and gpu.get("backend") == "aws" + and gpu.get("region") == "us-west-2" + ), + None, ) - rtx_region_group = next( - (gpu for gpu in region_data["gpus"] if gpu["name"] == "RTX4090"), None + t4_runpod_useast1 = 
next( + ( + gpu + for gpu in region_data["gpus"] + if gpu["name"] == "T4" + and gpu.get("backend") == "runpod" + and gpu.get("region") == "us-east-1" + ), + None, ) - assert t4_region_group is not None - assert rtx_region_group is not None + rtx_runpod_useast1 = next( + ( + gpu + for gpu in region_data["gpus"] + if gpu["name"] == "RTX4090" + and gpu.get("backend") == "runpod" + and gpu.get("region") == "us-east-1" + ), + None, + ) + rtx_runpod_euwest1 = next( + ( + gpu + for gpu in region_data["gpus"] + if gpu["name"] == "RTX4090" + and gpu.get("backend") == "runpod" + and gpu.get("region") == "eu-west-1" + ), + None, + ) - assert set(t4_region_group["backends"]) == {"aws", "runpod"} - assert set(rtx_region_group["backends"]) == {"runpod"} - assert t4_region_group["price"] == {"min": 0.25, "max": 0.60} - assert rtx_region_group["price"] == {"min": 0.65, "max": 0.75} + assert t4_aws_uswest2 is not None + assert t4_runpod_useast1 is not None + assert rtx_runpod_useast1 is not None + assert rtx_runpod_euwest1 is not None + + assert t4_aws_uswest2["backend"] == "aws" + assert t4_aws_uswest2["region"] == "us-west-2" + assert t4_aws_uswest2["price"]["min"] == 0.30 + assert t4_aws_uswest2["price"]["max"] == 0.60 + + assert t4_runpod_useast1["backend"] == "runpod" + assert t4_runpod_useast1["region"] == "us-east-1" + assert t4_runpod_useast1["price"]["min"] == 0.25 + assert t4_runpod_useast1["price"]["max"] == 0.25 + + assert rtx_runpod_useast1["backend"] == "runpod" + assert rtx_runpod_useast1["region"] == "us-east-1" + assert rtx_runpod_useast1["price"]["min"] == 0.75 + assert rtx_runpod_useast1["price"]["max"] == 0.75 + + assert rtx_runpod_euwest1["backend"] == "runpod" + assert rtx_runpod_euwest1["region"] == "eu-west-1" + assert rtx_runpod_euwest1["price"]["min"] == 0.65 + assert rtx_runpod_euwest1["price"]["max"] == 0.65 From c8d6b3dcd8d8ba55597dd3e61e367ed8cb779a9f Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 18 Aug 2025 15:01:38 +0200 Subject: [PATCH 
12/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 Show an error if unknown arg is passed to `dstack gpu` --- src/dstack/_internal/cli/commands/gpu.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/dstack/_internal/cli/commands/gpu.py b/src/dstack/_internal/cli/commands/gpu.py index 4519cf79ee..4231a28aef 100644 --- a/src/dstack/_internal/cli/commands/gpu.py +++ b/src/dstack/_internal/cli/commands/gpu.py @@ -59,6 +59,14 @@ def _register(self): def _command(self, args: argparse.Namespace): super()._command(args) + + # Validate that no unknown arguments were passed + if hasattr(args, "unknown") and args.unknown: + from dstack._internal.core.errors import CLIError + + unknown_args_str = " ".join(args.unknown) + raise CLIError(f"Unrecognized arguments: {unknown_args_str}") + conf = TaskConfiguration(commands=[":"]) configurator = GpuConfigurator(api_client=self.api) From fdf6da6387b1348b6d27e2c0b52d0a594b136c24 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 18 Aug 2025 15:03:34 +0200 Subject: [PATCH 13/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 Reverted unneeded change --- src/dstack/_internal/server/services/runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dstack/_internal/server/services/runs.py b/src/dstack/_internal/server/services/runs.py index 9f24d6fcb3..81d34a2ae3 100644 --- a/src/dstack/_internal/server/services/runs.py +++ b/src/dstack/_internal/server/services/runs.py @@ -1285,7 +1285,7 @@ def is_replica_registered(jobs: list[JobModel]) -> bool: return jobs[0].registered -def _remove_job_spec_sensitive_info(spec: JobSpec) -> None: +def _remove_job_spec_sensitive_info(spec: JobSpec): spec.ssh_key = None From 305a36269b55d736d9fd63230c7d065b7b79b860 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Tue, 19 Aug 2025 14:17:33 +0200 Subject: [PATCH 14/16] [Feature]: Allow listing available key resources such as 
gpu, region, and backends #2142 Merged `dstack gpu` to `dstack offer` --- src/dstack/_internal/cli/commands/offer.py | 128 +++++++------ src/dstack/_internal/cli/utils/gpu.py | 210 +++++++++++++++++++++ src/dstack/_internal/cli/utils/run.py | 33 ++++ src/dstack/api/server/_gpus.py | 6 +- 4 files changed, 314 insertions(+), 63 deletions(-) create mode 100644 src/dstack/_internal/cli/utils/gpu.py diff --git a/src/dstack/_internal/cli/commands/offer.py b/src/dstack/_internal/cli/commands/offer.py index cb920ec275..0201ddc21a 100644 --- a/src/dstack/_internal/cli/commands/offer.py +++ b/src/dstack/_internal/cli/commands/offer.py @@ -1,29 +1,20 @@ import argparse -import contextlib -import json from pathlib import Path +from typing import List from dstack._internal.cli.commands import APIBaseCommand -from dstack._internal.cli.services.configurators.run import ( - BaseRunConfigurator, -) +from dstack._internal.cli.services.configurators.run import BaseRunConfigurator from dstack._internal.cli.utils.common import console -from dstack._internal.cli.utils.run import print_run_plan -from dstack._internal.core.models.configurations import ( - ApplyConfigurationType, - TaskConfiguration, -) +from dstack._internal.cli.utils.gpu import print_gpu_json, print_gpu_table +from dstack._internal.cli.utils.run import print_offers_json, print_run_plan +from dstack._internal.core.errors import CLIError +from dstack._internal.core.models.configurations import ApplyConfigurationType, TaskConfiguration from dstack._internal.core.models.runs import RunSpec +from dstack._internal.server.schemas.gpus import GpuGroup from dstack.api.utils import load_profile class OfferConfigurator(BaseRunConfigurator): - # TODO: The command currently uses `BaseRunConfigurator` to register arguments. - # This includes --env, --retry-policy, and other arguments that are unnecessary for this command. 
- # Eventually, we should introduce a base `OfferConfigurator` that doesn't include those arguments— - # `BaseRunConfigurator` will inherit from `OfferConfigurator`. - # - # Additionally, it should have its own type: `ApplyConfigurationType.OFFER`. TYPE = ApplyConfigurationType.TASK @classmethod @@ -32,10 +23,18 @@ def register_args( parser: argparse.ArgumentParser, ): super().register_args(parser, default_max_offers=50) + parser.add_argument( + "--group-by", + action="append", + help=( + "Group results by fields ([code]gpu[/code], [code]backend[/code], [code]region[/code], [code]count[/code]). " + "Optional, but if used, must include [code]gpu[/code]. " + "The use of [code]region[/code] also requires [code]backend[/code]. " + "Can be repeated or comma-separated (e.g. [code]--group-by gpu,backend[/code])." + ), + ) -# TODO: Support aggregated offers -# TODO: Add tests class OfferCommand(APIBaseCommand): NAME = "offer" DESCRIPTION = "List offers" @@ -70,49 +69,58 @@ def _command(self, args: argparse.Namespace): ssh_key_pub="(dummy)", profile=profile, ) + + if args.group_by: + args.group_by = self._process_group_by_args(args.group_by) + + if args.group_by and "gpu" not in args.group_by: + group_values = ", ".join(args.group_by) + raise CLIError(f"Cannot group by '{group_values}' without also grouping by 'gpu'") + if args.format == "plain": - status = console.status("Getting offers...") + with console.status("Getting offers..."): + if args.group_by: + gpus = self._list_gpus(args, run_spec) + print_gpu_table(gpus, run_spec, args.group_by, self.api.project) + else: + run_plan = self.api.client.runs.get_plan( + self.api.project, + run_spec, + max_offers=args.max_offers, + ) + print_run_plan(run_plan, include_run_properties=False) else: - status = contextlib.nullcontext() - with status: - run_plan = self.api.client.runs.get_plan( - self.api.project, - run_spec, - max_offers=args.max_offers, - ) - - job_plan = run_plan.job_plans[0] - - if args.format == "json": - # FIXME: 
Should use effective_run_spec from run_plan, - # since the spec can be changed by the server and plugins - output = { - "project": run_plan.project_name, - "user": run_plan.user, - "resources": job_plan.job_spec.requirements.resources.dict(), - "max_price": (job_plan.job_spec.requirements.max_price), - "spot": run_spec.configuration.spot_policy, - "reservation": run_plan.run_spec.configuration.reservation, - "offers": [], - "total_offers": job_plan.total_offers, - } - - for offer in job_plan.offers: - output["offers"].append( - { - "backend": ( - "ssh" if offer.backend.value == "remote" else offer.backend.value - ), - "region": offer.region, - "instance_type": offer.instance.name, - "resources": offer.instance.resources.dict(), - "spot": offer.instance.resources.spot, - "price": float(offer.price), - "availability": offer.availability.value, - } + if args.group_by: + gpus = self._list_gpus(args, run_spec) + print_gpu_json(gpus, run_spec, args.group_by, self.api.project) + else: + run_plan = self.api.client.runs.get_plan( + self.api.project, + run_spec, + max_offers=args.max_offers, ) + print_offers_json(run_plan, run_spec) - print(json.dumps(output, indent=2)) - return - else: - print_run_plan(run_plan, include_run_properties=False) + def _process_group_by_args(self, group_by_args: List[str]) -> List[str]: + valid_choices = {"gpu", "backend", "region", "count"} + processed = [] + + for arg in group_by_args: + values = [v.strip() for v in arg.split(",") if v.strip()] + for value in values: + if value in valid_choices: + processed.append(value) + else: + raise CLIError( + f"Invalid group-by value: '{value}'. 
Valid choices are: {', '.join(sorted(valid_choices))}" + ) + + return processed + + def _list_gpus(self, args: List[str], run_spec: RunSpec) -> List[GpuGroup]: + group_by = [g for g in args.group_by if g != "gpu"] or None + return self.api.client.gpus.list_gpus( + self.api.project, + run_spec, + group_by=group_by, + ) diff --git a/src/dstack/_internal/cli/utils/gpu.py b/src/dstack/_internal/cli/utils/gpu.py new file mode 100644 index 0000000000..1edda81767 --- /dev/null +++ b/src/dstack/_internal/cli/utils/gpu.py @@ -0,0 +1,210 @@ +import shutil +from typing import List + +from rich.table import Table + +from dstack._internal.cli.utils.common import console +from dstack._internal.core.models.profiles import SpotPolicy +from dstack._internal.core.models.runs import Requirements, RunSpec, get_policy_map +from dstack._internal.server.schemas.gpus import GpuGroup + + +def print_gpu_json(gpu_response, run_spec, group_by_cli, api_project): + """Print GPU information in JSON format.""" + req = Requirements( + resources=run_spec.configuration.resources, + max_price=run_spec.merged_profile.max_price, + spot=get_policy_map(run_spec.merged_profile.spot_policy, default=SpotPolicy.AUTO), + reservation=run_spec.configuration.reservation, + ) + + if req.spot is None: + spot_policy = "auto" + elif req.spot: + spot_policy = "spot" + else: + spot_policy = "on-demand" + + output = { + "project": api_project, + "user": "admin", # TODO: Get actual user name + "resources": req.resources.dict(), + "spot_policy": spot_policy, + "max_price": req.max_price, + "reservation": run_spec.configuration.reservation, + "group_by": group_by_cli, + "gpus": [], + } + + for gpu_group in gpu_response.gpus: + gpu_data = { + "name": gpu_group.name, + "memory_mib": gpu_group.memory_mib, + "vendor": gpu_group.vendor.value, + "availability": [av.value for av in gpu_group.availability], + "spot": gpu_group.spot, + "count": {"min": gpu_group.count.min, "max": gpu_group.count.max}, + "price": {"min": 
gpu_group.price.min, "max": gpu_group.price.max}, + } + + if gpu_group.backend: + gpu_data["backend"] = gpu_group.backend.value + if gpu_group.backends: + gpu_data["backends"] = [b.value for b in gpu_group.backends] + if gpu_group.region: + gpu_data["region"] = gpu_group.region + if gpu_group.regions: + gpu_data["regions"] = gpu_group.regions + + output["gpus"].append(gpu_data) + + import json + + print(json.dumps(output, indent=2)) + + +def print_gpu_table(gpus: List[GpuGroup], run_spec: RunSpec, group_by: List[str], project: str): + """Print GPU information in a formatted table.""" + print_filter_info(run_spec, group_by, project) + + has_single_backend = any(gpu_group.backend for gpu_group in gpus) + has_single_region = any(gpu_group.region for gpu_group in gpus) + has_multiple_regions = any(gpu_group.regions for gpu_group in gpus) + + if has_single_backend and has_single_region: + backend_column = "BACKEND" + region_column = "REGION" + elif has_single_backend and has_multiple_regions: + backend_column = "BACKEND" + region_column = "REGIONS" + else: + backend_column = "BACKENDS" + region_column = None + + table = Table(box=None, expand=shutil.get_terminal_size(fallback=(120, 40)).columns <= 110) + table.add_column("#") + table.add_column("GPU", no_wrap=True, ratio=2) + table.add_column("SPOT", style="grey58", ratio=1) + table.add_column("$/GPU", style="grey58", ratio=1) + table.add_column(backend_column, style="grey58", ratio=2) + if region_column: + table.add_column(region_column, style="grey58", ratio=2) + table.add_column() + + for i, gpu_group in enumerate(gpus, start=1): + backend_text = "" + if gpu_group.backend: + backend_text = gpu_group.backend.value + elif gpu_group.backends: + backend_text = ", ".join(b.value for b in gpu_group.backends) + + region_text = "" + if gpu_group.region: + region_text = gpu_group.region + elif gpu_group.regions: + if len(gpu_group.regions) <= 3: + region_text = ", ".join(gpu_group.regions) + else: + region_text = 
f"{len(gpu_group.regions)} regions" + + if not region_column: + if gpu_group.regions and len(gpu_group.regions) > 3: + shortened_region_text = f"{len(gpu_group.regions)} regions" + backends_display = ( + f"{backend_text} ({shortened_region_text})" + if shortened_region_text + else backend_text + ) + else: + backends_display = ( + f"{backend_text} ({region_text})" if region_text else backend_text + ) + else: + backends_display = backend_text + + memory_gb = f"{gpu_group.memory_mib // 1024}GB" + if gpu_group.count.min == gpu_group.count.max: + count_range = str(gpu_group.count.min) + else: + count_range = f"{gpu_group.count.min}..{gpu_group.count.max}" + + gpu_spec = f"{gpu_group.name}:{memory_gb}:{count_range}" + + spot_types = [] + if "spot" in gpu_group.spot: + spot_types.append("spot") + if "on-demand" in gpu_group.spot: + spot_types.append("on-demand") + spot_display = ", ".join(spot_types) + + if gpu_group.price.min == gpu_group.price.max: + price_display = f"{gpu_group.price.min:.4f}".rstrip("0").rstrip(".") + else: + min_formatted = f"{gpu_group.price.min:.4f}".rstrip("0").rstrip(".") + max_formatted = f"{gpu_group.price.max:.4f}".rstrip("0").rstrip(".") + price_display = f"{min_formatted}..{max_formatted}" + + availability = "" + has_available = any(av.is_available() for av in gpu_group.availability) + has_unavailable = any(not av.is_available() for av in gpu_group.availability) + + if has_unavailable and not has_available: + for av in gpu_group.availability: + if av.value in {"not_available", "no_quota", "idle", "busy"}: + availability = av.value.replace("_", " ").lower() + break + + secondary_style = "grey58" + row_data = [ + f"[{secondary_style}]{i}[/]", + gpu_spec, + f"[{secondary_style}]{spot_display}[/]", + f"[{secondary_style}]{price_display}[/]", + f"[{secondary_style}]{backends_display}[/]", + ] + if region_column: + row_data.append(f"[{secondary_style}]{region_text}[/]") + row_data.append(f"[{secondary_style}]{availability}[/]") + + 
table.add_row(*row_data) + + console.print(table) + + +def print_filter_info(run_spec: RunSpec, group_by: List[str], project: str): + """Print filter information for GPU display.""" + props = Table(box=None, show_header=False) + props.add_column(no_wrap=True) + props.add_column() + + req = Requirements( + resources=run_spec.configuration.resources, + max_price=run_spec.merged_profile.max_price, + spot=get_policy_map(run_spec.merged_profile.spot_policy, default=SpotPolicy.AUTO), + reservation=run_spec.merged_profile.reservation, + ) + + pretty_req = req.pretty_format(resources_only=True) + max_price = f"${req.max_price:3f}".rstrip("0").rstrip(".") if req.max_price else "-" + + if req.spot is None: + spot_policy = "auto" + elif req.spot: + spot_policy = "spot" + else: + spot_policy = "on-demand" + + def th(s: str) -> str: + return f"[bold]{s}[/bold]" + + props.add_row(th("Project"), project) + props.add_row(th("User"), "admin") # TODO: Get actual user name + props.add_row(th("Resources"), pretty_req) + props.add_row(th("Spot policy"), spot_policy) + props.add_row(th("Max price"), max_price) + props.add_row(th("Reservation"), run_spec.configuration.reservation or "-") + if group_by: + props.add_row(th("Group by"), ", ".join(group_by)) + + console.print(props) + console.print() diff --git a/src/dstack/_internal/cli/utils/run.py b/src/dstack/_internal/cli/utils/run.py index f307912372..58497c0848 100644 --- a/src/dstack/_internal/cli/utils/run.py +++ b/src/dstack/_internal/cli/utils/run.py @@ -28,6 +28,39 @@ from dstack.api import Run +def print_offers_json(run_plan: RunPlan, run_spec): + """Print offers information in JSON format.""" + job_plan = run_plan.job_plans[0] + + output = { + "project": run_plan.project_name, + "user": run_plan.user, + "resources": job_plan.job_spec.requirements.resources.dict(), + "max_price": (job_plan.job_spec.requirements.max_price), + "spot": run_spec.configuration.spot_policy, + "reservation": run_plan.run_spec.configuration.reservation, 
+ "offers": [], + "total_offers": job_plan.total_offers, + } + + for offer in job_plan.offers: + output["offers"].append( + { + "backend": ("ssh" if offer.backend.value == "remote" else offer.backend.value), + "region": offer.region, + "instance_type": offer.instance.name, + "resources": offer.instance.resources.dict(), + "spot": offer.instance.resources.spot, + "price": float(offer.price), + "availability": offer.availability.value, + } + ) + + import json + + print(json.dumps(output, indent=2)) + + def print_run_plan( run_plan: RunPlan, max_offers: Optional[int] = None, include_run_properties: bool = True ): diff --git a/src/dstack/api/server/_gpus.py b/src/dstack/api/server/_gpus.py index 884c0feb3c..b30ae8a894 100644 --- a/src/dstack/api/server/_gpus.py +++ b/src/dstack/api/server/_gpus.py @@ -3,7 +3,7 @@ from pydantic import parse_obj_as from dstack._internal.core.models.runs import RunSpec -from dstack._internal.server.schemas.gpus import ListGpusRequest, ListGpusResponse +from dstack._internal.server.schemas.gpus import GpuGroup, ListGpusRequest, ListGpusResponse from dstack.api.server._group import APIClientGroup @@ -13,10 +13,10 @@ def list_gpus( project_name: str, run_spec: RunSpec, group_by: Optional[List[str]] = None, - ) -> ListGpusResponse: + ) -> List[GpuGroup]: body = ListGpusRequest(run_spec=run_spec, group_by=group_by) resp = self._request( f"/api/project/{project_name}/gpus/list", body=body.json(), ) - return parse_obj_as(ListGpusResponse, resp.json()) + return parse_obj_as(ListGpusResponse, resp.json()).gpus From 565e54d4e25cdd733d776d45977cd3582c7a5ffc Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Tue, 19 Aug 2025 14:21:40 +0200 Subject: [PATCH 15/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 Removed `dstack gpu` --- src/dstack/_internal/cli/commands/gpu.py | 289 ----------------------- src/dstack/_internal/cli/main.py | 2 - 2 files changed, 291 deletions(-) delete mode 100644 
src/dstack/_internal/cli/commands/gpu.py diff --git a/src/dstack/_internal/cli/commands/gpu.py b/src/dstack/_internal/cli/commands/gpu.py deleted file mode 100644 index 4231a28aef..0000000000 --- a/src/dstack/_internal/cli/commands/gpu.py +++ /dev/null @@ -1,289 +0,0 @@ -import argparse -import contextlib -import json -import shutil -from pathlib import Path - -from rich.table import Table - -from dstack._internal.cli.commands import APIBaseCommand -from dstack._internal.cli.services.configurators.run import BaseRunConfigurator -from dstack._internal.cli.utils.common import console -from dstack._internal.core.models.configurations import ( - ApplyConfigurationType, - TaskConfiguration, -) -from dstack._internal.core.models.profiles import SpotPolicy -from dstack._internal.core.models.runs import Requirements, RunSpec, get_policy_map -from dstack.api.utils import load_profile - - -class GpuConfigurator(BaseRunConfigurator): - TYPE = ApplyConfigurationType.TASK - - @classmethod - def register_args( - cls, - parser: argparse.ArgumentParser, - ): - super().register_args(parser, default_max_offers=50) - parser.add_argument( - "--group-by", - action="append", - choices=["backend", "region", "count"], - help="Group GPUs by backend, region, and/or count. Can be specified multiple times. 
" - "Note: 'region' can only be used together with 'backend'.", - ) - - -class GpuCommand(APIBaseCommand): - NAME = "gpu" - DESCRIPTION = "List available GPUs" - - def _register(self): - super()._register() - self._parser.add_argument( - "--format", - choices=["plain", "json"], - default="plain", - help="Output format (default: plain)", - ) - self._parser.add_argument( - "--json", - action="store_const", - const="json", - dest="format", - help="Output in JSON format (equivalent to --format json)", - ) - GpuConfigurator.register_args(self._parser) - - def _command(self, args: argparse.Namespace): - super()._command(args) - - # Validate that no unknown arguments were passed - if hasattr(args, "unknown") and args.unknown: - from dstack._internal.core.errors import CLIError - - unknown_args_str = " ".join(args.unknown) - raise CLIError(f"Unrecognized arguments: {unknown_args_str}") - - conf = TaskConfiguration(commands=[":"]) - - configurator = GpuConfigurator(api_client=self.api) - configurator.apply_args(conf, args, []) - profile = load_profile(Path.cwd(), profile_name=args.profile) - - run_spec = RunSpec( - configuration=conf, - ssh_key_pub="(dummy)", - profile=profile, - ) - - if args.format == "plain": - status = console.status("Getting GPU information...") - else: - status = contextlib.nullcontext() - - with status: - gpu_response = self.api.client.gpus.list_gpus( - self.api.project, - run_spec, - group_by=args.group_by, - ) - - if args.format == "json": - req = Requirements( - resources=run_spec.configuration.resources, - max_price=run_spec.merged_profile.max_price, - spot=get_policy_map(run_spec.merged_profile.spot_policy, default=SpotPolicy.AUTO), - reservation=run_spec.merged_profile.reservation, - ) - - if req.spot is None: - spot_policy = "auto" - elif req.spot: - spot_policy = "spot" - else: - spot_policy = "on-demand" - - output = { - "project": self.api.project, - "user": "admin", # TODO: Get actual user name - "resources": req.resources.dict(), - 
"spot_policy": spot_policy, - "max_price": req.max_price, - "reservation": run_spec.configuration.reservation, - "group_by": args.group_by, - "gpus": [], - } - - for gpu_group in gpu_response.gpus: - gpu_data = { - "name": gpu_group.name, - "memory_mib": gpu_group.memory_mib, - "vendor": gpu_group.vendor.value, - "availability": [av.value for av in gpu_group.availability], - "spot": gpu_group.spot, - "count": {"min": gpu_group.count.min, "max": gpu_group.count.max}, - "price": {"min": gpu_group.price.min, "max": gpu_group.price.max}, - } - - if gpu_group.backend: - gpu_data["backend"] = gpu_group.backend.value - if gpu_group.backends: - gpu_data["backends"] = [b.value for b in gpu_group.backends] - if gpu_group.region: - gpu_data["region"] = gpu_group.region - if gpu_group.regions: - gpu_data["regions"] = gpu_group.regions - - output["gpus"].append(gpu_data) - - print(json.dumps(output, indent=2)) - return - else: - self._print_gpu_table(gpu_response, run_spec, args.group_by) - - def _print_gpu_table(self, gpu_response, run_spec, group_by): - self._print_filter_info(run_spec, group_by) - - has_single_backend = any(gpu_group.backend for gpu_group in gpu_response.gpus) - has_single_region = any(gpu_group.region for gpu_group in gpu_response.gpus) - has_multiple_regions = any(gpu_group.regions for gpu_group in gpu_response.gpus) - - if has_single_backend and has_single_region: - backend_column = "BACKEND" - region_column = "REGION" - elif has_single_backend and has_multiple_regions: - backend_column = "BACKEND" - region_column = "REGIONS" - else: - backend_column = "BACKENDS" - region_column = None - - table = Table(box=None, expand=shutil.get_terminal_size(fallback=(120, 40)).columns <= 110) - table.add_column("#") - table.add_column("GPU", no_wrap=True, ratio=2) - table.add_column("SPOT", style="grey58", ratio=1) - table.add_column("$/GPU", style="grey58", ratio=1) - table.add_column(backend_column, style="grey58", ratio=2) - if region_column: - 
table.add_column(region_column, style="grey58", ratio=2) - table.add_column() - - for i, gpu_group in enumerate(gpu_response.gpus, start=1): - backend_text = "" - if gpu_group.backend: - backend_text = gpu_group.backend.value - elif gpu_group.backends: - backend_text = ", ".join(b.value for b in gpu_group.backends) - - region_text = "" - if gpu_group.region: - region_text = gpu_group.region - elif gpu_group.regions: - if len(gpu_group.regions) <= 3: - region_text = ", ".join(gpu_group.regions) - else: - region_text = f"{len(gpu_group.regions)} regions" - - if not region_column: - if gpu_group.regions and len(gpu_group.regions) > 3: - shortened_region_text = f"{len(gpu_group.regions)} regions" - backends_display = ( - f"{backend_text} ({shortened_region_text})" - if shortened_region_text - else backend_text - ) - else: - backends_display = ( - f"{backend_text} ({region_text})" if region_text else backend_text - ) - else: - backends_display = backend_text - - memory_gb = f"{gpu_group.memory_mib // 1024}GB" - if gpu_group.count.min == gpu_group.count.max: - count_range = str(gpu_group.count.min) - else: - count_range = f"{gpu_group.count.min}..{gpu_group.count.max}" - - # Always include count in GPU spec format: :: - gpu_spec = f"{gpu_group.name}:{memory_gb}:{count_range}" - - spot_types = [] - if "spot" in gpu_group.spot: - spot_types.append("spot") - if "on-demand" in gpu_group.spot: - spot_types.append("on-demand") - spot_display = ", ".join(spot_types) - - if gpu_group.price.min == gpu_group.price.max: - price_display = f"{gpu_group.price.min:.4f}".rstrip("0").rstrip(".") - else: - min_formatted = f"{gpu_group.price.min:.4f}".rstrip("0").rstrip(".") - max_formatted = f"{gpu_group.price.max:.4f}".rstrip("0").rstrip(".") - price_display = f"{min_formatted}..{max_formatted}" - - availability = "" - has_available = any(av.is_available() for av in gpu_group.availability) - has_unavailable = any(not av.is_available() for av in gpu_group.availability) - - if 
has_unavailable and not has_available: - for av in gpu_group.availability: - if av.value in {"not_available", "no_quota", "idle", "busy"}: - availability = av.value.replace("_", " ").lower() - break - - secondary_style = "grey58" - row_data = [ - f"[{secondary_style}]{i}[/]", - gpu_spec, - f"[{secondary_style}]{spot_display}[/]", - f"[{secondary_style}]{price_display}[/]", - f"[{secondary_style}]{backends_display}[/]", - ] - if region_column: - row_data.append(f"[{secondary_style}]{region_text}[/]") - row_data.append(f"[{secondary_style}]{availability}[/]") - - table.add_row(*row_data) - - console.print(table) - - def _print_filter_info(self, run_spec, group_by): - props = Table(box=None, show_header=False) - props.add_column(no_wrap=True) - props.add_column() - - req = Requirements( - resources=run_spec.configuration.resources, - max_price=run_spec.merged_profile.max_price, - spot=get_policy_map(run_spec.merged_profile.spot_policy, default=SpotPolicy.AUTO), - reservation=run_spec.merged_profile.reservation, - ) - - pretty_req = req.pretty_format(resources_only=True) - max_price = f"${req.max_price:3f}".rstrip("0").rstrip(".") if req.max_price else "-" - - if req.spot is None: - spot_policy = "auto" - elif req.spot: - spot_policy = "spot" - else: - spot_policy = "on-demand" - - def th(s: str) -> str: - return f"[bold]{s}[/bold]" - - props.add_row(th("Project"), self.api.project) - props.add_row(th("User"), "admin") # TODO: Get actual user name - props.add_row(th("Resources"), pretty_req) - props.add_row(th("Spot policy"), spot_policy) - props.add_row(th("Max price"), max_price) - props.add_row(th("Reservation"), run_spec.configuration.reservation or "-") - if group_by: - props.add_row(th("Group by"), ", ".join(group_by)) - - console.print(props) - console.print() diff --git a/src/dstack/_internal/cli/main.py b/src/dstack/_internal/cli/main.py index c98d5526b1..c91d0f2feb 100644 --- a/src/dstack/_internal/cli/main.py +++ b/src/dstack/_internal/cli/main.py @@ -11,7 
+11,6 @@ from dstack._internal.cli.commands.delete import DeleteCommand from dstack._internal.cli.commands.fleet import FleetCommand from dstack._internal.cli.commands.gateway import GatewayCommand -from dstack._internal.cli.commands.gpu import GpuCommand from dstack._internal.cli.commands.init import InitCommand from dstack._internal.cli.commands.logs import LogsCommand from dstack._internal.cli.commands.metrics import MetricsCommand @@ -69,7 +68,6 @@ def main(): FleetCommand.register(subparsers) GatewayCommand.register(subparsers) InitCommand.register(subparsers) - GpuCommand.register(subparsers) OfferCommand.register(subparsers) LogsCommand.register(subparsers) MetricsCommand.register(subparsers) From 83b1523be71d7eb553a2f7b2e1f38f07408cda45 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Tue, 19 Aug 2025 15:43:49 +0200 Subject: [PATCH 16/16] [Feature]: Allow listing available key resources such as gpu, region, and backends #2142 Documented `--group-by` Plus, a minor fix; --- docs/docs/guides/protips.md | 30 ++++++++++++++++++++++ docs/docs/reference/cli/dstack/offer.md | 34 ++++++++++++++++++++++--- src/dstack/_internal/cli/utils/gpu.py | 2 +- 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/docs/docs/guides/protips.md b/docs/docs/guides/protips.md index f4f35e27a7..f51cc4777b 100644 --- a/docs/docs/guides/protips.md +++ b/docs/docs/guides/protips.md @@ -427,6 +427,36 @@ Getting offers... +??? info "Grouping offers" + Use `--group-by` to aggregate offers. Accepted values: `gpu`, `backend`, `region`, and `count`. + +
+ + ```shell + dstack offer --gpu b200 --group-by gpu,backend,region + Project main + User admin + Resources cpu=2.. mem=8GB.. disk=100GB.. b200:1.. + Spot policy auto + Max price - + Reservation - + Group by gpu, backend, region + + # GPU SPOT $/GPU BACKEND REGION + 1 B200:180GB:1..8 spot, on-demand 3.59..5.99 runpod EU-RO-1 + 2 B200:180GB:1..8 spot, on-demand 3.59..5.99 runpod US-CA-2 + 3 B200:180GB:8 on-demand 4.99 lambda us-east-1 + 4 B200:180GB:8 on-demand 5.5 nebius us-central1 + ``` + +
+ + When using `--group-by`, `gpu` must always be included. + The `region` value can only be used together with `backend`. + +The `offer` command allows you to filter and group offers with various [advanced options](../reference/cli/dstack/offer.md#usage). + + ## Metrics `dstack` tracks essential metrics accessible via the CLI and UI. To access advanced metrics like DCGM, configure the server to export metrics to Prometheus. See [Metrics](metrics.md) for details. diff --git a/docs/docs/reference/cli/dstack/offer.md b/docs/docs/reference/cli/dstack/offer.md index d4e450090f..fb56e73a41 100644 --- a/docs/docs/reference/cli/dstack/offer.md +++ b/docs/docs/reference/cli/dstack/offer.md @@ -1,8 +1,8 @@ # dstack offer -Displays available offers (hardware configurations) with the configured backends (or offers that match already provisioned fleets). +Displays available offers (hardware configurations) from configured backends or from fleets you’ve already provisioned. Supports filtering and grouping. -The output includes backend, region, instance type, resources, spot availability, and pricing details. +The output shows backend, region, instance type, resources, spot availability, and pricing. ## Usage @@ -19,7 +19,7 @@ $ dstack offer --help ## Examples -### List GPU offers +### Filtering offers The `--gpu` flag accepts the same specification format as the `gpu` property in [`dev environment`](../../../concepts/dev-environments.md), [`task`](../../../concepts/tasks.md), [`service`](../../../concepts/services.md), and [`fleet`](../../../concepts/fleets.md) configurations. @@ -71,6 +71,34 @@ Getting offers... +### Grouping offers + +Use `--group-by` to aggregate offers. Accepted values: `gpu`, `backend`, `region`, and `count`. + +
+ +```shell +dstack offer --gpu b200 --group-by gpu,backend,region + Project main + User admin + Resources cpu=2.. mem=8GB.. disk=100GB.. b200:1.. + Spot policy auto + Max price - + Reservation - + Group by gpu, backend, region + + # GPU SPOT $/GPU BACKEND REGION + 1 B200:180GB:1..8 spot, on-demand 3.59..5.99 runpod EU-RO-1 + 2 B200:180GB:1..8 spot, on-demand 3.59..5.99 runpod US-CA-2 + 3 B200:180GB:8 on-demand 4.99 lambda us-east-1 + 4 B200:180GB:8 on-demand 5.5 nebius us-central1 +``` + +
+ +When using `--group-by`, `gpu` must always be included. +The `region` value can only be used together with `backend`. + ### JSON format Use `--json` to output offers in the JSON format. diff --git a/src/dstack/_internal/cli/utils/gpu.py b/src/dstack/_internal/cli/utils/gpu.py index 1edda81767..fcb138b162 100644 --- a/src/dstack/_internal/cli/utils/gpu.py +++ b/src/dstack/_internal/cli/utils/gpu.py @@ -198,7 +198,7 @@ def th(s: str) -> str: return f"[bold]{s}[/bold]" props.add_row(th("Project"), project) - props.add_row(th("User"), "admin") # TODO: Show user name props.add_row(th("Resources"), pretty_req) props.add_row(th("Spot policy"), spot_policy) props.add_row(th("Max price"), max_price)