Skip to content

Commit 08793c1

Browse files
committed
Check if run cannot fit into fleet
1 parent 9d1e053 commit 08793c1

2 files changed

Lines changed: 110 additions & 22 deletions

File tree

src/dstack/_internal/server/background/tasks/process_submitted_jobs.py

Lines changed: 55 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
260260

261261
instance_filters = [
262262
InstanceModel.deleted == False,
263-
InstanceModel.total_blocks > InstanceModel.busy_blocks,
264263
InstanceModel.id.not_in(detaching_instances_ids),
265264
]
266265

@@ -514,9 +513,6 @@ async def _find_optimal_fleet_with_offers(
514513
)
515514
return run_model.fleet, fleet_instances_with_pool_offers
516515

517-
if len(fleet_models) == 0:
518-
return None, []
519-
520516
nodes_required_num = _get_nodes_required_num_for_run(run_spec)
521517
# The current strategy is first to consider fleets that can accommodate
522518
# the run without additional provisioning and choose the one with the cheapest pool offer.
@@ -534,31 +530,29 @@ async def _find_optimal_fleet_with_offers(
534530
]
535531
] = []
536532
for candidate_fleet_model in fleet_models:
533+
candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
537534
fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
538535
fleet_model=candidate_fleet_model,
539536
run_spec=run_spec,
540537
job=job,
541538
master_job_provisioning_data=master_job_provisioning_data,
542539
volumes=volumes,
543540
)
544-
fleet_has_available_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
541+
fleet_has_pool_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
545542
fleet_cheapest_pool_offer = math.inf
546543
if len(fleet_instances_with_pool_offers) > 0:
547544
fleet_cheapest_pool_offer = fleet_instances_with_pool_offers[0][1].price
548545

549-
candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
550-
profile = None
551-
requirements = None
552546
try:
547+
_check_can_create_new_instance_in_fleet(candidate_fleet)
553548
profile, requirements = _get_run_profile_and_requirements_in_fleet(
554549
job=job,
555550
run_spec=run_spec,
556551
fleet=candidate_fleet,
557552
)
558553
except ValueError:
559-
pass
560-
fleet_backend_offers = []
561-
if profile is not None and requirements is not None:
554+
fleet_backend_offers = []
555+
else:
562556
multinode = (
563557
candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
564558
or job.job_spec.jobs_per_replica > 1
@@ -579,8 +573,12 @@ async def _find_optimal_fleet_with_offers(
579573
if len(fleet_backend_offers) > 0:
580574
fleet_cheapest_backend_offer = fleet_backend_offers[0][1].price
581575

576+
if not _run_can_fit_into_fleet(run_spec, candidate_fleet):
577+
logger.debug("Skipping fleet %s from consideration: run cannot fit into fleet")
578+
continue
579+
582580
fleet_priority = (
583-
not fleet_has_available_capacity,
581+
not fleet_has_pool_capacity,
584582
fleet_cheapest_pool_offer,
585583
fleet_cheapest_backend_offer,
586584
)
@@ -593,10 +591,13 @@ async def _find_optimal_fleet_with_offers(
593591
fleet_priority,
594592
)
595593
)
594+
if len(candidate_fleets_with_offers) == 0:
595+
return None, []
596596
if run_spec.merged_profile.fleets is None and all(
597597
t[2] == 0 and t[3] == 0 for t in candidate_fleets_with_offers
598598
):
599-
# If fleets are not specified and no fleets have available pool or backend offers, create a new fleet.
599+
# If fleets are not specified and no fleets have available pool
600+
# or backend offers, create a new fleet.
600601
# This is for compatibility with non-fleet-first UX when runs created new fleets
601602
# if there are no instances to reuse.
602603
return None, []
@@ -616,6 +617,31 @@ def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
616617
return nodes_required_num
617618

618619

620+
def _run_can_fit_into_fleet(run_spec: RunSpec, fleet: Fleet) -> bool:
621+
"""
622+
Returns `False` only if the run certainly cannot fit into the fleet.
623+
This is a helpful heuristic to avoid even considering fleets too small for a run.
624+
A run may not fit even if this function returns `True`.
625+
When that happens, some jobs may fail due to exceeding `nodes.max`,
626+
or more than `nodes.max` instances may be provisioned
627+
and eventually removed by the fleet consolidation logic.
628+
"""
629+
# No check for cloud fleets with blocks > 1 since we don't know
630+
# how many jobs such fleets can accommodate.
631+
# TODO: Check if cannot fit into SSH fleet.
632+
nodes_required_num = _get_nodes_required_num_for_run(run_spec)
633+
if (
634+
fleet.spec.configuration.nodes is not None
635+
and fleet.spec.configuration.blocks == 1
636+
and fleet.spec.configuration.nodes.max is not None
637+
):
638+
busy_instances = [i for i in fleet.instances if i.busy_blocks > 0]
639+
fleet_available_capacity = fleet.spec.configuration.nodes.max - len(busy_instances)
640+
if fleet_available_capacity < nodes_required_num:
641+
return False
642+
return True
643+
644+
619645
def _get_fleet_instances_with_pool_offers(
620646
fleet_model: FleetModel,
621647
run_spec: RunSpec,
@@ -713,6 +739,7 @@ async def _run_job_on_new_instance(
713739
if fleet_model is not None:
714740
fleet = fleet_model_to_fleet(fleet_model)
715741
try:
742+
_check_can_create_new_instance_in_fleet(fleet)
716743
profile, requirements = _get_run_profile_and_requirements_in_fleet(
717744
job=job,
718745
run_spec=run.run_spec,
@@ -787,8 +814,6 @@ def _get_run_profile_and_requirements_in_fleet(
787814
run_spec: RunSpec,
788815
fleet: Fleet,
789816
) -> tuple[Profile, Requirements]:
790-
if not _check_can_create_new_instance_in_fleet(fleet):
791-
raise ValueError("Cannot fit new instance into fleet")
792817
profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, run_spec.merged_profile)
793818
if profile is None:
794819
raise ValueError("Cannot combine fleet profile")
@@ -801,13 +826,23 @@ def _get_run_profile_and_requirements_in_fleet(
801826
return profile, requirements
802827

803828

804-
def _check_can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
829+
def _check_can_create_new_instance_in_fleet(fleet: Fleet):
830+
if not _can_create_new_instance_in_fleet(fleet):
831+
raise ValueError("Cannot fit new instance into fleet")
832+
833+
834+
def _can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
805835
if fleet.spec.configuration.ssh_config is not None:
806836
return False
807-
# TODO: Respect nodes.max
808-
# Ensure concurrent provisioning does not violate nodes.max
809-
# E.g. lock fleet and split instance model creation
810-
# and instance provisioning into separate transactions.
837+
active_instances = [i for i in fleet.instances if i.status.is_active()]
838+
# nodes.max is a soft limit that can be exceeded when provisioning concurrently.
839+
# The fleet consolidation logic will remove redundant nodes eventually.
840+
if (
841+
fleet.spec.configuration.nodes is not None
842+
and fleet.spec.configuration.nodes.max is not None
843+
and len(active_instances) >= fleet.spec.configuration.nodes.max
844+
):
845+
return False
811846
return True
812847

813848

src/tests/_internal/server/background/tasks/test_process_submitted_jobs.py

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -494,7 +494,9 @@ async def test_assigns_job_to_shared_instance(self, test_db, session: AsyncSessi
494494
project_id=project.id,
495495
)
496496
offer = get_instance_offer_with_availability(gpu_count=8, cpu_count=64, memory_gib=128)
497-
fleet = await create_fleet(session=session, project=project)
497+
fleet_spec = get_fleet_spec()
498+
fleet_spec.configuration.blocks = 4
499+
fleet = await create_fleet(session=session, project=project, spec=fleet_spec)
498500
instance = await create_instance(
499501
session=session,
500502
project=project,
@@ -537,7 +539,9 @@ async def test_assigns_multi_node_job_to_shared_instance(self, test_db, session:
537539
project_id=project.id,
538540
)
539541
offer = get_instance_offer_with_availability(gpu_count=8, cpu_count=64, memory_gib=128)
540-
fleet = await create_fleet(session=session, project=project)
542+
fleet_spec = get_fleet_spec()
543+
fleet_spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=None)
544+
fleet = await create_fleet(session=session, project=project, spec=fleet_spec)
541545
instance = await create_instance(
542546
session=session,
543547
project=project,
@@ -743,6 +747,55 @@ async def test_assigns_no_fleet_when_all_fleets_occupied(self, test_db, session:
743747
assert job.instance_id is None
744748
assert job.fleet_id is None
745749

750+
@pytest.mark.asyncio
751+
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
752+
async def test_assigns_no_fleet_if_run_cannot_fit(self, test_db, session: AsyncSession):
753+
project = await create_project(session)
754+
user = await create_user(session)
755+
repo = await create_repo(session=session, project_id=project.id)
756+
fleet_spec = get_fleet_spec()
757+
fleet_spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=3)
758+
fleet = await create_fleet(session=session, project=project, spec=fleet_spec)
759+
instance1 = await create_instance(
760+
session=session,
761+
project=project,
762+
fleet=fleet,
763+
instance_num=0,
764+
status=InstanceStatus.BUSY,
765+
busy_blocks=1,
766+
)
767+
instance2 = await create_instance(
768+
session=session,
769+
project=project,
770+
fleet=fleet,
771+
instance_num=1,
772+
status=InstanceStatus.IDLE,
773+
busy_blocks=0,
774+
)
775+
fleet.instances.append(instance1)
776+
fleet.instances.append(instance2)
777+
run_spec = get_run_spec(repo_id=repo.name)
778+
run_spec.configuration = TaskConfiguration(nodes=3, commands=["echo"])
779+
run = await create_run(
780+
session=session,
781+
project=project,
782+
repo=repo,
783+
user=user,
784+
run_spec=run_spec,
785+
)
786+
job = await create_job(
787+
session=session,
788+
run=run,
789+
instance_assigned=False,
790+
)
791+
await session.commit()
792+
await process_submitted_jobs()
793+
await session.refresh(job)
794+
assert job.status == JobStatus.SUBMITTED
795+
assert job.instance_assigned
796+
assert job.instance_id is None
797+
assert job.fleet_id is None
798+
746799
@pytest.mark.asyncio
747800
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
748801
async def test_does_not_assign_job_to_elastic_empty_fleet_without_backend_offers_if_fleets_unspecified(

0 commit comments

Comments
 (0)