@@ -260,7 +260,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
260260
261261 instance_filters = [
262262 InstanceModel .deleted == False ,
263- InstanceModel .total_blocks > InstanceModel .busy_blocks ,
264263 InstanceModel .id .not_in (detaching_instances_ids ),
265264 ]
266265
@@ -514,9 +513,6 @@ async def _find_optimal_fleet_with_offers(
514513 )
515514 return run_model .fleet , fleet_instances_with_pool_offers
516515
517- if len (fleet_models ) == 0 :
518- return None , []
519-
520516 nodes_required_num = _get_nodes_required_num_for_run (run_spec )
521517 # The current strategy is first to consider fleets that can accommodate
522518 # the run without additional provisioning and choose the one with the cheapest pool offer.
@@ -534,31 +530,29 @@ async def _find_optimal_fleet_with_offers(
534530 ]
535531 ] = []
536532 for candidate_fleet_model in fleet_models :
533+ candidate_fleet = fleet_model_to_fleet (candidate_fleet_model )
537534 fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers (
538535 fleet_model = candidate_fleet_model ,
539536 run_spec = run_spec ,
540537 job = job ,
541538 master_job_provisioning_data = master_job_provisioning_data ,
542539 volumes = volumes ,
543540 )
544- fleet_has_available_capacity = nodes_required_num <= len (fleet_instances_with_pool_offers )
541+ fleet_has_pool_capacity = nodes_required_num <= len (fleet_instances_with_pool_offers )
545542 fleet_cheapest_pool_offer = math .inf
546543 if len (fleet_instances_with_pool_offers ) > 0 :
547544 fleet_cheapest_pool_offer = fleet_instances_with_pool_offers [0 ][1 ].price
548545
549- candidate_fleet = fleet_model_to_fleet (candidate_fleet_model )
550- profile = None
551- requirements = None
552546 try :
547+ _check_can_create_new_instance_in_fleet (candidate_fleet )
553548 profile , requirements = _get_run_profile_and_requirements_in_fleet (
554549 job = job ,
555550 run_spec = run_spec ,
556551 fleet = candidate_fleet ,
557552 )
558553 except ValueError :
559- pass
560- fleet_backend_offers = []
561- if profile is not None and requirements is not None :
554+ fleet_backend_offers = []
555+ else :
562556 multinode = (
563557 candidate_fleet .spec .configuration .placement == InstanceGroupPlacement .CLUSTER
564558 or job .job_spec .jobs_per_replica > 1
@@ -579,8 +573,12 @@ async def _find_optimal_fleet_with_offers(
579573 if len (fleet_backend_offers ) > 0 :
580574 fleet_cheapest_backend_offer = fleet_backend_offers [0 ][1 ].price
581575
576+ if not _run_can_fit_into_fleet (run_spec , candidate_fleet ):
577+ logger .debug ("Skipping fleet %s from consideration: run cannot fit into fleet" )
578+ continue
579+
582580 fleet_priority = (
583- not fleet_has_available_capacity ,
581+ not fleet_has_pool_capacity ,
584582 fleet_cheapest_pool_offer ,
585583 fleet_cheapest_backend_offer ,
586584 )
@@ -593,10 +591,13 @@ async def _find_optimal_fleet_with_offers(
593591 fleet_priority ,
594592 )
595593 )
594+ if len (candidate_fleets_with_offers ) == 0 :
595+ return None , []
596596 if run_spec .merged_profile .fleets is None and all (
597597 t [2 ] == 0 and t [3 ] == 0 for t in candidate_fleets_with_offers
598598 ):
599- # If fleets are not specified and no fleets have available pool or backend offers, create a new fleet.
599+ # If fleets are not specified and no fleets have available pool
600+ # or backend offers, create a new fleet.
600601 # This is for compatibility with non-fleet-first UX when runs created new fleets
601602 # if there are no instances to reuse.
602603 return None , []
@@ -616,6 +617,31 @@ def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
616617 return nodes_required_num
617618
618619
def _run_can_fit_into_fleet(run_spec: RunSpec, fleet: Fleet) -> bool:
    """
    Return `False` only when the run definitely cannot fit into the fleet.

    This is a cheap heuristic for discarding fleets that are too small for the run.
    A `True` result is not a guarantee: the run may still not fit, in which case
    some jobs may fail due to exceeding `nodes.max`, or more than `nodes.max`
    instances may be provisioned and eventually removed by the fleet
    consolidation logic.
    """
    required_nodes = _get_nodes_required_num_for_run(run_spec)
    config = fleet.spec.configuration
    # Cloud fleets with blocks > 1 are not checked since we don't know
    # how many jobs such fleets can accommodate.
    # TODO: Check if cannot fit into SSH fleet.
    if config.nodes is None or config.blocks != 1 or config.nodes.max is None:
        return True
    # Every instance with at least one busy block counts against `nodes.max`.
    busy_count = sum(1 for instance in fleet.instances if instance.busy_blocks > 0)
    return config.nodes.max - busy_count >= required_nodes
643+
644+
619645def _get_fleet_instances_with_pool_offers (
620646 fleet_model : FleetModel ,
621647 run_spec : RunSpec ,
@@ -713,6 +739,7 @@ async def _run_job_on_new_instance(
713739 if fleet_model is not None :
714740 fleet = fleet_model_to_fleet (fleet_model )
715741 try :
742+ _check_can_create_new_instance_in_fleet (fleet )
716743 profile , requirements = _get_run_profile_and_requirements_in_fleet (
717744 job = job ,
718745 run_spec = run .run_spec ,
@@ -787,8 +814,6 @@ def _get_run_profile_and_requirements_in_fleet(
787814 run_spec : RunSpec ,
788815 fleet : Fleet ,
789816) -> tuple [Profile , Requirements ]:
790- if not _check_can_create_new_instance_in_fleet (fleet ):
791- raise ValueError ("Cannot fit new instance into fleet" )
792817 profile = combine_fleet_and_run_profiles (fleet .spec .merged_profile , run_spec .merged_profile )
793818 if profile is None :
794819 raise ValueError ("Cannot combine fleet profile" )
@@ -801,13 +826,23 @@ def _get_run_profile_and_requirements_in_fleet(
801826 return profile , requirements
802827
803828
804- def _check_can_create_new_instance_in_fleet (fleet : Fleet ) -> bool :
def _check_can_create_new_instance_in_fleet(fleet: Fleet):
    """
    Raise `ValueError` if `_can_create_new_instance_in_fleet` rejects the fleet.
    Return nothing otherwise.
    """
    if _can_create_new_instance_in_fleet(fleet):
        return
    raise ValueError("Cannot fit new instance into fleet")
832+
833+
def _can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
    """
    Tell whether a new instance may be created in the fleet.

    Returns `False` for fleets that have `ssh_config` set and for fleets whose
    number of active instances has already reached `nodes.max`.
    Returns `True` otherwise.
    """
    config = fleet.spec.configuration
    if config.ssh_config is not None:
        return False
    active_count = sum(1 for instance in fleet.instances if instance.status.is_active())
    # nodes.max is a soft limit that can be exceeded when provisioning concurrently.
    # The fleet consolidation logic will remove redundant nodes eventually.
    limit_reached = (
        config.nodes is not None
        and config.nodes.max is not None
        and active_count >= config.nodes.max
    )
    return not limit_reached
812847
813848
0 commit comments