11from datetime import timedelta
22from typing import List
3+ from uuid import UUID
34
45from sqlalchemy import select , update
56from sqlalchemy .ext .asyncio import AsyncSession
67from sqlalchemy .orm import joinedload , load_only
78
8- from dstack ._internal .core .models .fleets import FleetStatus
9+ from dstack ._internal .core .models .fleets import FleetSpec , FleetStatus
10+ from dstack ._internal .core .models .instances import InstanceStatus
911from dstack ._internal .server .db import get_db , get_session_ctx
1012from dstack ._internal .server .models import (
1113 FleetModel ,
1517 RunModel ,
1618)
1719from dstack ._internal .server .services .fleets import (
20+ create_fleet_instance_model ,
1821 get_fleet_spec ,
22+ get_next_instance_num ,
1923 is_fleet_empty ,
2024 is_fleet_in_use ,
2125)
@@ -65,31 +69,111 @@ async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel])
6569 res = await session .execute (
6670 select (FleetModel )
6771 .where (FleetModel .id .in_ (fleet_ids ))
68- .options (joinedload (FleetModel .instances ).load_only (InstanceModel .deleted ))
6972 .options (
70- joinedload (FleetModel .instances ).joinedload (InstanceModel .jobs ).load_only (JobModel .id )
73+ joinedload (FleetModel .instances ).joinedload (InstanceModel .jobs ).load_only (JobModel .id ),
74+ joinedload (FleetModel .project ),
7175 )
7276 .options (joinedload (FleetModel .runs ).load_only (RunModel .status ))
7377 .execution_options (populate_existing = True )
7478 )
7579 fleet_models = list (res .unique ().scalars ().all ())
7680
81+ # TODO: Drop fleets auto-deletion after dropping fleets auto-creation.
7782 deleted_fleets_ids = []
78- now = get_current_datetime ()
7983 for fleet_model in fleet_models :
84+ _consolidate_fleet_state_with_spec (session , fleet_model )
8085 deleted = _autodelete_fleet (fleet_model )
8186 if deleted :
8287 deleted_fleets_ids .append (fleet_model .id )
83- fleet_model .last_processed_at = now
88+ fleet_model .last_processed_at = get_current_datetime ()
89+ await _update_deleted_fleets_placement_groups (session , deleted_fleets_ids )
90+ await session .commit ()
8491
85- await session .execute (
86- update (PlacementGroupModel )
87- .where (
88- PlacementGroupModel .fleet_id .in_ (deleted_fleets_ids ),
92+
93+ def _consolidate_fleet_state_with_spec (session : AsyncSession , fleet_model : FleetModel ):
94+ if fleet_model .status == FleetStatus .TERMINATING :
95+ return
96+ fleet_spec = get_fleet_spec (fleet_model )
97+ if fleet_spec .configuration .nodes is None or fleet_spec .autocreated :
98+ # Only explicitly created cloud fleets are consolidated.
99+ return
100+ if not _is_fleet_ready_for_consolidation (fleet_model ):
101+ return
102+ added_instances = _maintain_fleet_nodes_min (session , fleet_model , fleet_spec )
103+ if added_instances :
104+ fleet_model .consolidation_attempt += 1
105+ else :
106+ # The fleet is already consolidated or consolidation is in progress.
107+ # We reset consolidation_attempt in both cases for simplicity.
108+ # The second case does not need reset but is ok to do since
109+ # it means consolidation is longer than delay, so it won't happen too often.
110+ # TODO: Reset consolidation_attempt on fleet in-place update.
111+ fleet_model .consolidation_attempt = 0
112+ fleet_model .last_consolidated_at = get_current_datetime ()
113+
114+
115+ def _is_fleet_ready_for_consolidation (fleet_model : FleetModel ) -> bool :
116+ consolidation_retry_delay = _get_consolidation_retry_delay (fleet_model .consolidation_attempt )
117+ last_consolidated_at = fleet_model .last_consolidated_at or fleet_model .last_processed_at
118+ duration_since_last_consolidation = get_current_datetime () - last_consolidated_at
119+ return duration_since_last_consolidation >= consolidation_retry_delay
120+
121+
122+ # We use exponentially increasing consolidation retry delays so that
123+ # consolidation does not happen too often. In particular, this prevents
124+ # retrying instance provisioning constantly in case of no offers.
125+ # TODO: Adjust delays.
126+ _CONSOLIDATION_RETRY_DELAYS = [
127+ timedelta (seconds = 30 ),
128+ timedelta (minutes = 1 ),
129+ timedelta (minutes = 2 ),
130+ timedelta (minutes = 5 ),
131+ timedelta (minutes = 10 ),
132+ ]
133+
134+
135+ def _get_consolidation_retry_delay (consolidation_attempt : int ) -> timedelta :
136+ if consolidation_attempt < len (_CONSOLIDATION_RETRY_DELAYS ):
137+ return _CONSOLIDATION_RETRY_DELAYS [consolidation_attempt ]
138+ return _CONSOLIDATION_RETRY_DELAYS [- 1 ]
139+
140+
141+ def _maintain_fleet_nodes_min (
142+ session : AsyncSession ,
143+ fleet_model : FleetModel ,
144+ fleet_spec : FleetSpec ,
145+ ) -> bool :
146+ """
147+ Ensures the fleet has at least `nodes.min` instances.
148+ Returns `True` if retried or added new instances and `False` otherwise.
149+ """
150+ assert fleet_spec .configuration .nodes is not None
151+ for instance in fleet_model .instances :
152+ # Delete terminated but not deleted instances since
153+ # they are going to be replaced with new pending instances.
154+ if instance .status == InstanceStatus .TERMINATED and not instance .deleted :
155+ # It's safe to modify instances without instance lock since
156+ # no other task modifies already terminated instances.
157+ instance .deleted = True
158+ instance .deleted_at = get_current_datetime ()
159+ active_instances = [i for i in fleet_model .instances if not i .deleted ]
160+ active_instances_num = len (active_instances )
161+ if active_instances_num >= fleet_spec .configuration .nodes .min :
162+ return False
163+ nodes_missing = fleet_spec .configuration .nodes .min - active_instances_num
164+ for i in range (nodes_missing ):
165+ instance_model = create_fleet_instance_model (
166+ session = session ,
167+ project = fleet_model .project ,
168+ # TODO: Store fleet.user and pass it instead of the project owner.
169+ username = fleet_model .project .owner .name ,
170+ spec = fleet_spec ,
171+ instance_num = get_next_instance_num ({i .instance_num for i in active_instances }),
89172 )
90- .values (fleet_deleted = True )
91- )
92- await session .commit ()
173+ active_instances .append (instance_model )
174+ fleet_model .instances .append (instance_model )
175+ logger .info ("Added %s instances to fleet %s" , nodes_missing , fleet_model .name )
176+ return True
93177
94178
95179def _autodelete_fleet (fleet_model : FleetModel ) -> bool :
@@ -100,7 +184,7 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
100184 if (
101185 fleet_model .status != FleetStatus .TERMINATING
102186 and fleet_spec .configuration .nodes is not None
103- and ( fleet_spec .configuration .nodes .min is None or fleet_spec . configuration . nodes . min == 0 )
187+ and fleet_spec .configuration .nodes .min == 0
104188 ):
105189 # Empty fleets that allow 0 nodes should not be auto-deleted
106190 return False
@@ -110,3 +194,13 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
110194 fleet_model .deleted = True
111195 logger .info ("Fleet %s deleted" , fleet_model .name )
112196 return True
197+
198+
199+ async def _update_deleted_fleets_placement_groups (session : AsyncSession , fleets_ids : list [UUID ]):
200+ await session .execute (
201+ update (PlacementGroupModel )
202+ .where (
203+ PlacementGroupModel .fleet_id .in_ (fleets_ids ),
204+ )
205+ .values (fleet_deleted = True )
206+ )
0 commit comments