From b5e36109661966ffa38eaa5abd733069133e8d41 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Wed, 29 Oct 2025 15:13:37 +0500 Subject: [PATCH 1/3] Do not terminate fleet instances on idle_duration at nodes.min --- .../background/tasks/process_instances.py | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index a2c9f47420..e5ce629aac 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -196,12 +196,12 @@ async def _process_next_instance(): async def _process_instance(session: AsyncSession, instance: InstanceModel): + # Refetch to load related attributes. + # Load related attributes only for statuses that always need them. if instance.status in ( InstanceStatus.PENDING, InstanceStatus.TERMINATING, ): - # Refetch to load related attributes. - # Load related attributes only for statuses that always need them. res = await session.execute( select(InstanceModel) .where(InstanceModel.id == instance.id) @@ -211,6 +211,16 @@ async def _process_instance(session: AsyncSession, instance: InstanceModel): .execution_options(populate_existing=True) ) instance = res.unique().scalar_one() + elif instance.status == InstanceStatus.IDLE: + res = await session.execute( + select(InstanceModel) + .where(InstanceModel.id == instance.id) + .options(joinedload(InstanceModel.project)) + .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) + .options(joinedload(InstanceModel.fleet).joinedload(FleetModel.instances)) + .execution_options(populate_existing=True) + ) + instance = res.unique().scalar_one() if instance.status == InstanceStatus.PENDING: if instance.remote_connection_info is not None: @@ -242,6 +252,14 @@ def _check_and_mark_terminating_if_idle_duration_expired(instance: InstanceModel and not instance.jobs ): return False + if instance.fleet is not None and not _can_terminate_fleet_instances_on_idle_duration( + instance.fleet + ): + logger.debug( + "Skipping instance %s termination on idle duration. Fleet is already at `nodes.min`.", + instance.name, + ) + return False idle_duration = _get_instance_idle_duration(instance) idle_seconds = instance.termination_idle_time delta = datetime.timedelta(seconds=idle_seconds) @@ -261,6 +279,20 @@ def _check_and_mark_terminating_if_idle_duration_expired(instance: InstanceModel return False +def _can_terminate_fleet_instances_on_idle_duration(fleet_model: FleetModel) -> bool: + # Do not terminate instances on idle duration if fleet is already at `nodes.min`. + # This is an optimization to avoid terminate-create loop. + # There may be race conditions since we don't take the fleet lock. + # That's ok: in the worst case we go below `nodes.min`, but + # the fleet consolidation logic will provision new nodes. + fleet = fleet_model_to_fleet(fleet_model) + if fleet.spec.configuration.nodes is None: + return True + active_instances = [i for i in fleet_model.instances if i.status.is_active()] + active_instances_num = len(active_instances) + return active_instances_num > fleet.spec.configuration.nodes.min + + async def _add_remote(instance: InstanceModel) -> None: logger.info("Adding ssh instance %s...", instance.name) if instance.status == InstanceStatus.PENDING: From bf8f8d72b5dc056d8197d4768a8d2f12ff8305ae Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Wed, 29 Oct 2025 15:20:33 +0500 Subject: [PATCH 2/3] Update idle_duration reference --- src/dstack/_internal/core/models/fleets.py | 7 ++++++- src/dstack/_internal/core/models/profiles.py | 4 +++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/dstack/_internal/core/models/fleets.py b/src/dstack/_internal/core/models/fleets.py index 031b9aae4f..1137c38dba 100644 --- a/src/dstack/_internal/core/models/fleets.py +++ b/src/dstack/_internal/core/models/fleets.py @@ -309,7 +309,12 @@ class InstanceGroupParams(CoreModel): idle_duration: Annotated[ Optional[int], Field( - description="Time to wait before terminating idle instances. Defaults to `5m` for runs and `3d` for fleets. Use `off` for unlimited duration" + description=( + "Time to wait before terminating idle instances." + " Instances are not terminated if the fleet is already at `nodes.min`." + " Defaults to `5m` for runs and `3d` for fleets." + " Use `off` for unlimited duration" + ) ), ] = None diff --git a/src/dstack/_internal/core/models/profiles.py b/src/dstack/_internal/core/models/profiles.py index ca37cd360c..bc2a2260c8 100644 --- a/src/dstack/_internal/core/models/profiles.py +++ b/src/dstack/_internal/core/models/profiles.py @@ -341,7 +341,9 @@ class ProfileParams(CoreModel): Field( description=( "Time to wait before terminating idle instances." - " Defaults to `5m` for runs and `3d` for fleets. Use `off` for unlimited duration" + " Instances are not terminated if the fleet is already at `nodes.min`." + " Defaults to `5m` for runs and `3d` for fleets." + " Use `off` for unlimited duration" ) ), ] = None From 626034d3f1f5e4b88de93196f3b9bf3c45e5d24e Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Wed, 29 Oct 2025 15:35:12 +0500 Subject: [PATCH 3/3] Skip nodes.min check for autocreated fleets --- .../_internal/server/background/tasks/process_instances.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index e5ce629aac..88a9970b88 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -286,7 +286,7 @@ def _can_terminate_fleet_instances_on_idle_duration(fleet_model: FleetModel) -> # That's ok: in the worst case we go below `nodes.min`, but # the fleet consolidation logic will provision new nodes. fleet = fleet_model_to_fleet(fleet_model) - if fleet.spec.configuration.nodes is None: + if fleet.spec.configuration.nodes is None or fleet.spec.autocreated: return True active_instances = [i for i in fleet_model.instances if i.status.is_active()] active_instances_num = len(active_instances)