3131 JobSubmission ,
3232 JobTerminationReason ,
3333 RunSpec ,
34+ RunTerminationReason ,
3435)
3536from dstack ._internal .core .models .volumes import Volume , VolumeMountPoint , VolumeStatus
3637from dstack ._internal .server import settings
@@ -349,6 +350,7 @@ async def process_terminating_job(
349350 if len (volume_models ) > 0 :
350351 logger .info ("Detaching volumes: %s" , [v .name for v in volume_models ])
351352 all_volumes_detached = await _detach_volumes_from_job_instance (
353+ session = session ,
352354 project = instance_model .project ,
353355 job_model = job_model ,
354356 jpd = jpd ,
@@ -432,6 +434,7 @@ async def process_volumes_detaching(
432434 )
433435 logger .info ("Detaching volumes: %s" , [v .name for v in volume_models ])
434436 all_volumes_detached = await _detach_volumes_from_job_instance (
437+ session = session ,
435438 project = instance_model .project ,
436439 job_model = job_model ,
437440 jpd = jpd ,
@@ -523,6 +526,7 @@ def group_jobs_by_replica_latest(jobs: List[JobModel]) -> Iterable[Tuple[int, Li
523526
524527
525528async def _detach_volumes_from_job_instance (
529+ session : AsyncSession ,
526530 project : ProjectModel ,
527531 job_model : JobModel ,
528532 jpd : JobProvisioningData ,
@@ -542,6 +546,7 @@ async def _detach_volumes_from_job_instance(
542546
543547 all_detached = True
544548 detached_volumes = []
549+ run_termination_reason = await _get_run_termination_reason (session , job_model )
545550 for volume_model in volume_models :
546551 detached = await _detach_volume_from_job_instance (
547552 backend = backend ,
@@ -550,6 +555,7 @@ async def _detach_volumes_from_job_instance(
550555 job_spec = job_spec ,
551556 instance_model = instance_model ,
552557 volume_model = volume_model ,
558+ run_termination_reason = run_termination_reason ,
553559 )
554560 if detached :
555561 detached_volumes .append (volume_model )
@@ -572,6 +578,7 @@ async def _detach_volume_from_job_instance(
572578 job_spec : JobSpec ,
573579 instance_model : InstanceModel ,
574580 volume_model : VolumeModel ,
581+ run_termination_reason : Optional [RunTerminationReason ],
575582) -> bool :
576583 detached = True
577584 volume = volume_model_to_volume (volume_model )
@@ -601,7 +608,11 @@ async def _detach_volume_from_job_instance(
601608 volume = volume ,
602609 provisioning_data = jpd ,
603610 )
604- if not detached and _should_force_detach_volume (job_model , job_spec .stop_duration ):
611+ if not detached and _should_force_detach_volume (
612+ job_model ,
613+ run_termination_reason = run_termination_reason ,
614+ stop_duration = job_spec .stop_duration ,
615+ ):
605616 logger .info (
606617 "Force detaching volume %s from %s" ,
607618 volume_model .name ,
@@ -633,13 +644,27 @@ async def _detach_volume_from_job_instance(
633644MIN_FORCE_DETACH_WAIT_PERIOD = timedelta (seconds = 60 )
634645
635646
636- def _should_force_detach_volume (job_model : JobModel , stop_duration : Optional [int ]) -> bool :
async def _get_run_termination_reason(
    session: AsyncSession,
    job_model: JobModel,
) -> Optional[RunTerminationReason]:
    """Look up the termination reason of the run that ``job_model`` belongs to.

    Only the ``termination_reason`` column is selected, from the ``RunModel``
    row whose ``id`` equals ``job_model.run_id``.

    Args:
        session: Async database session used to execute the query.
        job_model: Job whose ``run_id`` identifies the run to look up.

    Returns:
        The selected ``termination_reason`` value, or ``None`` when no row
        matches or the stored value is NULL.
    """
    reason_query = select(RunModel.termination_reason).where(
        RunModel.id == job_model.run_id
    )
    query_result = await session.execute(reason_query)
    return query_result.scalar_one_or_none()
654+
655+
656+ def _should_force_detach_volume (
657+ job_model : JobModel ,
658+ run_termination_reason : Optional [RunTerminationReason ],
659+ stop_duration : Optional [int ],
660+ ) -> bool :
637661 return (
638662 job_model .volumes_detached_at is not None
639663 and common .get_current_datetime ()
640664 > job_model .volumes_detached_at + MIN_FORCE_DETACH_WAIT_PERIOD
641665 and (
642666 job_model .termination_reason == JobTerminationReason .ABORTED_BY_USER
667+ or run_termination_reason == RunTerminationReason .ABORTED_BY_USER
643668 or stop_duration is not None
644669 and common .get_current_datetime ()
645670 > job_model .volumes_detached_at + timedelta (seconds = stop_duration )
0 commit comments