4747 InstanceOfferWithAvailability ,
4848 InstanceRuntime ,
4949 InstanceStatus ,
50+ InstanceTerminationReason ,
5051 RemoteConnectionInfo ,
5152 SSHKey ,
5253)
@@ -274,7 +275,7 @@ def _check_and_mark_terminating_if_idle_duration_expired(instance: InstanceModel
274275 delta = datetime .timedelta (seconds = idle_seconds )
275276 if idle_duration > delta :
276277 instance .status = InstanceStatus .TERMINATING
277- instance .termination_reason = "Idle timeout"
278+ instance .termination_reason = InstanceTerminationReason . IDLE_TIMEOUT
278279 logger .info (
279280 "Instance %s idle duration expired: idle time %ss. Terminating" ,
280281 instance .name ,
@@ -310,7 +311,7 @@ async def _add_remote(instance: InstanceModel) -> None:
310311 retry_duration_deadline = instance .created_at + timedelta (seconds = PROVISIONING_TIMEOUT_SECONDS )
311312 if retry_duration_deadline < get_current_datetime ():
312313 instance .status = InstanceStatus .TERMINATED
313- instance .termination_reason = "Provisioning timeout expired"
314+ instance .termination_reason = InstanceTerminationReason . PROVISIONING_TIMEOUT
314315 logger .warning (
315316 "Failed to start instance %s in %d seconds. Terminating..." ,
316317 instance .name ,
@@ -333,7 +334,8 @@ async def _add_remote(instance: InstanceModel) -> None:
333334 ssh_proxy_pkeys = None
334335 except (ValueError , PasswordRequiredException ):
335336 instance .status = InstanceStatus .TERMINATED
336- instance .termination_reason = "Unsupported private SSH key type"
337+ instance .termination_reason = InstanceTerminationReason .ERROR
338+ instance .termination_reason_message = "Unsupported private SSH key type"
337339 logger .warning (
338340 "Failed to add instance %s: unsupported private SSH key type" ,
339341 instance .name ,
@@ -391,7 +393,10 @@ async def _add_remote(instance: InstanceModel) -> None:
391393 )
392394 if instance_network is not None and internal_ip is None :
393395 instance .status = InstanceStatus .TERMINATED
394- instance .termination_reason = "Failed to locate internal IP address on the given network"
396+ instance .termination_reason = InstanceTerminationReason .ERROR
397+ instance .termination_reason_message = (
398+ "Failed to locate internal IP address on the given network"
399+ )
395400 logger .warning (
396401 "Failed to add instance %s: failed to locate internal IP address on the given network" ,
397402 instance .name ,
@@ -404,7 +409,8 @@ async def _add_remote(instance: InstanceModel) -> None:
404409 if internal_ip is not None :
405410 if not is_ip_among_addresses (ip_address = internal_ip , addresses = host_network_addresses ):
406411 instance .status = InstanceStatus .TERMINATED
407- instance .termination_reason = (
412+ instance .termination_reason = InstanceTerminationReason .ERROR
413+ instance .termination_reason_message = (
408414 "Specified internal IP not found among instance interfaces"
409415 )
410416 logger .warning (
@@ -426,7 +432,8 @@ async def _add_remote(instance: InstanceModel) -> None:
426432 instance .total_blocks = blocks
427433 else :
428434 instance .status = InstanceStatus .TERMINATED
429- instance .termination_reason = "Cannot split into blocks"
435+ instance .termination_reason = InstanceTerminationReason .ERROR
436+ instance .termination_reason_message = "Cannot split into blocks"
430437 logger .warning (
431438 "Failed to add instance %s: cannot split into blocks" ,
432439 instance .name ,
@@ -545,7 +552,8 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
545552 requirements = get_instance_requirements (instance )
546553 except ValidationError as e :
547554 instance .status = InstanceStatus .TERMINATED
548- instance .termination_reason = (
555+ instance .termination_reason = InstanceTerminationReason .ERROR
556+ instance .termination_reason_message = (
549557 f"Error to parse profile, requirements or instance_configuration: { e } "
550558 )
551559 logger .warning (
@@ -671,19 +679,28 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
671679 )
672680 return
673681
674- _mark_terminated (instance , "All offers failed" if offers else "No offers found" )
682+ _mark_terminated (
683+ instance ,
684+ InstanceTerminationReason .NO_OFFERS ,
685+ "All offers failed" if offers else "No offers found" ,
686+ )
675687 if instance .fleet and is_fleet_master_instance (instance ) and is_cloud_cluster (instance .fleet ):
676688 # Do not attempt to deploy other instances, as they won't determine the correct cluster
677689 # backend, region, and placement group without a successfully deployed master instance
678690 for sibling_instance in instance .fleet .instances :
679691 if sibling_instance .id == instance .id :
680692 continue
681- _mark_terminated (sibling_instance , "Master instance failed to start" )
693+ _mark_terminated (sibling_instance , InstanceTerminationReason . MASTER_FAILED )
682694
683695
684- def _mark_terminated (instance : InstanceModel , termination_reason : str ) -> None :
696+ def _mark_terminated (
697+ instance : InstanceModel ,
698+ termination_reason : InstanceTerminationReason ,
699+ termination_reason_message : Optional [str ] = None ,
700+ ) -> None :
685701 instance .status = InstanceStatus .TERMINATED
686702 instance .termination_reason = termination_reason
703+ instance .termination_reason_message = termination_reason_message
687704 logger .info (
688705 "Terminated instance %s: %s" ,
689706 instance .name ,
@@ -703,7 +720,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non
703720 ):
704721 # A busy instance could have no active jobs due to this bug: https://github.com/dstackai/dstack/issues/2068
705722 instance .status = InstanceStatus .TERMINATING
706- instance .termination_reason = "Instance job finished"
723+ instance .termination_reason = InstanceTerminationReason . JOB_FINISHED
707724 logger .info (
708725 "Detected busy instance %s with finished job. Marked as TERMINATING" ,
709726 instance .name ,
@@ -832,7 +849,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non
832849 deadline = instance .termination_deadline
833850 if get_current_datetime () > deadline :
834851 instance .status = InstanceStatus .TERMINATING
835- instance .termination_reason = "Termination deadline"
852+ instance .termination_reason = InstanceTerminationReason . UNREACHABLE
836853 logger .warning (
837854 "Instance %s shim waiting timeout. Marked as TERMINATING" ,
838855 instance .name ,
@@ -861,7 +878,8 @@ async def _wait_for_instance_provisioning_data(
861878 "Instance %s failed because instance has not become running in time" , instance .name
862879 )
863880 instance .status = InstanceStatus .TERMINATING
864- instance .termination_reason = "Instance has not become running in time"
881+ instance .termination_reason = InstanceTerminationReason .PROVISIONING_TIMEOUT
882+ instance .termination_reason_message = "Backend did not complete provisioning in time"
865883 return
866884
867885 backend = await backends_services .get_project_backend_by_type (
@@ -874,7 +892,8 @@ async def _wait_for_instance_provisioning_data(
874892 instance .name ,
875893 )
876894 instance .status = InstanceStatus .TERMINATING
877- instance .termination_reason = "Backend not available"
895+ instance .termination_reason = InstanceTerminationReason .ERROR
896+ instance .termination_reason_message = "Backend not available"
878897 return
879898 try :
880899 await run_async (
@@ -891,7 +910,8 @@ async def _wait_for_instance_provisioning_data(
891910 repr (e ),
892911 )
893912 instance .status = InstanceStatus .TERMINATING
894- instance .termination_reason = "Error while waiting for instance to become running"
913+ instance .termination_reason = InstanceTerminationReason .ERROR
914+ instance .termination_reason_message = "Error while waiting for instance to become running"
895915 except Exception :
896916 logger .exception (
897917 "Got exception when updating instance %s provisioning data" , instance .name
0 commit comments