From f48986ebca41b75ff8194a5f99604a97f0fe42e2 Mon Sep 17 00:00:00 2001 From: Jvst Me Date: Mon, 16 Jun 2025 21:13:52 +0200 Subject: [PATCH] Retry on `VOLUME_ERROR` and `INSTANCE_UNREACHABLE` Also refactor so that it is less likely that we forget to associate new termination reasons with retry events. --- src/dstack/_internal/core/models/runs.py | 13 ++++++++++ .../server/background/tasks/process_runs.py | 24 ++++--------------- src/tests/_internal/core/models/test_runs.py | 9 ++++++- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/src/dstack/_internal/core/models/runs.py b/src/dstack/_internal/core/models/runs.py index 404cf6cc8a..17afc65ec6 100644 --- a/src/dstack/_internal/core/models/runs.py +++ b/src/dstack/_internal/core/models/runs.py @@ -148,6 +148,19 @@ def to_status(self) -> JobStatus: } return mapping[self] + def to_retry_event(self) -> Optional[RetryEvent]: + """ + Returns: + the retry event this termination reason triggers + or None if this termination reason should not be retried + """ + mapping = { + self.FAILED_TO_START_DUE_TO_NO_CAPACITY: RetryEvent.NO_CAPACITY, + self.INTERRUPTED_BY_NO_CAPACITY: RetryEvent.INTERRUPTION, + } + default = RetryEvent.ERROR if self.to_status() == JobStatus.FAILED else None + return mapping.get(self, default) + class Requirements(CoreModel): # TODO: Make requirements' fields required diff --git a/src/dstack/_internal/server/background/tasks/process_runs.py b/src/dstack/_internal/server/background/tasks/process_runs.py index 547a4cd5a0..5237cf68eb 100644 --- a/src/dstack/_internal/server/background/tasks/process_runs.py +++ b/src/dstack/_internal/server/background/tasks/process_runs.py @@ -393,7 +393,8 @@ def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datet break if ( - job_model.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY + job_model.termination_reason is not None + and job_model.termination_reason.to_retry_event() == RetryEvent.NO_CAPACITY and last_provisioned_submission is None and RetryEvent.NO_CAPACITY in job.job_spec.retry.on_events ): @@ -403,24 +404,9 @@ def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datet return None if ( - last_provisioned_submission.termination_reason - == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY - and RetryEvent.INTERRUPTION in job.job_spec.retry.on_events - ): - return common.get_current_datetime() - last_provisioned_submission.last_processed_at - - if ( - last_provisioned_submission.termination_reason - in [ - JobTerminationReason.CONTAINER_EXITED_WITH_ERROR, - JobTerminationReason.CREATING_CONTAINER_ERROR, - JobTerminationReason.EXECUTOR_ERROR, - JobTerminationReason.GATEWAY_ERROR, - JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED, - JobTerminationReason.WAITING_RUNNER_LIMIT_EXCEEDED, - JobTerminationReason.PORTS_BINDING_FAILED, - ] - and RetryEvent.ERROR in job.job_spec.retry.on_events + last_provisioned_submission.termination_reason is not None + and last_provisioned_submission.termination_reason.to_retry_event() + in job.job_spec.retry.on_events ): return common.get_current_datetime() - last_provisioned_submission.last_processed_at diff --git a/src/tests/_internal/core/models/test_runs.py b/src/tests/_internal/core/models/test_runs.py index fc7802ee99..851cba9e39 100644 --- a/src/tests/_internal/core/models/test_runs.py +++ b/src/tests/_internal/core/models/test_runs.py @@ -1,3 +1,4 @@ +from dstack._internal.core.models.profiles import RetryEvent from dstack._internal.core.models.runs import ( JobStatus, JobSubmission, @@ -20,12 +21,18 @@ def test_run_termination_reason_to_status_works_with_all_enum_variants(): assert isinstance(run_status, RunStatus) -def test_job_termination_reason_to_status_works_with_all_enum_varians(): +def test_job_termination_reason_to_status_works_with_all_enum_variants(): for job_termination_reason in JobTerminationReason: job_status = job_termination_reason.to_status() assert isinstance(job_status, JobStatus) +def test_job_termination_reason_to_retry_event_works_with_all_enum_variants(): + for job_termination_reason in JobTerminationReason: + retry_event = job_termination_reason.to_retry_event() + assert retry_event is None or isinstance(retry_event, RetryEvent) + + # Will fail if JobTerminationReason value is added without updaing JobSubmission._get_error def test_get_error_returns_expected_messages(): no_error_reasons = [