From fee7303464e4cea3af1c7eb4c64dcbfde1be1505 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Wed, 6 Aug 2025 14:13:02 +0500 Subject: [PATCH 1/4] Store all enums as strings in DB --- .../74a1f55209bd_store_enums_as_strings.py | 484 ++++++++++++++++++ src/dstack/_internal/server/models.py | 23 +- src/dstack/_internal/server/services/repos.py | 2 +- 3 files changed, 496 insertions(+), 13 deletions(-) create mode 100644 src/dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py diff --git a/src/dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py b/src/dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py new file mode 100644 index 0000000000..d4048575d8 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py @@ -0,0 +1,484 @@ +"""Store enums as strings + +Revision ID: 74a1f55209bd +Revises: 25479f540245 +Create Date: 2025-08-06 13:49:28.785378 + +""" + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "74a1f55209bd" +down_revision = "25479f540245" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.alter_column( + "global_role", + existing_type=postgresql.ENUM("ADMIN", "USER", name="globalrole"), + type_=sa.String(length=100), + existing_nullable=False, + ) + + with op.batch_alter_table("members", schema=None) as batch_op: + batch_op.alter_column( + "project_role", + existing_type=postgresql.ENUM("ADMIN", "MANAGER", "USER", name="projectrole"), + type_=sa.String(length=100), + existing_nullable=False, + ) + + with op.batch_alter_table("repos", schema=None) as batch_op: + batch_op.alter_column( + "type", + existing_type=postgresql.ENUM("REMOTE", "LOCAL", "VIRTUAL", name="repotype"), + type_=sa.String(length=100), + existing_nullable=False, + ) + + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=postgresql.ENUM( + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "FAILED", + "DONE", + name="runstatus", + ), + type_=sa.String(length=100), + existing_nullable=False, + ) + batch_op.alter_column( + "termination_reason", + existing_type=postgresql.ENUM( + "ALL_JOBS_DONE", + "JOB_FAILED", + "RETRY_LIMIT_EXCEEDED", + "STOPPED_BY_USER", + "ABORTED_BY_USER", + "SERVER_ERROR", + name="runterminationreason", + ), + type_=sa.String(length=100), + existing_nullable=True, + ) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=postgresql.ENUM( + "SUBMITTED", + "PROVISIONING", + "PULLING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "ABORTED", + "FAILED", + "DONE", + name="jobstatus", + ), + type_=sa.String(length=100), + existing_nullable=False, + ) + batch_op.alter_column( + "termination_reason", + existing_type=postgresql.ENUM( + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "INSTANCE_UNREACHABLE", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "TERMINATED_DUE_TO_UTILIZATION_POLICY", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + name="jobterminationreason", + ), + type_=sa.String(length=100), + existing_nullable=True, + ) + + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=postgresql.ENUM( + "SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus" + ), + type_=sa.String(length=100), + existing_nullable=False, + ) + + with op.batch_alter_table("gateways", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=postgresql.ENUM( + "SUBMITTED", "PROVISIONING", "RUNNING", "FAILED", name="gatewaystatus" + ), + type_=sa.String(length=100), + existing_nullable=False, + ) + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=postgresql.ENUM( + "PENDING", + "PROVISIONING", + "IDLE", + "BUSY", + "TERMINATING", + "TERMINATED", + name="instancestatus", + ), + type_=sa.String(length=100), + existing_nullable=False, + ) + + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=postgresql.ENUM( + "SUBMITTED", "PROVISIONING", "ACTIVE", "FAILED", name="volumestatus" + ), + type_=sa.String(length=100), + existing_nullable=False, + ) + + sa.Enum("ADMIN", "USER", name="globalrole").drop(op.get_bind()) + sa.Enum( + "ALL_JOBS_DONE", + "JOB_FAILED", + "RETRY_LIMIT_EXCEEDED", + "STOPPED_BY_USER", + "ABORTED_BY_USER", + "SERVER_ERROR", + name="runterminationreason", + ).drop(op.get_bind()) + sa.Enum("SUBMITTED", "PROVISIONING", "RUNNING", "FAILED", name="gatewaystatus").drop( + op.get_bind() + ) + sa.Enum("SUBMITTED", "PROVISIONING", "ACTIVE", "FAILED", name="volumestatus").drop( + op.get_bind() + ) + sa.Enum( + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "FAILED", + "DONE", + name="runstatus", + ).drop(op.get_bind()) + sa.Enum("REMOTE", "LOCAL", "VIRTUAL", name="repotype").drop(op.get_bind()) + sa.Enum( + "SUBMITTED", + "PROVISIONING", + "PULLING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "ABORTED", + "FAILED", + "DONE", + name="jobstatus", + ).drop(op.get_bind()) + sa.Enum( + "PENDING", + "PROVISIONING", + "IDLE", + "BUSY", + "TERMINATING", + "TERMINATED", + name="instancestatus", + ).drop(op.get_bind()) + sa.Enum("SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus").drop( + op.get_bind() + ) + sa.Enum("ADMIN", "MANAGER", "USER", name="projectrole").drop(op.get_bind()) + sa.Enum( + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "INSTANCE_UNREACHABLE", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "TERMINATED_DUE_TO_UTILIZATION_POLICY", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + name="jobterminationreason", + ).drop(op.get_bind()) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + sa.Enum( + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "INSTANCE_UNREACHABLE", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "TERMINATED_DUE_TO_UTILIZATION_POLICY", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + name="jobterminationreason", + ).create(op.get_bind()) + sa.Enum("ADMIN", "MANAGER", "USER", name="projectrole").create(op.get_bind()) + sa.Enum( + "SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus" + ).create(op.get_bind()) + sa.Enum( + "PENDING", + "PROVISIONING", + "IDLE", + "BUSY", + "TERMINATING", + "TERMINATED", + name="instancestatus", + ).create(op.get_bind()) + sa.Enum( + "SUBMITTED", + "PROVISIONING", + "PULLING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "ABORTED", + "FAILED", + "DONE", + name="jobstatus", + ).create(op.get_bind()) + sa.Enum("REMOTE", "LOCAL", "VIRTUAL", name="repotype").create(op.get_bind()) + sa.Enum( + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "FAILED", + "DONE", + name="runstatus", + ).create(op.get_bind()) + sa.Enum("SUBMITTED", "PROVISIONING", "ACTIVE", "FAILED", name="volumestatus").create( + op.get_bind() + ) + sa.Enum("SUBMITTED", "PROVISIONING", "RUNNING", "FAILED", name="gatewaystatus").create( + op.get_bind() + ) + sa.Enum( + "ALL_JOBS_DONE", + "JOB_FAILED", + "RETRY_LIMIT_EXCEEDED", + "STOPPED_BY_USER", + "ABORTED_BY_USER", + "SERVER_ERROR", + name="runterminationreason", + ).create(op.get_bind()) + sa.Enum("ADMIN", "USER", name="globalrole").create(op.get_bind()) + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "SUBMITTED", "PROVISIONING", "ACTIVE", "FAILED", name="volumestatus" + ), + existing_nullable=False, + postgresql_using="status::VARCHAR::volumestatus", + ) + + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.alter_column( + "global_role", + existing_type=sa.String(length=100), + type_=postgresql.ENUM("ADMIN", "USER", name="globalrole"), + existing_nullable=False, + postgresql_using="global_role::VARCHAR::globalrole", + ) + + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.alter_column( + "termination_reason", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "ALL_JOBS_DONE", + "JOB_FAILED", + "RETRY_LIMIT_EXCEEDED", + "STOPPED_BY_USER", + "ABORTED_BY_USER", + "SERVER_ERROR", + name="runterminationreason", + ), + existing_nullable=True, + postgresql_using="termination_reason::VARCHAR::runterminationreason", + ) + batch_op.alter_column( + "status", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "PENDING", + "SUBMITTED", + "PROVISIONING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "FAILED", + "DONE", + name="runstatus", + ), + existing_nullable=False, + postgresql_using="status::VARCHAR::runstatus", + ) + + with op.batch_alter_table("repos", schema=None) as batch_op: + batch_op.alter_column( + "type", + existing_type=sa.String(length=100), + type_=postgresql.ENUM("REMOTE", "LOCAL", "VIRTUAL", name="repotype"), + existing_nullable=False, + postgresql_using="type::VARCHAR::repotype", + ) + + with op.batch_alter_table("members", schema=None) as batch_op: + batch_op.alter_column( + "project_role", + existing_type=sa.String(length=100), + type_=postgresql.ENUM("ADMIN", "MANAGER", "USER", name="projectrole"), + existing_nullable=False, + postgresql_using="project_role::VARCHAR::projectrole", + ) + + with op.batch_alter_table("jobs", schema=None) as batch_op: + batch_op.alter_column( + "termination_reason", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "FAILED_TO_START_DUE_TO_NO_CAPACITY", + "INTERRUPTED_BY_NO_CAPACITY", + "INSTANCE_UNREACHABLE", + "WAITING_INSTANCE_LIMIT_EXCEEDED", + "WAITING_RUNNER_LIMIT_EXCEEDED", + "TERMINATED_BY_USER", + "VOLUME_ERROR", + "GATEWAY_ERROR", + "SCALED_DOWN", + "DONE_BY_RUNNER", + "ABORTED_BY_USER", + "TERMINATED_BY_SERVER", + "INACTIVITY_DURATION_EXCEEDED", + "TERMINATED_DUE_TO_UTILIZATION_POLICY", + "CONTAINER_EXITED_WITH_ERROR", + "PORTS_BINDING_FAILED", + "CREATING_CONTAINER_ERROR", + "EXECUTOR_ERROR", + "MAX_DURATION_EXCEEDED", + name="jobterminationreason", + ), + existing_nullable=True, + postgresql_using="termination_reason::VARCHAR::jobterminationreason", + ) + batch_op.alter_column( + "status", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "SUBMITTED", + "PROVISIONING", + "PULLING", + "RUNNING", + "TERMINATING", + "TERMINATED", + "ABORTED", + "FAILED", + "DONE", + name="jobstatus", + ), + existing_nullable=False, + postgresql_using="status::VARCHAR::jobstatus", + ) + + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "PENDING", + "PROVISIONING", + "IDLE", + "BUSY", + "TERMINATING", + "TERMINATED", + name="instancestatus", + ), + existing_nullable=False, + postgresql_using="status::VARCHAR::instancestatus", + ) + + with op.batch_alter_table("gateways", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "SUBMITTED", "PROVISIONING", "RUNNING", "FAILED", name="gatewaystatus" + ), + existing_nullable=False, + postgresql_using="status::VARCHAR::gatewaystatus", + ) + + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.alter_column( + "status", + existing_type=sa.String(length=100), + type_=postgresql.ENUM( + "SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus" + ), + existing_nullable=False, + postgresql_using="status::VARCHAR::fleetstatus", + ) + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index d08208a399..61e58631b4 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -7,7 +7,6 @@ BigInteger, Boolean, DateTime, - Enum, Float, ForeignKey, Index, @@ -185,7 +184,7 @@ class UserModel(BaseModel): token: Mapped[DecryptedString] = mapped_column(EncryptedString(200), unique=True) # token_hash is needed for fast search by token when stored token is encrypted token_hash: Mapped[str] = mapped_column(String(2000), unique=True) - global_role: Mapped[GlobalRole] = mapped_column(Enum(GlobalRole)) + global_role: Mapped[GlobalRole] = mapped_column(EnumAsString(GlobalRole, 100)) # deactivated users cannot access API active: Mapped[bool] = mapped_column(Boolean, default=True) @@ -246,7 +245,7 @@ class MemberModel(BaseModel): project: Mapped["ProjectModel"] = relationship() user_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) user: Mapped[UserModel] = relationship(lazy="joined") - project_role: Mapped[ProjectRole] = mapped_column(Enum(ProjectRole)) + project_role: Mapped[ProjectRole] = mapped_column(EnumAsString(ProjectRole, 100)) # member_num defines members ordering member_num: Mapped[Optional[int]] = mapped_column(Integer) @@ -278,7 +277,7 @@ class RepoModel(BaseModel): project: Mapped["ProjectModel"] = relationship() # RepoModel.name stores repo_id name: Mapped[str] = mapped_column(String(100)) - type: Mapped[RepoType] = mapped_column(Enum(RepoType)) + type: Mapped[RepoType] = mapped_column(EnumAsString(RepoType, 100)) info: Mapped[str] = mapped_column(Text) @@ -359,9 +358,9 @@ class RunModel(BaseModel): submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime) last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime) next_triggered_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - status: Mapped[RunStatus] = mapped_column(Enum(RunStatus), index=True) + status: Mapped[RunStatus] = mapped_column(EnumAsString(RunStatus, 100), index=True) termination_reason: Mapped[Optional[RunTerminationReason]] = mapped_column( - Enum(RunTerminationReason) + EnumAsString(RunTerminationReason, 100) ) # resubmission_attempt counts consecutive transitions to pending without provisioning. # Can be used to choose retry delay depending on the attempt number. @@ -400,9 +399,9 @@ class JobModel(BaseModel): submission_num: Mapped[int] = mapped_column(Integer) submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime) last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime) - status: Mapped[JobStatus] = mapped_column(Enum(JobStatus), index=True) + status: Mapped[JobStatus] = mapped_column(EnumAsString(JobStatus, 100), index=True) termination_reason: Mapped[Optional[JobTerminationReason]] = mapped_column( - Enum(JobTerminationReason) + EnumAsString(JobTerminationReason, 100) ) termination_reason_message: Mapped[Optional[str]] = mapped_column(Text) # `disconnected_at` stores the first time of connectivity issues with the instance. @@ -445,7 +444,7 @@ class GatewayModel(BaseModel): # Use `get_gateway_configuration` to construct `configuration` for old gateways. configuration: Mapped[Optional[str]] = mapped_column(Text) created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) - status: Mapped[GatewayStatus] = mapped_column(Enum(GatewayStatus)) + status: Mapped[GatewayStatus] = mapped_column(EnumAsString(GatewayStatus, 100)) status_message: Mapped[Optional[str]] = mapped_column(Text) last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime) @@ -531,7 +530,7 @@ class FleetModel(BaseModel): deleted: Mapped[bool] = mapped_column(Boolean, default=False) deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - status: Mapped[FleetStatus] = mapped_column(Enum(FleetStatus), index=True) + status: Mapped[FleetStatus] = mapped_column(EnumAsString(FleetStatus, 100), index=True) status_message: Mapped[Optional[str]] = mapped_column(Text) spec: Mapped[str] = mapped_column(Text) @@ -570,7 +569,7 @@ class InstanceModel(BaseModel): fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id")) fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="instances") - status: Mapped[InstanceStatus] = mapped_column(Enum(InstanceStatus), index=True) + status: Mapped[InstanceStatus] = mapped_column(EnumAsString(InstanceStatus, 100), index=True) unreachable: Mapped[bool] = mapped_column(Boolean) # VM @@ -652,7 +651,7 @@ class VolumeModel(BaseModel): deleted: Mapped[bool] = mapped_column(Boolean, default=False) deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - status: Mapped[VolumeStatus] = mapped_column(Enum(VolumeStatus), index=True) + status: Mapped[VolumeStatus] = mapped_column(EnumAsString(VolumeStatus, 100), index=True) status_message: Mapped[Optional[str]] = mapped_column(Text) configuration: Mapped[str] = mapped_column(Text) diff --git a/src/dstack/_internal/server/services/repos.py b/src/dstack/_internal/server/services/repos.py index 77f8ccb7f2..1bc2acdfba 100644 --- a/src/dstack/_internal/server/services/repos.py +++ b/src/dstack/_internal/server/services/repos.py @@ -129,7 +129,7 @@ async def create_repo( repo = RepoModel( project_id=project.id, name=repo_id, - type=repo_info.repo_type, + type=RepoType(repo_info.repo_type), info=repo_info.json(), ) try: From 7e51e37a9922a94fb7181e43cdef85548e7ba6d0 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Wed, 6 Aug 2025 14:32:44 +0500 Subject: [PATCH 2/4] Make run and job termination_reason a string --- src/dstack/_internal/core/models/runs.py | 20 +++++++++++-------- .../server/background/tasks/process_runs.py | 2 +- .../server/services/jobs/__init__.py | 4 +++- src/dstack/_internal/server/services/runs.py | 4 +++- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/dstack/_internal/core/models/runs.py b/src/dstack/_internal/core/models/runs.py index 1e7754e5b2..a58d7cda05 100644 --- a/src/dstack/_internal/core/models/runs.py +++ b/src/dstack/_internal/core/models/runs.py @@ -344,15 +344,17 @@ class JobSubmission(CoreModel): deployment_num: int = 0 # default for compatibility with pre-0.19.14 servers submitted_at: datetime last_processed_at: datetime - finished_at: Optional[datetime] - inactivity_secs: Optional[int] + finished_at: Optional[datetime] = None + inactivity_secs: Optional[int] = None status: JobStatus status_message: str = "" # default for backward compatibility - termination_reason: Optional[JobTerminationReason] - termination_reason_message: Optional[str] - exit_status: Optional[int] - job_provisioning_data: Optional[JobProvisioningData] - job_runtime_data: Optional[JobRuntimeData] + # termination_reason stores JobTerminationReason. + # str allows adding new enum members without breaking compatibility with old clients. + termination_reason: Optional[str] = None + termination_reason_message: Optional[str] = None + exit_status: Optional[int] = None + job_provisioning_data: Optional[JobProvisioningData] = None + job_runtime_data: Optional[JobRuntimeData] = None error: Optional[str] = None probes: list[Probe] = [] @@ -502,7 +504,9 @@ class Run(CoreModel): last_processed_at: datetime status: RunStatus status_message: str = "" # default for backward compatibility - termination_reason: Optional[RunTerminationReason] = None + # termination_reason stores RunTerminationReason. + # str allows adding new enum members without breaking compatibility with old clients. + termination_reason: Optional[str] = None run_spec: RunSpec jobs: List[Job] latest_job_submission: Optional[JobSubmission] = None diff --git a/src/dstack/_internal/server/background/tasks/process_runs.py b/src/dstack/_internal/server/background/tasks/process_runs.py index c0a81089d8..3371621a9d 100644 --- a/src/dstack/_internal/server/background/tasks/process_runs.py +++ b/src/dstack/_internal/server/background/tasks/process_runs.py @@ -574,7 +574,7 @@ def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datet if ( last_provisioned_submission.termination_reason is not None - and last_provisioned_submission.termination_reason.to_retry_event() + and JobTerminationReason(last_provisioned_submission.termination_reason).to_retry_event() in job.job_spec.retry.on_events ): return common.get_current_datetime() - last_provisioned_submission.last_processed_at diff --git a/src/dstack/_internal/server/services/jobs/__init__.py b/src/dstack/_internal/server/services/jobs/__init__.py index bb9cc68632..0e379b4d99 100644 --- a/src/dstack/_internal/server/services/jobs/__init__.py +++ b/src/dstack/_internal/server/services/jobs/__init__.py @@ -152,7 +152,9 @@ def job_model_to_job_submission( inactivity_secs=job_model.inactivity_secs, status=job_model.status, status_message=status_message, - termination_reason=job_model.termination_reason, + termination_reason=job_model.termination_reason.value + if job_model.termination_reason + else None, termination_reason_message=job_model.termination_reason_message, exit_status=job_model.exit_status, job_provisioning_data=job_provisioning_data, diff --git a/src/dstack/_internal/server/services/runs.py b/src/dstack/_internal/server/services/runs.py index e1f040ec1b..3c4714a2e5 100644 --- a/src/dstack/_internal/server/services/runs.py +++ b/src/dstack/_internal/server/services/runs.py @@ -714,7 +714,9 @@ def run_model_to_run( last_processed_at=run_model.last_processed_at, status=run_model.status, status_message=status_message, - termination_reason=run_model.termination_reason, + termination_reason=run_model.termination_reason.value + if run_model.termination_reason + else None, run_spec=run_spec, jobs=jobs, latest_job_submission=latest_job_submission, From b3cec13112015cb238573e5c177ac840d315cab7 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Wed, 6 Aug 2025 14:34:56 +0500 Subject: [PATCH 3/4] Start using JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED --- .../_internal/server/background/tasks/process_running_jobs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py index c7222a5230..9b98ed86ca 100644 --- a/src/dstack/_internal/server/background/tasks/process_running_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_running_jobs.py @@ -455,8 +455,7 @@ async def _wait_for_instance_provisioning_data(job_model: JobModel): if job_model.instance.status == InstanceStatus.TERMINATED: job_model.status = JobStatus.TERMINATING - # TODO use WAITING_INSTANCE_LIMIT_EXCEEDED after 0.19.x - job_model.termination_reason = JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY + job_model.termination_reason = JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED return job_model.job_provisioning_data = job_model.instance.job_provisioning_data From 6606911311f2ce834bd96f053522e1d48b644705 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Mon, 11 Aug 2025 10:25:43 +0500 Subject: [PATCH 4/4] Rebase migrations --- .../versions/74a1f55209bd_store_enums_as_strings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py b/src/dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py index d4048575d8..f98934cf3a 100644 --- a/src/dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +++ b/src/dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py @@ -1,7 +1,7 @@ """Store enums as strings Revision ID: 74a1f55209bd -Revises: 25479f540245 +Revises: 728b1488b1b4 Create Date: 2025-08-06 13:49:28.785378 """ @@ -12,7 +12,7 @@ # revision identifiers, used by Alembic. revision = "74a1f55209bd" -down_revision = "25479f540245" +down_revision = "728b1488b1b4" branch_labels = None depends_on = None