1616 gateway_connections_pool ,
1717)
1818from dstack ._internal .server .services .locking import advisory_lock_ctx , get_locker
19+ from dstack ._internal .server .services .logging import fmt
1920from dstack ._internal .utils .common import get_current_datetime
2021from dstack ._internal .utils .logging import get_logger
2122
@@ -27,14 +28,14 @@ async def process_gateways_connections():
2728 await _process_active_connections ()
2829
2930
30- async def process_submitted_gateways ():
31+ async def process_gateways ():
3132 lock , lockset = get_locker (get_db ().dialect_name ).get_lockset (GatewayModel .__tablename__ )
3233 async with get_session_ctx () as session :
3334 async with lock :
3435 res = await session .execute (
3536 select (GatewayModel )
3637 .where (
37- GatewayModel .status == GatewayStatus .SUBMITTED ,
38+ GatewayModel .status . in_ ([ GatewayStatus .SUBMITTED , GatewayStatus . PROVISIONING ]) ,
3839 GatewayModel .id .not_in (lockset ),
3940 )
4041 .options (lazyload (GatewayModel .gateway_compute ))
@@ -48,7 +49,25 @@ async def process_submitted_gateways():
4849 lockset .add (gateway_model .id )
4950 try :
5051 gateway_model_id = gateway_model .id
51- await _process_submitted_gateway (session = session , gateway_model = gateway_model )
52+ initial_status = gateway_model .status
53+ if initial_status == GatewayStatus .SUBMITTED :
54+ await _process_submitted_gateway (session = session , gateway_model = gateway_model )
55+ elif initial_status == GatewayStatus .PROVISIONING :
56+ await _process_provisioning_gateway (session = session , gateway_model = gateway_model )
57+ else :
58+ logger .error (
59+ "%s: unexpected gateway status %r" , fmt (gateway_model ), initial_status .upper ()
60+ )
61+ if gateway_model .status != initial_status :
62+ logger .info (
63+ "%s: gateway status has changed %s -> %s%s" ,
64+ fmt (gateway_model ),
65+ initial_status .upper (),
66+ gateway_model .status .upper (),
67+ f": { gateway_model .status_message } " if gateway_model .status_message else "" ,
68+ )
69+ gateway_model .last_processed_at = get_current_datetime ()
70+ await session .commit ()
5271 finally :
5372 lockset .difference_update ([gateway_model_id ])
5473
@@ -89,7 +108,7 @@ async def _process_connection(conn: GatewayConnection):
89108
90109
91110async def _process_submitted_gateway (session : AsyncSession , gateway_model : GatewayModel ):
92- logger .info ("Started gateway %s provisioning" , gateway_model . name )
111+ logger .info ("%s: started gateway provisioning" , fmt ( gateway_model ) )
93112 # Refetch to load related attributes.
94113 # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
95114 res = await session .execute (
@@ -110,8 +129,6 @@ async def _process_submitted_gateway(session: AsyncSession, gateway_model: Gatew
110129 except BackendNotAvailable :
111130 gateway_model .status = GatewayStatus .FAILED
112131 gateway_model .status_message = "Backend not available"
113- gateway_model .last_processed_at = get_current_datetime ()
114- await session .commit ()
115132 return
116133
117134 try :
@@ -123,53 +140,54 @@ async def _process_submitted_gateway(session: AsyncSession, gateway_model: Gatew
123140 )
124141 session .add (gateway_model )
125142 gateway_model .status = GatewayStatus .PROVISIONING
126- await session .commit ()
127- await session .refresh (gateway_model )
128143 except BackendError as e :
129- logger .info (
130- "Failed to create gateway compute for gateway %s: %s" , gateway_model .name , repr (e )
131- )
144+ logger .info ("%s: failed to create gateway compute: %r" , fmt (gateway_model ), e )
132145 gateway_model .status = GatewayStatus .FAILED
133146 status_message = f"Backend error: { repr (e )} "
134147 if len (e .args ) > 0 :
135148 status_message = str (e .args [0 ])
136149 gateway_model .status_message = status_message
137- gateway_model .last_processed_at = get_current_datetime ()
138- await session .commit ()
139- return
140150 except Exception as e :
141- logger .exception (
142- "Got exception when creating gateway compute for gateway %s" , gateway_model .name
143- )
151+ logger .exception ("%s: got exception when creating gateway compute" , fmt (gateway_model ))
144152 gateway_model .status = GatewayStatus .FAILED
145153 gateway_model .status_message = f"Unexpected error: { repr (e )} "
146- gateway_model .last_processed_at = get_current_datetime ()
147- await session .commit ()
148- return
149154
155+
async def _process_provisioning_gateway(
    session: AsyncSession, gateway_model: GatewayModel
) -> None:
    """Connect to a provisioning gateway and configure it, recording the outcome.

    The gateway row is refetched through ``session`` and then updated in place:

    * no connection could be established -> status ``FAILED`` and the gateway
      compute is flagged as deleted;
    * configuration raised -> status ``FAILED``, the connection is removed from
      the pool and the gateway compute is flagged as inactive;
    * otherwise -> status ``RUNNING``.

    Nothing is committed here; the changes are left on the session.

    Args:
        session: Session used to refetch the gateway.
        gateway_model: The gateway to process; only its ``id`` is read before
            the refetch.
    """
    # Refetch to load related attributes.
    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
    refetch_stmt = (
        select(GatewayModel)
        .where(GatewayModel.id == gateway_model.id)
        .execution_options(populate_existing=True)
    )
    model = (await session.execute(refetch_stmt)).unique().scalar_one()

    # FIXME: problems caused by blocking on connect_to_gateway_with_retry and configure_gateway:
    # - cannot delete the gateway before it is provisioned because the DB model is locked
    # - connection retry counter is reset on server restart
    # - only one server replica is processing the gateway
    # Easy to fix by doing only one connection/configuration attempt per processing iteration. The
    # main challenge is applying the same provisioning model to the dstack Sky gateway to avoid
    # maintaining a different model for Sky.
    compute = model.gateway_compute
    connection = await gateways_services.connect_to_gateway_with_retry(compute)
    if connection is None:
        model.status = GatewayStatus.FAILED
        model.status_message = "Failed to connect to gateway"
        compute.deleted = True
        return

    try:
        await gateways_services.configure_gateway(connection)
    except Exception:
        logger.exception("%s: failed to configure gateway", fmt(model))
        model.status = GatewayStatus.FAILED
        model.status_message = "Failed to configure gateway"
        # Drop the pooled connection before marking the compute as inactive.
        await gateway_connections_pool.remove(compute.ip_address)
        compute.active = False
    else:
        model.status = GatewayStatus.RUNNING
0 commit comments