@@ -198,12 +198,28 @@ def __init__(self, *,
198198 if resiliency_options is not None
199199 else GrpcClientResiliencyOptions ()
200200 )
201+ # Resiliency state must be initialised BEFORE the interceptor is
202+ # constructed because the interceptor receives a bound reference to
203+ # ``self._schedule_recreate``; any failure handled during construction
204+ # of the underlying channel could otherwise observe a half-built
205+ # client.
206+ self ._closing = False
207+ self ._last_recreate_time = 0.0
208+ self ._recreate_lock = threading .Lock ()
209+ self ._retired_channels : dict [grpc .Channel , threading .Timer ] = {}
210+ self ._recreate_thread_lock = threading .Lock ()
211+ self ._recreate_thread : Optional [threading .Thread ] = None
212+ # Test seam: set after each fire-and-forget recreate attempt finishes
213+ # (whether it actually recreated the channel or short-circuited on
214+ # close / cooldown). Lets tests synchronise without polling. Note that
215+ # ``close()`` waits by joining the recreate thread, not via this event.
216+ self ._recreate_done_event = threading .Event ()
201217 self ._client_failure_tracker = FailureTracker (
202218 self ._resiliency_options .channel_recreate_failure_threshold
203219 )
204220 self ._resiliency_interceptor = ClientResiliencyInterceptor (
205221 self ._client_failure_tracker ,
206- self ._maybe_recreate_channel ,
222+ self ._schedule_recreate ,
207223 )
208224 resolved_interceptors = (
209225 prepare_sync_interceptors (metadata , interceptors ) if channel is None else interceptors
@@ -230,10 +246,6 @@ def __init__(self, *,
230246 # can prepend the interceptor themselves via grpc.intercept_channel.
231247 self ._channel = channel
232248 self ._stub = stubs .TaskHubSidecarServiceStub (channel )
233- self ._closing = False
234- self ._last_recreate_time = 0.0
235- self ._recreate_lock = threading .Lock ()
236- self ._retired_channels : dict [grpc .Channel , threading .Timer ] = {}
237249 self ._logger = shared .get_logger ("client" , log_handler , log_formatter )
238250 self .default_version = default_version
239251 self ._payload_store = payload_store
@@ -252,6 +264,48 @@ def _compose_interceptors(
252264 composed .extend (user_interceptors )
253265 return composed
254266
def _schedule_recreate(self) -> None:
    """Spawn a daemon thread that recreates the channel fire-and-forget.

    Passed to the resiliency interceptor as its recreate callback and
    invoked on the caller's thread. The method returns as soon as the
    worker thread has been started, so the failing RPC's original error
    is not delayed by the recreate itself (DNS, TLS handshake, or
    contention on ``_recreate_lock``).

    Single-flight under ``_recreate_thread_lock``: if a recreate thread is
    still alive, the new trigger is dropped. ``thread.start()`` is called
    under the lock so a follow-up caller's ``is_alive()`` check observes
    the running state rather than racing the start.

    ``_closing`` is checked twice: once without the lock as a cheap fast
    path, and again under the lock so that a trigger racing ``close()``
    cannot start a worker after ``close()`` has already sampled
    ``_recreate_thread`` (such a worker would never be joined).

    If the worker cannot be started, the previous ``_recreate_thread`` is
    restored and ``_recreate_done_event`` is set again so that nothing
    waiting on the event blocks forever on an attempt that never ran.

    Never raises: any ``Exception`` is logged via ``self._logger``.
    """
    try:
        if self._closing:
            return
        with self._recreate_thread_lock:
            # Re-check under the lock; close() sets ``_closing`` before it
            # takes this lock to look for a thread to join.
            if self._closing:
                return
            previous = self._recreate_thread
            if previous is not None and previous.is_alive():
                return
            worker = threading.Thread(
                target=self._run_recreate,
                name="durabletask-client-recreate",
                daemon=True,
            )
            self._recreate_done_event.clear()
            self._recreate_thread = worker
            try:
                worker.start()
            except Exception:
                # The attempt never ran: undo the bookkeeping so waiters
                # on the done event are released, then let the outer
                # handler log the failure.
                self._recreate_thread = previous
                self._recreate_done_event.set()
                raise
    except Exception:
        self._logger.exception("Failed to schedule channel recreate")
300+
def _run_recreate(self) -> None:
    """Run one channel-recreate attempt and always signal its completion.

    Used as the ``target`` of the thread started by
    ``_schedule_recreate``. Any ``Exception`` raised by
    ``_maybe_recreate_channel`` is logged through ``self._logger`` rather
    than propagated. ``_recreate_done_event`` is set on every exit path.
    """
    try:
        try:
            self._maybe_recreate_channel()
        except Exception:
            # Log and swallow; the traceback is recorded by the logger.
            self._logger.exception("Channel recreate failed")
    finally:
        # Signal completion regardless of how the attempt ended.
        self._recreate_done_event.set()
308+
255309 def _maybe_recreate_channel (self ) -> None :
256310 if not self ._owns_channel or self ._closing :
257311 return
@@ -296,8 +350,14 @@ def close(self) -> None:
296350 it.
297351 """
298352 if self ._owns_channel :
353+ # Signal early so any in-flight recreate thread bails out of
354+ # ``_maybe_recreate_channel`` before we tear the channel down.
355+ self ._closing = True
356+ with self ._recreate_thread_lock :
357+ recreate_thread = self ._recreate_thread
358+ if recreate_thread is not None and recreate_thread .is_alive ():
359+ recreate_thread .join (timeout = 5.0 )
299360 with self ._recreate_lock :
300- self ._closing = True
301361 retired_channels = list (self ._retired_channels .items ())
302362 self ._retired_channels .clear ()
303363 current_channel = self ._channel
@@ -628,12 +688,28 @@ def __init__(self, *,
628688 if resiliency_options is not None
629689 else GrpcClientResiliencyOptions ()
630690 )
691+ # Resiliency state must be initialised BEFORE the interceptor is
692+ # constructed because the interceptor receives a bound reference to
693+ # ``self._schedule_recreate``; any failure handled during construction
694+ # of the underlying channel could otherwise observe a half-built
695+ # client.
696+ self ._closing = False
697+ self ._recreate_lock = asyncio .Lock ()
698+ self ._last_recreate_time = 0.0
699+ self ._retired_channels : list [grpc .aio .Channel ] = []
700+ self ._retired_channel_close_tasks : set [asyncio .Task [None ]] = set ()
701+ self ._recreate_task : Optional [asyncio .Task [None ]] = None
702+ # Test seam: set after each fire-and-forget recreate attempt finishes
703+ # (whether it actually recreated the channel or short-circuited on
704+ # close / cooldown). Lets tests synchronise without polling. Note that
705+ # ``close()`` awaits ``_recreate_task`` directly, not this event.
706+ self ._recreate_done_event = asyncio .Event ()
631707 self ._client_failure_tracker = FailureTracker (
632708 self ._resiliency_options .channel_recreate_failure_threshold
633709 )
634710 self ._resiliency_interceptor = AsyncClientResiliencyInterceptor (
635711 self ._client_failure_tracker ,
636- self ._maybe_recreate_channel ,
712+ self ._schedule_recreate ,
637713 )
638714 resolved_interceptors = (
639715 prepare_async_interceptors (metadata , interceptors ) if channel is None else interceptors
@@ -660,11 +736,6 @@ def __init__(self, *,
660736 # resiliency should let us create the channel.
661737 self ._channel = channel
662738 self ._stub = stubs .TaskHubSidecarServiceStub (channel )
663- self ._closing = False
664- self ._recreate_lock = asyncio .Lock ()
665- self ._last_recreate_time = 0.0
666- self ._retired_channels : list [grpc .aio .Channel ] = []
667- self ._retired_channel_close_tasks : set [asyncio .Task [None ]] = set ()
668739 self ._logger = shared .get_logger ("async_client" , log_handler , log_formatter )
669740 self .default_version = default_version
670741 self ._payload_store = payload_store
@@ -688,7 +759,17 @@ async def close(self) -> None:
688759 it.
689760 """
690761 if self ._owns_channel :
762+ # Signal early so any in-flight recreate task bails out of
763+ # ``_maybe_recreate_channel`` before we tear the channel down.
691764 self ._closing = True
765+ recreate_task = self ._recreate_task
766+ if recreate_task is not None and not recreate_task .done ():
767+ try :
768+ await recreate_task
769+ except Exception :
770+ # Already logged by ``_run_recreate``; suppressing here
771+ # ensures close() always tears down cleanly.
772+ pass
692773 async with self ._recreate_lock :
693774 retired_channels = list (self ._retired_channels )
694775 self ._retired_channels .clear ()
@@ -708,6 +789,34 @@ async def __aenter__(self):
708789 async def __aexit__ (self , exc_type , exc_val , exc_tb ):
709790 await self .close ()
710791
def _schedule_recreate(self) -> None:
    """Schedule a fire-and-forget channel recreate on the running loop.

    Passed to the async resiliency interceptor as its recreate callback.
    Single-flight: if ``_recreate_task`` is still pending, the trigger is
    dropped. The ``done()`` check and the task creation happen without an
    intervening ``await``, so no extra lock is used.

    The running loop is resolved *before* any state is touched. When
    there is no running loop the failure is logged and nothing else
    happens: ``_recreate_done_event`` is not cleared and no coroutine
    object is created, which avoids a "coroutine was never awaited"
    warning and an event that would never be set again.

    Never raises: any ``Exception`` is logged via ``self._logger``.
    """
    try:
        if self._closing:
            return
        pending = self._recreate_task
        if pending is not None and not pending.done():
            return
        # Raises RuntimeError when called outside a running event loop;
        # do this first so the bookkeeping below only happens when the
        # task can actually be created.
        loop = asyncio.get_running_loop()
        self._recreate_done_event.clear()
        self._recreate_task = loop.create_task(self._run_recreate())
    except Exception:
        self._logger.exception("Failed to schedule channel recreate")
811+
async def _run_recreate(self) -> None:
    """Await one channel-recreate attempt and always signal completion.

    This is the coroutine wrapped in a task by ``_schedule_recreate``.
    Any ``Exception`` raised by ``_maybe_recreate_channel`` is logged
    through ``self._logger`` rather than propagated.
    ``_recreate_done_event`` is set on every exit path.
    """
    try:
        try:
            await self._maybe_recreate_channel()
        except Exception:
            # Log and swallow; the traceback is recorded by the logger.
            self._logger.exception("Channel recreate failed")
    finally:
        # Signal completion regardless of how the attempt ended.
        self._recreate_done_event.set()
819+
711820 async def _maybe_recreate_channel (self ) -> None :
712821 if not self ._owns_channel or self ._closing :
713822 return
0 commit comments