1919from awscrt import mqtt
2020from awscrt .exceptions import AwsCrtError
2121
22- from .auth import NavienAuthClient
22+ from .auth import (
23+ AuthenticationError ,
24+ NavienAuthClient ,
25+ TokenRefreshError ,
26+ )
2327from .events import EventEmitter
2428from .models import (
2529 Device ,
@@ -205,7 +209,8 @@ def _schedule_coroutine(self, coro: Any) -> None:
205209 # Schedule the coroutine in the stored loop using thread-safe method
206210 try :
207211 asyncio .run_coroutine_threadsafe (coro , self ._loop )
208- except Exception as e :
212+ except RuntimeError as e :
213+ # Event loop is closed or not running
209214 _logger .error (f"Failed to schedule coroutine: { e } " , exc_info = True )
210215
211216 def _on_connection_interrupted_internal (
@@ -218,7 +223,6 @@ def _on_connection_interrupted_internal(
218223 error: Error that caused the interruption
219224 **kwargs: Forward-compatibility kwargs from AWS SDK
220225 """
221- _logger .warning (f"Connection interrupted: { error } " )
222226 self ._connected = False
223227
224228 # Emit event
@@ -232,7 +236,7 @@ def _on_connection_interrupted_internal(
232236 # Fallback for callbacks expecting no arguments
233237 try :
234238 self ._on_connection_interrupted () # type: ignore
235- except Exception as e :
239+ except ( TypeError , AttributeError ) as e :
236240 _logger .error (
237241 f"Error in connection_interrupted callback: { e } "
238242 )
@@ -339,12 +343,113 @@ async def _active_reconnect(self) -> None:
339343 "No connection manager available for reconnection"
340344 )
341345
342- except Exception as e :
346+ except ( AwsCrtError , AuthenticationError , RuntimeError ) as e :
343347 _logger .error (
344348 f"Error during active reconnection: { e } " , exc_info = True
345349 )
346350 raise
347351
async def _deep_reconnect(self) -> None:
    """
    Rebuild the MQTT connection from scratch.

    Intended as the heavier fallback once quick reconnection attempts
    have not helped. Does nothing when ``self._connected`` is already
    true. Otherwise the sequence is:

    1. Disconnect the current connection manager if it reports being
       connected. ``AwsCrtError`` / ``RuntimeError`` raised by that
       cleanup are logged at debug level and otherwise ignored.
    2. Refresh the auth tokens with the stored refresh token. When that
       is not possible (no refresh token, or the refresh raises), fall
       back to ``re_authenticate()`` if the auth client has stored
       credentials; without them the triggering error is re-raised.
    3. Replace ``self._connection_manager`` with a new
       ``MqttConnection`` and connect it.
    4. On success, store the new connection on ``self._connection``,
       mark the client connected, and ask the subscription manager
       (when present) to resubscribe everything.

    A connect attempt that returns a falsy result is logged as an error
    and the method returns normally without raising.

    Raises:
        AwsCrtError, AuthenticationError, RuntimeError, ValueError:
            Logged together with the traceback, then re-raised as is.
    """
    if self._connected:
        _logger.debug("Already connected, skipping deep reconnection")
        return

    _logger.warning(
        "Performing deep reconnection (full rebuild)... "
        "This may take longer."
    )

    try:
        # Step 1: tear down whatever is left of the previous connection.
        stale_manager = self._connection_manager
        if stale_manager:
            _logger.debug("Cleaning up old connection...")
            try:
                if stale_manager.is_connected:
                    await stale_manager.disconnect()
            except (AwsCrtError, RuntimeError) as e:
                # A dead or half-broken connection is the expected case
                # here, so a failed disconnect must not abort the rebuild.
                _logger.debug(f"Error during cleanup: { e } (expected)")

        # Step 2: obtain fresh credentials before reconnecting.
        _logger.debug("Refreshing authentication tokens...")
        try:
            tokens = self._auth_client.current_tokens
            if not (tokens and tokens.refresh_token):
                _logger.warning("No refresh token available")
                # Raised on purpose so the handler below decides between
                # full re-authentication and giving up.
                raise ValueError("No refresh token available for refresh")
            await self._auth_client.refresh_token(tokens.refresh_token)
        except (TokenRefreshError, ValueError, AuthenticationError) as e:
            if not self._auth_client.has_stored_credentials:
                _logger.error(
                    "Cannot re-authenticate: no stored credentials"
                )
                raise
            # Refresh was not possible; log in again with the stored
            # credentials instead.
            _logger.warning(
                f"Token refresh failed: { e } . Attempting full "
                "re-authentication..."
            )
            await self._auth_client.re_authenticate()

        # Step 3: swap in a brand-new connection manager.
        _logger.debug("Creating new connection manager...")
        fresh_manager = MqttConnection(
            config=self.config,
            auth_client=self._auth_client,
            on_connection_interrupted=self._on_connection_interrupted_internal,
            on_connection_resumed=self._on_connection_resumed_internal,
        )
        self._connection_manager = fresh_manager

        # Step 4: connect; a falsy result is reported but not raised.
        if not await fresh_manager.connect():
            _logger.error("Deep reconnection failed to connect")
            return

        # Publish the new connection before marking the client connected.
        self._connection = self._connection_manager.connection
        self._connected = True

        # Step 5: move existing subscriptions over to the new connection.
        if self._subscription_manager and self._connection:
            _logger.debug("Re-establishing subscriptions...")
            self._subscription_manager.update_connection(self._connection)
            await self._subscription_manager.resubscribe_all()

        _logger.info(
            "Deep reconnection successful - fully rebuilt connection"
        )

    except (
        AwsCrtError,
        AuthenticationError,
        RuntimeError,
        ValueError,
    ) as e:
        # Logger.exception logs at ERROR level with the active traceback.
        _logger.exception(f"Error during deep reconnection: { e } ")
        raise
452+
348453 async def connect (self ) -> bool :
349454 """
350455 Establish connection to AWS IoT Core.
@@ -394,6 +499,7 @@ async def connect(self) -> bool:
394499 is_connected_func = lambda : self ._connected ,
395500 schedule_coroutine_func = self ._schedule_coroutine ,
396501 reconnect_func = self ._active_reconnect ,
502+ deep_reconnect_func = self ._deep_reconnect ,
397503 emit_event_func = self .emit ,
398504 )
399505 self ._reconnection_handler .enable ()
@@ -428,7 +534,12 @@ async def connect(self) -> bool:
428534
429535 return False
430536
431- except Exception as e :
537+ except (
538+ AwsCrtError ,
539+ AuthenticationError ,
540+ RuntimeError ,
541+ ValueError ,
542+ ) as e :
432543 _logger .error (f"Failed to connect: { e } " )
433544 raise
434545
@@ -473,7 +584,7 @@ async def disconnect(self) -> None:
473584 self ._connection = None
474585
475586 _logger .info ("Disconnected successfully" )
476- except Exception as e :
587+ except ( AwsCrtError , RuntimeError ) as e :
477588 _logger .error (f"Error during disconnect: { e } " )
478589 raise
479590
@@ -493,7 +604,7 @@ def _on_message_received(
493604
494605 except json .JSONDecodeError as e :
495606 _logger .error (f"Failed to parse message payload: { e } " )
496- except Exception as e :
607+ except ( AttributeError , KeyError , TypeError ) as e :
497608 _logger .error (f"Error processing message: { e } " )
498609
499610 def _topic_matches_pattern (self , topic : str , pattern : str ) -> bool :
@@ -618,12 +729,11 @@ async def publish(
618729
619730 try :
620731 return await self ._connection_manager .publish (topic , payload , qos )
621- except Exception as e :
732+ except AwsCrtError as e :
622733 # Handle clean session cancellation gracefully
623- # Check exception type and name attribute for proper
624- # error identification
734+ # Safely check e.name attribute (may not exist or be None)
625735 if (
626- isinstance (e , AwsCrtError )
736+ hasattr (e , "name" )
627737 and e .name == "AWS_ERROR_MQTT_CANCELLED_FOR_CLEAN_SESSION"
628738 ):
629739 _logger .warning (
@@ -641,9 +751,9 @@ async def publish(
641751 raise RuntimeError (
642752 "Publish cancelled due to clean session and "
643753 "command queue is disabled"
644- )
754+ ) from e
645755
646- # Note: redact_topic is already used elsewhere in the file
756+ # Other AWS CRT errors
647757 _logger .error (f"Failed to publish to topic: { e } " )
648758 raise
649759
0 commit comments