File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree
Original file line number Diff line number Diff line change
@@ -23,6 +23,7 @@ def get_default_retryable_exceptions() -> Set[Type[Exception]]:
2323 return _default_retryable_exceptions
2424
2525 # Lazy imports (these are expensive)
26+ import aiohttp
2627 import httpx
2728 import litellm
2829 import requests
@@ -32,6 +33,9 @@ def get_default_retryable_exceptions() -> Set[Type[Exception]]:
3233 ConnectionError , # type: ignore[assignment]
3334 TimeoutError , # type: ignore[assignment]
3435 OSError , # type: ignore[assignment] # Covers network-related OS errors
36+ # aiohttp library exceptions
37+ aiohttp .ClientConnectionError ,
38+ aiohttp .ServerDisconnectedError ,
3539 # Requests library exceptions
3640 requests .exceptions .ConnectionError ,
3741 requests .exceptions .Timeout ,
Original file line number Diff line number Diff line change
@@ -104,11 +104,25 @@ async def _process_row(row: EvaluationRow) -> EvaluationRow:
104104 try :
105105 session = self ._get_or_create_session ()
106106 async with session .post (init_url , json = init_payload .model_dump (), timeout = timeout_init ) as resp :
107+ if resp .status >= 500 :
108+ body = await resp .text ()
109+ raise ConnectionError (f"Remote /init returned server error (HTTP { resp .status } ): { body } " )
107110 if resp .status >= 400 :
108111 body = await resp .text ()
109112 raise RuntimeError (f"Remote /init failed (HTTP { resp .status } ): { body } " )
110113 resp .raise_for_status ()
111114 await resp .read () # Drain the response body and release the connection back to the pool
115+ except asyncio .CancelledError :
116+ # Distinguish intentional cancellation (Ctrl+C, test teardown) from
117+ # aiohttp-internal cancellation caused by a poisoned DNS resolver
118+ # after a server disconnect.
119+ current = asyncio .current_task ()
120+ if current is not None and current .cancelled ():
121+ raise # Intentional cancellation — propagate immediately
122+ # Network-level failure; discard the session so retries get a
123+ # fresh connection pool.
124+ self ._session = None
125+ raise ConnectionError ("Remote server connection lost (request cancelled)" )
112126 except asyncio .TimeoutError :
113127 raise TimeoutError (
114128 f"The /init endpoint tried { init_url } with { init_payload .model_dump ()} but timed out after 300 seconds."
You can’t perform that action at this time.
0 commit comments