Skip to content

Commit bbd8d24

Browse files
committed
retry on connection errors
1 parent 69cb5dc commit bbd8d24

2 files changed

Lines changed: 18 additions & 0 deletions

File tree

eval_protocol/pytest/exception_config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def get_default_retryable_exceptions() -> Set[Type[Exception]]:
2323
return _default_retryable_exceptions
2424

2525
# Lazy imports (these are expensive)
26+
import aiohttp
2627
import httpx
2728
import litellm
2829
import requests
@@ -32,6 +33,9 @@ def get_default_retryable_exceptions() -> Set[Type[Exception]]:
3233
ConnectionError, # type: ignore[assignment]
3334
TimeoutError, # type: ignore[assignment]
3435
OSError, # type: ignore[assignment] # Covers network-related OS errors
36+
# aiohttp library exceptions
37+
aiohttp.ClientConnectionError,
38+
aiohttp.ServerDisconnectedError,
3539
# Requests library exceptions
3640
requests.exceptions.ConnectionError,
3741
requests.exceptions.Timeout,

eval_protocol/pytest/remote_rollout_processor.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,11 +104,25 @@ async def _process_row(row: EvaluationRow) -> EvaluationRow:
104104
try:
105105
session = self._get_or_create_session()
106106
async with session.post(init_url, json=init_payload.model_dump(), timeout=timeout_init) as resp:
107+
if resp.status >= 500:
108+
body = await resp.text()
109+
raise ConnectionError(f"Remote /init returned server error (HTTP {resp.status}): {body}")
107110
if resp.status >= 400:
108111
body = await resp.text()
109112
raise RuntimeError(f"Remote /init failed (HTTP {resp.status}): {body}")
110113
resp.raise_for_status()
111114
await resp.read() # Drain the response body and release the connection back to the pool
115+
except asyncio.CancelledError:
116+
# Distinguish intentional cancellation (Ctrl+C, test teardown) from
117+
# aiohttp-internal cancellation caused by a poisoned DNS resolver
118+
# after a server disconnect.
119+
current = asyncio.current_task()
120+
if current is not None and current.cancelled():
121+
raise # Intentional cancellation — propagate immediately
122+
# Network-level failure; discard the session so retries get a
123+
# fresh connection pool.
124+
self._session = None
125+
raise ConnectionError("Remote server connection lost (request cancelled)")
112126
except asyncio.TimeoutError:
113127
raise TimeoutError(
114128
f"The /init endpoint tried {init_url} with {init_payload.model_dump()} but timed out after 300 seconds."

0 commit comments

Comments
 (0)