From 2ae9c3fd380ca424729d6690cf8ccbc476aadef4 Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Wed, 18 Mar 2026 19:41:12 +0530 Subject: [PATCH 1/2] [FIX] Activate litellm retry for all LLM providers litellm's wrapper-level retry (completion_with_retries) works for all providers including httpx-based ones (Anthropic, Vertex, Bedrock, Mistral, Azure AI Foundry), but only activates when num_retries is set in kwargs. Our adapters pass max_retries (from user UI config) which only works for SDK-based providers (OpenAI, Azure). httpx-based providers silently ignored it, resulting in zero retries on transient errors (500, 502, 503). Bridge the gap by copying the user's max_retries value into num_retries and setting retry_strategy to exponential_backoff_retry before calling litellm.completion(). litellm internally zeroes max_retries during wrapper retries to prevent double-retry with SDK providers. Co-Authored-By: Claude Opus 4.6 (1M context) --- unstract/sdk1/src/unstract/sdk1/llm.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/unstract/sdk1/src/unstract/sdk1/llm.py b/unstract/sdk1/src/unstract/sdk1/llm.py index b907cd3961..a96d3679ce 100644 --- a/unstract/sdk1/src/unstract/sdk1/llm.py +++ b/unstract/sdk1/src/unstract/sdk1/llm.py @@ -207,6 +207,7 @@ def complete(self, prompt: str, **kwargs: object) -> dict[str, object]: completion_kwargs = self.adapter.validate({**self.kwargs, **kwargs}) completion_kwargs.pop("cost_model", None) + self._set_litellm_retry_params(completion_kwargs) # if hasattr(self, "model") and self.model not in O1_MODELS: # completion_kwargs["temperature"] = 0.003 @@ -295,6 +296,7 @@ def stream_complete( completion_kwargs = self.adapter.validate({**self.kwargs, **kwargs}) completion_kwargs.pop("cost_model", None) + self._set_litellm_retry_params(completion_kwargs) for chunk in litellm.completion( messages=messages, @@ -363,6 +365,7 @@ async def acomplete(self, prompt: str, **kwargs: object) -> dict[str, object]: completion_kwargs = self.adapter.validate({**self.kwargs, **kwargs}) completion_kwargs.pop("cost_model", None) + self._set_litellm_retry_params(completion_kwargs) response = await litellm.acompletion( messages=messages, @@ -454,6 +457,24 @@ def get_metrics(self) -> dict[str, object]: def get_usage_reason(self) -> object: return self.platform_kwargs.get("llm_usage_reason") + @staticmethod + def _set_litellm_retry_params(completion_kwargs: dict) -> None: + """Activate litellm's wrapper-level retry for all providers. + + litellm's retry mechanism (completion_with_retries) only activates when + num_retries is set. Our adapters pass max_retries (from user UI config) + which only works for SDK-based providers (OpenAI, Azure). This bridges + the gap by copying max_retries into num_retries so httpx-based providers + (Anthropic, Vertex, Bedrock, Mistral, etc.) also get retries. + + litellm internally sets max_retries=0 during wrapper retries to prevent + double-retry with SDK providers. + """ + max_retries = completion_kwargs.get("max_retries") + if max_retries: + completion_kwargs["num_retries"] = max_retries + completion_kwargs["retry_strategy"] = "exponential_backoff_retry" + def _record_usage( self, model: str, From 94670157a0be404e761952668e461d93823fc62c Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Wed, 18 Mar 2026 20:38:38 +0530 Subject: [PATCH 2/2] [FIX] Zero out max_retries to prevent double-retry with SDK providers SDK-based providers (OpenAI, Azure) default to max_retries=2 internally even when not explicitly set. Without zeroing it, the first attempt exhausts SDK retries before the wrapper retry kicks in, multiplying total attempts (e.g. 5 SDK + 5 wrapper = 11 instead of expected 5). Setting max_retries=0 ensures all retries go through litellm's wrapper uniformly across all providers. Co-Authored-By: Claude Opus 4.6 (1M context) --- unstract/sdk1/src/unstract/sdk1/llm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unstract/sdk1/src/unstract/sdk1/llm.py b/unstract/sdk1/src/unstract/sdk1/llm.py index a96d3679ce..5bcd5ca66a 100644 --- a/unstract/sdk1/src/unstract/sdk1/llm.py +++ b/unstract/sdk1/src/unstract/sdk1/llm.py @@ -467,12 +467,14 @@ def _set_litellm_retry_params(completion_kwargs: dict) -> None: the gap by copying max_retries into num_retries so httpx-based providers (Anthropic, Vertex, Bedrock, Mistral, etc.) also get retries. - litellm internally sets max_retries=0 during wrapper retries to prevent - double-retry with SDK providers. + SDK-based providers (OpenAI, Azure) default to max_retries=2 internally, + which would multiply with wrapper retries. Setting max_retries=0 ensures + all retries go through the wrapper uniformly. """ max_retries = completion_kwargs.get("max_retries") if max_retries: completion_kwargs["num_retries"] = max_retries + completion_kwargs["max_retries"] = 0 completion_kwargs["retry_strategy"] = "exponential_backoff_retry" def _record_usage(