From 2ae9c3fd380ca424729d6690cf8ccbc476aadef4 Mon Sep 17 00:00:00 2001
From: Chandrasekharan M <chandrasekharan@zipstack.com>
Date: Wed, 18 Mar 2026 19:41:12 +0530
Subject: [PATCH 1/2] [FIX] Activate litellm retry for all LLM providers

litellm's wrapper-level retry (completion_with_retries) works for all
providers including httpx-based ones (Anthropic, Vertex, Bedrock, Mistral,
Azure AI Foundry), but only activates when num_retries is set in kwargs.

Our adapters pass max_retries (from user UI config), which is honored
only by SDK-based providers (OpenAI, Azure). httpx-based providers
silently ignore it, resulting in zero retries on transient errors
(500, 502, 503).

Bridge the gap by copying the user's max_retries value into num_retries
and setting retry_strategy to exponential_backoff_retry before calling
litellm.completion(). litellm internally zeroes max_retries during
wrapper retries to prevent double-retry with SDK providers.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 unstract/sdk1/src/unstract/sdk1/llm.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/unstract/sdk1/src/unstract/sdk1/llm.py b/unstract/sdk1/src/unstract/sdk1/llm.py
index b907cd3961..a96d3679ce 100644
--- a/unstract/sdk1/src/unstract/sdk1/llm.py
+++ b/unstract/sdk1/src/unstract/sdk1/llm.py
@@ -207,6 +207,7 @@ def complete(self, prompt: str, **kwargs: object) -> dict[str, object]:
 
             completion_kwargs = self.adapter.validate({**self.kwargs, **kwargs})
             completion_kwargs.pop("cost_model", None)
+            self._set_litellm_retry_params(completion_kwargs)
 
             # if hasattr(self, "model") and self.model not in O1_MODELS:
             #     completion_kwargs["temperature"] = 0.003
@@ -295,6 +296,7 @@ def stream_complete(
 
             completion_kwargs = self.adapter.validate({**self.kwargs, **kwargs})
             completion_kwargs.pop("cost_model", None)
+            self._set_litellm_retry_params(completion_kwargs)
 
             for chunk in litellm.completion(
                 messages=messages,
@@ -363,6 +365,7 @@ async def acomplete(self, prompt: str, **kwargs: object) -> dict[str, object]:
 
             completion_kwargs = self.adapter.validate({**self.kwargs, **kwargs})
             completion_kwargs.pop("cost_model", None)
+            self._set_litellm_retry_params(completion_kwargs)
 
             response = await litellm.acompletion(
                 messages=messages,
@@ -454,6 +457,24 @@ def get_metrics(self) -> dict[str, object]:
     def get_usage_reason(self) -> object:
         return self.platform_kwargs.get("llm_usage_reason")
 
+    @staticmethod
+    def _set_litellm_retry_params(completion_kwargs: dict) -> None:
+        """Activate litellm's wrapper-level retry for all providers.
+
+        litellm's retry mechanism (completion_with_retries) only activates when
+        num_retries is set. Our adapters pass max_retries (from user UI config)
+        which only works for SDK-based providers (OpenAI, Azure). This bridges
+        the gap by copying max_retries into num_retries so httpx-based providers
+        (Anthropic, Vertex, Bedrock, Mistral, etc.) also get retries.
+
+        litellm internally sets max_retries=0 during wrapper retries to prevent
+        double-retry with SDK providers.
+        """
+        max_retries = completion_kwargs.get("max_retries")
+        if max_retries:
+            completion_kwargs["num_retries"] = max_retries
+            completion_kwargs["retry_strategy"] = "exponential_backoff_retry"
+
     def _record_usage(
         self,
         model: str,

From 94670157a0be404e761952668e461d93823fc62c Mon Sep 17 00:00:00 2001
From: Chandrasekharan M <chandrasekharan@zipstack.com>
Date: Wed, 18 Mar 2026 20:38:38 +0530
Subject: [PATCH 2/2] [FIX] Zero out max_retries to prevent double-retry with
 SDK providers

SDK-based providers (OpenAI, Azure) default to max_retries=2 internally
even when not explicitly set. Without zeroing it, the first attempt
exhausts the SDK's own retries before the wrapper retry kicks in, so the
two retry layers stack (e.g. with max_retries=5: 1 initial attempt +
5 SDK retries + 5 wrapper retries = 11 attempts, instead of the
expected 5 retries).

Setting max_retries=0 ensures all retries go through litellm's wrapper
uniformly across all providers.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 unstract/sdk1/src/unstract/sdk1/llm.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/unstract/sdk1/src/unstract/sdk1/llm.py b/unstract/sdk1/src/unstract/sdk1/llm.py
index a96d3679ce..5bcd5ca66a 100644
--- a/unstract/sdk1/src/unstract/sdk1/llm.py
+++ b/unstract/sdk1/src/unstract/sdk1/llm.py
@@ -467,12 +467,14 @@ def _set_litellm_retry_params(completion_kwargs: dict) -> None:
         the gap by copying max_retries into num_retries so httpx-based providers
         (Anthropic, Vertex, Bedrock, Mistral, etc.) also get retries.
 
-        litellm internally sets max_retries=0 during wrapper retries to prevent
-        double-retry with SDK providers.
+        SDK-based providers (OpenAI, Azure) default to max_retries=2 internally,
+        which would multiply with wrapper retries. Setting max_retries=0 ensures
+        all retries go through the wrapper uniformly.
         """
         max_retries = completion_kwargs.get("max_retries")
         if max_retries:
             completion_kwargs["num_retries"] = max_retries
+            completion_kwargs["max_retries"] = 0
             completion_kwargs["retry_strategy"] = "exponential_backoff_retry"
 
     def _record_usage(