From 62b2a65efd6aa2b4e3461f47ab0f89c9eed1c02e Mon Sep 17 00:00:00 2001 From: Ondrej Metelka Date: Fri, 13 Mar 2026 16:17:12 +0100 Subject: [PATCH 1/2] Refactor tool call retry logic with explicit scenario labels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename _is_retryable_tool_error to _is_transient_error for clarity - Label retry scenarios 1, 2, 3a, 3b in _execute_single_tool_call - Fix DO_NOT_RETRY_REMINDER: omit only on first-attempt non-transient failure (Scenario 3a — LLM may retry with corrected args); add it for all other failure paths - Eliminate double get_tool_by_name lookup by saving the resolved tool in the precondition block and passing it directly to execute_tool_call - Update execute_tool_call signature to accept StructuredTool instead of tool name + tools list - Update tests to match revised semantics and new signature Made-with: Cursor --- ols/src/tools/tools.py | 99 ++++++++++++++++++---------------- tests/unit/tools/test_tools.py | 12 +++-- 2 files changed, 60 insertions(+), 51 deletions(-) diff --git a/ols/src/tools/tools.py b/ols/src/tools/tools.py index 99573c93f..8987c17cd 100644 --- a/ols/src/tools/tools.py +++ b/ols/src/tools/tools.py @@ -20,8 +20,8 @@ DO_NOT_RETRY_REMINDER = "Do not retry this exact tool call." -def _is_retryable_tool_error(error: Exception) -> bool: - """Return true if a tool execution error is likely transient.""" +def _is_transient_error(error: Exception) -> bool: + """Return true if the error is likely transient and worth retrying automatically.""" if isinstance(error, (TimeoutError, asyncio.TimeoutError, ConnectionError)): return True error_text = str(error).lower() @@ -152,54 +152,61 @@ async def _execute_single_tool_call( was_truncated = False structured_content: dict | None = None - if tool_name is None: - tool_output = ( - f"Error: Tool name is missing from tool call. {DO_NOT_RETRY_REMINDER}" - ) + # Scenario 1: precondition failure — tool name missing, tool unknown, or sensitive args. + # The LLM must not retry; no argument change can make this call valid. + try: + if tool_name is None: + raise ValueError("Tool name is missing from tool call") + raise_for_sensitive_tool_args(tool_args) + get_tool_by_name(tool_name, all_mcp_tools) + except Exception as e: + tool_output = f"Error: {e}. {DO_NOT_RETRY_REMINDER}" status = "error" - logger.error("Tool call missing name: %s", tool_call) + logger.error(tool_output) else: - try: - raise_for_sensitive_tool_args(tool_args) - attempts = MAX_TOOL_CALL_RETRIES + 1 - last_error_text = "unknown error" - for attempt in range(attempts): - try: - tool_output, was_truncated, structured_content = ( - await execute_tool_call( - tool_name, tool_args, all_mcp_tools, max_tokens - ) + attempts = MAX_TOOL_CALL_RETRIES + 1 + last_error_text = "unknown error" + for attempt in range(attempts): + try: + tool_output, was_truncated, structured_content = ( + await execute_tool_call( + tool_name, tool_args, all_mcp_tools, max_tokens ) - status = "success" - break - except Exception as error: - last_error_text = str(error) - if attempt < MAX_TOOL_CALL_RETRIES and _is_retryable_tool_error( - error - ): - logger.warning( - "Retrying tool '%s' after transient error on attempt %d/%d: %s", - tool_name, - attempt + 1, - attempts, - error, - ) - await asyncio.sleep(RETRY_BACKOFF_SECONDS * (2**attempt)) - continue - tool_output = ( - f"Tool '{tool_name}' execution failed after {attempt + 1} " - f"attempt(s): {last_error_text}. {DO_NOT_RETRY_REMINDER}" + ) + status = "success" + break + except Exception as error: + last_error_text = str(error) + # Scenario 2: transient infrastructure error — retry automatically with same args. + if attempt < MAX_TOOL_CALL_RETRIES and _is_transient_error(error): + logger.warning( + "Retrying tool '%s' after transient error on attempt %d/%d: %s", + tool_name, + attempt + 1, + attempts, + error, ) - status = "error" - logger.exception(tool_output) - break - except Exception as e: - tool_output = ( - f"Error executing tool '{tool_name}' with args {tool_args}: {e}" - ) - tool_output = f"{tool_output}. {DO_NOT_RETRY_REMINDER}" - status = "error" - logger.exception(tool_output) + await asyncio.sleep(RETRY_BACKOFF_SECONDS * (2**attempt)) + continue + # Scenario 3a: non-transient error on the very first attempt — the LLM likely + # sent bad args. No reminder: the LLM is free to retry with corrected args. + # Scenario 3b: anything else (retries exhausted, or non-transient after a prior + # transient retry) — retrying in any form is unlikely to help. + is_first_attempt_non_transient = ( + attempt == 0 and not _is_transient_error(error) + ) + retry_note = ( + "" + if is_first_attempt_non_transient + else f" {DO_NOT_RETRY_REMINDER}" + ) + tool_output = ( + f"Tool '{tool_name}' execution failed after {attempt + 1} " + f"attempt(s): {last_error_text}.{retry_note}" + ) + status = "error" + logger.exception(tool_output) + break additional_kwargs: dict = {"truncated": was_truncated} if structured_content is not None: diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py index 9c0de61dd..4c7523399 100644 --- a/tests/unit/tools/test_tools.py +++ b/tests/unit/tools/test_tools.py @@ -242,16 +242,17 @@ async def test_execute_tool_calls_mixed_success_and_failure(): assert tool_messages[0].status == "success" assert tool_messages[0].content == "fake_output_from_success_tool" - # Second call should fail due to tool raising exception + # Second call should fail due to tool raising non-transient exception (Scenario 3a). + # No DO_NOT_RETRY_REMINDER: LLM may retry with corrected arguments. assert tool_messages[1].status == "error" assert "execution failed after 1 attempt(s)" in tool_messages[1].content assert "Tool execution failed" in tool_messages[1].content - assert DO_NOT_RETRY_REMINDER in tool_messages[1].content + assert DO_NOT_RETRY_REMINDER not in tool_messages[1].content - # Third call should fail due to nonexistent tool + # Third call should fail due to nonexistent tool (Scenario 1 precondition). + # DO_NOT_RETRY_REMINDER is always included: no argument change can make this valid. assert tool_messages[2].status == "error" assert "Tool 'nonexistent_tool' not found" in tool_messages[2].content - assert "execution failed after 1 attempt(s)" in tool_messages[2].content assert DO_NOT_RETRY_REMINDER in tool_messages[2].content @@ -343,7 +344,8 @@ async def _non_retryable_failure(**kwargs: Any) -> tuple: assert call_count == 1 assert tool_messages[0].status == "error" assert "execution failed after 1 attempt(s)" in tool_messages[0].content - assert DO_NOT_RETRY_REMINDER in tool_messages[0].content + # Scenario 3a: non-transient error, first attempt — LLM may retry with corrected args. + assert DO_NOT_RETRY_REMINDER not in tool_messages[0].content @pytest.mark.asyncio From 7dd74fd93c8845977f043f6df5bf086257e014f4 Mon Sep 17 00:00:00 2001 From: Ondrej Metelka Date: Mon, 16 Mar 2026 10:21:32 +0100 Subject: [PATCH 2/2] Log tool call on error --- ols/src/tools/tools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ols/src/tools/tools.py b/ols/src/tools/tools.py index 8987c17cd..4c720f3f4 100644 --- a/ols/src/tools/tools.py +++ b/ols/src/tools/tools.py @@ -158,11 +158,11 @@ async def _execute_single_tool_call( if tool_name is None: raise ValueError("Tool name is missing from tool call") raise_for_sensitive_tool_args(tool_args) - get_tool_by_name(tool_name, all_mcp_tools) + get_tool_by_name(tool_name, all_mcp_tools) # raise for unknown tool except Exception as e: tool_output = f"Error: {e}. {DO_NOT_RETRY_REMINDER}" status = "error" - logger.error(tool_output) + logger.error("%s, Tool call: %s", tool_output, tool_call) else: attempts = MAX_TOOL_CALL_RETRIES + 1 last_error_text = "unknown error"