openshift · onmete · Mar 13, 2026 · Mar 16, 2026
diff --git a/ols/src/tools/tools.py b/ols/src/tools/tools.py
@@ -20,8 +20,8 @@
 DO_NOT_RETRY_REMINDER = "Do not retry this exact tool call."
 
 
-def _is_retryable_tool_error(error: Exception) -> bool:
-    """Return true if a tool execution error is likely transient."""
+def _is_transient_error(error: Exception) -> bool:
+    """Return true if the error is likely transient and worth retrying automatically."""
     if isinstance(error, (TimeoutError, asyncio.TimeoutError, ConnectionError)):
         return True
     error_text = str(error).lower()
@@ -152,54 +152,61 @@ async def _execute_single_tool_call(
     was_truncated = False
     structured_content: dict | None = None
 
-    if tool_name is None:
-        tool_output = (
-            f"Error: Tool name is missing from tool call. {DO_NOT_RETRY_REMINDER}"
-        )
+    # Scenario 1: precondition failure — tool name missing, tool unknown, or sensitive args.
+    # The LLM must not retry; no argument change can make this call valid.
+    try:
+        if tool_name is None:
+            raise ValueError("Tool name is missing from tool call")
+        raise_for_sensitive_tool_args(tool_args)
+        get_tool_by_name(tool_name, all_mcp_tools)  # raise for unknown tool
+    except Exception as e:
+        tool_output = f"Error: {e}. {DO_NOT_RETRY_REMINDER}"
         status = "error"
-        logger.error("Tool call missing name: %s", tool_call)
+        logger.error("%s, Tool call: %s", tool_output, tool_call)
     else:
-        try:
-            raise_for_sensitive_tool_args(tool_args)
-            attempts = MAX_TOOL_CALL_RETRIES + 1
-            last_error_text = "unknown error"
-            for attempt in range(attempts):
-                try:
-                    tool_output, was_truncated, structured_content = (
-                        await execute_tool_call(
-                            tool_name, tool_args, all_mcp_tools, max_tokens
-                        )
+        attempts = MAX_TOOL_CALL_RETRIES + 1
+        last_error_text = "unknown error"
+        for attempt in range(attempts):
+            try:
+                tool_output, was_truncated, structured_content = (
+                    await execute_tool_call(
+                        tool_name, tool_args, all_mcp_tools, max_tokens
                     )
-                    status = "success"
-                    break
-                except Exception as error:
-                    last_error_text = str(error)
-                    if attempt < MAX_TOOL_CALL_RETRIES and _is_retryable_tool_error(
-                        error
-                    ):
-                        logger.warning(
-                            "Retrying tool '%s' after transient error on attempt %d/%d: %s",
-                            tool_name,
-                            attempt + 1,
-                            attempts,
-                            error,
-                        )
-                        await asyncio.sleep(RETRY_BACKOFF_SECONDS * (2**attempt))
-                        continue
-                    tool_output = (
-                        f"Tool '{tool_name}' execution failed after {attempt + 1} "
-                        f"attempt(s): {last_error_text}. {DO_NOT_RETRY_REMINDER}"
+                )
+                status = "success"
+                break
+            except Exception as error:
+                last_error_text = str(error)
+                # Scenario 2: transient infrastructure error — retry automatically with same args.
+                if attempt < MAX_TOOL_CALL_RETRIES and _is_transient_error(error):
+                    logger.warning(
+                        "Retrying tool '%s' after transient error on attempt %d/%d: %s",
+                        tool_name,
+                        attempt + 1,
+                        attempts,
+                        error,
                     )
-                    status = "error"
-                    logger.exception(tool_output)
-                    break
-        except Exception as e:
-            tool_output = (
-                f"Error executing tool '{tool_name}' with args {tool_args}: {e}"
-            )
-            tool_output = f"{tool_output}. {DO_NOT_RETRY_REMINDER}"
-            status = "error"
-            logger.exception(tool_output)
+                    await asyncio.sleep(RETRY_BACKOFF_SECONDS * (2**attempt))
+                    continue
+                # Scenario 3a: non-transient error on the very first attempt — the LLM likely
+                # sent bad args. No reminder: the LLM is free to retry with corrected args.
+                # Scenario 3b: anything else (retries exhausted, or non-transient after a prior
+                # transient retry) — retrying in any form is unlikely to help.
+                is_first_attempt_non_transient = (
+                    attempt == 0 and not _is_transient_error(error)
+                )
+                retry_note = (
+                    ""
+                    if is_first_attempt_non_transient
+                    else f" {DO_NOT_RETRY_REMINDER}"
+                )
+                tool_output = (
+                    f"Tool '{tool_name}' execution failed after {attempt + 1} "
+                    f"attempt(s): {last_error_text}.{retry_note}"
+                )
+                status = "error"
+                logger.exception(tool_output)
+                break
 
     additional_kwargs: dict = {"truncated": was_truncated}
     if structured_content is not None:

diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py
@@ -242,16 +242,17 @@ async def test_execute_tool_calls_mixed_success_and_failure():
     assert tool_messages[0].status == "success"
     assert tool_messages[0].content == "fake_output_from_success_tool"
 
-    # Second call should fail due to tool raising exception
+    # Second call should fail due to tool raising non-transient exception (Scenario 3a).
+    # No DO_NOT_RETRY_REMINDER: LLM may retry with corrected arguments.
     assert tool_messages[1].status == "error"
     assert "execution failed after 1 attempt(s)" in tool_messages[1].content
     assert "Tool execution failed" in tool_messages[1].content
-    assert DO_NOT_RETRY_REMINDER in tool_messages[1].content
+    assert DO_NOT_RETRY_REMINDER not in tool_messages[1].content
 
-    # Third call should fail due to nonexistent tool
+    # Third call should fail due to nonexistent tool (Scenario 1 precondition).
+    # DO_NOT_RETRY_REMINDER is always included: no argument change can make this valid.
     assert tool_messages[2].status == "error"
     assert "Tool 'nonexistent_tool' not found" in tool_messages[2].content
-    assert "execution failed after 1 attempt(s)" in tool_messages[2].content
     assert DO_NOT_RETRY_REMINDER in tool_messages[2].content
 
 
@@ -343,7 +344,8 @@ async def _non_retryable_failure(**kwargs: Any) -> tuple:
     assert call_count == 1
     assert tool_messages[0].status == "error"
     assert "execution failed after 1 attempt(s)" in tool_messages[0].content
-    assert DO_NOT_RETRY_REMINDER in tool_messages[0].content
+    # Scenario 3a: non-transient error, first attempt — LLM may retry with corrected args.
+    assert DO_NOT_RETRY_REMINDER not in tool_messages[0].content
 
 
 @pytest.mark.asyncio