299 changes: 299 additions & 0 deletions docs/reasoning-tokens.md

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions examples/olsconfig.yaml
@@ -8,6 +8,11 @@ llm_providers:
models:
- name: model-name-1
- name: model-name-2
# Reasoning model settings (GPT-5, o-series). Omit for non-reasoning models.
# parameters:
# reasoning_effort: low # low | medium | high (default: low)
# reasoning_summary: concise # auto | concise | detailed (default: concise)
# verbosity: low # low | medium | high (default: low)
- name: my_azure_openai
type: azure_openai
url: "https://myendpoint.openai.azure.com/"
7 changes: 7 additions & 0 deletions ols/app/endpoints/ols.py
@@ -260,6 +260,13 @@ def calc_output_tokens(token_counter: Optional[TokenCounter]) -> int:
return token_counter.output_tokens


def calc_reasoning_tokens(token_counter: Optional[TokenCounter]) -> int:
"""Calculate reasoning tokens."""
if token_counter is None:
return 0
return token_counter.reasoning_tokens


def get_available_quotas(
quota_limiters: Optional[list[QuotaLimiter]],
user_id: str,
17 changes: 17 additions & 0 deletions ols/app/endpoints/streaming_ols.py
@@ -17,6 +17,7 @@
from ols.app.endpoints.ols import (
calc_input_tokens,
calc_output_tokens,
calc_reasoning_tokens,
consume_tokens,
generate_response,
get_available_quotas,
@@ -51,6 +52,7 @@
LLM_TOKEN_EVENT = "token" # noqa: S105
LLM_TOOL_CALL_EVENT = "tool_call"
LLM_TOOL_RESULT_EVENT = "tool_result"
LLM_REASONING_EVENT = "reasoning"


query_responses: dict[int | str, dict[str, Any]] = {
@@ -159,6 +161,8 @@ def stream_event(data: dict, event_type: str, media_type: str) -> str:
if media_type == MEDIA_TYPE_TEXT:
if event_type == LLM_TOKEN_EVENT:
return data["token"]
if event_type == LLM_REASONING_EVENT:
return data["reasoning"]
if event_type == LLM_TOOL_CALL_EVENT:
return f"\nTool call: {json.dumps(data)}\n"
if event_type == LLM_TOOL_RESULT_EVENT:
@@ -198,6 +202,7 @@ def stream_end_event(
"truncated": truncated,
"input_tokens": calc_input_tokens(token_counter),
"output_tokens": calc_output_tokens(token_counter),
"reasoning_tokens": calc_reasoning_tokens(token_counter),
},
"available_quotas": available_quotas,
}
@@ -377,6 +382,7 @@ async def response_processing_wrapper(
tool_results: list = []
history_truncated: bool = False
idx: int = 0
was_reasoning: bool = False
token_counter: Optional[TokenCounter] = None

try:
@@ -399,7 +405,18 @@
event_type=LLM_TOOL_RESULT_EVENT,
media_type=media_type,
)
elif item.type == "reasoning":
was_reasoning = True
yield stream_event(
data={"id": idx, "reasoning": item.text},
event_type=LLM_REASONING_EVENT,
media_type=media_type,
)
idx += 1
elif item.type == "text":
if was_reasoning and media_type == MEDIA_TYPE_TEXT:
yield "\n\n"
was_reasoning = False
response += item.text
yield stream_event(
data={"id": idx, "token": item.text},
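Taken together, the streaming changes give reasoning output its own event type: in text mode the summary streams first and is separated from the answer by a blank line, while in JSON mode clients receive distinct `reasoning` and `token` events plus a `reasoning_tokens` count in the end event. A minimal consumer-side sketch, with payload shapes assumed from the `stream_event` calls above rather than from a captured response:

```python
# Hypothetical event sequence as a JSON-mode client might see it.
events = [
    ("reasoning", {"id": 0, "reasoning": "Comparing the two operators..."}),
    ("token", {"id": 1, "token": "Use the cluster-logging operator."}),
    ("end", {"input_tokens": 180, "output_tokens": 8, "reasoning_tokens": 6}),
]

for event_type, data in events:
    if event_type == "reasoning":
        print(data["reasoning"], end="")       # reasoning summary streams first
    elif event_type == "token":
        print("\n\n" + data["token"], end="")  # answer follows a blank line
    elif event_type == "end":
        print(f"\n(reasoning tokens: {data['reasoning_tokens']})")
```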
5 changes: 5 additions & 0 deletions ols/app/metrics/metrics.py
@@ -43,6 +43,11 @@
llm_token_received_total = Counter(
"ols_llm_token_received_total", "LLM tokens received", ["provider", "model"]
)
llm_reasoning_token_total = Counter(
"ols_llm_reasoning_token_total",
"LLM reasoning summary tokens received",
["provider", "model"],
)

# metric that indicates what provider + model customers are using so we can
# understand what is popular/important
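The new counter mirrors the existing `ols_llm_token_*` metrics. A quick way to sanity-check it in a test, using the internal `_value` accessor that is conventional (though not public API) in `prometheus_client`:

```python
from ols.app.metrics.metrics import llm_reasoning_token_total

# Increment the labeled child, as TokenMetricUpdater does on __exit__.
llm_reasoning_token_total.labels(provider="openai", model="gpt-5").inc(42)

# Read the running total back via the internal test hook.
child = llm_reasoning_token_total.labels(provider="openai", model="gpt-5")
assert child._value.get() == 42.0
```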
32 changes: 28 additions & 4 deletions ols/app/metrics/token_counter.py
@@ -8,6 +8,7 @@

from ols.app.metrics.metrics import (
llm_calls_total,
llm_reasoning_token_total,
llm_token_received_total,
llm_token_sent_total,
)
@@ -20,9 +21,10 @@
class GenericTokenCounter(AsyncCallbackHandler): # pylint: disable=R0901
"""A callback handler to count tokens sent and received by the LLM.

It provides 3 counters via TokenCounter dataclass stored as an attribute:
- input_tokens_counted: number of input tokens counted by the handler
- output_tokens: number of tokens received from LLM
It provides counters via TokenCounter dataclass stored as an attribute:
- input_tokens: number of input tokens counted by the handler (tiktoken)
- output_tokens: number of output tokens counted by the handler (tiktoken)
- reasoning_tokens: number of reasoning summary tokens (tiktoken)
- llm_calls: number of LLM calls
"""

@@ -42,8 +44,25 @@ async def on_llm_new_token(
**kwargs: Any,
) -> None:
"""Compute token count when llm token is yielded."""
if token is not None and token != "":
if token and isinstance(token, str):
self.token_counter.output_tokens += self.tokens_count(token)
elif isinstance(token, list):
for block in token:
if not isinstance(block, dict):
continue
block_type = block.get("type")
if block_type == "text":
text = block.get("text", "")
if text:
self.token_counter.output_tokens += self.tokens_count(text)
elif block_type == "reasoning":
for part in block.get("summary", []):
if isinstance(part, dict):
text = part.get("text", "")
if text:
self.token_counter.reasoning_tokens += self.tokens_count(
text
)

async def on_llm_start(
self, serialized: dict[str, Any], prompts: list[str], **kwargs: Any
@@ -63,6 +82,7 @@ def __str__(self) -> str:
f"{self.__class__.__name__}: "
+ f"input_tokens: {self.token_counter.input_tokens} "
+ f"output_tokens: {self.token_counter.output_tokens} "
+ f"reasoning_tokens: {self.token_counter.reasoning_tokens} "
+ f"LLM calls: {self.token_counter.llm_calls}"
)

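The list branch in `on_llm_new_token` implies the shape of a streamed token for Responses API models: a list of content blocks, with reasoning summaries nested under `summary`. An illustration of that assumed structure (hand-written, not a captured payload):

```python
# Assumed shape of one streamed token for a reasoning model, inferred from
# the parsing branch above.
token = [
    {"type": "reasoning", "summary": [{"text": "Weighing both options..."}]},
    {"type": "text", "text": "You can enable it with..."},
]
# The handler counts the "text" block toward output_tokens and each summary
# part toward reasoning_tokens, both via tiktoken.
```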
@@ -73,6 +93,7 @@ class TokenMetricUpdater:
These metrics are updated:
- llm_token_sent_total
- llm_token_received_total
- llm_reasoning_token_total
- llm_calls_total

Example usage:
@@ -119,3 +140,6 @@ def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
llm_token_received_total.labels(provider=self.provider, model=self.model).inc(
self.token_counter.token_counter.output_tokens
)
llm_reasoning_token_total.labels(provider=self.provider, model=self.model).inc(
self.token_counter.token_counter.reasoning_tokens
)
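The docstring's usage example is truncated in this diff; a plausible sketch of the context-manager pattern, assuming the constructor takes the `provider` and `model` labels that `__exit__` uses (its real signature is not shown here):

```python
from typing import Any

from ols.app.metrics.token_counter import TokenMetricUpdater


async def answer(chain: Any, bare_llm: Any, query: str) -> Any:
    # Hypothetical usage; the constructor arguments are an assumption.
    with TokenMetricUpdater(llm=bare_llm, provider="openai", model="gpt-5") as counter:
        # counter is the callback handler that tallies tokens per chunk;
        # on __exit__ all four metrics are incremented with these labels.
        return await chain.ainvoke(
            input={"query": query},
            config={"callbacks": [counter]},
        )
```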
6 changes: 5 additions & 1 deletion ols/app/models/config.py
@@ -4,7 +4,7 @@
import os
import re
from enum import StrEnum
from typing import Any, Optional, Self
from typing import Any, Literal, Optional, Self

from pydantic import (
AnyHttpUrl,
@@ -30,6 +30,10 @@ class ModelParameters(BaseModel):
)
max_tokens_for_tools: PositiveInt = constants.DEFAULT_MAX_TOKENS_FOR_TOOLS

reasoning_effort: Literal["low", "medium", "high"] = "low"
reasoning_summary: Literal["auto", "concise", "detailed"] = "concise"
verbosity: Literal["low", "medium", "high"] = "low"


class ModelConfig(BaseModel):
"""Model configuration."""
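A minimal sketch of how the new fields behave under pydantic validation: omitted fields fall back to the defaults above, and values outside each `Literal` set are rejected:

```python
from pydantic import ValidationError

from ols.app.models.config import ModelParameters

params = ModelParameters(reasoning_effort="high", reasoning_summary="detailed")
assert params.verbosity == "low"  # default applies when the field is omitted

try:
    ModelParameters(reasoning_effort="maximum")  # not in the Literal set
except ValidationError:
    print("rejected: reasoning_effort must be low, medium, or high")
```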
6 changes: 4 additions & 2 deletions ols/app/models/models.py
@@ -749,13 +749,15 @@ class TokenCounter:
Attributes:
llm: LLM instance
input_tokens: number of tokens sent to LLM
output_tokens: number of tokens received from LLM
output_tokens: number of text tokens received from LLM
reasoning_tokens: number of reasoning summary tokens received from LLM
llm_calls: number of LLM calls
"""

llm: Optional[LLM] = None
input_tokens: int = 0
output_tokens: int = 0
reasoning_tokens: int = 0
llm_calls: int = 0


@@ -992,7 +994,7 @@ class StreamedChunk:
data: Additional data associated with the chunk (for non-text chunks)
"""

type: Literal["text", "tool_call", "tool_result", "end"]
type: Literal["text", "tool_call", "tool_result", "end", "reasoning"]
text: str = ""
data: dict[str, Any] = field(default_factory=dict)

13 changes: 11 additions & 2 deletions ols/src/llms/providers/openai.py
@@ -44,8 +44,17 @@ def default_params(self) -> dict[str, Any]:
"http_async_client": self._construct_httpx_client(True, True),
}

# gpt-5 and o-series models don't support certain parameters
if not ("gpt-5" in self.model or self.model.startswith("o")):
# gpt-5 and o-series models use the Responses API for reasoning support
model_config = self.provider_config.models.get(self.model)
params = getattr(model_config, "parameters", None)

if "gpt-5" in self.model or self.model.startswith("o"):
default_parameters["reasoning"] = {
"effort": params.reasoning_effort,
"summary": params.reasoning_summary,
}
default_parameters["verbosity"] = params.verbosity
else:
default_parameters["temperature"] = 0.01
default_parameters["top_p"] = 0.95
default_parameters["frequency_penalty"] = 1.03
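The name check above decides which branch a model takes; note that it matches any model whose name starts with `o`, not just the documented o-series. A tiny illustration with assumed model names:

```python
# Illustration of the gate above with assumed model names.
for name in ("gpt-5-mini", "o3", "gpt-4o", "gpt-4-turbo"):
    reasoning = "gpt-5" in name or name.startswith("o")
    print(f"{name}: {'reasoning params' if reasoning else 'sampling defaults'}")
# gpt-5-mini: reasoning params
# o3: reasoning params
# gpt-4o: sampling defaults
# gpt-4-turbo: sampling defaults
```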
2 changes: 2 additions & 0 deletions ols/src/llms/providers/provider.py
@@ -70,6 +70,8 @@ class ProviderParameter:
ProviderParameter("verbose", bool),
ProviderParameter("http_client", httpx.Client),
ProviderParameter("http_async_client", httpx.AsyncClient),
ProviderParameter("reasoning", dict),
ProviderParameter("verbosity", str),
}

RHOAIVLLMParameters = {
83 changes: 66 additions & 17 deletions ols/src/query_helpers/docs_summarizer.py
@@ -360,12 +360,18 @@ async def _invoke_llm(
AIMessageChunk objects from the LLM response stream
"""
logger.debug("provided %s tools", len(tools_map))
# determine whether to use tools based on round and availability
llm = (
self.bare_llm
if is_final_round or not tools_map
else self.bare_llm.bind_tools(tools_map)
)
if not tools_map:
llm = self.bare_llm
elif is_final_round:
# strict=False: the Responses API (used when reasoning params are
# present) defaults strict to True, unlike Chat Completions which
# defaults to False. With strict=True the model enters structured-
# outputs mode which only supports a subset of JSON Schema — MCP tool
# schemas contain unsupported keywords like ``pattern`` that cause the
# model to misinterpret regex constraints as literal argument values.
llm = self.bare_llm.bind_tools(tools_map, tool_choice="none", strict=False)
else:
llm = self.bare_llm.bind_tools(tools_map, strict=False)

# create and execute the chain
chain = messages | llm
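The `strict=False` comment is worth a concrete illustration: under `strict=True` the Responses API accepts only the structured-outputs subset of JSON Schema, and keywords such as `pattern` fall outside it. A hypothetical MCP-style tool schema of the kind that triggers the failure mode described above:

```python
# Hypothetical tool schema; "pattern" is valid JSON Schema but not part of the
# structured-outputs subset, so with strict=True the model can end up echoing
# the regex itself as the argument value instead of a conforming pod name.
tool_schema = {
    "name": "get_pod_logs",
    "description": "Fetch logs for a pod in the current namespace.",
    "parameters": {
        "type": "object",
        "properties": {
            "pod_name": {"type": "string", "pattern": "^[a-z0-9][a-z0-9-]*$"},
        },
        "required": ["pod_name"],
    },
}
```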
@@ -455,6 +461,7 @@ async def iterate_with_tools( # noqa: C901 # pylint: disable=R0912,R0915
logger.debug("Tool calling round %s (final: %s)", i, is_final_round)

tool_call_chunks = []
all_chunks: list[AIMessageChunk] = []
chunk_counter = 0
stop_generation = False
# invoke LLM and process response chunks
@@ -484,15 +491,34 @@
stop_generation = True
continue

# collect tool chunk or yield text
all_chunks.append(chunk)

# collect tool chunk or yield text/reasoning
if getattr(chunk, "tool_call_chunks", None):
tool_call_chunks.append(chunk)
else:
if not skip_special_chunk(
elif isinstance(chunk.content, str):
if chunk.content and not skip_special_chunk(
chunk.content, chunk_counter, self.model, is_final_round
):
# stream text chunks directly
yield StreamedChunk(type="text", text=chunk.content)
elif isinstance(chunk.content, list):
for block in chunk.content:
if not isinstance(block, dict):
continue
block_type = block.get("type")
if block_type == "text":
text = block.get("text", "")
if text and not skip_special_chunk(
text, chunk_counter, self.model, is_final_round
):
yield StreamedChunk(type="text", text=text)
elif block_type == "reasoning":
for part in block.get("summary", []):
text = part.get("text", "")
if text:
yield StreamedChunk(
type="reasoning", text=text
)

chunk_counter += 1

@@ -504,17 +530,40 @@ async def iterate_with_tools( # noqa: C901 # pylint: disable=R0912,R0915
break

# tool calling part
if not tool_call_chunks:
break

if tool_call_chunks:
# assess tool calls and add to messages
tool_calls = tool_calls_from_tool_calls_chunks(tool_call_chunks)
ai_tool_call_message = AIMessage(
content="", type="ai", tool_calls=tool_calls
)

# Accumulate the full AI message (reasoning + tool calls)
# so reasoning context is preserved between rounds per
# OpenAI's "Keeping reasoning items in context" guidance.
if all_chunks:
accumulated = all_chunks[0]
for c in all_chunks[1:]:
accumulated += c # type: ignore [assignment]
ai_tool_call_message = AIMessage(
content=accumulated.content,
tool_calls=tool_calls,
additional_kwargs=accumulated.additional_kwargs,
)
else:
ai_tool_call_message = AIMessage(
content="", type="ai", tool_calls=tool_calls
)
messages.append(ai_tool_call_message)

# Count tokens used by the AIMessage with tool calls
ai_content_text = (
json.dumps(ai_tool_call_message.content)
if isinstance(ai_tool_call_message.content, list)
else str(ai_tool_call_message.content)
)
ai_message_tokens = TokenHandler._get_token_count(
token_handler.text_to_tokens(json.dumps(tool_calls))
token_handler.text_to_tokens(
ai_content_text + json.dumps(tool_calls)
)
)
tool_tokens_used += ai_message_tokens

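The accumulation loop relies on LangChain's chunk-merge semantics: `AIMessageChunk` defines `+`, which merges list content block by block, so summing the streamed chunks reconstructs the full message with its reasoning blocks intact for the next round. A small sketch, assuming indexed content blocks like those produced by the Responses API integration:

```python
from langchain_core.messages import AIMessageChunk

# Two streamed fragments with indexed content blocks (assumed shapes).
first = AIMessageChunk(
    content=[{"type": "reasoning", "summary": [{"text": "Need pod logs."}], "index": 0}]
)
second = AIMessageChunk(content=[{"type": "text", "text": "Calling tool.", "index": 1}])

merged = first + second
print(merged.content)
# [{'type': 'reasoning', 'summary': [{'text': 'Need pod logs.'}], 'index': 0},
#  {'type': 'text', 'text': 'Calling tool.', 'index': 1}]
```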
@@ -653,11 +702,11 @@ async def drain_generate_response() -> SummarizerResponse:
tool_calls.append(chunk.data)
elif chunk.type == "tool_result":
tool_results.append(chunk.data)
elif chunk.type == "reasoning":
pass
elif chunk.type == "text":
chunks.append(chunk.text)
else:
# this "can't" happen as we control what chunk types
# are yielded in the generator directly
msg = f"Unknown chunk type: {chunk.type}"
logger.warning(msg)
raise ValueError(msg)