299 changes: 299 additions & 0 deletions docs/reasoning-tokens.md

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions examples/olsconfig.yaml
@@ -8,6 +8,11 @@ llm_providers:
models:
- name: model-name-1
- name: model-name-2
# Reasoning model settings (GPT-5, o-series). Omit for non-reasoning models.
# parameters:
# reasoning_effort: low # low | medium | high (default: low)
# reasoning_summary: concise # auto | concise | detailed (default: concise)
# verbosity: low # low | medium | high (default: low)
- name: my_azure_openai
type: azure_openai
url: "https://myendpoint.openai.azure.com/"
7 changes: 7 additions & 0 deletions ols/app/endpoints/ols.py
@@ -260,6 +260,13 @@ def calc_output_tokens(token_counter: Optional[TokenCounter]) -> int:
return token_counter.output_tokens


def calc_reasoning_tokens(token_counter: Optional[TokenCounter]) -> int:
"""Calculate reasoning tokens."""
if token_counter is None:
return 0
return token_counter.reasoning_tokens


def get_available_quotas(
quota_limiters: Optional[list[QuotaLimiter]],
user_id: str,
17 changes: 17 additions & 0 deletions ols/app/endpoints/streaming_ols.py
@@ -17,6 +17,7 @@
from ols.app.endpoints.ols import (
calc_input_tokens,
calc_output_tokens,
calc_reasoning_tokens,
consume_tokens,
generate_response,
get_available_quotas,
@@ -51,6 +52,7 @@
LLM_TOKEN_EVENT = "token" # noqa: S105
LLM_TOOL_CALL_EVENT = "tool_call"
LLM_TOOL_RESULT_EVENT = "tool_result"
LLM_REASONING_EVENT = "reasoning"


query_responses: dict[int | str, dict[str, Any]] = {
@@ -159,6 +161,8 @@ def stream_event(data: dict, event_type: str, media_type: str) -> str:
if media_type == MEDIA_TYPE_TEXT:
if event_type == LLM_TOKEN_EVENT:
return data["token"]
if event_type == LLM_REASONING_EVENT:
return data["reasoning"]
if event_type == LLM_TOOL_CALL_EVENT:
return f"\nTool call: {json.dumps(data)}\n"
if event_type == LLM_TOOL_RESULT_EVENT:
@@ -198,6 +202,7 @@ def stream_end_event(
"truncated": truncated,
"input_tokens": calc_input_tokens(token_counter),
"output_tokens": calc_output_tokens(token_counter),
"reasoning_tokens": calc_reasoning_tokens(token_counter),
},
"available_quotas": available_quotas,
}
@@ -377,6 +382,7 @@ async def response_processing_wrapper(
tool_results: list = []
history_truncated: bool = False
idx: int = 0
was_reasoning: bool = False
token_counter: Optional[TokenCounter] = None

try:
@@ -399,7 +405,18 @@
event_type=LLM_TOOL_RESULT_EVENT,
media_type=media_type,
)
elif item.type == "reasoning":
was_reasoning = True
yield stream_event(
data={"id": idx, "reasoning": item.text},
event_type=LLM_REASONING_EVENT,
media_type=media_type,
)
idx += 1
elif item.type == "text":
if was_reasoning and media_type == MEDIA_TYPE_TEXT:
yield "\n\n"
was_reasoning = False
response += item.text
yield stream_event(
data={"id": idx, "token": item.text},
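Taken together, the streaming changes give reasoning output its own event type: in text mode the summary streams first and is separated from the answer by a blank line, while in JSON mode clients receive distinct `reasoning` and `token` events plus a `reasoning_tokens` count in the end event. A minimal consumer-side sketch, with payload shapes assumed from the `stream_event` calls above rather than from a captured response:

```python
# Hypothetical event sequence as a JSON-mode client might see it.
events = [
    ("reasoning", {"id": 0, "reasoning": "Comparing the two operators..."}),
    ("token", {"id": 1, "token": "Use the cluster-logging operator."}),
    ("end", {"input_tokens": 180, "output_tokens": 8, "reasoning_tokens": 6}),
]

for event_type, data in events:
    if event_type == "reasoning":
        print(data["reasoning"], end="")       # reasoning summary streams first
    elif event_type == "token":
        print("\n\n" + data["token"], end="")  # answer follows a blank line
    elif event_type == "end":
        print(f"\n(reasoning tokens: {data['reasoning_tokens']})")
```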
5 changes: 5 additions & 0 deletions ols/app/metrics/metrics.py
@@ -43,6 +43,11 @@
llm_token_received_total = Counter(
"ols_llm_token_received_total", "LLM tokens received", ["provider", "model"]
)
llm_reasoning_token_total = Counter(
"ols_llm_reasoning_token_total",
"LLM reasoning summary tokens received",
["provider", "model"],
)

# metric that indicates what provider + model customers are using so we can
# understand what is popular/important
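The new counter mirrors the existing `ols_llm_token_*` metrics. A quick way to sanity-check it in a test, using the internal `_value` accessor that is conventional (though not public API) in `prometheus_client`:

```python
from ols.app.metrics.metrics import llm_reasoning_token_total

# Increment the labeled child, as TokenMetricUpdater does on __exit__.
llm_reasoning_token_total.labels(provider="openai", model="gpt-5").inc(42)

# Read the running total back via the internal test hook.
child = llm_reasoning_token_total.labels(provider="openai", model="gpt-5")
assert child._value.get() == 42.0
```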
32 changes: 28 additions & 4 deletions ols/app/metrics/token_counter.py
@@ -8,6 +8,7 @@

from ols.app.metrics.metrics import (
llm_calls_total,
llm_reasoning_token_total,
llm_token_received_total,
llm_token_sent_total,
)
@@ -20,9 +21,10 @@
class GenericTokenCounter(AsyncCallbackHandler): # pylint: disable=R0901
"""A callback handler to count tokens sent and received by the LLM.

It provides 3 counters via TokenCounter dataclass stored as an attribute:
- input_tokens_counted: number of input tokens counted by the handler
- output_tokens: number of tokens received from LLM
It provides counters via TokenCounter dataclass stored as an attribute:
- input_tokens: number of input tokens counted by the handler (tiktoken)
- output_tokens: number of output tokens counted by the handler (tiktoken)
- reasoning_tokens: number of reasoning summary tokens (tiktoken)
- llm_calls: number of LLM calls
"""

@@ -42,8 +44,25 @@ async def on_llm_new_token(
**kwargs: Any,
) -> None:
"""Compute token count when llm token is yielded."""
if token is not None and token != "":
if token and isinstance(token, str):
self.token_counter.output_tokens += self.tokens_count(token)
elif isinstance(token, list):
for block in token:
if not isinstance(block, dict):
continue
block_type = block.get("type")
if block_type == "text":
text = block.get("text", "")
if text:
self.token_counter.output_tokens += self.tokens_count(text)
elif block_type == "reasoning":
for part in block.get("summary", []):
if isinstance(part, dict):
text = part.get("text", "")
if text:
self.token_counter.reasoning_tokens += self.tokens_count(
text
)

async def on_llm_start(
self, serialized: dict[str, Any], prompts: list[str], **kwargs: Any
@@ -63,6 +82,7 @@ def __str__(self) -> str:
f"{self.__class__.__name__}: "
+ f"input_tokens: {self.token_counter.input_tokens} "
+ f"output_tokens: {self.token_counter.output_tokens} "
+ f"reasoning_tokens: {self.token_counter.reasoning_tokens} "
+ f"LLM calls: {self.token_counter.llm_calls}"
)

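The list branch in `on_llm_new_token` implies the shape of a streamed token for Responses API models: a list of content blocks, with reasoning summaries nested under `summary`. An illustration of that assumed structure (hand-written, not a captured payload):

```python
# Assumed shape of one streamed token for a reasoning model, inferred from
# the parsing branch above.
token = [
    {"type": "reasoning", "summary": [{"text": "Weighing both options..."}]},
    {"type": "text", "text": "You can enable it with..."},
]
# The handler counts the "text" block toward output_tokens and each summary
# part toward reasoning_tokens, both via tiktoken.
```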
@@ -73,6 +93,7 @@ class TokenMetricUpdater:
These metrics are updated:
- llm_token_sent_total
- llm_token_received_total
- llm_reasoning_token_total
- llm_calls_total

Example usage:
@@ -119,3 +140,6 @@ def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
llm_token_received_total.labels(provider=self.provider, model=self.model).inc(
self.token_counter.token_counter.output_tokens
)
llm_reasoning_token_total.labels(provider=self.provider, model=self.model).inc(
self.token_counter.token_counter.reasoning_tokens
)
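The docstring's usage example is truncated in this diff; a plausible sketch of the context-manager pattern, assuming the constructor takes the `provider` and `model` labels that `__exit__` uses (its real signature is not shown here):

```python
from typing import Any

from ols.app.metrics.token_counter import TokenMetricUpdater


async def answer(chain: Any, bare_llm: Any, query: str) -> Any:
    # Hypothetical usage; the constructor arguments are an assumption.
    with TokenMetricUpdater(llm=bare_llm, provider="openai", model="gpt-5") as counter:
        # counter is the callback handler that tallies tokens per chunk;
        # on __exit__ all four metrics are incremented with these labels.
        return await chain.ainvoke(
            input={"query": query},
            config={"callbacks": [counter]},
        )
```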
6 changes: 5 additions & 1 deletion ols/app/models/config.py
@@ -4,7 +4,7 @@
import os
import re
from enum import StrEnum
from typing import Any, Optional, Self
from typing import Any, Literal, Optional, Self

from pydantic import (
AnyHttpUrl,
@@ -30,6 +30,10 @@ class ModelParameters(BaseModel):
)
max_tokens_for_tools: PositiveInt = constants.DEFAULT_MAX_TOKENS_FOR_TOOLS

reasoning_effort: Literal["low", "medium", "high"] = "low"
reasoning_summary: Literal["auto", "concise", "detailed"] = "concise"
verbosity: Literal["low", "medium", "high"] = "low"


class ModelConfig(BaseModel):
"""Model configuration."""
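A minimal sketch of how the new fields behave under pydantic validation: omitted fields fall back to the defaults above, and values outside each `Literal` set are rejected:

```python
from pydantic import ValidationError

from ols.app.models.config import ModelParameters

params = ModelParameters(reasoning_effort="high", reasoning_summary="detailed")
assert params.verbosity == "low"  # default applies when the field is omitted

try:
    ModelParameters(reasoning_effort="maximum")  # not in the Literal set
except ValidationError:
    print("rejected: reasoning_effort must be low, medium, or high")
```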
6 changes: 4 additions & 2 deletions ols/app/models/models.py
@@ -749,13 +749,15 @@ class TokenCounter:
Attributes:
llm: LLM instance
input_tokens: number of tokens sent to LLM
output_tokens: number of tokens received from LLM
output_tokens: number of text tokens received from LLM
reasoning_tokens: number of reasoning summary tokens received from LLM
llm_calls: number of LLM calls
"""

llm: Optional[LLM] = None
input_tokens: int = 0
output_tokens: int = 0
reasoning_tokens: int = 0
llm_calls: int = 0


@@ -992,7 +994,7 @@ class StreamedChunk:
data: Additional data associated with the chunk (for non-text chunks)
"""

type: Literal["text", "tool_call", "tool_result", "end"]
type: Literal["text", "tool_call", "tool_result", "end", "reasoning"]
text: str = ""
data: dict[str, Any] = field(default_factory=dict)

13 changes: 11 additions & 2 deletions ols/src/llms/providers/openai.py
@@ -44,8 +44,17 @@ def default_params(self) -> dict[str, Any]:
"http_async_client": self._construct_httpx_client(True, True),
}

# gpt-5 and o-series models don't support certain parameters
if not ("gpt-5" in self.model or self.model.startswith("o")):
# gpt-5 and o-series models use the Responses API for reasoning support
model_config = self.provider_config.models.get(self.model)
params = getattr(model_config, "parameters", None)

if "gpt-5" in self.model or self.model.startswith("o"):
default_parameters["reasoning"] = {
"effort": params.reasoning_effort,
"summary": params.reasoning_summary,
}
default_parameters["verbosity"] = params.verbosity
else:
default_parameters["temperature"] = 0.01
default_parameters["top_p"] = 0.95
default_parameters["frequency_penalty"] = 1.03
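The name check above decides which branch a model takes; note that it matches any model whose name starts with `o`, not just the documented o-series. A tiny illustration with assumed model names:

```python
# Illustration of the gate above with assumed model names.
for name in ("gpt-5-mini", "o3", "gpt-4o", "gpt-4-turbo"):
    reasoning = "gpt-5" in name or name.startswith("o")
    print(f"{name}: {'reasoning params' if reasoning else 'sampling defaults'}")
# gpt-5-mini: reasoning params
# o3: reasoning params
# gpt-4o: sampling defaults
# gpt-4-turbo: sampling defaults
```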
2 changes: 2 additions & 0 deletions ols/src/llms/providers/provider.py
@@ -70,6 +70,8 @@ class ProviderParameter:
ProviderParameter("verbose", bool),
ProviderParameter("http_client", httpx.Client),
ProviderParameter("http_async_client", httpx.AsyncClient),
ProviderParameter("reasoning", dict),
ProviderParameter("verbosity", str),
}

RHOAIVLLMParameters = {
83 changes: 66 additions & 17 deletions ols/src/query_helpers/docs_summarizer.py
@@ -360,12 +360,18 @@ async def _invoke_llm(
AIMessageChunk objects from the LLM response stream
"""
logger.debug("provided %s tools", len(tools_map))
# determine whether to use tools based on round and availability
llm = (
self.bare_llm
if is_final_round or not tools_map
else self.bare_llm.bind_tools(tools_map)
)
if not tools_map:
llm = self.bare_llm
elif is_final_round:
# strict=False: the Responses API (used when reasoning params are
# present) defaults strict to True, unlike Chat Completions which
# defaults to False. With strict=True the model enters structured-
# outputs mode which only supports a subset of JSON Schema — MCP tool
# schemas contain unsupported keywords like ``pattern`` that cause the
# model to misinterpret regex constraints as literal argument values.
llm = self.bare_llm.bind_tools(tools_map, tool_choice="none", strict=False)
else:
llm = self.bare_llm.bind_tools(tools_map, strict=False)

# create and execute the chain
chain = messages | llm
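The `strict=False` comment is worth a concrete illustration: under `strict=True` the Responses API accepts only the structured-outputs subset of JSON Schema, and keywords such as `pattern` fall outside it. A hypothetical MCP-style tool schema of the kind that triggers the failure mode described above:

```python
# Hypothetical tool schema; "pattern" is valid JSON Schema but not part of the
# structured-outputs subset, so with strict=True the model can end up echoing
# the regex itself as the argument value instead of a conforming pod name.
tool_schema = {
    "name": "get_pod_logs",
    "description": "Fetch logs for a pod in the current namespace.",
    "parameters": {
        "type": "object",
        "properties": {
            "pod_name": {"type": "string", "pattern": "^[a-z0-9][a-z0-9-]*$"},
        },
        "required": ["pod_name"],
    },
}
```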
@@ -455,6 +461,7 @@ async def iterate_with_tools( # noqa: C901 # pylint: disable=R0912,R0915
logger.debug("Tool calling round %s (final: %s)", i, is_final_round)

tool_call_chunks = []
all_chunks: list[AIMessageChunk] = []
chunk_counter = 0
stop_generation = False
# invoke LLM and process response chunks
@@ -484,15 +491,34 @@
stop_generation = True
continue

# collect tool chunk or yield text
all_chunks.append(chunk)

# collect tool chunk or yield text/reasoning
if getattr(chunk, "tool_call_chunks", None):
tool_call_chunks.append(chunk)
else:
if not skip_special_chunk(
elif isinstance(chunk.content, str):
if chunk.content and not skip_special_chunk(
chunk.content, chunk_counter, self.model, is_final_round
):
# stream text chunks directly
yield StreamedChunk(type="text", text=chunk.content)
elif isinstance(chunk.content, list):
for block in chunk.content:
if not isinstance(block, dict):
continue
block_type = block.get("type")
if block_type == "text":
text = block.get("text", "")
if text and not skip_special_chunk(
text, chunk_counter, self.model, is_final_round
):
yield StreamedChunk(type="text", text=text)
elif block_type == "reasoning":
for part in block.get("summary", []):
text = part.get("text", "")
if text:
yield StreamedChunk(
type="reasoning", text=text
)

chunk_counter += 1

@@ -504,17 +530,40 @@ async def iterate_with_tools( # noqa: C901 # pylint: disable=R0912,R0915
break

# tool calling part
if not tool_call_chunks:
break

if tool_call_chunks:
# assess tool calls and add to messages
tool_calls = tool_calls_from_tool_calls_chunks(tool_call_chunks)
ai_tool_call_message = AIMessage(
content="", type="ai", tool_calls=tool_calls
)

# Accumulate the full AI message (reasoning + tool calls)
# so reasoning context is preserved between rounds per
# OpenAI's "Keeping reasoning items in context" guidance.
if all_chunks:
accumulated = all_chunks[0]
for c in all_chunks[1:]:
accumulated += c # type: ignore [assignment]
ai_tool_call_message = AIMessage(
content=accumulated.content,
tool_calls=tool_calls,
additional_kwargs=accumulated.additional_kwargs,
)
else:
ai_tool_call_message = AIMessage(
content="", type="ai", tool_calls=tool_calls
)
messages.append(ai_tool_call_message)

# Count tokens used by the AIMessage with tool calls
ai_content_text = (
json.dumps(ai_tool_call_message.content)
if isinstance(ai_tool_call_message.content, list)
else str(ai_tool_call_message.content)
)
ai_message_tokens = TokenHandler._get_token_count(
token_handler.text_to_tokens(json.dumps(tool_calls))
token_handler.text_to_tokens(
ai_content_text + json.dumps(tool_calls)
)
)
tool_tokens_used += ai_message_tokens

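The accumulation loop relies on LangChain's chunk-merge semantics: `AIMessageChunk` defines `+`, which merges list content block by block, so summing the streamed chunks reconstructs the full message with its reasoning blocks intact for the next round. A small sketch, assuming indexed content blocks like those produced by the Responses API integration:

```python
from langchain_core.messages import AIMessageChunk

# Two streamed fragments with indexed content blocks (assumed shapes).
first = AIMessageChunk(
    content=[{"type": "reasoning", "summary": [{"text": "Need pod logs."}], "index": 0}]
)
second = AIMessageChunk(content=[{"type": "text", "text": "Calling tool.", "index": 1}])

merged = first + second
print(merged.content)
# [{'type': 'reasoning', 'summary': [{'text': 'Need pod logs.'}], 'index': 0},
#  {'type': 'text', 'text': 'Calling tool.', 'index': 1}]
```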
@@ -653,11 +702,11 @@ async def drain_generate_response() -> SummarizerResponse:
tool_calls.append(chunk.data)
elif chunk.type == "tool_result":
tool_results.append(chunk.data)
elif chunk.type == "reasoning":
pass
elif chunk.type == "text":
chunks.append(chunk.text)
else:
# this "can't" happen as we control what chunk types
# are yielded in the generator directly
msg = f"Unknown chunk type: {chunk.type}"
logger.warning(msg)
raise ValueError(msg)