From 9345890541cd5121f7f4745912a9939671fed482 Mon Sep 17 00:00:00 2001 From: mitya Date: Mon, 9 Jun 2025 13:35:38 +0200 Subject: [PATCH 1/6] split thinking part --- .../refact_utils/third_party/utils/configs.py | 2 +- .../webgui/selfhost_fastapi_completions.py | 66 +++++++++++++++++-- 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/refact-server/refact_utils/third_party/utils/configs.py b/refact-server/refact_utils/third_party/utils/configs.py index acd503853..f394009e1 100644 --- a/refact-server/refact_utils/third_party/utils/configs.py +++ b/refact-server/refact_utils/third_party/utils/configs.py @@ -17,7 +17,7 @@ class ModelCapabilities(BaseModel): agent: bool clicks: bool completion: bool - reasoning: Optional[str] = False + reasoning: Optional[str] = None boost_reasoning: bool = False diff --git a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py index 4060648cf..5e3beebe1 100644 --- a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py +++ b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py @@ -192,6 +192,39 @@ async def embeddings_streamer(ticket: Ticket, timeout, created_ts): ticket.done() +# NOTE: some models doesn't support multiple parsers for now, we need parse thinking manually in this case +class ThinkingPatcher: + def __init__(self, thinking_split_token: Optional[str]): + self._thinking_split_token = thinking_split_token + self._thinking_split_index = set() + + def patch_choices(self, choices: List[Dict]) -> List[Dict]: + if self._thinking_split_token is None: + return choices + for choice in choices: + index = choice["index"] + if "delta" in choice: + if content := choice["delta"].get("content"): + if index not in self._thinking_split_index: + if self._thinking_split_token in content: + self._thinking_split_index.add(index) + choice["delta"]["reasoning_content"], choice["delta"]["content"] \ + = 
(*content.split(self._thinking_split_token), "")[:2] + else: + choice["delta"]["reasoning_content"] = content + choice["delta"]["content"] = "" + else: + choice["delta"]["reasoning_content"] = "" + choice["delta"]["content"] = content + elif "message" in choice: + if content := choice["message"].get("content", ""): + choice["message"]["reasoning_content"], choice["message"]["content"] \ + = (*content.split(self._thinking_split_token), "")[:2] + else: + log(f"unknown choice type with keys: {choice.keys()}, skip thinking patch") + return choices + + class BaseCompletionsRouter(APIRouter): def __init__(self, @@ -573,6 +606,25 @@ def _wrap_output(output: str) -> str: "timeout": 60 * 60, # An hour timeout for thinking models } + thinking_split_token = None + if model_config.capabilities.reasoning in ["qwen", "deepseek"]: + thinking_split_token = "" + + # Qwen3 thinking arguments override + if post.enable_thinking is not None: + completion_kwargs["chat_template_kwargs"] = {"enable_thinking": post.enable_thinking} + completion_kwargs["top_k"] = 20 + if post.enable_thinking: + completion_kwargs["top_p"] = 0.95 + completion_kwargs["min_p"] = 0 + completion_kwargs["presence_penalty"] = 1 + else: + thinking_split_token = None + completion_kwargs["temperature"] = 0.7 + completion_kwargs["top_p"] = 0.8 + completion_kwargs["presence_penalty"] = 1.5 + thinking_patcher = ThinkingPatcher(thinking_split_token=thinking_split_token) + if post.reasoning_effort or post.thinking: del completion_kwargs["temperature"] del completion_kwargs["top_p"] @@ -592,11 +644,13 @@ async def litellm_streamer(): async for model_response in response: try: data = model_response.dict() - choice0 = data["choices"][0] - finish_reason = choice0["finish_reason"] - if delta := choice0.get("delta"): - if text := delta.get("content"): - generated_tokens_n += litellm.token_counter(model_config.model_id, text=text) + if "choices" in data: + data["choices"] = thinking_patcher.patch_choices(data["choices"]) + 
choice0 = data["choices"][0] + finish_reason = choice0["finish_reason"] + if delta := choice0.get("delta"): + if text := delta.get("content"): + generated_tokens_n += litellm.token_counter(model_config.model_id, text=text) except json.JSONDecodeError: data = {"choices": [{"finish_reason": finish_reason}]} @@ -628,6 +682,8 @@ async def litellm_non_streamer(): if text := choice.get("message", {}).get("content"): generated_tokens_n += litellm.token_counter(model_config.model_id, text=text) finish_reason = choice.get("finish_reason") + if "choices" in data: + data["choices"] = thinking_patcher.patch_choices(data["choices"]) usage_dict = model_config.compose_usage_dict(prompt_tokens_n, generated_tokens_n) data.update(usage_dict) except json.JSONDecodeError: From 772f75ac628a4eda272b86a4ae503afa8856112f Mon Sep 17 00:00:00 2001 From: mitya Date: Mon, 9 Jun 2025 13:58:33 +0200 Subject: [PATCH 2/6] less args --- .../webgui/selfhost_fastapi_completions.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py index 5e3beebe1..b9cfd99f0 100644 --- a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py +++ b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py @@ -611,18 +611,11 @@ def _wrap_output(output: str) -> str: thinking_split_token = "" # Qwen3 thinking arguments override + # NOTE: qwen3 can work in two different modes, + # but we don't pass this specific argument into litellm here if post.enable_thinking is not None: - completion_kwargs["chat_template_kwargs"] = {"enable_thinking": post.enable_thinking} - completion_kwargs["top_k"] = 20 - if post.enable_thinking: - completion_kwargs["top_p"] = 0.95 - completion_kwargs["min_p"] = 0 - completion_kwargs["presence_penalty"] = 1 - else: - thinking_split_token = None - completion_kwargs["temperature"] = 0.7 - 
completion_kwargs["top_p"] = 0.8 - completion_kwargs["presence_penalty"] = 1.5 + completion_kwargs["top_p"] = 0.95 + completion_kwargs["presence_penalty"] = 1 thinking_patcher = ThinkingPatcher(thinking_split_token=thinking_split_token) if post.reasoning_effort or post.thinking: From 0db08a5370e9b2aa2ec16b9206ca32277ff33f2a Mon Sep 17 00:00:00 2001 From: mitya Date: Tue, 10 Jun 2025 11:28:33 +0200 Subject: [PATCH 3/6] repair invalid index and id for tool call stream --- refact-agent/gui/src/features/Chat/Thread/utils.ts | 13 ++++++++----- .../webgui/selfhost_fastapi_completions.py | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/refact-agent/gui/src/features/Chat/Thread/utils.ts b/refact-agent/gui/src/features/Chat/Thread/utils.ts index 0d7c67e82..f434c4768 100644 --- a/refact-agent/gui/src/features/Chat/Thread/utils.ts +++ b/refact-agent/gui/src/features/Chat/Thread/utils.ts @@ -36,6 +36,7 @@ import { isToolCallMessage, Usage, } from "../../../services/refact"; +import { v4 as uuidv4 } from "uuid"; import { parseOrElse } from "../../../utils"; import { type LspChatMessage } from "../../../services/refact"; import { checkForDetailMessage } from "./types"; @@ -81,8 +82,12 @@ POINT2 FOR_FUTURE_FEREFENCE: ... 
function mergeToolCall(prev: ToolCall[], add: ToolCall): ToolCall[] { const calls = prev.slice(); - if (calls[add.index]) { - const prevCall = calls[add.index]; + if (!calls.length || add.function.name) { + add.index = calls.length; + if (!add.id) { add.id = uuidv4() } + calls[calls.length] = add + } else { + const prevCall = calls[calls.length - 1]; const prevArgs = prevCall.function.arguments; const nextArgs = prevArgs + add.function.arguments; const call: ToolCall = { @@ -92,9 +97,7 @@ function mergeToolCall(prev: ToolCall[], add: ToolCall): ToolCall[] { arguments: nextArgs, }, }; - calls[add.index] = call; - } else { - calls[add.index] = add; + calls[calls.length - 1] = call; } return calls; } diff --git a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py index b9cfd99f0..e8be9029e 100644 --- a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py +++ b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py @@ -30,7 +30,7 @@ from pydantic import BaseModel from typing import List, Dict, Union, Optional, Tuple, Any -__all__ = ["BaseCompletionsRouter", "CompletionsRouter"] +__all__ = ["BaseCompletionsRouter", "CompletionsRouter", "ThinkingPatcher"] def clamp(lower, upper, x): From 5a9ec2d92517570aa6aeb17115a443eda3a79c5c Mon Sep 17 00:00:00 2001 From: mitya Date: Tue, 10 Jun 2025 11:39:05 +0200 Subject: [PATCH 4/6] prettier --- refact-agent/gui/src/features/Chat/Thread/utils.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/refact-agent/gui/src/features/Chat/Thread/utils.ts b/refact-agent/gui/src/features/Chat/Thread/utils.ts index f434c4768..5b5bfe482 100644 --- a/refact-agent/gui/src/features/Chat/Thread/utils.ts +++ b/refact-agent/gui/src/features/Chat/Thread/utils.ts @@ -84,8 +84,10 @@ function mergeToolCall(prev: ToolCall[], add: ToolCall): ToolCall[] { if (!calls.length || add.function.name) { add.index = calls.length; - if 
(!add.id) { add.id = uuidv4() } - calls[calls.length] = add + if (!add.id) { + add.id = uuidv4(); + } + calls[calls.length] = add; } else { const prevCall = calls[calls.length - 1]; const prevArgs = prevCall.function.arguments; From b70b1ff637f950fa1ad9a063cbaae79c2072bd2a Mon Sep 17 00:00:00 2001 From: mitya Date: Tue, 10 Jun 2025 12:39:15 +0200 Subject: [PATCH 5/6] remove if need to patch --- .../webgui/selfhost_fastapi_completions.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py index e8be9029e..7edc582aa 100644 --- a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py +++ b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py @@ -194,22 +194,29 @@ async def embeddings_streamer(ticket: Ticket, timeout, created_ts): # NOTE: some models doesn't support multiple parsers for now, we need parse thinking manually in this case class ThinkingPatcher: - def __init__(self, thinking_split_token: Optional[str]): - self._thinking_split_token = thinking_split_token + def __init__( + self, + thinking_tokens: Optional[Tuple[str, str]], + ): + if thinking_tokens is None: + thinking_tokens = None, None + self._thinking_start_token, self._thinking_end_token = thinking_tokens self._thinking_split_index = set() def patch_choices(self, choices: List[Dict]) -> List[Dict]: - if self._thinking_split_token is None: + if self._thinking_end_token is None: return choices for choice in choices: index = choice["index"] if "delta" in choice: if content := choice["delta"].get("content"): + if self._thinking_start_token: + content = content.replace(self._thinking_start_token, "") if index not in self._thinking_split_index: - if self._thinking_split_token in content: + if self._thinking_end_token in content: self._thinking_split_index.add(index) choice["delta"]["reasoning_content"], 
choice["delta"]["content"] \ - = (*content.split(self._thinking_split_token), "")[:2] + = (*content.split(self._thinking_end_token), "")[:2] else: choice["delta"]["reasoning_content"] = content choice["delta"]["content"] = "" @@ -218,8 +225,10 @@ def patch_choices(self, choices: List[Dict]) -> List[Dict]: choice["delta"]["content"] = content elif "message" in choice: if content := choice["message"].get("content", ""): + if self._thinking_start_token: + content = content.replace(self._thinking_start_token, "") choice["message"]["reasoning_content"], choice["message"]["content"] \ - = (*content.split(self._thinking_split_token), "")[:2] + = (*content.split(self._thinking_end_token), "")[:2] else: log(f"unknown choice type with keys: {choice.keys()}, skip thinking patch") return choices @@ -606,9 +615,9 @@ def _wrap_output(output: str) -> str: "timeout": 60 * 60, # An hour timeout for thinking models } - thinking_split_token = None + thinking_tokens = None if model_config.capabilities.reasoning in ["qwen", "deepseek"]: - thinking_split_token = "" + thinking_tokens = "", "" # Qwen3 thinking arguments override # NOTE: qwen3 can work in two different modes, @@ -616,7 +625,7 @@ def _wrap_output(output: str) -> str: if post.enable_thinking is not None: completion_kwargs["top_p"] = 0.95 completion_kwargs["presence_penalty"] = 1 - thinking_patcher = ThinkingPatcher(thinking_split_token=thinking_split_token) + thinking_patcher = ThinkingPatcher(thinking_tokens=thinking_tokens) if post.reasoning_effort or post.thinking: del completion_kwargs["temperature"] From 9a91ee78326012b48c2ff934ceb803ad0da29ae6 Mon Sep 17 00:00:00 2001 From: mitya Date: Tue, 10 Jun 2025 15:20:02 +0200 Subject: [PATCH 6/6] comment on logic change --- refact-agent/gui/src/features/Chat/Thread/utils.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/refact-agent/gui/src/features/Chat/Thread/utils.ts b/refact-agent/gui/src/features/Chat/Thread/utils.ts index 5b5bfe482..e97b9d29d 100644 --- 
a/refact-agent/gui/src/features/Chat/Thread/utils.ts +++ b/refact-agent/gui/src/features/Chat/Thread/utils.ts @@ -82,6 +82,10 @@ POINT2 FOR_FUTURE_FEREFENCE: ... function mergeToolCall(prev: ToolCall[], add: ToolCall): ToolCall[] { const calls = prev.slice(); + // NOTE: we can't be sure that the backend sends correct indexes for tool calls. + // With qwen3 served by sglang, two problems were observed and are worked around here: + // 1. the first tool call delta arrives with index == 2 and the next one with index == 0 + // 2. the second tool call in a row has id == null if (!calls.length || add.function.name) { add.index = calls.length; if (!add.id) {