From 9345890541cd5121f7f4745912a9939671fed482 Mon Sep 17 00:00:00 2001
From: mitya <dimitry.ageev@gmail.com>
Date: Mon, 9 Jun 2025 13:35:38 +0200
Subject: [PATCH 1/6] split thinking part

---
 .../refact_utils/third_party/utils/configs.py |  2 +-
 .../webgui/selfhost_fastapi_completions.py    | 66 +++++++++++++++++--
 2 files changed, 62 insertions(+), 6 deletions(-)
diff --git a/refact-server/refact_utils/third_party/utils/configs.py b/refact-server/refact_utils/third_party/utils/configs.py
index acd503853..f394009e1 100644
--- a/refact-server/refact_utils/third_party/utils/configs.py
+++ b/refact-server/refact_utils/third_party/utils/configs.py
@@ -17,7 +17,7 @@ class ModelCapabilities(BaseModel):
     agent: bool
     clicks: bool
     completion: bool
-    reasoning: Optional[str] = False
+    reasoning: Optional[str] = None
     boost_reasoning: bool = False
 
 
diff --git a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
index 4060648cf..5e3beebe1 100644
--- a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
+++ b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
@@ -192,6 +192,39 @@ async def embeddings_streamer(ticket: Ticket, timeout, created_ts):
         ticket.done()
 
 
+# NOTE: some models doesn't support multiple parsers for now, we need parse thinking manually in this case
+class ThinkingPatcher:
+    def __init__(self, thinking_split_token: Optional[str]):
+        self._thinking_split_token = thinking_split_token
+        self._thinking_split_index = set()
+
+    def patch_choices(self, choices: List[Dict]) -> List[Dict]:
+        if self._thinking_split_token is None:
+            return choices
+        for choice in choices:
+            index = choice["index"]
+            if "delta" in choice:
+                if content := choice["delta"].get("content"):
+                    if index not in self._thinking_split_index:
+                        if self._thinking_split_token in content:
+                            self._thinking_split_index.add(index)
+                            choice["delta"]["reasoning_content"], choice["delta"]["content"] \
+                                = (*content.split(self._thinking_split_token), "")[:2]
+                        else:
+                            choice["delta"]["reasoning_content"] = content
+                            choice["delta"]["content"] = ""
+                    else:
+                        choice["delta"]["reasoning_content"] = ""
+                        choice["delta"]["content"] = content
+            elif "message" in choice:
+                if content := choice["message"].get("content", ""):
+                    choice["message"]["reasoning_content"], choice["message"]["content"] \
+                        = (*content.split(self._thinking_split_token), "")[:2]
+            else:
+                log(f"unknown choice type with keys: {choice.keys()}, skip thinking patch")
+        return choices
+
+
 class BaseCompletionsRouter(APIRouter):
 
     def __init__(self,
@@ -573,6 +606,25 @@ def _wrap_output(output: str) -> str:
             "timeout": 60 * 60,  # An hour timeout for thinking models
         }
 
+        thinking_split_token = None
+        if model_config.capabilities.reasoning in ["qwen", "deepseek"]:
+            thinking_split_token = "</think>"
+
+        # Qwen3 thinking arguments override
+        if post.enable_thinking is not None:
+            completion_kwargs["chat_template_kwargs"] = {"enable_thinking": post.enable_thinking}
+            completion_kwargs["top_k"] = 20
+            if post.enable_thinking:
+                completion_kwargs["top_p"] = 0.95
+                completion_kwargs["min_p"] = 0
+                completion_kwargs["presence_penalty"] = 1
+            else:
+                thinking_split_token = None
+                completion_kwargs["temperature"] = 0.7
+                completion_kwargs["top_p"] = 0.8
+                completion_kwargs["presence_penalty"] = 1.5
+        thinking_patcher = ThinkingPatcher(thinking_split_token=thinking_split_token)
+
         if post.reasoning_effort or post.thinking:
             del completion_kwargs["temperature"]
             del completion_kwargs["top_p"]
@@ -592,11 +644,13 @@ async def litellm_streamer():
                 async for model_response in response:
                     try:
                         data = model_response.dict()
-                        choice0 = data["choices"][0]
-                        finish_reason = choice0["finish_reason"]
-                        if delta := choice0.get("delta"):
-                            if text := delta.get("content"):
-                                generated_tokens_n += litellm.token_counter(model_config.model_id, text=text)
+                        if "choices" in data:
+                            data["choices"] = thinking_patcher.patch_choices(data["choices"])
+                            choice0 = data["choices"][0]
+                            finish_reason = choice0["finish_reason"]
+                            if delta := choice0.get("delta"):
+                                if text := delta.get("content"):
+                                    generated_tokens_n += litellm.token_counter(model_config.model_id, text=text)
 
                     except json.JSONDecodeError:
                         data = {"choices": [{"finish_reason": finish_reason}]}
@@ -628,6 +682,8 @@ async def litellm_non_streamer():
                         if text := choice.get("message", {}).get("content"):
                             generated_tokens_n += litellm.token_counter(model_config.model_id, text=text)
                         finish_reason = choice.get("finish_reason")
+                    if "choices" in data:
+                        data["choices"] = thinking_patcher.patch_choices(data["choices"])
                     usage_dict = model_config.compose_usage_dict(prompt_tokens_n, generated_tokens_n)
                     data.update(usage_dict)
                 except json.JSONDecodeError:

From 772f75ac628a4eda272b86a4ae503afa8856112f Mon Sep 17 00:00:00 2001
From: mitya <dimitry.ageev@gmail.com>
Date: Mon, 9 Jun 2025 13:58:33 +0200
Subject: [PATCH 2/6] less args

---
 .../webgui/selfhost_fastapi_completions.py        | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
index 5e3beebe1..b9cfd99f0 100644
--- a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
+++ b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
@@ -611,18 +611,11 @@ def _wrap_output(output: str) -> str:
             thinking_split_token = "</think>"
 
         # Qwen3 thinking arguments override
+        # NOTE: qwen3 can work in two different modes,
+        # but we don't pass this specific argument into litellm here
         if post.enable_thinking is not None:
-            completion_kwargs["chat_template_kwargs"] = {"enable_thinking": post.enable_thinking}
-            completion_kwargs["top_k"] = 20
-            if post.enable_thinking:
-                completion_kwargs["top_p"] = 0.95
-                completion_kwargs["min_p"] = 0
-                completion_kwargs["presence_penalty"] = 1
-            else:
-                thinking_split_token = None
-                completion_kwargs["temperature"] = 0.7
-                completion_kwargs["top_p"] = 0.8
-                completion_kwargs["presence_penalty"] = 1.5
+            completion_kwargs["top_p"] = 0.95
+            completion_kwargs["presence_penalty"] = 1
         thinking_patcher = ThinkingPatcher(thinking_split_token=thinking_split_token)
 
         if post.reasoning_effort or post.thinking:

From 0db08a5370e9b2aa2ec16b9206ca32277ff33f2a Mon Sep 17 00:00:00 2001
From: mitya <dimitry.ageev@gmail.com>
Date: Tue, 10 Jun 2025 11:28:33 +0200
Subject: [PATCH 3/6] repair invalid index and id for tool call stream

---
 refact-agent/gui/src/features/Chat/Thread/utils.ts  | 13 ++++++++-----
 .../webgui/selfhost_fastapi_completions.py          |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/refact-agent/gui/src/features/Chat/Thread/utils.ts b/refact-agent/gui/src/features/Chat/Thread/utils.ts
index 0d7c67e82..f434c4768 100644
--- a/refact-agent/gui/src/features/Chat/Thread/utils.ts
+++ b/refact-agent/gui/src/features/Chat/Thread/utils.ts
@@ -36,6 +36,7 @@ import {
   isToolCallMessage,
   Usage,
 } from "../../../services/refact";
+import { v4 as uuidv4 } from "uuid";
 import { parseOrElse } from "../../../utils";
 import { type LspChatMessage } from "../../../services/refact";
 import { checkForDetailMessage } from "./types";
@@ -81,8 +82,12 @@ POINT2 FOR_FUTURE_FEREFENCE: ...
 function mergeToolCall(prev: ToolCall[], add: ToolCall): ToolCall[] {
   const calls = prev.slice();
 
-  if (calls[add.index]) {
-    const prevCall = calls[add.index];
+  if (!calls.length || add.function.name) {
+    add.index = calls.length;
+    if (!add.id) { add.id = uuidv4() }
+    calls[calls.length] = add
+  } else {
+    const prevCall = calls[calls.length - 1];
     const prevArgs = prevCall.function.arguments;
     const nextArgs = prevArgs + add.function.arguments;
     const call: ToolCall = {
@@ -92,9 +97,7 @@ function mergeToolCall(prev: ToolCall[], add: ToolCall): ToolCall[] {
         arguments: nextArgs,
       },
     };
-    calls[add.index] = call;
-  } else {
-    calls[add.index] = add;
+    calls[calls.length - 1] = call;
   }
   return calls;
 }
diff --git a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
index b9cfd99f0..e8be9029e 100644
--- a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
+++ b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
@@ -30,7 +30,7 @@
 from pydantic import BaseModel
 from typing import List, Dict, Union, Optional, Tuple, Any
 
-__all__ = ["BaseCompletionsRouter", "CompletionsRouter"]
+__all__ = ["BaseCompletionsRouter", "CompletionsRouter", "ThinkingPatcher"]
 
 
 def clamp(lower, upper, x):

From 5a9ec2d92517570aa6aeb17115a443eda3a79c5c Mon Sep 17 00:00:00 2001
From: mitya <dimitry.ageev@gmail.com>
Date: Tue, 10 Jun 2025 11:39:05 +0200
Subject: [PATCH 4/6] prettier

---
 refact-agent/gui/src/features/Chat/Thread/utils.ts | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/refact-agent/gui/src/features/Chat/Thread/utils.ts b/refact-agent/gui/src/features/Chat/Thread/utils.ts
index f434c4768..5b5bfe482 100644
--- a/refact-agent/gui/src/features/Chat/Thread/utils.ts
+++ b/refact-agent/gui/src/features/Chat/Thread/utils.ts
@@ -84,8 +84,10 @@ function mergeToolCall(prev: ToolCall[], add: ToolCall): ToolCall[] {
 
   if (!calls.length || add.function.name) {
     add.index = calls.length;
-    if (!add.id) { add.id = uuidv4() }
-    calls[calls.length] = add
+    if (!add.id) {
+      add.id = uuidv4();
+    }
+    calls[calls.length] = add;
   } else {
     const prevCall = calls[calls.length - 1];
     const prevArgs = prevCall.function.arguments;

From b70b1ff637f950fa1ad9a063cbaae79c2072bd2a Mon Sep 17 00:00:00 2001
From: mitya <dimitry.ageev@gmail.com>
Date: Tue, 10 Jun 2025 12:39:15 +0200
Subject: [PATCH 5/6] <think> remove if need to patch

---
 .../webgui/selfhost_fastapi_completions.py    | 27 ++++++++++++-------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
index e8be9029e..7edc582aa 100644
--- a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
+++ b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
@@ -194,22 +194,29 @@ async def embeddings_streamer(ticket: Ticket, timeout, created_ts):
 
 # NOTE: some models doesn't support multiple parsers for now, we need parse thinking manually in this case
 class ThinkingPatcher:
-    def __init__(self, thinking_split_token: Optional[str]):
-        self._thinking_split_token = thinking_split_token
+    def __init__(
+            self,
+            thinking_tokens: Optional[Tuple[str, str]],
+    ):
+        if thinking_tokens is None:
+            thinking_tokens = None, None
+        self._thinking_start_token, self._thinking_end_token = thinking_tokens
         self._thinking_split_index = set()
 
     def patch_choices(self, choices: List[Dict]) -> List[Dict]:
-        if self._thinking_split_token is None:
+        if self._thinking_end_token is None:
             return choices
         for choice in choices:
             index = choice["index"]
             if "delta" in choice:
                 if content := choice["delta"].get("content"):
+                    if self._thinking_start_token:
+                        content = content.replace(self._thinking_start_token, "")
                     if index not in self._thinking_split_index:
-                        if self._thinking_split_token in content:
+                        if self._thinking_end_token in content:
                             self._thinking_split_index.add(index)
                             choice["delta"]["reasoning_content"], choice["delta"]["content"] \
-                                = (*content.split(self._thinking_split_token), "")[:2]
+                                = (*content.split(self._thinking_end_token), "")[:2]
                         else:
                             choice["delta"]["reasoning_content"] = content
                             choice["delta"]["content"] = ""
@@ -218,8 +225,10 @@ def patch_choices(self, choices: List[Dict]) -> List[Dict]:
                         choice["delta"]["content"] = content
             elif "message" in choice:
                 if content := choice["message"].get("content", ""):
+                    if self._thinking_start_token:
+                        content = content.replace(self._thinking_start_token, "")
                     choice["message"]["reasoning_content"], choice["message"]["content"] \
-                        = (*content.split(self._thinking_split_token), "")[:2]
+                        = (*content.split(self._thinking_end_token), "")[:2]
             else:
                 log(f"unknown choice type with keys: {choice.keys()}, skip thinking patch")
         return choices
@@ -606,9 +615,9 @@ def _wrap_output(output: str) -> str:
             "timeout": 60 * 60,  # An hour timeout for thinking models
         }
 
-        thinking_split_token = None
+        thinking_tokens = None
         if model_config.capabilities.reasoning in ["qwen", "deepseek"]:
-            thinking_split_token = "</think>"
+            thinking_tokens = "<think>", "</think>"
 
         # Qwen3 thinking arguments override
         # NOTE: qwen3 can work in two different modes,
@@ -616,7 +625,7 @@ def _wrap_output(output: str) -> str:
         if post.enable_thinking is not None:
             completion_kwargs["top_p"] = 0.95
             completion_kwargs["presence_penalty"] = 1
-        thinking_patcher = ThinkingPatcher(thinking_split_token=thinking_split_token)
+        thinking_patcher = ThinkingPatcher(thinking_tokens=thinking_tokens)
 
         if post.reasoning_effort or post.thinking:
             del completion_kwargs["temperature"]

From 9a91ee78326012b48c2ff934ceb803ad0da29ae6 Mon Sep 17 00:00:00 2001
From: mitya <dimitry.ageev@gmail.com>
Date: Tue, 10 Jun 2025 15:20:02 +0200
Subject: [PATCH 6/6] comment on logic change

---
 refact-agent/gui/src/features/Chat/Thread/utils.ts | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/refact-agent/gui/src/features/Chat/Thread/utils.ts b/refact-agent/gui/src/features/Chat/Thread/utils.ts
index 5b5bfe482..e97b9d29d 100644
--- a/refact-agent/gui/src/features/Chat/Thread/utils.ts
+++ b/refact-agent/gui/src/features/Chat/Thread/utils.ts
@@ -82,6 +82,10 @@ POINT2 FOR_FUTURE_FEREFENCE: ...
 function mergeToolCall(prev: ToolCall[], add: ToolCall): ToolCall[] {
   const calls = prev.slice();
 
+  // NOTE: we can't be sure that the backend sends correct indexes for tool calls.
+  // With qwen3 served by sglang, two problems were observed and are worked around here:
+  // 1. the first tool call delta arrives with index == 2, the next one with index == 0 (unexpected)
+  // 2. the second tool call in a row has id == null
   if (!calls.length || add.function.name) {
     add.index = calls.length;
     if (!add.id) {