From cace6c212c7d312086b33b5f9e20d4ed920e532f Mon Sep 17 00:00:00 2001
From: mitya <dimitry.ageev@gmail.com>
Date: Sun, 8 Jun 2025 09:22:20 +0200
Subject: [PATCH 1/5] qwen thinking mode and some improvements on server side

---
 refact-agent/engine/src/call_validation.rs           |  2 ++
 .../engine/src/forward_to_openai_endpoint.rs         |  8 +++++++-
 .../engine/src/scratchpads/chat_passthrough.rs       | 12 ++++++++----
 .../webgui/selfhost_fastapi_completions.py           |  2 ++
 .../refact_webgui/webgui/selfhost_model_assigner.py  |  3 ++-
 5 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/refact-agent/engine/src/call_validation.rs b/refact-agent/engine/src/call_validation.rs
index 4af2cca32..03b625e89 100644
--- a/refact-agent/engine/src/call_validation.rs
+++ b/refact-agent/engine/src/call_validation.rs
@@ -53,6 +53,8 @@ pub struct SamplingParameters {
     pub reasoning_effort: Option<ReasoningEffort>,  // OpenAI style reasoning
     #[serde(default)]
     pub thinking: Option<serde_json::Value>,  // Anthropic style reasoning
+    #[serde(default)]
+    pub enable_thinking: Option<bool>,  // Qwen style reasoning
 }
 
 #[derive(Debug, Deserialize, Clone)]
diff --git a/refact-agent/engine/src/forward_to_openai_endpoint.rs b/refact-agent/engine/src/forward_to_openai_endpoint.rs
index 0876c4f29..f6cec3d70 100644
--- a/refact-agent/engine/src/forward_to_openai_endpoint.rs
+++ b/refact-agent/engine/src/forward_to_openai_endpoint.rs
@@ -44,6 +44,9 @@ pub async fn forward_to_openai_style_endpoint(
         data["reasoning_effort"] = serde_json::Value::String(reasoning_effort.to_string());
     } else if let Some(thinking) = sampling_parameters.thinking.clone() {
         data["thinking"] = thinking.clone();
+    } else if let Some(enable_thinking) = sampling_parameters.enable_thinking {  // Qwen style reasoning
+        data["enable_thinking"] = serde_json::Value::Bool(enable_thinking);
+        if let Some(t) = sampling_parameters.temperature { data["temperature"] = t.into(); }  // omit the key instead of sending null
     } else if let Some(temperature) = sampling_parameters.temperature {
         data["temperature"] = serde_json::Value::from(temperature);
     }
@@ -130,7 +133,10 @@ pub async fn forward_to_openai_style_endpoint_streaming(
         data["reasoning_effort"] = serde_json::Value::String(reasoning_effort.to_string());
     } else if let Some(thinking) = sampling_parameters.thinking.clone() {
         data["thinking"] = thinking.clone();
-    } else if let Some(temperature) = sampling_parameters.temperature {
+    } else if let Some(enable_thinking) = sampling_parameters.enable_thinking {  // Qwen style reasoning
+        data["enable_thinking"] = serde_json::Value::Bool(enable_thinking);
+        if let Some(t) = sampling_parameters.temperature { data["temperature"] = t.into(); }  // omit the key instead of sending null
+    } else if let Some(temperature) = sampling_parameters.temperature {
         data["temperature"] = serde_json::Value::from(temperature);
     }
     data["max_completion_tokens"] = serde_json::Value::from(sampling_parameters.max_new_tokens);
diff --git a/refact-agent/engine/src/scratchpads/chat_passthrough.rs b/refact-agent/engine/src/scratchpads/chat_passthrough.rs
index ae5bcb552..f5463ae30 100644
--- a/refact-agent/engine/src/scratchpads/chat_passthrough.rs
+++ b/refact-agent/engine/src/scratchpads/chat_passthrough.rs
@@ -282,7 +282,6 @@ fn _adapt_for_reasoning_models(
                 sampling_parameters.reasoning_effort = Some(ReasoningEffort::High);
             }
             sampling_parameters.temperature = default_temperature;
-            sampling_parameters.thinking = None;
 
             // NOTE: OpenAI prefer user message over system
             messages.into_iter().map(|mut msg| {
@@ -304,12 +303,17 @@ fn _adapt_for_reasoning_models(
                     "budget_tokens": budget_tokens,
                 }));
             }
-            sampling_parameters.reasoning_effort = None;
+            messages
+        },
+        "qwen" => {
+            if supports_boost_reasoning && sampling_parameters.boost_reasoning {
+                sampling_parameters.enable_thinking = Some(true);
+            }
+            // In fact qwen3 wants 0.7 temperature for no-thinking mode but we'll use defaults for thinking
+            sampling_parameters.temperature = default_temperature.clone();
             messages
         },
         _ => {
-            sampling_parameters.reasoning_effort = None;
-            sampling_parameters.thinking = None;
             sampling_parameters.temperature = default_temperature.clone();
             messages
         }
diff --git a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
index db91c7f94..4060648cf 100644
--- a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
+++ b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py
@@ -96,6 +96,7 @@ class ChatContext(NlpSamplingParams):
     n: int = 1
     reasoning_effort: Optional[str] = None  # OpenAI style reasoning
     thinking: Optional[Dict] = None  # Anthropic style reasoning
+    enable_thinking: Optional[bool] = None  # Qwen style reasoning
 
 
 class EmbeddingsStyleOpenAI(BaseModel):
@@ -569,6 +570,7 @@ def _wrap_output(output: str) -> str:
             "stop": post.stop if post.stop else None,
             "n": post.n,
             "extra_headers": model_config.extra_headers if model_config.extra_headers else None,
+            "timeout": 60 * 60,  # An hour timeout for thinking models
         }
 
         if post.reasoning_effort or post.thinking:
diff --git a/refact-server/refact_webgui/webgui/selfhost_model_assigner.py b/refact-server/refact_webgui/webgui/selfhost_model_assigner.py
index 60c012294..1fd0ee17d 100644
--- a/refact-server/refact_webgui/webgui/selfhost_model_assigner.py
+++ b/refact-server/refact_webgui/webgui/selfhost_model_assigner.py
@@ -17,7 +17,8 @@
 __all__ = ["ModelAssigner"]
 
 
-ALLOWED_N_CTX = [2 ** p for p in range(10, 20)]
+# ALLOWED_N_CTX = [2 ** p for p in range(10, 20)]
+ALLOWED_N_CTX = [1024, 2048, 4096] + [8192 * (t + 1) for t in range(0, 16)]
 ALLOWED_GPUS_SHARD = [2 ** p for p in range(10)]
 
 

From 8695ee416b53bb2ec0a85d2fd370d0109ecbbebe Mon Sep 17 00:00:00 2001
From: mitya <dimitry.ageev@gmail.com>
Date: Sun, 8 Jun 2025 09:38:03 +0200
Subject: [PATCH 2/5] add qwen reasoning option

---
 refact-server/refact_utils/third_party/utils/configs.py         | 2 +-
 .../refact_webgui/webgui/static/tab-third-party-apis.js         | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/refact-server/refact_utils/third_party/utils/configs.py b/refact-server/refact_utils/third_party/utils/configs.py
index 9e642493a..acd503853 100644
--- a/refact-server/refact_utils/third_party/utils/configs.py
+++ b/refact-server/refact_utils/third_party/utils/configs.py
@@ -68,7 +68,7 @@ def to_chat_model_record(self) -> Dict[str, Any]:
             "supports_agent": self.capabilities.agent,
             "supports_reasoning": self.capabilities.reasoning,
             "supports_boost_reasoning": self.capabilities.boost_reasoning,
-            "default_temperature": 0.6 if self.capabilities.reasoning == "deepseek" else None,
+            "default_temperature": 0.6 if self.capabilities.reasoning in ["deepseek", "qwen"] else None,
         }
 
 
diff --git a/refact-server/refact_webgui/webgui/static/tab-third-party-apis.js b/refact-server/refact_webgui/webgui/static/tab-third-party-apis.js
index d8dd26daa..4cca580af 100644
--- a/refact-server/refact_webgui/webgui/static/tab-third-party-apis.js
+++ b/refact-server/refact_webgui/webgui/static/tab-third-party-apis.js
@@ -654,6 +654,7 @@ function showAddModelModal(providerId) {
                                 <option value="">None</option>
                                 <option value="openai">OpenAI</option>
                                 <option value="anthropic">Anthropic</option>
+                                <option value="qwen">Qwen</option>
                                 <option value="deepseek">DeepSeek</option>
                             </select>
                             <div class="form-text">Select the reasoning type supported by this model.</div>

From 2fab71f6372214d76cbf8a641a45c023849f83af Mon Sep 17 00:00:00 2001
From: mitya <dimitry.ageev@gmail.com>
Date: Sun, 8 Jun 2025 10:32:59 +0200
Subject: [PATCH 3/5] set enable_thinking explicitly for qwen and clear it for non-reasoning models

---
 refact-agent/engine/src/scratchpads/chat_passthrough.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/refact-agent/engine/src/scratchpads/chat_passthrough.rs b/refact-agent/engine/src/scratchpads/chat_passthrough.rs
index f5463ae30..50f5337ca 100644
--- a/refact-agent/engine/src/scratchpads/chat_passthrough.rs
+++ b/refact-agent/engine/src/scratchpads/chat_passthrough.rs
@@ -220,6 +220,7 @@ impl ScratchpadAbstract for ChatPassthrough {
             // drop all reasoning parameters in case of non-reasoning model
             sampling_parameters_to_patch.reasoning_effort = None;
             sampling_parameters_to_patch.thinking = None;
+            sampling_parameters_to_patch.enable_thinking = None;
             limited_msgs
         };
 
@@ -308,6 +309,8 @@ fn _adapt_for_reasoning_models(
         "qwen" => {
             if supports_boost_reasoning && sampling_parameters.boost_reasoning {
                 sampling_parameters.enable_thinking = Some(true);
+            } else {
+                sampling_parameters.enable_thinking = Some(false);
             }
             // In fact qwen3 wants 0.7 temperature for no-thinking mode but we'll use defaults for thinking
             sampling_parameters.temperature = default_temperature.clone();

From c8056c9eca01b0e5a58b7175e15a8717e7d25a3d Mon Sep 17 00:00:00 2001
From: mitya <dimitry.ageev@gmail.com>
Date: Mon, 9 Jun 2025 09:23:37 +0200
Subject: [PATCH 4/5] add concurrency setup in UI

---
 .../webgui/selfhost_model_assigner.py         | 28 +++++++++++++++-
 .../webgui/static/tab-model-hosting.html      |  1 +
 .../webgui/static/tab-model-hosting.js        | 33 ++++++++++++++++---
 .../refact_webgui/webgui/tab_models_host.py   |  7 +++-
 4 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/refact-server/refact_webgui/webgui/selfhost_model_assigner.py b/refact-server/refact_webgui/webgui/selfhost_model_assigner.py
index 1fd0ee17d..0087a9cc3 100644
--- a/refact-server/refact_webgui/webgui/selfhost_model_assigner.py
+++ b/refact-server/refact_webgui/webgui/selfhost_model_assigner.py
@@ -20,6 +20,7 @@
 # ALLOWED_N_CTX = [2 ** p for p in range(10, 20)]
 ALLOWED_N_CTX = [1024, 2048, 4096] + [8192 * (t + 1) for t in range(0, 16)]
 ALLOWED_GPUS_SHARD = [2 ** p for p in range(10)]
+ALLOWED_CONCURRENCY = [2 ** p for p in range(9)]
 
 
 def has_context_switch(filter_caps: List[str]) -> bool:
@@ -56,6 +57,7 @@ class ModelWatchdogDConfig:
     share_gpu: bool
     n_ctx: Optional[int] = None
     has_loras: bool = False
+    concurrency: Optional[int] = None
 
     def dump(self, model_cfg_j: Dict) -> str:
         model_cfg_j["command_line"].extend(["--model", self.model_name])
@@ -64,6 +66,8 @@ def dump(self, model_cfg_j: Dict) -> str:
                 model_cfg_j["command_line"].extend(["--n-ctx", self.n_ctx])
             if not self.has_loras:
                 model_cfg_j["command_line"].append("--loraless")
+            if self.concurrency:
+                model_cfg_j["command_line"].extend(["--concurrency", self.concurrency])
 
         model_cfg_j["gpus"] = self.gpus
         model_cfg_j["share_gpu"] = self.share_gpu
@@ -104,6 +108,10 @@ def shard_gpu_backends(self) -> Set[str]:
     def share_gpu_backends(self) -> Set[str]:
         return {"transformers"}
 
+    @property
+    def concurrency_backends(self) -> Set[str]:  # backend names whose models get a concurrency choice
+        return set()  # empty: with this value models_info reports default_concurrency 0 and no options
+
     @property
     def models_db(self) -> Dict[str, Any]:
         return models_mini_db
@@ -219,6 +227,7 @@ def _model_inference_setup(self, inference_config: Dict[str, Any]) -> Dict[str,
                         share_gpu=assignment.get("share_gpu", False),
                         n_ctx=assignment.get("n_ctx", None),
                         has_loras=self._has_loras(model_name),
+                        concurrency=assignment.get("concurrency", None),
                     ))
                     continue
                 for model_cursor in range(cursor, next_cursor, assignment["gpus_shard"]):
@@ -229,6 +238,7 @@ def _model_inference_setup(self, inference_config: Dict[str, Any]) -> Dict[str,
                         share_gpu=assignment.get("share_gpu", False),
                         n_ctx=assignment.get("n_ctx", None),
                         has_loras=self._has_loras(model_name),
+                        concurrency=assignment.get("concurrency", None),
                     ))
             for _ in range(model_group.gpus_shard()):
                 if gpus[cursor]["mem_total_mb"] < model_group.required_memory_mb(self.models_db):
@@ -328,6 +338,13 @@ def models_info(self):
                     gpus_shard for gpus_shard in ALLOWED_GPUS_SHARD
                     if gpus_shard <= max_available_shards
                 ]
+            if rec["backend"] in self.concurrency_backends:
+                default_concurrency = ALLOWED_CONCURRENCY[-1]
+                available_concurrency = ALLOWED_CONCURRENCY
+            else:
+                default_concurrency = 0
+                available_concurrency = []
+
             info.append({
                 "name": k,
                 "backend": rec["backend"],
@@ -341,6 +358,8 @@ def models_info(self):
                 "default_n_ctx": default_n_ctx,
                 "available_n_ctx": available_n_ctx,
                 "available_shards": available_shards,
+                "default_concurrency": default_concurrency,
+                "available_concurrency": available_concurrency,
                 "is_deprecated": bool(rec.get("deprecated", False)),
                 "repo_status": self._models_repo_status[k],
                 "repo_url": f"https://huggingface.co/{rec['model_path']}",
@@ -368,8 +387,15 @@ def _set_n_ctx(model: str, record: Dict) -> Dict:
             record["n_ctx"] = n_ctx
             return record
 
+        def _set_concurrency(model: str, record: Dict) -> Dict:  # fill in the "concurrency" key of one assignment record
+            if self.models_db[model]["backend"] in self.concurrency_backends:
+                record["concurrency"] = record.get("concurrency", ALLOWED_CONCURRENCY[-1])  # keep stored value, else the largest allowed one
+            else:
+                record["concurrency"] = 0  # backend is not in concurrency_backends: always stored as 0
+            return record
+
         j["model_assign"] = self._share_gpu_filter({
-            model: _set_n_ctx(model, v)
+            model: _set_concurrency(model, _set_n_ctx(model, v))
             for model, v in j["model_assign"].items()
             if model in self.models_db
         })
diff --git a/refact-server/refact_webgui/webgui/static/tab-model-hosting.html b/refact-server/refact_webgui/webgui/static/tab-model-hosting.html
index 943d7b4af..46da2d740 100644
--- a/refact-server/refact_webgui/webgui/static/tab-model-hosting.html
+++ b/refact-server/refact_webgui/webgui/static/tab-model-hosting.html
@@ -11,6 +11,7 @@ <h3>Hosted Models</h3>
       <tr>
         <th>Model</th>
         <th>Context</th>
+        <th>Concurrency</th>
         <th>Finetune</th>
         <th>Sharding</th>
         <th>Share GPU</th>
diff --git a/refact-server/refact_webgui/webgui/static/tab-model-hosting.js b/refact-server/refact_webgui/webgui/static/tab-model-hosting.js
index a22136ef1..e9952bf4e 100644
--- a/refact-server/refact_webgui/webgui/static/tab-model-hosting.js
+++ b/refact-server/refact_webgui/webgui/static/tab-model-hosting.js
@@ -262,13 +262,15 @@ function render_models_assigned(models) {
         const row = document.createElement('tr');
         row.setAttribute('data-model',index);
         let model_name = document.createElement("td");
-        model_name.style.width = "20%";
+        model_name.style.width = "18%";
         let context = document.createElement("td");
-        context.style.width = "15%";
+        context.style.width = "12%";
+        let concurrency = document.createElement("td");
+        concurrency.style.width = "12%";
         let finetune_info = document.createElement("td");
-        finetune_info.style.width = "35%";
+        finetune_info.style.width = "30%";
         let select_gpus = document.createElement("td");
-        select_gpus.style.width = "15%";
+        select_gpus.style.width = "13%";
         let gpus_share = document.createElement("td");
         gpus_share.style.width = "10%";
         let del = document.createElement("td");
@@ -317,6 +319,27 @@ function render_models_assigned(models) {
             context.innerHTML = `<span class="default-context">${models_info[index].default_n_ctx}</span>`;
         }
 
+        const model_concurrency = models_data.model_assign[index].concurrency || models_info[index].default_concurrency;
+        if (models_info[index].available_concurrency && models_info[index].available_concurrency.length > 0) {
+            const concurrency_options = models_info[index].available_concurrency;
+            const concurrency_input = document.createElement("select");
+            concurrency_input.classList.add('form-select','form-select-sm');
+            concurrency_options.forEach(element => {
+                const concurrency_option = document.createElement("option");
+                concurrency_option.setAttribute('value',element);
+                concurrency_option.textContent = element;
+                if(element === model_concurrency) {
+                    concurrency_option.setAttribute('selected','selected');
+                }
+                concurrency_input.appendChild(concurrency_option);
+            });
+            concurrency_input.addEventListener('change', function() {
+                models_data.model_assign[index].concurrency = Number(this.value);
+                save_model_assigned();
+            });
+            concurrency.appendChild(concurrency_input);
+        }
+
         let finetune_runs = [];
         if (finetune_configs_and_runs) {
             finetune_runs = finetune_configs_and_runs.finetune_runs.filter(
@@ -397,6 +420,7 @@ function render_models_assigned(models) {
 
         row.appendChild(model_name);
         row.appendChild(context);
+        row.appendChild(concurrency);
         row.appendChild(finetune_info);
         row.appendChild(select_gpus);
         row.appendChild(gpus_share);
@@ -680,6 +704,7 @@ function render_models(models) {
                     models_data.model_assign[model_name] = {
                         gpus_shard: default_gpus_shard,
                         n_ctx: element.default_n_ctx,
+                        concurrency: element.default_concurrency,
                     };
                     save_model_assigned();
                     add_model_modal.hide();
diff --git a/refact-server/refact_webgui/webgui/tab_models_host.py b/refact-server/refact_webgui/webgui/tab_models_host.py
index 847842e7e..a898dde79 100644
--- a/refact-server/refact_webgui/webgui/tab_models_host.py
+++ b/refact-server/refact_webgui/webgui/tab_models_host.py
@@ -10,7 +10,7 @@
 from refact_webgui.webgui.tab_loras import rm
 from refact_webgui.webgui.tab_loras import unpack
 from refact_webgui.webgui.tab_loras import write_to_file
-from refact_webgui.webgui.selfhost_model_assigner import ModelAssigner
+from refact_webgui.webgui.selfhost_model_assigner import ModelAssigner, ALLOWED_CONCURRENCY
 
 from pathlib import Path
 from pydantic import BaseModel
@@ -38,6 +38,7 @@ class TabHostModelRec(BaseModel):
     gpus_shard: int = Query(default=1, ge=0, le=1024)
     share_gpu: bool = False
     n_ctx: Optional[int] = None
+    concurrency: Optional[int] = None
 
 
 class TabHostModelsAssign(BaseModel):
@@ -111,11 +112,15 @@ async def _tab_host_models_assign(self, post: TabHostModelsAssign):
         for model_name, model_cfg in post.model_assign.items():
             if model_cfg.n_ctx is None:
                 raise HTTPException(status_code=400, detail=f"n_ctx must be set for {model_name}")
+            if model_cfg.concurrency is None:
+                raise HTTPException(status_code=400, detail=f"concurrency must be set for {model_name}")
             for model_info in self._model_assigner.models_info["models"]:
                 if model_info["name"] == model_name:
                     max_n_ctx = model_info["default_n_ctx"]
                     if model_cfg.n_ctx > max_n_ctx:
                         raise HTTPException(status_code=400, detail=f"n_ctx must be less or equal to {max_n_ctx} for {model_name}")
+                    if model_cfg.concurrency and model_cfg.concurrency not in model_info["available_concurrency"]:
+                        raise HTTPException(status_code=400, detail=f"concurrency must be one of {model_info['available_concurrency']} for {model_name}")
                     break
             else:
                 raise HTTPException(status_code=400, detail=f"model {model_name} not found")

From 7ce818747176845ca2c1dc51e8e0df9cc7dd7b4c Mon Sep 17 00:00:00 2001
From: mitya <dimitry.ageev@gmail.com>
Date: Mon, 9 Jun 2025 12:58:30 +0200
Subject: [PATCH 5/5] extend ALLOWED_N_CTX to 19 multiples of 8192 and drop the commented-out list

---
 refact-server/refact_webgui/webgui/selfhost_model_assigner.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/refact-server/refact_webgui/webgui/selfhost_model_assigner.py b/refact-server/refact_webgui/webgui/selfhost_model_assigner.py
index 0087a9cc3..147e20c06 100644
--- a/refact-server/refact_webgui/webgui/selfhost_model_assigner.py
+++ b/refact-server/refact_webgui/webgui/selfhost_model_assigner.py
@@ -17,8 +17,7 @@
 __all__ = ["ModelAssigner"]
 
 
-# ALLOWED_N_CTX = [2 ** p for p in range(10, 20)]
-ALLOWED_N_CTX = [1024, 2048, 4096] + [8192 * (t + 1) for t in range(0, 16)]
+ALLOWED_N_CTX = [1024, 2048, 4096] + [8192 * (t + 1) for t in range(0, 19)]
 ALLOWED_GPUS_SHARD = [2 ** p for p in range(10)]
 ALLOWED_CONCURRENCY = [2 ** p for p in range(9)]