From cace6c212c7d312086b33b5f9e20d4ed920e532f Mon Sep 17 00:00:00 2001 From: mitya Date: Sun, 8 Jun 2025 09:22:20 +0200 Subject: [PATCH 1/5] qwen thinking mode and some improvements on server side --- refact-agent/engine/src/call_validation.rs | 2 ++ .../engine/src/forward_to_openai_endpoint.rs | 8 +++++++- .../engine/src/scratchpads/chat_passthrough.rs | 12 ++++++++---- .../webgui/selfhost_fastapi_completions.py | 2 ++ .../refact_webgui/webgui/selfhost_model_assigner.py | 3 ++- 5 files changed, 21 insertions(+), 6 deletions(-) diff --git a/refact-agent/engine/src/call_validation.rs b/refact-agent/engine/src/call_validation.rs index 4af2cca32..03b625e89 100644 --- a/refact-agent/engine/src/call_validation.rs +++ b/refact-agent/engine/src/call_validation.rs @@ -53,6 +53,8 @@ pub struct SamplingParameters { pub reasoning_effort: Option, // OpenAI style reasoning #[serde(default)] pub thinking: Option, // Anthropic style reasoning + #[serde(default)] + pub enable_thinking: Option, // Qwen style reasoning } #[derive(Debug, Deserialize, Clone)] diff --git a/refact-agent/engine/src/forward_to_openai_endpoint.rs b/refact-agent/engine/src/forward_to_openai_endpoint.rs index 0876c4f29..f6cec3d70 100644 --- a/refact-agent/engine/src/forward_to_openai_endpoint.rs +++ b/refact-agent/engine/src/forward_to_openai_endpoint.rs @@ -44,6 +44,9 @@ pub async fn forward_to_openai_style_endpoint( data["reasoning_effort"] = serde_json::Value::String(reasoning_effort.to_string()); } else if let Some(thinking) = sampling_parameters.thinking.clone() { data["thinking"] = thinking.clone(); + } else if let Some(enable_thinking) = sampling_parameters.enable_thinking { + data["enable_thinking"] = serde_json::Value::Bool(enable_thinking); + data["temperature"] = serde_json::Value::from(sampling_parameters.temperature); } else if let Some(temperature) = sampling_parameters.temperature { data["temperature"] = serde_json::Value::from(temperature); } @@ -130,7 +133,10 @@ pub async fn forward_to_openai_style_endpoint_streaming( data["reasoning_effort"] = serde_json::Value::String(reasoning_effort.to_string()); } else if let Some(thinking) = sampling_parameters.thinking.clone() { data["thinking"] = thinking.clone(); - } else if let Some(temperature) = sampling_parameters.temperature { + } else if let Some(enable_thinking) = sampling_parameters.enable_thinking { + data["enable_thinking"] = serde_json::Value::Bool(enable_thinking); + data["temperature"] = serde_json::Value::from(sampling_parameters.temperature); + }else if let Some(temperature) = sampling_parameters.temperature { data["temperature"] = serde_json::Value::from(temperature); } data["max_completion_tokens"] = serde_json::Value::from(sampling_parameters.max_new_tokens); diff --git a/refact-agent/engine/src/scratchpads/chat_passthrough.rs b/refact-agent/engine/src/scratchpads/chat_passthrough.rs index ae5bcb552..f5463ae30 100644 --- a/refact-agent/engine/src/scratchpads/chat_passthrough.rs +++ b/refact-agent/engine/src/scratchpads/chat_passthrough.rs @@ -282,7 +282,6 @@ fn _adapt_for_reasoning_models( sampling_parameters.reasoning_effort = Some(ReasoningEffort::High); } sampling_parameters.temperature = default_temperature; - sampling_parameters.thinking = None; // NOTE: OpenAI prefer user message over system messages.into_iter().map(|mut msg| { @@ -304,12 +303,17 @@ fn _adapt_for_reasoning_models( "budget_tokens": budget_tokens, })); } - sampling_parameters.reasoning_effort = None; + messages + }, + "qwen" => { + if supports_boost_reasoning && sampling_parameters.boost_reasoning { + sampling_parameters.enable_thinking = Some(true); + } + // In fact qwen3 wants 0.7 temperature for no-thinking mode but we'll use defaults for thinking + sampling_parameters.temperature = default_temperature.clone(); messages }, _ => { - sampling_parameters.reasoning_effort = None; - sampling_parameters.thinking = None; sampling_parameters.temperature = default_temperature.clone(); messages } diff --git a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py index db91c7f94..4060648cf 100644 --- a/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py +++ b/refact-server/refact_webgui/webgui/selfhost_fastapi_completions.py @@ -96,6 +96,7 @@ class ChatContext(NlpSamplingParams): n: int = 1 reasoning_effort: Optional[str] = None # OpenAI style reasoning thinking: Optional[Dict] = None # Anthropic style reasoning + enable_thinking: Optional[bool] = None # Qwen style reasoning class EmbeddingsStyleOpenAI(BaseModel): @@ -569,6 +570,7 @@ def _wrap_output(output: str) -> str: "stop": post.stop if post.stop else None, "n": post.n, "extra_headers": model_config.extra_headers if model_config.extra_headers else None, + "timeout": 60 * 60, # An hour timeout for thinking models } if post.reasoning_effort or post.thinking: diff --git a/refact-server/refact_webgui/webgui/selfhost_model_assigner.py b/refact-server/refact_webgui/webgui/selfhost_model_assigner.py index 60c012294..1fd0ee17d 100644 --- a/refact-server/refact_webgui/webgui/selfhost_model_assigner.py +++ b/refact-server/refact_webgui/webgui/selfhost_model_assigner.py @@ -17,7 +17,8 @@ __all__ = ["ModelAssigner"] -ALLOWED_N_CTX = [2 ** p for p in range(10, 20)] +# ALLOWED_N_CTX = [2 ** p for p in range(10, 20)] +ALLOWED_N_CTX = [1024, 2048, 4096] + [8192 * (t + 1) for t in range(0, 16)] ALLOWED_GPUS_SHARD = [2 ** p for p in range(10)] From 8695ee416b53bb2ec0a85d2fd370d0109ecbbebe Mon Sep 17 00:00:00 2001 From: mitya Date: Sun, 8 Jun 2025 09:38:03 +0200 Subject: [PATCH 2/5] add qwen reasoning option --- refact-server/refact_utils/third_party/utils/configs.py | 2 +- .../refact_webgui/webgui/static/tab-third-party-apis.js | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/refact-server/refact_utils/third_party/utils/configs.py b/refact-server/refact_utils/third_party/utils/configs.py index 9e642493a..acd503853 100644 --- a/refact-server/refact_utils/third_party/utils/configs.py +++ b/refact-server/refact_utils/third_party/utils/configs.py @@ -68,7 +68,7 @@ def to_chat_model_record(self) -> Dict[str, Any]: "supports_agent": self.capabilities.agent, "supports_reasoning": self.capabilities.reasoning, "supports_boost_reasoning": self.capabilities.boost_reasoning, - "default_temperature": 0.6 if self.capabilities.reasoning == "deepseek" else None, + "default_temperature": 0.6 if self.capabilities.reasoning in ["deepseek", "qwen"] else None, } diff --git a/refact-server/refact_webgui/webgui/static/tab-third-party-apis.js b/refact-server/refact_webgui/webgui/static/tab-third-party-apis.js index d8dd26daa..4cca580af 100644 --- a/refact-server/refact_webgui/webgui/static/tab-third-party-apis.js +++ b/refact-server/refact_webgui/webgui/static/tab-third-party-apis.js @@ -654,6 +654,7 @@ function showAddModelModal(providerId) { +
Select the reasoning type supported by this model.
From 2fab71f6372214d76cbf8a641a45c023849f83af Mon Sep 17 00:00:00 2001 From: mitya Date: Sun, 8 Jun 2025 10:32:59 +0200 Subject: [PATCH 3/5] missed enable thinking --- refact-agent/engine/src/scratchpads/chat_passthrough.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/refact-agent/engine/src/scratchpads/chat_passthrough.rs b/refact-agent/engine/src/scratchpads/chat_passthrough.rs index f5463ae30..50f5337ca 100644 --- a/refact-agent/engine/src/scratchpads/chat_passthrough.rs +++ b/refact-agent/engine/src/scratchpads/chat_passthrough.rs @@ -220,6 +220,7 @@ impl ScratchpadAbstract for ChatPassthrough { // drop all reasoning parameters in case of non-reasoning model sampling_parameters_to_patch.reasoning_effort = None; sampling_parameters_to_patch.thinking = None; + sampling_parameters_to_patch.enable_thinking = None; limited_msgs }; @@ -308,6 +309,8 @@ fn _adapt_for_reasoning_models( "qwen" => { if supports_boost_reasoning && sampling_parameters.boost_reasoning { sampling_parameters.enable_thinking = Some(true); + } else { + sampling_parameters.enable_thinking = Some(false); } // In fact qwen3 wants 0.7 temperature for no-thinking mode but we'll use defaults for thinking sampling_parameters.temperature = default_temperature.clone(); From c8056c9eca01b0e5a58b7175e15a8717e7d25a3d Mon Sep 17 00:00:00 2001 From: mitya Date: Mon, 9 Jun 2025 09:23:37 +0200 Subject: [PATCH 4/5] add concurrency setup in UI --- .../webgui/selfhost_model_assigner.py | 28 +++++++++++++++- .../webgui/static/tab-model-hosting.html | 1 + .../webgui/static/tab-model-hosting.js | 33 ++++++++++++++++--- .../refact_webgui/webgui/tab_models_host.py | 7 +++- 4 files changed, 63 insertions(+), 6 deletions(-) diff --git a/refact-server/refact_webgui/webgui/selfhost_model_assigner.py b/refact-server/refact_webgui/webgui/selfhost_model_assigner.py index 1fd0ee17d..0087a9cc3 100644 --- a/refact-server/refact_webgui/webgui/selfhost_model_assigner.py +++ b/refact-server/refact_webgui/webgui/selfhost_model_assigner.py @@ -20,6 +20,7 @@ # ALLOWED_N_CTX = [2 ** p for p in range(10, 20)] ALLOWED_N_CTX = [1024, 2048, 4096] + [8192 * (t + 1) for t in range(0, 16)] ALLOWED_GPUS_SHARD = [2 ** p for p in range(10)] +ALLOWED_CONCURRENCY = [2 ** p for p in range(9)] def has_context_switch(filter_caps: List[str]) -> bool: @@ -56,6 +57,7 @@ class ModelWatchdogDConfig: share_gpu: bool n_ctx: Optional[int] = None has_loras: bool = False + concurrency: Optional[int] = None def dump(self, model_cfg_j: Dict) -> str: model_cfg_j["command_line"].extend(["--model", self.model_name]) @@ -64,6 +66,8 @@ def dump(self, model_cfg_j: Dict) -> str: model_cfg_j["command_line"].extend(["--n-ctx", self.n_ctx]) if not self.has_loras: model_cfg_j["command_line"].append("--loraless") + if self.concurrency: + model_cfg_j["command_line"].extend(["--concurrency", self.concurrency]) model_cfg_j["gpus"] = self.gpus model_cfg_j["share_gpu"] = self.share_gpu @@ -104,6 +108,10 @@ def shard_gpu_backends(self) -> Set[str]: def share_gpu_backends(self) -> Set[str]: return {"transformers"} + @property + def concurrency_backends(self) -> Set[str]: + return set() + @property def models_db(self) -> Dict[str, Any]: return models_mini_db @@ -219,6 +227,7 @@ def _model_inference_setup(self, inference_config: Dict[str, Any]) -> Dict[str, share_gpu=assignment.get("share_gpu", False), n_ctx=assignment.get("n_ctx", None), has_loras=self._has_loras(model_name), + concurrency=assignment.get("concurrency", None), )) continue for model_cursor in range(cursor, next_cursor, assignment["gpus_shard"]): @@ -229,6 +238,7 @@ def _model_inference_setup(self, inference_config: Dict[str, Any]) -> Dict[str, share_gpu=assignment.get("share_gpu", False), n_ctx=assignment.get("n_ctx", None), has_loras=self._has_loras(model_name), + concurrency=assignment.get("concurrency", None), )) for _ in range(model_group.gpus_shard()): if gpus[cursor]["mem_total_mb"] < model_group.required_memory_mb(self.models_db): @@ -328,6 +338,13 @@ def models_info(self): gpus_shard for gpus_shard in ALLOWED_GPUS_SHARD if gpus_shard <= max_available_shards ] + if rec["backend"] in self.concurrency_backends: + default_concurrency = ALLOWED_CONCURRENCY[-1] + available_concurrency = ALLOWED_CONCURRENCY + else: + default_concurrency = 0 + available_concurrency = [] + info.append({ "name": k, "backend": rec["backend"], @@ -341,6 +358,8 @@ def models_info(self): "default_n_ctx": default_n_ctx, "available_n_ctx": available_n_ctx, "available_shards": available_shards, + "default_concurrency": default_concurrency, + "available_concurrency": available_concurrency, "is_deprecated": bool(rec.get("deprecated", False)), "repo_status": self._models_repo_status[k], "repo_url": f"https://huggingface.co/{rec['model_path']}", @@ -368,8 +387,15 @@ def _set_n_ctx(model: str, record: Dict) -> Dict: record["n_ctx"] = n_ctx return record + def _set_concurrency(model: str, record: Dict) -> Dict: + if self.models_db[model]["backend"] in self.concurrency_backends: + record["concurrency"] = record.get("concurrency", ALLOWED_CONCURRENCY[-1]) + else: + record["concurrency"] = 0 + return record + j["model_assign"] = self._share_gpu_filter({ - model: _set_n_ctx(model, v) + model: _set_concurrency(model, _set_n_ctx(model, v)) for model, v in j["model_assign"].items() if model in self.models_db }) diff --git a/refact-server/refact_webgui/webgui/static/tab-model-hosting.html b/refact-server/refact_webgui/webgui/static/tab-model-hosting.html index 943d7b4af..46da2d740 100644 --- a/refact-server/refact_webgui/webgui/static/tab-model-hosting.html +++ b/refact-server/refact_webgui/webgui/static/tab-model-hosting.html @@ -11,6 +11,7 @@

Hosted Models

Model Context + Concurrency Finetune Sharding Share GPU diff --git a/refact-server/refact_webgui/webgui/static/tab-model-hosting.js b/refact-server/refact_webgui/webgui/static/tab-model-hosting.js index a22136ef1..e9952bf4e 100644 --- a/refact-server/refact_webgui/webgui/static/tab-model-hosting.js +++ b/refact-server/refact_webgui/webgui/static/tab-model-hosting.js @@ -262,13 +262,15 @@ function render_models_assigned(models) { const row = document.createElement('tr'); row.setAttribute('data-model',index); let model_name = document.createElement("td"); - model_name.style.width = "20%"; + model_name.style.width = "18%"; let context = document.createElement("td"); - context.style.width = "15%"; + context.style.width = "12%"; + let concurrency = document.createElement("td"); + concurrency.style.width = "12%"; let finetune_info = document.createElement("td"); - finetune_info.style.width = "35%"; + finetune_info.style.width = "30%"; let select_gpus = document.createElement("td"); - select_gpus.style.width = "15%"; + select_gpus.style.width = "13%"; let gpus_share = document.createElement("td"); gpus_share.style.width = "10%"; let del = document.createElement("td"); @@ -317,6 +319,27 @@ function render_models_assigned(models) { context.innerHTML = `${models_info[index].default_n_ctx}`; } + const model_concurrency = models_data.model_assign[index].concurrency || models_info[index].default_concurrency; + if (models_info[index].available_concurrency && models_info[index].available_concurrency.length > 0) { + const concurrency_options = models_info[index].available_concurrency; + const concurrency_input = document.createElement("select"); + concurrency_input.classList.add('form-select','form-select-sm'); + concurrency_options.forEach(element => { + const concurrency_option = document.createElement("option"); + concurrency_option.setAttribute('value',element); + concurrency_option.textContent = element; + if(element === model_concurrency) { + concurrency_option.setAttribute('selected','selected'); + } + concurrency_input.appendChild(concurrency_option); + }); + concurrency_input.addEventListener('change', function() { + models_data.model_assign[index].concurrency = Number(this.value); + save_model_assigned(); + }); + concurrency.appendChild(concurrency_input); + } + let finetune_runs = []; if (finetune_configs_and_runs) { finetune_runs = finetune_configs_and_runs.finetune_runs.filter( @@ -397,6 +420,7 @@ function render_models_assigned(models) { row.appendChild(model_name); row.appendChild(context); + row.appendChild(concurrency); row.appendChild(finetune_info); row.appendChild(select_gpus); row.appendChild(gpus_share); @@ -680,6 +704,7 @@ function render_models(models) { models_data.model_assign[model_name] = { gpus_shard: default_gpus_shard, n_ctx: element.default_n_ctx, + concurrency: element.default_concurrency, }; save_model_assigned(); add_model_modal.hide(); diff --git a/refact-server/refact_webgui/webgui/tab_models_host.py b/refact-server/refact_webgui/webgui/tab_models_host.py index 847842e7e..a898dde79 100644 --- a/refact-server/refact_webgui/webgui/tab_models_host.py +++ b/refact-server/refact_webgui/webgui/tab_models_host.py @@ -10,7 +10,7 @@ from refact_webgui.webgui.tab_loras import rm from refact_webgui.webgui.tab_loras import unpack from refact_webgui.webgui.tab_loras import write_to_file -from refact_webgui.webgui.selfhost_model_assigner import ModelAssigner +from refact_webgui.webgui.selfhost_model_assigner import ModelAssigner, ALLOWED_CONCURRENCY from pathlib import Path from pydantic import BaseModel @@ -38,6 +38,7 @@ class TabHostModelRec(BaseModel): gpus_shard: int = Query(default=1, ge=0, le=1024) share_gpu: bool = False n_ctx: Optional[int] = None + concurrency: Optional[int] = None class TabHostModelsAssign(BaseModel): @@ -111,11 +112,15 @@ async def _tab_host_models_assign(self, post: TabHostModelsAssign): for model_name, model_cfg in post.model_assign.items(): if model_cfg.n_ctx is None: raise HTTPException(status_code=400, detail=f"n_ctx must be set for {model_name}") + if model_cfg.concurrency is None: + raise HTTPException(status_code=400, detail=f"concurrency must be set for {model_name}") for model_info in self._model_assigner.models_info["models"]: if model_info["name"] == model_name: max_n_ctx = model_info["default_n_ctx"] if model_cfg.n_ctx > max_n_ctx: raise HTTPException(status_code=400, detail=f"n_ctx must be less or equal to {max_n_ctx} for {model_name}") + if model_cfg.concurrency and model_cfg.concurrency not in model_info["available_concurrency"]: + raise HTTPException(status_code=400, detail=f"concurrency must be one of {model_info['available_concurrency']} for {model_name}") break else: raise HTTPException(status_code=400, detail=f"model {model_name} not found") From 7ce818747176845ca2c1dc51e8e0df9cc7dd7b4c Mon Sep 17 00:00:00 2001 From: mitya Date: Mon, 9 Jun 2025 12:58:30 +0200 Subject: [PATCH 5/5] allowed nctx --- refact-server/refact_webgui/webgui/selfhost_model_assigner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/refact-server/refact_webgui/webgui/selfhost_model_assigner.py b/refact-server/refact_webgui/webgui/selfhost_model_assigner.py index 0087a9cc3..147e20c06 100644 --- a/refact-server/refact_webgui/webgui/selfhost_model_assigner.py +++ b/refact-server/refact_webgui/webgui/selfhost_model_assigner.py @@ -17,8 +17,7 @@ __all__ = ["ModelAssigner"] -# ALLOWED_N_CTX = [2 ** p for p in range(10, 20)] -ALLOWED_N_CTX = [1024, 2048, 4096] + [8192 * (t + 1) for t in range(0, 16)] +ALLOWED_N_CTX = [1024, 2048, 4096] + [8192 * (t + 1) for t in range(0, 19)] ALLOWED_GPUS_SHARD = [2 ** p for p in range(10)] ALLOWED_CONCURRENCY = [2 ** p for p in range(9)]