Commit f140654

unamedkr authored and claude committed
feat(cli): --chat flag now correctly routes to per-family chat templates
The previous --chat fell through to the Llama 3 template for all non-Gemma models, causing Phi-3.5 and Qwen family models to produce garbage output. New detection order, by model config + filename:

1. Gemma 4   → <|turn>user\n...<turn|>\n<|turn>model\n (skip <|think|> — no logit suppression in CLI)
2. Gemma 2/3 → <start_of_turn>user\n...<end_of_turn>
3. Phi-3/4   → <|user|>...<|end|>\n<|assistant|>\n
4. Llama 3.x → <|start_header_id|>user<|end_header_id|>\n\n...<|eot_id|>
5. Default   → ChatML (Qwen/Qwen2/Qwen3/Qwen3.5)

Verified with --chat -p "What is 2+2?":

- Phi-3.5 Q8_0: "The answer to...4. The sum of two and two equals four..."
- Llama 3.1 8B: "The answer to 2 + 2 is: 4"
- Llama 3.2 3B: "4"
- Qwen2.5-0.5B: coherent English (0.5B model limit)
- Gemma 4 E2B: partial (thinking-mode interaction)
- Qwen3.5-4B: DeltaNet short-prompt issue persists (known)

All 35 unit tests + 7 regression tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 1e8698b commit f140654

1 file changed

Lines changed: 32 additions & 8 deletions

tools/quant.c

@@ -1244,26 +1244,50 @@ int main(int argc, char** argv) {
         return 0;
     }
 
-    /* Auto-wrap prompt with chat template when --chat is used */
+    /* Auto-wrap prompt with chat template when --chat is used.
+     * Template detection order:
+     *   1. Gemma 4   → <|turn>...<turn|> + thinking mode
+     *   2. Gemma 2/3 → <start_of_turn>...<end_of_turn>
+     *   3. Phi-3/Phi-4 (by filename) → <|user|>...<|end|>
+     *   4. Llama 3.x (by filename)  → <|start_header_id|>...<|eot_id|>
+     *   5. Default → ChatML <|im_start|>...<|im_end|> (Qwen/Qwen3/Qwen3.5) */
     char chat_prompt[8192];
     if (chat_mode) {
         tq_model_config_t* mc = &model->config;
+        const char* mp = model_path ? model_path : "";
+        /* Basename for filename detection */
+        const char* bn = strrchr(mp, '/');
+        bn = bn ? bn + 1 : mp;
+
+        int is_phi = (strstr(bn, "phi-3") || strstr(bn, "phi3") ||
+                      strstr(bn, "Phi-3") || strstr(bn, "Phi3") ||
+                      strstr(bn, "phi-4") || strstr(bn, "phi4") ||
+                      strstr(bn, "Phi-4") || strstr(bn, "Phi4"));
+        int is_llama3 = (strstr(bn, "Llama-3") || strstr(bn, "llama-3") ||
+                         strstr(bn, "Llama3") || strstr(bn, "llama3") ||
+                         strstr(bn, "Meta-Llama-3"));
+
         if (mc->model_type == 1 && mc->is_gemma4) {
-            /* Gemma 4: uses <|turn> tokens + thinking mode.
-             * Reference: llama.cpp apply-template output for gemma4. */
+            /* Skip <|think|> in CLI — the server suppresses it via logit mask,
+             * but the CLI has no such suppression. Without it, the CLI uses
+             * plain Gemma 4 format without thinking mode. */
             snprintf(chat_prompt, sizeof(chat_prompt),
-                     "<|turn>system\n<|think|><turn|>\n<|turn>user\n%s<turn|>\n<|turn>model\n", prompt);
+                     "<|turn>user\n%s<turn|>\n<|turn>model\n", prompt);
         } else if (mc->model_type == 1) {
-            /* Gemma 2/3: <start_of_turn>user\n...\n<end_of_turn>\n<start_of_turn>model\n */
             snprintf(chat_prompt, sizeof(chat_prompt),
                      "<start_of_turn>user\n%s<end_of_turn>\n<start_of_turn>model\n", prompt);
-        } else if (strstr(prompt, "<|start_header_id|>") == NULL) {
-            /* Llama 3 / generic: wrap if not already wrapped */
+        } else if (is_phi) {
+            /* Phi-3/4: <|user|>...<|end|>\n<|assistant|>\n */
+            snprintf(chat_prompt, sizeof(chat_prompt),
+                     "<|user|>\n%s<|end|>\n<|assistant|>\n", prompt);
+        } else if (is_llama3) {
             snprintf(chat_prompt, sizeof(chat_prompt),
                      "<|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|>"
                      "<|start_header_id|>assistant<|end_header_id|>\n\n", prompt);
         } else {
-            snprintf(chat_prompt, sizeof(chat_prompt), "%s", prompt);
+            /* Default ChatML (Qwen/Qwen3/Qwen3.5) */
+            snprintf(chat_prompt, sizeof(chat_prompt),
+                     "<|im_start|>user\n%s<|im_end|>\n<|im_start|>assistant\n", prompt);
         }
         prompt = chat_prompt;
     }
