Skip to content

Commit 5fe8155

Browse files
unamedkrclaude
andcommitted
fix(gemma4): V-norm + NeoX RoPE exclusion + debug logits tok100
Key findings: - V-norm: Gemma 4 applies weight-free RMS norm to V after projection (llama.cpp line 92: ggml_rms_norm(Vcur, eps)). Implemented. - NeoX RoPE: auto-detection excluded Gemma (model_type==1) — Gemma uses standard interleaved RoPE, not NeoX, despite head*dim != hidden - Debug: token 100 (<|channel>) logit is -12.17 in our code but should be near 0 (top-1 in llama.cpp). ~50 logit difference. - layer_output_scale confirmed as simple multiply (llama.cpp reference) Status: garbage persists. Investigated 15 hypotheses. Root cause narrowed to forward pass numerics — all individual components verified but combined output wrong. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f360b90 commit 5fe8155

1 file changed

Lines changed: 29 additions & 5 deletions

File tree

quant.h

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11648,8 +11648,13 @@ tq_model_t* tq_load_gguf(const char* path) {
1164811648
* of how the converter permuted them.
1164911649
*
1165011650
* Also set for Phi-3 (fused QKV, never permuted by converter). */
11651+
/* NeoX-style RoPE for models where n_heads*head_dim != hidden_dim
11652+
* (Qwen3: 32×128=4096 ≠ 2560). BUT: Gemma 4 also has this mismatch
11653+
* (8×256=2048 ≠ 1536) yet uses STANDARD interleaved RoPE, not NeoX.
11654+
* Exclude Gemma models from NeoX RoPE auto-detection. */
1165111655
if (c->n_heads > 0 && c->head_dim > 0 &&
11652-
c->n_heads * c->head_dim != c->hidden_dim) {
11656+
c->n_heads * c->head_dim != c->hidden_dim &&
11657+
c->model_type != 1 /* exclude Gemma */) {
1165311658
c->use_neox_rope = 1;
1165411659
fprintf(stderr, "tq_load_gguf: NeoX RoPE enabled "
1165511660
"(n_heads*head_dim=%d != hidden=%d)\n",
@@ -15756,12 +15761,22 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
1575615761
TQ_PROF_STOP(_tp, matmul_ns);
1575715762

1575815763
if (pos <= 1 && getenv("TQ_DEBUG")) {
15759-
/* Print top-5 logits for debugging */
15764+
/* Print logits for debugging — include key token IDs */
1576015765
fprintf(stderr, "[DEBUG] pos=%d logits[0:8] = ", pos);
1576115766
for (int i = 0; i < 8; i++) fprintf(stderr, "%.2f ", s->logits[i]);
15762-
float max_l = s->logits[0]; int max_i = 0;
15763-
for (int i = 1; i < c->vocab_size; i++) { if (s->logits[i] > max_l) { max_l = s->logits[i]; max_i = i; } }
15764-
fprintf(stderr, "... max=%.2f @%d\n", max_l, max_i);
15767+
/* Token 100 = <|channel> (Gemma 4 thinking start) */
15768+
if (c->vocab_size > 100)
15769+
fprintf(stderr, " tok100=%.2f", s->logits[100]);
15770+
/* Top-5 */
15771+
int top5[5] = {0,0,0,0,0}; float top5v[5] = {-1e30f,-1e30f,-1e30f,-1e30f,-1e30f};
15772+
for (int i = 0; i < c->vocab_size; i++) {
15773+
int minj = 0;
15774+
for (int j = 1; j < 5; j++) if (top5v[j] < top5v[minj]) minj = j;
15775+
if (s->logits[i] > top5v[minj]) { top5[minj] = i; top5v[minj] = s->logits[i]; }
15776+
}
15777+
fprintf(stderr, " ... top5: ");
15778+
for (int j = 0; j < 5; j++) fprintf(stderr, "%d(%.1f) ", top5[j], top5v[j]);
15779+
fprintf(stderr, "\n");
1576515780
}
1576615781

1576715782
/* Final logit soft-capping: logits = cap * tanh(logits / cap) */
@@ -16039,6 +16054,13 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
1603916054
}
1604016055
if (s_id >= 0) add_bos = 1;
1604116056
}
16057+
/* Skip BOS when the prompt already starts with a special token
16058+
* (e.g., <|im_start|> for ChatML, <|user|> for Phi-3). Adding
16059+
* BOS before a chat template confuses Qwen3 and other models
16060+
* that expect the template to be the first token. */
16061+
if (add_bos && prompt && prompt[0] == '<' && prompt[1] == '|') {
16062+
add_bos = 0;
16063+
}
1604216064
n_prompt = tq_encode(tokenizer, prompt, prompt_tokens, 4096, add_bos);
1604316065
} else {
1604416066
/* No tokenizer: use BOS only (Gemma=2, Qwen=skip) */
@@ -16313,6 +16335,8 @@ int tq_generate_continue(tq_model_t* model,
1631316335
int n_new = 0;
1631416336
if (tokenizer && prompt) {
1631516337
int add_bos = (model->config.model_type == 1) ? 1 : 0;
16338+
/* Skip BOS for chat-template prompts (same logic as tq_generate) */
16339+
if (add_bos && prompt[0] == '<' && prompt[1] == '|') add_bos = 0;
1631616340
n_new = tq_encode(tokenizer, prompt, new_tokens, max_prompt, add_bos);
1631716341
}
1631816342
if (n_new <= 0) {

0 commit comments

Comments
 (0)