fix: clear prompt for recurrent / hybrid models when only a partial prefix matches (abetlen#2108)

avion23 · Ralf Waldukat · web-flow · commit cdb7a755c63d · 2026-05-31T20:06:45.000-07:00
Co-authored-by: Ralf Waldukat <ralf.waldukat@gmail.com>
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -10,6 +10,11 @@ on:
 env:
   REPO_ID: lmstudio-community/Qwen3.5-0.8B-GGUF
   MODEL_FILE: Qwen3.5-0.8B-Q8_0.gguf
+  RECURRENT_REPO_ID: QuantFactory/mamba-130m-hf-GGUF
+  RECURRENT_MODEL_FILE: mamba-130m-hf.Q2_K.gguf
+  HYBRID_REPO_ID: tiiuae/Falcon-H1-Tiny-90M-Instruct-GGUF
+  HYBRID_MODEL_FILE: Falcon-H1-Tiny-90M-Instruct-Q2_K.gguf
+  MODEL_CACHE_KEY: qwen35-q8-mamba130m-q2-falconh1tiny-q2
 
 jobs:
   download-model:
@@ -22,12 +27,15 @@ jobs:
       - name: Install huggingface-hub
         run: pip install huggingface-hub
       - name: Download model
-        run: hf download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }}
+        run: |
+          hf download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }}
+          hf download ${{ env.RECURRENT_REPO_ID }} ${{ env.RECURRENT_MODEL_FILE }}
+          hf download ${{ env.HYBRID_REPO_ID }} ${{ env.HYBRID_MODEL_FILE }}
       - name: Cache model
         uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
-          key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
+          key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }}
 
   build-linux:
     needs: download-model
@@ -49,7 +57,7 @@ jobs:
         uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
-          key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
+          key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }}
       - name: Install dependencies (Linux/MacOS)
         run: |
           python -m pip install --upgrade pip
@@ -81,7 +89,7 @@ jobs:
         uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
-          key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
+          key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }}
 
       - name: Install dependencies (Windows)
         run: |
@@ -121,7 +129,7 @@ jobs:
         uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
-          key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
+          key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }}
           
       - name: Install dependencies (Linux/MacOS)
         run: |
@@ -157,7 +165,7 @@ jobs:
         uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
-          key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
+          key: ${{ runner.os }}-model-${{ env.MODEL_CACHE_KEY }}
 
       - name: Install dependencies
         run: |
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix: clear prompt for recurrent / hybrid models when only a partial prefix matches by @avion23 in #2108
 - fix: match Transformers `tojson` in chat template rendering by @CISC in #1486
 - fix: use env var configured multimodal library override paths when loading shared libraries by @navratil-matej in #1782
 - feat: add Jinja2 loop controls to chat templates by @handshape in #2018
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
@@ -559,6 +559,10 @@ def free_lora_adapter():
 
         self._sampler = None
 
+        # Cache recurrent/hybrid model detection to avoid repeated FFI calls
+        self._is_recurrent = llama_cpp.llama_model_is_recurrent(self._model.model)
+        self._is_hybrid = llama_cpp.llama_model_is_hybrid(self._model.model)
+
     @property
     def ctx(self) -> llama_cpp.llama_context_p:
         return self._ctx.ctx
@@ -644,6 +648,11 @@ def reset(self):
         """Reset the model state."""
         self.n_tokens = 0
 
+        if self._is_recurrent or self._is_hybrid:
+            mem = llama_cpp.llama_get_memory(self._ctx.ctx)
+            if mem is not None:
+                llama_cpp.llama_memory_clear(mem, True)
+
     def eval(self, tokens: Sequence[int]):
         """Evaluate a list of tokens.
 
@@ -899,6 +908,19 @@ def generate(
                     longest_prefix += 1
                 else:
                     break
+
+            # Recurrent and hybrid models cannot rewind state; reset if needed
+            if (
+                self._is_recurrent or self._is_hybrid
+            ) and longest_prefix < self.n_tokens:
+                longest_prefix = 0
+                reset = True
+                if self.verbose:
+                    print(
+                        "Llama.generate: recurrent/hybrid model requires full state reset",
+                        file=sys.stderr,
+                    )
+
             if longest_prefix > 0:
                 if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1):
                     reset = False
diff --git a/tests/test_llama.py b/tests/test_llama.py
@@ -72,6 +72,22 @@ def llama_cpp_embedding_model_path():
     return model_path
 
 
+@pytest.fixture
+def llama_cpp_recurrent_model_path():
+    repo_id = "QuantFactory/mamba-130m-hf-GGUF"
+    filename = "mamba-130m-hf.Q2_K.gguf"
+    model_path = hf_hub_download(repo_id, filename)
+    return model_path
+
+
+@pytest.fixture
+def llama_cpp_hybrid_model_path():
+    repo_id = "tiiuae/Falcon-H1-Tiny-90M-Instruct-GGUF"
+    filename = "Falcon-H1-Tiny-90M-Instruct-Q2_K.gguf"
+    model_path = hf_hub_download(repo_id, filename)
+    return model_path
+
+
 def test_real_model(llama_cpp_model_path):
     import os
 
@@ -233,6 +249,96 @@ def logit_processor_func(input_ids, logits):
     assert number_1 == number_3
 
 
+def test_real_llama_repeated_prompt_cache(llama_cpp_model_path):
+    model = llama_cpp.Llama(
+        llama_cpp_model_path,
+        n_ctx=32,
+        n_batch=32,
+        n_ubatch=32,
+        n_threads=multiprocessing.cpu_count(),
+        n_threads_batch=multiprocessing.cpu_count(),
+        logits_all=False,
+        flash_attn=True,
+        verbose=False,
+    )
+    prompt = "The quick brown fox jumps over the lazy dog. The quick brown fox"
+
+    output_1 = model.create_completion(
+        prompt,
+        max_tokens=6,
+        temperature=0.0,
+        seed=1337,
+    )
+    output_2 = model.create_completion(
+        prompt,
+        max_tokens=6,
+        temperature=0.0,
+        seed=1337,
+    )
+
+    assert output_1["choices"][0]["text"] == " jumps over the lazy dog."
+    assert output_2["choices"][0]["text"] == output_1["choices"][0]["text"]
+
+
+def _assert_prompt_cache_reset_handles_history_edit(
+    model_path,
+    *,
+    is_recurrent: bool,
+    is_hybrid: bool,
+):
+    model = llama_cpp.Llama(
+        model_path,
+        n_ctx=32,
+        n_batch=32,
+        n_ubatch=32,
+        n_threads=multiprocessing.cpu_count(),
+        n_threads_batch=multiprocessing.cpu_count(),
+        logits_all=False,
+        verbose=False,
+    )
+
+    assert model._is_recurrent is is_recurrent
+    assert model._is_hybrid is is_hybrid
+
+    first_prompt = "The quick brown fox"
+    second_prompt = "The slow brown fox"
+    first_tokens = model.tokenize(first_prompt.encode(), add_bos=True, special=True)
+    second_tokens = model.tokenize(second_prompt.encode(), add_bos=True, special=True)
+
+    assert first_tokens != second_tokens
+    assert first_tokens[0] == second_tokens[0]
+
+    first_output = model.create_completion(
+        first_prompt,
+        max_tokens=1,
+        temperature=0.0,
+    )
+    assert isinstance(first_output["choices"][0]["text"], str)
+
+    second_output = model.create_completion(
+        second_prompt,
+        max_tokens=1,
+        temperature=0.0,
+    )
+    assert isinstance(second_output["choices"][0]["text"], str)
+
+
+def test_recurrent_model_prompt_cache_reset(llama_cpp_recurrent_model_path):
+    _assert_prompt_cache_reset_handles_history_edit(
+        llama_cpp_recurrent_model_path,
+        is_recurrent=True,
+        is_hybrid=False,
+    )
+
+
+def test_hybrid_model_prompt_cache_reset(llama_cpp_hybrid_model_path):
+    _assert_prompt_cache_reset_handles_history_edit(
+        llama_cpp_hybrid_model_path,
+        is_recurrent=False,
+        is_hybrid=True,
+    )
+
+
 def test_real_llama_embeddings(llama_cpp_embedding_model_path):
     model = llama_cpp.Llama(
         llama_cpp_embedding_model_path,