@@ -72,6 +72,22 @@ def llama_cpp_embedding_model_path():
7272 return model_path
7373
7474
@pytest.fixture
def llama_cpp_recurrent_model_path():
    """Return the path produced by ``hf_hub_download`` for the Mamba 130M Q2_K GGUF file.

    Used by the recurrent-model prompt-cache test in this module.
    """
    return hf_hub_download(
        "QuantFactory/mamba-130m-hf-GGUF",
        "mamba-130m-hf.Q2_K.gguf",
    )
81+
82+
@pytest.fixture
def llama_cpp_hybrid_model_path():
    """Return the path produced by ``hf_hub_download`` for the Falcon-H1 Tiny 90M Q2_K GGUF file.

    Used by the hybrid-model prompt-cache test in this module.
    """
    return hf_hub_download(
        "tiiuae/Falcon-H1-Tiny-90M-Instruct-GGUF",
        "Falcon-H1-Tiny-90M-Instruct-Q2_K.gguf",
    )
89+
90+
7591def test_real_model (llama_cpp_model_path ):
7692 import os
7793
@@ -233,6 +249,96 @@ def logit_processor_func(input_ids, logits):
233249 assert number_1 == number_3
234250
235251
def test_real_llama_repeated_prompt_cache(llama_cpp_model_path):
    """Complete the same prompt twice on one model and expect identical text.

    Both calls use greedy sampling (temperature 0.0) with a fixed seed, so the
    second call must reproduce the first call's text exactly.
    """
    thread_count = multiprocessing.cpu_count()
    llm = llama_cpp.Llama(
        llama_cpp_model_path,
        n_ctx=32,
        n_batch=32,
        n_ubatch=32,
        n_threads=thread_count,
        n_threads_batch=thread_count,
        logits_all=False,
        flash_attn=True,
        verbose=False,
    )
    text = "The quick brown fox jumps over the lazy dog. The quick brown fox"
    sampling = {"max_tokens": 6, "temperature": 0.0, "seed": 1337}

    # Run both completions before asserting anything, back to back on the
    # same model instance.
    first, second = [llm.create_completion(text, **sampling) for _ in range(2)]

    first_text = first["choices"][0]["text"]
    assert first_text == " jumps over the lazy dog."
    assert second["choices"][0]["text"] == first_text
281+
282+
def _assert_prompt_cache_reset_handles_history_edit(
    model_path,
    *,
    is_recurrent: bool,
    is_hybrid: bool,
):
    """Check that editing an earlier part of the prompt yields a clean completion.

    One model completes ``"The quick brown fox"`` and then
    ``"The slow brown fox"``; the two prompts share their first token but
    differ afterwards. The second completion is compared with the completion
    a freshly constructed model gives for the same prompt, so leftover state
    from the first call that changes the result makes the check fail instead
    of passing unnoticed.

    Args:
        model_path: Path of the GGUF model file to load.
        is_recurrent: Expected value of the loaded model's ``_is_recurrent``.
        is_hybrid: Expected value of the loaded model's ``_is_hybrid``.

    Raises:
        AssertionError: If a flag does not match, the tokenized prompts do
            not have the expected shape, or the second completion differs
            from the fresh-model reference.
    """

    def load_model():
        # Identical settings for the model under test and the reference model
        # keep their greedy completions directly comparable.
        thread_count = multiprocessing.cpu_count()
        return llama_cpp.Llama(
            model_path,
            n_ctx=32,
            n_batch=32,
            n_ubatch=32,
            n_threads=thread_count,
            n_threads_batch=thread_count,
            logits_all=False,
            verbose=False,
        )

    model = load_model()

    assert model._is_recurrent is is_recurrent
    assert model._is_hybrid is is_hybrid

    first_prompt = "The quick brown fox"
    second_prompt = "The slow brown fox"
    first_tokens = model.tokenize(first_prompt.encode(), add_bos=True, special=True)
    second_tokens = model.tokenize(second_prompt.encode(), add_bos=True, special=True)

    # The prompts must diverge, yet still start with the same token, so the
    # second call begins with a partial match against the first call's input.
    assert first_tokens != second_tokens
    assert first_tokens[0] == second_tokens[0]

    first_output = model.create_completion(
        first_prompt,
        max_tokens=1,
        temperature=0.0,
    )
    assert isinstance(first_output["choices"][0]["text"], str)

    second_output = model.create_completion(
        second_prompt,
        max_tokens=1,
        temperature=0.0,
    )
    second_text = second_output["choices"][0]["text"]
    assert isinstance(second_text, str)

    # A type check alone passes for any string. Compare against a model that
    # has never seen the first prompt to confirm no stale state leaked into
    # the edited-history completion.
    reference_output = load_model().create_completion(
        second_prompt,
        max_tokens=1,
        temperature=0.0,
    )
    assert second_text == reference_output["choices"][0]["text"]
324+
325+
def test_recurrent_model_prompt_cache_reset(llama_cpp_recurrent_model_path):
    """Run the shared history-edit check, expecting a recurrent, non-hybrid model."""
    expected_flags = {"is_recurrent": True, "is_hybrid": False}
    _assert_prompt_cache_reset_handles_history_edit(
        llama_cpp_recurrent_model_path,
        **expected_flags,
    )
332+
333+
def test_hybrid_model_prompt_cache_reset(llama_cpp_hybrid_model_path):
    """Run the shared history-edit check, expecting a hybrid, non-recurrent model."""
    expected_flags = {"is_recurrent": False, "is_hybrid": True}
    _assert_prompt_cache_reset_handles_history_edit(
        llama_cpp_hybrid_model_path,
        **expected_flags,
    )
340+
341+
236342def test_real_llama_embeddings (llama_cpp_embedding_model_path ):
237343 model = llama_cpp .Llama (
238344 llama_cpp_embedding_model_path ,
0 commit comments