quic · vbaddi · Jun 2, 2026
diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile
@@ -102,7 +102,11 @@ pipeline {
                            mkdir -p $PWD/Non_qaic_llm &&
                            export TOKENIZERS_PARALLELISM=false &&
                            export QEFF_HOME=$PWD/Non_qaic_llm &&
-                           pytest tests -m '(llm_model) and (not qnn) and ${TEST_FILTER}' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --junitxml=tests/tests_log2.xml --durations=10 &&
+                           export QEFF_QAIC_DEVICE_POOL=0,1,2,3 &&
+                           PYTEST_XDIST_ARGS='-n 4 --dist loadscope' &&
+                           export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 &&
+                           if [ x${TEST_PROFILE} = xfull_layers_model ]; then PYTEST_XDIST_ARGS=''; unset QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR; fi &&
+                           pytest tests -m '(llm_model) and (not qnn) and ${TEST_FILTER}' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline \\\${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log2.xml --durations=10 &&
                            junitparser merge tests/tests_log2.xml tests/tests_log.xml &&
                            deactivate"
                            '''
@@ -123,7 +127,11 @@ pipeline {
                     mkdir -p $PWD/Non_qaic_feature &&
                     export TOKENIZERS_PARALLELISM=false &&
                     export QEFF_HOME=$PWD/Non_qaic_feature &&
-                    pytest tests -m '(on_qaic) and (feature) and (not qnn) and ${TEST_FILTER}' --ignore tests/transformers/sampler --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --junitxml=tests/tests_log2_feature.xml --durations=10 &&
+                    export QEFF_QAIC_DEVICE_POOL=0,1,2,3 &&
+                    PYTEST_XDIST_ARGS='-n 4 --dist loadscope' &&
+                    export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 &&
+                    if [ x${TEST_PROFILE} = xfull_layers_model ]; then PYTEST_XDIST_ARGS=''; unset QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR; fi &&
+                    pytest tests -m '(on_qaic) and (feature) and (not qnn) and ${TEST_FILTER}' --ignore tests/transformers/sampler --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline \\\${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log2_feature.xml --durations=10 &&
                     junitparser merge tests/tests_log2_feature.xml tests/tests_log.xml &&
                     deactivate"
                     '''
@@ -141,7 +149,11 @@ pipeline {
                     mkdir -p $PWD/Non_cli_qaic_multimodal &&
                     export TOKENIZERS_PARALLELISM=false &&
                     export QEFF_HOME=$PWD/Non_cli_qaic_multimodal &&
-                    pytest tests -m '(multimodal) and (not qnn) and ${TEST_FILTER}' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --ignore tests/transformers/models/reranker/test_reranker_mad.py --junitxml=tests/tests_log6.xml --durations=10 &&
+                    export QEFF_QAIC_DEVICE_POOL=0,1,2,3 &&
+                    PYTEST_XDIST_ARGS='-n 4 --dist loadscope' &&
+                    export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 &&
+                    if [ x${TEST_PROFILE} = xfull_layers_model ]; then PYTEST_XDIST_ARGS=''; unset QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR; fi &&
+                    pytest tests -m '(multimodal) and (not qnn) and ${TEST_FILTER}' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --ignore tests/transformers/models/reranker/test_reranker_mad.py \\\${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log6.xml --durations=10 &&
                     junitparser merge tests/tests_log6.xml tests/tests_log.xml &&
                     deactivate"
                     '''
@@ -179,7 +191,11 @@ pipeline {
                     export TOKENIZERS_PARALLELISM=false &&
                     export QEFF_HOME=$PWD/Non_cli_qaic_diffusion &&
                     export HF_HUB_CACHE=/huggingface_hub &&
-                    pytest tests -m 'diffusion_models' --ignore tests/vllm  --ignore tests/unit_test --ignore tests/nightly_pipeline --junitxml=tests/tests_log_diffusion.xml --durations=10 &&
+                    export QEFF_QAIC_DEVICE_POOL=0,1,2,3 &&
+                    PYTEST_XDIST_ARGS='-n 4 --dist loadscope' &&
+                    export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 &&
+                    if [ x${TEST_PROFILE} = xfull_layers_model ]; then PYTEST_XDIST_ARGS=''; unset QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR; fi &&
+                    pytest tests -m 'diffusion_models' --ignore tests/vllm  --ignore tests/unit_test --ignore tests/nightly_pipeline \\\${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log_diffusion.xml --durations=10 &&
                     junitparser merge tests/tests_log_diffusion.xml tests/tests_log.xml &&
                     deactivate"
                     '''
@@ -200,7 +216,10 @@ pipeline {
                     mkdir -p $PWD/cli &&
                     export TOKENIZERS_PARALLELISM=false &&
                     export QEFF_HOME=$PWD/cli &&
-                    pytest tests -m '(cli and not qnn) and (not finetune)' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --junitxml=tests/tests_log3.xml --durations=10 &&
+                    export QEFF_QAIC_DEVICE_POOL=0,1,2,3 &&
+                    PYTEST_XDIST_ARGS='-n 4 --dist loadscope' &&
+                    export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 &&
+                    pytest tests -m '(cli and not qnn) and (not finetune)' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline \\\${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log3.xml --durations=10 &&
                     junitparser merge tests/tests_log3.xml tests/tests_log.xml &&
                     deactivate"
                     '''

diff --git a/tests/cloud/test_export_compile_execute.py b/tests/cloud/test_export_compile_execute.py
@@ -10,6 +10,7 @@
 
 import pytest
 import yaml
+from transformers import AutoConfig
 
 import QEfficient
 from QEfficient.cloud.execute import main as execute
@@ -38,14 +39,16 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl
     base_key = "past_key."
     base_value = "past_value."
     precision = "float16"
+    config = AutoConfig.from_pretrained(model_name)
+    num_layers = getattr(config, "num_hidden_layers", getattr(config, "n_layer", 12))
 
     data = []
 
-    for i in range(12):
+    for i in range(num_layers):
         data.append({"IOName": f"{base_key}{i}", "Precision": precision})
         data.append({"IOName": f"{base_value}{i}", "Precision": precision})
 
-    for i in range(12):
+    for i in range(num_layers):
         data.append({"IOName": f"{base_key}{i}_RetainedState", "Precision": precision})
         data.append({"IOName": f"{base_value}{i}_RetainedState", "Precision": precision})
 
@@ -61,8 +64,8 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl
         aic_enable_depth_first=True,
         mos=1,
         batch_size=1,
-        prompt_len=32,
-        ctx_len=128,
+        prompt_len=8,
+        ctx_len=32,
         mxfp6=True,
         mxint8=True,
         full_batch_size=full_batch_size,
@@ -77,7 +80,7 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl
         qpc_path=qpc_path,
         prompt="My name is",
         prompts_txt_file_path="examples/sample_prompts/prompts.txt",
-        generation_len=20,
+        generation_len=4,
         full_batch_size=full_batch_size,
     )
 
@@ -89,27 +92,31 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl
 @pytest.mark.cli
 def test_export_compile_execute(mocker):
     # testing export -> compile -> infer without full_batch_size
-    check_export_compile_execute(mocker, model_name="gpt2")
+    check_export_compile_execute(mocker, model_name="hf-internal-testing/tiny-random-GPT2LMHeadModel")
 
 
 @pytest.mark.on_qaic
 @pytest.mark.cli
 def test_export_compile_execute_fbs(mocker):
     # testing export -> compile -> infer with full_batch_size
-    check_export_compile_execute(mocker, model_name="gpt2", full_batch_size=3)
+    check_export_compile_execute(
+        mocker, model_name="hf-internal-testing/tiny-random-GPT2LMHeadModel", full_batch_size=3
+    )
 
 
 @pytest.mark.on_qaic
 @pytest.mark.qnn
 @pytest.mark.cli
 def test_export_compile_execute_qnn(mocker):
     # testing export -> compile -> infer without full_batch_size in QNN environment
-    check_export_compile_execute(mocker, model_name="gpt2", enable_qnn=True)
+    check_export_compile_execute(mocker, model_name="hf-internal-testing/tiny-random-GPT2LMHeadModel", enable_qnn=True)
 
 
 @pytest.mark.on_qaic
 @pytest.mark.qnn
 @pytest.mark.cli
 def test_export_compile_execute_qnn_fbs(mocker):
     # testing export -> compile -> infer with full_batch_size in QNN environment
-    check_export_compile_execute(mocker, model_name="gpt2", full_batch_size=3, enable_qnn=True)
+    check_export_compile_execute(
+        mocker, model_name="hf-internal-testing/tiny-random-GPT2LMHeadModel", full_batch_size=3, enable_qnn=True
+    )
diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py
@@ -37,8 +37,8 @@ def check_infer(
         mos=1,
         hf_token=None,
         batch_size=1,
-        prompt_len=32,
-        ctx_len=128,
+        prompt_len=8,
+        ctx_len=32,
         generation_len=generation_len,
         mxfp6=True,
         mxint8=True,
@@ -70,30 +70,40 @@ def test_infer(mocker):
     Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html
     """
     # testing infer without full_batch_size
-    check_infer(mocker, model_name="lu-vae/llama-68m-fft")
+    check_infer(mocker, model_name="hf-internal-testing/tiny-random-LlamaForCausalLM", generation_len=4)
 
 
 @pytest.mark.on_qaic
 @pytest.mark.cli
 def test_infer_fbs(mocker):
     # testing infer with full_batch_size
-    check_infer(mocker, model_name="lu-vae/llama-68m-fft", full_batch_size=3)
+    check_infer(
+        mocker, model_name="hf-internal-testing/tiny-random-LlamaForCausalLM", full_batch_size=3, generation_len=4
+    )
 
 
 @pytest.mark.on_qaic
 @pytest.mark.cli
 @pytest.mark.qnn
 def test_infer_qnn(mocker):
     # testing infer without full_batch_size in QNN environment
-    check_infer(mocker, model_name="lu-vae/llama-68m-fft", enable_qnn=True)
+    check_infer(
+        mocker, model_name="hf-internal-testing/tiny-random-LlamaForCausalLM", enable_qnn=True, generation_len=4
+    )
 
 
 @pytest.mark.on_qaic
 @pytest.mark.cli
 @pytest.mark.qnn
 def test_infer_qnn_fbs(mocker):
     # testing infer with full_batch_size in QNN environment
-    check_infer(mocker, model_name="lu-vae/llama-68m-fft", full_batch_size=3, enable_qnn=True)
+    check_infer(
+        mocker,
+        model_name="hf-internal-testing/tiny-random-LlamaForCausalLM",
+        full_batch_size=3,
+        enable_qnn=True,
+        generation_len=4,
+    )
 
 
 @pytest.mark.on_qaic
@@ -102,9 +112,10 @@ def test_infer_vlm(mocker):
     # testing infer for MM models
     check_infer(
         mocker,
-        model_name="llava-hf/llava-1.5-7b-hf",
+        model_name="tiny-random/gemma-3",
         prompt="Describe the image.",
         image_url="https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg",
+        generation_len=4,
     )
 
 

diff --git a/tests/configs/audio_model_configs.json b/tests/configs/audio_model_configs.json
@@ -1,8 +1,8 @@
 {
-   "speech_seq2seq_models": [
-        "openai/whisper-tiny"
-    ],
-    "audio_embedding_models": [
-        "facebook/wav2vec2-base-960h"
-    ]
-}
+  "speech_seq2seq_models": [
+    "hf-internal-testing/tiny-random-WhisperForConditionalGeneration"
+  ],
+  "audio_embedding_models": [
+    "hf-internal-testing/tiny-random-wav2vec2"
+  ]
+}
diff --git a/tests/configs/causal_model_configs.json b/tests/configs/causal_model_configs.json
@@ -659,7 +659,7 @@
   ],
   "disaggregated_dummy_models": [
     {
-      "model_name": "openai/gpt-oss-20b",
+      "model_name": "tiny-random/gpt-oss-bf16",
       "model_type": "gpt_oss",
       "tokenizer_id": "gpt2",
       "additional_params": {
@@ -671,7 +671,7 @@
         "num_local_experts": 4,
         "head_dim": 32,
         "max_position_embeddings": 512,
-        "vocab_size": 201088,
+        "vocab_size": 50257,
         "sliding_window": 128
       }
     },
@@ -708,15 +708,20 @@
       }
     },
     {
-      "model_name": "openai/gpt-oss-20b",
+      "model_name": "tiny-random/gpt-oss-bf16",
       "model_type": "gpt_oss",
       "additional_params": {
         "num_hidden_layers": 2,
         "hidden_size": 64,
         "intermediate_size": 256,
         "num_attention_heads": 2,
         "num_key_value_heads": 1,
-        "num_local_experts": 4
+        "num_local_experts": 4,
+        "vocab_size": 8192,
+        "max_position_embeddings": 128,
+        "sliding_window": 128,
+        "pad_token_id": 0,
+        "eos_token_id": 0
       }
     }
   ]

diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json
@@ -499,77 +499,16 @@
   ],
   "image_text_subfunction_models":[
     {
-      "model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
+      "model_name": "optimum-intel-internal-testing/tiny-random-qwen2.5-vl",
       "model_type": "qwen2_5_vl",
       "batch_size": 1,
       "prompt_len": 128,
-      "ctx_len": 4096,
-      "img_size": 1540,
+      "ctx_len": 512,
+      "img_size": 224,
       "img_url": "https://picsum.photos/id/237/536/354",
       "text_prompt": "Can you describe the image in detail.",
       "num_layers": 1,
-      "additional_params": {
-        "dtype": "float32",
-        "hidden_size": 2048,
-        "intermediate_size": 11008,
-        "max_position_embeddings": 128000,
-        "max_window_layers": 70,
-        "num_attention_heads": 16,
-        "num_hidden_layers": 1,
-        "num_key_value_heads": 2,
-        "text_config": {
-          "architectures": [
-            "Qwen2_5_VLForConditionalGeneration"
-          ],
-          "layer_types": [
-            "full_attention"
-          ],
-          "dtype": "float32",
-          "hidden_size": 2048,
-          "intermediate_size": 11008,
-          "max_position_embeddings": 128000,
-          "max_window_layers": 70,
-          "model_type": "qwen2_5_vl_text",
-          "num_attention_heads": 16,
-          "num_hidden_layers": 1,
-          "num_key_value_heads": 2,
-          "rms_norm_eps": 1e-06,
-          "rope_scaling": {
-            "mrope_section": [
-              16,
-              24,
-              24
-            ],
-            "rope_type": "default",
-            "type": "default"
-          },
-          "vocab_size": 151936
-        },
-        "vision_config": {
-          "depth": 1,
-          "num_hidden_layers": 1,
-          "hidden_act": "silu",
-          "hidden_size": 1280,
-          "intermediate_size": 3420,
-          "num_heads": 16,
-          "in_chans": 3,
-          "out_hidden_size": 2048,
-          "patch_size": 14,
-          "spatial_merge_size": 2,
-          "spatial_patch_size": 14,
-          "window_size": 112,
-          "fullatt_block_indexes": [
-            7,
-            15,
-            23,
-            31
-          ],
-          "tokens_per_second": 2,
-          "temporal_patch_size": 2
-        },
-        "vision_start_token_id": 151652,
-        "vocab_size": 151936
-      }
+      "additional_params": {}
     }
   ],
   "image_text_custom_dtype_models":[

diff --git a/tests/configs/sequence_model_configs.json b/tests/configs/sequence_model_configs.json
@@ -1,5 +1,5 @@
 {
-    "seq_classification_models": [
-        "meta-llama/Llama-Prompt-Guard-2-22M"
-    ]
-}
+  "seq_classification_models": [
+    "ydshieh/tiny-random-BertForSequenceClassification"
+  ]
+}