From 81f0a5abfd308d99ecd96d2dbeb2c7e84b8a7d89 Mon Sep 17 00:00:00 2001
From: vbaddi <vbaddi@qti.qualcomm.com>
Date: Wed, 3 Jun 2026 00:07:22 +0530
Subject: [PATCH] ci(0306): speed up QAIC PR tests with safe parallelism

- Run QAIC CI stages with xdist using four workers where safe
- Add opt-in QAIC device allocator to avoid runtime device contention
- Isolate QEFF_HOME per xdist worker to avoid cache/output collisions
- Switch slow CI test configs and CLI tests to tiny models
- Reduce CLI compile/generation shapes for faster PR turnaround

Signed-off-by: vbaddi <vbaddi@qti.qualcomm.com>
---
 scripts/Jenkinsfile                           |  29 ++++-
 tests/cloud/test_export_compile_execute.py    |  25 ++--
 tests/cloud/test_infer.py                     |  25 ++--
 tests/configs/audio_model_configs.json        |  14 +--
 tests/configs/causal_model_configs.json       |  13 ++-
 tests/configs/image_text_model_configs.json   |  69 +----------
 tests/configs/sequence_model_configs.json     |   8 +-
 tests/conftest.py                             | 108 +++++++++++++++++-
 .../test_seq_classification.py                |   2 +-
 9 files changed, 190 insertions(+), 103 deletions(-)

diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile
index 49f637c2f9..b9eb32784f 100644
--- a/scripts/Jenkinsfile
+++ b/scripts/Jenkinsfile
@@ -102,7 +102,11 @@ pipeline {
                            mkdir -p $PWD/Non_qaic_llm &&
                            export TOKENIZERS_PARALLELISM=false &&
                            export QEFF_HOME=$PWD/Non_qaic_llm &&
-                           pytest tests -m '(llm_model) and (not qnn) and ${TEST_FILTER}' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --junitxml=tests/tests_log2.xml --durations=10 &&
+                           export QEFF_QAIC_DEVICE_POOL=0,1,2,3 &&
+                           PYTEST_XDIST_ARGS='-n 4 --dist loadscope' &&
+                           export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 &&
+                           if [ x${TEST_PROFILE} = xfull_layers_model ]; then PYTEST_XDIST_ARGS=''; unset QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR; fi &&
+                           pytest tests -m '(llm_model) and (not qnn) and ${TEST_FILTER}' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline \${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log2.xml --durations=10 &&
                            junitparser merge tests/tests_log2.xml tests/tests_log.xml &&
                            deactivate"
                            '''
@@ -123,7 +127,11 @@ pipeline {
                     mkdir -p $PWD/Non_qaic_feature &&
                     export TOKENIZERS_PARALLELISM=false &&
                     export QEFF_HOME=$PWD/Non_qaic_feature &&
-                    pytest tests -m '(on_qaic) and (feature) and (not qnn) and ${TEST_FILTER}' --ignore tests/transformers/sampler --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --junitxml=tests/tests_log2_feature.xml --durations=10 &&
+                    export QEFF_QAIC_DEVICE_POOL=0,1,2,3 &&
+                    PYTEST_XDIST_ARGS='-n 4 --dist loadscope' &&
+                    export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 &&
+                    if [ x${TEST_PROFILE} = xfull_layers_model ]; then PYTEST_XDIST_ARGS=''; unset QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR; fi &&
+                    pytest tests -m '(on_qaic) and (feature) and (not qnn) and ${TEST_FILTER}' --ignore tests/transformers/sampler --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline \${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log2_feature.xml --durations=10 &&
                     junitparser merge tests/tests_log2_feature.xml tests/tests_log.xml &&
                     deactivate"
                     '''
@@ -141,7 +149,11 @@ pipeline {
                     mkdir -p $PWD/Non_cli_qaic_multimodal &&
                     export TOKENIZERS_PARALLELISM=false &&
                     export QEFF_HOME=$PWD/Non_cli_qaic_multimodal &&
-                    pytest tests -m '(multimodal) and (not qnn) and ${TEST_FILTER}' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --ignore tests/transformers/models/reranker/test_reranker_mad.py --junitxml=tests/tests_log6.xml --durations=10 &&
+                    export QEFF_QAIC_DEVICE_POOL=0,1,2,3 &&
+                    PYTEST_XDIST_ARGS='-n 4 --dist loadscope' &&
+                    export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 &&
+                    if [ x${TEST_PROFILE} = xfull_layers_model ]; then PYTEST_XDIST_ARGS=''; unset QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR; fi &&
+                    pytest tests -m '(multimodal) and (not qnn) and ${TEST_FILTER}' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --ignore tests/transformers/models/reranker/test_reranker_mad.py \${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log6.xml --durations=10 &&
                     junitparser merge tests/tests_log6.xml tests/tests_log.xml &&
                     deactivate"
                     '''
@@ -179,7 +191,11 @@ pipeline {
                     export TOKENIZERS_PARALLELISM=false &&
                     export QEFF_HOME=$PWD/Non_cli_qaic_diffusion &&
                     export HF_HUB_CACHE=/huggingface_hub &&
-                    pytest tests -m 'diffusion_models' --ignore tests/vllm  --ignore tests/unit_test --ignore tests/nightly_pipeline --junitxml=tests/tests_log_diffusion.xml --durations=10 &&
+                    export QEFF_QAIC_DEVICE_POOL=0,1,2,3 &&
+                    PYTEST_XDIST_ARGS='-n 4 --dist loadscope' &&
+                    export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 &&
+                    if [ x${TEST_PROFILE} = xfull_layers_model ]; then PYTEST_XDIST_ARGS=''; unset QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR; fi &&
+                    pytest tests -m 'diffusion_models' --ignore tests/vllm  --ignore tests/unit_test --ignore tests/nightly_pipeline \${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log_diffusion.xml --durations=10 &&
                     junitparser merge tests/tests_log_diffusion.xml tests/tests_log.xml &&
                     deactivate"
                     '''
@@ -200,7 +216,10 @@ pipeline {
                     mkdir -p $PWD/cli &&
                     export TOKENIZERS_PARALLELISM=false &&
                     export QEFF_HOME=$PWD/cli &&
-                    pytest tests -m '(cli and not qnn) and (not finetune)' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --junitxml=tests/tests_log3.xml --durations=10 &&
+                    export QEFF_QAIC_DEVICE_POOL=0,1,2,3 &&
+                    PYTEST_XDIST_ARGS='-n 4 --dist loadscope' &&
+                    export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 &&
+                    pytest tests -m '(cli and not qnn) and (not finetune)' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline \${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log3.xml --durations=10 &&
                     junitparser merge tests/tests_log3.xml tests/tests_log.xml &&
                     deactivate"
                     '''
diff --git a/tests/cloud/test_export_compile_execute.py b/tests/cloud/test_export_compile_execute.py
index c2e77578ad..65a4824436 100644
--- a/tests/cloud/test_export_compile_execute.py
+++ b/tests/cloud/test_export_compile_execute.py
@@ -10,6 +10,7 @@
 
 import pytest
 import yaml
+from transformers import AutoConfig
 
 import QEfficient
 from QEfficient.cloud.execute import main as execute
@@ -38,14 +39,16 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl
     base_key = "past_key."
     base_value = "past_value."
     precision = "float16"
+    config = AutoConfig.from_pretrained(model_name)
+    num_layers = getattr(config, "num_hidden_layers", getattr(config, "n_layer", 12))
 
     data = []
 
-    for i in range(12):
+    for i in range(num_layers):
         data.append({"IOName": f"{base_key}{i}", "Precision": precision})
         data.append({"IOName": f"{base_value}{i}", "Precision": precision})
 
-    for i in range(12):
+    for i in range(num_layers):
         data.append({"IOName": f"{base_key}{i}_RetainedState", "Precision": precision})
         data.append({"IOName": f"{base_value}{i}_RetainedState", "Precision": precision})
 
@@ -61,8 +64,8 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl
         aic_enable_depth_first=True,
         mos=1,
         batch_size=1,
-        prompt_len=32,
-        ctx_len=128,
+        prompt_len=8,
+        ctx_len=32,
         mxfp6=True,
         mxint8=True,
         full_batch_size=full_batch_size,
@@ -77,7 +80,7 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl
         qpc_path=qpc_path,
         prompt="My name is",
         prompts_txt_file_path="examples/sample_prompts/prompts.txt",
-        generation_len=20,
+        generation_len=4,
         full_batch_size=full_batch_size,
     )
 
@@ -89,14 +92,16 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl
 @pytest.mark.cli
 def test_export_compile_execute(mocker):
     # testing export -> compile -> infer without full_batch_size
-    check_export_compile_execute(mocker, model_name="gpt2")
+    check_export_compile_execute(mocker, model_name="hf-internal-testing/tiny-random-GPT2LMHeadModel")
 
 
 @pytest.mark.on_qaic
 @pytest.mark.cli
 def test_export_compile_execute_fbs(mocker):
     # testing export -> compile -> infer with full_batch_size
-    check_export_compile_execute(mocker, model_name="gpt2", full_batch_size=3)
+    check_export_compile_execute(
+        mocker, model_name="hf-internal-testing/tiny-random-GPT2LMHeadModel", full_batch_size=3
+    )
 
 
 @pytest.mark.on_qaic
@@ -104,7 +109,7 @@ def test_export_compile_execute_fbs(mocker):
 @pytest.mark.cli
 def test_export_compile_execute_qnn(mocker):
     # testing export -> compile -> infer without full_batch_size in QNN environment
-    check_export_compile_execute(mocker, model_name="gpt2", enable_qnn=True)
+    check_export_compile_execute(mocker, model_name="hf-internal-testing/tiny-random-GPT2LMHeadModel", enable_qnn=True)
 
 
 @pytest.mark.on_qaic
@@ -112,4 +117,6 @@ def test_export_compile_execute_qnn(mocker):
 @pytest.mark.cli
 def test_export_compile_execute_qnn_fbs(mocker):
     # testing export -> compile -> infer with full_batch_size in QNN environment
-    check_export_compile_execute(mocker, model_name="gpt2", full_batch_size=3, enable_qnn=True)
+    check_export_compile_execute(
+        mocker, model_name="hf-internal-testing/tiny-random-GPT2LMHeadModel", full_batch_size=3, enable_qnn=True
+    )
diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py
index 5cb1f3b6dd..22234f00c0 100644
--- a/tests/cloud/test_infer.py
+++ b/tests/cloud/test_infer.py
@@ -37,8 +37,8 @@ def check_infer(
         mos=1,
         hf_token=None,
         batch_size=1,
-        prompt_len=32,
-        ctx_len=128,
+        prompt_len=8,
+        ctx_len=32,
         generation_len=generation_len,
         mxfp6=True,
         mxint8=True,
@@ -70,14 +70,16 @@ def test_infer(mocker):
     Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html
     """
     # testing infer without full_batch_size
-    check_infer(mocker, model_name="lu-vae/llama-68m-fft")
+    check_infer(mocker, model_name="hf-internal-testing/tiny-random-LlamaForCausalLM", generation_len=4)
 
 
 @pytest.mark.on_qaic
 @pytest.mark.cli
 def test_infer_fbs(mocker):
     # testing infer with full_batch_size
-    check_infer(mocker, model_name="lu-vae/llama-68m-fft", full_batch_size=3)
+    check_infer(
+        mocker, model_name="hf-internal-testing/tiny-random-LlamaForCausalLM", full_batch_size=3, generation_len=4
+    )
 
 
 @pytest.mark.on_qaic
@@ -85,7 +87,9 @@ def test_infer_fbs(mocker):
 @pytest.mark.qnn
 def test_infer_qnn(mocker):
     # testing infer without full_batch_size in QNN environment
-    check_infer(mocker, model_name="lu-vae/llama-68m-fft", enable_qnn=True)
+    check_infer(
+        mocker, model_name="hf-internal-testing/tiny-random-LlamaForCausalLM", enable_qnn=True, generation_len=4
+    )
 
 
 @pytest.mark.on_qaic
@@ -93,7 +97,13 @@ def test_infer_qnn(mocker):
 @pytest.mark.qnn
 def test_infer_qnn_fbs(mocker):
     # testing infer with full_batch_size in QNN environment
-    check_infer(mocker, model_name="lu-vae/llama-68m-fft", full_batch_size=3, enable_qnn=True)
+    check_infer(
+        mocker,
+        model_name="hf-internal-testing/tiny-random-LlamaForCausalLM",
+        full_batch_size=3,
+        enable_qnn=True,
+        generation_len=4,
+    )
 
 
 @pytest.mark.on_qaic
@@ -102,9 +112,10 @@ def test_infer_vlm(mocker):
     # testing infer for MM models
     check_infer(
         mocker,
-        model_name="llava-hf/llava-1.5-7b-hf",
+        model_name="tiny-random/gemma-3",
         prompt="Describe the image.",
         image_url="https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg",
+        generation_len=4,
     )
 
 
diff --git a/tests/configs/audio_model_configs.json b/tests/configs/audio_model_configs.json
index c658eb0c35..c95d3fdf6e 100644
--- a/tests/configs/audio_model_configs.json
+++ b/tests/configs/audio_model_configs.json
@@ -1,8 +1,8 @@
 {
-   "speech_seq2seq_models": [
-        "openai/whisper-tiny"
-    ],
-    "audio_embedding_models": [
-        "facebook/wav2vec2-base-960h"
-    ]
-}
\ No newline at end of file
+  "speech_seq2seq_models": [
+    "hf-internal-testing/tiny-random-WhisperForConditionalGeneration"
+  ],
+  "audio_embedding_models": [
+    "hf-internal-testing/tiny-random-wav2vec2"
+  ]
+}
diff --git a/tests/configs/causal_model_configs.json b/tests/configs/causal_model_configs.json
index 93f4e7ae2f..95940857f6 100644
--- a/tests/configs/causal_model_configs.json
+++ b/tests/configs/causal_model_configs.json
@@ -659,7 +659,7 @@
   ],
   "disaggregated_dummy_models": [
     {
-      "model_name": "openai/gpt-oss-20b",
+      "model_name": "tiny-random/gpt-oss-bf16",
       "model_type": "gpt_oss",
       "tokenizer_id": "gpt2",
       "additional_params": {
@@ -671,7 +671,7 @@
         "num_local_experts": 4,
         "head_dim": 32,
         "max_position_embeddings": 512,
-        "vocab_size": 201088,
+        "vocab_size": 50257,
         "sliding_window": 128
       }
     },
@@ -708,7 +708,7 @@
       }
     },
     {
-      "model_name": "openai/gpt-oss-20b",
+      "model_name": "tiny-random/gpt-oss-bf16",
       "model_type": "gpt_oss",
       "additional_params": {
         "num_hidden_layers": 2,
@@ -716,7 +716,12 @@
         "intermediate_size": 256,
         "num_attention_heads": 2,
         "num_key_value_heads": 1,
-        "num_local_experts": 4
+        "num_local_experts": 4,
+        "vocab_size": 8192,
+        "max_position_embeddings": 128,
+        "sliding_window": 128,
+        "pad_token_id": 0,
+        "eos_token_id": 0
       }
     }
   ]
diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json
index eebb87957a..b066249535 100644
--- a/tests/configs/image_text_model_configs.json
+++ b/tests/configs/image_text_model_configs.json
@@ -499,77 +499,16 @@
   ],
   "image_text_subfunction_models":[
     {
-      "model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
+      "model_name": "optimum-intel-internal-testing/tiny-random-qwen2.5-vl",
       "model_type": "qwen2_5_vl",
       "batch_size": 1,
       "prompt_len": 128,
-      "ctx_len": 4096,
-      "img_size": 1540,
+      "ctx_len": 512,
+      "img_size": 224,
       "img_url": "https://picsum.photos/id/237/536/354",
       "text_prompt": "Can you describe the image in detail.",
       "num_layers": 1,
-      "additional_params": {
-        "dtype": "float32",
-        "hidden_size": 2048,
-        "intermediate_size": 11008,
-        "max_position_embeddings": 128000,
-        "max_window_layers": 70,
-        "num_attention_heads": 16,
-        "num_hidden_layers": 1,
-        "num_key_value_heads": 2,
-        "text_config": {
-          "architectures": [
-            "Qwen2_5_VLForConditionalGeneration"
-          ],
-          "layer_types": [
-            "full_attention"
-          ],
-          "dtype": "float32",
-          "hidden_size": 2048,
-          "intermediate_size": 11008,
-          "max_position_embeddings": 128000,
-          "max_window_layers": 70,
-          "model_type": "qwen2_5_vl_text",
-          "num_attention_heads": 16,
-          "num_hidden_layers": 1,
-          "num_key_value_heads": 2,
-          "rms_norm_eps": 1e-06,
-          "rope_scaling": {
-            "mrope_section": [
-              16,
-              24,
-              24
-            ],
-            "rope_type": "default",
-            "type": "default"
-          },
-          "vocab_size": 151936
-        },
-        "vision_config": {
-          "depth": 1,
-          "num_hidden_layers": 1,
-          "hidden_act": "silu",
-          "hidden_size": 1280,
-          "intermediate_size": 3420,
-          "num_heads": 16,
-          "in_chans": 3,
-          "out_hidden_size": 2048,
-          "patch_size": 14,
-          "spatial_merge_size": 2,
-          "spatial_patch_size": 14,
-          "window_size": 112,
-          "fullatt_block_indexes": [
-            7,
-            15,
-            23,
-            31
-          ],
-          "tokens_per_second": 2,
-          "temporal_patch_size": 2
-        },
-        "vision_start_token_id": 151652,
-        "vocab_size": 151936
-      }
+      "additional_params": {}
     }
   ],
   "image_text_custom_dtype_models":[
diff --git a/tests/configs/sequence_model_configs.json b/tests/configs/sequence_model_configs.json
index 32a37a84d4..699a6afb10 100644
--- a/tests/configs/sequence_model_configs.json
+++ b/tests/configs/sequence_model_configs.json
@@ -1,5 +1,5 @@
 {
-    "seq_classification_models": [
-        "meta-llama/Llama-Prompt-Guard-2-22M"
-    ]
-}
\ No newline at end of file
+  "seq_classification_models": [
+    "ydshieh/tiny-random-BertForSequenceClassification"
+  ]
+}
diff --git a/tests/conftest.py b/tests/conftest.py
index 62714e1459..5bfd74700c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -7,14 +7,23 @@
 
 import os
 import shutil
+import tempfile
+import time
+from contextlib import contextmanager
 from pathlib import Path
 
 import pytest
 from transformers import logging
 
+import QEfficient.utils.cache as qeff_cache
 from QEfficient.utils.cache import QEFF_HOME
 from QEfficient.utils.logging_utils import logger
 
+try:
+    import fcntl
+except ImportError:  # pragma: no cover - CI runs on Linux.
+    fcntl = None
+
 _QUICKCHECK_FILE = "tests/unit_test/models/test_model_quickcheck.py"
 _QUICKCHECK_SUMMARY = {}
 _QUICKCHECK_META = {
@@ -65,6 +74,68 @@
 }
 
 
+def _qaic_device_pool():
+    """Parse QEFF_QAIC_DEVICE_POOL (comma-separated, default 0,1,2,3) into a list of int device ids."""
+    return [int(entry) for entry in os.environ.get("QEFF_QAIC_DEVICE_POOL", "0,1,2,3").split(",") if entry.strip()]
+
+
+def _qaic_device_lock_dir():  # directory that holds one lock file per device
+    return Path(os.getenv("QEFF_QAIC_DEVICE_LOCK_DIR", tempfile.gettempdir()), "qeff_qaic_device_locks")
+
+
+@contextmanager
+def _allocated_qaic_device():
+    """Yield a pool device id held under an exclusive flock until exit; yield None for an empty pool."""
+    devices = _qaic_device_pool()
+    if not devices or fcntl is None:  # nothing to lock: empty pool, or flock is unavailable here
+        yield devices[0] if devices else None
+        return
+
+    lock_dir = _qaic_device_lock_dir()
+    lock_dir.mkdir(parents=True, exist_ok=True)
+    locked_file = None
+    try:
+        while True:
+            for device_id in devices:
+                lock_file = open(lock_dir / f"device_{device_id}.lock", "a+", encoding="utf-8")
+                try:
+                    fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
+                except OSError as err:
+                    lock_file.close()  # close on every flock failure so the handle never leaks
+                    if not isinstance(err, BlockingIOError):
+                        raise
+                    continue  # lock is held elsewhere; try the next device
+                locked_file = lock_file
+                yield device_id
+                return
+            time.sleep(1)  # every device is busy; poll again
+    finally:
+        if locked_file is not None:
+            try:
+                try:
+                    fcntl.flock(locked_file.fileno(), fcntl.LOCK_UN)
+                finally:
+                    locked_file.close()  # runs even when the unlock call fails
+            except OSError:
+                pass  # release is best effort
+
+
+def _configure_worker_qeff_home():
+    """Point QEFF_HOME at a per-worker subdirectory when PYTEST_XDIST_WORKER is set."""
+    global QEFF_HOME
+
+    xdist_worker = os.environ.get("PYTEST_XDIST_WORKER")
+    if not xdist_worker:
+        return
+
+    home = Path(os.environ.get("QEFF_HOME", str(QEFF_HOME)))
+    if home.name != xdist_worker:  # do not nest again when the path already ends in the worker id
+        home = home / xdist_worker
+    home.mkdir(parents=True, exist_ok=True)
+    os.environ["QEFF_HOME"] = str(home)
+    QEFF_HOME = qeff_cache.QEFF_HOME = home
+
+
 def _is_nightly_pipeline_session(session):
     """Check if this is a nightly_pipeline test session"""
     # Check invocation args
@@ -82,7 +153,7 @@ def _is_nightly_pipeline_session(session):
     return False
 
 
-def qeff_models_clean_up(qeff_dir=QEFF_HOME):
+def qeff_models_clean_up(qeff_dir=None):
     """
     Clean up QEFF models and cache.
 
@@ -90,6 +161,9 @@ def qeff_models_clean_up(qeff_dir=QEFF_HOME):
         qeff_dir: Can be a string (file/dir path), PosixPath, or list of strings/PosixPath objects
                  If a file path is provided, its parent directory will be deleted
     """
+    if qeff_dir is None:
+        qeff_dir = QEFF_HOME
+
     if isinstance(qeff_dir, (str, Path)):
         paths = [qeff_dir]
     else:
@@ -117,6 +191,37 @@ def manual_cleanup():
     return qeff_models_clean_up
 
 
+@pytest.fixture(autouse=True)
+def qaic_device_allocator(request, monkeypatch):
+    """Pin each on_qaic test to a single locked QAIC device when the allocator is enabled.
+
+    The fixture is a no-op unless QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR is "1" and the
+    test carries the on_qaic keyword, so full-layer or multi-device runs keep the
+    default runtime behavior. When active, it exports QEFF_QAIC_DEVICE_ID and
+    redirects QAICInferenceSession calls that use the implicit default device
+    (device_ids of None or [0]) to the device locked for this test.
+    """
+    allocator_enabled = os.environ.get("QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR") == "1"
+    if not (allocator_enabled and "on_qaic" in request.keywords):
+        yield
+        return
+
+    with _allocated_qaic_device() as device_id:
+        if device_id is not None:
+            monkeypatch.setenv("QEFF_QAIC_DEVICE_ID", str(device_id))
+            from QEfficient.generation.cloud_infer import QAICInferenceSession
+
+            wrapped_init = QAICInferenceSession.__init__
+
+            # Explicit device lists other than [0] are passed through untouched.
+            def _init_with_allocated_device(self, qpc_path, device_ids=None, *args, **kwargs):
+                use_allocated = device_ids is None or device_ids == [0]
+                return wrapped_init(self, qpc_path, [device_id] if use_allocated else device_ids, *args, **kwargs)
+
+            monkeypatch.setattr(QAICInferenceSession, "__init__", _init_with_allocated_device)
+        yield
+
+
 def pytest_sessionstart(session):
     logger.info("PYTEST Session Starting ...")
     # Skip cleanup for nightly_pipeline tests
@@ -131,6 +236,7 @@ def pytest_sessionstart(session):
 
 def pytest_configure(config):
     """Register custom markers for test categorization."""
+    _configure_worker_qeff_home()
     config.addinivalue_line("markers", "llm_model: mark test as a pure LLM model inference test")
     config.addinivalue_line(
         "markers", "feature: mark test as a feature-specific test (SPD, sampler, prefix caching, LoRA, etc.)"
diff --git a/tests/transformers/models/sequence_models/test_seq_classification.py b/tests/transformers/models/sequence_models/test_seq_classification.py
index 0d76067c52..b4a4479f28 100644
--- a/tests/transformers/models/sequence_models/test_seq_classification.py
+++ b/tests/transformers/models/sequence_models/test_seq_classification.py
@@ -85,7 +85,7 @@ def check_seq_classification_pytorch_vs_ai100(
     assert os.path.isfile(qconfig_path), f"qconfig.json not found at {qconfig_path}"
 
     # Run on Cloud AI 100
-    ai100_outputs = qeff_model.generate(inputs=inputs, device_ids=[0])
+    ai100_outputs = qeff_model.generate(inputs=inputs)
     ai100_logits = ai100_outputs["logits"]
     ai100_predicted_class = ai100_logits.argmax().item()