From 81f0a5abfd308d99ecd96d2dbeb2c7e84b8a7d89 Mon Sep 17 00:00:00 2001 From: vbaddi Date: Wed, 3 Jun 2026 00:07:22 +0530 Subject: [PATCH] ci(0306): speed up QAIC PR tests with safe parallelism - Run QAIC CI stages with xdist using four workers where safe - Add opt-in QAIC device allocator to avoid runtime device contention - Isolate QEFF_HOME per xdist worker to avoid cache/output collisions - Switch slow CI test configs and CLI tests to tiny models - Reduce CLI compile/generation shapes for faster PR turnaround Signed-off-by: vbaddi --- scripts/Jenkinsfile | 29 ++++- tests/cloud/test_export_compile_execute.py | 25 ++-- tests/cloud/test_infer.py | 25 ++-- tests/configs/audio_model_configs.json | 14 +-- tests/configs/causal_model_configs.json | 13 ++- tests/configs/image_text_model_configs.json | 69 +---------- tests/configs/sequence_model_configs.json | 8 +- tests/conftest.py | 108 +++++++++++++++++- .../test_seq_classification.py | 2 +- 9 files changed, 190 insertions(+), 103 deletions(-) diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 49f637c2f9..b9eb32784f 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -102,7 +102,11 @@ pipeline { mkdir -p $PWD/Non_qaic_llm && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_qaic_llm && - pytest tests -m '(llm_model) and (not qnn) and ${TEST_FILTER}' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --junitxml=tests/tests_log2.xml --durations=10 && + export QEFF_QAIC_DEVICE_POOL=0,1,2,3 && + PYTEST_XDIST_ARGS='-n 4 --dist loadscope' && + export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 && + if [ x${TEST_PROFILE} = xfull_layers_model ]; then PYTEST_XDIST_ARGS=''; unset QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR; fi && + pytest tests -m '(llm_model) and (not qnn) and ${TEST_FILTER}' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline \${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log2.xml --durations=10 && junitparser merge tests/tests_log2.xml tests/tests_log.xml && deactivate" ''' @@ -123,7 +127,11 @@ pipeline { mkdir -p $PWD/Non_qaic_feature && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_qaic_feature && - pytest tests -m '(on_qaic) and (feature) and (not qnn) and ${TEST_FILTER}' --ignore tests/transformers/sampler --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --junitxml=tests/tests_log2_feature.xml --durations=10 && + export QEFF_QAIC_DEVICE_POOL=0,1,2,3 && + PYTEST_XDIST_ARGS='-n 4 --dist loadscope' && + export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 && + if [ x${TEST_PROFILE} = xfull_layers_model ]; then PYTEST_XDIST_ARGS=''; unset QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR; fi && + pytest tests -m '(on_qaic) and (feature) and (not qnn) and ${TEST_FILTER}' --ignore tests/transformers/sampler --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline \${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log2_feature.xml --durations=10 && junitparser merge tests/tests_log2_feature.xml tests/tests_log.xml && deactivate" ''' @@ -141,7 +149,11 @@ pipeline { mkdir -p $PWD/Non_cli_qaic_multimodal && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_cli_qaic_multimodal && - pytest tests -m '(multimodal) and (not qnn) and ${TEST_FILTER}' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --ignore tests/transformers/models/reranker/test_reranker_mad.py --junitxml=tests/tests_log6.xml --durations=10 && +export QEFF_QAIC_DEVICE_POOL=0,1,2,3 && + PYTEST_XDIST_ARGS='-n 4 --dist loadscope' && + export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 && + if [ x${TEST_PROFILE} = xfull_layers_model ]; then PYTEST_XDIST_ARGS=''; unset QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR; fi && + pytest tests -m '(multimodal) and (not qnn) and ${TEST_FILTER}' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --ignore tests/transformers/models/reranker/test_reranker_mad.py \${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log6.xml --durations=10 && junitparser merge tests/tests_log6.xml tests/tests_log.xml && deactivate" ''' @@ -179,7 +191,11 @@ pipeline { export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_cli_qaic_diffusion && export HF_HUB_CACHE=/huggingface_hub && - pytest tests -m 'diffusion_models' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --junitxml=tests/tests_log_diffusion.xml --durations=10 && + export QEFF_QAIC_DEVICE_POOL=0,1,2,3 && + PYTEST_XDIST_ARGS='-n 4 --dist loadscope' && + export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 && + if [ x${TEST_PROFILE} = xfull_layers_model ]; then PYTEST_XDIST_ARGS=''; unset QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR; fi && + pytest tests -m 'diffusion_models' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline \${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log_diffusion.xml --durations=10 && junitparser merge tests/tests_log_diffusion.xml tests/tests_log.xml && deactivate" ''' @@ -200,7 +216,10 @@ pipeline { mkdir -p $PWD/cli && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/cli && - pytest tests -m '(cli and not qnn) and (not finetune)' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline --junitxml=tests/tests_log3.xml --durations=10 && + export QEFF_QAIC_DEVICE_POOL=0,1,2,3 && + PYTEST_XDIST_ARGS='-n 4 --dist loadscope' && + export QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR=1 && + pytest tests -m '(cli and not qnn) and (not finetune)' --ignore tests/vllm --ignore tests/unit_test --ignore tests/nightly_pipeline \${PYTEST_XDIST_ARGS} --junitxml=tests/tests_log3.xml --durations=10 && junitparser merge tests/tests_log3.xml tests/tests_log.xml && deactivate" ''' diff --git a/tests/cloud/test_export_compile_execute.py b/tests/cloud/test_export_compile_execute.py index c2e77578ad..65a4824436 100644 --- a/tests/cloud/test_export_compile_execute.py +++ b/tests/cloud/test_export_compile_execute.py @@ -10,6 +10,7 @@ import pytest import yaml +from transformers import AutoConfig import QEfficient from QEfficient.cloud.execute import main as execute @@ -38,14 +39,16 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl base_key = "past_key." base_value = "past_value." precision = "float16" + config = AutoConfig.from_pretrained(model_name) + num_layers = getattr(config, "num_hidden_layers", getattr(config, "n_layer", 12)) data = [] - for i in range(12): + for i in range(num_layers): data.append({"IOName": f"{base_key}{i}", "Precision": precision}) data.append({"IOName": f"{base_value}{i}", "Precision": precision}) - for i in range(12): + for i in range(num_layers): data.append({"IOName": f"{base_key}{i}_RetainedState", "Precision": precision}) data.append({"IOName": f"{base_value}{i}_RetainedState", "Precision": precision}) @@ -61,8 +64,8 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl aic_enable_depth_first=True, mos=1, batch_size=1, - prompt_len=32, - ctx_len=128, + prompt_len=8, + ctx_len=32, mxfp6=True, mxint8=True, full_batch_size=full_batch_size, @@ -77,7 +80,7 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl qpc_path=qpc_path, prompt="My name is", prompts_txt_file_path="examples/sample_prompts/prompts.txt", - generation_len=20, + generation_len=4, full_batch_size=full_batch_size, ) @@ -89,14 +92,16 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl @pytest.mark.cli def test_export_compile_execute(mocker): # testing export -> compile -> infer without full_batch_size - check_export_compile_execute(mocker, model_name="gpt2") + check_export_compile_execute(mocker, model_name="hf-internal-testing/tiny-random-GPT2LMHeadModel") @pytest.mark.on_qaic @pytest.mark.cli def test_export_compile_execute_fbs(mocker): # testing export -> compile -> infer with full_batch_size - check_export_compile_execute(mocker, model_name="gpt2", full_batch_size=3) + check_export_compile_execute( + mocker, model_name="hf-internal-testing/tiny-random-GPT2LMHeadModel", full_batch_size=3 + ) @pytest.mark.on_qaic @@ -104,7 +109,7 @@ def test_export_compile_execute_fbs(mocker): @pytest.mark.cli def test_export_compile_execute_qnn(mocker): # testing export -> compile -> infer without full_batch_size in QNN environment - check_export_compile_execute(mocker, model_name="gpt2", enable_qnn=True) + check_export_compile_execute(mocker, model_name="hf-internal-testing/tiny-random-GPT2LMHeadModel", enable_qnn=True) @pytest.mark.on_qaic @@ -112,4 +117,6 @@ def test_export_compile_execute_qnn(mocker): @pytest.mark.cli def test_export_compile_execute_qnn_fbs(mocker): # testing export -> compile -> infer with full_batch_size in QNN environment - check_export_compile_execute(mocker, model_name="gpt2", full_batch_size=3, enable_qnn=True) + check_export_compile_execute( + mocker, model_name="hf-internal-testing/tiny-random-GPT2LMHeadModel", full_batch_size=3, enable_qnn=True + ) diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index 5cb1f3b6dd..22234f00c0 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -37,8 +37,8 @@ def check_infer( mos=1, hf_token=None, batch_size=1, - prompt_len=32, - ctx_len=128, + prompt_len=8, + ctx_len=32, generation_len=generation_len, mxfp6=True, mxint8=True, @@ -70,14 +70,16 @@ def test_infer(mocker): Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html """ # testing infer without full_batch_size - check_infer(mocker, model_name="lu-vae/llama-68m-fft") + check_infer(mocker, model_name="hf-internal-testing/tiny-random-LlamaForCausalLM", generation_len=4) @pytest.mark.on_qaic @pytest.mark.cli def test_infer_fbs(mocker): # testing infer with full_batch_size - check_infer(mocker, model_name="lu-vae/llama-68m-fft", full_batch_size=3) + check_infer( + mocker, model_name="hf-internal-testing/tiny-random-LlamaForCausalLM", full_batch_size=3, generation_len=4 + ) @pytest.mark.on_qaic @@ -85,7 +87,9 @@ def test_infer_fbs(mocker): @pytest.mark.qnn def test_infer_qnn(mocker): # testing infer without full_batch_size in QNN environment - check_infer(mocker, model_name="lu-vae/llama-68m-fft", enable_qnn=True) + check_infer( + mocker, model_name="hf-internal-testing/tiny-random-LlamaForCausalLM", enable_qnn=True, generation_len=4 + ) @pytest.mark.on_qaic @@ -93,7 +97,13 @@ def test_infer_qnn(mocker): @pytest.mark.qnn def test_infer_qnn_fbs(mocker): # testing infer with full_batch_size in QNN environment - check_infer(mocker, model_name="lu-vae/llama-68m-fft", full_batch_size=3, enable_qnn=True) + check_infer( + mocker, + model_name="hf-internal-testing/tiny-random-LlamaForCausalLM", + full_batch_size=3, + enable_qnn=True, + generation_len=4, + ) @pytest.mark.on_qaic @@ -102,9 +112,10 @@ def test_infer_vlm(mocker): # testing infer for MM models check_infer( mocker, - model_name="llava-hf/llava-1.5-7b-hf", + model_name="tiny-random/gemma-3", prompt="Describe the image.", image_url="https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg", + generation_len=4, ) diff --git a/tests/configs/audio_model_configs.json b/tests/configs/audio_model_configs.json index c658eb0c35..c95d3fdf6e 100644 --- a/tests/configs/audio_model_configs.json +++ b/tests/configs/audio_model_configs.json @@ -1,8 +1,8 @@ { - "speech_seq2seq_models": [ - "openai/whisper-tiny" - ], - "audio_embedding_models": [ - "facebook/wav2vec2-base-960h" - ] -} \ No newline at end of file + "speech_seq2seq_models": [ + "hf-internal-testing/tiny-random-WhisperForConditionalGeneration" + ], + "audio_embedding_models": [ + "hf-internal-testing/tiny-random-wav2vec2" + ] +} diff --git a/tests/configs/causal_model_configs.json b/tests/configs/causal_model_configs.json index 93f4e7ae2f..95940857f6 100644 --- a/tests/configs/causal_model_configs.json +++ b/tests/configs/causal_model_configs.json @@ -659,7 +659,7 @@ ], "disaggregated_dummy_models": [ { - "model_name": "openai/gpt-oss-20b", + "model_name": "tiny-random/gpt-oss-bf16", "model_type": "gpt_oss", "tokenizer_id": "gpt2", "additional_params": { @@ -671,7 +671,7 @@ "num_local_experts": 4, "head_dim": 32, "max_position_embeddings": 512, - "vocab_size": 201088, + "vocab_size": 50257, "sliding_window": 128 } }, @@ -708,7 +708,7 @@ } }, { - "model_name": "openai/gpt-oss-20b", + "model_name": "tiny-random/gpt-oss-bf16", "model_type": "gpt_oss", "additional_params": { "num_hidden_layers": 2, @@ -716,7 +716,12 @@ "intermediate_size": 256, "num_attention_heads": 2, "num_key_value_heads": 1, - "num_local_experts": 4 + "num_local_experts": 4, + "vocab_size": 8192, + "max_position_embeddings": 128, + "sliding_window": 128, + "pad_token_id": 0, + "eos_token_id": 0 } } ] diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json index eebb87957a..b066249535 100644 --- a/tests/configs/image_text_model_configs.json +++ b/tests/configs/image_text_model_configs.json @@ -499,77 +499,16 @@ ], "image_text_subfunction_models":[ { - "model_name": "Qwen/Qwen2.5-VL-3B-Instruct", + "model_name": "optimum-intel-internal-testing/tiny-random-qwen2.5-vl", "model_type": "qwen2_5_vl", "batch_size": 1, "prompt_len": 128, - "ctx_len": 4096, - "img_size": 1540, + "ctx_len": 512, + "img_size": 224, "img_url": "https://picsum.photos/id/237/536/354", "text_prompt": "Can you describe the image in detail.", "num_layers": 1, - "additional_params": { - "dtype": "float32", - "hidden_size": 2048, - "intermediate_size": 11008, - "max_position_embeddings": 128000, - "max_window_layers": 70, - "num_attention_heads": 16, - "num_hidden_layers": 1, - "num_key_value_heads": 2, - "text_config": { - "architectures": [ - "Qwen2_5_VLForConditionalGeneration" - ], - "layer_types": [ - "full_attention" - ], - "dtype": "float32", - "hidden_size": 2048, - "intermediate_size": 11008, - "max_position_embeddings": 128000, - "max_window_layers": 70, - "model_type": "qwen2_5_vl_text", - "num_attention_heads": 16, - "num_hidden_layers": 1, - "num_key_value_heads": 2, - "rms_norm_eps": 1e-06, - "rope_scaling": { - "mrope_section": [ - 16, - 24, - 24 - ], - "rope_type": "default", - "type": "default" - }, - "vocab_size": 151936 - }, - "vision_config": { - "depth": 1, - "num_hidden_layers": 1, - "hidden_act": "silu", - "hidden_size": 1280, - "intermediate_size": 3420, - "num_heads": 16, - "in_chans": 3, - "out_hidden_size": 2048, - "patch_size": 14, - "spatial_merge_size": 2, - "spatial_patch_size": 14, - "window_size": 112, - "fullatt_block_indexes": [ - 7, - 15, - 23, - 31 - ], - "tokens_per_second": 2, - "temporal_patch_size": 2 - }, - "vision_start_token_id": 151652, - "vocab_size": 151936 - } + "additional_params": {} } ], "image_text_custom_dtype_models":[ diff --git a/tests/configs/sequence_model_configs.json b/tests/configs/sequence_model_configs.json index 32a37a84d4..699a6afb10 100644 --- a/tests/configs/sequence_model_configs.json +++ b/tests/configs/sequence_model_configs.json @@ -1,5 +1,5 @@ { - "seq_classification_models": [ - "meta-llama/Llama-Prompt-Guard-2-22M" - ] -} \ No newline at end of file + "seq_classification_models": [ + "ydshieh/tiny-random-BertForSequenceClassification" + ] +} diff --git a/tests/conftest.py b/tests/conftest.py index 62714e1459..5bfd74700c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,14 +7,23 @@ import os import shutil +import tempfile +import time +from contextlib import contextmanager from pathlib import Path import pytest from transformers import logging +import QEfficient.utils.cache as qeff_cache from QEfficient.utils.cache import QEFF_HOME from QEfficient.utils.logging_utils import logger +try: + import fcntl +except ImportError: # pragma: no cover - CI runs on Linux. + fcntl = None + _QUICKCHECK_FILE = "tests/unit_test/models/test_model_quickcheck.py" _QUICKCHECK_SUMMARY = {} _QUICKCHECK_META = { @@ -65,6 +74,68 @@ } +def _qaic_device_pool(): + pool = os.environ.get("QEFF_QAIC_DEVICE_POOL", "0,1,2,3") + return [int(device_id) for device_id in pool.split(",") if device_id.strip()] + + +def _qaic_device_lock_dir(): + return Path(os.environ.get("QEFF_QAIC_DEVICE_LOCK_DIR", tempfile.gettempdir())) / "qeff_qaic_device_locks" + + +@contextmanager +def _allocated_qaic_device(): + devices = _qaic_device_pool() + if not devices: + yield None + return + + lock_dir = _qaic_device_lock_dir() + lock_dir.mkdir(parents=True, exist_ok=True) + locked_file = None + try: + while True: + for device_id in devices: + lock_file = open(lock_dir / f"device_{device_id}.lock", "a+", encoding="utf-8") + if fcntl is None: + locked_file = lock_file + yield device_id + return + try: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) + except BlockingIOError: + lock_file.close() + continue + locked_file = lock_file + yield device_id + return + time.sleep(1) + finally: + if locked_file is not None: + try: + if fcntl is not None: + fcntl.flock(locked_file.fileno(), fcntl.LOCK_UN) + locked_file.close() + except OSError: + pass + + +def _configure_worker_qeff_home(): + global QEFF_HOME + + worker_id = os.environ.get("PYTEST_XDIST_WORKER") + if not worker_id: + return + + base_qeff_home = Path(os.environ.get("QEFF_HOME", str(QEFF_HOME))) + worker_qeff_home = base_qeff_home if base_qeff_home.name == worker_id else base_qeff_home / worker_id + worker_qeff_home.mkdir(parents=True, exist_ok=True) + os.environ["QEFF_HOME"] = str(worker_qeff_home) + + QEFF_HOME = worker_qeff_home + qeff_cache.QEFF_HOME = worker_qeff_home + + def _is_nightly_pipeline_session(session): """Check if this is a nightly_pipeline test session""" # Check invocation args @@ -82,7 +153,7 @@ def _is_nightly_pipeline_session(session): return False -def qeff_models_clean_up(qeff_dir=QEFF_HOME): +def qeff_models_clean_up(qeff_dir=None): """ Clean up QEFF models and cache. @@ -90,6 +161,9 @@ def qeff_models_clean_up(qeff_dir=QEFF_HOME): qeff_dir: Can be a string (file/dir path), PosixPath, or list of strings/PosixPath objects If a file path is provided, its parent directory will be deleted """ + if qeff_dir is None: + qeff_dir = QEFF_HOME + if isinstance(qeff_dir, (str, Path)): paths = [qeff_dir] else: @@ -117,6 +191,37 @@ def manual_cleanup(): return qeff_models_clean_up +@pytest.fixture(autouse=True) +def qaic_device_allocator(request, monkeypatch): + """Assign one QAIC device per on_qaic test when xdist is enabled in CI. + + The allocator is opt-in so full-layer or multi-device runs can stay on the + default runtime behavior. For one-device tests it redirects the implicit + default device 0 to the worker's locked device. + """ + if "on_qaic" not in request.keywords or os.environ.get("QEFF_ENABLE_QAIC_DEVICE_ALLOCATOR") != "1": + yield + return + + with _allocated_qaic_device() as device_id: + if device_id is None: + yield + return + + monkeypatch.setenv("QEFF_QAIC_DEVICE_ID", str(device_id)) + from QEfficient.generation.cloud_infer import QAICInferenceSession + + original_init = QAICInferenceSession.__init__ + + def _init_with_allocated_device(self, qpc_path, device_ids=None, *args, **kwargs): + if device_ids is None or device_ids == [0]: + device_ids = [device_id] + return original_init(self, qpc_path, device_ids, *args, **kwargs) + + monkeypatch.setattr(QAICInferenceSession, "__init__", _init_with_allocated_device) + yield + + def pytest_sessionstart(session): logger.info("PYTEST Session Starting ...") # Skip cleanup for nightly_pipeline tests @@ -131,6 +236,7 @@ def pytest_sessionstart(session): def pytest_configure(config): """Register custom markers for test categorization.""" + _configure_worker_qeff_home() config.addinivalue_line("markers", "llm_model: mark test as a pure LLM model inference test") config.addinivalue_line( "markers", "feature: mark test as a feature-specific test (SPD, sampler, prefix caching, LoRA, etc.)" diff --git a/tests/transformers/models/sequence_models/test_seq_classification.py b/tests/transformers/models/sequence_models/test_seq_classification.py index 0d76067c52..b4a4479f28 100644 --- a/tests/transformers/models/sequence_models/test_seq_classification.py +++ b/tests/transformers/models/sequence_models/test_seq_classification.py @@ -85,7 +85,7 @@ def check_seq_classification_pytorch_vs_ai100( assert os.path.isfile(qconfig_path), f"qconfig.json not found at {qconfig_path}" # Run on Cloud AI 100 - ai100_outputs = qeff_model.generate(inputs=inputs, device_ids=[0]) + ai100_outputs = qeff_model.generate(inputs=inputs) ai100_logits = ai100_outputs["logits"] ai100_predicted_class = ai100_logits.argmax().item()