diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 09e989ea06..0916fb0022 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -87,7 +87,7 @@ def main( cloud_ai_100_exec_kv( tokenizer=tokenizer, qpc_path=qpc_path, - device_id=device_group, + device_ids=device_group, prompt=prompt, prompts_txt_file_path=prompts_txt_file_path, generation_len=generation_len, diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 3fa049a8ff..48c151aee7 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -315,7 +315,7 @@ def main( _ = qeff_model.generate( tokenizer, prompts=prompt, - device_id=device_group, + device_ids=device_group, prompts_txt_file_path=prompts_txt_file_path, generation_len=generation_len, iteration=iteration, diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 4dffa1f7c5..f8934c5611 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -127,7 +127,7 @@ def latency_stats_bertstyle( qpc_path: str, seq_len: int, prompt: str, - device_id: Optional[List[int]] = None, + device_ids: Optional[List[int]] = None, ): """ Function to execute Bertstyle ONNX model on Cloud AI 100. @@ -137,9 +137,9 @@ def latency_stats_bertstyle( :qpc_path (str): Path to save generated binary file after compilation. :seq_len (int): Sequence length. :prompt (str): Sample prompt for the model text generation. - :device_id (List[int]): Device Ids to be used for compilation. If devices > 1, it enables multiple card setup. + :device_ids (List[int]): Device Ids to be used for compilation. If devices > 1, it enables multiple card setup. """ - session = QAICInferenceSession(qpc_path, device_id) + session = QAICInferenceSession(qpc_path, device_ids) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, padding_side="left") padding_check_and_fix(tokenizer) # Check and fix tokenizer viability inputs = tokenizer(prompt, return_tensors="np", max_length=seq_len, padding="max_length") @@ -319,7 +319,7 @@ def cloud_ai_100_exec_kv( qpc_path: str, prompt: Optional[str] = None, prompts_txt_file_path: Optional[str] = None, - device_id: Optional[List[int]] = None, + device_ids: Optional[List[int]] = None, generation_len: Optional[int] = None, comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, @@ -348,7 +348,7 @@ def cloud_ai_100_exec_kv( :prompt (str): Sample prompt for the model text generation. ``Defaults to None``. :prompts_txt_file_path (str): Path of the prompt text file. ``Defaults to None``. :generation_len (int): Maximum context length for the model during compilation. ``Defaults to None``. - :device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``. + :device_ids (List[int]): Device IDs to be used for execution. If ``len(device_ids) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``. :enable_debug_logs (bool): If True, it enables debugging logs. ``Defaults to False``. :stream (bool): If True, enable streamer, which returns tokens one by one as the model generates them. ``Defaults to True``. :Write_io_dir (str): Path to write the input and output files. ``Defaults to None``. @@ -377,7 +377,7 @@ def cloud_ai_100_exec_kv( base_path, onnx_model_path = QEfficient.export(model_name="gpt2") qpc_path = QEfficient.compile(onnx_path=onnx_model_path, qpc_path=os.path.join(base_path, "qpc"), num_cores=14, device_group=[0]) tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2") - exec_info = QEfficient.cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=qpc_path, prompt="Hi there!!", device_id=[0]) + exec_info = QEfficient.cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=qpc_path, prompt="Hi there!!", device_ids=[0]) """ batch_size, ctx_len, full_batch_size = get_compilation_dims(qpc_path) @@ -390,7 +390,7 @@ def cloud_ai_100_exec_kv( generate_text = TextGeneration( tokenizer=tokenizer, qpc_path=qpc_path, - device_id=device_id, + device_ids=device_ids, ctx_len=ctx_len, comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, comp_ctx_lengths_decode=comp_ctx_lengths_decode, @@ -443,7 +443,7 @@ def __init__( ctx_len: Optional[int] = None, comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, - device_id: Optional[List[int]] = None, + device_ids: Optional[List[int]] = None, enable_debug_logs: bool = False, write_io_dir: Optional[str] = None, is_tlm: Optional[int] = None, @@ -465,7 +465,7 @@ def __init__( # Load QPC self._session = QAICInferenceSession( - qpc_path, device_id, activate=activate, enable_debug_logs=enable_debug_logs + qpc_path, device_ids, activate=activate, enable_debug_logs=enable_debug_logs ) # Validate sampler inputs for On-Device Sampling @@ -1083,7 +1083,7 @@ def __init__( ctx_len: Optional[int] = None, comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, - device_id: Optional[List[int]] = None, + device_ids: Optional[List[int]] = None, enable_debug_logs: bool = False, write_io_dir: Optional[str] = None, is_tlm: bool = False, @@ -1099,7 +1099,7 @@ def __init__( ctx_len=ctx_len, comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, comp_ctx_lengths_decode=comp_ctx_lengths_decode, - device_id=device_id, + device_ids=device_ids, enable_debug_logs=enable_debug_logs, write_io_dir=write_io_dir, is_tlm=is_tlm, diff --git a/QEfficient/generation/vlm_generation.py b/QEfficient/generation/vlm_generation.py index 892fc145c4..d66279962a 100644 --- a/QEfficient/generation/vlm_generation.py +++ b/QEfficient/generation/vlm_generation.py @@ -54,7 +54,7 @@ class VisionLanguageGeneration(QEffTextGenerationBase): ... processor=processor, ... lang_qpc_path="path/to/lang.qpc", ... vision_qpc_path="path/to/vision.qpc", - ... device_id=[0] + ... device_ids=[0] ... ) >>> result = vlm.generate( ... images=["image1.jpg"], @@ -68,7 +68,7 @@ class VisionLanguageGeneration(QEffTextGenerationBase): ... processor=processor, ... lang_qpc_path="path/to/lang.qpc", ... vision_qpc_path="path/to/vision.qpc", - ... device_id=[0], + ... device_ids=[0], ... full_batch_size=8, # Enable continuous batching ... include_sampler=True, # Enable on-device sampling ... sampling_params=sampling_config @@ -82,7 +82,7 @@ def __init__( processor: AutoImageProcessor, lang_qpc_path: str, vision_qpc_path: str, - device_id: Optional[List[int]] = None, + device_ids: Optional[List[int]] = None, ctx_len: Optional[int] = None, comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, @@ -106,7 +106,7 @@ def __init__( processor: Image processor lang_qpc_path: Path to language model QPC vision_qpc_path: Path to vision encoder QPC - device_id: Device IDs for execution (default: [0]) + device_ids: Device IDs for execution (default: [0]) ctx_len: Context length enable_debug_logs: Enable debug logging write_io_dir: Directory for I/O file writing @@ -134,7 +134,7 @@ def __init__( ctx_len=ctx_len, comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, comp_ctx_lengths_decode=comp_ctx_lengths_decode, - device_id=device_id, + device_ids=device_ids, enable_debug_logs=enable_debug_logs, write_io_dir=write_io_dir, is_tlm=is_tlm, @@ -157,7 +157,7 @@ def __init__( self.image_height = image_height self.image_width = image_width self._vision_qpc_path = vision_qpc_path - self.device_id = device_id # Store device_id for vision components + self.device_ids = device_ids # Store device_ids for vision components self.enable_debug_logs = enable_debug_logs # Store for vision components self._vision_outputs_cache = LRUCache(max_size=100) # LRU cache for vision outputs self._vision_cache = {} # Cache for vision outputs across batches @@ -177,7 +177,7 @@ def _init_vision_components(self): """Initialize vision-specific components""" # Vision session (separate from base class language session) self._vision_session = QAICInferenceSession( - self._vision_qpc_path, self.device_id, activate=False, enable_debug_logs=self.enable_debug_logs + self._vision_qpc_path, self.device_ids, activate=False, enable_debug_logs=self.enable_debug_logs ) # Vision handler with language session coordination @@ -801,7 +801,7 @@ def generate_stream_tokens( tokenizer=self.tokenizer, qpc_path=self._qpc_path, ctx_len=self._ctx_len, - device_id=self.device_id, + device_ids=self.device_ids, enable_debug_logs=self.enable_debug_logs, is_tlm=self.is_tlm, include_sampler=self.include_sampler, diff --git a/QEfficient/peft/lora/auto.py b/QEfficient/peft/lora/auto.py index 91a62ae51a..870960d95a 100644 --- a/QEfficient/peft/lora/auto.py +++ b/QEfficient/peft/lora/auto.py @@ -395,7 +395,7 @@ def generate( tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], prompts: List[str], prompt_to_adapter_mapping: List[str] = None, - device_id: Optional[List[int]] = None, + device_ids: Optional[List[int]] = None, runtime: Optional[str] = "AI_100", **kwargs, ): @@ -410,7 +410,7 @@ def generate( tokenizer (PreTrainedTokenizerFast or PreTrainedTokenizer): Tokenizer used for inference. prompts (List[str]): List of prompts to generate outputs for. prompt_to_adapter_mapping (List[str]): List of adapter names to use for each prompt. Use "base" for the base model (no adapter). - device_id (List[int], optional): Device IDs to use for execution. If `None`, auto-device-picker is used. + device_ids (List[int], optional): Device IDs to use for execution. If `None`, auto-device-picker is used. runtime (str, optional): Runtime to use. Only "AI_100" is currently supported. Default is "AI_100". **kwargs: Additional generation parameters. @@ -440,7 +440,7 @@ def generate( tokenizer, self.qpc_path, prompt=prompts, - device_id=device_id, + device_ids=device_ids, generation_len=generation_len, prompt_to_lora_id_mapping=[ self.active_adapter_to_id[name] if name != "base" else 0 for name in prompt_to_adapter_mapping diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 2668be8a1e..45886d95b7 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1717,7 +1717,7 @@ def generate( vision_qpc_path=self.vision_model.qpc_path, tokenizer=tokenizer, processor=processor, - device_id=device_ids, # if device_ids is not None else [0], + device_ids=device_ids, # if device_ids is not None else [0], ctx_len=ctx_len_comp, full_batch_size=fbs, comp_ctx_lengths_prefill=self.comp_ctx_lengths_prefill, @@ -3674,7 +3674,7 @@ def generate( self, tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], prompts: List[str], - device_id: List[int] = None, + device_ids: List[int] = None, runtime_ai100: bool = True, **kwargs, ): @@ -3690,7 +3690,7 @@ def generate( Tokenizer for the model. prompts : list of str List of prompts to generate output for. - device_id : list of int, optional + device_ids : list of int, optional Device IDs for running the QPC. Defaults to `[0]` if not specified. runtime_ai100 : bool, optional Whether to use AI 100 runtime. Default is True. @@ -3724,7 +3724,7 @@ def generate( prompt=prompts, comp_ctx_lengths_prefill=self.comp_ctx_lengths_prefill, comp_ctx_lengths_decode=self.comp_ctx_lengths_decode, - device_id=device_id, + device_ids=device_ids, generation_len=generation_len, automation=kwargs.pop("automation", False), iteration=kwargs.pop("iteration", 1), @@ -4363,7 +4363,7 @@ def generate( :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. :processor (AutoProcessor): The Processor to use for encoding the waveform. ``optional`` Args: - :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model + :device_ids (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. Returns: :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. diff --git a/QEfficient/utils/run_utils.py b/QEfficient/utils/run_utils.py index 743f4a2e50..f10fdd3eeb 100644 --- a/QEfficient/utils/run_utils.py +++ b/QEfficient/utils/run_utils.py @@ -234,7 +234,7 @@ def run_kv_model_on_cloud_ai_100(self, qpc_path, device_group=None): execinfo = TextGeneration( tokenizer=self.input_handler.tokenizer, qpc_path=qpc_path, - device_id=device_group, + device_ids=device_group, ctx_len=self.input_handler.ctx_len, full_batch_size=self.input_handler.full_batch_size, ).generate(prompt=self.input_handler.prompt, generation_len=self.gen_len, stream=False) diff --git a/examples/peft/multi_adapter.py b/examples/peft/multi_adapter.py index a578c83506..942662d3f0 100644 --- a/examples/peft/multi_adapter.py +++ b/examples/peft/multi_adapter.py @@ -74,7 +74,7 @@ qeff_model.generate( tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name), prompts=prompts, - device_id=device_group, + device_ids=device_group, prompt_to_adapter_mapping=[ "gsm8k", "tldr_content_gen", diff --git a/examples/performance/cpp_execution/text_inference_cpp.py b/examples/performance/cpp_execution/text_inference_cpp.py index 8355c1e448..d65c4902a0 100644 --- a/examples/performance/cpp_execution/text_inference_cpp.py +++ b/examples/performance/cpp_execution/text_inference_cpp.py @@ -131,7 +131,7 @@ def main( qpc_path=qpc_dir_path, prompt_len=prompt_len, prompt=prompt, - device_id=device_group, + device_ids=device_group, prompts_txt_file_path=prompts_txt_file_path, generation_len=generation_len, full_batch_size=full_batch_size, @@ -144,7 +144,7 @@ def cloud_ai_100_exec_kv_cpp( prompt_len: int, prompt: Optional[List[str]] = None, prompts_txt_file_path: Optional[str] = None, - device_id: Optional[List[int]] = None, + device_ids: Optional[List[int]] = None, generation_len: Optional[int] = None, enable_debug_logs: bool = False, stream: bool = True, @@ -156,7 +156,7 @@ def cloud_ai_100_exec_kv_cpp( # ********* CPP Calling ******** InferenceSetIOBuffer.generatePrompt( - tokenizer, qpc_path, prompt_len, ctx_len, batch_size, prompt, generation_len, device_id + tokenizer, qpc_path, prompt_len, ctx_len, batch_size, prompt, generation_len, device_ids ) diff --git a/examples/performance/on_device_sampling.py b/examples/performance/on_device_sampling.py index c34a241c88..599970ee41 100644 --- a/examples/performance/on_device_sampling.py +++ b/examples/performance/on_device_sampling.py @@ -101,7 +101,7 @@ def main(args, **kwargs): tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=args.model_name), prompts=args.prompt, prompts_txt_file_path=args.prompts_txt_file_path, - device_id=args.device_group, + device_ids=args.device_group, generation_len=args.generation_len, include_sampler=include_sampler, return_pdfs=return_pdfs, diff --git a/examples/text_generation/basic_inference.py b/examples/text_generation/basic_inference.py index 5e52a962de..0e6709e89b 100644 --- a/examples/text_generation/basic_inference.py +++ b/examples/text_generation/basic_inference.py @@ -47,7 +47,7 @@ def main(): exec_info = model.generate( tokenizer=tokenizer, prompts=[args.prompt], - device_id=args.device_group, + device_ids=args.device_group, generation_len=args.generation_len, ) diff --git a/examples/text_generation/continuous_batching.py b/examples/text_generation/continuous_batching.py index ec3a36ea92..5f39a52351 100644 --- a/examples/text_generation/continuous_batching.py +++ b/examples/text_generation/continuous_batching.py @@ -56,7 +56,7 @@ def main(): exec_info = model.generate( tokenizer=tokenizer, prompts=prompt_list, - device_id=args.device_group, + device_ids=args.device_group, generation_len=args.generation_len, ) diff --git a/examples/text_generation/moe_inference.py b/examples/text_generation/moe_inference.py index 276c766dd6..483bce49de 100644 --- a/examples/text_generation/moe_inference.py +++ b/examples/text_generation/moe_inference.py @@ -54,7 +54,7 @@ def main(): exec_info = model.generate( tokenizer=tokenizer, prompts=[args.prompt], - device_id=args.device_group, + device_ids=args.device_group, generation_len=args.generation_len, )