Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion QEfficient/cloud/execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def main(
cloud_ai_100_exec_kv(
tokenizer=tokenizer,
qpc_path=qpc_path,
device_id=device_group,
device_ids=device_group,
prompt=prompt,
prompts_txt_file_path=prompts_txt_file_path,
generation_len=generation_len,
Expand Down
2 changes: 1 addition & 1 deletion QEfficient/cloud/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ def main(
_ = qeff_model.generate(
tokenizer,
prompts=prompt,
device_id=device_group,
device_ids=device_group,
prompts_txt_file_path=prompts_txt_file_path,
generation_len=generation_len,
iteration=iteration,
Expand Down
22 changes: 11 additions & 11 deletions QEfficient/generation/text_generation_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def latency_stats_bertstyle(
qpc_path: str,
seq_len: int,
prompt: str,
device_id: Optional[List[int]] = None,
device_ids: Optional[List[int]] = None,
):
"""
Function to execute Bertstyle ONNX model on Cloud AI 100.
Expand All @@ -137,9 +137,9 @@ def latency_stats_bertstyle(
:qpc_path (str): Path to save generated binary file after compilation.
:seq_len (int): Sequence length.
:prompt (str): Sample prompt for the model text generation.
:device_id (List[int]): Device Ids to be used for compilation. If devices > 1, it enables multiple card setup.
:device_ids (List[int]): Device Ids to be used for compilation. If devices > 1, it enables multiple card setup.
"""
session = QAICInferenceSession(qpc_path, device_id)
session = QAICInferenceSession(qpc_path, device_ids)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, padding_side="left")
padding_check_and_fix(tokenizer) # Check and fix tokenizer viability
inputs = tokenizer(prompt, return_tensors="np", max_length=seq_len, padding="max_length")
Expand Down Expand Up @@ -319,7 +319,7 @@ def cloud_ai_100_exec_kv(
qpc_path: str,
prompt: Optional[str] = None,
prompts_txt_file_path: Optional[str] = None,
device_id: Optional[List[int]] = None,
device_ids: Optional[List[int]] = None,
generation_len: Optional[int] = None,
comp_ctx_lengths_prefill: Optional[List[int]] = None,
comp_ctx_lengths_decode: Optional[List[int]] = None,
Expand Down Expand Up @@ -348,7 +348,7 @@ def cloud_ai_100_exec_kv(
:prompt (str): Sample prompt for the model text generation. ``Defaults to None``.
:prompts_txt_file_path (str): Path of the prompt text file. ``Defaults to None``.
:generation_len (int): Maximum context length for the model during compilation. ``Defaults to None``.
:device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``.
:device_ids (List[int]): Device IDs to be used for execution. If ``len(device_ids) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``.
:enable_debug_logs (bool): If True, it enables debugging logs. ``Defaults to False``.
:stream (bool): If True, enable streamer, which returns tokens one by one as the model generates them. ``Defaults to True``.
:Write_io_dir (str): Path to write the input and output files. ``Defaults to None``.
Expand Down Expand Up @@ -377,7 +377,7 @@ def cloud_ai_100_exec_kv(
base_path, onnx_model_path = QEfficient.export(model_name="gpt2")
qpc_path = QEfficient.compile(onnx_path=onnx_model_path, qpc_path=os.path.join(base_path, "qpc"), num_cores=14, device_group=[0])
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
exec_info = QEfficient.cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=qpc_path, prompt="Hi there!!", device_id=[0])
exec_info = QEfficient.cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=qpc_path, prompt="Hi there!!", device_ids=[0])

"""
batch_size, ctx_len, full_batch_size = get_compilation_dims(qpc_path)
Expand All @@ -390,7 +390,7 @@ def cloud_ai_100_exec_kv(
generate_text = TextGeneration(
tokenizer=tokenizer,
qpc_path=qpc_path,
device_id=device_id,
device_ids=device_ids,
ctx_len=ctx_len,
comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
comp_ctx_lengths_decode=comp_ctx_lengths_decode,
Expand Down Expand Up @@ -443,7 +443,7 @@ def __init__(
ctx_len: Optional[int] = None,
comp_ctx_lengths_prefill: Optional[List[int]] = None,
comp_ctx_lengths_decode: Optional[List[int]] = None,
device_id: Optional[List[int]] = None,
device_ids: Optional[List[int]] = None,
enable_debug_logs: bool = False,
write_io_dir: Optional[str] = None,
is_tlm: Optional[int] = None,
Expand All @@ -465,7 +465,7 @@ def __init__(

# Load QPC
self._session = QAICInferenceSession(
qpc_path, device_id, activate=activate, enable_debug_logs=enable_debug_logs
qpc_path, device_ids, activate=activate, enable_debug_logs=enable_debug_logs
)

# Validate sampler inputs for On-Device Sampling
Expand Down Expand Up @@ -1083,7 +1083,7 @@ def __init__(
ctx_len: Optional[int] = None,
comp_ctx_lengths_prefill: Optional[List[int]] = None,
comp_ctx_lengths_decode: Optional[List[int]] = None,
device_id: Optional[List[int]] = None,
device_ids: Optional[List[int]] = None,
enable_debug_logs: bool = False,
write_io_dir: Optional[str] = None,
is_tlm: bool = False,
Expand All @@ -1099,7 +1099,7 @@ def __init__(
ctx_len=ctx_len,
comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
comp_ctx_lengths_decode=comp_ctx_lengths_decode,
device_id=device_id,
device_ids=device_ids,
enable_debug_logs=enable_debug_logs,
write_io_dir=write_io_dir,
is_tlm=is_tlm,
Expand Down
16 changes: 8 additions & 8 deletions QEfficient/generation/vlm_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class VisionLanguageGeneration(QEffTextGenerationBase):
... processor=processor,
... lang_qpc_path="path/to/lang.qpc",
... vision_qpc_path="path/to/vision.qpc",
... device_id=[0]
... device_ids=[0]
... )
>>> result = vlm.generate(
... images=["image1.jpg"],
Expand All @@ -68,7 +68,7 @@ class VisionLanguageGeneration(QEffTextGenerationBase):
... processor=processor,
... lang_qpc_path="path/to/lang.qpc",
... vision_qpc_path="path/to/vision.qpc",
... device_id=[0],
... device_ids=[0],
... full_batch_size=8, # Enable continuous batching
... include_sampler=True, # Enable on-device sampling
... sampling_params=sampling_config
Expand All @@ -82,7 +82,7 @@ def __init__(
processor: AutoImageProcessor,
lang_qpc_path: str,
vision_qpc_path: str,
device_id: Optional[List[int]] = None,
device_ids: Optional[List[int]] = None,
ctx_len: Optional[int] = None,
comp_ctx_lengths_prefill: Optional[List[int]] = None,
comp_ctx_lengths_decode: Optional[List[int]] = None,
Expand All @@ -106,7 +106,7 @@ def __init__(
processor: Image processor
lang_qpc_path: Path to language model QPC
vision_qpc_path: Path to vision encoder QPC
device_id: Device IDs for execution (default: [0])
device_ids: Device IDs for execution (default: [0])
ctx_len: Context length
enable_debug_logs: Enable debug logging
write_io_dir: Directory for I/O file writing
Expand Down Expand Up @@ -134,7 +134,7 @@ def __init__(
ctx_len=ctx_len,
comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
comp_ctx_lengths_decode=comp_ctx_lengths_decode,
device_id=device_id,
device_ids=device_ids,
enable_debug_logs=enable_debug_logs,
write_io_dir=write_io_dir,
is_tlm=is_tlm,
Expand All @@ -157,7 +157,7 @@ def __init__(
self.image_height = image_height
self.image_width = image_width
self._vision_qpc_path = vision_qpc_path
self.device_id = device_id # Store device_id for vision components
self.device_ids = device_ids # Store device_ids for vision components
self.enable_debug_logs = enable_debug_logs # Store for vision components
self._vision_outputs_cache = LRUCache(max_size=100) # LRU cache for vision outputs
self._vision_cache = {} # Cache for vision outputs across batches
Expand All @@ -177,7 +177,7 @@ def _init_vision_components(self):
"""Initialize vision-specific components"""
# Vision session (separate from base class language session)
self._vision_session = QAICInferenceSession(
self._vision_qpc_path, self.device_id, activate=False, enable_debug_logs=self.enable_debug_logs
self._vision_qpc_path, self.device_ids, activate=False, enable_debug_logs=self.enable_debug_logs
)

# Vision handler with language session coordination
Expand Down Expand Up @@ -801,7 +801,7 @@ def generate_stream_tokens(
tokenizer=self.tokenizer,
qpc_path=self._qpc_path,
ctx_len=self._ctx_len,
device_id=self.device_id,
device_ids=self.device_ids,
enable_debug_logs=self.enable_debug_logs,
is_tlm=self.is_tlm,
include_sampler=self.include_sampler,
Expand Down
6 changes: 3 additions & 3 deletions QEfficient/peft/lora/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ def generate(
tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
prompts: List[str],
prompt_to_adapter_mapping: List[str] = None,
device_id: Optional[List[int]] = None,
device_ids: Optional[List[int]] = None,
runtime: Optional[str] = "AI_100",
**kwargs,
):
Expand All @@ -410,7 +410,7 @@ def generate(
tokenizer (PreTrainedTokenizerFast or PreTrainedTokenizer): Tokenizer used for inference.
prompts (List[str]): List of prompts to generate outputs for.
prompt_to_adapter_mapping (List[str]): List of adapter names to use for each prompt. Use "base" for the base model (no adapter).
device_id (List[int], optional): Device IDs to use for execution. If `None`, auto-device-picker is used.
device_ids (List[int], optional): Device IDs to use for execution. If `None`, auto-device-picker is used.
runtime (str, optional): Runtime to use. Only "AI_100" is currently supported. Default is "AI_100".
**kwargs: Additional generation parameters.

Expand Down Expand Up @@ -440,7 +440,7 @@ def generate(
tokenizer,
self.qpc_path,
prompt=prompts,
device_id=device_id,
device_ids=device_ids,
generation_len=generation_len,
prompt_to_lora_id_mapping=[
self.active_adapter_to_id[name] if name != "base" else 0 for name in prompt_to_adapter_mapping
Expand Down
10 changes: 5 additions & 5 deletions QEfficient/transformers/models/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -1717,7 +1717,7 @@ def generate(
vision_qpc_path=self.vision_model.qpc_path,
tokenizer=tokenizer,
processor=processor,
device_id=device_ids, # if device_ids is not None else [0],
device_ids=device_ids, # if device_ids is not None else [0],
ctx_len=ctx_len_comp,
full_batch_size=fbs,
comp_ctx_lengths_prefill=self.comp_ctx_lengths_prefill,
Expand Down Expand Up @@ -3674,7 +3674,7 @@ def generate(
self,
tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
prompts: List[str],
device_id: List[int] = None,
device_ids: List[int] = None,
runtime_ai100: bool = True,
**kwargs,
):
Expand All @@ -3690,7 +3690,7 @@ def generate(
Tokenizer for the model.
prompts : list of str
List of prompts to generate output for.
device_id : list of int, optional
device_ids : list of int, optional
Device IDs for running the QPC. Defaults to `[0]` if not specified.
runtime_ai100 : bool, optional
Whether to use AI 100 runtime. Default is True.
Expand Down Expand Up @@ -3724,7 +3724,7 @@ def generate(
prompt=prompts,
comp_ctx_lengths_prefill=self.comp_ctx_lengths_prefill,
comp_ctx_lengths_decode=self.comp_ctx_lengths_decode,
device_id=device_id,
device_ids=device_ids,
generation_len=generation_len,
automation=kwargs.pop("automation", False),
iteration=kwargs.pop("iteration", 1),
Expand Down Expand Up @@ -4363,7 +4363,7 @@ def generate(
:inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution.
:processor (AutoProcessor): The Processor to use for encoding the waveform.
``optional`` Args:
:device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
:device_ids (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
:runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
Returns:
:dict: Output from the ``AI_100`` or ``PyTorch`` runtime.
Expand Down
2 changes: 1 addition & 1 deletion QEfficient/utils/run_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def run_kv_model_on_cloud_ai_100(self, qpc_path, device_group=None):
execinfo = TextGeneration(
tokenizer=self.input_handler.tokenizer,
qpc_path=qpc_path,
device_id=device_group,
device_ids=device_group,
ctx_len=self.input_handler.ctx_len,
full_batch_size=self.input_handler.full_batch_size,
).generate(prompt=self.input_handler.prompt, generation_len=self.gen_len, stream=False)
Expand Down
2 changes: 1 addition & 1 deletion examples/peft/multi_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
qeff_model.generate(
tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=base_model_name),
prompts=prompts,
device_id=device_group,
device_ids=device_group,
prompt_to_adapter_mapping=[
"gsm8k",
"tldr_content_gen",
Expand Down
6 changes: 3 additions & 3 deletions examples/performance/cpp_execution/text_inference_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def main(
qpc_path=qpc_dir_path,
prompt_len=prompt_len,
prompt=prompt,
device_id=device_group,
device_ids=device_group,
prompts_txt_file_path=prompts_txt_file_path,
generation_len=generation_len,
full_batch_size=full_batch_size,
Expand All @@ -144,7 +144,7 @@ def cloud_ai_100_exec_kv_cpp(
prompt_len: int,
prompt: Optional[List[str]] = None,
prompts_txt_file_path: Optional[str] = None,
device_id: Optional[List[int]] = None,
device_ids: Optional[List[int]] = None,
generation_len: Optional[int] = None,
enable_debug_logs: bool = False,
stream: bool = True,
Expand All @@ -156,7 +156,7 @@ def cloud_ai_100_exec_kv_cpp(

# ********* CPP Calling ********
InferenceSetIOBuffer.generatePrompt(
tokenizer, qpc_path, prompt_len, ctx_len, batch_size, prompt, generation_len, device_id
tokenizer, qpc_path, prompt_len, ctx_len, batch_size, prompt, generation_len, device_ids
)


Expand Down
2 changes: 1 addition & 1 deletion examples/performance/on_device_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def main(args, **kwargs):
tokenizer=load_hf_tokenizer(pretrained_model_name_or_path=args.model_name),
prompts=args.prompt,
prompts_txt_file_path=args.prompts_txt_file_path,
device_id=args.device_group,
device_ids=args.device_group,
generation_len=args.generation_len,
include_sampler=include_sampler,
return_pdfs=return_pdfs,
Expand Down
2 changes: 1 addition & 1 deletion examples/text_generation/basic_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def main():
exec_info = model.generate(
tokenizer=tokenizer,
prompts=[args.prompt],
device_id=args.device_group,
device_ids=args.device_group,
generation_len=args.generation_len,
)

Expand Down
2 changes: 1 addition & 1 deletion examples/text_generation/continuous_batching.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def main():
exec_info = model.generate(
tokenizer=tokenizer,
prompts=prompt_list,
device_id=args.device_group,
device_ids=args.device_group,
generation_len=args.generation_len,
)

Expand Down
2 changes: 1 addition & 1 deletion examples/text_generation/moe_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def main():
exec_info = model.generate(
tokenizer=tokenizer,
prompts=[args.prompt],
device_id=args.device_group,
device_ids=args.device_group,
generation_len=args.generation_len,
)

Expand Down
Loading