diff --git a/QEfficient/base/__init__.py b/QEfficient/base/__init__.py
--- a/QEfficient/base/__init__.py
+++ b/QEfficient/base/__init__.py
@@ -6,9 +6,27 @@
 # -----------------------------------------------------------------------------
 
 from QEfficient.base.common import QEFFCommonLoader  # noqa: F401
-from QEfficient.transformers.models.modeling_auto import (  # noqa: F401
-    QEFFAutoModel,
-    QEFFAutoModelForCausalLM,
-    QEFFAutoModelForImageTextToText,
-    QEFFAutoModelForSpeechSeq2Seq,
-)
+
+# Auto-model classes re-exported from this package. They are resolved on first
+# access rather than at import time, so importing QEfficient.base does not
+# eagerly import modeling_auto and everything that module imports.
+_LAZY_EXPORTS = (
+    "QEFFAutoModel",
+    "QEFFAutoModelForCausalLM",
+    "QEFFAutoModelForImageTextToText",
+    "QEFFAutoModelForSpeechSeq2Seq",
+)
+
+
+def get_qeff_models():
+    """Return a dict mapping each re-exported auto-model class name to its class."""
+    from QEfficient.transformers.models import modeling_auto
+
+    return {name: getattr(modeling_auto, name) for name in _LAZY_EXPORTS}
+
+
+def __getattr__(name):
+    """Keep ``from QEfficient.base import QEFFAutoModel`` working via PEP 562 lazy lookup."""
+    if name in _LAZY_EXPORTS:
+        return get_qeff_models()[name]
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py
--- a/QEfficient/base/common.py
+++ b/QEfficient/base/common.py
@@ -47,7 +47,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) ->
             or EXTERNAL_MODEL_CLASS_MAPPING[config.__class__.__name__]
         )
         if class_name:
-            module = __import__("QEfficient.transformers.models.modeling_auto")
+            # __import__("a.b.c") returns the top-level package `a`, not the
+            # submodule, so bind the submodule itself. Imported here rather than
+            # at module top because modeling_auto imports QEfficient.base, whose
+            # __init__ imports this module.
+            from QEfficient.transformers.models import modeling_auto as module
             model_class = getattr(module, class_name)
         else:
             raise NotImplementedError(
diff --git a/QEfficient/generation/cloud_infer.py b/QEfficient/generation/cloud_infer.py
--- a/QEfficient/generation/cloud_infer.py
+++ b/QEfficient/generation/cloud_infer.py
@@ -11,33 +11,45 @@

 import numpy as np

 try:
     import qaicrt
 except ImportError:
     import platform
     import sys

     sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}")
-    import qaicrt
+    try:
+        import qaicrt
+    except ImportError:
+        # Runtime bindings not installed: keep this module importable and let
+        # QAICInferenceSession.__init__ report the problem when a session is requested.
+        qaicrt = None

 try:
     import QAicApi_pb2 as aicapi
 except ImportError:
     import sys

     sys.path.append("/opt/qti-aic/dev/python")
-    import QAicApi_pb2 as aicapi
+    try:
+        import QAicApi_pb2 as aicapi
+    except ImportError:
+        aicapi = None

-aic_to_np_dtype_mapping = {
-    aicapi.FLOAT_TYPE: np.dtype(np.float32),
-    aicapi.FLOAT_16_TYPE: np.dtype(np.float16),
-    aicapi.INT8_Q_TYPE: np.dtype(np.int8),
-    aicapi.UINT8_Q_TYPE: np.dtype(np.uint8),
-    aicapi.INT16_Q_TYPE: np.dtype(np.int16),
-    aicapi.INT32_Q_TYPE: np.dtype(np.int32),
-    aicapi.INT32_I_TYPE: np.dtype(np.int32),
-    aicapi.INT64_I_TYPE: np.dtype(np.int64),
-    aicapi.INT8_TYPE: np.dtype(np.int8),
-}
+aic_to_np_dtype_mapping = (
+    {
+        aicapi.FLOAT_TYPE: np.dtype(np.float32),
+        aicapi.FLOAT_16_TYPE: np.dtype(np.float16),
+        aicapi.INT8_Q_TYPE: np.dtype(np.int8),
+        aicapi.UINT8_Q_TYPE: np.dtype(np.uint8),
+        aicapi.INT16_Q_TYPE: np.dtype(np.int16),
+        aicapi.INT32_Q_TYPE: np.dtype(np.int32),
+        aicapi.INT32_I_TYPE: np.dtype(np.int32),
+        aicapi.INT64_I_TYPE: np.dtype(np.int64),
+        aicapi.INT8_TYPE: np.dtype(np.int8),
+    }
+    if aicapi is not None
+    else {}  # the keys are QAicApi_pb2 constants, so there is nothing to map without it
+)


@@ -58,6 +70,13 @@ def __init__(
         :activate: bool. If false, activation will be disabled. Default=True.
         :enable_debug_logs: bool. If True, It will enable debug logs. Default=False.
         """
+        # The bindings are optional at import time, but every step below calls into them.
+        if qaicrt is None or aicapi is None:
+            raise ImportError(
+                "QAICInferenceSession requires the `qaicrt` and `QAicApi_pb2` modules, "
+                "which could not be imported (also looked under /opt/qti-aic/dev)"
+            )
+
         # Load QPC
         if device_ids is not None:
             devices = qaicrt.QIDList(device_ids)
diff --git a/tests/transformers/models/test_gpt2_windows.py b/tests/transformers/models/test_gpt2_windows.py
new file mode 100644
--- /dev/null
+++ b/tests/transformers/models/test_gpt2_windows.py
@@ -0,0 +1,45 @@
+"""Smoke test: wrap GPT-2 in QEFFAutoModelForCausalLM and export it (no compile step)."""
+
+import os
+
+from transformers import AutoModelForCausalLM
+
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.utils import hf_download
+
+MODEL_NAME = "gpt2"
+
+
+def load_causal_lm_model(model_config):
+    """
+    Function to download a model from huggingface and load it with transformers
+    --------
+
+    :model_config: Dict. Must contain "model_name", the repo id passed to hf_download.
+
+    :return model_hf, params: the loaded model (switched to eval mode) and its parameter count.
+    """
+    model_path = hf_download(
+        repo_id=model_config["model_name"],
+        ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"],
+    )
+    model_hf = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        use_cache=True,
+        attn_implementation="eager",
+        low_cpu_mem_usage=False,
+    )
+    params = sum(p.numel() for p in model_hf.parameters())
+    model_hf.eval()
+    return model_hf, params
+
+
+def test_gpt2_export():
+    """Export GPT-2 through QEFFAutoModelForCausalLM and check that an artifact was produced."""
+    model_hf, _ = load_causal_lm_model({"model_name": MODEL_NAME})
+    qeff_model = QEFFAutoModelForCausalLM(model_hf, pretrained_model_name_or_path=MODEL_NAME)
+
+    export_path = qeff_model.export()
+
+    assert export_path is not None
+    assert os.path.exists(str(export_path))  # NOTE(review): assumes export() returns a filesystem path - confirm