diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index f181ee5eb..35a852a30 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -85,7 +85,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = Fals
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
 
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
-        return cls(model, is_tlm=is_tlm)
+        return cls(model, is_tlm=is_tlm, pretrained_model_name_or_path=pretrained_model_name_or_path)
 
     @property
     def model_name(self) -> str:
@@ -160,6 +160,7 @@ def __init__(self, model: nn.Module, **kwargs):
         super().__init__(model)
         self.model.config.use_cache = True
         self.num_layers = model.config.num_hidden_layers
+        self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)
 
     @classmethod
     @with_replaced_quantizers
@@ -212,7 +213,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
                 model, kv_offload=kv_offload
             )
 
-        return cls(model)
+        return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path)
 
     @property
     def model_hash(self) -> str:
@@ -226,6 +227,7 @@ def model_hash(self) -> str:
         mhash = hashlib.sha256()
         mhash.update(to_hashable(self.model.config.to_diff_dict()))
         mhash.update(to_hashable(self._transform_names()))
+        mhash.update(to_hashable(self.pretrained_model_name_or_path))
         mhash = mhash.hexdigest()[:16]
         return mhash
 
@@ -533,7 +535,6 @@ def __init__(
         self.config = model.config
         self.vision_model = QEffVisionEncoderForTextImageToTextModel(model)
         self.lang_model = QEffCausalLMForTextImageToTextModel(model)
-        self.input_shapes, self.output_names = None, None
 
     @property
@@ -1319,13 +1320,13 @@ def __init__(
         )
         super().__init__(model)
 
-        # Set use_cache=True to get KV values as output during ONNX export
         self.model.config.use_cache = True
         self.num_layers = model.config.num_hidden_layers
         self.continuous_batching = continuous_batching
 
         self.model, transformed = SpDTransform.apply(self.model, qaic_config, **kwargs)
         self.is_tlm = transformed
+        self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)
 
     @property
     def model_name(self) -> str:
@@ -1399,11 +1400,11 @@ def from_pretrained(
             return MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP[model.__class__.__name__](
                 model, kv_offload=kv_offload
             )
-
         return cls(
             model,
             continuous_batching=continuous_batching,
             qaic_config=qaic_config,
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
             **kwargs,
         )
 
@@ -1415,6 +1416,7 @@ def model_hash(self) -> str:
         mhash.update(to_hashable({"continuous_batching": self.continuous_batching}))
         mhash.update(to_hashable({"is_tlm": self.is_tlm}))
         mhash.update(to_hashable(self._transform_names()))
+        mhash.update(to_hashable(self.pretrained_model_name_or_path))
         mhash = mhash.hexdigest()[:16]
         return mhash
 
@@ -1755,6 +1757,7 @@ def __init__(self, model: nn.Module, **kwargs):
         super().__init__(model)
         self.model.config.use_cache = True
         self.num_layers = model.config.num_hidden_layers
+        self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)
 
     @property
     def model_hash(self) -> str:
@@ -1768,6 +1771,7 @@ def model_hash(self) -> str:
         mhash = hashlib.sha256()
         mhash.update(to_hashable(self.model.config.to_diff_dict()))
         mhash.update(to_hashable(self._transform_names()))
+        mhash.update(to_hashable(self.pretrained_model_name_or_path))
         mhash = mhash.hexdigest()[:16]
         return mhash
diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py
index efa2187b7..1302b45d9 100644
--- a/tests/transformers/models/test_causal_lm_models.py
+++ b/tests/transformers/models/test_causal_lm_models.py
@@ -123,7 +123,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf)
 
     is_tlm = False if num_speculative_tokens is None else True
-    qeff_model = QEFFAutoModelForCausalLM(model_hf, is_tlm=is_tlm)
+    qeff_model = QEFFAutoModelForCausalLM(model_hf, is_tlm=is_tlm, pretrained_model_name_or_path=model_name)
 
     pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)
 
@@ -183,7 +183,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf)
     pytorch_hf_tokens = np.vstack(pytorch_hf_tokens)
 
-    qeff_model = QEFFAutoModelForCausalLM(model_hf, continuous_batching=True, is_tlm=is_tlm)
+    qeff_model = QEFFAutoModelForCausalLM(
+        model_hf, continuous_batching=True, is_tlm=is_tlm, pretrained_model_name_or_path=model_name
+    )
     onnx_model_path = qeff_model.export()
 
     if not get_available_device_id():
@@ -219,7 +221,7 @@ def test_causal_lm_export_with_deprecated_api(model_name):
     model_config["n_layer"] = 1
     model, _ = load_causal_lm_model(model_config)
     tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
-    qeff_model = QEFFAutoModelForCausalLM(model)
+    qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name)
     new_api_onnx_model_path = qeff_model.export()
     _, old_api_onnx_model_path = qualcomm_efficient_converter(
         model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer
diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py
index 22f4bd580..14d96221e 100644
--- a/tests/transformers/models/test_embedding_models.py
+++ b/tests/transformers/models/test_embedding_models.py
@@ -47,7 +47,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     pt_outputs = pt_model(**inputs)
     pt_embeddings = pt_outputs[0][0].detach().numpy()
     # Pytorch transformed model
-    qeff_model = QEFFAutoModel(pt_model)
+    qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name)
     qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
     qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy()
     mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings))
diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py
index 63fd318c7..df0180a7e 100644
--- a/tests/transformers/models/test_speech_seq2seq_models.py
+++ b/tests/transformers/models/test_speech_seq2seq_models.py
@@ -314,7 +314,7 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(
     pytorch_hf_tokens = run_seq2seq_pytorch_hf(model_hf, processor, data, sample_rate, ctx_len)
 
-    qeff_model = QEFFAutoModelForSpeechSeq2Seq(model_hf)
+    qeff_model = QEFFAutoModelForSpeechSeq2Seq(model_hf, pretrained_model_name_or_path=model_name)
 
     pytorch_kv_tokens = run_seq2seq_pytorch_with_kv(qeff_model, processor, data, sample_rate, ctx_len)