quic · qcdipankar · Apr 30, 2025 · Apr 30, 2025 · May 12, 2025
@@ -85,7 +85,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = Fals
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
 
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
-        return cls(model, is_tlm=is_tlm)
+        return cls(model, is_tlm=is_tlm, pretrained_model_name_or_path=pretrained_model_name_or_path)
 
     @property
     def model_name(self) -> str:
@@ -160,6 +160,7 @@ def __init__(self, model: nn.Module, **kwargs):
         super().__init__(model)
         self.model.config.use_cache = True
         self.num_layers = model.config.num_hidden_layers
+        self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)
 
     @classmethod
     @with_replaced_quantizers
@@ -212,7 +213,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
                 model, kv_offload=kv_offload
             )
 
-        return cls(model)
+        return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path)
 
     @property
     def model_hash(self) -> str:
@@ -226,6 +227,7 @@ def model_hash(self) -> str:
         mhash = hashlib.sha256()
         mhash.update(to_hashable(self.model.config.to_diff_dict()))
         mhash.update(to_hashable(self._transform_names()))
+        mhash.update(to_hashable(self.pretrained_model_name_or_path))
         mhash = mhash.hexdigest()[:16]
         return mhash
 
@@ -533,7 +535,6 @@ def __init__(
         self.config = model.config
         self.vision_model = QEffVisionEncoderForTextImageToTextModel(model)
         self.lang_model = QEffCausalLMForTextImageToTextModel(model)
-
         self.input_shapes, self.output_names = None, None
 
     @property
@@ -1299,6 +1300,7 @@ def __init__(
         model: nn.Module,
         continuous_batching: bool = False,
         qaic_config: Optional[dict] = None,
+        is_tlm: bool = False,
         **kwargs,
     ):
         model_class_name = model.__class__.__name__
@@ -1319,13 +1321,19 @@ def __init__(
             )
 
         super().__init__(model)
-
+        # breakpoint()
         # Set use_cache=True to get KV values as output during ONNX export
         self.model.config.use_cache = True
         self.num_layers = model.config.num_hidden_layers
         self.continuous_batching = continuous_batching
         self.model, transformed = SpDTransform.apply(self.model, qaic_config, **kwargs)
         self.is_tlm = transformed
+        self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)
+
+        if is_tlm:
+            # TODO: Consider always applying this transform and, in PyTorch, defaulting the indices to the last indices
+            self.model, transformed = SpDTransform.apply(self.model)
+        self.is_tlm = is_tlm
 
     @property
     def model_name(self) -> str:
@@ -1399,11 +1407,11 @@ def from_pretrained(
             return MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP[model.__class__.__name__](
                 model, kv_offload=kv_offload
             )
-
         return cls(
             model,
             continuous_batching=continuous_batching,
             qaic_config=qaic_config,
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
             **kwargs,
         )
 
@@ -1415,6 +1423,7 @@ def model_hash(self) -> str:
         mhash.update(to_hashable({"continuous_batching": self.continuous_batching}))
         mhash.update(to_hashable({"is_tlm": self.is_tlm}))
         mhash.update(to_hashable(self._transform_names()))
+        mhash.update(to_hashable(self.pretrained_model_name_or_path))
         mhash = mhash.hexdigest()[:16]
         return mhash
 
@@ -1755,6 +1764,7 @@ def __init__(self, model: nn.Module, **kwargs):
         super().__init__(model)
         self.model.config.use_cache = True
         self.num_layers = model.config.num_hidden_layers
+        self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)
 
     @property
     def model_hash(self) -> str:
@@ -1768,6 +1778,7 @@ def model_hash(self) -> str:
         mhash = hashlib.sha256()
         mhash.update(to_hashable(self.model.config.to_diff_dict()))
         mhash.update(to_hashable(self._transform_names()))
+        mhash.update(to_hashable(self.pretrained_model_name_or_path))
         mhash = mhash.hexdigest()[:16]
         return mhash
 

@@ -123,7 +123,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf)
 
     is_tlm = False if num_speculative_tokens is None else True
-    qeff_model = QEFFAutoModelForCausalLM(model_hf, is_tlm=is_tlm)
+    qeff_model = QEFFAutoModelForCausalLM(model_hf, is_tlm=is_tlm, pretrained_model_name_or_path=model_name)
 
     pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)
 
@@ -183,7 +183,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf)
     pytorch_hf_tokens = np.vstack(pytorch_hf_tokens)
 
-    qeff_model = QEFFAutoModelForCausalLM(model_hf, continuous_batching=True, is_tlm=is_tlm)
+    qeff_model = QEFFAutoModelForCausalLM(
+        model_hf, continuous_batching=True, is_tlm=is_tlm, pretrained_model_name_or_path=model_name
+    )
     onnx_model_path = qeff_model.export()
 
     if not get_available_device_id():
@@ -219,7 +221,7 @@ def test_causal_lm_export_with_deprecated_api(model_name):
     model_config["n_layer"] = 1
     model, _ = load_causal_lm_model(model_config)
     tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
-    qeff_model = QEFFAutoModelForCausalLM(model)
+    qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name)
     new_api_onnx_model_path = qeff_model.export()
     _, old_api_onnx_model_path = qualcomm_efficient_converter(
         model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer

@@ -47,7 +47,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     pt_outputs = pt_model(**inputs)
     pt_embeddings = pt_outputs[0][0].detach().numpy()
     # Pytorch transformed model
-    qeff_model = QEFFAutoModel(pt_model)
+    qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name)
     qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
     qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy()
     mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings))

@@ -314,7 +314,7 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(
 
     pytorch_hf_tokens = run_seq2seq_pytorch_hf(model_hf, processor, data, sample_rate, ctx_len)
 
-    qeff_model = QEFFAutoModelForSpeechSeq2Seq(model_hf)
+    qeff_model = QEFFAutoModelForSpeechSeq2Seq(model_hf, pretrained_model_name_or_path=model_name)
 
     pytorch_kv_tokens = run_seq2seq_pytorch_with_kv(qeff_model, processor, data, sample_rate, ctx_len)