
Early fusion dual qpc modification for index-based embedding interleaving. #438

Closed
wants to merge 25 commits into from
Commits (25)
7c4ed22
Add Llama4 Multi-Modal Support
vbaddi Apr 29, 2025
602253e
nit: modeling changes
vbaddi Apr 29, 2025
9f183f5
Adding Vision Part and Chunking (#383)
mohiso22 Apr 30, 2025
ecd4b8c
nit: update moe implementation and add sample export/compile script
vbaddi May 2, 2025
eef428e
nit: fix linter for example script
vbaddi May 2, 2025
a149c76
nit: update pytorch transforms to map Llama4TextExperts
vbaddi May 2, 2025
8df2303
nit: update modeling with new freq apply computation and sample mm ex…
vbaddi May 4, 2025
1a16539
nit: update llama4 mm example script
vbaddi May 4, 2025
db4fc4e
nit: update modeling to avoid >2GiB issue in Onnx, rope max-position
vbaddi May 6, 2025
e8a36f3
Added pytorch transform for the split_gate_up_weights and removed exa…
quic-amitraj May 8, 2025
b6b6e3d
Ruff Check and format
quic-amitraj May 8, 2025
240bc32
Minor fixes-1
quic-amitraj May 8, 2025
517f3fd
Added logger for new transform
quic-amitraj May 8, 2025
941b272
fixed Llama4 MOE accuracy bug
ochougul May 18, 2025
307b655
Updating index method in Wrappers (#410)
mohiso22 May 19, 2025
7cccc33
nit: add position_ids to attn_scales instead of cache_position in use…
vbaddi May 20, 2025
21439a9
Minor Fixes (#421)
mohiso22 May 21, 2025
53d3314
Updating Specialization and modeling auto files
mohiso22 May 21, 2025
ac31c9e
Fix for Multi Image Chunking
mohiso22 May 23, 2025
e877f9f
Adding SingleQPC
mohiso22 Jun 1, 2025
77afbbc
Rebase and Minor Fixes
mohiso22 Jun 9, 2025
502c30b
Addressed Comments
mohiso22 Jun 9, 2025
5965426
Modified Early Fusin VLMs to enable index based interleaving of embed…
quic-dhirajku Jun 10, 2025
96bc0dc
Renamed vision_idx to image_idx
quic-dhirajku Jun 10, 2025
c6634bf
Merge branch 'main' into early_fusion_dual_qpc
quic-rishinr Jun 10, 2025
3 changes: 2 additions & 1 deletion QEfficient/base/modeling_qeff.py
@@ -20,7 +20,7 @@
import torch

from QEfficient.base.onnx_transforms import OnnxTransform
from QEfficient.base.pytorch_transforms import PytorchTransform
from QEfficient.base.pytorch_transforms import PytorchTransform, append_tranform
from QEfficient.compile.qnn_compiler import compile as qnn_compile
from QEfficient.generation.cloud_infer import QAICInferenceSession
from QEfficient.utils import constants, dump_qconfig
@@ -46,6 +46,7 @@ class QEFFBaseModel(ABC):
def _transform_names(cls) -> List[str]:
return [x.__name__ for x in cls._pytorch_transforms + cls._onnx_transforms]

@append_tranform
def __init__(self, model: torch.nn.Module) -> None:
super().__init__()
self.model = model
70 changes: 70 additions & 0 deletions QEfficient/base/pytorch_transforms.py
@@ -9,6 +9,8 @@

from torch import nn

from QEfficient.utils.logging_utils import logger


class PytorchTransform:
"""
@@ -110,3 +112,71 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
transformed = True

return model, transformed


class SplitGateUpWeightsTransform(PytorchTransform):
"""
split fused Gate+Up weights and copy into the model

For every transformer layer inside `model`:
• expects <PREFIX>.experts.gate_up_proj in the *source* `sd`
• copies halves into
<PREFIX>.experts.gate_proj <-- Gate [E,H,I]
<PREFIX>.experts.up_proj <-- Up [E,H,I]
"""

@classmethod
def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
transformed = False

model_tmp = model.language_model if hasattr(model, "language_model") else model

num_layers = len(model_tmp.model.layers)
delete_fused_key = True
sd = model_tmp.state_dict()
for layer_idx in range(num_layers):
# ---- build the textual prefix once per layer ----------
prefix = f"model.layers.{layer_idx}.feed_forward.experts."

fused_key = prefix + "gate_up_proj"
gate_key = prefix + "gate_proj"
up_key = prefix + "up_proj"

# ---- split [E,H,2I] → two [E,H,I] tensors ----------------------
fused = sd[fused_key] # [E, H, 2I] (no .weight here)
E, H, two_I = fused.shape
ffn_dim = two_I // 2
gate, up = fused.split(ffn_dim, dim=-1) # views – no copy

experts = model_tmp.model.layers[layer_idx].feed_forward.experts
experts.gate_proj.data.copy_(gate)
experts.up_proj.data.copy_(up)

# ---- update the state-dict so load_state_dict sees the right keys
sd[gate_key] = gate
sd[up_key] = up

if delete_fused_key:
del sd[fused_key]

logger.info(f"[layer {layer_idx:02d}] loaded gate_proj & up_proj from fused tensor (shape {fused.shape})")
transformed = True

if hasattr(model, "language_model"):
model.language_model = model_tmp
else:
model = model_tmp
return model, transformed


VLM_SPLIT_GATE_UP_WEIGHTS = ["Llama4ForConditionalGeneration", "Llama4TextModel"]


def append_tranform(func):
def wrapper(*args, **kwargs):
model_class = args[1].model.__class__.__name__ if hasattr(args[1], "model") else args[1].__class__.__name__
if model_class in VLM_SPLIT_GATE_UP_WEIGHTS:
args[0]._pytorch_transforms.append(SplitGateUpWeightsTransform)
return func(*args, **kwargs)

return wrapper
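
For reference, a minimal standalone sketch of the split that SplitGateUpWeightsTransform performs, using toy shapes (E, H, I below are illustrative placeholders, not values taken from any Llama4 config):

import torch

# Toy stand-in for a fused MoE projection: E experts, hidden size H,
# gate and up halves concatenated on the last axis -> [E, H, 2I].
E, H, I = 4, 8, 16
fused = torch.randn(E, H, 2 * I)

# The split performed per layer: two [E, H, I] views, no copy yet.
gate, up = fused.split(I, dim=-1)

# Copy into destination parameters, mirroring experts.gate_proj / experts.up_proj.
gate_proj = torch.nn.Parameter(torch.empty(E, H, I))
up_proj = torch.nn.Parameter(torch.empty(E, H, I))
gate_proj.data.copy_(gate)
up_proj.data.copy_(up)

# Re-concatenating the halves recovers the fused tensor exactly.
assert torch.equal(torch.cat([gate_proj.data, up_proj.data], dim=-1), fused)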
31 changes: 21 additions & 10 deletions QEfficient/transformers/models/internvl/modeling_internvl.py
@@ -1,6 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
@@ -31,21 +31,23 @@ def __init__(self, model):
self.config = self.model.language_model.config
self.language_model = self.model.language_model

def forward(self, input_ids, vision_embeds, position_ids, past_key_values):
def forward(self, input_ids, vision_embeds, position_ids, image_idx, past_key_values):
input_embeds = self.model.language_model.get_input_embeddings()(input_ids)
B, N, C = input_embeds.shape
image_input_embeds = input_embeds.reshape(B * N, C)
image_input_ids = input_ids.reshape(B * N)
selected = image_input_ids == constants.INTERN_IMG_CONTEXT_TOKEN
indices1 = selected.unsqueeze(0).to(torch.int64).cumsum(1) - 1
indices1 = torch.where(indices1 != -1, indices1 + image_idx, indices1)
indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1)
image_features_expanded = vision_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
image_input_embeds = torch.where(selected.unsqueeze(0).unsqueeze(-1), image_features_expanded, input_embeds)
inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), input_embeds, image_input_embeds)
outputs = self.model.language_model(
inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, use_cache=True
)
return outputs.logits, vision_embeds, outputs.past_key_values
image_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0)
return outputs.logits, vision_embeds, image_idx, outputs.past_key_values


class QEffInternVLModel(nn.Module):
@@ -81,13 +83,14 @@ def get_specializations(
logger.warning("Setting img_size to be 448, as it was neither passed nor found in vision_config")
if img_size != constants.INTERN_IMG_SIZE and kv_offload:
raise NotImplementedError("Image Size other than 448 is not supported for Intern models yet.")

per_patch_embed_size = (img_size // self.config.vision_config.patch_size * self.config.downsample_ratio) ** 2
vision_size = int(num_patches * per_patch_embed_size)
vision = [
{
"batch_size": batch_size,
"num_patches": num_patches,
"img_size": img_size,
"seq_len": prefill_seq_len,
"ctx_len": ctx_len,
}
]
lang = [
@@ -97,13 +100,15 @@
"ctx_len": ctx_len,
"num_patches": num_patches,
"img_size": img_size,
"vision_size": vision_size,
},
{
"batch_size": batch_size,
"seq_len": "1",
"ctx_len": ctx_len,
"num_patches": num_patches,
"img_size": img_size,
"vision_size": vision_size,
},
]

@@ -122,7 +127,7 @@ def get_onnx_dynamic_axes(self, kv_offload: bool = False):
lang_dynamic_axes = {}
lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"}
lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"}
lang_dynamic_axes["vision_embeds"] = {0: "num_patches"}
lang_dynamic_axes["vision_embeds"] = {0: "batch_size", 1: "vision_size"}
vision_dynamic_axes["pixel_values"] = {0: "num_patches", 2: "img_size", 3: "img_size"}

pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"}
@@ -148,10 +153,12 @@ def get_output_names(self, kv_offload: bool = False):
output_names = {}
if kv_offload:
lang_output_names.insert(1, "vision_embeds_RetainedState")
lang_output_names.insert(2, "image_idx_output")
output_names["vision"] = vision_output_names
output_names["lang"] = lang_output_names
else:
lang_output_names.insert(1, "pixel_values_RetainedState")
lang_output_names.insert(2, "image_idx_output")
return lang_output_names
return output_names

@@ -176,8 +183,8 @@ def get_dummy_inputs(self, kv_offload: bool = False):
inputs_shapes = {}
inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
inputs_shapes["vision_embeds"] = (
constants.INTERN_NUM_PATCHES,
constants.INTERN_FEATURE_SIZE,
constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
computed_feature_size,
self.language_model.config.hidden_size,
)
inputs_shapes["position_ids"] = (
@@ -202,6 +209,7 @@ def get_dummy_inputs(self, kv_offload: bool = False):
.view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
.repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
)
lang_inputs["image_idx"] = torch.zeros((1, 1), dtype=torch.int64)

# Add data for KV
kv_cache_shape = get_padding_shape_from_config(
@@ -225,22 +233,25 @@

return inputs

def forward(self, input_ids, pixel_values, position_ids, past_key_values):
def forward(self, input_ids, pixel_values, position_ids, image_idx, past_key_values):
input_embeds = self.language_model.get_input_embeddings()(input_ids)
vision_embeds = self.extract_feature(pixel_values)
B, N, C = input_embeds.shape
image_input_embeds = input_embeds.reshape(B * N, C)
image_input_ids = input_ids.reshape(B * N)
selected = image_input_ids == constants.INTERN_IMG_CONTEXT_TOKEN
indices1 = selected.unsqueeze(0).to(torch.int64).cumsum(1) - 1
indices1 = torch.where(indices1 != -1, indices1 + image_idx, indices1)
indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1)
image_features_expanded = vision_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
image_input_embeds = torch.where(selected.unsqueeze(0).unsqueeze(-1), image_features_expanded, input_embeds)
inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), input_embeds, image_input_embeds)
outputs = self.language_model(
inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, use_cache=True
)
return outputs.logits, pixel_values, outputs.past_key_values
next_image_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0)
image_idx = torch.where(image_idx < next_image_idx, next_image_idx, image_idx)
return outputs.logits, pixel_values, image_idx, outputs.past_key_values

def get_inputs_info(self):
return [
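
The index-based interleaving above can be illustrated with a self-contained sketch (token id 999, the tensor sizes, and the starting offset are toy values, not the model's): image_idx carries the position of the first unconsumed vision embedding, so placeholder tokens in each prefill chunk pull the correct slice, and the returned image_idx lets the next chunk resume where this one stopped.

import torch

C, IMG_TOKEN = 4, 999                                     # toy embedding width and placeholder id
input_ids = torch.tensor([[1, 999, 999, 2, 3, 999, 4, 5, 6, 7]])
input_embeds = torch.zeros(1, input_ids.shape[1], C)      # text embeddings for this chunk
vision_embeds = torch.arange(6 * C, dtype=torch.float32).reshape(1, 6, C)
image_idx = torch.tensor([[3]])                           # 3 embeddings were consumed by earlier chunks

selected = (input_ids == IMG_TOKEN).reshape(-1)                          # placeholder mask, [B*N]
indices1 = selected.unsqueeze(0).to(torch.int64).cumsum(1) - 1           # running index per placeholder
indices1 = torch.where(indices1 != -1, indices1 + image_idx, indices1)   # shift by what earlier chunks used
indices0 = torch.arange(1).view(-1, 1)
picked = vision_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
merged = torch.where(selected.unsqueeze(0).unsqueeze(-1), picked, input_embeds)

next_image_idx = (indices1.max() + 1).reshape(1, 1)
print(next_image_idx)  # tensor([[6]]) -> the next chunk starts at vision embedding 6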
6 changes: 6 additions & 0 deletions QEfficient/transformers/models/llama4/__init__.py
@@ -0,0 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------