
Commit df36ae1

Merge branch 'quic:main' into pp_ddp
2 parents: c213173 + 38989e9

27 files changed: +1617, -147 lines

.github/CODEOWNERS (+1, -1)

@@ -7,6 +7,6 @@

 # Default owners
 # review when someone opens a pull request and assign appropriate reviewer
-* @quic-rishinr @ochougul @quic-hemagnih
+* @quic-rishinr @ochougul @quic-hemagnih @quic-amitraj
 pyproject.toml @carlstreeter-quic

QEfficient/cloud/finetune.py (+36, -31)

@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------

+import math
 import random
 import warnings

@@ -30,49 +31,44 @@
     get_preprocessed_dataset,
 )
 from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
-from QEfficient.utils._utils import login_and_download_hf_lm
+from QEfficient.utils._utils import get_num_layers_from_config, login_and_download_hf_lm

 try:
     import torch_qaic # noqa: F401
 except ImportError as e:
     print(f"Warning: {e}. Moving ahead without these qaic modules.")


-from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
+from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer

 # Suppress all warnings
 warnings.filterwarnings("ignore")

-def get_device_map_for_llama_70B(dev0, dev1, dev2, dev3, dev4, dev5): # total_num_layers, num_stages
+
+def get_device_map(rank, num_pp_stages, num_layers):
     device_map = {
-        'model.embed_tokens': dev0,
-        'lm_head': dev5,
-        'model.norm': dev5,
-        'model.rotary_emb': dev5
+        "model.embed_tokens": rank * num_pp_stages,
+        "lm_head": rank * num_pp_stages,
+        "model.norm": rank * num_pp_stages + (num_pp_stages - 1),
+        "model.rotary_emb": rank * num_pp_stages + (num_pp_stages - 1),
     }
-    for i in range(80):
-        if i < 14:
-            device_map[f"model.layers.{i}"] = dev0
-        elif i < 28:
-            device_map[f"model.layers.{i}"] = dev1
-        elif i < 42:
-            device_map[f"model.layers.{i}"] = dev2
-        elif i < 56:
-            device_map[f"model.layers.{i}"] = dev3
-        elif i < 70:
-            device_map[f"model.layers.{i}"] = dev4
-        else:
-            device_map[f"model.layers.{i}"] = dev5
+    n_layer_per_stage = math.ceil(num_layers / num_pp_stages)  # number of layers per device 80/6 = 13.3 ~ 14
+    for j in range(num_pp_stages):
+        for i in range(n_layer_per_stage * j, n_layer_per_stage * (j + 1)):
+            if i < num_layers:
+                device_map[f"model.layers.{i}"] = rank * num_pp_stages + j
+
     return device_map


-def setup_distributed_training():
-    torch_device = torch.device("qaic")
+def setup_distributed_training(train_config):
+    torch_device = torch.device(train_config.device)
     assert torch_device.type != "cpu", "Host doesn't support single-node DDP"
     assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}"
-    dist.init_process_group(backend="qccl")
-    # from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
-    #getattr(torch, torch_device.type).set_device(dist.get_rank()*2)
+    dist.init_process_group(backend=train_config.dist_backend)
+    if not train_config.enable_pp:
+        # from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
+        getattr(torch, torch_device.type).set_device(dist.get_rank())


 def main(**kwargs):
@@ -87,6 +83,13 @@ def main(**kwargs):
     # update the configuration for the training process
     train_config = TRAIN_CONFIG()
     update_config(train_config, **kwargs)
+
+    if train_config.enable_ddp or train_config.enable_pp:
+        setup_distributed_training(train_config)
+        if train_config.enable_pp:
+            assert dist.get_world_size() % train_config.num_pp_stages == 0, (
+                "total available devices should be multiple of number of pipeline stages"
+            )
     dataset_config = generate_dataset_config(train_config, kwargs)

     # Set the seeds for reproducibility
@@ -95,7 +98,7 @@ def main(**kwargs):
     np.random.seed(train_config.seed)

     # Load the pre-trained model and setup its configuration
-    # config = AutoConfig.from_pretrained(train_config.model_name)
+    model_config = AutoConfig.from_pretrained(train_config.model_name)
     pretrained_model_path = login_and_download_hf_lm(train_config.model_name)
     if train_config.task_type == "seq_classification":
         model = AutoModelForSequenceClassification.from_pretrained(
@@ -115,9 +118,12 @@ def main(**kwargs):
             if param.requires_grad:
                 param.data = param.data.to(torch.float32)
     else:
-        rank = dist.get_rank()
-
-        device_map = get_device_map_for_llama_70B(rank*6, rank*6+1, rank*6+2, rank*6+3, rank*6+4, rank*6+5)
+        if train_config.enable_pp and train_config.enable_ddp:
+            rank = dist.get_rank()
+            num_layers = get_num_layers_from_config(model_config)
+            device_map = get_device_map(rank, train_config.num_pp_stages, num_layers)
+        else:
+            device_map = "auto"
         model = AutoModelForCausalLM.from_pretrained(
             pretrained_model_path,
             use_cache=False,
@@ -246,7 +252,7 @@ def main(**kwargs):

     # wrap model with DDP
     if train_config.enable_ddp:
-        model = nn.parallel.DistributedDataParallel(model)#, device_ids=[dist.get_rank()])
+        model = nn.parallel.DistributedDataParallel(model)  # , device_ids=[dist.get_rank()])

     _ = train(
         model,
@@ -268,5 +274,4 @@ def main(**kwargs):


 if __name__ == "__main__":
-    setup_distributed_training()
     fire.Fire(main)
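
Note on the new device map: get_device_map spreads the decoder layers of one model replica evenly across the num_pp_stages devices owned by a DDP rank, pinning the embedding table and lm_head to the rank's first device and the final norm and rotary embedding to its last. Below is a minimal, self-contained sketch of the same logic with an illustrative call; the rank, stage count and layer count are made up for the example, not taken from the commit.

import math


def get_device_map(rank, num_pp_stages, num_layers):
    # Embeddings and lm_head go to the rank's first device; final norm and rotary embedding to its last.
    device_map = {
        "model.embed_tokens": rank * num_pp_stages,
        "lm_head": rank * num_pp_stages,
        "model.norm": rank * num_pp_stages + (num_pp_stages - 1),
        "model.rotary_emb": rank * num_pp_stages + (num_pp_stages - 1),
    }
    # Contiguous chunks of ceil(num_layers / num_pp_stages) layers per stage; the last chunk may be shorter.
    n_layer_per_stage = math.ceil(num_layers / num_pp_stages)
    for j in range(num_pp_stages):
        for i in range(n_layer_per_stage * j, n_layer_per_stage * (j + 1)):
            if i < num_layers:
                device_map[f"model.layers.{i}"] = rank * num_pp_stages + j
    return device_map


# Illustrative only: rank 1 of a run with 2 pipeline stages per rank and a 5-layer model
# maps layers 0-2 to device 2 and layers 3-4 (plus norm/rotary_emb) to device 3.
print(get_device_map(rank=1, num_pp_stages=2, num_layers=5))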

QEfficient/finetune/configs/training.py (+2)

@@ -53,6 +53,8 @@ class train_config:
     # profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler

     # dist-related
+    enable_pp: bool = False
+    num_pp_stages: int = 1
     enable_ddp: bool = False
     dist_backend: str = "cpu:gloo,qaic:qccl,cuda:gloo"
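
Both new fields default to "pipeline parallelism off", so existing DDP-only runs keep their behaviour. A rough, self-contained sketch of how finetune.py consumes them follows; the dataclass only mirrors the relevant fields and is not the repo's train_config, and the world size is illustrative.

from dataclasses import dataclass


@dataclass
class TrainConfigSketch:  # stand-in for QEfficient.finetune.configs.training.train_config
    device: str = "qaic"
    enable_pp: bool = False   # new field: enable pipeline parallelism
    num_pp_stages: int = 1    # new field: devices per pipeline (per DDP rank)
    enable_ddp: bool = False
    dist_backend: str = "cpu:gloo,qaic:qccl,cuda:gloo"


cfg = TrainConfigSketch(enable_ddp=True, enable_pp=True, num_pp_stages=2)

# Mirrors the assertion added in finetune.py: the world size must split evenly into pipelines.
world_size = 4  # illustrative; in finetune.py this comes from dist.get_world_size()
assert world_size % cfg.num_pp_stages == 0, "total available devices should be multiple of number of pipeline stages"
print(f"{world_size // cfg.num_pp_stages} DDP replicas x {cfg.num_pp_stages} pipeline stages")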

QEfficient/transformers/models/internvl/modeling_internvl.py (+13, -13)

@@ -20,8 +20,8 @@ def __init__(self, model):
         self.model = model

     def forward(self, pixel_values):
-        vit_embeds = self.model.extract_feature(pixel_values)
-        return vit_embeds
+        vision_embeds = self.model.extract_feature(pixel_values)
+        return vision_embeds


 class QEffInternDecoderWrapper(nn.Module):
@@ -31,21 +31,21 @@ def __init__(self, model):
         self.config = self.model.language_model.config
         self.language_model = self.model.language_model

-    def forward(self, input_ids, vit_embeds, position_ids, past_key_values):
+    def forward(self, input_ids, vision_embeds, position_ids, past_key_values):
         input_embeds = self.model.language_model.get_input_embeddings()(input_ids)
         B, N, C = input_embeds.shape
         image_input_embeds = input_embeds.reshape(B * N, C)
         image_input_ids = input_ids.reshape(B * N)
         selected = image_input_ids == constants.INTERN_IMG_CONTEXT_TOKEN
         indices1 = selected.unsqueeze(0).to(torch.int64).cumsum(1) - 1
         indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1)
-        image_features_expanded = vit_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
+        image_features_expanded = vision_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
         image_input_embeds = torch.where(selected.unsqueeze(0).unsqueeze(-1), image_features_expanded, input_embeds)
         inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), input_embeds, image_input_embeds)
         outputs = self.model.language_model(
             inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, use_cache=True
         )
-        return outputs.logits, vit_embeds, outputs.past_key_values
+        return outputs.logits, vision_embeds, outputs.past_key_values


 class QEffInternVLModel(nn.Module):
@@ -122,7 +122,7 @@ def get_onnx_dynamic_axes(self, kv_offload: bool = False):
         lang_dynamic_axes = {}
         lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"}
         lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"}
-        lang_dynamic_axes["vit_embeds"] = {0: "num_patches"}
+        lang_dynamic_axes["vision_embeds"] = {0: "num_patches"}
         vision_dynamic_axes["pixel_values"] = {0: "num_patches", 2: "img_size", 3: "img_size"}

         pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"}
@@ -139,15 +139,15 @@ def get_onnx_dynamic_axes(self, kv_offload: bool = False):
         return dynamic_axes

     def get_output_names(self, kv_offload: bool = False):
-        vision_output_names = ["vit_embeds"]
+        vision_output_names = ["vision_embeds"]
         lang_output_names = ["logits"]
         for i in range(self.language_model.config.num_hidden_layers):
             for kv in ["key", "value"]:
                 lang_output_names.append(f"past_{kv}.{i}_RetainedState")

         output_names = {}
         if kv_offload:
-            lang_output_names.insert(1, "vit_embeds_RetainedState")
+            lang_output_names.insert(1, "vision_embeds_RetainedState")
             output_names["vision"] = vision_output_names
             output_names["lang"] = lang_output_names
         else:
@@ -175,7 +175,7 @@ def get_dummy_inputs(self, kv_offload: bool = False):
         # Define shapes
         inputs_shapes = {}
         inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
-        inputs_shapes["vit_embeds"] = (
+        inputs_shapes["vision_embeds"] = (
             constants.INTERN_NUM_PATCHES,
             constants.INTERN_FEATURE_SIZE,
             self.language_model.config.hidden_size,
@@ -196,7 +196,7 @@ def get_dummy_inputs(self, kv_offload: bool = False):
         lang_inputs = {}
         vision_inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32)
         lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64)
-        lang_inputs["vit_embeds"] = torch.zeros((inputs_shapes["vit_embeds"]), dtype=torch.float32)
+        lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=torch.float32)
         lang_inputs["position_ids"] = (
             torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
             .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
@@ -220,21 +220,21 @@ def get_dummy_inputs(self, kv_offload: bool = False):
             inputs["vision"] = vision_inputs
             inputs["lang"] = lang_inputs
         else:
-            lang_inputs.pop("vit_embeds")
+            lang_inputs.pop("vision_embeds")
             inputs = {**vision_inputs, **lang_inputs}

         return inputs

     def forward(self, input_ids, pixel_values, position_ids, past_key_values):
         input_embeds = self.language_model.get_input_embeddings()(input_ids)
-        vit_embeds = self.extract_feature(pixel_values)
+        vision_embeds = self.extract_feature(pixel_values)
         B, N, C = input_embeds.shape
         image_input_embeds = input_embeds.reshape(B * N, C)
         image_input_ids = input_ids.reshape(B * N)
         selected = image_input_ids == constants.INTERN_IMG_CONTEXT_TOKEN
         indices1 = selected.unsqueeze(0).to(torch.int64).cumsum(1) - 1
         indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1)
-        image_features_expanded = vit_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
+        image_features_expanded = vision_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
         image_input_embeds = torch.where(selected.unsqueeze(0).unsqueeze(-1), image_features_expanded, input_embeds)
         inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), input_embeds, image_input_embeds)
         outputs = self.language_model(
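
Aside from the vit_embeds to vision_embeds rename, the embedding-merge logic is unchanged: a cumulative sum over the image-context-token mask gives each image token its row index into the flattened vision embeddings, and torch.where scatters those rows into the text embeddings. A stripped-down sketch of that indexing trick with toy shapes (plain tensors, no model classes, values chosen only for illustration):

import torch

B, N, C = 1, 6, 4        # batch, sequence length, hidden size (toy values)
IMG_CONTEXT_TOKEN = 99   # stand-in for constants.INTERN_IMG_CONTEXT_TOKEN

input_ids = torch.tensor([[5, 99, 99, 7, 99, 8]])
input_embeds = torch.zeros(B, N, C)                                      # pretend text embeddings
vision_embeds = torch.arange(3 * C, dtype=torch.float32).reshape(3, C)   # one row per image token

selected = input_ids.reshape(B * N) == IMG_CONTEXT_TOKEN
indices1 = selected.unsqueeze(0).to(torch.int64).cumsum(1) - 1           # running count of image tokens
indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1)
image_features_expanded = vision_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
merged = torch.where(selected.unsqueeze(0).unsqueeze(-1), image_features_expanded, input_embeds)

print(merged[0, 1], merged[0, 2], merged[0, 4])  # rows 0, 1 and 2 of vision_embeds
print(merged[0, 0])                              # non-image position keeps its text embedding (zeros)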

QEfficient/transformers/models/llava/modeling_llava.py (+15, -15)

@@ -38,9 +38,9 @@ def forward(self, pixel_values):
             selected_image_feature = selected_image_feature
         else:
             raise ValueError(f"Unexpected select feature strategy: {self.model.config.vision_feature_select_strategy}")
-        image_features = self.model.multi_modal_projector(selected_image_feature)
+        vision_embeds = self.model.multi_modal_projector(selected_image_feature)

-        return image_features
+        return vision_embeds


 class QEFFLlavaDecoderWrapper(nn.Module):
@@ -50,21 +50,21 @@ def __init__(self, model):
         self.config = self.model.config
         self.language_model = self.model.language_model

-    def forward(self, input_ids, image_features, position_ids, past_key_values):
+    def forward(self, input_ids, vision_embeds, position_ids, past_key_values):
         inputs_embeds = self.model.get_input_embeddings()(input_ids)
-        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+        vision_embeds = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
         mask = input_ids == self.model.config.image_token_index
         indices1 = mask.to(torch.int64).cumsum(1) - 1
         indices0 = torch.arange(mask.shape[0]).view(-1, 1)
-        image_features_expanded = image_features[indices0, indices1]
-        inputs_embeds = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds)
+        vision_embeds_expanded = vision_embeds[indices0, indices1]
+        inputs_embeds = torch.where(mask.unsqueeze(-1), vision_embeds_expanded, inputs_embeds)
         outputs = self.model.language_model(
             inputs_embeds=inputs_embeds,
             position_ids=position_ids,
             past_key_values=past_key_values,
         )

-        return outputs.logits, image_features, outputs.past_key_values
+        return outputs.logits, vision_embeds, outputs.past_key_values


 class QEffLlavaForConditionalGeneration(LlavaForConditionalGeneration):
@@ -86,14 +86,14 @@ def forward(self, input_ids, position_ids, pixel_values, past_key_values):
             selected_image_feature = selected_image_feature
         else:
             raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
-        image_features = self.multi_modal_projector(selected_image_feature)
-        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+        vision_embeds = self.multi_modal_projector(selected_image_feature)
+        vision_embeds = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype)

         mask = input_ids == self.config.image_token_index
         indices1 = mask.to(torch.int64).cumsum(1) - 1
         indices0 = torch.arange(mask.shape[0]).view(-1, 1)
-        image_features_expanded = image_features[indices0, indices1]
-        image_inputs_embeds = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds)
+        vision_embeds_expanded = vision_embeds[indices0, indices1]
+        image_inputs_embeds = torch.where(mask.unsqueeze(-1), vision_embeds_expanded, inputs_embeds)
         # *where to skip image encoder for decode*
         inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_inputs_embeds)
         outputs = self.language_model(
@@ -118,7 +118,7 @@ def get_dummy_inputs(self, kv_offload: bool = False, **kwargs):
         }
         lang_inputs = {
             "input_ids": torch.ones((BS, SEQ_LEN), dtype=torch.int64),
-            "image_features": torch.ones((BS, 576, self.language_model.config.hidden_size), dtype=torch.float32),
+            "vision_embeds": torch.ones((BS, 576, self.language_model.config.hidden_size), dtype=torch.float32),
             "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64),
         }
         lang_inputs["position_ids"] = lang_inputs.pop("attention_mask").cumsum(1)
@@ -137,7 +137,7 @@ def get_dummy_inputs(self, kv_offload: bool = False, **kwargs):
             inputs["vision"] = vision_inputs
             inputs["lang"] = lang_inputs
         else:
-            lang_inputs.pop("image_features")
+            lang_inputs.pop("vision_embeds")
             inputs = {**vision_inputs, **lang_inputs}
         return inputs

@@ -218,15 +218,15 @@ def get_onnx_dynamic_axes(self, kv_offload: bool = False):
         return dynamic_axes

     def get_output_names(self, kv_offload: bool = False):
-        vision_output_names = ["image_features"]
+        vision_output_names = ["vision_embeds"]
         lang_output_names = ["logits"]
         for i in range(self.language_model.config.num_hidden_layers):
             for kv in ["key", "value"]:
                 lang_output_names.append(f"past_{kv}.{i}_RetainedState")

         output_names = {}
         if kv_offload:
-            lang_output_names.insert(1, "image_features_RetainedState")
+            lang_output_names.insert(1, "vision_embeds_RetainedState")
             output_names["vision"] = vision_output_names
             output_names["lang"] = lang_output_names
         else:
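
The same rename carries into the exported graph interface: with kv_offload set, the vision graph produces "vision_embeds" and the language graph retains it as "vision_embeds_RetainedState". A toy sketch of how the renamed output lists end up looking; the layer count is illustrative and the non-offload branch is omitted here.

num_hidden_layers = 2  # illustrative
vision_output_names = ["vision_embeds"]
lang_output_names = ["logits"]
for i in range(num_hidden_layers):
    for kv in ["key", "value"]:
        lang_output_names.append(f"past_{kv}.{i}_RetainedState")

# kv_offload case: vision and language outputs are reported separately,
# and the language graph retains the vision embeddings between invocations.
lang_output_names.insert(1, "vision_embeds_RetainedState")
output_names = {"vision": vision_output_names, "lang": lang_output_names}
print(output_names)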

QEfficient/transformers/models/mllama/modeling_mllama.py (+5, -3)

@@ -161,8 +161,8 @@ def forward(
         value_states_new = torch.index_put(value_states_old, indices, value_states)

         # Select old or new image KV states based on q_len
-        key_states = torch.where(q_len == 1, key_states_old, key_states_new)
-        value_states = torch.where(q_len == 1, value_states_old, value_states_new)
+        key_states = torch.where(torch.tensor(q_len == 1), key_states_old, key_states_new)
+        value_states = torch.where(torch.tensor(q_len == 1), value_states_old, value_states_new)

         # Update the image cache
         past_key_value.key_cache[self.layer_idx] = key_states
@@ -924,7 +924,7 @@ def forward(
             return_dict=return_dict,
             cache_position=cache_position,
         )
-
+        outputs["pixel_values"] = pixel_values
         return outputs

     def get_dummy_inputs(self, kv_offload: bool = False):
@@ -1092,6 +1092,8 @@ def get_output_names(self, kv_offload: bool = False):
             "logits",
             *[f"past_{kv}.{i}_RetainedState" for i in range(num_hidden_layers) for kv in ["key", "value"]],
         ]
+        if not kv_offload:
+            lang_output_names.append("pixel_values_RetainedState")

         output_names = {}
         if kv_offload:
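
The mllama change wraps the q_len comparison in torch.tensor, so torch.where receives an explicit 0-d boolean tensor that broadcasts against the full key/value tensors (torch.where expects a tensor condition). A tiny sketch of that selection pattern; shapes and values are illustrative.

import torch

q_len = 1  # illustrative; in the attention forward this is the current query length

key_states_old = torch.zeros(1, 2, 4)  # stand-in for the cached image KV states
key_states_new = torch.ones(1, 2, 4)   # stand-in for the freshly computed KV states

# A 0-d bool tensor broadcasts over the whole tensors, picking one of them elementwise.
cond = torch.tensor(q_len == 1)
key_states = torch.where(cond, key_states_old, key_states_new)

assert torch.equal(key_states, key_states_old)  # decode step (q_len == 1) keeps the cached states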
