quic
diff --git a/‎.github/CODEOWNERS
+1-1 b/‎.github/CODEOWNERS
+1-1
diff --git a/‎QEfficient/base/common.py
+5-4 b/‎QEfficient/base/common.py
+5-4
diff --git a/‎QEfficient/base/modeling_qeff.py
+27-103 b/‎QEfficient/base/modeling_qeff.py
+27-103
diff --git a/‎QEfficient/cloud/compile.py
+21-9 b/‎QEfficient/cloud/compile.py
+21-9
@@ -7,6 +7,6 @@
 
 # Default owners
 # review when someone opens a pull request and assign appropriate reviewer
-* @quic-rishinr @ochougul @quic-hemagnih
+* @quic-rishinr @ochougul @quic-hemagnih @quic-amitraj
 pyproject.toml @carlstreeter-quic
 
@@ -16,10 +16,9 @@
 from typing import Any
 
 from transformers import AutoConfig
-from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 
 from QEfficient.base.modeling_qeff import QEFFBaseModel
-from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.transformers.modeling_utils import MODEL_CLASS_MAPPING
 from QEfficient.utils import login_and_download_hf_lm
 
 
@@ -44,8 +43,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) ->
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
         architecture = config.architectures[0] if config.architectures else None
 
-        if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
-            model_class = QEFFAutoModelForCausalLM
+        class_name = MODEL_CLASS_MAPPING.get(architecture)
+        if class_name:
+            # For a dotted name, __import__ returns the top-level package unless a non-empty
+            # fromlist is given. Passing fromlist makes it return the modeling_auto submodule,
+            # so the lookup below does not rely on the class being re-exported from ``QEfficient``.
+            module = __import__("QEfficient.transformers.models.modeling_auto", fromlist=[class_name])
+            model_class = getattr(module, class_name)
         else:
             raise NotImplementedError(
                 f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!"
 
@@ -24,7 +24,6 @@
 from QEfficient.compile.qnn_compiler import compile as qnn_compile
 from QEfficient.generation.cloud_infer import QAICInferenceSession
 from QEfficient.utils import constants, dump_qconfig
-from QEfficient.utils._utils import load_json
 from QEfficient.utils.cache import QEFF_HOME, to_hashable
 
 logger = logging.getLogger(__name__)
@@ -98,7 +97,11 @@ def compile(self, *args, **kwargs) -> Path:
             :num_cores (int): Number of cores to utilize in each device ``Defaults to 16``.
             :mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``.
             :mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``.
-            :compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
+            :compiler_options: Pass any compiler option as input.
+            The following flags can be passed in compiler_options to enable the QNN compilation path:
+                :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False`` if not passed.
+                :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None`` if not passed.
+            For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
                 - aic_num_cores=16 -> -aic-num-cores=16
                 - convert_to_fp16=True -> -convert-to-fp16
 
@@ -217,10 +220,13 @@ def _compile(
         onnx_path: Optional[str] = None,
         compile_dir: Optional[str] = None,
         *,
+        mxint8_kv_cache: bool = False,
         specializations: Optional[List[Dict[str, int]]] = None,
         custom_io: Optional[Dict[str, str]] = None,
         mdp_ts_num_devices: int = 1,
         num_speculative_tokens: Optional[int] = None,
+        enable_qnn: Optional[bool] = False,
+        qnn_config: Optional[str] = None,
         **compiler_options,
     ) -> str:
         """
@@ -229,10 +235,13 @@ def _compile(
         Args:
             :onnx_path (str): Onnx file to compile
             :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
+            :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
             :specializations (list): List of specializations to compile for
             :custom_io (dict): Custom IO to specify the input and outputs in different formats than default
             :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
             :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
+            :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
+            :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
             :compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
                 - aic_num_cores=16 -> -aic-num-cores=16
                 - convert_to_fp16=True -> -convert-to-fp16
@@ -245,6 +254,22 @@ def _compile(
         qpc_path = compile_dir / "qpc"
         if not onnx_path.is_file():
             raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")
+
+        if enable_qnn:
+            self.qpc_path = qnn_compile(
+                onnx_path=onnx_path,
+                qpc_base_path=compile_dir,
+                specializations=specializations,
+                custom_io=custom_io,
+                device_group=list(range(mdp_ts_num_devices)),
+                num_cores=compiler_options.get("aic_num_cores", 16),
+                mxfp6=compiler_options.get("mxfp6_matmul", False),
+                mxint8=mxint8_kv_cache,
+                qnn_config=qnn_config,
+            )
+
+            return self.qpc_path
+
         command = constants.COMPILER + [f"-m={onnx_path}"]
         if mdp_ts_json_path := compiler_options.pop("mdp_ts_json_path", None):
             mdp_ts_num_devices = None
@@ -339,104 +364,3 @@ def _compile(
         self.qpc_path = qpc_path
 
         return qpc_path
-
-    @dump_qconfig
-    def _qnn_compile(
-        self,
-        onnx_path: Optional[str] = None,
-        compile_dir: Optional[str] = None,
-        *,
-        specializations: Optional[List[Dict[str, int]]] = None,
-        prefill_seq_len: int = 32,
-        ctx_len: int = 128,
-        batch_size: int = 1,
-        full_batch_size: Optional[int] = None,
-        mdp_ts_num_devices: int = 1,
-        num_cores: int = 16,
-        mxfp6_matmul: bool = False,
-        mxint8_kv_cache: bool = False,
-        qnn_config: Optional[str] = None,
-        kv_cache_batch_size: Optional[int] = None,
-    ) -> str:
-        """
-        Interface for QNN compiler
-
-        Args:
-            :onnx_path (str): Onnx file to compile
-            :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
-            :specializations (list): List of specializations to compile for
-            :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``.
-            :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``.
-            :batch_size (int, optional): Batch size. ``Defaults to 1``.
-            :full_batch_size (int, optional): Continuous batching batch size.
-            :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
-            :num_cores (int): Number of cores used to compile the model.
-            :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``.
-            :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
-            :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
-            :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
-        """
-        if onnx_path is None and self.onnx_path is None:
-            self.export()
-
-        onnx_path = Path(onnx_path or self.onnx_path)
-        compile_dir = Path(compile_dir or onnx_path.parent)
-        qpc_path = compile_dir / "qpc"
-        if not onnx_path.is_file():
-            raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")
-
-        compile_hash = hashlib.sha256(to_hashable("qnn"))
-
-        if specializations is not None:
-            compile_hash.update(to_hashable(specializations))
-
-        if qnn_config is not None:
-            qnn_config_values = load_json(qnn_config)
-            compile_hash.update(to_hashable(qnn_config_values))
-
-        if mdp_ts_num_devices > 1:
-            compile_hash.update(to_hashable({"mdp_ts_num_devices": mdp_ts_num_devices}))
-
-        compile_hash.update(to_hashable({"num_cores": num_cores}))
-        compile_hash.update(to_hashable({"mxfp6_matmul": mxfp6_matmul}))
-        compile_hash.update(to_hashable({"mxint8_kv_cache": mxint8_kv_cache}))
-
-        # Check if already compiled
-        compile_hash = compile_hash.hexdigest()[:16]
-        qpc_path = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
-        if qpc_path.is_dir():
-            if (qpc_path / "programqpc.bin").is_file():
-                self.qpc_path = qpc_path
-                return qpc_path
-            # Probably compilation failure last time, delete directory to start over
-            shutil.rmtree(qpc_path)
-
-        # Write specializations.json file
-        if specializations is not None:
-            specializations_json = compile_dir / "specializations.json"
-            with open(specializations_json, "w") as fp:
-                json.dump(
-                    {"specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]},
-                    fp,
-                    indent=4,
-                )
-
-        qnn_compile(
-            onnx_path=onnx_path,
-            qpc_base_path=compile_dir,
-            num_cores=num_cores,
-            device_group=list(range(mdp_ts_num_devices)),
-            batch_size=batch_size,
-            prompt_len=prefill_seq_len,
-            ctx_len=ctx_len,
-            mxfp6=mxfp6_matmul,
-            mxint8=mxint8_kv_cache,
-            full_batch_size=full_batch_size,
-            qnn_config=qnn_config,
-            qnn_binary_dir=qpc_path,
-            kv_cache_batch_size=kv_cache_batch_size,
-        )
-
-        self.qpc_path = qpc_path
-
-        return qpc_path
@@ -85,17 +85,36 @@
     parser.add_argument(
         "--enable_qnn",
         "--enable-qnn",
-        action="store_true",
+        nargs="?",
+        const=True,
+        type=str,
         default=False,
         help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
              If not provided, the default configuration will be used.\
              Sample Config: QEfficient/compile/qnn_config.json",
     )
-    parser.add_argument(
-        "qnn_config",
-        nargs="?",
-        type=str,
-    )
-    # FIXME(ochougul): Allow extra compilation arguments
-    args = parser.parse_args()
-    QEfficient.compile(**vars(args))
+
+    args, compiler_options = parser.parse_known_args()
+
+    # ``--enable_qnn CONFIG_FILE`` parses to the path string (a bare ``--enable_qnn`` parses to
+    # ``True`` via ``const``); split the string form into the boolean switch plus ``qnn_config``.
+    if isinstance(args.enable_qnn, str):
+        args.qnn_config = args.enable_qnn
+        args.enable_qnn = True
+
+    # Convert unrecognised ``--flag value`` / ``--flag=value`` tokens into keyword arguments.
+    # A flag with no value is passed as ``True``; dashes in the flag name become underscores.
+    compiler_options_dict = {}
+    for i, token in enumerate(compiler_options):
+        if not token.startswith("--"):
+            continue
+        flag, has_inline_value, inline_value = token.lstrip("-").partition("=")
+        key = flag.replace("-", "_")
+        if has_inline_value:
+            value = inline_value
+        elif i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-"):
+            value = compiler_options[i + 1]
+        else:
+            value = True
+        compiler_options_dict[key] = value
+    QEfficient.compile(**vars(args), **compiler_options_dict)