
Commit 27556fe

Merge branch 'main' into use_logger
Signed-off-by: Mamta Singh <[email protected]>
2 parents: 74e1915 + 7d345dd


47 files changed: +2714, -860 lines

.github/CODEOWNERS

+1-1
@@ -7,6 +7,6 @@
 
 # Default owners
 # review when someone opens a pull request and assign appropriate reviewer
-* @quic-rishinr @ochougul @quic-hemagnih
+* @quic-rishinr @ochougul @quic-hemagnih @quic-amitraj
 pyproject.toml @carlstreeter-quic
 

QEfficient/base/common.py

+5-4
@@ -16,10 +16,9 @@
 from typing import Any
 
 from transformers import AutoConfig
-from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 
 from QEfficient.base.modeling_qeff import QEFFBaseModel
-from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.transformers.modeling_utils import MODEL_CLASS_MAPPING
 from QEfficient.utils import login_and_download_hf_lm
 
 
@@ -44,8 +43,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) ->
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
         architecture = config.architectures[0] if config.architectures else None
 
-        if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
-            model_class = QEFFAutoModelForCausalLM
+        class_name = MODEL_CLASS_MAPPING.get(architecture)
+        if class_name:
+            module = __import__("QEfficient.transformers.models.modeling_auto")
+            model_class = getattr(module, class_name)
         else:
             raise NotImplementedError(
                 f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!"

QEfficient/base/modeling_qeff.py

+27-103
@@ -24,7 +24,6 @@
 from QEfficient.compile.qnn_compiler import compile as qnn_compile
 from QEfficient.generation.cloud_infer import QAICInferenceSession
 from QEfficient.utils import constants, dump_qconfig
-from QEfficient.utils._utils import load_json
 from QEfficient.utils.cache import QEFF_HOME, to_hashable
 
 logger = logging.getLogger(__name__)
@@ -98,7 +97,11 @@ def compile(self, *args, **kwargs) -> Path:
             :num_cores (int): Number of cores to utilize in each device ``Defaults to 16``.
             :mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``.
             :mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``.
-            :compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
+            :compiler_options: Pass any compiler option as input.
+                Following flag can be passed in compiler_options to enable QNN Compilation path.
+                    :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False. if not passed.``
+                    :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None. if not passed``
+                for QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
                 - aic_num_cores=16 -> -aic-num-cores=16
                 - convert_to_fp16=True -> -convert-to-fp16
@@ -217,10 +220,13 @@ def _compile(
         onnx_path: Optional[str] = None,
         compile_dir: Optional[str] = None,
         *,
+        mxint8_kv_cache: bool = False,
         specializations: Optional[List[Dict[str, int]]] = None,
         custom_io: Optional[Dict[str, str]] = None,
         mdp_ts_num_devices: int = 1,
         num_speculative_tokens: Optional[int] = None,
+        enable_qnn: Optional[bool] = False,
+        qnn_config: Optional[str] = None,
         **compiler_options,
     ) -> str:
         """
@@ -229,10 +235,13 @@ def _compile(
         Args:
             :onnx_path (str): Onnx file to compile
             :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
+            :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
             :specializations (list): List of specializations to compile for
             :custom_io (dict): Custom IO to specify the input and outputs in different formats than default
             :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
             :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
+            :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
+            :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
             :compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
                 - aic_num_cores=16 -> -aic-num-cores=16
                 - convert_to_fp16=True -> -convert-to-fp16
@@ -245,6 +254,22 @@ def _compile(
         qpc_path = compile_dir / "qpc"
         if not onnx_path.is_file():
             raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")
+
+        if enable_qnn:
+            self.qpc_path = qnn_compile(
+                onnx_path=onnx_path,
+                qpc_base_path=compile_dir,
+                specializations=specializations,
+                custom_io=custom_io,
+                device_group=list(range(mdp_ts_num_devices)),
+                num_cores=compiler_options.get("aic_num_cores", 16),
+                mxfp6=compiler_options.get("mxfp6_matmul", False),
+                mxint8=mxint8_kv_cache,
+                qnn_config=qnn_config,
+            )
+
+            return self.qpc_path
+
         command = constants.COMPILER + [f"-m={onnx_path}"]
         if mdp_ts_json_path := compiler_options.pop("mdp_ts_json_path", None):
             mdp_ts_num_devices = None
@@ -339,104 +364,3 @@ def _compile(
         self.qpc_path = qpc_path
 
         return qpc_path
-
-    @dump_qconfig
-    def _qnn_compile(
-        self,
-        onnx_path: Optional[str] = None,
-        compile_dir: Optional[str] = None,
-        *,
-        specializations: Optional[List[Dict[str, int]]] = None,
-        prefill_seq_len: int = 32,
-        ctx_len: int = 128,
-        batch_size: int = 1,
-        full_batch_size: Optional[int] = None,
-        mdp_ts_num_devices: int = 1,
-        num_cores: int = 16,
-        mxfp6_matmul: bool = False,
-        mxint8_kv_cache: bool = False,
-        qnn_config: Optional[str] = None,
-        kv_cache_batch_size: Optional[int] = None,
-    ) -> str:
-        """
-        Interface for QNN compiler
-
-        Args:
-            :onnx_path (str): Onnx file to compile
-            :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
-            :specializations (list): List of specializations to compile for
-            :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``.
-            :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``.
-            :batch_size (int, optional): Batch size. ``Defaults to 1``.
-            :full_batch_size (int, optional): Continuous batching batch size.
-            :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
-            :num_cores (int): Number of cores used to compile the model.
-            :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``.
-            :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
-            :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
-            :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
-        """
-        if onnx_path is None and self.onnx_path is None:
-            self.export()
-
-        onnx_path = Path(onnx_path or self.onnx_path)
-        compile_dir = Path(compile_dir or onnx_path.parent)
-        qpc_path = compile_dir / "qpc"
-        if not onnx_path.is_file():
-            raise FileNotFoundError(f"ONNX file not found at: {onnx_path}")
-
-        compile_hash = hashlib.sha256(to_hashable("qnn"))
-
-        if specializations is not None:
-            compile_hash.update(to_hashable(specializations))
-
-        if qnn_config is not None:
-            qnn_config_values = load_json(qnn_config)
-            compile_hash.update(to_hashable(qnn_config_values))
-
-        if mdp_ts_num_devices > 1:
-            compile_hash.update(to_hashable({"mdp_ts_num_devices": mdp_ts_num_devices}))
-
-        compile_hash.update(to_hashable({"num_cores": num_cores}))
-        compile_hash.update(to_hashable({"mxfp6_matmul": mxfp6_matmul}))
-        compile_hash.update(to_hashable({"mxint8_kv_cache": mxint8_kv_cache}))
-
-        # Check if already compiled
-        compile_hash = compile_hash.hexdigest()[:16]
-        qpc_path = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
-        if qpc_path.is_dir():
-            if (qpc_path / "programqpc.bin").is_file():
-                self.qpc_path = qpc_path
-                return qpc_path
-            # Probably compilation failure last time, delete directory to start over
-            shutil.rmtree(qpc_path)
-
-        # Write specializations.json file
-        if specializations is not None:
-            specializations_json = compile_dir / "specializations.json"
-            with open(specializations_json, "w") as fp:
-                json.dump(
-                    {"specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]},
-                    fp,
-                    indent=4,
-                )
-
-        qnn_compile(
-            onnx_path=onnx_path,
-            qpc_base_path=compile_dir,
-            num_cores=num_cores,
-            device_group=list(range(mdp_ts_num_devices)),
-            batch_size=batch_size,
-            prompt_len=prefill_seq_len,
-            ctx_len=ctx_len,
-            mxfp6=mxfp6_matmul,
-            mxint8=mxint8_kv_cache,
-            full_batch_size=full_batch_size,
-            qnn_config=qnn_config,
-            qnn_binary_dir=qpc_path,
-            kv_cache_batch_size=kv_cache_batch_size,
-        )
-
-        self.qpc_path = qpc_path
-
-        return qpc_path
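
Taken together, this file's changes fold the old _qnn_compile path into _compile: passing enable_qnn=True (optionally with qnn_config) now routes compilation through qnn_compile instead of qaic-exec. A hedged usage sketch from the caller's side, assuming the public compile() forwards these options into _compile as the docstring above describes; the model name and config path are placeholders, not taken from this commit.

# Hypothetical usage after this commit: QNN is selected via ordinary compile options.
from QEfficient import QEFFAutoModelForCausalLM

model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")  # placeholder model
qpc_path = model.compile(
    num_cores=16,
    mxfp6_matmul=False,
    mxint8_kv_cache=False,
    enable_qnn=True,                                  # route _compile to the QNN backend
    qnn_config="QEfficient/compile/qnn_config.json",  # sample config path from the CLI help text
)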

QEfficient/cloud/compile.py

+21-9
@@ -85,17 +85,29 @@
     parser.add_argument(
         "--enable_qnn",
         "--enable-qnn",
-        action="store_true",
+        nargs="?",
+        const=True,
+        type=str,
         default=False,
         help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
             If not provided, the default configuration will be used.\
             Sample Config: QEfficient/compile/qnn_config.json",
     )
-    parser.add_argument(
-        "qnn_config",
-        nargs="?",
-        type=str,
-    )
-    # FIXME(ochougul): Allow extra compilation arguments
-    args = parser.parse_args()
-    QEfficient.compile(**vars(args))
+
+    args, compiler_options = parser.parse_known_args()
+
+    if isinstance(args.enable_qnn, str):
+        args.qnn_config = args.enable_qnn
+        args.enable_qnn = True
+
+    compiler_options_dict = {}
+    for i in range(0, len(compiler_options)):
+        if compiler_options[i].startswith("--"):
+            key = compiler_options[i].lstrip("-").replace("-", "_")
+            value = (
+                compiler_options[i + 1]
+                if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")
+                else True
+            )
+            compiler_options_dict[key] = value
+    QEfficient.compile(**args.__dict__, **compiler_options_dict)
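
To make the new argument handling concrete: --enable_qnn now optionally carries the QNN config path itself, and any flags argparse does not recognize are collected by parse_known_args and forwarded to the compiler as keyword arguments. A small self-contained sketch of that behavior (the extra flags and the config filename below are illustrative, not taken from this commit):

# Standalone sketch of the option handling added above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--enable_qnn", "--enable-qnn", nargs="?", const=True, type=str, default=False)

# e.g. python -m QEfficient.cloud.compile --enable_qnn my_qnn_config.json --aic-num-cores 14 --some-flag
argv = ["--enable_qnn", "my_qnn_config.json", "--aic-num-cores", "14", "--some-flag"]
args, leftover = parser.parse_known_args(argv)

if isinstance(args.enable_qnn, str):   # a string value means "enabled, with this config file"
    args.qnn_config = args.enable_qnn
    args.enable_qnn = True

compiler_options_dict = {}
for i, tok in enumerate(leftover):
    if tok.startswith("--"):
        key = tok.lstrip("-").replace("-", "_")
        value = leftover[i + 1] if i + 1 < len(leftover) and not leftover[i + 1].startswith("-") else True
        compiler_options_dict[key] = value

print(args.enable_qnn, args.qnn_config)  # True my_qnn_config.json
print(compiler_options_dict)             # {'aic_num_cores': '14', 'some_flag': True}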
