24
24
from QEfficient .compile .qnn_compiler import compile as qnn_compile
25
25
from QEfficient .generation .cloud_infer import QAICInferenceSession
26
26
from QEfficient .utils import constants , dump_qconfig
27
- from QEfficient .utils ._utils import load_json
28
27
from QEfficient .utils .cache import QEFF_HOME , to_hashable
29
28
30
29
logger = logging .getLogger (__name__ )
@@ -98,7 +97,11 @@ def compile(self, *args, **kwargs) -> Path:
98
97
:num_cores (int): Number of cores to utilize in each device ``Defaults to 16``.
99
98
:mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``.
100
99
:mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``.
101
- :compiler_options: Pass any compiler option as input. Any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
100
+ :compiler_options: Pass any compiler option as input.
101
+ The following flags can be passed in compiler_options to enable the QNN compilation path:
102
+ :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False`` if not passed.
103
+ :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None`` if not passed.
104
+ For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
102
105
- aic_num_cores=16 -> -aic-num-cores=16
103
106
- convert_to_fp16=True -> -convert-to-fp16
104
107
@@ -217,10 +220,13 @@ def _compile(
217
220
onnx_path : Optional [str ] = None ,
218
221
compile_dir : Optional [str ] = None ,
219
222
* ,
223
+ mxint8_kv_cache : bool = False ,
220
224
specializations : Optional [List [Dict [str , int ]]] = None ,
221
225
custom_io : Optional [Dict [str , str ]] = None ,
222
226
mdp_ts_num_devices : int = 1 ,
223
227
num_speculative_tokens : Optional [int ] = None ,
228
+ enable_qnn : Optional [bool ] = False ,
229
+ qnn_config : Optional [str ] = None ,
224
230
** compiler_options ,
225
231
) -> str :
226
232
"""
@@ -229,10 +235,13 @@ def _compile(
229
235
Args:
230
236
:onnx_path (str): Onnx file to compile
231
237
:compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
238
+ :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
232
239
:specializations (list): List of specializations to compile for
233
240
:custom_io (dict): Custom IO to specify the input and outputs in different formats than default
234
241
:mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
235
242
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
243
+ :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
244
+ :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
236
245
:compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
237
246
- aic_num_cores=16 -> -aic-num-cores=16
238
247
- convert_to_fp16=True -> -convert-to-fp16
@@ -245,6 +254,22 @@ def _compile(
245
254
qpc_path = compile_dir / "qpc"
246
255
if not onnx_path .is_file ():
247
256
raise FileNotFoundError (f"ONNX file not found at: { onnx_path } " )
257
+
258
+ if enable_qnn :
259
+ self .qpc_path = qnn_compile (
260
+ onnx_path = onnx_path ,
261
+ qpc_base_path = compile_dir ,
262
+ specializations = specializations ,
263
+ custom_io = custom_io ,
264
+ device_group = list (range (mdp_ts_num_devices )),
265
+ num_cores = compiler_options .get ("aic_num_cores" , 16 ),
266
+ mxfp6 = compiler_options .get ("mxfp6_matmul" , False ),
267
+ mxint8 = mxint8_kv_cache ,
268
+ qnn_config = qnn_config ,
269
+ )
270
+
271
+ return self .qpc_path
272
+
248
273
command = constants .COMPILER + [f"-m={ onnx_path } " ]
249
274
if mdp_ts_json_path := compiler_options .pop ("mdp_ts_json_path" , None ):
250
275
mdp_ts_num_devices = None
@@ -339,104 +364,3 @@ def _compile(
339
364
self .qpc_path = qpc_path
340
365
341
366
return qpc_path
342
-
343
- @dump_qconfig
344
- def _qnn_compile (
345
- self ,
346
- onnx_path : Optional [str ] = None ,
347
- compile_dir : Optional [str ] = None ,
348
- * ,
349
- specializations : Optional [List [Dict [str , int ]]] = None ,
350
- prefill_seq_len : int = 32 ,
351
- ctx_len : int = 128 ,
352
- batch_size : int = 1 ,
353
- full_batch_size : Optional [int ] = None ,
354
- mdp_ts_num_devices : int = 1 ,
355
- num_cores : int = 16 ,
356
- mxfp6_matmul : bool = False ,
357
- mxint8_kv_cache : bool = False ,
358
- qnn_config : Optional [str ] = None ,
359
- kv_cache_batch_size : Optional [int ] = None ,
360
- ) -> str :
361
- """
362
- Interface for QNN compiler
363
-
364
- Args:
365
- :onnx_path (str): Onnx file to compile
366
- :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing same qpc for different parameters.
367
- :specializations (list): List of specializations to compile for
368
- :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``.
369
- :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``.
370
- :batch_size (int, optional): Batch size. ``Defaults to 1``.
371
- :full_batch_size (int, optional): Continuous batching batch size.
372
- :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing.
373
- :num_cores (int): Number of cores used to compile the model.
374
- :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``.
375
- :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
376
- :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
377
- :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
378
- """
379
- if onnx_path is None and self .onnx_path is None :
380
- self .export ()
381
-
382
- onnx_path = Path (onnx_path or self .onnx_path )
383
- compile_dir = Path (compile_dir or onnx_path .parent )
384
- qpc_path = compile_dir / "qpc"
385
- if not onnx_path .is_file ():
386
- raise FileNotFoundError (f"ONNX file not found at: { onnx_path } " )
387
-
388
- compile_hash = hashlib .sha256 (to_hashable ("qnn" ))
389
-
390
- if specializations is not None :
391
- compile_hash .update (to_hashable (specializations ))
392
-
393
- if qnn_config is not None :
394
- qnn_config_values = load_json (qnn_config )
395
- compile_hash .update (to_hashable (qnn_config_values ))
396
-
397
- if mdp_ts_num_devices > 1 :
398
- compile_hash .update (to_hashable ({"mdp_ts_num_devices" : mdp_ts_num_devices }))
399
-
400
- compile_hash .update (to_hashable ({"num_cores" : num_cores }))
401
- compile_hash .update (to_hashable ({"mxfp6_matmul" : mxfp6_matmul }))
402
- compile_hash .update (to_hashable ({"mxint8_kv_cache" : mxint8_kv_cache }))
403
-
404
- # Check if already compiled
405
- compile_hash = compile_hash .hexdigest ()[:16 ]
406
- qpc_path = qpc_path .with_name (qpc_path .name + "-" + compile_hash )
407
- if qpc_path .is_dir ():
408
- if (qpc_path / "programqpc.bin" ).is_file ():
409
- self .qpc_path = qpc_path
410
- return qpc_path
411
- # Probably compilation failure last time, delete directory to start over
412
- shutil .rmtree (qpc_path )
413
-
414
- # Write specializations.json file
415
- if specializations is not None :
416
- specializations_json = compile_dir / "specializations.json"
417
- with open (specializations_json , "w" ) as fp :
418
- json .dump (
419
- {"specializations" : [{k : str (v ) for k , v in spec .items ()} for spec in specializations ]},
420
- fp ,
421
- indent = 4 ,
422
- )
423
-
424
- qnn_compile (
425
- onnx_path = onnx_path ,
426
- qpc_base_path = compile_dir ,
427
- num_cores = num_cores ,
428
- device_group = list (range (mdp_ts_num_devices )),
429
- batch_size = batch_size ,
430
- prompt_len = prefill_seq_len ,
431
- ctx_len = ctx_len ,
432
- mxfp6 = mxfp6_matmul ,
433
- mxint8 = mxint8_kv_cache ,
434
- full_batch_size = full_batch_size ,
435
- qnn_config = qnn_config ,
436
- qnn_binary_dir = qpc_path ,
437
- kv_cache_batch_size = kv_cache_batch_size ,
438
- )
439
-
440
- self .qpc_path = qpc_path
441
-
442
- return qpc_path
0 commit comments