diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index 52e2be3d3..c3b732325 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -350,6 +350,7 @@ def _run_build( precision: str, timeout: int, model_dir: Path, + ep: str | None = None, ) -> dict: """Run winml config + winml build for one model. Returns build result dict. @@ -387,6 +388,8 @@ def _run_build( ] if entry.task: config_args += ["--task", entry.task] + if ep: + config_args += ["--ep", ep] config_proc = _run_subprocess(config_args, timeout) if config_proc["exit_code"] != 0: @@ -516,6 +519,7 @@ def run_model( device: str, timeout: int, onnx_paths: dict[str, str] | None = None, + ep: str | None = None, ) -> dict: """Execute winml perf for one or more ONNX models. Returns merged result dict. @@ -537,6 +541,8 @@ def run_model( ] if entry.task: args += ["--task", entry.task] + if ep: + args += ["--ep", ep] args += ["--iterations", "10", "--warmup", "2"] args += entry.perf_args @@ -565,6 +571,8 @@ def run_model( safe_print(f" perf: {label}") args = [*WINML_CLI, "perf", "-m", path, "--device", device] + if ep: + args += ["--ep", ep] args += ["--iterations", "10", "--warmup", "2"] args += entry.perf_args @@ -670,6 +678,7 @@ def _run_winml_eval( ds_config: dict, model_dir: Path, onnx_path: str | None = None, + ep: str | None = None, ) -> dict: """Invoke winml eval for one model. Returns process result + parsed metric.""" output_path = model_dir / "winml_eval_output.json" @@ -699,6 +708,8 @@ def _run_winml_eval( ] if entry.task: args += ["--task", entry.task] + if ep: + args += ["--ep", ep] # When ds_config is provided, pass explicit dataset args; # otherwise winml eval uses its built-in task defaults. if ds_config.get("dataset"): @@ -857,6 +868,7 @@ def _run_accuracy_phase( timeout: int, model_dir: Path, onnx_path: str | None = None, + ep: str | None = None, ) -> dict: """Run winml eval + pytorch baseline for one model. Returns accuracy sub-section dict.""" ds_config = get_dataset_config(entry.hf_id, entry.task) or {} @@ -864,7 +876,7 @@ def _run_accuracy_phase( # Build local dataset if a build_script is configured _build_dataset(ds_config, timeout) - winml = _run_winml_eval(entry, device, timeout, ds_config, model_dir, onnx_path) + winml = _run_winml_eval(entry, device, timeout, ds_config, model_dir, onnx_path, ep=ep) # Check baseline cache before running the expensive PyTorch baseline cached = _lookup_baseline_cache(entry.hf_id, entry.task, ds_config) @@ -1012,6 +1024,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--model-type", help="Filter by model_type") parser.add_argument("--group", help="Filter by group") parser.add_argument("--device", default="auto", help="Target device (default: auto)") + parser.add_argument("--ep", default=None, help="Execution provider (e.g. qnn, dml, ov)") parser.add_argument( "--timeout", type=int, default=600, help="Per-subprocess timeout in seconds (default: 600)" ) @@ -1194,7 +1207,10 @@ def main() -> None: retry_types = {t.upper() for t in args.retry_failed} if args.retry_failed else set() safe_print(f"E2E Evaluation: {len(entries)} models -> {output_dir}") - safe_print(f"Device: {args.device} | Timeout: {args.timeout}s | Eval: {args.eval_type}") + ep_label = args.ep or "auto" + safe_print( + f"Device: {args.device} | EP: {ep_label} | Timeout: {args.timeout}s | Eval: {args.eval_type}" + ) safe_print(f"Disk free: {_get_disk_free_gb():.1f} GB") if args.clean_cache: safe_print("Cache cleanup: ON (caches + temp files cleaned after each model)") @@ -1287,6 +1303,7 @@ def main() -> None: _DEFAULT_PRECISION, args.timeout, model_dir, + ep=args.ep, ) onnx_paths = build_result["onnx_paths"] if build_result["success"] else {} # Composite models produce multiple ONNX paths; accuracy phase requires a @@ -1316,7 +1333,7 @@ def main() -> None: ) accuracy_result = {"skipped": True, "skip_reason": "composite_model_not_supported"} if args.eval_type == "both": - perf_proc = run_model(entry, args.device, args.timeout, onnx_paths) + perf_proc = run_model(entry, args.device, args.timeout, onnx_paths, ep=args.ep) elif args.eval_type == "accuracy": accuracy_result = _run_accuracy_phase( entry, @@ -1324,12 +1341,13 @@ def main() -> None: args.timeout, model_dir, first_path, + ep=args.ep, ) elif args.eval_type == "perf": - perf_proc = run_model(entry, args.device, args.timeout, onnx_paths) + perf_proc = run_model(entry, args.device, args.timeout, onnx_paths, ep=args.ep) else: # "both": perf → eval - perf_proc = run_model(entry, args.device, args.timeout, onnx_paths) + perf_proc = run_model(entry, args.device, args.timeout, onnx_paths, ep=args.ep) if perf_proc["exit_code"] != 0: accuracy_result = {"skipped": True, "skip_reason": "perf_failed"} else: @@ -1339,6 +1357,7 @@ def main() -> None: args.timeout, model_dir, first_path, + ep=args.ep, ) except KeyboardInterrupt: diff --git a/scripts/e2e_eval/run_sa_eval.py b/scripts/e2e_eval/run_sa_eval.py index 1c132abf5..daa730654 100644 --- a/scripts/e2e_eval/run_sa_eval.py +++ b/scripts/e2e_eval/run_sa_eval.py @@ -162,6 +162,8 @@ def stage2_sa_pre( model_dir: Path, graph_opt_path: Path, use_cache: bool, + ep: str = "QNNExecutionProvider", + device: str = "NPU", ) -> tuple[dict[str, str], dict, list[dict]] | None: """Run SA with information on graph_optimized.onnx. @@ -172,7 +174,7 @@ def stage2_sa_pre( if use_cache and is_cached(sa_pre_path) and is_cached(optim_record_path): safe_print(" [Stage 2] SA pre-check (cached)") - classifications = parse_sa_json(sa_pre_path) + classifications = parse_sa_json(sa_pre_path, ep=ep) optim_record = json.loads(optim_record_path.read_text(encoding="utf-8")) optim_config = optim_record.get("optim_config", {}) info_items = optim_record.get("info_items", []) @@ -180,7 +182,7 @@ def stage2_sa_pre( safe_print(" [Stage 2] Running SA pre-check (with recommendations)...") try: classifications, optim_config, info_items = run_sa_with_info( - graph_opt_path, sa_pre_path + graph_opt_path, sa_pre_path, ep=ep, device=device ) except Exception as e: safe_print(f" [ERROR] SA pre-check failed: {e}") @@ -242,6 +244,8 @@ def stage4_sa_post( model_dir: Path, sa_opt_path: Path, use_cache: bool, + ep: str = "QNNExecutionProvider", + device: str = "NPU", ) -> tuple[dict[str, str], list[dict]] | None: """Run SA on sa_optimized.onnx. @@ -251,12 +255,14 @@ def stage4_sa_post( if use_cache and is_cached(sa_post_path): safe_print(" [Stage 4] SA post-check (cached)") - classifications = parse_sa_json(sa_post_path) + classifications = parse_sa_json(sa_post_path, ep=ep) info_items = [] else: safe_print(" [Stage 4] Running SA post-check...") try: - classifications, _, info_items = run_sa_with_info(sa_opt_path, sa_post_path) + classifications, _, info_items = run_sa_with_info( + sa_opt_path, sa_post_path, ep=ep, device=device + ) except Exception as e: safe_print(f" [ERROR] SA post-check failed: {e}") return None @@ -274,22 +280,30 @@ def stage4_sa_post( return classifications, info_items -def _run_compile(onnx_path: Path, output_dir: Path) -> tuple[int, str]: - """Run wmk compile --device npu --no-quantize. Returns (rc, stderr_tail).""" +def _run_compile( + onnx_path: Path, + output_dir: Path, + device: str = "npu", + ep: str | None = None, +) -> tuple[int, str]: + """Run wmk compile --device --no-quantize. Returns (rc, stderr_tail).""" + cmd = [ + sys.executable, + "-m", + "winml.modelkit.cli", + "compile", + "--model", + str(onnx_path), + "--device", + device, + "--no-quantize", + "--output-dir", + str(output_dir), + ] + if ep: + cmd += ["--ep", ep] result = subprocess.run( # noqa: S603 - [ - sys.executable, - "-m", - "winml.modelkit.cli", - "compile", - "--model", - str(onnx_path), - "--device", - "npu", - "--no-quantize", - "--output-dir", - str(output_dir), - ], + cmd, capture_output=True, text=True, encoding="utf-8", @@ -306,8 +320,10 @@ def _compile_and_diff( sa_predictions: dict[str, str], model_dir: Path, use_cache: bool, + device: str = "npu", + ep: str | None = None, ) -> dict | None: - """Compile an ONNX with QNN NPU and compare against SA predictions. + """Compile an ONNX and compare against SA predictions. Args: label: Log prefix, e.g. "5a (pre)" or "5b (post)". @@ -324,8 +340,8 @@ def _compile_and_diff( if use_cache and is_cached(compiled_path): safe_print(f" [Stage {label}] Compile (cached): {compiled_path.name}") else: - safe_print(f" [Stage {label}] Compiling {onnx_path.name} → QNN EPContext...") - rc, _ = _run_compile(onnx_path, model_dir) + safe_print(f" [Stage {label}] Compiling {onnx_path.name} → EPContext...") + rc, _ = _run_compile(onnx_path, model_dir, device=device, ep=ep) if rc != 0 or not is_cached(compiled_path): safe_print(f" [Stage {label}] Compile failed (rc={rc}) — skipping diff") return None @@ -352,6 +368,8 @@ def stage5_compile_and_diff( sa_pre: dict[str, str], sa_post: dict[str, str], use_cache: bool, + device: str = "npu", + ep: str | None = None, ) -> tuple[dict | None, dict | None]: """Stage 5: compile both graph_optimized and sa_optimized, diff each vs its SA. @@ -367,6 +385,8 @@ def stage5_compile_and_diff( sa_pre, model_dir, use_cache, + device=device, + ep=ep, ) diff_post = _compile_and_diff( "5b (post)", @@ -375,6 +395,8 @@ def stage5_compile_and_diff( sa_post, model_dir, use_cache, + device=device, + ep=ep, ) return diff_pre, diff_post @@ -388,6 +410,8 @@ def evaluate_model( model_entry: dict, output_dir: Path, use_cache: bool, + ep: str = "QNNExecutionProvider", + device: str = "NPU", ) -> dict | None: """Run the 4+1 stage SA eval pipeline for a single model.""" hf_id = model_entry["hf_id"] @@ -410,7 +434,7 @@ def evaluate_model( return _skip_result(hf_id, task, model_type, skip_reason or "SKIP_EXPORT", model_dir) # Stage 2 - pre_result = stage2_sa_pre(model_dir, graph_opt_path, use_cache) + pre_result = stage2_sa_pre(model_dir, graph_opt_path, use_cache, ep=ep, device=device) if pre_result is None: return _skip_result(hf_id, task, model_type, "SKIP_SA_PRE", model_dir) sa_pre, optim_config, pre_info_items = pre_result @@ -421,14 +445,21 @@ def evaluate_model( return _skip_result(hf_id, task, model_type, "SKIP_OPTIM", model_dir) # Stage 4 - post_result = stage4_sa_post(model_dir, sa_opt_path, use_cache) + post_result = stage4_sa_post(model_dir, sa_opt_path, use_cache, ep=ep, device=device) if post_result is None: return _skip_result(hf_id, task, model_type, "SKIP_SA_POST", model_dir) sa_post, post_info_items = post_result # Stage 5: compile both ONNXes → EPContext diff pre and post epcontext_diff_pre, epcontext_diff_post = stage5_compile_and_diff( - model_dir, graph_opt_path, sa_opt_path, sa_pre, sa_post, use_cache + model_dir, + graph_opt_path, + sa_opt_path, + sa_pre, + sa_post, + use_cache, + device=device.lower(), + ep=ep, ) elapsed = time.monotonic() - t0 @@ -654,6 +685,12 @@ def main() -> None: action="store_true", help="Skip stages whose output artifacts already exist", ) + parser.add_argument( + "--ep", + default="QNNExecutionProvider", + help="Execution provider (default: QNNExecutionProvider)", + ) + parser.add_argument("--device", default="NPU", help="Target device (default: NPU)") args = parser.parse_args() output_dir = args.output_dir or Path(f"sa_eval_results/{date.today().isoformat()}") @@ -680,7 +717,9 @@ def main() -> None: for i, entry in enumerate(models_to_run, 1): safe_print(f"\n[{i}/{len(models_to_run)}]") - result = evaluate_model(entry, output_dir, use_cache=args.use_cache) + result = evaluate_model( + entry, output_dir, use_cache=args.use_cache, ep=args.ep, device=args.device + ) if result: all_results.append(result) diff --git a/scripts/e2e_eval/sa_comparison.py b/scripts/e2e_eval/sa_comparison.py index bbd9a4097..a4803de31 100644 --- a/scripts/e2e_eval/sa_comparison.py +++ b/scripts/e2e_eval/sa_comparison.py @@ -30,6 +30,8 @@ def run_sa_with_info( onnx_path: Path, output_path: Path, + ep: str = "QNNExecutionProvider", + device: str = "NPU", ) -> tuple[dict[str, str], dict, list[dict]]: """Run SA using Python API with information enabled. @@ -50,7 +52,7 @@ def run_sa_with_info( config = AnalyzerConfig(enable_information=True) analyzer = ONNXStaticAnalyzer(config=config) - result = analyzer.analyze(str(onnx_path), ep="QNNExecutionProvider", device="NPU") + result = analyzer.analyze(str(onnx_path), ep=ep, device=device) # Save full SA JSON output_path.write_text(result.to_json(), encoding="utf-8") @@ -59,10 +61,10 @@ def run_sa_with_info( classifications: dict[str, str] = {} info_items: list[dict] = [] - for ep in result.output.results: - if ep.ep_type != "QNNExecutionProvider": + for ep_result in result.output.results: + if ep_result.ep_type != ep: continue - for level_enum, pid_list in ep.classification.items(): + for level_enum, pid_list in ep_result.classification.items(): level = level_enum.value.upper() for pid in pid_list: classifications[pid] = level @@ -72,12 +74,12 @@ def run_sa_with_info( "explanation": info.explanation or "", "has_actions": bool(info.actions), } - for info in ep.information + for info in ep_result.information ) break # Get optimization config from SA recommendations - optim_config = dict(result.get_optimization_config("QNNExecutionProvider")) + optim_config = dict(result.get_optimization_config(ep)) return classifications, optim_config, info_items @@ -87,14 +89,14 @@ def run_sa_with_info( # --------------------------------------------------------------------------- -def parse_sa_json(json_path: Path) -> dict[str, str]: +def parse_sa_json(json_path: Path, ep: str = "QNNExecutionProvider") -> dict[str, str]: """Parse wmk analyze JSON output into {pattern_id: level}. Works for both subprocess-written JSON (lowercase keys in classification dict) and Python API-written JSON (SupportLevel enum serialized as lowercase strings). - Returns empty dict if file is missing or QNN result not found. + Returns empty dict if file is missing or the requested EP result not found. """ if not json_path.exists(): return {} @@ -106,7 +108,7 @@ def parse_sa_json(json_path: Path) -> dict[str, str]: result: dict[str, str] = {} for ep_result in sa_data.get("results", []): - if ep_result.get("ep_type") != "QNNExecutionProvider": + if ep_result.get("ep_type") != ep: continue cls = ep_result.get("classification", {}) for level, pid_list in cls.items():