microsoft · DingmaomaoBJTU · Apr 27, 2026 · Apr 24, 2026
@@ -350,6 +350,7 @@ def _run_build(
     precision: str,
     timeout: int,
     model_dir: Path,
+    ep: str | None = None,
 ) -> dict:
     """Run winml config + winml build for one model. Returns build result dict.
 
@@ -387,6 +388,8 @@ def _run_build(
     ]
     if entry.task:
         config_args += ["--task", entry.task]
+    if ep:
+        config_args += ["--ep", ep]
 
     config_proc = _run_subprocess(config_args, timeout)
     if config_proc["exit_code"] != 0:
@@ -516,6 +519,7 @@ def run_model(
     device: str,
     timeout: int,
     onnx_paths: dict[str, str] | None = None,
+    ep: str | None = None,
 ) -> dict:
     """Execute winml perf for one or more ONNX models. Returns merged result dict.
 
@@ -537,6 +541,8 @@ def run_model(
         ]
         if entry.task:
             args += ["--task", entry.task]
+        if ep:
+            args += ["--ep", ep]
         args += ["--iterations", "10", "--warmup", "2"]
         args += entry.perf_args
 
@@ -565,6 +571,8 @@ def run_model(
             safe_print(f"    perf: {label}")
 
         args = [*WINML_CLI, "perf", "-m", path, "--device", device]
+        if ep:
+            args += ["--ep", ep]
         args += ["--iterations", "10", "--warmup", "2"]
         args += entry.perf_args
 
@@ -670,6 +678,7 @@ def _run_winml_eval(
     ds_config: dict,
     model_dir: Path,
     onnx_path: str | None = None,
+    ep: str | None = None,
 ) -> dict:
     """Invoke winml eval for one model. Returns process result + parsed metric."""
     output_path = model_dir / "winml_eval_output.json"
@@ -699,6 +708,8 @@ def _run_winml_eval(
         ]
     if entry.task:
         args += ["--task", entry.task]
+    if ep:
+        args += ["--ep", ep]
     # When ds_config is provided, pass explicit dataset args;
     # otherwise winml eval uses its built-in task defaults.
     if ds_config.get("dataset"):
@@ -857,14 +868,15 @@ def _run_accuracy_phase(
     timeout: int,
     model_dir: Path,
     onnx_path: str | None = None,
+    ep: str | None = None,
 ) -> dict:
     """Run winml eval + pytorch baseline for one model. Returns accuracy sub-section dict."""
     ds_config = get_dataset_config(entry.hf_id, entry.task) or {}
 
     # Build local dataset if a build_script is configured
     _build_dataset(ds_config, timeout)
 
-    winml = _run_winml_eval(entry, device, timeout, ds_config, model_dir, onnx_path)
+    winml = _run_winml_eval(entry, device, timeout, ds_config, model_dir, onnx_path, ep=ep)
 
     # Check baseline cache before running the expensive PyTorch baseline
     cached = _lookup_baseline_cache(entry.hf_id, entry.task, ds_config)
@@ -1012,6 +1024,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--model-type", help="Filter by model_type")
     parser.add_argument("--group", help="Filter by group")
     parser.add_argument("--device", default="auto", help="Target device (default: auto)")
+    parser.add_argument("--ep", default=None, help="Execution provider (e.g. qnn, dml, ov)")
     parser.add_argument(
         "--timeout", type=int, default=600, help="Per-subprocess timeout in seconds (default: 600)"
     )
@@ -1194,7 +1207,10 @@ def main() -> None:
         retry_types = {t.upper() for t in args.retry_failed} if args.retry_failed else set()
 
     safe_print(f"E2E Evaluation: {len(entries)} models -> {output_dir}")
-    safe_print(f"Device: {args.device} | Timeout: {args.timeout}s | Eval: {args.eval_type}")
+    ep_label = args.ep or "auto"
+    safe_print(
+        f"Device: {args.device} | EP: {ep_label} | Timeout: {args.timeout}s | Eval: {args.eval_type}"
+    )
     safe_print(f"Disk free: {_get_disk_free_gb():.1f} GB")
     if args.clean_cache:
         safe_print("Cache cleanup: ON (caches + temp files cleaned after each model)")
@@ -1287,6 +1303,7 @@ def main() -> None:
                 _DEFAULT_PRECISION,
                 args.timeout,
                 model_dir,
+                ep=args.ep,
             )
             onnx_paths = build_result["onnx_paths"] if build_result["success"] else {}
             # Composite models produce multiple ONNX paths; accuracy phase requires a
@@ -1316,20 +1333,21 @@ def main() -> None:
                 )
                 accuracy_result = {"skipped": True, "skip_reason": "composite_model_not_supported"}
                 if args.eval_type == "both":
-                    perf_proc = run_model(entry, args.device, args.timeout, onnx_paths)
+                    perf_proc = run_model(entry, args.device, args.timeout, onnx_paths, ep=args.ep)
             elif args.eval_type == "accuracy":
                 accuracy_result = _run_accuracy_phase(
                     entry,
                     args.device,
                     args.timeout,
                     model_dir,
                     first_path,
+                    ep=args.ep,
                 )
             elif args.eval_type == "perf":
-                perf_proc = run_model(entry, args.device, args.timeout, onnx_paths)
+                perf_proc = run_model(entry, args.device, args.timeout, onnx_paths, ep=args.ep)
             else:
                 # "both": perf → eval
-                perf_proc = run_model(entry, args.device, args.timeout, onnx_paths)
+                perf_proc = run_model(entry, args.device, args.timeout, onnx_paths, ep=args.ep)
                 if perf_proc["exit_code"] != 0:
                     accuracy_result = {"skipped": True, "skip_reason": "perf_failed"}
                 else:
@@ -1339,6 +1357,7 @@ def main() -> None:
                         args.timeout,
                         model_dir,
                         first_path,
+                        ep=args.ep,
                     )
 
         except KeyboardInterrupt:

@@ -162,6 +162,8 @@ def stage2_sa_pre(
     model_dir: Path,
     graph_opt_path: Path,
     use_cache: bool,
+    ep: str = "QNNExecutionProvider",
+    device: str = "NPU",
 ) -> tuple[dict[str, str], dict, list[dict]] | None:
     """Run SA with information on graph_optimized.onnx.
 
@@ -172,15 +174,15 @@ def stage2_sa_pre(
 
     if use_cache and is_cached(sa_pre_path) and is_cached(optim_record_path):
         safe_print("  [Stage 2] SA pre-check (cached)")
-        classifications = parse_sa_json(sa_pre_path)
+        classifications = parse_sa_json(sa_pre_path, ep=ep)
         optim_record = json.loads(optim_record_path.read_text(encoding="utf-8"))
         optim_config = optim_record.get("optim_config", {})
         info_items = optim_record.get("info_items", [])
     else:
         safe_print("  [Stage 2] Running SA pre-check (with recommendations)...")
         try:
             classifications, optim_config, info_items = run_sa_with_info(
-                graph_opt_path, sa_pre_path
+                graph_opt_path, sa_pre_path, ep=ep, device=device
             )
         except Exception as e:
             safe_print(f"  [ERROR] SA pre-check failed: {e}")
@@ -242,6 +244,8 @@ def stage4_sa_post(
     model_dir: Path,
     sa_opt_path: Path,
     use_cache: bool,
+    ep: str = "QNNExecutionProvider",
+    device: str = "NPU",
 ) -> tuple[dict[str, str], list[dict]] | None:
     """Run SA on sa_optimized.onnx.
 
@@ -251,12 +255,14 @@ def stage4_sa_post(
 
     if use_cache and is_cached(sa_post_path):
         safe_print("  [Stage 4] SA post-check (cached)")
-        classifications = parse_sa_json(sa_post_path)
+        classifications = parse_sa_json(sa_post_path, ep=ep)
         info_items = []
     else:
         safe_print("  [Stage 4] Running SA post-check...")
         try:
-            classifications, _, info_items = run_sa_with_info(sa_opt_path, sa_post_path)
+            classifications, _, info_items = run_sa_with_info(
+                sa_opt_path, sa_post_path, ep=ep, device=device
+            )
         except Exception as e:
             safe_print(f"  [ERROR] SA post-check failed: {e}")
             return None
@@ -274,22 +280,30 @@ def stage4_sa_post(
     return classifications, info_items
 
 
-def _run_compile(onnx_path: Path, output_dir: Path) -> tuple[int, str]:
-    """Run wmk compile --device npu --no-quantize. Returns (rc, stderr_tail)."""
+def _run_compile(
+    onnx_path: Path,
+    output_dir: Path,
+    device: str = "npu",
+    ep: str | None = None,
+) -> tuple[int, str]:
+    """Run wmk compile --device <device> --no-quantize [--ep <ep>]. Returns (rc, stderr_tail)."""
+    cmd = [
+        sys.executable,
+        "-m",
+        "winml.modelkit.cli",
+        "compile",
+        "--model",
+        str(onnx_path),
+        "--device",
+        device,
+        "--no-quantize",
+        "--output-dir",
+        str(output_dir),
+    ]
+    if ep:
+        cmd += ["--ep", ep]
     result = subprocess.run(  # noqa: S603
-        [
-            sys.executable,
-            "-m",
-            "winml.modelkit.cli",
-            "compile",
-            "--model",
-            str(onnx_path),
-            "--device",
-            "npu",
-            "--no-quantize",
-            "--output-dir",
-            str(output_dir),
-        ],
+        cmd,
         capture_output=True,
         text=True,
         encoding="utf-8",
@@ -306,8 +320,10 @@ def _compile_and_diff(
     sa_predictions: dict[str, str],
     model_dir: Path,
     use_cache: bool,
+    device: str = "npu",
+    ep: str | None = None,
 ) -> dict | None:
-    """Compile an ONNX with QNN NPU and compare against SA predictions.
+    """Compile an ONNX and compare against SA predictions.
 
     Args:
         label: Log prefix, e.g. "5a (pre)" or "5b (post)".
@@ -324,8 +340,8 @@ def _compile_and_diff(
     if use_cache and is_cached(compiled_path):
         safe_print(f"  [Stage {label}] Compile (cached): {compiled_path.name}")
     else:
-        safe_print(f"  [Stage {label}] Compiling {onnx_path.name} → QNN EPContext...")
-        rc, _ = _run_compile(onnx_path, model_dir)
+        safe_print(f"  [Stage {label}] Compiling {onnx_path.name} → EPContext...")
+        rc, _ = _run_compile(onnx_path, model_dir, device=device, ep=ep)
         if rc != 0 or not is_cached(compiled_path):
             safe_print(f"  [Stage {label}] Compile failed (rc={rc}) — skipping diff")
             return None
@@ -352,6 +368,8 @@ def stage5_compile_and_diff(
     sa_pre: dict[str, str],
     sa_post: dict[str, str],
     use_cache: bool,
+    device: str = "npu",
+    ep: str | None = None,
 ) -> tuple[dict | None, dict | None]:
     """Stage 5: compile both graph_optimized and sa_optimized, diff each vs its SA.
 
@@ -367,6 +385,8 @@ def stage5_compile_and_diff(
         sa_pre,
         model_dir,
         use_cache,
+        device=device,
+        ep=ep,
     )
     diff_post = _compile_and_diff(
         "5b (post)",
@@ -375,6 +395,8 @@ def stage5_compile_and_diff(
         sa_post,
         model_dir,
         use_cache,
+        device=device,
+        ep=ep,
     )
     return diff_pre, diff_post
 
@@ -388,6 +410,8 @@ def evaluate_model(
     model_entry: dict,
     output_dir: Path,
     use_cache: bool,
+    ep: str = "QNNExecutionProvider",
+    device: str = "NPU",
 ) -> dict | None:
     """Run the 4+1 stage SA eval pipeline for a single model."""
     hf_id = model_entry["hf_id"]
@@ -410,7 +434,7 @@ def evaluate_model(
         return _skip_result(hf_id, task, model_type, skip_reason or "SKIP_EXPORT", model_dir)
 
     # Stage 2
-    pre_result = stage2_sa_pre(model_dir, graph_opt_path, use_cache)
+    pre_result = stage2_sa_pre(model_dir, graph_opt_path, use_cache, ep=ep, device=device)
     if pre_result is None:
         return _skip_result(hf_id, task, model_type, "SKIP_SA_PRE", model_dir)
     sa_pre, optim_config, pre_info_items = pre_result
@@ -421,14 +445,21 @@ def evaluate_model(
         return _skip_result(hf_id, task, model_type, "SKIP_OPTIM", model_dir)
 
     # Stage 4
-    post_result = stage4_sa_post(model_dir, sa_opt_path, use_cache)
+    post_result = stage4_sa_post(model_dir, sa_opt_path, use_cache, ep=ep, device=device)
     if post_result is None:
         return _skip_result(hf_id, task, model_type, "SKIP_SA_POST", model_dir)
     sa_post, post_info_items = post_result
 
     # Stage 5: compile both ONNXes → EPContext diff pre and post
     epcontext_diff_pre, epcontext_diff_post = stage5_compile_and_diff(
-        model_dir, graph_opt_path, sa_opt_path, sa_pre, sa_post, use_cache
+        model_dir,
+        graph_opt_path,
+        sa_opt_path,
+        sa_pre,
+        sa_post,
+        use_cache,
+        device=device.lower(),
+        ep=ep,
     )
 
     elapsed = time.monotonic() - t0
@@ -654,6 +685,12 @@ def main() -> None:
         action="store_true",
         help="Skip stages whose output artifacts already exist",
     )
+    parser.add_argument(
+        "--ep",
+        default="QNNExecutionProvider",
+        help="Execution provider (default: QNNExecutionProvider)",
+    )
+    parser.add_argument("--device", default="NPU", help="Target device (default: NPU)")
     args = parser.parse_args()
 
     output_dir = args.output_dir or Path(f"sa_eval_results/{date.today().isoformat()}")
@@ -680,7 +717,9 @@ def main() -> None:
 
     for i, entry in enumerate(models_to_run, 1):
         safe_print(f"\n[{i}/{len(models_to_run)}]")
-        result = evaluate_model(entry, output_dir, use_cache=args.use_cache)
+        result = evaluate_model(
+            entry, output_dir, use_cache=args.use_cache, ep=args.ep, device=args.device
+        )
         if result:
             all_results.append(result)