Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 24 additions & 5 deletions scripts/e2e_eval/run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ def _run_build(
precision: str,
timeout: int,
model_dir: Path,
ep: str | None = None,
) -> dict:
"""Run winml config + winml build for one model. Returns build result dict.

Expand Down Expand Up @@ -387,6 +388,8 @@ def _run_build(
]
if entry.task:
config_args += ["--task", entry.task]
if ep:
config_args += ["--ep", ep]

config_proc = _run_subprocess(config_args, timeout)
if config_proc["exit_code"] != 0:
Expand Down Expand Up @@ -516,6 +519,7 @@ def run_model(
device: str,
timeout: int,
onnx_paths: dict[str, str] | None = None,
ep: str | None = None,
) -> dict:
"""Execute winml perf for one or more ONNX models. Returns merged result dict.

Expand All @@ -537,6 +541,8 @@ def run_model(
]
if entry.task:
args += ["--task", entry.task]
if ep:
args += ["--ep", ep]
args += ["--iterations", "10", "--warmup", "2"]
args += entry.perf_args

Expand Down Expand Up @@ -565,6 +571,8 @@ def run_model(
safe_print(f" perf: {label}")

args = [*WINML_CLI, "perf", "-m", path, "--device", device]
if ep:
args += ["--ep", ep]
args += ["--iterations", "10", "--warmup", "2"]
args += entry.perf_args

Expand Down Expand Up @@ -670,6 +678,7 @@ def _run_winml_eval(
ds_config: dict,
model_dir: Path,
onnx_path: str | None = None,
ep: str | None = None,
) -> dict:
"""Invoke winml eval for one model. Returns process result + parsed metric."""
output_path = model_dir / "winml_eval_output.json"
Expand Down Expand Up @@ -699,6 +708,8 @@ def _run_winml_eval(
]
if entry.task:
args += ["--task", entry.task]
if ep:
args += ["--ep", ep]
# When ds_config is provided, pass explicit dataset args;
# otherwise winml eval uses its built-in task defaults.
if ds_config.get("dataset"):
Expand Down Expand Up @@ -857,14 +868,15 @@ def _run_accuracy_phase(
timeout: int,
model_dir: Path,
onnx_path: str | None = None,
ep: str | None = None,
) -> dict:
"""Run winml eval + pytorch baseline for one model. Returns accuracy sub-section dict."""
ds_config = get_dataset_config(entry.hf_id, entry.task) or {}

# Build local dataset if a build_script is configured
_build_dataset(ds_config, timeout)

winml = _run_winml_eval(entry, device, timeout, ds_config, model_dir, onnx_path)
winml = _run_winml_eval(entry, device, timeout, ds_config, model_dir, onnx_path, ep=ep)

# Check baseline cache before running the expensive PyTorch baseline
cached = _lookup_baseline_cache(entry.hf_id, entry.task, ds_config)
Expand Down Expand Up @@ -1012,6 +1024,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument("--model-type", help="Filter by model_type")
parser.add_argument("--group", help="Filter by group")
parser.add_argument("--device", default="auto", help="Target device (default: auto)")
parser.add_argument("--ep", default=None, help="Execution provider (e.g. qnn, dml, ov)")
parser.add_argument(
"--timeout", type=int, default=600, help="Per-subprocess timeout in seconds (default: 600)"
)
Expand Down Expand Up @@ -1194,7 +1207,10 @@ def main() -> None:
retry_types = {t.upper() for t in args.retry_failed} if args.retry_failed else set()

safe_print(f"E2E Evaluation: {len(entries)} models -> {output_dir}")
safe_print(f"Device: {args.device} | Timeout: {args.timeout}s | Eval: {args.eval_type}")
ep_label = args.ep or "auto"
safe_print(
f"Device: {args.device} | EP: {ep_label} | Timeout: {args.timeout}s | Eval: {args.eval_type}"
)
safe_print(f"Disk free: {_get_disk_free_gb():.1f} GB")
if args.clean_cache:
safe_print("Cache cleanup: ON (caches + temp files cleaned after each model)")
Expand Down Expand Up @@ -1287,6 +1303,7 @@ def main() -> None:
_DEFAULT_PRECISION,
args.timeout,
model_dir,
ep=args.ep,
)
onnx_paths = build_result["onnx_paths"] if build_result["success"] else {}
# Composite models produce multiple ONNX paths; accuracy phase requires a
Expand Down Expand Up @@ -1316,20 +1333,21 @@ def main() -> None:
)
accuracy_result = {"skipped": True, "skip_reason": "composite_model_not_supported"}
if args.eval_type == "both":
perf_proc = run_model(entry, args.device, args.timeout, onnx_paths)
perf_proc = run_model(entry, args.device, args.timeout, onnx_paths, ep=args.ep)
elif args.eval_type == "accuracy":
accuracy_result = _run_accuracy_phase(
entry,
args.device,
args.timeout,
model_dir,
first_path,
ep=args.ep,
)
elif args.eval_type == "perf":
perf_proc = run_model(entry, args.device, args.timeout, onnx_paths)
perf_proc = run_model(entry, args.device, args.timeout, onnx_paths, ep=args.ep)
else:
# "both": perf → eval
perf_proc = run_model(entry, args.device, args.timeout, onnx_paths)
perf_proc = run_model(entry, args.device, args.timeout, onnx_paths, ep=args.ep)
if perf_proc["exit_code"] != 0:
accuracy_result = {"skipped": True, "skip_reason": "perf_failed"}
else:
Expand All @@ -1339,6 +1357,7 @@ def main() -> None:
args.timeout,
model_dir,
first_path,
ep=args.ep,
)

except KeyboardInterrupt:
Expand Down
91 changes: 65 additions & 26 deletions scripts/e2e_eval/run_sa_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,8 @@ def stage2_sa_pre(
model_dir: Path,
graph_opt_path: Path,
use_cache: bool,
ep: str = "QNNExecutionProvider",
device: str = "NPU",
) -> tuple[dict[str, str], dict, list[dict]] | None:
"""Run SA with information on graph_optimized.onnx.

Expand All @@ -172,15 +174,15 @@ def stage2_sa_pre(

if use_cache and is_cached(sa_pre_path) and is_cached(optim_record_path):
safe_print(" [Stage 2] SA pre-check (cached)")
classifications = parse_sa_json(sa_pre_path)
classifications = parse_sa_json(sa_pre_path, ep=ep)
optim_record = json.loads(optim_record_path.read_text(encoding="utf-8"))
optim_config = optim_record.get("optim_config", {})
info_items = optim_record.get("info_items", [])
else:
safe_print(" [Stage 2] Running SA pre-check (with recommendations)...")
try:
classifications, optim_config, info_items = run_sa_with_info(
graph_opt_path, sa_pre_path
graph_opt_path, sa_pre_path, ep=ep, device=device
)
except Exception as e:
safe_print(f" [ERROR] SA pre-check failed: {e}")
Expand Down Expand Up @@ -242,6 +244,8 @@ def stage4_sa_post(
model_dir: Path,
sa_opt_path: Path,
use_cache: bool,
ep: str = "QNNExecutionProvider",
device: str = "NPU",
) -> tuple[dict[str, str], list[dict]] | None:
"""Run SA on sa_optimized.onnx.

Expand All @@ -251,12 +255,14 @@ def stage4_sa_post(

if use_cache and is_cached(sa_post_path):
safe_print(" [Stage 4] SA post-check (cached)")
classifications = parse_sa_json(sa_post_path)
classifications = parse_sa_json(sa_post_path, ep=ep)
info_items = []
else:
safe_print(" [Stage 4] Running SA post-check...")
try:
classifications, _, info_items = run_sa_with_info(sa_opt_path, sa_post_path)
classifications, _, info_items = run_sa_with_info(
sa_opt_path, sa_post_path, ep=ep, device=device
)
except Exception as e:
safe_print(f" [ERROR] SA post-check failed: {e}")
return None
Expand All @@ -274,22 +280,30 @@ def stage4_sa_post(
return classifications, info_items


def _run_compile(onnx_path: Path, output_dir: Path) -> tuple[int, str]:
"""Run wmk compile --device npu --no-quantize. Returns (rc, stderr_tail)."""
def _run_compile(
onnx_path: Path,
output_dir: Path,
device: str = "npu",
ep: str | None = None,
) -> tuple[int, str]:
"""Run wmk compile --device <device> --no-quantize. Returns (rc, stderr_tail)."""
cmd = [
sys.executable,
"-m",
"winml.modelkit.cli",
"compile",
"--model",
str(onnx_path),
"--device",
device,
"--no-quantize",
"--output-dir",
str(output_dir),
]
if ep:
cmd += ["--ep", ep]
result = subprocess.run( # noqa: S603
[
sys.executable,
"-m",
"winml.modelkit.cli",
"compile",
"--model",
str(onnx_path),
"--device",
"npu",
"--no-quantize",
"--output-dir",
str(output_dir),
],
cmd,
capture_output=True,
text=True,
encoding="utf-8",
Expand All @@ -306,8 +320,10 @@ def _compile_and_diff(
sa_predictions: dict[str, str],
model_dir: Path,
use_cache: bool,
device: str = "npu",
ep: str | None = None,
) -> dict | None:
"""Compile an ONNX with QNN NPU and compare against SA predictions.
"""Compile an ONNX and compare against SA predictions.

Args:
label: Log prefix, e.g. "5a (pre)" or "5b (post)".
Expand All @@ -324,8 +340,8 @@ def _compile_and_diff(
if use_cache and is_cached(compiled_path):
safe_print(f" [Stage {label}] Compile (cached): {compiled_path.name}")
else:
safe_print(f" [Stage {label}] Compiling {onnx_path.name} → QNN EPContext...")
rc, _ = _run_compile(onnx_path, model_dir)
safe_print(f" [Stage {label}] Compiling {onnx_path.name} → EPContext...")
rc, _ = _run_compile(onnx_path, model_dir, device=device, ep=ep)
if rc != 0 or not is_cached(compiled_path):
safe_print(f" [Stage {label}] Compile failed (rc={rc}) — skipping diff")
return None
Expand All @@ -352,6 +368,8 @@ def stage5_compile_and_diff(
sa_pre: dict[str, str],
sa_post: dict[str, str],
use_cache: bool,
device: str = "npu",
ep: str | None = None,
) -> tuple[dict | None, dict | None]:
"""Stage 5: compile both graph_optimized and sa_optimized, diff each vs its SA.

Expand All @@ -367,6 +385,8 @@ def stage5_compile_and_diff(
sa_pre,
model_dir,
use_cache,
device=device,
ep=ep,
)
diff_post = _compile_and_diff(
"5b (post)",
Expand All @@ -375,6 +395,8 @@ def stage5_compile_and_diff(
sa_post,
model_dir,
use_cache,
device=device,
ep=ep,
)
return diff_pre, diff_post

Expand All @@ -388,6 +410,8 @@ def evaluate_model(
model_entry: dict,
output_dir: Path,
use_cache: bool,
ep: str = "QNNExecutionProvider",
device: str = "NPU",
) -> dict | None:
"""Run the 4+1 stage SA eval pipeline for a single model."""
hf_id = model_entry["hf_id"]
Expand All @@ -410,7 +434,7 @@ def evaluate_model(
return _skip_result(hf_id, task, model_type, skip_reason or "SKIP_EXPORT", model_dir)

# Stage 2
pre_result = stage2_sa_pre(model_dir, graph_opt_path, use_cache)
pre_result = stage2_sa_pre(model_dir, graph_opt_path, use_cache, ep=ep, device=device)
if pre_result is None:
return _skip_result(hf_id, task, model_type, "SKIP_SA_PRE", model_dir)
sa_pre, optim_config, pre_info_items = pre_result
Expand All @@ -421,14 +445,21 @@ def evaluate_model(
return _skip_result(hf_id, task, model_type, "SKIP_OPTIM", model_dir)

# Stage 4
post_result = stage4_sa_post(model_dir, sa_opt_path, use_cache)
post_result = stage4_sa_post(model_dir, sa_opt_path, use_cache, ep=ep, device=device)
if post_result is None:
return _skip_result(hf_id, task, model_type, "SKIP_SA_POST", model_dir)
sa_post, post_info_items = post_result

# Stage 5: compile both ONNXes → EPContext diff pre and post
epcontext_diff_pre, epcontext_diff_post = stage5_compile_and_diff(
model_dir, graph_opt_path, sa_opt_path, sa_pre, sa_post, use_cache
model_dir,
graph_opt_path,
sa_opt_path,
sa_pre,
sa_post,
use_cache,
device=device.lower(),
ep=ep,
)

elapsed = time.monotonic() - t0
Expand Down Expand Up @@ -654,6 +685,12 @@ def main() -> None:
action="store_true",
help="Skip stages whose output artifacts already exist",
)
parser.add_argument(
"--ep",
default="QNNExecutionProvider",
help="Execution provider (default: QNNExecutionProvider)",
)
parser.add_argument("--device", default="NPU", help="Target device (default: NPU)")
args = parser.parse_args()

output_dir = args.output_dir or Path(f"sa_eval_results/{date.today().isoformat()}")
Expand All @@ -680,7 +717,9 @@ def main() -> None:

for i, entry in enumerate(models_to_run, 1):
safe_print(f"\n[{i}/{len(models_to_run)}]")
result = evaluate_model(entry, output_dir, use_cache=args.use_cache)
result = evaluate_model(
entry, output_dir, use_cache=args.use_cache, ep=args.ep, device=args.device
)
if result:
all_results.append(result)

Expand Down
Loading
Loading