From b6f8a941872d688158bd29c21cd2acc23ac06b2d Mon Sep 17 00:00:00 2001 From: yuesu Date: Tue, 31 Mar 2026 15:57:01 +0800 Subject: [PATCH 01/14] change error to warning --- .pipelines/Modelkit E2E Test.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml index ea7ba9512..b3e42b13c 100644 --- a/.pipelines/Modelkit E2E Test.yml +++ b/.pipelines/Modelkit E2E Test.yml @@ -128,8 +128,9 @@ jobs: } & uv @uvArgs - if ($LASTEXITCODE -ne 0) { - throw "Model evaluation failed for $(hf_id) / $(hf_task)" + $evalExit = $LASTEXITCODE + if ($evalExit -ne 0) { + Write-Warning "Model eval exited with code $evalExit for $(hf_id) / $(hf_task)" } workingDirectory: $(Build.SourcesDirectory) displayName: 'Run eval for current model' From 82cf2eaabc4bc9c8f2f079520742689eb88799b2 Mon Sep 17 00:00:00 2001 From: yuesu Date: Tue, 31 Mar 2026 16:39:41 +0800 Subject: [PATCH 02/14] refine --- .pipelines/Modelkit E2E Test.yml | 3 +- scripts/e2e_eval/run_eval.py | 152 ++++++++++++++++++------------- 2 files changed, 92 insertions(+), 63 deletions(-) diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml index b3e42b13c..5e90f822d 100644 --- a/.pipelines/Modelkit E2E Test.yml +++ b/.pipelines/Modelkit E2E Test.yml @@ -130,8 +130,9 @@ jobs: & uv @uvArgs $evalExit = $LASTEXITCODE if ($evalExit -ne 0) { - Write-Warning "Model eval exited with code $evalExit for $(hf_id) / $(hf_task)" + Write-Warning "Model eval exited with code $evalExit for $(hf_id) / $(hf_task) (model failure — non-blocking)" } + exit 0 workingDirectory: $(Build.SourcesDirectory) displayName: 'Run eval for current model' diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index 1aa6b682f..53c941025 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + """E2E evaluation runner — unified perf + accuracy. Batch-runs wmk perf (and optionally wmk eval + pytorch baseline) for models @@ -102,10 +107,10 @@ def _get_timeout_skip_reason(hf_id: str, task: str) -> str: # Patterns that indicate the disk is full (cross-platform). _NO_SPACE_PATTERNS = ( - "no space left on device", # Linux/macOS OSError - "oserror: [errno 28]", # Python errno string + "no space left on device", # Linux/macOS OSError + "oserror: [errno 28]", # Python errno string "there is not enough space on the disk", # Windows - "winerror 112", # Windows disk-full error code + "winerror 112", # Windows disk-full error code "disk full", ) @@ -167,7 +172,8 @@ def _kill_process_tree(pid: int) -> None: # Fallback: taskkill on Windows, killpg on Unix if platform.system() == "Windows": subprocess.run( # noqa: S603 - ["taskkill", "/F", "/T", "/PID", str(pid)], capture_output=True + ["taskkill", "/F", "/T", "/PID", str(pid)], # noqa: S607 + capture_output=True, ) else: import signal @@ -309,7 +315,11 @@ def _watchdog() -> None: def _run_build( - entry: ModelEntry, device: str, precision: str, timeout: int, model_dir: Path, + entry: ModelEntry, + device: str, + precision: str, + timeout: int, + model_dir: Path, ) -> dict: """Run wmk config + wmk build for one model. Returns build result dict. @@ -320,11 +330,16 @@ def _run_build( # Step 1: wmk config config_args = [ - *WMK, "config", - "-m", entry.hf_id, - "--device", device, - "--precision", precision, - "-o", str(config_path), + *WMK, + "config", + "-m", + entry.hf_id, + "--device", + device, + "--precision", + precision, + "-o", + str(config_path), ] if entry.task: config_args += ["--task", entry.task] @@ -340,9 +355,12 @@ def _run_build( # Step 2: wmk build --use-cache build_args = [ - *WMK, "build", - "-c", str(config_path), - "-m", entry.hf_id, + *WMK, + "build", + "-c", + str(config_path), + "-m", + entry.hf_id, "--use-cache", ] @@ -404,7 +422,10 @@ def _find_cached_model(hf_id: str, build_proc: dict) -> str | None: def run_model( - entry: ModelEntry, device: str, timeout: int, onnx_path: str | None = None, + entry: ModelEntry, + device: str, + timeout: int, + onnx_path: str | None = None, ) -> dict: """Execute wmk perf for one model. Returns raw subprocess result dict. @@ -415,8 +436,14 @@ def run_model( args = [*WMK, "perf", "-m", onnx_path, "--device", device] else: args = [ - *WMK, "perf", "-m", entry.hf_id, - "--device", device, "--precision", _DEFAULT_PRECISION, + *WMK, + "perf", + "-m", + entry.hf_id, + "--device", + device, + "--precision", + _DEFAULT_PRECISION, ] if entry.task: args += ["--task", entry.task] @@ -516,14 +543,23 @@ def _run_wmk_eval( eval_device = "npu" if device == "auto" else device if onnx_path: args = [ - *WMK, "eval", "-m", onnx_path, - "--model-id", entry.hf_id, - "--device", eval_device, + *WMK, + "eval", + "-m", + onnx_path, + "--model-id", + entry.hf_id, + "--device", + eval_device, ] else: args = [ - *WMK, "eval", "-m", entry.hf_id, - "--device", eval_device, + *WMK, + "eval", + "-m", + entry.hf_id, + "--device", + eval_device, ] if entry.task: args += ["--task", entry.task] @@ -550,14 +586,9 @@ def _run_wmk_eval( metric = None if proc["exit_code"] == 0 and output_path.exists(): - wmk_key = ( - ds_config.get("wmk_metric_key") - or ds_config.get("metric", "accuracy") - ) + wmk_key = ds_config.get("wmk_metric_key") or ds_config.get("metric", "accuracy") num_samples = ds_config.get("num_samples", _DEFAULT_SAMPLES) - metric = _parse_metric_from_wmk_output( - output_path, wmk_key, num_samples - ) + metric = _parse_metric_from_wmk_output(output_path, wmk_key, num_samples) status = "PASS" if (proc["exit_code"] == 0 and metric is not None) else "FAIL" return { @@ -606,9 +637,7 @@ def _save_baseline_cache(cache: dict) -> None: ) -def _lookup_baseline_cache( - hf_id: str, task: str, ds_config: dict -) -> dict | None: +def _lookup_baseline_cache(hf_id: str, task: str, ds_config: dict) -> dict | None: """Return cached baseline result dict, or None if not cached.""" cache = _load_baseline_cache() key = _baseline_cache_key(hf_id, task, ds_config) @@ -631,9 +660,7 @@ def _shorten_command(cmd: str) -> str: return " ".join(shortened) -def _store_baseline_cache( - hf_id: str, task: str, ds_config: dict, result: dict -) -> None: +def _store_baseline_cache(hf_id: str, task: str, ds_config: dict, result: dict) -> None: """Store a successful baseline result in cache.""" if result.get("status") != "PASS": return @@ -758,8 +785,10 @@ def save_environment_info(path: Path) -> None: # Git HEAD commit info try: result = subprocess.run( - ["git", "log", "-1", "--format=%H%n%s%n%ai"], # noqa: S603, S607 - capture_output=True, text=True, timeout=5, + ["git", "log", "-1", "--format=%H%n%s%n%ai"], # noqa: S607 + capture_output=True, + text=True, + timeout=5, ) if result.returncode == 0: lines = result.stdout.strip().splitlines() @@ -782,17 +811,6 @@ def _get_disk_free_gb() -> float: return shutil.disk_usage(anchor).free / (1024**3) -def _clean_model_hf_cache(hf_id: str) -> None: - """Delete cached HuggingFace files for a specific model.""" - slug = f"models--{hf_id.replace('/', '--')}" - cache_path = HF_CACHE_DIR / slug - if not cache_path.exists(): - return - size_mb = sum(f.stat().st_size for f in cache_path.rglob("*") if f.is_file()) / (1024 * 1024) - shutil.rmtree(cache_path, ignore_errors=True) - safe_print(f" [cache] Cleaned {slug} ({size_mb:.0f} MB freed)") - - def model_result_dir(output_dir: Path, hf_id: str, task: str = "") -> Path: """Convert model ID + task to directory slug.""" slug = hf_id.replace("/", "__") @@ -807,6 +825,7 @@ def model_result_dir(output_dir: Path, hf_id: str, task: str = "") -> Path: def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" parser = argparse.ArgumentParser(description="E2E evaluation runner — unified perf + accuracy") parser.add_argument( "--registry", @@ -878,6 +897,7 @@ def parse_args() -> argparse.Namespace: def main() -> None: + """Run the E2E evaluation pipeline.""" args = parse_args() # 1. Load registry @@ -928,13 +948,7 @@ def main() -> None: safe_print(f"Registry: {len(entries)} models (eval-type: {args.eval_type})") for e in entries: ds = get_dataset_config(e.hf_id, e.task) - skip_acc = ( - "" - if args.eval_type == "perf" - else " [task_default]" - if ds is None - else "" - ) + skip_acc = "" if args.eval_type == "perf" else " [task_default]" if ds is None else "" safe_print( f" [{e.priority}] {e.hf_id} / {e.task} ({e.model_type}, {e.group}){skip_acc}" ) @@ -1010,9 +1024,7 @@ def main() -> None: # Timeout skip list: skip known-timeout models and write a TIMEOUT result if (entry.hf_id, entry.task or "") in timeout_skip_set: reason = _get_timeout_skip_reason(entry.hf_id, entry.task or "") - safe_print( - f"\n[{i}/{len(entries)}] {label} (SKIP - TIMEOUT: {reason})" - ) + safe_print(f"\n[{i}/{len(entries)}] {label} (SKIP - TIMEOUT: {reason})") model_dir.mkdir(parents=True, exist_ok=True) timeout_result = build_eval_result( entry=entry, @@ -1090,7 +1102,11 @@ def main() -> None: onnx_path: str | None = None if args.eval_type in ("perf", "both"): build_result = _run_build( - entry, args.device, _DEFAULT_PRECISION, args.timeout, model_dir, + entry, + args.device, + _DEFAULT_PRECISION, + args.timeout, + model_dir, ) if build_result["success"]: onnx_path = build_result["onnx_path"] @@ -1098,12 +1114,20 @@ def main() -> None: if args.eval_type == "accuracy": # Accuracy-only: build + eval (no perf) build_result = _run_build( - entry, args.device, _DEFAULT_PRECISION, args.timeout, model_dir, + entry, + args.device, + _DEFAULT_PRECISION, + args.timeout, + model_dir, ) if build_result["success"]: onnx_path = build_result["onnx_path"] accuracy_result = _run_accuracy_phase( - entry, args.device, args.timeout, model_dir, onnx_path, + entry, + args.device, + args.timeout, + model_dir, + onnx_path, ) else: accuracy_result = {"skipped": True, "skip_reason": "build_failed"} @@ -1124,7 +1148,11 @@ def main() -> None: accuracy_result = {"skipped": True, "skip_reason": "perf_failed"} else: accuracy_result = _run_accuracy_phase( - entry, args.device, args.timeout, model_dir, onnx_path, + entry, + args.device, + args.timeout, + model_dir, + onnx_path, ) else: # Build failed @@ -1169,7 +1197,7 @@ def main() -> None: safe_print(f" [acc only]{acc_tag}") if args.clean_hf_cache: - _clean_model_hf_cache(entry.hf_id) + _clear_disk_caches() run_duration = time.perf_counter() - run_start From 6f1963be3725233ce6120b38402ac7c4aeb33db6 Mon Sep 17 00:00:00 2001 From: yuesu Date: Wed, 1 Apr 2026 16:02:19 +0800 Subject: [PATCH 03/14] add date variable --- .pipelines/Modelkit E2E Test.yml | 61 ++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml index 5e90f822d..99a3080d7 100644 --- a/.pipelines/Modelkit E2E Test.yml +++ b/.pipelines/Modelkit E2E Test.yml @@ -1,5 +1,11 @@ trigger: none +parameters: + - name: evalDate + displayName: 'Eval date (leave empty for today, e.g. 2026-04-01)' + type: string + default: '' + variables: evalOutputBase: 'c:/eval_results' @@ -43,7 +49,8 @@ jobs: displayName: 'Install dependencies' - powershell: | - $evalDate = Get-Date -Format 'yyyy-MM-dd' + $evalDate = '${{ parameters.evalDate }}' + if (-not $evalDate) { $evalDate = Get-Date -Format 'yyyy-MM-dd' } $dir = "$(evalOutputBase)/$evalDate" Write-Host "##vso[task.setvariable variable=EVAL_DIR;isOutput=true]$dir" Write-Host "Eval output directory: $dir" @@ -86,7 +93,8 @@ jobs: - job: EvalModel displayName: 'Run Model Eval' dependsOn: Prepare - timeoutInMinutes: 2400 + timeoutInMinutes: 90 + cancelTimeoutInMinutes: 2 pool: name: modelkit-selfhost-pool demands: @@ -136,11 +144,60 @@ jobs: workingDirectory: $(Build.SourcesDirectory) displayName: 'Run eval for current model' + - job: RetryMissed + displayName: 'Retry Interrupted Models' + dependsOn: + - Prepare + - EvalModel + condition: always() + timeoutInMinutes: 180 + cancelTimeoutInMinutes: 5 + pool: + name: modelkit-selfhost-pool + demands: + - Agent.Name -equals NPU-QNN + variables: + EVAL_DIR: $[ dependencies.Prepare.outputs['set_output_dir.EVAL_DIR'] ] + + steps: + - checkout: none + + - powershell: | + $uvBin = "$env:USERPROFILE\.local\bin" + $venvDir = "$(Build.SourcesDirectory)\.venv\Scripts" + Write-Host "##vso[task.prependpath]$uvBin" + Write-Host "##vso[task.prependpath]$venvDir" + displayName: 'Activate Python environment' + + - powershell: | + Write-Host "Retrying any models missing eval_result.json in $(EVAL_DIR)" + + $uvArgs = @( + "run", "--no-sync", "python", "scripts/e2e_eval/run_eval.py", + "--output-dir", "$(EVAL_DIR)", + "--device", "npu", + "--continue", + "--verbose", + "--timeout", "1800", + "--no-report", + "--clean-hf-cache" + ) + + & uv @uvArgs + $retryExit = $LASTEXITCODE + if ($retryExit -ne 0) { + Write-Warning "Retry pass exited with code $retryExit (non-blocking)" + } + exit 0 + workingDirectory: $(Build.SourcesDirectory) + displayName: 'Re-run models with missing results' + - job: Report displayName: 'Generate Eval Report' dependsOn: - Prepare - EvalModel + - RetryMissed condition: always() pool: name: modelkit-selfhost-pool From 2f32a7d1df29a07cb59c5060f494a03b30003ffe Mon Sep 17 00:00:00 2001 From: yuesu Date: Thu, 2 Apr 2026 11:50:12 +0800 Subject: [PATCH 04/14] add --continue for list model --- .pipelines/Modelkit E2E Test.yml | 28 +++++-- scripts/e2e_eval/run_eval.py | 126 +++++++++++++++++++++++-------- 2 files changed, 116 insertions(+), 38 deletions(-) diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml index 99a3080d7..22c914422 100644 --- a/.pipelines/Modelkit E2E Test.yml +++ b/.pipelines/Modelkit E2E Test.yml @@ -5,6 +5,10 @@ parameters: displayName: 'Eval date (leave empty for today, e.g. 2026-04-01)' type: string default: '' + - name: continueRun + displayName: 'Skip already-evaluated models (--continue)' + type: boolean + default: true variables: evalOutputBase: 'c:/eval_results' @@ -57,10 +61,16 @@ jobs: name: set_output_dir displayName: 'Set eval output directory' - - script: > - uv run python scripts/e2e_eval/run_eval.py - --list-json temp/model_list.json - --device npu + - powershell: | + $args = @( + "run", "python", "scripts/e2e_eval/run_eval.py", + "--list-json", "temp/model_list.json", + "--device", "npu" + ) + if ('${{ parameters.continueRun }}' -eq 'True') { + $args += @("--continue", "--output-dir", "$(set_output_dir.EVAL_DIR)") + } + & uv @args workingDirectory: $(Build.SourcesDirectory) displayName: 'Generate model list' @@ -68,7 +78,10 @@ jobs: $models = Get-Content "$(Build.SourcesDirectory)/temp/model_list.json" | ConvertFrom-Json $total = $models.Count if ($total -eq 0) { - throw "No models found in temp/model_list.json" + Write-Host "All models already evaluated — nothing to run" + Write-Host "##vso[task.setvariable variable=modelMatrix;isOutput=true]{}" + Write-Host "##vso[task.setvariable variable=skipEval;isOutput=true]true" + return } $matrix = @{} @@ -93,6 +106,7 @@ jobs: - job: EvalModel displayName: 'Run Model Eval' dependsOn: Prepare + condition: and(succeeded(), ne(dependencies.Prepare.outputs['set_matrix.skipEval'], 'true')) timeoutInMinutes: 90 cancelTimeoutInMinutes: 2 pool: @@ -129,7 +143,7 @@ jobs: "--verbose", "--timeout", "1800", "--no-report", - "--clean-hf-cache" + "--clean-cache" ) if ("$(hf_task)") { $uvArgs += @("--task", "$(hf_task)") @@ -180,7 +194,7 @@ jobs: "--verbose", "--timeout", "1800", "--no-report", - "--clean-hf-cache" + "--clean-cache" ) & uv @uvArgs diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index 53c941025..5538cca00 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -36,6 +36,7 @@ import shutil import subprocess import sys +import tempfile import threading import time from datetime import date, datetime, timezone @@ -116,6 +117,8 @@ def _get_timeout_skip_reason(hf_id: str, task: str) -> str: _HF_CACHE = Path.home() / ".cache" / "huggingface" _WML_CACHE = Path.home() / ".cache" / "winml" +_TEMP_DIR = Path(os.environ.get("TEMP", os.environ.get("TMP", tempfile.gettempdir()))) +_TEMP_PREFIXES = ("wmk_", "tmp", "modelkit_compat_") def _is_no_space_error(proc: dict) -> bool: @@ -125,17 +128,39 @@ def _is_no_space_error(proc: dict) -> bool: def _clear_disk_caches() -> None: - """Delete HuggingFace and WML cache directories to free disk space.""" + """Delete HuggingFace, WML cache directories and leaked temp files.""" for cache_dir in (_HF_CACHE, _WML_CACHE): if cache_dir.exists(): - safe_print(f" [disk-full] Removing cache: {cache_dir}") + safe_print(f" [cleanup] Removing cache: {cache_dir}") try: shutil.rmtree(cache_dir) - safe_print(f" [disk-full] Removed: {cache_dir}") + safe_print(f" [cleanup] Removed: {cache_dir}") except OSError as exc: - safe_print(f" [disk-full] Warning: could not remove {cache_dir}: {exc}") - else: - safe_print(f" [disk-full] Cache not found (skipping): {cache_dir}") + safe_print(f" [cleanup] Warning: could not remove {cache_dir}: {exc}") + + # Clean leaked temp directories/files (wmk_*, modelkit_compat_*, tmp*.onnx) + if _TEMP_DIR.is_dir(): + cleaned = 0 + for entry in _TEMP_DIR.iterdir(): + name = entry.name + if not any(name.startswith(p) for p in _TEMP_PREFIXES): + continue + is_leaked = ( + entry.is_dir() + or entry.suffix in (".onnx", ".out", ".err") + or name.endswith(".onnx.data") + ) + if is_leaked: + try: + if entry.is_dir(): + shutil.rmtree(entry) + else: + entry.unlink() + cleaned += 1 + except OSError: + pass + if cleaned: + safe_print(f" [cleanup] Removed {cleaned} leaked temp entries from {_TEMP_DIR}") def safe_print(text: str) -> None: @@ -811,6 +836,32 @@ def _get_disk_free_gb() -> float: return shutil.disk_usage(anchor).free / (1024**3) +def _should_skip_existing(existing: dict, retry_types: set[str] | None, eval_type: str) -> bool: + """Return True if an existing eval_result should be skipped (not re-run). + + Used by both --list-json and the main eval loop to share continue/retry logic. + """ + if retry_types is None: + return True # --continue without --retry-failed: skip all existing + + perf = existing.get("perf") or {} + acc = existing.get("accuracy") + + # Check perf failure (only when perf ran) + if eval_type != "accuracy" and not perf.get("passed"): + cls = classify_result(existing) or "UNKNOWN" + if not retry_types or cls in retry_types: + return False # Should retry + + # Check accuracy verdict + if acc is not None and not acc.get("skipped"): + verdict = derive_verdict(acc).value + if not retry_types or verdict in retry_types: + return False # Should retry + + return True # No retry criteria matched — skip + + def model_result_dir(output_dir: Path, hf_id: str, task: str = "") -> Path: """Convert model ID + task to directory slug.""" slug = hf_id.replace("/", "__") @@ -854,9 +905,11 @@ def parse_args() -> argparse.Namespace: "--timeout", type=int, default=600, help="Per-subprocess timeout in seconds (default: 600)" ) parser.add_argument( + "--clean-cache", "--clean-hf-cache", + dest="clean_cache", action="store_true", - help="Delete HuggingFace hub cache for each model after evaluation (saves disk space)", + help="Delete caches and leaked temp files after each model evaluation (saves disk space)", ) parser.add_argument("--list", action="store_true", help="List filtered models and exit") parser.add_argument( @@ -956,6 +1009,34 @@ def main() -> None: # --list-json mode: write machine-readable JSON and exit if args.list_json: + # --continue / --retry-failed: filter out already-evaluated models + if args.continue_run or args.retry_failed is not None: + output_dir = args.output_dir or Path(f"eval_results/{date.today().isoformat()}") + retry_types: set[str] | None = None + if args.retry_failed is not None: + args.continue_run = True + retry_types = {t.upper() for t in args.retry_failed} if args.retry_failed else set() + + filtered: list[ModelEntry] = [] + skipped_count = 0 + for e in entries: + result_path = model_result_dir(output_dir, e.hf_id, e.task) / "eval_result.json" + if args.continue_run and result_path.exists(): + try: + existing = load_result_json(result_path) + if _should_skip_existing(existing, retry_types, args.eval_type): + skipped_count += 1 + continue + except Exception: + pass # Corrupt result file — include model for re-evaluation + filtered.append(e) + if skipped_count: + safe_print( + f"--continue: skipped {skipped_count} already-evaluated models " + f"(output_dir: {output_dir})" + ) + entries = filtered + model_list = [ { "hf_id": e.hf_id, @@ -997,8 +1078,8 @@ def main() -> None: safe_print(f"E2E Evaluation: {len(entries)} models -> {output_dir}") safe_print(f"Device: {args.device} | Timeout: {args.timeout}s | Eval: {args.eval_type}") safe_print(f"Disk free: {_get_disk_free_gb():.1f} GB") - if args.clean_hf_cache: - safe_print("Cache cleanup: ON (HF cache cleaned after each model)") + if args.clean_cache: + safe_print("Cache cleanup: ON (caches + temp files cleaned after each model)") if retry_types is not None: if retry_types: safe_print(f"Retry mode: {', '.join(sorted(retry_types))}") @@ -1049,29 +1130,12 @@ def main() -> None: if args.continue_run and result_path.exists(): try: existing = load_result_json(result_path) - skip = True - - perf = existing.get("perf") or {} - acc = existing.get("accuracy") - - # Derive current classification / verdict to check against retry types - if retry_types is not None: - should_retry = False - # Check perf failure (only when perf ran) - if args.eval_type != "accuracy" and not perf.get("passed"): - cls = classify_result(existing) or "UNKNOWN" - if not retry_types or cls in retry_types: - should_retry = True - # Check accuracy verdict - if not should_retry and acc is not None and not acc.get("skipped"): - verdict = derive_verdict(acc).value - if not retry_types or verdict in retry_types: - should_retry = True - skip = not should_retry - - if skip: + + if _should_skip_existing(existing, retry_types, args.eval_type): results.append(existing) skipped += 1 + perf = existing.get("perf") or {} + acc = existing.get("accuracy") perf_cls = classify_result(existing) or "UNKNOWN" perf_tag = "PASS" if perf.get("passed") else f"FAIL/{perf_cls}" acc_tag = "" @@ -1196,7 +1260,7 @@ def main() -> None: else: safe_print(f" [acc only]{acc_tag}") - if args.clean_hf_cache: + if args.clean_cache: _clear_disk_caches() run_duration = time.perf_counter() - run_start From 8967738caa41d8c7cb9aaacb3c52e860f5306a40 Mon Sep 17 00:00:00 2001 From: yuesu Date: Thu, 2 Apr 2026 13:37:21 +0800 Subject: [PATCH 05/14] update clean --- scripts/e2e_eval/run_eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index 5538cca00..fe7a65fd8 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -118,7 +118,7 @@ def _get_timeout_skip_reason(hf_id: str, task: str) -> str: _HF_CACHE = Path.home() / ".cache" / "huggingface" _WML_CACHE = Path.home() / ".cache" / "winml" _TEMP_DIR = Path(os.environ.get("TEMP", os.environ.get("TMP", tempfile.gettempdir()))) -_TEMP_PREFIXES = ("wmk_", "tmp", "modelkit_compat_") +_TEMP_PREFIXES = ("wmk_", "modelkit_compat_") def _is_no_space_error(proc: dict) -> bool: @@ -151,6 +151,7 @@ def _clear_disk_caches() -> None: or name.endswith(".onnx.data") ) if is_leaked: + safe_print(f" [cleanup] Leaked temp: {entry} (dir={entry.is_dir()})") try: if entry.is_dir(): shutil.rmtree(entry) From 088d82f08663a8d6883934cfb29e6e66bdd4c982 Mon Sep 17 00:00:00 2001 From: yuesu Date: Thu, 2 Apr 2026 13:47:34 +0800 Subject: [PATCH 06/14] update --- scripts/e2e_eval/run_eval.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index fe7a65fd8..e8761930f 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -138,20 +138,23 @@ def _clear_disk_caches() -> None: except OSError as exc: safe_print(f" [cleanup] Warning: could not remove {cache_dir}: {exc}") - # Clean leaked temp directories/files (wmk_*, modelkit_compat_*, tmp*.onnx) + # Clean leaked temp directories/files (wmk_*, modelkit_compat_*, tmp*.onnx*) if _TEMP_DIR.is_dir(): cleaned = 0 for entry in _TEMP_DIR.iterdir(): name = entry.name - if not any(name.startswith(p) for p in _TEMP_PREFIXES): - continue - is_leaked = ( - entry.is_dir() - or entry.suffix in (".onnx", ".out", ".err") - or name.endswith(".onnx.data") - ) - if is_leaked: - safe_print(f" [cleanup] Leaked temp: {entry} (dir={entry.is_dir()})") + should_clean = False + if any(name.startswith(p) for p in _TEMP_PREFIXES): + should_clean = ( + entry.is_dir() + or entry.suffix in (".onnx", ".out", ".err") + or name.endswith(".onnx.data") + ) + elif name.startswith("tmp") and name.endswith((".onnx", ".onnx.data")): + # Python tempfile creates tmp* prefixed files; only clean ONNX artifacts + should_clean = True + if should_clean: + safe_print(f" [cleanup] Leaked temp: {entry}") try: if entry.is_dir(): shutil.rmtree(entry) From c578b2255633a2bee34ea4829d1026cd2b6a35d1 Mon Sep 17 00:00:00 2001 From: yuesu Date: Thu, 2 Apr 2026 14:15:04 +0800 Subject: [PATCH 07/14] remove retry --- .pipelines/Modelkit E2E Test.yml | 48 -------------------------------- 1 file changed, 48 deletions(-) diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml index 22c914422..e359b20b5 100644 --- a/.pipelines/Modelkit E2E Test.yml +++ b/.pipelines/Modelkit E2E Test.yml @@ -158,54 +158,6 @@ jobs: workingDirectory: $(Build.SourcesDirectory) displayName: 'Run eval for current model' - - job: RetryMissed - displayName: 'Retry Interrupted Models' - dependsOn: - - Prepare - - EvalModel - condition: always() - timeoutInMinutes: 180 - cancelTimeoutInMinutes: 5 - pool: - name: modelkit-selfhost-pool - demands: - - Agent.Name -equals NPU-QNN - variables: - EVAL_DIR: $[ dependencies.Prepare.outputs['set_output_dir.EVAL_DIR'] ] - - steps: - - checkout: none - - - powershell: | - $uvBin = "$env:USERPROFILE\.local\bin" - $venvDir = "$(Build.SourcesDirectory)\.venv\Scripts" - Write-Host "##vso[task.prependpath]$uvBin" - Write-Host "##vso[task.prependpath]$venvDir" - displayName: 'Activate Python environment' - - - powershell: | - Write-Host "Retrying any models missing eval_result.json in $(EVAL_DIR)" - - $uvArgs = @( - "run", "--no-sync", "python", "scripts/e2e_eval/run_eval.py", - "--output-dir", "$(EVAL_DIR)", - "--device", "npu", - "--continue", - "--verbose", - "--timeout", "1800", - "--no-report", - "--clean-cache" - ) - - & uv @uvArgs - $retryExit = $LASTEXITCODE - if ($retryExit -ne 0) { - Write-Warning "Retry pass exited with code $retryExit (non-blocking)" - } - exit 0 - workingDirectory: $(Build.SourcesDirectory) - displayName: 'Re-run models with missing results' - - job: Report displayName: 'Generate Eval Report' dependsOn: From 1a11e2882ab2fa5cb938641076f82a76f86f0af9 Mon Sep 17 00:00:00 2001 From: yuesu Date: Thu, 2 Apr 2026 14:23:09 +0800 Subject: [PATCH 08/14] fix --- .pipelines/Modelkit E2E Test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml index e359b20b5..ab870e668 100644 --- a/.pipelines/Modelkit E2E Test.yml +++ b/.pipelines/Modelkit E2E Test.yml @@ -163,7 +163,6 @@ jobs: dependsOn: - Prepare - EvalModel - - RetryMissed condition: always() pool: name: modelkit-selfhost-pool From a85954f1342ebe9f9f561a8ba6b7d25d50d613be Mon Sep 17 00:00:00 2001 From: yuesu Date: Thu, 2 Apr 2026 15:39:53 +0800 Subject: [PATCH 09/14] clean .onnx.data file in temp --- src/winml/modelkit/optim/pipes/fusion.py | 6 ++++-- src/winml/modelkit/optim/pipes/graph.py | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/winml/modelkit/optim/pipes/fusion.py b/src/winml/modelkit/optim/pipes/fusion.py index 9b3e348b4..57d2c7ee6 100644 --- a/src/winml/modelkit/optim/pipes/fusion.py +++ b/src/winml/modelkit/optim/pipes/fusion.py @@ -301,6 +301,8 @@ def process(self, model: onnx.ModelProto, config: ORTFusionPipeConfig) -> onnx.M ) from e finally: - # Clean up temporary file + # Clean up temporary file and its external data sidecar if input_path: - Path(input_path).unlink(missing_ok=True) + p = Path(input_path) + p.unlink(missing_ok=True) + (p.parent / f"{p.name}.data").unlink(missing_ok=True) diff --git a/src/winml/modelkit/optim/pipes/graph.py b/src/winml/modelkit/optim/pipes/graph.py index fe6f9b4e8..ed3841c56 100644 --- a/src/winml/modelkit/optim/pipes/graph.py +++ b/src/winml/modelkit/optim/pipes/graph.py @@ -593,8 +593,9 @@ def process(self, model: onnx.ModelProto, config: ORTGraphPipeConfig) -> onnx.Mo ) from e finally: - # Clean up temporary files (always execute) - if input_file and input_file.exists(): - input_file.unlink(missing_ok=True) - if output_file and output_file.exists(): - output_file.unlink(missing_ok=True) + # Clean up temporary files and their external data sidecars + for tmp in (input_file, output_file): + if tmp is not None: + tmp.unlink(missing_ok=True) + data_sidecar = tmp.parent / f"{tmp.name}.data" + data_sidecar.unlink(missing_ok=True) From 74656ddcdb7c89de15fe197e9e304c6d17286254 Mon Sep 17 00:00:00 2001 From: yuesu Date: Thu, 2 Apr 2026 15:58:50 +0800 Subject: [PATCH 10/14] refine job name --- .pipelines/Modelkit E2E Test.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml index ab870e668..e163129ce 100644 --- a/.pipelines/Modelkit E2E Test.yml +++ b/.pipelines/Modelkit E2E Test.yml @@ -87,7 +87,8 @@ jobs: $matrix = @{} for ($i = 0; $i -lt $total; $i++) { $m = $models[$i] - $key = ($m.hf_id -replace '[^A-Za-z0-9]', '_') + $slug = ($m.hf_id -replace '[^A-Za-z0-9]', '_') + $key = "$($i + 1)_${slug}" $matrix[$key] = @{ hf_id = [string]$m.hf_id hf_task = [string]$m.task @@ -104,7 +105,7 @@ jobs: displayName: 'Create matrix variables' - job: EvalModel - displayName: 'Run Model Eval' + displayName: 'Eval' dependsOn: Prepare condition: and(succeeded(), ne(dependencies.Prepare.outputs['set_matrix.skipEval'], 'true')) timeoutInMinutes: 90 From 73baa304f1f8b662aa9c66279ad396fc476f5b8a Mon Sep 17 00:00:00 2001 From: yuesu Date: Thu, 2 Apr 2026 16:15:03 +0800 Subject: [PATCH 11/14] change number to task --- .pipelines/Modelkit E2E Test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml index e163129ce..362a95e51 100644 --- a/.pipelines/Modelkit E2E Test.yml +++ b/.pipelines/Modelkit E2E Test.yml @@ -87,8 +87,8 @@ jobs: $matrix = @{} for ($i = 0; $i -lt $total; $i++) { $m = $models[$i] - $slug = ($m.hf_id -replace '[^A-Za-z0-9]', '_') - $key = "$($i + 1)_${slug}" + $slug = (($m.hf_id + '_' + $m.task) -replace '[^A-Za-z0-9]', '_') + $key = $slug $matrix[$key] = @{ hf_id = [string]$m.hf_id hf_task = [string]$m.task From 6d9d6c9ce46f2c12f4911b646f63e6885d9a5aaa Mon Sep 17 00:00:00 2001 From: yuesu Date: Fri, 3 Apr 2026 11:52:27 +0800 Subject: [PATCH 12/14] add publish artifact --- .pipelines/Modelkit E2E Test.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml index 362a95e51..8bd498b5f 100644 --- a/.pipelines/Modelkit E2E Test.yml +++ b/.pipelines/Modelkit E2E Test.yml @@ -187,3 +187,9 @@ jobs: --input-dir $(EVAL_DIR) workingDirectory: $(Build.SourcesDirectory) displayName: 'Generate evaluation report' + + - task: PublishPipelineArtifact@1 + inputs: + targetPath: $(EVAL_DIR) + artifactName: EvalReport + displayName: 'Publish eval results as artifact' From 5f7d875dbe77fc7da8f9c07e9feecc6f2eafabc1 Mon Sep 17 00:00:00 2001 From: yuesu Date: Fri, 3 Apr 2026 12:09:41 +0800 Subject: [PATCH 13/14] remove --- scripts/e2e_eval/run_eval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index 487297f75..b85367ac1 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -915,7 +915,6 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--clean-cache", - "--clean-hf-cache", dest="clean_cache", action="store_true", help="Delete caches and leaked temp files after each model evaluation (saves disk space)", From 973caf70d69d2b72c2a5293d082e60e3599ab68b Mon Sep 17 00:00:00 2001 From: yuesu Date: Fri, 3 Apr 2026 16:23:03 +0800 Subject: [PATCH 14/14] update --- .pipelines/Modelkit E2E Test.yml | 5 +++++ scripts/e2e_eval/run_eval.py | 7 +------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml index 8bd498b5f..a42a84e0f 100644 --- a/.pipelines/Modelkit E2E Test.yml +++ b/.pipelines/Modelkit E2E Test.yml @@ -89,6 +89,11 @@ jobs: $m = $models[$i] $slug = (($m.hf_id + '_' + $m.task) -replace '[^A-Za-z0-9]', '_') $key = $slug + $suffix = 2 + while ($matrix.ContainsKey($key)) { + $key = "${slug}_${suffix}" + $suffix++ + } $matrix[$key] = @{ hf_id = [string]$m.hf_id hf_task = [string]$m.task diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index b85367ac1..0f004035e 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -3,11 +3,6 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- - """E2E evaluation runner — unified perf + accuracy. Batch-runs winml perf (and optionally winml eval + pytorch baseline) for models @@ -167,7 +162,7 @@ def _clear_disk_caches() -> None: entry.unlink() cleaned += 1 except OSError: - pass + pass # Best-effort cleanup; ignore if file is locked or already removed if cleaned: safe_print(f" [cleanup] Removed {cleaned} leaked temp entries from {_TEMP_DIR}")