From b6f8a941872d688158bd29c21cd2acc23ac06b2d Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Tue, 31 Mar 2026 15:57:01 +0800
Subject: [PATCH 01/14] change error to warning

---
 .pipelines/Modelkit E2E Test.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml
index ea7ba9512..b3e42b13c 100644
--- a/.pipelines/Modelkit E2E Test.yml	
+++ b/.pipelines/Modelkit E2E Test.yml	
@@ -128,8 +128,9 @@ jobs:
           }
 
           & uv @uvArgs
-          if ($LASTEXITCODE -ne 0) {
-              throw "Model evaluation failed for $(hf_id) / $(hf_task)"
+          $evalExit = $LASTEXITCODE
+          if ($evalExit -ne 0) {
+              Write-Warning "Model eval exited with code $evalExit for $(hf_id) / $(hf_task)"
           }
         workingDirectory: $(Build.SourcesDirectory)
         displayName: 'Run eval for current model'

From 82cf2eaabc4bc9c8f2f079520742689eb88799b2 Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Tue, 31 Mar 2026 16:39:41 +0800
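Subject: [PATCH 02/14] make eval step non-blocking; reformat run_eval.py

Always exit 0 from the pipeline eval step so a single model failure does
not fail the job. Reformat run_eval.py and add the license header and
missing docstrings.

Behavior change: --clean-hf-cache now calls _clear_disk_caches(), which
removes the whole HuggingFace and WML cache directories, instead of
deleting only the current model's HF cache entry. The now-unused
_clean_model_hf_cache helper is removed.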

---
 .pipelines/Modelkit E2E Test.yml |   3 +-
 scripts/e2e_eval/run_eval.py     | 152 ++++++++++++++++++-------------
 2 files changed, 92 insertions(+), 63 deletions(-)

diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml
index b3e42b13c..5e90f822d 100644
--- a/.pipelines/Modelkit E2E Test.yml	
+++ b/.pipelines/Modelkit E2E Test.yml	
@@ -130,8 +130,9 @@ jobs:
           & uv @uvArgs
           $evalExit = $LASTEXITCODE
           if ($evalExit -ne 0) {
-              Write-Warning "Model eval exited with code $evalExit for $(hf_id) / $(hf_task)"
+              Write-Warning "Model eval exited with code $evalExit for $(hf_id) / $(hf_task) (model failure — non-blocking)"
           }
+          exit 0
         workingDirectory: $(Build.SourcesDirectory)
         displayName: 'Run eval for current model'
 
diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index 1aa6b682f..53c941025 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -1,3 +1,8 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
 """E2E evaluation runner — unified perf + accuracy.
 
 Batch-runs wmk perf (and optionally wmk eval + pytorch baseline) for models
@@ -102,10 +107,10 @@ def _get_timeout_skip_reason(hf_id: str, task: str) -> str:
 
 # Patterns that indicate the disk is full (cross-platform).
 _NO_SPACE_PATTERNS = (
-    "no space left on device",          # Linux/macOS OSError
-    "oserror: [errno 28]",              # Python errno string
+    "no space left on device",  # Linux/macOS OSError
+    "oserror: [errno 28]",  # Python errno string
     "there is not enough space on the disk",  # Windows
-    "winerror 112",                     # Windows disk-full error code
+    "winerror 112",  # Windows disk-full error code
     "disk full",
 )
 
@@ -167,7 +172,8 @@ def _kill_process_tree(pid: int) -> None:
         # Fallback: taskkill on Windows, killpg on Unix
         if platform.system() == "Windows":
             subprocess.run(  # noqa: S603
-                ["taskkill", "/F", "/T", "/PID", str(pid)], capture_output=True
+                ["taskkill", "/F", "/T", "/PID", str(pid)],  # noqa: S607
+                capture_output=True,
             )
         else:
             import signal
@@ -309,7 +315,11 @@ def _watchdog() -> None:
 
 
 def _run_build(
-    entry: ModelEntry, device: str, precision: str, timeout: int, model_dir: Path,
+    entry: ModelEntry,
+    device: str,
+    precision: str,
+    timeout: int,
+    model_dir: Path,
 ) -> dict:
     """Run wmk config + wmk build for one model. Returns build result dict.
 
@@ -320,11 +330,16 @@ def _run_build(
 
     # Step 1: wmk config
     config_args = [
-        *WMK, "config",
-        "-m", entry.hf_id,
-        "--device", device,
-        "--precision", precision,
-        "-o", str(config_path),
+        *WMK,
+        "config",
+        "-m",
+        entry.hf_id,
+        "--device",
+        device,
+        "--precision",
+        precision,
+        "-o",
+        str(config_path),
     ]
     if entry.task:
         config_args += ["--task", entry.task]
@@ -340,9 +355,12 @@ def _run_build(
 
     # Step 2: wmk build --use-cache
     build_args = [
-        *WMK, "build",
-        "-c", str(config_path),
-        "-m", entry.hf_id,
+        *WMK,
+        "build",
+        "-c",
+        str(config_path),
+        "-m",
+        entry.hf_id,
         "--use-cache",
     ]
 
@@ -404,7 +422,10 @@ def _find_cached_model(hf_id: str, build_proc: dict) -> str | None:
 
 
 def run_model(
-    entry: ModelEntry, device: str, timeout: int, onnx_path: str | None = None,
+    entry: ModelEntry,
+    device: str,
+    timeout: int,
+    onnx_path: str | None = None,
 ) -> dict:
     """Execute wmk perf for one model. Returns raw subprocess result dict.
 
@@ -415,8 +436,14 @@ def run_model(
         args = [*WMK, "perf", "-m", onnx_path, "--device", device]
     else:
         args = [
-            *WMK, "perf", "-m", entry.hf_id,
-            "--device", device, "--precision", _DEFAULT_PRECISION,
+            *WMK,
+            "perf",
+            "-m",
+            entry.hf_id,
+            "--device",
+            device,
+            "--precision",
+            _DEFAULT_PRECISION,
         ]
         if entry.task:
             args += ["--task", entry.task]
@@ -516,14 +543,23 @@ def _run_wmk_eval(
     eval_device = "npu" if device == "auto" else device
     if onnx_path:
         args = [
-            *WMK, "eval", "-m", onnx_path,
-            "--model-id", entry.hf_id,
-            "--device", eval_device,
+            *WMK,
+            "eval",
+            "-m",
+            onnx_path,
+            "--model-id",
+            entry.hf_id,
+            "--device",
+            eval_device,
         ]
     else:
         args = [
-            *WMK, "eval", "-m", entry.hf_id,
-            "--device", eval_device,
+            *WMK,
+            "eval",
+            "-m",
+            entry.hf_id,
+            "--device",
+            eval_device,
         ]
     if entry.task:
         args += ["--task", entry.task]
@@ -550,14 +586,9 @@ def _run_wmk_eval(
 
     metric = None
     if proc["exit_code"] == 0 and output_path.exists():
-        wmk_key = (
-            ds_config.get("wmk_metric_key")
-            or ds_config.get("metric", "accuracy")
-        )
+        wmk_key = ds_config.get("wmk_metric_key") or ds_config.get("metric", "accuracy")
         num_samples = ds_config.get("num_samples", _DEFAULT_SAMPLES)
-        metric = _parse_metric_from_wmk_output(
-            output_path, wmk_key, num_samples
-        )
+        metric = _parse_metric_from_wmk_output(output_path, wmk_key, num_samples)
     status = "PASS" if (proc["exit_code"] == 0 and metric is not None) else "FAIL"
 
     return {
@@ -606,9 +637,7 @@ def _save_baseline_cache(cache: dict) -> None:
     )
 
 
-def _lookup_baseline_cache(
-    hf_id: str, task: str, ds_config: dict
-) -> dict | None:
+def _lookup_baseline_cache(hf_id: str, task: str, ds_config: dict) -> dict | None:
     """Return cached baseline result dict, or None if not cached."""
     cache = _load_baseline_cache()
     key = _baseline_cache_key(hf_id, task, ds_config)
@@ -631,9 +660,7 @@ def _shorten_command(cmd: str) -> str:
     return " ".join(shortened)
 
 
-def _store_baseline_cache(
-    hf_id: str, task: str, ds_config: dict, result: dict
-) -> None:
+def _store_baseline_cache(hf_id: str, task: str, ds_config: dict, result: dict) -> None:
     """Store a successful baseline result in cache."""
     if result.get("status") != "PASS":
         return
@@ -758,8 +785,10 @@ def save_environment_info(path: Path) -> None:
     # Git HEAD commit info
     try:
         result = subprocess.run(
-            ["git", "log", "-1", "--format=%H%n%s%n%ai"],  # noqa: S603, S607
-            capture_output=True, text=True, timeout=5,
+            ["git", "log", "-1", "--format=%H%n%s%n%ai"],  # noqa: S607
+            capture_output=True,
+            text=True,
+            timeout=5,
         )
         if result.returncode == 0:
             lines = result.stdout.strip().splitlines()
@@ -782,17 +811,6 @@ def _get_disk_free_gb() -> float:
     return shutil.disk_usage(anchor).free / (1024**3)
 
 
-def _clean_model_hf_cache(hf_id: str) -> None:
-    """Delete cached HuggingFace files for a specific model."""
-    slug = f"models--{hf_id.replace('/', '--')}"
-    cache_path = HF_CACHE_DIR / slug
-    if not cache_path.exists():
-        return
-    size_mb = sum(f.stat().st_size for f in cache_path.rglob("*") if f.is_file()) / (1024 * 1024)
-    shutil.rmtree(cache_path, ignore_errors=True)
-    safe_print(f"  [cache] Cleaned {slug} ({size_mb:.0f} MB freed)")
-
-
 def model_result_dir(output_dir: Path, hf_id: str, task: str = "") -> Path:
     """Convert model ID + task to directory slug."""
     slug = hf_id.replace("/", "__")
@@ -807,6 +825,7 @@ def model_result_dir(output_dir: Path, hf_id: str, task: str = "") -> Path:
 
 
 def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments."""
     parser = argparse.ArgumentParser(description="E2E evaluation runner — unified perf + accuracy")
     parser.add_argument(
         "--registry",
@@ -878,6 +897,7 @@ def parse_args() -> argparse.Namespace:
 
 
 def main() -> None:
+    """Run the E2E evaluation pipeline."""
     args = parse_args()
 
     # 1. Load registry
@@ -928,13 +948,7 @@ def main() -> None:
         safe_print(f"Registry: {len(entries)} models  (eval-type: {args.eval_type})")
         for e in entries:
             ds = get_dataset_config(e.hf_id, e.task)
-            skip_acc = (
-                ""
-                if args.eval_type == "perf"
-                else "  [task_default]"
-                if ds is None
-                else ""
-            )
+            skip_acc = "" if args.eval_type == "perf" else "  [task_default]" if ds is None else ""
             safe_print(
                 f"  [{e.priority}] {e.hf_id} / {e.task}  ({e.model_type}, {e.group}){skip_acc}"
             )
@@ -1010,9 +1024,7 @@ def main() -> None:
         # Timeout skip list: skip known-timeout models and write a TIMEOUT result
         if (entry.hf_id, entry.task or "") in timeout_skip_set:
             reason = _get_timeout_skip_reason(entry.hf_id, entry.task or "")
-            safe_print(
-                f"\n[{i}/{len(entries)}] {label}  (SKIP - TIMEOUT: {reason})"
-            )
+            safe_print(f"\n[{i}/{len(entries)}] {label}  (SKIP - TIMEOUT: {reason})")
             model_dir.mkdir(parents=True, exist_ok=True)
             timeout_result = build_eval_result(
                 entry=entry,
@@ -1090,7 +1102,11 @@ def main() -> None:
             onnx_path: str | None = None
             if args.eval_type in ("perf", "both"):
                 build_result = _run_build(
-                    entry, args.device, _DEFAULT_PRECISION, args.timeout, model_dir,
+                    entry,
+                    args.device,
+                    _DEFAULT_PRECISION,
+                    args.timeout,
+                    model_dir,
                 )
                 if build_result["success"]:
                     onnx_path = build_result["onnx_path"]
@@ -1098,12 +1114,20 @@ def main() -> None:
             if args.eval_type == "accuracy":
                 # Accuracy-only: build + eval (no perf)
                 build_result = _run_build(
-                    entry, args.device, _DEFAULT_PRECISION, args.timeout, model_dir,
+                    entry,
+                    args.device,
+                    _DEFAULT_PRECISION,
+                    args.timeout,
+                    model_dir,
                 )
                 if build_result["success"]:
                     onnx_path = build_result["onnx_path"]
                     accuracy_result = _run_accuracy_phase(
-                        entry, args.device, args.timeout, model_dir, onnx_path,
+                        entry,
+                        args.device,
+                        args.timeout,
+                        model_dir,
+                        onnx_path,
                     )
                 else:
                     accuracy_result = {"skipped": True, "skip_reason": "build_failed"}
@@ -1124,7 +1148,11 @@ def main() -> None:
                         accuracy_result = {"skipped": True, "skip_reason": "perf_failed"}
                     else:
                         accuracy_result = _run_accuracy_phase(
-                            entry, args.device, args.timeout, model_dir, onnx_path,
+                            entry,
+                            args.device,
+                            args.timeout,
+                            model_dir,
+                            onnx_path,
                         )
                 else:
                     # Build failed
@@ -1169,7 +1197,7 @@ def main() -> None:
             safe_print(f"  [acc only]{acc_tag}")
 
         if args.clean_hf_cache:
-            _clean_model_hf_cache(entry.hf_id)
+            _clear_disk_caches()
 
     run_duration = time.perf_counter() - run_start
 

From 6f1963be3725233ce6120b38402ac7c4aeb33db6 Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Wed, 1 Apr 2026 16:02:19 +0800
Subject: [PATCH 03/14] add evalDate parameter, 90 min eval timeout, RetryMissed job

---
 .pipelines/Modelkit E2E Test.yml | 61 ++++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 2 deletions(-)

diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml
index 5e90f822d..99a3080d7 100644
--- a/.pipelines/Modelkit E2E Test.yml	
+++ b/.pipelines/Modelkit E2E Test.yml	
@@ -1,5 +1,11 @@
 trigger: none
 
+parameters:
+  - name: evalDate
+    displayName: 'Eval date (leave empty for today, e.g. 2026-04-01)'
+    type: string
+    default: ''
+
 variables:
   evalOutputBase: 'c:/eval_results'
 
@@ -43,7 +49,8 @@ jobs:
         displayName: 'Install dependencies'
 
       - powershell: |
-          $evalDate = Get-Date -Format 'yyyy-MM-dd'
+          $evalDate = '${{ parameters.evalDate }}'
+          if (-not $evalDate) { $evalDate = Get-Date -Format 'yyyy-MM-dd' }
           $dir = "$(evalOutputBase)/$evalDate"
           Write-Host "##vso[task.setvariable variable=EVAL_DIR;isOutput=true]$dir"
           Write-Host "Eval output directory: $dir"
@@ -86,7 +93,8 @@ jobs:
   - job: EvalModel
     displayName: 'Run Model Eval'
     dependsOn: Prepare
-    timeoutInMinutes: 2400
+    timeoutInMinutes: 90
+    cancelTimeoutInMinutes: 2
     pool:
       name: modelkit-selfhost-pool
       demands:
@@ -136,11 +144,60 @@ jobs:
         workingDirectory: $(Build.SourcesDirectory)
         displayName: 'Run eval for current model'
 
+  - job: RetryMissed
+    displayName: 'Retry Interrupted Models'
+    dependsOn:
+      - Prepare
+      - EvalModel
+    condition: always()
+    timeoutInMinutes: 180
+    cancelTimeoutInMinutes: 5
+    pool:
+      name: modelkit-selfhost-pool
+      demands:
+        - Agent.Name -equals NPU-QNN
+    variables:
+      EVAL_DIR: $[ dependencies.Prepare.outputs['set_output_dir.EVAL_DIR'] ]
+
+    steps:
+      - checkout: none
+
+      - powershell: |
+          $uvBin = "$env:USERPROFILE\.local\bin"
+          $venvDir = "$(Build.SourcesDirectory)\.venv\Scripts"
+          Write-Host "##vso[task.prependpath]$uvBin"
+          Write-Host "##vso[task.prependpath]$venvDir"
+        displayName: 'Activate Python environment'
+
+      - powershell: |
+          Write-Host "Retrying any models missing eval_result.json in $(EVAL_DIR)"
+
+          $uvArgs = @(
+              "run", "--no-sync", "python", "scripts/e2e_eval/run_eval.py",
+              "--output-dir", "$(EVAL_DIR)",
+              "--device", "npu",
+              "--continue",
+              "--verbose",
+              "--timeout", "1800",
+              "--no-report",
+              "--clean-hf-cache"
+          )
+
+          & uv @uvArgs
+          $retryExit = $LASTEXITCODE
+          if ($retryExit -ne 0) {
+              Write-Warning "Retry pass exited with code $retryExit (non-blocking)"
+          }
+          exit 0
+        workingDirectory: $(Build.SourcesDirectory)
+        displayName: 'Re-run models with missing results'
+
   - job: Report
     displayName: 'Generate Eval Report'
     dependsOn:
       - Prepare
       - EvalModel
+      - RetryMissed
     condition: always()
     pool:
       name: modelkit-selfhost-pool

From 2f32a7d1df29a07cb59c5060f494a03b30003ffe Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Thu, 2 Apr 2026 11:50:12 +0800
Subject: [PATCH 04/14] add --continue for --list-json; rename --clean-hf-cache to --clean-cache

---
 .pipelines/Modelkit E2E Test.yml |  28 +++++--
 scripts/e2e_eval/run_eval.py     | 126 +++++++++++++++++++++++--------
 2 files changed, 116 insertions(+), 38 deletions(-)

diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml
index 99a3080d7..22c914422 100644
--- a/.pipelines/Modelkit E2E Test.yml	
+++ b/.pipelines/Modelkit E2E Test.yml	
@@ -5,6 +5,10 @@ parameters:
     displayName: 'Eval date (leave empty for today, e.g. 2026-04-01)'
     type: string
     default: ''
+  - name: continueRun
+    displayName: 'Skip already-evaluated models (--continue)'
+    type: boolean
+    default: true
 
 variables:
   evalOutputBase: 'c:/eval_results'
@@ -57,10 +61,16 @@ jobs:
         name: set_output_dir
         displayName: 'Set eval output directory'
 
-      - script: >
-          uv run python scripts/e2e_eval/run_eval.py
-          --list-json temp/model_list.json
-          --device npu
+      - powershell: |
+          $args = @(
+              "run", "python", "scripts/e2e_eval/run_eval.py",
+              "--list-json", "temp/model_list.json",
+              "--device", "npu"
+          )
+          if ('${{ parameters.continueRun }}' -eq 'True') {
+              $args += @("--continue", "--output-dir", "$(set_output_dir.EVAL_DIR)")
+          }
+          & uv @args
         workingDirectory: $(Build.SourcesDirectory)
         displayName: 'Generate model list'
 
@@ -68,7 +78,10 @@ jobs:
           $models = Get-Content "$(Build.SourcesDirectory)/temp/model_list.json" | ConvertFrom-Json
           $total = $models.Count
           if ($total -eq 0) {
-              throw "No models found in temp/model_list.json"
+              Write-Host "All models already evaluated — nothing to run"
+              Write-Host "##vso[task.setvariable variable=modelMatrix;isOutput=true]{}"
+              Write-Host "##vso[task.setvariable variable=skipEval;isOutput=true]true"
+              return
           }
 
           $matrix = @{}
@@ -93,6 +106,7 @@ jobs:
   - job: EvalModel
     displayName: 'Run Model Eval'
     dependsOn: Prepare
+    condition: and(succeeded(), ne(dependencies.Prepare.outputs['set_matrix.skipEval'], 'true'))
     timeoutInMinutes: 90
     cancelTimeoutInMinutes: 2
     pool:
@@ -129,7 +143,7 @@ jobs:
               "--verbose",
               "--timeout", "1800",
               "--no-report",
-              "--clean-hf-cache"
+              "--clean-cache"
           )
           if ("$(hf_task)") {
               $uvArgs += @("--task", "$(hf_task)")
@@ -180,7 +194,7 @@ jobs:
               "--verbose",
               "--timeout", "1800",
               "--no-report",
-              "--clean-hf-cache"
+              "--clean-cache"
           )
 
           & uv @uvArgs
diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index 53c941025..5538cca00 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -36,6 +36,7 @@
 import shutil
 import subprocess
 import sys
+import tempfile
 import threading
 import time
 from datetime import date, datetime, timezone
@@ -116,6 +117,8 @@ def _get_timeout_skip_reason(hf_id: str, task: str) -> str:
 
 _HF_CACHE = Path.home() / ".cache" / "huggingface"
 _WML_CACHE = Path.home() / ".cache" / "winml"
+_TEMP_DIR = Path(os.environ.get("TEMP", os.environ.get("TMP", tempfile.gettempdir())))
+_TEMP_PREFIXES = ("wmk_", "tmp", "modelkit_compat_")
 
 
 def _is_no_space_error(proc: dict) -> bool:
@@ -125,17 +128,39 @@ def _is_no_space_error(proc: dict) -> bool:
 
 
 def _clear_disk_caches() -> None:
-    """Delete HuggingFace and WML cache directories to free disk space."""
+    """Delete HuggingFace, WML cache directories and leaked temp files."""
     for cache_dir in (_HF_CACHE, _WML_CACHE):
         if cache_dir.exists():
-            safe_print(f"  [disk-full] Removing cache: {cache_dir}")
+            safe_print(f"  [cleanup] Removing cache: {cache_dir}")
             try:
                 shutil.rmtree(cache_dir)
-                safe_print(f"  [disk-full] Removed: {cache_dir}")
+                safe_print(f"  [cleanup] Removed: {cache_dir}")
             except OSError as exc:
-                safe_print(f"  [disk-full] Warning: could not remove {cache_dir}: {exc}")
-        else:
-            safe_print(f"  [disk-full] Cache not found (skipping): {cache_dir}")
+                safe_print(f"  [cleanup] Warning: could not remove {cache_dir}: {exc}")
+
+    # Clean leaked temp directories/files (wmk_*, modelkit_compat_*, tmp*.onnx)
+    if _TEMP_DIR.is_dir():
+        cleaned = 0
+        for entry in _TEMP_DIR.iterdir():
+            name = entry.name
+            if not any(name.startswith(p) for p in _TEMP_PREFIXES):
+                continue
+            is_leaked = (
+                entry.is_dir()
+                or entry.suffix in (".onnx", ".out", ".err")
+                or name.endswith(".onnx.data")
+            )
+            if is_leaked:
+                try:
+                    if entry.is_dir():
+                        shutil.rmtree(entry)
+                    else:
+                        entry.unlink()
+                    cleaned += 1
+                except OSError:
+                    pass
+        if cleaned:
+            safe_print(f"  [cleanup] Removed {cleaned} leaked temp entries from {_TEMP_DIR}")
 
 
 def safe_print(text: str) -> None:
@@ -811,6 +836,32 @@ def _get_disk_free_gb() -> float:
     return shutil.disk_usage(anchor).free / (1024**3)
 
 
+def _should_skip_existing(existing: dict, retry_types: set[str] | None, eval_type: str) -> bool:
+    """Return True if an existing eval_result should be skipped (not re-run).
+
+    Used by both --list-json and the main eval loop to share continue/retry logic.
+    """
+    if retry_types is None:
+        return True  # --continue without --retry-failed: skip all existing
+
+    perf = existing.get("perf") or {}
+    acc = existing.get("accuracy")
+
+    # Check perf failure (only when perf ran)
+    if eval_type != "accuracy" and not perf.get("passed"):
+        cls = classify_result(existing) or "UNKNOWN"
+        if not retry_types or cls in retry_types:
+            return False  # Should retry
+
+    # Check accuracy verdict
+    if acc is not None and not acc.get("skipped"):
+        verdict = derive_verdict(acc).value
+        if not retry_types or verdict in retry_types:
+            return False  # Should retry
+
+    return True  # No retry criteria matched — skip
+
+
 def model_result_dir(output_dir: Path, hf_id: str, task: str = "") -> Path:
     """Convert model ID + task to directory slug."""
     slug = hf_id.replace("/", "__")
@@ -854,9 +905,11 @@ def parse_args() -> argparse.Namespace:
         "--timeout", type=int, default=600, help="Per-subprocess timeout in seconds (default: 600)"
     )
     parser.add_argument(
+        "--clean-cache",
         "--clean-hf-cache",
+        dest="clean_cache",
         action="store_true",
-        help="Delete HuggingFace hub cache for each model after evaluation (saves disk space)",
+        help="Delete caches and leaked temp files after each model evaluation (saves disk space)",
     )
     parser.add_argument("--list", action="store_true", help="List filtered models and exit")
     parser.add_argument(
@@ -956,6 +1009,34 @@ def main() -> None:
 
     # --list-json mode: write machine-readable JSON and exit
     if args.list_json:
+        # --continue / --retry-failed: filter out already-evaluated models
+        if args.continue_run or args.retry_failed is not None:
+            output_dir = args.output_dir or Path(f"eval_results/{date.today().isoformat()}")
+            retry_types: set[str] | None = None
+            if args.retry_failed is not None:
+                args.continue_run = True
+                retry_types = {t.upper() for t in args.retry_failed} if args.retry_failed else set()
+
+            filtered: list[ModelEntry] = []
+            skipped_count = 0
+            for e in entries:
+                result_path = model_result_dir(output_dir, e.hf_id, e.task) / "eval_result.json"
+                if args.continue_run and result_path.exists():
+                    try:
+                        existing = load_result_json(result_path)
+                        if _should_skip_existing(existing, retry_types, args.eval_type):
+                            skipped_count += 1
+                            continue
+                    except Exception:
+                        pass  # Corrupt result file — include model for re-evaluation
+                filtered.append(e)
+            if skipped_count:
+                safe_print(
+                    f"--continue: skipped {skipped_count} already-evaluated models "
+                    f"(output_dir: {output_dir})"
+                )
+            entries = filtered
+
         model_list = [
             {
                 "hf_id": e.hf_id,
@@ -997,8 +1078,8 @@ def main() -> None:
     safe_print(f"E2E Evaluation: {len(entries)} models -> {output_dir}")
     safe_print(f"Device: {args.device} | Timeout: {args.timeout}s | Eval: {args.eval_type}")
     safe_print(f"Disk free: {_get_disk_free_gb():.1f} GB")
-    if args.clean_hf_cache:
-        safe_print("Cache cleanup: ON (HF cache cleaned after each model)")
+    if args.clean_cache:
+        safe_print("Cache cleanup: ON (caches + temp files cleaned after each model)")
     if retry_types is not None:
         if retry_types:
             safe_print(f"Retry mode: {', '.join(sorted(retry_types))}")
@@ -1049,29 +1130,12 @@ def main() -> None:
         if args.continue_run and result_path.exists():
             try:
                 existing = load_result_json(result_path)
-                skip = True
-
-                perf = existing.get("perf") or {}
-                acc = existing.get("accuracy")
-
-                # Derive current classification / verdict to check against retry types
-                if retry_types is not None:
-                    should_retry = False
-                    # Check perf failure (only when perf ran)
-                    if args.eval_type != "accuracy" and not perf.get("passed"):
-                        cls = classify_result(existing) or "UNKNOWN"
-                        if not retry_types or cls in retry_types:
-                            should_retry = True
-                    # Check accuracy verdict
-                    if not should_retry and acc is not None and not acc.get("skipped"):
-                        verdict = derive_verdict(acc).value
-                        if not retry_types or verdict in retry_types:
-                            should_retry = True
-                    skip = not should_retry
-
-                if skip:
+
+                if _should_skip_existing(existing, retry_types, args.eval_type):
                     results.append(existing)
                     skipped += 1
+                    perf = existing.get("perf") or {}
+                    acc = existing.get("accuracy")
                     perf_cls = classify_result(existing) or "UNKNOWN"
                     perf_tag = "PASS" if perf.get("passed") else f"FAIL/{perf_cls}"
                     acc_tag = ""
@@ -1196,7 +1260,7 @@ def main() -> None:
         else:
             safe_print(f"  [acc only]{acc_tag}")
 
-        if args.clean_hf_cache:
+        if args.clean_cache:
             _clear_disk_caches()
 
     run_duration = time.perf_counter() - run_start

From 8967738caa41d8c7cb9aaacb3c52e860f5306a40 Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Thu, 2 Apr 2026 13:37:21 +0800
Subject: [PATCH 05/14] drop generic "tmp" prefix from temp cleanup; log leaked entries

---
 scripts/e2e_eval/run_eval.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index 5538cca00..fe7a65fd8 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -118,7 +118,7 @@ def _get_timeout_skip_reason(hf_id: str, task: str) -> str:
 _HF_CACHE = Path.home() / ".cache" / "huggingface"
 _WML_CACHE = Path.home() / ".cache" / "winml"
 _TEMP_DIR = Path(os.environ.get("TEMP", os.environ.get("TMP", tempfile.gettempdir())))
-_TEMP_PREFIXES = ("wmk_", "tmp", "modelkit_compat_")
+_TEMP_PREFIXES = ("wmk_", "modelkit_compat_")
 
 
 def _is_no_space_error(proc: dict) -> bool:
@@ -151,6 +151,7 @@ def _clear_disk_caches() -> None:
                 or name.endswith(".onnx.data")
             )
             if is_leaked:
+                safe_print(f"  [cleanup] Leaked temp: {entry} (dir={entry.is_dir()})")
                 try:
                     if entry.is_dir():
                         shutil.rmtree(entry)

From 088d82f08663a8d6883934cfb29e6e66bdd4c982 Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Thu, 2 Apr 2026 13:47:34 +0800
Subject: [PATCH 06/14] clean tmp*.onnx and tmp*.onnx.data temp files only

---
 scripts/e2e_eval/run_eval.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index fe7a65fd8..e8761930f 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -138,20 +138,23 @@ def _clear_disk_caches() -> None:
             except OSError as exc:
                 safe_print(f"  [cleanup] Warning: could not remove {cache_dir}: {exc}")
 
-    # Clean leaked temp directories/files (wmk_*, modelkit_compat_*, tmp*.onnx)
+    # Clean leaked temp directories/files (wmk_*, modelkit_compat_*, tmp*.onnx*)
     if _TEMP_DIR.is_dir():
         cleaned = 0
         for entry in _TEMP_DIR.iterdir():
             name = entry.name
-            if not any(name.startswith(p) for p in _TEMP_PREFIXES):
-                continue
-            is_leaked = (
-                entry.is_dir()
-                or entry.suffix in (".onnx", ".out", ".err")
-                or name.endswith(".onnx.data")
-            )
-            if is_leaked:
-                safe_print(f"  [cleanup] Leaked temp: {entry} (dir={entry.is_dir()})")
+            should_clean = False
+            if any(name.startswith(p) for p in _TEMP_PREFIXES):
+                should_clean = (
+                    entry.is_dir()
+                    or entry.suffix in (".onnx", ".out", ".err")
+                    or name.endswith(".onnx.data")
+                )
+            elif name.startswith("tmp") and name.endswith((".onnx", ".onnx.data")):
+                # Python tempfile creates tmp* prefixed files; only clean ONNX artifacts
+                should_clean = True
+            if should_clean:
+                safe_print(f"  [cleanup] Leaked temp: {entry}")
                 try:
                     if entry.is_dir():
                         shutil.rmtree(entry)

From c578b2255633a2bee34ea4829d1026cd2b6a35d1 Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Thu, 2 Apr 2026 14:15:04 +0800
Subject: [PATCH 07/14] remove RetryMissed job

---
 .pipelines/Modelkit E2E Test.yml | 48 --------------------------------
 1 file changed, 48 deletions(-)

diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml
index 22c914422..e359b20b5 100644
--- a/.pipelines/Modelkit E2E Test.yml	
+++ b/.pipelines/Modelkit E2E Test.yml	
@@ -158,54 +158,6 @@ jobs:
         workingDirectory: $(Build.SourcesDirectory)
         displayName: 'Run eval for current model'
 
-  - job: RetryMissed
-    displayName: 'Retry Interrupted Models'
-    dependsOn:
-      - Prepare
-      - EvalModel
-    condition: always()
-    timeoutInMinutes: 180
-    cancelTimeoutInMinutes: 5
-    pool:
-      name: modelkit-selfhost-pool
-      demands:
-        - Agent.Name -equals NPU-QNN
-    variables:
-      EVAL_DIR: $[ dependencies.Prepare.outputs['set_output_dir.EVAL_DIR'] ]
-
-    steps:
-      - checkout: none
-
-      - powershell: |
-          $uvBin = "$env:USERPROFILE\.local\bin"
-          $venvDir = "$(Build.SourcesDirectory)\.venv\Scripts"
-          Write-Host "##vso[task.prependpath]$uvBin"
-          Write-Host "##vso[task.prependpath]$venvDir"
-        displayName: 'Activate Python environment'
-
-      - powershell: |
-          Write-Host "Retrying any models missing eval_result.json in $(EVAL_DIR)"
-
-          $uvArgs = @(
-              "run", "--no-sync", "python", "scripts/e2e_eval/run_eval.py",
-              "--output-dir", "$(EVAL_DIR)",
-              "--device", "npu",
-              "--continue",
-              "--verbose",
-              "--timeout", "1800",
-              "--no-report",
-              "--clean-cache"
-          )
-
-          & uv @uvArgs
-          $retryExit = $LASTEXITCODE
-          if ($retryExit -ne 0) {
-              Write-Warning "Retry pass exited with code $retryExit (non-blocking)"
-          }
-          exit 0
-        workingDirectory: $(Build.SourcesDirectory)
-        displayName: 'Re-run models with missing results'
-
   - job: Report
     displayName: 'Generate Eval Report'
     dependsOn:

From 1a11e2882ab2fa5cb938641076f82a76f86f0af9 Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Thu, 2 Apr 2026 14:23:09 +0800
Subject: [PATCH 08/14] fix: drop removed RetryMissed job from Report dependsOn

---
 .pipelines/Modelkit E2E Test.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml
index e359b20b5..ab870e668 100644
--- a/.pipelines/Modelkit E2E Test.yml	
+++ b/.pipelines/Modelkit E2E Test.yml	
@@ -163,7 +163,6 @@ jobs:
     dependsOn:
       - Prepare
       - EvalModel
-      - RetryMissed
     condition: always()
     pool:
       name: modelkit-selfhost-pool

From a85954f1342ebe9f9f561a8ba6b7d25d50d613be Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Thu, 2 Apr 2026 15:39:53 +0800
Subject: [PATCH 09/14] clean .onnx.data file in temp

---
 src/winml/modelkit/optim/pipes/fusion.py |  6 ++++--
 src/winml/modelkit/optim/pipes/graph.py  | 11 ++++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/winml/modelkit/optim/pipes/fusion.py b/src/winml/modelkit/optim/pipes/fusion.py
index 9b3e348b4..57d2c7ee6 100644
--- a/src/winml/modelkit/optim/pipes/fusion.py
+++ b/src/winml/modelkit/optim/pipes/fusion.py
@@ -301,6 +301,8 @@ def process(self, model: onnx.ModelProto, config: ORTFusionPipeConfig) -> onnx.M
             ) from e
 
         finally:
-            # Clean up temporary file
+            # Clean up temporary file and its external data sidecar
             if input_path:
-                Path(input_path).unlink(missing_ok=True)
+                p = Path(input_path)
+                p.unlink(missing_ok=True)
+                (p.parent / f"{p.name}.data").unlink(missing_ok=True)
diff --git a/src/winml/modelkit/optim/pipes/graph.py b/src/winml/modelkit/optim/pipes/graph.py
index fe6f9b4e8..ed3841c56 100644
--- a/src/winml/modelkit/optim/pipes/graph.py
+++ b/src/winml/modelkit/optim/pipes/graph.py
@@ -593,8 +593,9 @@ def process(self, model: onnx.ModelProto, config: ORTGraphPipeConfig) -> onnx.Mo
                 ) from e
 
         finally:
-            # Clean up temporary files (always execute)
-            if input_file and input_file.exists():
-                input_file.unlink(missing_ok=True)
-            if output_file and output_file.exists():
-                output_file.unlink(missing_ok=True)
+            # Clean up temporary files and their external data sidecars
+            for tmp in (input_file, output_file):
+                if tmp is not None:
+                    tmp.unlink(missing_ok=True)
+                    data_sidecar = tmp.parent / f"{tmp.name}.data"
+                    data_sidecar.unlink(missing_ok=True)

From 74656ddcdb7c89de15fe197e9e304c6d17286254 Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Thu, 2 Apr 2026 15:58:50 +0800
Subject: [PATCH 10/14] refine job name

---
 .pipelines/Modelkit E2E Test.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml
index ab870e668..e163129ce 100644
--- a/.pipelines/Modelkit E2E Test.yml	
+++ b/.pipelines/Modelkit E2E Test.yml	
@@ -87,7 +87,8 @@ jobs:
           $matrix = @{}
           for ($i = 0; $i -lt $total; $i++) {
               $m = $models[$i]
-              $key = ($m.hf_id -replace '[^A-Za-z0-9]', '_')
+              $slug = ($m.hf_id -replace '[^A-Za-z0-9]', '_')
+              $key = "$($i + 1)_${slug}"
               $matrix[$key] = @{
                   hf_id = [string]$m.hf_id
                   hf_task = [string]$m.task
@@ -104,7 +105,7 @@ jobs:
         displayName: 'Create matrix variables'
 
   - job: EvalModel
-    displayName: 'Run Model Eval'
+    displayName: 'Eval'
     dependsOn: Prepare
     condition: and(succeeded(), ne(dependencies.Prepare.outputs['set_matrix.skipEval'], 'true'))
     timeoutInMinutes: 90

From 73baa304f1f8b662aa9c66279ad396fc476f5b8a Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Thu, 2 Apr 2026 16:15:03 +0800
Subject: [PATCH 11/14] use hf_id + task instead of index for matrix job key

---
 .pipelines/Modelkit E2E Test.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml
index e163129ce..362a95e51 100644
--- a/.pipelines/Modelkit E2E Test.yml	
+++ b/.pipelines/Modelkit E2E Test.yml	
@@ -87,8 +87,8 @@ jobs:
           $matrix = @{}
           for ($i = 0; $i -lt $total; $i++) {
               $m = $models[$i]
-              $slug = ($m.hf_id -replace '[^A-Za-z0-9]', '_')
-              $key = "$($i + 1)_${slug}"
+              $slug = (($m.hf_id + '_' + $m.task) -replace '[^A-Za-z0-9]', '_')
+              $key = $slug
               $matrix[$key] = @{
                   hf_id = [string]$m.hf_id
                   hf_task = [string]$m.task

From 6d9d6c9ce46f2c12f4911b646f63e6885d9a5aaa Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Fri, 3 Apr 2026 11:52:27 +0800
Subject: [PATCH 12/14] add publish artifact

---
 .pipelines/Modelkit E2E Test.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml
index 362a95e51..8bd498b5f 100644
--- a/.pipelines/Modelkit E2E Test.yml	
+++ b/.pipelines/Modelkit E2E Test.yml	
@@ -187,3 +187,9 @@ jobs:
           --input-dir $(EVAL_DIR)
         workingDirectory: $(Build.SourcesDirectory)
         displayName: 'Generate evaluation report'
+
+      - task: PublishPipelineArtifact@1
+        inputs:
+          targetPath: $(EVAL_DIR)
+          artifactName: EvalReport
+        displayName: 'Publish eval results as artifact'

From 5f7d875dbe77fc7da8f9c07e9feecc6f2eafabc1 Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Fri, 3 Apr 2026 12:09:41 +0800
Subject: [PATCH 13/14] remove --clean-hf-cache alias of --clean-cache

---
 scripts/e2e_eval/run_eval.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index 487297f75..b85367ac1 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -915,7 +915,6 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--clean-cache",
-        "--clean-hf-cache",
         dest="clean_cache",
         action="store_true",
         help="Delete caches and leaked temp files after each model evaluation (saves disk space)",

From 973caf70d69d2b72c2a5293d082e60e3599ab68b Mon Sep 17 00:00:00 2001
From: yuesu <yuesu@microsoft.com>
Date: Fri, 3 Apr 2026 16:23:03 +0800
Subject: [PATCH 14/14] dedupe matrix keys; remove duplicate license header

---
 .pipelines/Modelkit E2E Test.yml | 5 +++++
 scripts/e2e_eval/run_eval.py     | 7 +------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.pipelines/Modelkit E2E Test.yml b/.pipelines/Modelkit E2E Test.yml
index 8bd498b5f..a42a84e0f 100644
--- a/.pipelines/Modelkit E2E Test.yml	
+++ b/.pipelines/Modelkit E2E Test.yml	
@@ -89,6 +89,11 @@ jobs:
               $m = $models[$i]
               $slug = (($m.hf_id + '_' + $m.task) -replace '[^A-Za-z0-9]', '_')
               $key = $slug
+              $suffix = 2
+              while ($matrix.ContainsKey($key)) {
+                  $key = "${slug}_${suffix}"
+                  $suffix++
+              }
               $matrix[$key] = @{
                   hf_id = [string]$m.hf_id
                   hf_task = [string]$m.task
diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index b85367ac1..0f004035e 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -3,11 +3,6 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 
-# -------------------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------------------
-
 """E2E evaluation runner — unified perf + accuracy.
 
 Batch-runs winml perf (and optionally winml eval + pytorch baseline) for models
@@ -167,7 +162,7 @@ def _clear_disk_caches() -> None:
                         entry.unlink()
                     cleaned += 1
                 except OSError:
-                    pass
+                    pass  # Best-effort cleanup; ignore if file is locked or already removed
         if cleaned:
             safe_print(f"  [cleanup] Removed {cleaned} leaked temp entries from {_TEMP_DIR}")