diff --git a/src/winml/modelkit/cli.py b/src/winml/modelkit/cli.py
index 03e14b221..4504856b7 100644
--- a/src/winml/modelkit/cli.py
+++ b/src/winml/modelkit/cli.py
@@ -264,7 +264,7 @@ def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) ->
 def main(ctx: click.Context, verbose: int, quiet: bool, debug: bool) -> None:
     """WinML CLI - Accelerate Model Deployment on WinML.
 
-    Universal ONNX export with QNN and OpenVINO backend support.
+    Universal ONNX export with support for various WinML execution providers.
     """
     # --debug is a backward-compat alias for -vv
     if debug:
diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py
index 4ad292d77..7fb4337ea 100644
--- a/src/winml/modelkit/commands/perf.py
+++ b/src/winml/modelkit/commands/perf.py
@@ -300,7 +300,8 @@ def run(self) -> BenchmarkResult:
         _print_model_info(
             self._model.io_config,
             task=self._model.task or self.config.task,
-            device=self._model.device,
+            req_device=self.config.device,
+            act_device=self._model.device,
             ep_name=self._model.ep_name,
         )
 
@@ -755,6 +756,12 @@ def _perf_modules(
 # Report Generation
 # =============================================================================
 
+def _device_string(req_device: str, act_device: str, ep_name: EPName | None) -> str:
+    """Format ``requested (actual) / EP``, omitting redundant or missing parts."""
+    # The requested device is shown only when it differs from the one actually used.
+    label = act_device if req_device == act_device else f"{req_device} ({act_device})"
+    return f"{label} / {ep_name}" if ep_name else label
+
 
 def display_console_report(result: BenchmarkResult, console: Console) -> None:
     """Display benchmark results in formatted console output."""
@@ -763,9 +770,7 @@ def display_console_report(result: BenchmarkResult, console: Console) -> None:
 
     req_device = result.config.device
     act_device = result.actual_device
-    device_str = f"{req_device} ({act_device})" if req_device != act_device else act_device
-    if result.actual_ep:
-        device_str = f"{device_str} / {result.actual_ep}"
+    device_str = _device_string(req_device, act_device, result.actual_ep)
     console.print(f"[dim]Device:[/dim]      {device_str}")
 
     # TODO: show resolved precision once WinMLPreTrainedModel.precision
@@ -885,13 +890,14 @@ def _print_model_info(
     io_config: dict,
     *,
     task: str | None = None,
-    device: str = "auto",
+    req_device: str = "auto",
+    act_device: str = "auto",
     ep_name: EPName | None = None,
 ) -> None:
     """Print model I/O metadata before the benchmark starts."""
     console = Console(stderr=True)
     console.print()
-    device_line = f"{device} / {ep_name}" if ep_name else device
+    device_line = _device_string(req_device, act_device, ep_name)
     console.print(f"[dim]Device:[/dim]      {device_line}")
     if task:
         console.print(f"[dim]Task:[/dim]        {task}")
@@ -1011,7 +1017,7 @@ def _run_onnx_benchmark(
     session.compile()
 
     # Print model info before benchmark starts
-    _print_model_info(io_cfg, device=session.device, ep_name=session.ep_name)
+    _print_model_info(io_cfg, req_device=device, act_device=session.device, ep_name=session.ep_name)
 
     # Run benchmark
     total_iterations = warmup + iterations
@@ -1044,7 +1050,7 @@ def _run_onnx_benchmark(
                 total_iterations=total_iterations,
                 warmup=warmup,
                 model_id=str(onnx_path.name),
-                device=device,
+                device=session.device or device,
             )
             hw_metrics = hw.to_dict()
     else:
diff --git a/src/winml/modelkit/session/monitor/_pdh.py b/src/winml/modelkit/session/monitor/_pdh.py
index a9ceccd8a..0e0364dc4 100644
--- a/src/winml/modelkit/session/monitor/_pdh.py
+++ b/src/winml/modelkit/session/monitor/_pdh.py
@@ -329,8 +329,8 @@ def build_npu_query(npu_luid: str, pid: int | None = None) -> PdhQuery:
     Returns:
         An opened PdhQuery configured for NPU monitoring.
     """
-    # Neural: OpenVINO NPU
-    return build_adapter_query(npu_luid, engine_types=("Compute", "Neural"), pid=pid)
+    # Neural / 3D: OpenVINO NPU
+    return build_adapter_query(npu_luid, engine_types=("Compute", "Neural", "3D"), pid=pid)
 
 
 def build_gpu_query(gpu_luid: str, pid: int | None = None) -> PdhQuery:
diff --git a/tests/e2e/test_perf_e2e.py b/tests/e2e/test_perf_e2e.py
index 013433167..8952b5c7d 100644
--- a/tests/e2e/test_perf_e2e.py
+++ b/tests/e2e/test_perf_e2e.py
@@ -484,6 +484,7 @@ def test_benchmark_ep_cpu(self, ep: str, tmp_path: Path, model_arg: str):
     def test_benchmark_ep_gpu(self, ep: str, tmp_path: Path, model_arg: str):
         """Benchmark with --ep <ep>."""
         require_ep(ep)
+        _require_gpu()
 
         output_file = tmp_path / f"perf_hf_{ep}_gpu.json"
 
@@ -507,6 +508,7 @@ def test_benchmark_ep_gpu(self, ep: str, tmp_path: Path, model_arg: str):
     def test_benchmark_ep_npu(self, ep: str, tmp_path: Path, model_arg: str):
         """Benchmark with --ep <ep>."""
         require_ep(ep)
+        _require_npu()
 
         output_file = tmp_path / f"perf_hf_{ep}_npu.json"