Enable launcher benchmarks in CI for XPU (#5568)

HBN-MichalSzy · anmyachev · web-flow · commit 5c4ddd579107 · 2025-11-29T16:50:09.000+01:00
Addresses #5028.

Co-authored-by: Anatoly Myachev <anatoly.myachev@intel.com>
diff --git a/.github/workflows/third-party-benchmarks.yml b/.github/workflows/third-party-benchmarks.yml
@@ -223,6 +223,18 @@ jobs:
             --max-new-tokens $MAX_NEW_TOKENS \
             --batch-size $BATCH_SIZE
 
+      - name: Run launch microbenchmark tests
+        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'launch_micro_benchmarks')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'launch_micro_benchmarks') }}
+        run: |
+          source scripts/capture-hw-details.sh
+          python python/test/microbenchmark/launch_overhead.py --reports $REPORTS
+
+          python benchmarks/third_party/vllm/transform_results.py $REPORTS/launch_overhead_results.csv $REPORTS/launch_overhead-report.csv \
+            --tag $TAG \
+            --bgroup overhead \
+            --benchmark launch-overhead \
+            --param_cols="input_type"
+
       - name: Upload benchmark reports
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
         uses: actions/upload-artifact@v5
diff --git a/benchmarks/third_party/vllm/transform_results.py b/benchmarks/third_party/vllm/transform_results.py
@@ -46,7 +46,7 @@ def serialize_params(row):
 
     dfs = []
     for compiler_name in compilers:
-        for value_name in ['TFlops', 'GB/s']:
+        for value_name in ['TFlops', 'GB/s', 'time_us']:
             col = f'{compiler_name}-{value_name}'
             if col not in df.columns:
                 continue
diff --git a/python/test/microbenchmark/launch_overhead.py b/python/test/microbenchmark/launch_overhead.py
@@ -2,7 +2,10 @@
 Original code by @bertmaher; profiling added by @apgoucher
 """
 
+import argparse
 import cProfile
+import csv
+import os
 import pstats
 import time
 
@@ -42,11 +45,11 @@ def nop_args(
 def do_bench_walltime(fn):
     print("Compiling...")
     fn()
-    torch.cuda.synchronize()
+    torch.xpu.synchronize()
 
     for _ in range(1000):
         fn()
-    torch.cuda.synchronize()
+    torch.xpu.synchronize()
 
     n_repeat = 10000
 
@@ -55,11 +58,11 @@ def do_bench_walltime(fn):
     for _ in range(25):
         print("Running %d benchmarking iterations..." % n_repeat)
         # Benchmark
-        torch.cuda.synchronize()
+        torch.xpu.synchronize()
         start_time = time.time()
         for _ in range(n_repeat):
             fn()
-        torch.cuda.synchronize()
+        torch.xpu.synchronize()
         end_time = time.time()
         wall_time_ms = (end_time - start_time) * 1e3 / n_repeat
         mses.append(wall_time_ms)
@@ -71,19 +74,19 @@ def do_bench_walltime(fn):
     profile.enable()
     for _ in range(n_repeat):
         fn()
-    torch.cuda.synchronize()
+    torch.xpu.synchronize()
     profile.disable()
     stats = pstats.Stats(profile)
     stats.sort_stats("time")
     stats.print_stats()
     return mses
 
 
-def main(use_tensor_desc: bool):
+def main(use_tensor_desc: bool, reports_dir: str = None):
     if use_tensor_desc:
-        targs = [TensorDescriptor.from_tensor(torch.zeros(1, 16, device="cuda"), block_shape=[1, 16]) for _ in range(5)]
+        targs = [TensorDescriptor.from_tensor(torch.zeros(1, 16, device="xpu"), block_shape=[1, 16]) for _ in range(5)]
     else:
-        targs = [torch.zeros(1, device="cuda") for _ in range(5)]
+        targs = [torch.zeros(1, device="xpu") for _ in range(5)]
     ncargs = [0, 1, 1024, 2**31 - 1, 2**64 - 1, False, True, None, (16, 16)]
     cargs = [32, False, True, 0, 64]
 
@@ -94,9 +97,26 @@ def main(use_tensor_desc: bool):
     print(usecs)
     print(sorted(usecs)[len(usecs) >> 1])
 
+    if reports_dir:
+        os.makedirs(reports_dir, exist_ok=True)
+        csv_path = os.path.join(reports_dir, "launch_overhead_results.csv")
+        file_exists = os.path.exists(csv_path)
+
+        with open(csv_path, "a", newline="") as csvfile:
+            writer = csv.writer(csvfile)
+            if not file_exists:
+                writer.writerow(["input_type", "triton-time_us"])
+
+            input_type = "TensorDescriptor" if use_tensor_desc else "Tensor"
+            writer.writerow([input_type, round(sorted(usecs)[len(usecs) >> 1], 2)])
+
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Benchmark launch overhead for Triton kernels")
+    parser.add_argument("--reports", type=str, default=None, help="Path to directory for CSV reports")
+    args = parser.parse_args()
+
     print("launch overhead of kernel with Tensor inputs")
-    main(use_tensor_desc=False)
+    main(use_tensor_desc=False, reports_dir=args.reports)
     print("launch overhead of kernel with TensorDescriptor inputs")
-    main(use_tensor_desc=True)
+    main(use_tensor_desc=True, reports_dir=args.reports)