From 4d38b5ef1da9c2e3fb6861b0737fa847a72b70b0 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 16 Sep 2025 09:38:12 +0800 Subject: [PATCH 01/27] [CI] Fix CICD test issues (#2027) 1. torchbench issues caused by deps installation 2. pt2e test dataset path and deps installation 3. op benchmark github access permission 4. enhance bisect search disable_distributed --- .github/actions/linux-testenv/action.yml | 16 +++- .github/actions/pt2e/action.yml | 2 + .github/workflows/_linux_e2e.yml | 1 - .github/workflows/_linux_e2e_summary.yml | 15 +-- .github/workflows/_linux_op_benchmark.yml | 4 +- .github/workflows/bisect_search.yml | 112 ++++++++-------------- .github/workflows/pull.yml | 2 + 7 files changed, 67 insertions(+), 85 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index cc5b9e0a70..246b94c919 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -99,7 +99,7 @@ runs: TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi fi - if [ "${{ github.event_name }}" == "pull_request" ];then + if [ "${{ github.event_name }}" == "pull_request" ] && [[ "${{ inputs.pytorch }}" != *"_wheel" ]];then cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops cd third_party/torch-xpu-ops else @@ -139,7 +139,7 @@ runs: fi # for dlrm pip install pyre-extensions - curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install --no-deps + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install # for soft_actor_critic, temp fix pip install git+https://github.com/nocoding03/gym@fix-np cd ../pytorch @@ -152,10 +152,13 @@ runs: TORCHBENCH_COMMIT_ID="$(git rev-parse --short HEAD)" sed -i 's/^ *pynvml.*//' requirements.txt pip install -r requirements.txt - python install.py --continue_on_fail + # python install.py --continue_on_fail + echo "PYTHONPATH=${PWD}:${PYTHONPATH}" >> ${GITHUB_ENV} + pip install dominate + python install.py Super_SloMo # for dlrm pip install pyre-extensions - curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install --no-deps + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install cd ../pytorch else pip install -r ./.ci/docker/requirements-ci.txt @@ -170,6 +173,11 @@ runs: else pip install torchao --pre --index-url https://download.pytorch.org/whl/nightly/xpu fi + if [ "${{ inputs.suite }}" != "None" ];then + # To install numpy 1.x for benchmarks as CUDA + # yolov requires numpy>=1.23 + pip install -U numpy==1.26.4 + fi - name: Torch Config shell: bash -xe {0} run: | diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index 8704ed0fc6..60954ffd29 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -29,6 +29,7 @@ runs: shell: bash -xe {0} run: | # dataset + dataset_dir="${RUNNER_TEMP}/_datasets/imagenet" if [ ! 
-d ${dataset_dir} ];then rm -rf ${dataset_dir} && mkdir -p ${dataset_dir} && cd ${dataset_dir} wget -O valprep.sh https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh @@ -36,6 +37,7 @@ runs: tar -xf ILSVRC2012_img_val.tar bash valprep.sh fi + echo "dataset_dir=${dataset_dir}" >> ${GITHUB_ENV} - name: PT2E Test (${{ inputs.dt }} ${{ inputs.scenario }}) shell: bash -xe {0} run: | diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index ab49d28490..cd2056ff36 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -87,7 +87,6 @@ jobs: cpus_per_xpu: ${{ needs.runner.outputs.cpus_per_xpu }} MODEL_ONLY_NAME: ${{ inputs.model }} AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - dataset_dir: ${{ runner.temp }}/../_datasets/imagenet steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 diff --git a/.github/workflows/_linux_e2e_summary.yml b/.github/workflows/_linux_e2e_summary.yml index 427a35ca82..f2026a9f5a 100644 --- a/.github/workflows/_linux_e2e_summary.yml +++ b/.github/workflows/_linux_e2e_summary.yml @@ -30,14 +30,15 @@ jobs: steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Install gh-cli - run: | - sudo apt-get update - sudo apt-get install gh rsync ca-certificates -y - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: python-version: ${{ inputs.python }} + - name: Install gh-cli + run: | + sudo apt-get update + sudo apt-get install gh rsync ca-certificates -y + pip install pandas requests - name: Download Target Artifact run: | mkdir target/ @@ -64,7 +65,7 @@ jobs: - name: Get summary if: ${{ ! cancelled() }} run: | - pip install pandas requests + exit_label=0 e2e_summary_csv="$(find ./target/ -name "inductor_*.csv" |head -n 1)" if [ -f "${e2e_summary_csv}" ];then bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} @@ -72,17 +73,17 @@ jobs: if [ ${exit_label} -ne 0 ];then grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} fi fi pt2e_summary_csv="$(find ./target/ -name "summary.csv")" if [ -f "${pt2e_summary_csv}" ];then cat ${pt2e_summary_csv} - failed_num=$(grep -c ',failed' ${pt2e_summary_csv}) + failed_num=$(grep -c ',failed' ${pt2e_summary_csv} || true) if [ ${failed_num} -ne 0 ];then echo "[Warning] PT2E has failures!" 
fi fi + exit ${exit_label} - name: Upload Reference Run ID if: ${{ endsWith(inputs.test_type, 'ly') }} run: | diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index b65d985008..d71458c9bf 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -50,8 +50,6 @@ jobs: op_benchmark: needs: runner runs-on: ${{ needs.runner.outputs.runner_id }} - permissions: - issues: write timeout-minutes: 900 container: image: mengfeili/intel-pvc-driver:1146-1136 @@ -93,6 +91,8 @@ jobs: op_benchmark_test_results_check: needs: op_benchmark runs-on: ubuntu-24.04 + permissions: + issues: write steps: - name: Install gh-cli run: | diff --git a/.github/workflows/bisect_search.yml b/.github/workflows/bisect_search.yml index 4adba0df6c..c2512acb33 100644 --- a/.github/workflows/bisect_search.yml +++ b/.github/workflows/bisect_search.yml @@ -45,42 +45,26 @@ jobs: get_runner: runs-on: ${{ inputs.runner }} outputs: - test_host: ${{ steps.runner-info.outputs.test_host }} - test_user: ${{ steps.runner-info.outputs.test_user }} - test_group: ${{ steps.runner-info.outputs.test_group }} + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} steps: - - name: Get runner info + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner id: runner-info - run: | - # get test runner - echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT} - echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} - echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} - # show host info - cat /etc/os-release - uname -a - source /opt/intel/oneapi/setvars.sh - sycl-ls - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - - name: Cleanup workspace - if: ${{ always() }} - run: | - # clean docker cache - docker stop $(docker ps -aq) || true - docker system prune -af || true - # clean files - ls -al - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + uses: ./.github/actions/get-runner biisect-search: needs: get_runner - runs-on: ${{ needs.get_runner.outputs.test_host }} + runs-on: ${{ needs.get_runner.outputs.runner_id }} container: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g - -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} + -u ${{ needs.get_runner.outputs.user_id }} --group-add ${{ needs.get_runner.outputs.render_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool SEARCH_COMMITS: ${{ inputs.search_commits }} @@ -95,33 +79,19 @@ jobs: HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - - name: Check runner - run: | - ls -al - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - sudo rm -rf /tmp/xpu-tool - - name: Setup python-${{ inputs.python }} - uses: actions/setup-python@v5 + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 with: - python-version: ${{ inputs.python }} - - name: Check runner - run: | - hostname && whoami && id - clinfo --list - gcc -v && g++ -v - which python && which pip - python -V - pip install -U pip wheel setuptools - pip list - uname -a - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - pip install cmake ninja pandas psutil scipy requests pybind11 - mkdir 
gs-logs gs-search - echo "Status,Acc,Perf,PyTorch,Torch-xpu-ops" > gs-logs/summary.csv + path: gs-scripts + - name: Prepare test env + uses: ./gs-scripts/.github/actions/linux-testenv + with: + pytorch: nightly_wheel + python: ${{ inputs.python }} - name: Install oneAPI DLE if: ${{ inputs.oneapi != 'installed' }} run: | - rm -rf ~/intel ~/.intel /tmp/intel + rm -rf ~/intel ~/.intel ${HOME}/intel if [ "${{ inputs.oneapi }}" == "2025.1" ];then ONEAPI_URL="https://registrationcenter-download.intel.com/akdlm/IRC_NAS/3435dc45-055e-4f7a-86b1-779931772404/intel-deep-learning-essentials-2025.1.3.7_offline.sh" elif [ "${{ inputs.oneapi }}" == "2025.2" ];then @@ -130,42 +100,42 @@ jobs: ONEAPI_URL="${{ inputs.oneapi }}" fi wget -q -O oneapi.sh "${ONEAPI_URL}" - bash oneapi.sh -a -s --eula accept --action install --install-dir /tmp/intel/oneapi - echo "XPU_ONEAPI_PATH=/tmp/intel/oneapi" >> ${GITHUB_ENV} - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - with: - path: gs-scripts - - name: Prepare source code + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} + + - name: Summary file run: | - git clone https://github.com/pytorch/pytorch gs-pytorch - cd gs-pytorch + mkdir -p gs-logs + echo "Status,Acc,Perf,PyTorch,Torch-xpu-ops" > gs-logs/summary.csv + - name: Get latest versions + run: | + cd pytorch LATEST_PT_COMMIT="$(git rev-parse HEAD)" - cd .. - git clone https://github.com/intel/torch-xpu-ops gs-torch-xpu-ops - cd gs-torch-xpu-ops + cd third_party/torch-xpu-ops LATEST_XPU_COMMIT="$(git rev-parse HEAD)" - cd .. echo "LATEST_PT_COMMIT=${LATEST_PT_COMMIT}" >> ${GITHUB_ENV} echo "LATEST_XPU_COMMIT=${LATEST_XPU_COMMIT}" >> ${GITHUB_ENV} - name: Prepare test env run: | - pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu if [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/huggingface.py"* ]];then - pip install transformers==4.44.2 + pip install transformers elif [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/timm_models.py"* ]];then - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) + pip install timm elif [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/torchbench.py"* ]];then model_name="$(echo ${{ inputs.search_case }} |sed 's+.*\--only *++;s/ .*//')" git clone https://github.com/pytorch/benchmark gs-benchmark cd gs-benchmark + pip install -r requirements.txt echo "PYTHONPATH=${PWD}:${PYTHONPATH}" >> ${GITHUB_ENV} python install.py ${model_name} - else - pip install -r gs-pytorch/.ci/docker/requirements-ci.txt + cd .. fi - pip uninstall -y torch && pip uninstall -y torch + pip uninstall -y torchvision torchaudio + git clone https://github.com/pytorch/vision gs-vision + cd gs-vision + python setup.py install + cd .. + pip uninstall -y torch - name: Bisect search pytorch if: ${{ contains(inputs.search_commits, 'pytorch') }} run: | @@ -186,7 +156,7 @@ jobs: > ${{ github.workspace }}/gs-logs/search-${new_commit}-${LATEST_XPU_COMMIT}.log 2>&1 && echo $? 
|| echo $?)" new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)" if [ "${old_status}" != "${new_status}" ];then - cd gs-pytorch + cd pytorch git reset --hard rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/ git bisect start ${new_commit} ${old_commit} @@ -219,7 +189,7 @@ jobs: > ${{ github.workspace }}/gs-logs/search-${LATEST_PT_COMMIT}-${new_commit}.log && echo $? || echo $?)" new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)" if [ "${old_status}" != "${new_status}" ];then - cd gs-pytorch + cd pytorch git reset --hard rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/ git bisect start ${new_commit} ${old_commit} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index be6b260734..b5e8606ac8 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -115,6 +115,7 @@ jobs: with: runner: pvc_rolling pytorch: ${{ needs.conditions-filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }} ut: ${{ matrix.ut_name }} linux-distributed: @@ -129,6 +130,7 @@ jobs: with: runner: pvc_rolling pytorch: ${{ needs.conditions-filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }} ut: ${{ matrix.ut_name }} linux-e2e: From df1d7ad08c124f86db2fc9bdf05111322c7655b6 Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:57:02 +0800 Subject: [PATCH 02/27] [CI] Add ported distributed cases (#1945) This PR intends to add some ported distributed cases in torch-xpu-ops CI. - Add ZE_AFFINITY_MASK to ensure using Xelink. - Add CCL_ROOT for Xelink, this WA can be removed after oneCCL upgrade to 2021.16.2 - Increase distributed test time limit. Currently, the test part needs about 1 hour after add ported cases. 
disable_e2e disable_ut --- .github/actions/linux-uttest/action.yml | 6 +- .github/scripts/ut_result_check.sh | 3 +- .github/workflows/_linux_ut.yml | 3 +- test/xpu/run_distributed.py | 48 +++++++++-- test/xpu/skip_list_dist.py | 101 ++++-------------------- 5 files changed, 65 insertions(+), 96 deletions(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index fb79182006..6e2cf2ceeb 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -155,7 +155,7 @@ runs: tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log - name: xpu_distributed - shell: timeout 3600 bash -xeu -o pipefail {0} + shell: timeout 36000 bash -xeu -o pipefail {0} if: ${{ inputs.ut_name == 'xpu_distributed' }} run: | xpu-smi topology -m @@ -166,9 +166,13 @@ runs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi + export CCL_ROOT=$(dirname $(which python))/../ + export PATH="${CCL_ROOT}/bin/libfabric:${PATH}" + export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}" python run_distributed.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log + find ../ -type f -name "*.xml" -exec cp {} ${{ github.workspace }}/ut_log/ \; # Summary - name: UT Test Results Summary diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 0e6b95ec45..1aac532497 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -198,7 +198,8 @@ if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' fi if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log + grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log + sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 1751d9d972..6da0abadd1 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -99,11 +99,12 @@ jobs: test-in-baremetal: needs: runner + timeout-minutes: 600 if: ${{ contains(inputs.ut, 'distributed') }} runs-on: ${{ needs.runner.outputs.runner_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread -n 1 + PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1 steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py index ddde5f8c8a..cd04712091 100644 --- a/test/xpu/run_distributed.py +++ b/test/xpu/run_distributed.py @@ -9,9 +9,41 @@ res2 = 0 fail_test = [] -# libfabric WA to avoid hang issue -os.environ["FI_PROVIDER"] = "tcp" -# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3" +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = 
[x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) # run python test @@ -26,6 +58,10 @@ def run(test_command): test_command = ["python", "distributed/test_c10d_ops_xccl.py"] res += run(test_command) +test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"] +res += run(test_command) +test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"] +res += run(test_command) # run pytest with skiplist for key in skip_dict: @@ -38,8 +74,4 @@ def run(test_command): if fail_test: print(",".join(fail_test) + " have failures") -exit_code = os.WEXITSTATUS(res2) -if exit_code == 0: - sys.exit(res) -else: - sys.exit(exit_code) +sys.exit(res) diff --git a/test/xpu/skip_list_dist.py b/test/xpu/skip_list_dist.py index 1210896ec6..cf5ed5cd78 100644 --- a/test/xpu/skip_list_dist.py +++ b/test/xpu/skip_list_dist.py @@ -1,105 +1,36 @@ skip_dict = { - "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( - "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", - "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", - "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", - "test_checkpoint_submodule_use_reentrant_False_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( "test_ddp_parity_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, - "../../../../test/distributed/fsdp/test_fsdp_core.py": ( - "test_delayed_optim_step_offload_false_no_shard_xpu", - "test_delayed_optim_step_offload_false_none_xpu", - "test_delayed_optim_step_offload_false_shard_grad_op_xpu", - "test_delayed_optim_step_offload_true_none_xpu", - "test_delayed_optim_step_offload_true_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_false_no_shard_xpu", - "test_delayed_reduce_scatter_offload_false_none_xpu", - "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_true_none_xpu", - "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_offload_false_no_shard_xpu", - "test_mixture_of_experts_offload_false_none_xpu", - "test_mixture_of_experts_offload_false_shard_grad_op_xpu", - "test_mixture_of_experts_offload_true_no_shard_xpu", - "test_mixture_of_experts_offload_true_none_xpu", - "test_mixture_of_experts_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_no_shard_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", - 
"test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_false_no_shard_xpu", - "test_nested_always_wrap_model_offload_false_none_xpu", - "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_true_none_xpu", - "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_false_no_shard_xpu", - "test_nested_wrapped_model_offload_false_none_xpu", - "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_true_none_xpu", - "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", - "test_transformer_offload_false_no_shard_xpu", - "test_transformer_offload_false_none_xpu", - "test_transformer_offload_false_shard_grad_op_xpu", - "test_transformer_offload_true_none_xpu", - "test_transformer_offload_true_shard_grad_op_xpu", - # https://github.com/intel/torch-xpu-ops/issues/1475 - "test_transformer_no_grad_mixed_precision_True_xpu", - "test_transformer_no_grad_mixed_precision_False_xpu", - ), - # Will add them back after debugging - # "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_fsdp_init_with_device_mesh_is_even_sharded_model_False_xpu", - # "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu", - # "test_raises_warning_or_errors_xpu", - # ), - "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": ( - "test_invalid_first_iter_order_sharding_strategy1_xpu", - "test_train_eval_sharding_strategy1_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( "test_parity_with_non_frozen_fsdp_xpu", "test_parity_with_ddp_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, - # will bring back after oneccl upgrade to 2021.16.1 - # "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": ( 
"test_transformer_no_grad_mixed_precision_True_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, - # Will add them back after debugging - # "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": ( - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_xpu", - # "test_hsdp_init_with_device_mesh_xpu", - # "test_root_module_is_not_FSDP_xpu", - # ), "../../../../test/distributed/fsdp/test_utils.py": None, "distributed/test_c10d_xccl.py": ( - # will bring back after oneccl upgrade to 2021.16.1 - "test_xccl_barrier", + # https://github.com/intel/torch-xpu-ops/issues/2046 + "test_unwaited", + ), + "distributed/test_c10d_ops_xccl.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # depends on https://github.com/pytorch/pytorch/pull/159473 + "test_tracing_with_fakepg_xpu", ), + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_memory_tracker.py": None, } From 74b11bff8e68ff8f170277ffdae8082a3845f200 Mon Sep 17 00:00:00 2001 From: Chao Han Date: Wed, 17 Sep 2025 08:57:21 +0800 Subject: [PATCH 03/27] support high priority stream (#1715) Support high priority stream for xccl, test case add in https://github.com/intel/torch-xpu-ops/pull/2049 We need merge this pr first and upstream op register https://github.com/pytorch/pytorch/pull/163049 and then test case could be pass --------- Co-authored-by: mengfei25 --- src/xccl/ProcessGroupXCCL.cpp | 17 +++++++++++------ src/xccl/ProcessGroupXCCL.hpp | 23 ++++++++++++++++------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index c820a1c486..ba7c8561be 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -322,7 +322,9 @@ bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { return true; } -ProcessGroupXCCL::Options::Options() : Backend::Options(XCCL_BACKEND_NAME) {} +ProcessGroupXCCL::Options::Options(bool is_high_priority_stream) + : Backend::Options(XCCL_BACKEND_NAME), + is_high_priority_stream(is_high_priority_stream) {} static std::atomic process_group_id = 0; @@ -351,7 +353,7 @@ const std::string& ProcessGroupXCCL::logPrefix() const { } ProcessGroupXCCL::ProcessGroupXCCL( - const c10::intrusive_ptr& store, + c10::intrusive_ptr store, int rank, int size, c10::intrusive_ptr options) @@ -377,7 +379,10 @@ ProcessGroupXCCL::ProcessGroupXCCL( std::string torch_distributed_debug = getCvarString({"TORCH_DISTRIBUTED_DEBUG"}, OFF.c_str()); LOG(INFO) << logPrefix() << "ProcessGroupXCCL initialization options: " - << "size: " << size << ", global rank: " << rank_; + << "size: " << size << ", global rank: " << rank_ + << ", USE_HIGH_PRIORITY_STREAM: " + << options_->is_high_priority_stream + << ", PG Name: " << options_->group_name; LOG(INFO) << logPrefix() << "ProcessGroupXCCL environments: " << "XCCL version: " << XcclVersion @@ -534,9 +539,9 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( rank = 
p2pRank; } - c10::impl::VirtualGuardImpl impl(device.type()); - c10::Stream stream = - impl.getStreamFromGlobalPool(device, /*isHighPriority=*/false); + bool force_high = getCvarBool(TORCH_XCCL_HIGH_PRIORITY, false); + c10::Stream stream = at::xpu::getStreamFromPool( + options_->is_high_priority_stream || force_high); sycl::queue& q = c10::xpu::XPUStream(stream).queue(); auto ctx = ccl::create_context(q.get_context()); diff --git a/src/xccl/ProcessGroupXCCL.hpp b/src/xccl/ProcessGroupXCCL.hpp index e7aa39c82d..2516c0b912 100644 --- a/src/xccl/ProcessGroupXCCL.hpp +++ b/src/xccl/ProcessGroupXCCL.hpp @@ -24,6 +24,9 @@ #include namespace c10d { +static std::vector TORCH_XCCL_HIGH_PRIORITY = { + "TORCH_XCCL_HIGH_PRIORITY"}; + static std::vector TORCH_XCCL_BLOCKING_WAIT = { "TORCH_XCCL_BLOCKING_WAIT", "XCCL_BLOCKING_WAIT"}; @@ -118,18 +121,19 @@ class TORCH_API ProcessGroupXCCL : public Backend { }; struct Options : public Backend::Options { - explicit Options(); + explicit Options(bool is_high_priority_stream = false); - static c10::intrusive_ptr create() { - return c10::make_intrusive(); + static c10::intrusive_ptr create( + bool is_high_priority_stream = false) { + return c10::make_intrusive(is_high_priority_stream); } - + bool is_high_priority_stream; std::vector global_ranks_in_group; std::string group_name; }; ProcessGroupXCCL( - const c10::intrusive_ptr& store, + c10::intrusive_ptr store, int rank, int size, c10::intrusive_ptr options = Options::create()); @@ -138,11 +142,16 @@ class TORCH_API ProcessGroupXCCL : public Backend { const c10::intrusive_ptr& store, int rank, int size, - const std::string& groupName) - : ProcessGroupXCCL(store, rank, size) {} + const std::string& groupName, + c10::intrusive_ptr options = Options::create()) + : ProcessGroupXCCL(store, rank, size, std::move(options)) {} ~ProcessGroupXCCL() override; + c10::intrusive_ptr getOptions() { + return options_; + } + const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); } From 6d1a47672546d0ab115e14da1cf4d9dd5903fe4d Mon Sep 17 00:00:00 2001 From: Chao Han Date: Wed, 17 Sep 2025 08:59:04 +0800 Subject: [PATCH 04/27] Reduce tensor shape avoid timeout (#2051) --- test/xpu/distributed/test_c10d_xccl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/xpu/distributed/test_c10d_xccl.py b/test/xpu/distributed/test_c10d_xccl.py index 44a3ac148a..226501d7ac 100644 --- a/test/xpu/distributed/test_c10d_xccl.py +++ b/test/xpu/distributed/test_c10d_xccl.py @@ -641,7 +641,7 @@ def test_unwaited(self) -> None: with _functional_collectives.allow_inflight_collective_as_graph_input_ctx(): self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) input = torch.full( - (10240, 10240), float(self.rank), device=f"xpu:{self.rank}" + (1024, 1024), float(self.rank), device=f"xpu:{self.rank}" ) dist.all_reduce(input, op=dist.ReduceOp.SUM, async_op=True) # Non-functional collectives run under the context manager is registered in the work registry. @@ -653,7 +653,7 @@ def test_unwaited(self) -> None: # Case 2: Run collectives not under context manager, and don't call wait on them. # NOTE: Here we intentionally test memory-stressed case. 
self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 2) - for _ in range(50000): + for _ in range(500): input = torch.full( (1024, 1024), float(self.rank), device=f"xpu:{self.rank}" ) From a5f73c1c761ce2150856f120007b374bac3ec724 Mon Sep 17 00:00:00 2001 From: Slawomir Siwek Date: Wed, 17 Sep 2025 04:35:31 +0200 Subject: [PATCH 05/27] Move checks from nonzero kernel to operator (#1991) - Early return if input to nonzero has no elements, - Change pytorch `MAX_DIMS` to `XPU_MAX_TENSORINFO_DIMS` xpu equivalent, - Rename `tensor` to `out` to match op schema. --------- Co-authored-by: Tadeusz Krawiec <72737249+tadkrawiec@users.noreply.github.com> --- src/ATen/native/xpu/Nonzero.cpp | 14 ++- src/ATen/native/xpu/sycl/NonzeroKernel.cpp | 129 ++++++++++----------- 2 files changed, 69 insertions(+), 74 deletions(-) diff --git a/src/ATen/native/xpu/Nonzero.cpp b/src/ATen/native/xpu/Nonzero.cpp index 9988631d3b..a1f9b1d09f 100644 --- a/src/ATen/native/xpu/Nonzero.cpp +++ b/src/ATen/native/xpu/Nonzero.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include namespace at { namespace native { @@ -10,7 +10,7 @@ Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { TORCH_CHECK( self.numel() < std::numeric_limits::max(), "nonzero is not supported for tensors with more than INT_MAX elements, \ - See https://github.com/pytorch/pytorch/issues/51871"); + See https://github.com/pytorch/pytorch/issues/51871"); TORCH_CHECK( out.dtype() == at::kLong, "Expected object of scalar type ", @@ -24,11 +24,15 @@ Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { " and self on ", self.device()); TORCH_CHECK( - self.dim() <= MAX_DIMS, + self.dim() <= XPU_MAX_TENSORINFO_DIMS, "nonzero is not supported for tensor with more than ", - MAX_DIMS, + XPU_MAX_TENSORINFO_DIMS, " dimensions"); + if (self.numel() == 0) { + out = at::detail::empty_xpu({0, self.dim()}, out.options()); + return out; + } xpu::nonzero_kernel(self, out); return out; } @@ -39,4 +43,4 @@ Tensor nonzero_xpu(const Tensor& self) { return out; } } // namespace native -} // namespace at \ No newline at end of file +} // namespace at diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp index 72df4b8ee8..d9c166dbcb 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp @@ -17,7 +17,7 @@ struct FlattenIdxtoRealIdxKernelFunctor { if (global_id < N_) { auto dim = global_id / num_nonzeros_; auto index = global_id % num_nonzeros_; - tensor_begin_[global_id] = + out_begin_[global_id] = idx_flat_begin_[index] / divisor_[dim] % sizes_[dim]; } } @@ -25,14 +25,14 @@ struct FlattenIdxtoRealIdxKernelFunctor { int64_t N, const int64_t num_dim, const int64_t num_nonzeros, - int64_t* tensor_begin, + int64_t* out_begin, int64_t* idx_flat_begin, int64_t* divisor, int64_t* sizes) : N_(N), num_dim_(num_dim), num_nonzeros_(num_nonzeros), - tensor_begin_(tensor_begin), + out_begin_(out_begin), idx_flat_begin_(idx_flat_begin) { for (auto dim = num_dim - 1; dim >= 0; dim--) { sizes_[dim] = sizes[dim]; @@ -44,7 +44,7 @@ struct FlattenIdxtoRealIdxKernelFunctor { int64_t N_; const int64_t num_dim_; const int64_t num_nonzeros_; - int64_t* tensor_begin_; + int64_t* out_begin_; int64_t* idx_flat_begin_; int64_t divisor_[XPU_MAX_TENSORINFO_DIMS]; int64_t sizes_[XPU_MAX_TENSORINFO_DIMS]; @@ -79,77 +79,68 @@ struct CopyIfFunc { }; template -void nonzero_template(const Tensor& self_, Tensor& tensor) { +void nonzero_template(const Tensor& self_, Tensor& out) { Tensor self = 
self_.contiguous(); const int64_t num_dim = self.dim(); - TORCH_CHECK(num_dim <= XPU_MAX_TENSORINFO_DIMS, "dim exceed max allowed dim"); - - int64_t N = self.numel(); - - if (N > 0) { - Tensor idx_flat = at::empty( - {N}, tensor.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); - Tensor range = at::empty( - {N}, tensor.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); - - const scalar_t* self_begin = self.const_data_ptr(); - int64_t* idx_flat_begin = idx_flat.data_ptr(); - int64_t* range_begin = nullptr; - - CopyIfFunc f(self_begin); - auto idx_flat_end = - pstl::copy_if(range_begin, range_begin + N, idx_flat_begin, f); - - auto num_nonzeros = std::distance(idx_flat_begin, idx_flat_end); - - bool need_to_copy = tensor.dim() == 2 && - tensor.sizes()[0] == num_nonzeros && tensor.sizes()[1] == self_.dim() && - !tensor.t().is_contiguous(); - at::Tensor tensor_ = need_to_copy - ? Tensor(at::detail::empty_xpu( - {self_.dim(), num_nonzeros}, tensor.options())) - : tensor.resize_({self_.dim(), num_nonzeros}); - - if (num_nonzeros > 0 && num_dim > 0) { - int64_t* tensor_begin = tensor_.data_ptr(); - - // preload sizes tensor for index calculation - int64_t sizes[XPU_MAX_TENSORINFO_DIMS]; - int64_t divisor[XPU_MAX_TENSORINFO_DIMS]; - sizes[num_dim - 1] = self.size(num_dim - 1); - divisor[num_dim - 1] = 1; - for (auto dim = num_dim - 2; dim >= 0; dim--) { - sizes[dim] = self.size(dim); - divisor[dim] = sizes[dim + 1] * divisor[dim + 1]; - } - - const int64_t N = num_nonzeros * num_dim; - // restore flatten idx to indices - FlattenIdxtoRealIdxKernelFunctor kfn( - N, - num_dim, - num_nonzeros, - tensor_begin, - idx_flat_begin, - divisor, - sizes); - - const auto wg_sz = std::min(syclMaxWorkGroupSize(kfn), N); - const auto num_wg = (N + wg_sz - 1) / wg_sz; - - sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), kfn); - } - if (need_to_copy) { - tensor.copy_(tensor_.t()); - } else { - // transpose out so it is correct size - Tensor tensor_temp = tensor_.t(); - tensor.set_(tensor_temp); + const int64_t N = self.numel(); + + Tensor idx_flat = at::empty( + {N}, out.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); + + const scalar_t* self_begin = self.const_data_ptr(); + int64_t* idx_flat_begin = idx_flat.data_ptr(); + int64_t* range_begin = nullptr; + + CopyIfFunc f(self_begin); + auto idx_flat_end = + pstl::copy_if(range_begin, range_begin + N, idx_flat_begin, f); + + auto num_nonzeros = std::distance(idx_flat_begin, idx_flat_end); + + bool need_to_copy = out.dim() == 2 && + out.sizes()[0] == num_nonzeros && out.sizes()[1] == num_dim && + !out.t().is_contiguous(); + Tensor out_ = need_to_copy + ? 
Tensor(at::detail::empty_xpu( + {num_dim, num_nonzeros}, out.options())) + : out.resize_({num_dim, num_nonzeros}); + + if (num_nonzeros > 0 && num_dim > 0) { + int64_t* out_begin = out_.data_ptr(); + + // preload sizes tensor for index calculation + int64_t sizes[XPU_MAX_TENSORINFO_DIMS]; + int64_t divisor[XPU_MAX_TENSORINFO_DIMS]; + sizes[num_dim - 1] = self.size(num_dim - 1); + divisor[num_dim - 1] = 1; + for (auto dim = num_dim - 2; dim >= 0; dim--) { + sizes[dim] = self.size(dim); + divisor[dim] = sizes[dim + 1] * divisor[dim + 1]; } + const int64_t N = num_nonzeros * num_dim; + // restore flatten idx to indices + FlattenIdxtoRealIdxKernelFunctor kfn( + N, + num_dim, + num_nonzeros, + out_begin, + idx_flat_begin, + divisor, + sizes); + + const auto wg_sz = std::min(syclMaxWorkGroupSize(kfn), N); + const auto num_wg = (N + wg_sz - 1) / wg_sz; + + sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), kfn); + } + if (need_to_copy) { + out.copy_(out_.t()); } else { - tensor = tensor.resize_({num_dim, N}).contiguous().t(); + // transpose out so it is correct size + Tensor out_temp = out_.t(); + out.set_(out_temp); } } From 9f08551751b3214c10075ea77966a0912665881e Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Wed, 17 Sep 2025 15:12:43 +0800 Subject: [PATCH 06/27] Check input contiguous before collectives call to low-level communication library (#2048) Co-authored-by: Chao Han --- src/xccl/ProcessGroupXCCL.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index ba7c8561be..923771a13f 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -97,17 +97,17 @@ void checkSingleTensor( ) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); + } - // Skip the following requirements for P2P operations - if (!tensor.is_contiguous(tensor.suggest_memory_format())) { - if (p2p) { - TORCH_WARN_ONCE( - "Detected non-contiguous tensor in P2P operations. It is user " - "responsibility to guarantee that source and destination tensors have " - "the same contiguity format."); - } else { - C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); - } + // Skip the following requirements for P2P operations + if (!tensor.is_contiguous(tensor.suggest_memory_format())) { + if (p2p) { + TORCH_WARN_ONCE( + "Detected non-contiguous tensor in P2P operations. 
It is user " + "responsibility to guarantee that source and destination tensors have " + "the same contiguity format."); + } else { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); } } } From 6508096a46812e712374aae68c798ef267773172 Mon Sep 17 00:00:00 2001 From: Slawomir Siwek Date: Wed, 17 Sep 2025 15:17:59 +0200 Subject: [PATCH 07/27] Fix hardswish gradients corner case (#2050) Fixes https://github.com/intel/torch-xpu-ops/issues/2014 --- src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp | 4 ++-- test/xpu/skip_list_common.py | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp index 8c6e47f774..2c2c60ada9 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp @@ -35,9 +35,9 @@ struct HardswishBackwardFunctor { const opmath_t one_half(0.5f); opmath_t grad_val = static_cast(grad_val_); opmath_t self_val = static_cast(self_val_); - if (self_val < neg_three) { + if (self_val <= neg_three) { return zero; - } else if (self_val <= three) { + } else if (self_val < three) { return grad_val * ((self_val / three) + one_half); } else { return grad_val; diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py index ae3a8b8a93..5e70eebf44 100644 --- a/test/xpu/skip_list_common.py +++ b/test/xpu/skip_list_common.py @@ -1022,10 +1022,6 @@ "test_ctc_loss_cudnn_tensor", # want "xpu" in function name # RuntimeError: reflection_pad2d_backward_xpu does not have a deterministic implementation, but you set 'torch.use_deterministic_algorithms(True)'. "test_ReflectionPad2d_large_deterministic_xpu", - # Case updated in pytorch commit 97272e4 - "test_hardswish_grad_corner_xpu_bfloat16", - "test_hardswish_grad_corner_xpu_float16", - "test_hardswish_grad_corner_xpu_float32", # x_cuda = x.clone().detach().to("cuda").requires_grad_(): Torch not compiled with CUDA enabled "test_layer_norm_backwards_eps", ), From 24fab67b6ecf7620d0cf776047a3056c5b518bab Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com> Date: Wed, 17 Sep 2025 18:43:49 -0700 Subject: [PATCH 08/27] Clean up getDeviceIndexOfCurrentQueue (#2060) # Motivation Clean up `getDeviceIndexOfCurrentQueue` since it duplicates `current_device` and provides no additional functionality. 
--- src/ATen/native/xpu/sycl/BatchNormKernels.cpp | 3 +- src/ATen/native/xpu/sycl/Norm.h | 3 +- src/ATen/native/xpu/sycl/SoftMaxKernels.cpp | 12 +- .../native/xpu/sycl/TensorShapeKernels.cpp | 3 +- src/comm/DeviceProperties.h | 130 +++++++----------- src/comm/Runtime.h | 4 - 6 files changed, 60 insertions(+), 95 deletions(-) diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index f8e1b6906e..4124890274 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -185,8 +185,7 @@ int get_num_threads_by_dev_max_group_size( int get_prefer_simd(int numPlane, int nHw) { // decide SIMD: SIMD32 or SIMD16 - auto dev_id = at::xpu::getDeviceIndexOfCurrentQueue(); - + auto dev_id = at::xpu::current_device(); auto* dev_prop = at::xpu::getDeviceProperties(dev_id); auto sub_group_size = dev_prop->sub_group_sizes; int simd = sub_group_size[1]; diff --git a/src/ATen/native/xpu/sycl/Norm.h b/src/ATen/native/xpu/sycl/Norm.h index 6117b6d261..8b887a443a 100644 --- a/src/ATen/native/xpu/sycl/Norm.h +++ b/src/ATen/native/xpu/sycl/Norm.h @@ -269,8 +269,7 @@ class NormConfig { } void get_max_vec_size() { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int total_resource = syclMaxWorkItemsPerTile(dev_id); + int64_t total_resource = syclMaxWorkItemsPerTile(); constexpr int float4_size = sizeof(float) * 4; max_vec_size = float4_size / element_size_bytes; diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index fdedf5fb09..020fe4a1bc 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -1559,8 +1559,7 @@ void spatial_softmax_forward( canUse32BitIndexMath(input) && canUse32BitIndexMath(output); // decide SIMD: SIMD32 or SIMD16 - auto dev_id = at::xpu::getDeviceIndexOfCurrentQueue(); - auto* dev_prop = at::xpu::getDeviceProperties(dev_id); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -1749,8 +1748,7 @@ void spatial_softmax_backward( canUse32BitIndexMath(output) && canUse32BitIndexMath(gradOutput); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -1901,8 +1899,7 @@ Tensor& masked_softmax_forward( canUse32BitIndexMath(input) && canUse32BitIndexMath(output); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -2026,8 +2023,7 @@ void masked_softmax_backward( canUse32BitIndexMath(output) && canUse32BitIndexMath(gradOutput); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { diff --git a/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp b/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp index 8c46658e7a..fed5fee904 100644 --- a/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp +++ 
b/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp @@ -669,8 +669,7 @@ void split_with_sizes_copy_out_xpu_contiguous_no_cast( num_groups += div_up(split_chunk_size, GROUP_SIZE * BYTES_PER_THREAD); } - auto dev_id = getDeviceIndexOfCurrentQueue(); - int64_t tile_size = syclMaxWorkItemsPerTile(dev_id); + int64_t tile_size = syclMaxWorkItemsPerTile(); const int64_t max_groups = tile_size / GROUP_SIZE * 2.0; // Make each thread process BYTES_PER_THREAD * iter_factor bytes to regulate diff --git a/src/comm/DeviceProperties.h b/src/comm/DeviceProperties.h index ee0d285eaf..5d1f0c8f77 100644 --- a/src/comm/DeviceProperties.h +++ b/src/comm/DeviceProperties.h @@ -3,17 +3,15 @@ #include #include -#include namespace xpu { namespace sycl { template static int64_t syclMaxWorkGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { - auto q = c10::xpu::getCurrentXPUStream(dev_id).queue(); - auto ctx = q.get_context(); - auto dev = q.get_device(); + at::DeviceIndex dev_id = at::xpu::current_device()) { + auto& ctx = c10::xpu::get_device_context(); + auto& dev = c10::xpu::get_raw_device(dev_id); auto kid = ::sycl::get_kernel_id(); // The kernel won't be built for devices except for the first device. @@ -30,73 +28,69 @@ static int64_t syclMaxWorkGroupSize( template static int64_t syclMaxWorkGroupSize( - KernelClass /*kfn*/, - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + const KernelClass& /*kfn*/, + at::DeviceIndex dev_id = at::xpu::current_device()) { return syclMaxWorkGroupSize(dev_id); } static inline int64_t syclDeviceMaxWorkGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_work_group_size; } static inline int64_t syclMaxSubGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); - auto subgroup_sizes = dev_prop->sub_group_sizes; - uint64_t max_val = 0; - for (auto i : subgroup_sizes) { - if (i > max_val) - max_val = i; - } - return max_val; + const auto& subgroup_sizes = dev_prop->sub_group_sizes; + TORCH_CHECK( + !subgroup_sizes.empty(), + "The device subgroup sizes is empty, please check the device status."); + return *std::max_element(subgroup_sizes.begin(), subgroup_sizes.end()); } static inline int64_t syclMinSubGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); - auto subgroup_sizes = dev_prop->sub_group_sizes; - uint64_t min_val = dev_prop->max_work_group_size; - for (auto i : subgroup_sizes) { - if (i < min_val) - min_val = i; - } - return min_val; + const auto& subgroup_sizes = dev_prop->sub_group_sizes; + TORCH_CHECK( + !subgroup_sizes.empty(), + "The device subgroup sizes is empty, please check the device status."); + return *std::min_element(subgroup_sizes.begin(), subgroup_sizes.end()); } static inline int64_t syclMaxComputeUnitSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_compute_units; } static inline int64_t syclGpuEuCount( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { 
auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_eu_count; } static inline int64_t syclGpuEuSimdWidth( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_eu_simd_width; } static inline int64_t syclGpuHWThreadsPerEU( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_hw_threads_per_eu; } static inline int64_t syclGpuEUCountPerSubslice( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_eu_count_per_subslice; } static inline int64_t syclMaxWorkItemsPerTile( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); int64_t eu_cnt = dev_prop->gpu_eu_count; int64_t simd_width = syclMaxSubGroupSize(dev_id); @@ -105,7 +99,7 @@ static inline int64_t syclMaxWorkItemsPerTile( } static inline int64_t syclMaxWorkItemsPerSubSlice( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); int64_t simd_width = syclMaxSubGroupSize(dev_id); int64_t eu_count = dev_prop->gpu_eu_count_per_subslice; @@ -113,7 +107,7 @@ static inline int64_t syclMaxWorkItemsPerSubSlice( } static inline int64_t syclMaxWorkItemsPerEU( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); int64_t simd_width = syclMaxSubGroupSize(dev_id); int64_t hw_threads = dev_prop->gpu_hw_threads_per_eu; @@ -121,94 +115,76 @@ static inline int64_t syclMaxWorkItemsPerEU( } static inline int64_t syclMaxNumSubGroups( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_num_sub_groups; } static inline int64_t syclMaxDSSNum( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { int64_t dss_num = syclMaxComputeUnitSize(dev_id) / syclGpuEUCountPerSubslice(dev_id); return dss_num; } static inline size_t syclGlobalMemSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->global_mem_size; } static inline int64_t syclLocalMemSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->local_mem_size; } template uint32_t syclPrefVectorWidth( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { (void)dev_id; // Suppress unused variable warning // Hot fix. This is the preferred vector width for GPUs up to LNL/BMG. 
- uint32_t vec_width = 16; + constexpr uint32_t vec_width = 16; - if (std::is_same::value) { - return vec_width / sizeof(char); - } - if (std::is_same::value) { - return vec_width / sizeof(short); - } - if (std::is_same::value) { - return vec_width / sizeof(int); - } - if (std::is_same::value) { - return vec_width / sizeof(int64_t); - } - if (std::is_same::value) { - return vec_width / sizeof(float); - } - if (std::is_same::value) { - return vec_width / sizeof(double); + if constexpr ( + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v) { + return vec_width / sizeof(T); + } else { + throw std::invalid_argument( + "Invalid data type to fetch preferred vector width!"); } - if (std::is_same::value) { - return vec_width / sizeof(::sycl::half); - } - throw std::invalid_argument( - "Invalid data type to fetch preferred vector width!"); } template uint32_t syclNativeVectorWidth( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); - if (std::is_same::value) { + if constexpr (std::is_same_v) { return dev_prop->native_vector_width_char; - } - if (std::is_same::value) { + } else if constexpr (std::is_same_v) { return dev_prop->native_vector_width_short; - } - if (std::is_same::value) { + } else if constexpr (std::is_same_v) { return dev_prop->native_vector_width_int; - } - if (std::is_same::value) { + } else if constexpr (std::is_same_v) { return dev_prop->native_vector_width_long; - } - if (std::is_same::value) { + } else if constexpr (std::is_same_v) { return dev_prop->native_vector_width_float; - } - if (std::is_same::value) { + } else if constexpr (std::is_same_v) { return dev_prop->native_vector_width_double; - } - if (std::is_same::value) { + } else if constexpr (std::is_same_v) { return dev_prop->native_vector_width_half; + } else { + throw std::invalid_argument( + "Invalid data type to fetch native vector width!"); } - throw std::invalid_argument( - "Invalid data type to fetch native vector width!"); } static inline bool syclHasFloat64( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->has_fp64; } diff --git a/src/comm/Runtime.h b/src/comm/Runtime.h index fa7daaf125..4fd44d08e5 100644 --- a/src/comm/Runtime.h +++ b/src/comm/Runtime.h @@ -4,10 +4,6 @@ namespace at::xpu { -static inline at::DeviceIndex getDeviceIndexOfCurrentQueue() { - return c10::xpu::getCurrentXPUStream().device_index(); -} - static inline sycl::queue& getCurrentSYCLQueue() { return c10::xpu::getCurrentXPUStream().queue(); } From bc52e6303989c5e8baa7ad97f39040251434b828 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 22 Sep 2025 10:07:34 +0800 Subject: [PATCH 09/27] [CI] Cleanup after build to avoid permission issue (#2088) disable_ut disable_e2e disable_distributed --- .github/workflows/_linux_build.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 286d124b7b..6fcaf5c954 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -200,7 +200,6 @@ jobs: python -c "import torchaudio; print(torchaudio.__version__)" python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' - chmod 777 /__w /github ./ -R - name: Upload Torch 
XPU Wheel if: ${{ success() }} uses: actions/upload-artifact@v4 @@ -213,3 +212,8 @@ jobs: with: name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} path: ${{ github.workspace }}/build_*.log + - name: Cleanup workspace + if: ${{ always() }} + run: | + chmod 777 /__w /github ./ -R + find ./ |grep -v "^\./$" |xargs rm -rf From 6e5af1ee2f2560426e74f6c842fb46d5ae254f75 Mon Sep 17 00:00:00 2001 From: Frost Mitchell Date: Mon, 22 Sep 2025 01:46:17 -0400 Subject: [PATCH 10/27] Revert tracking of Work status for FlightRecorder in ProcessGroupXCCL (#2076) The callback used to track the work status in ProcessGroupXCCL was causing an unintended memory leak by keeping the work objects, and therefore the stashed tensors, alive. For now, I'm removing the callback, and I have added a unit test to ensure this memory leak doesn't return. Fix https://github.com/intel/torch-xpu-ops/issues/2084 --- src/xccl/ProcessGroupXCCL.cpp | 26 +++++++++++--------------- src/xccl/ProcessGroupXCCL.hpp | 2 -- test/xpu/distributed/test_c10d_xccl.py | 25 +++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index 923771a13f..514cdaf3a7 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -437,17 +437,6 @@ void ProcessGroupXCCL::setEnqueuedPgStatus( pgStatus_->lastEnqueuedNumelOut = work->numelOut_; } -void ProcessGroupXCCL::setCompletedPgStatus( - c10::intrusive_ptr work) { - pgStatus_->lastCompletedSeq = static_cast(work->getSequencenumber()); - pgStatus_->lastCompletedWorkName = opTypeToString(work->opType_); - pgStatus_->lastCompletedNumelIn = work->numelIn_; - pgStatus_->lastCompletedNumelOut = work->numelOut_; - // To avoid complexity, we're not computing duration.
- FlightRecorderXCCL::get()->retire_id( - work->trace_id_, /*compute_duration*/ false); -} - void ProcessGroupXCCL::setSequenceNumberForGroup() {} uint64_t ProcessGroupXCCL::getSequenceNumberForGroup() { @@ -777,8 +766,12 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); + auto id = work->trace_id_; work->future_->addCallback( - [this, work](at::ivalue::Future&) { this->setCompletedPgStatus(work); }); + [id](at::ivalue::Future&) { + FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); + }, + /*use_future*/ false); work->blockingWait_ = blockingWait_; work->numelIn_ = 0; @@ -889,9 +882,12 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); - work->future_->addCallback([this, work](at::ivalue::Future&) { - this->setCompletedPgStatus(work); - }); + auto id = work->trace_id_; + work->future_->addCallback( + [id](at::ivalue::Future&) { + FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); + }, + /*use_future*/ false); work->numelIn_ = work->numelOut_ = tensor.numel(); setEnqueuedPgStatus(work); diff --git a/src/xccl/ProcessGroupXCCL.hpp b/src/xccl/ProcessGroupXCCL.hpp index 2516c0b912..8e4604fbfb 100644 --- a/src/xccl/ProcessGroupXCCL.hpp +++ b/src/xccl/ProcessGroupXCCL.hpp @@ -424,8 +424,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { const std::vector& groupRanks() const; void setEnqueuedPgStatus(c10::intrusive_ptr work); - void setCompletedPgStatus( - c10::intrusive_ptr work); bool dumpDebuggingInfo(bool includeStackTrace = true); protected: diff --git a/test/xpu/distributed/test_c10d_xccl.py b/test/xpu/distributed/test_c10d_xccl.py index 226501d7ac..57b33ae29d 100644 --- a/test/xpu/distributed/test_c10d_xccl.py +++ b/test/xpu/distributed/test_c10d_xccl.py @@ -365,6 +365,31 @@ def test_nan_assert(self, type): # reset env os.environ["TORCH_XCCL_NAN_CHECK"] = "0" + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_oom(self): + pg = self._create_process_group_xccl() + dp_ranks = range(0, self.world_size) + dp_group = c10d.new_group(dp_ranks) + device = torch.device(f"xpu:{self.rank}") + torch.xpu.set_device(device) + + shape = (16384 * 2, 16384 * 2) + weight = torch.ones(shape, device=device).half() + gradient = torch.zeros(shape, device=device).half() + ret = torch.randn(shape, device=device).half() + + for iter in range(50): + output = torch.empty_like(ret) + output = ret + weight + gradient + ret = torch.nn.functional.linear(output, weight=ret) + dist.all_reduce(ret, op=dist.ReduceOp.SUM) + torch.xpu.synchronize() + self.assertLess( + torch.xpu.max_memory_allocated(), + torch.xpu.max_memory_reserved() * 2, + ) + class CommTest(MultiProcessTestCase): @property From 65234bdfba4a70fc5eb3c73fae421fcd8249351e Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Mon, 22 Sep 2025 22:44:13 -0700 Subject: [PATCH 11/27] build: enable SYCL warnings on Linux (#2096) Some versions of DPC++ compiler pass paths to SYCL headers as user include paths (`-I`) rather than system paths (`-isystem`). This makes host compiler to report warnings encountered in the SYCL headers, such as deprecated warnings, even if warned API is not actually used in the program. We expect that this issue will be addressed in the later version of DPC++ compiler. 
To workaround the issue we wrap paths to SYCL headers in `-isystem`. disable_ut disable_e2e disable_distributed CC: @EikanWang @chuanqi129 Signed-off-by: Dmitry Rogozhkin --- cmake/BuildFlags.cmake | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake index 23956d710a..c0afe2d8f1 100644 --- a/cmake/BuildFlags.cmake +++ b/cmake/BuildFlags.cmake @@ -36,10 +36,16 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" list(APPEND SYCL_HOST_FLAGS -fPIC) list(APPEND SYCL_HOST_FLAGS -std=c++17) list(APPEND SYCL_HOST_FLAGS -Wunused-variable) - # SYCL headers warnings - list(APPEND SYCL_HOST_FLAGS -Wno-deprecated-declarations) - list(APPEND SYCL_HOST_FLAGS -Wno-deprecated) - list(APPEND SYCL_HOST_FLAGS -Wno-attributes) + # Some versions of DPC++ compiler pass paths to SYCL headers as user include paths (`-I`) rather + # than system paths (`-isystem`). This makes host compiler to report warnings encountered in the + # SYCL headers, such as deprecated warnings, even if warned API is not actually used in the program. + # We expect that this issue will be addressed in the later version of DPC++ compiler. To workaround + # the issue we wrap paths to SYCL headers in `-isystem`. + foreach(FLAGS IN LISTS SYCL_INCLUDE_DIR) + list(APPEND SYCL_HOST_FLAGS "-isystem ${FLAGS}") + endforeach() + # Excluding warnings which flood the compilation output + # TODO: fix warnings in the source code and then reenable them in compilation list(APPEND SYCL_HOST_FLAGS -Wno-sign-compare) endif() From b755d3cb8de018e35c6ea104440bc1b1de0c0aaa Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 23 Sep 2025 15:17:25 +0800 Subject: [PATCH 12/27] [CI] Add deps for new LLM models (#2054) Add new LLM models deps `accelerate` disable_ut disable_distributed --- .github/actions/linux-testenv/action.yml | 4 +++- .../rolling/inductor_huggingface_inference.csv | 4 ++++ .../rolling/inductor_huggingface_training.csv | 5 +++++ .github/workflows/_linux_e2e_summary.yml | 4 ++-- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 246b94c919..cbf2a60a5c 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -57,7 +57,7 @@ runs: run: | # install pytorch if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" -c) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" -c) -ne 0 ];then pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" -c) -ne 0 ];then @@ -115,6 +115,8 @@ runs: pip install pandas psutil scipy pyyaml cd pytorch if [[ "${{ inputs.suite }}" == *"huggingface"* ]];then + # for new LLM models + pip install accelerate pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt || pip install transformers==4.54.0 soxr==0.5.0 TRANSFORMERS_VERSION_ID="$(python -c 'import os; os.chdir("/tmp"); import transformers; print(transformers.__version__)')" elif [[ "${{ inputs.suite }}" == *"timm_models"* ]];then diff --git a/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv b/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv 
index a75d3d2253..fe1e531308 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv @@ -45,3 +45,7 @@ TrOCRForCausalLM,pass,pass,pass,pass,pass XGLMForCausalLM,pass,pass,pass,pass,pass XLNetLMHeadModel,pass,pass,pass,pass,pass YituTechConvBert,pass,pass,pass,pass,pass +Qwen/Qwen3-0.6B,pass,pass,pass,pass,pass +google/gemma-2-2b,pass,pass,pass,pass,pass +meta-llama/Llama-3.2-1B,pass,pass,pass,pass,pass +openai/whisper-tiny,pass,pass,pass,pass,pass diff --git a/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv b/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv index a75d3d2253..6d3726bdfb 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv @@ -45,3 +45,8 @@ TrOCRForCausalLM,pass,pass,pass,pass,pass XGLMForCausalLM,pass,pass,pass,pass,pass XLNetLMHeadModel,pass,pass,pass,pass,pass YituTechConvBert,pass,pass,pass,pass,pass +# https://github.com/intel/torch-xpu-ops/issues/2055 +Qwen/Qwen3-0.6B,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run +google/gemma-2-2b,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run +meta-llama/Llama-3.2-1B,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run +openai/whisper-tiny,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run diff --git a/.github/workflows/_linux_e2e_summary.yml b/.github/workflows/_linux_e2e_summary.yml index f2026a9f5a..9824c8c458 100644 --- a/.github/workflows/_linux_e2e_summary.yml +++ b/.github/workflows/_linux_e2e_summary.yml @@ -77,10 +77,10 @@ jobs: fi pt2e_summary_csv="$(find ./target/ -name "summary.csv")" if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} failed_num=$(grep -c ',failed' ${pt2e_summary_csv} || true) if [ ${failed_num} -ne 0 ];then echo "[Warning] PT2E has failures!" + grep 'failed' ${pt2e_summary_csv} fi fi exit ${exit_label} @@ -88,7 +88,7 @@ jobs: if: ${{ endsWith(inputs.test_type, 'ly') }} run: | gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee new_body.txt 2>&1 - has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt)" + has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt || true)" if [ ${has_or_not} -ne 0 ];then sed -i "s/Inductor-${{ inputs.test_type }}-LTS2:.*/Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}/" new_body.txt else From 9eed218770fc9f9ba6dcbbb3ee7480c6fb247d7a Mon Sep 17 00:00:00 2001 From: Konrad Drozd Date: Tue, 23 Sep 2025 09:58:06 +0200 Subject: [PATCH 13/27] Fix accuracy issues with CTC loss (#2074) Fixes #1737 and the flaky backward accuracy issue uncovered by fixing the forward pass, using sycl_global_and_local_fence to sync data in global memory - log_alpha_data_. 
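For context, a minimal self-contained SYCL sketch of the synchronization issue (illustrative only, not the CTC kernel itself): work-items write partial results to a buffer in global memory and then read each other's values on the next time step. Assuming `sycl_local_fence` and `sycl_global_and_local_fence` correspond to `sycl::access::fence_space::local_space` and `::global_and_local`, a barrier with a local-only fence does not guarantee the global-memory writes are visible to the rest of the work-group, which is what produced the flaky accuracy.

```cpp
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
  constexpr size_t n = 128;
  sycl::queue q;
  // Global-memory buffer handed between "time steps", like log_alpha_data_.
  float* data = sycl::malloc_shared<float>(n, q);
  for (size_t i = 0; i < n; ++i) data[i] = static_cast<float>(i);

  // One work-group so the barrier covers every participating work-item.
  q.parallel_for(sycl::nd_range<1>{{n}, {n}}, [=](sycl::nd_item<1> it) {
    const size_t i = it.get_global_id(0);
    data[i] *= 2.0f;  // time step t: write to global memory
    // local_space alone would only order local (shared) memory accesses here;
    // global_and_local also publishes the global-memory writes to the group.
    it.barrier(sycl::access::fence_space::global_and_local);
    // time step t+1: read a neighbour's freshly written value
    data[i] += data[(i + 1) % n];
  }).wait();

  std::printf("data[0] = %f\n", data[0]);  // 0*2 + 1*2 = 2
  sycl::free(data, q);
  return 0;
}
```

In the CTC kernels the buffer carried between time steps (`log_alpha_data_` / `log_beta_data_`) lives in global memory, hence the switch to the stronger fence.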
--- src/ATen/native/xpu/sycl/LossCTCKernels.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ATen/native/xpu/sycl/LossCTCKernels.cpp b/src/ATen/native/xpu/sycl/LossCTCKernels.cpp index 6acaf2e6cb..90c090e38e 100644 --- a/src/ATen/native/xpu/sycl/LossCTCKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossCTCKernels.cpp @@ -115,7 +115,7 @@ struct CTCLossLogAlphaKernelFunctor { have_three = false; } for (int64_t t = 1; t < max_input_length_; t++) { - item.barrier(sycl_local_fence); + item.barrier(sycl_global_and_local_fence); if (valid && (t < input_length) && (s < 2 * target_length + 1)) { // only for valid t, s. This is equation (6) and (7), la1, la2, la3 // are the three summands, lamax is the maximum for the logsumexp @@ -165,7 +165,7 @@ struct CTCLossLogAlphaKernelFunctor { } } } - item.barrier(sycl_local_fence); + item.barrier(sycl_global_and_local_fence); if (!valid) return; @@ -502,7 +502,7 @@ struct CTCLossBackwardLogBetaKernelFunctor { // now go backward in t. Note that we need to skip the last timestep that // we did above. for (int64_t t = max_input_length_ - 2; t >= 0; t--) { - item.barrier(sycl_local_fence); + item.barrier(sycl_global_and_local_fence); if (valid && (t < input_length - 1) && (s < 2 * target_length + 1)) { scalar_t lb1 = log_beta_data_ [lb_batch_offset + lb_input_stride_ * (t + 1) + From 0df6a62ddba9928f4524e485ec9386aa420aa8ce Mon Sep 17 00:00:00 2001 From: Slawomir Siwek Date: Tue, 23 Sep 2025 10:32:43 +0200 Subject: [PATCH 14/27] Implement aten::nonzero_static on XPU backend (#2061) Fixes https://github.com/intel/torch-xpu-ops/issues/1943 --- src/ATen/native/xpu/Nonzero.cpp | 69 ++++++++++++++++++++++++++----- yaml/native/native_functions.yaml | 9 ++++ 2 files changed, 68 insertions(+), 10 deletions(-) diff --git a/src/ATen/native/xpu/Nonzero.cpp b/src/ATen/native/xpu/Nonzero.cpp index a1f9b1d09f..82cfaa8275 100644 --- a/src/ATen/native/xpu/Nonzero.cpp +++ b/src/ATen/native/xpu/Nonzero.cpp @@ -1,22 +1,18 @@ #include +#include +#include #include - #include #include namespace at { namespace native { -Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { + +void nonzero_common_checks(const Tensor& self, Tensor& out, const std::string& op_name) { TORCH_CHECK( self.numel() < std::numeric_limits::max(), - "nonzero is not supported for tensors with more than INT_MAX elements, \ + op_name, " is not supported for tensors with more than INT_MAX elements, \ See https://github.com/pytorch/pytorch/issues/51871"); - TORCH_CHECK( - out.dtype() == at::kLong, - "Expected object of scalar type ", - at::kLong, - " as out, but got ", - out.dtype()); TORCH_CHECK( self.device() == out.device(), "expected self and out to be on the same device, but got out on ", @@ -25,10 +21,19 @@ Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { self.device()); TORCH_CHECK( self.dim() <= XPU_MAX_TENSORINFO_DIMS, - "nonzero is not supported for tensor with more than ", + op_name, " is not supported for tensor with more than ", XPU_MAX_TENSORINFO_DIMS, " dimensions"); +} +Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { + TORCH_CHECK( + out.dtype() == at::kLong, + "Expected object of scalar type ", + at::kLong, + " as out, but got ", + out.dtype()); + nonzero_common_checks(self, out, "nonzero"); if (self.numel() == 0) { out = at::detail::empty_xpu({0, self.dim()}, out.options()); return out; @@ -42,5 +47,49 @@ Tensor nonzero_xpu(const Tensor& self) { nonzero_out_xpu(self, out); return out; } + +Tensor& nonzero_static_out_xpu( + const Tensor& self, + 
int64_t size, + int64_t fill_value, + Tensor& out) { + TORCH_CHECK( + size >= 0, "nonzero_static: 'size' must be an non-negative integer"); + TORCH_CHECK( + out.dtype() == at::kLong, + "nonzero_static: Expected out tensor to have scalar type Long"); + nonzero_common_checks(self, out, "nonzero_static"); + if (self.numel() == 0) { + out = at::full({size, self.dim()}, fill_value, out.options()); + return out; + } + + Tensor nonzero_out = at::detail::empty_xpu({0}, self.options().dtype(kLong)); + xpu::nonzero_kernel(self, nonzero_out); + auto nonzero_size = nonzero_out.size(0); + out.resize_({size, self.dim()}); + + if (nonzero_size > size) { + out.copy_(nonzero_out.narrow(0, 0, size)); + } else if (nonzero_size < size) { + auto padding = at::full({size - nonzero_size, self.dim()}, fill_value, out.options()); + out.copy_(at::cat({nonzero_out, padding}, 0)); + } else { + out.copy_(nonzero_out); + } + return out; +} + +Tensor nonzero_static_xpu( + const Tensor& self, + int64_t size, + int64_t fill_value) { + TORCH_CHECK( + size >= 0, "nonzero_static: 'size' must be an non-negative integer"); + Tensor out = at::detail::empty_xpu({size, self.dim()}, self.options().dtype(kLong)); + nonzero_static_out_xpu(self, size, fill_value, out); + return out; +} + } // namespace native } // namespace at diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml index 2df8464262..a3281791de 100644 --- a/yaml/native/native_functions.yaml +++ b/yaml/native/native_functions.yaml @@ -2071,6 +2071,15 @@ XPU: nonzero_xpu tags: [dynamic_output_shape, core] +- func: nonzero_static.out(Tensor self, *, SymInt size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!) + dispatch: + XPU: nonzero_static_out_xpu + +- func: nonzero_static(Tensor self, *, SymInt size, int fill_value=-1) -> Tensor + variants: method, function + dispatch: + XPU: nonzero_static_xpu + - func: maximum(Tensor self, Tensor other) -> Tensor structured_delegate: maximum.out device_check: NoCheck # TensorIterator From 229e8ba104e3b0a6e88c36effefa3ea914b25673 Mon Sep 17 00:00:00 2001 From: Slawomir Siwek Date: Tue, 23 Sep 2025 15:22:38 +0200 Subject: [PATCH 15/27] Stop recursive calculations in polynomial kernels if tensor has NaNs (#2062) Fixes https://github.com/intel/torch-xpu-ops/issues/1823 --- .../native/xpu/sycl/HermitePolynomialHKernel.cpp | 12 +++++------- .../native/xpu/sycl/HermitePolynomialHeKernel.cpp | 12 +++++------- .../native/xpu/sycl/LaguerrePolynomialLKernel.cpp | 2 +- .../native/xpu/sycl/LegendrePolynomialPKernel.cpp | 2 +- test/xpu/skip_list_common.py | 6 ------ 5 files changed, 12 insertions(+), 22 deletions(-) diff --git a/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp b/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp index 4862af9a8f..bffd6117b5 100644 --- a/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp +++ b/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp @@ -9,17 +9,15 @@ namespace at::native::xpu { template struct HermitePolynomialHFunctor { scalar_t operator()(scalar_t x, scalar_t n_) const { - int64_t n = static_cast(n_); + auto n = static_cast(n_); if (n < 0) { return scalar_t(0.0); - } - - if (n == 0) { + } else if (n == 0) { return scalar_t(1.0); - } - - if (n == 1) { + } else if (n == 1) { return x + x; + } else if (n > getHermitianLimit()) { + return std::numeric_limits::quiet_NaN(); } scalar_t p = scalar_t(1.0); diff --git a/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp b/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp index 6c37e87202..8ba1d8bfff 100644 --- 
a/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp +++ b/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp @@ -9,17 +9,15 @@ namespace at::native::xpu { template struct HermitePolynomialHeFunctor { scalar_t operator()(scalar_t x, scalar_t n_) const { - int64_t n = static_cast(n_); + auto n = static_cast(n_); if (n < 0) { return scalar_t(0.0); - } - - if (n == 0) { + } else if (n == 0) { return scalar_t(1.0); - } - - if (n == 1) { + } else if (n == 1) { return x; + } else if (n > getHermitianLimit()) { + return std::numeric_limits::quiet_NaN(); } scalar_t p = scalar_t(1.0); diff --git a/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp b/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp index 8688f80456..869d64492a 100644 --- a/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp +++ b/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp @@ -30,7 +30,7 @@ struct LaguerrePolynomialLFunctor { scalar_t q = scalar_t(1.0) - x; scalar_t r; - for (int64_t k = 1; k < n; k++) { + for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { r = (((k + k) + (scalar_t(1.0) - x)) * q - k * p) / (k + 1); p = q; q = r; diff --git a/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp b/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp index 2176c4e79c..caf1d8ac16 100644 --- a/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp +++ b/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp @@ -34,7 +34,7 @@ struct LegendrePolynomialPFunctor { scalar_t q = x; scalar_t r; - for (int64_t k = 1; k < n; k++) { + for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { r = ((k + k + 1) * x * q - k * p) / (k + 1); p = q; q = r; diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py index 5e70eebf44..9eeb6b44c4 100644 --- a/test/xpu/skip_list_common.py +++ b/test/xpu/skip_list_common.py @@ -74,12 +74,6 @@ "test_compare_cpu_linalg_lu_factor_ex_xpu_float32", "test_compare_cpu_linalg_lu_factor_xpu_float32", "test_compare_cpu_linalg_lu_xpu_float32", - # XPU hang. CUDA hang as well. 
- # https://github.com/pytorch/pytorch/issues/79528 - "test_compare_cpu_special_hermite_polynomial_he_xpu_float32", - "test_compare_cpu_special_hermite_polynomial_h_xpu_float32", - "test_compare_cpu_special_laguerre_polynomial_l_xpu_float32", - "test_compare_cpu_special_legendre_polynomial_p_xpu_float32", # core dump "test_dtypes__refs_nn_functional_pdist_xpu", # XFAIL of CUDA and XPU, unexpected success in fallback From 23cd5841a06e88431a35e4827b70ff35e78d6da6 Mon Sep 17 00:00:00 2001 From: Slawomir Siwek Date: Thu, 25 Sep 2025 08:10:11 +0200 Subject: [PATCH 16/27] Remove unused variables (#2044) - remove unused variables, - add compiler flag to prevent this in the future --- .github/workflows/_linux_build.yml | 1 + CMakeLists.txt | 3 +++ src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 3 --- src/ATen/native/xpu/sycl/RoiAlignKernels.cpp | 1 - 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 6fcaf5c954..6f209ae0b1 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -94,6 +94,7 @@ jobs: - name: Build Pytorch on ${{ needs.runner.outputs.hostname }} run: | export USE_XCCL=1 + export IS_XPU_CI=1 # only build pvc for CI if [ "${{ github.event_name }}" == "pull_request" ];then export TORCH_XPU_ARCH_LIST='pvc' diff --git a/CMakeLists.txt b/CMakeLists.txt index 277f1f067d..063c728411 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,9 @@ set(PROJECT_VERSION "2.3.0") # Avoid SYCL compiler error if(NOT WIN32) string(APPEND CMAKE_CXX_FLAGS " -Wno-error") + if("$ENV{IS_XPU_CI}" STREQUAL "1") + string(APPEND CMAKE_CXX_FLAGS " -Werror=unused-variable") + endif() endif() cmake_policy(SET CMP0048 NEW) diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index 92a4195d01..e4a8444c94 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -1103,9 +1103,6 @@ void _layer_norm_backward_kernel( // affecting performance and behavior. const scalar_t* dY_data = dY.const_data_ptr(); const scalar_t* X_data = X.const_data_ptr(); - weight_t* dg_data = - dgamma.defined() ? dgamma.data_ptr() : nullptr; - weight_t* db_data = dbeta.defined() ? 
dbeta.data_ptr() : nullptr; Tensor dgamma_blocks; Tensor dbeta_blocks; weight_t* dgamma_blocks_ptr = nullptr; diff --git a/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp b/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp index f0188ce0ee..869a2b6627 100644 --- a/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp +++ b/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp @@ -440,7 +440,6 @@ Tensor roi_align_kernel( at::Tensor output = at::zeros( {num_rois, channels, pooled_height, pooled_width}, input.options()); - auto output_size = num_rois * pooled_height * pooled_width * channels; if (output.numel() == 0) { return output; From 304983a659a10abe9d6946c54b50221b9299a389 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 26 Sep 2025 09:44:40 +0800 Subject: [PATCH 17/27] [CI] Login docker hub to enlarge pull limitation (#2107) --- .github/workflows/_linux_accelerate.yml | 5 +++-- .github/workflows/_linux_build.yml | 2 ++ .github/workflows/_linux_e2e.yml | 1 + .github/workflows/_linux_op_benchmark.yml | 1 + .github/workflows/_linux_transformers.yml | 1 + .github/workflows/_linux_ut.yml | 1 + .github/workflows/bisect_search.yml | 9 +++++---- 7 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.github/workflows/_linux_accelerate.yml b/.github/workflows/_linux_accelerate.yml index decf4e612f..fbaa0c3f07 100644 --- a/.github/workflows/_linux_accelerate.yml +++ b/.github/workflows/_linux_accelerate.yml @@ -48,6 +48,9 @@ concurrency: defaults: run: shell: bash {0} +env: + GH_TOKEN: ${{ github.token }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: conditions-filter: @@ -55,8 +58,6 @@ jobs: if: ${{ github.event.pull_request.draft == false }} runs-on: ubuntu-24.04 timeout-minutes: 10 - env: - GH_TOKEN: ${{ github.token }} outputs: disabled_tests: ${{ steps.check-pr-desc.outputs.disabled_tests }} steps: diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 6f209ae0b1..77caf298b4 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -31,6 +31,8 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: runner: diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index cd2056ff36..08f88462e6 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -50,6 +50,7 @@ env: GH_TOKEN: ${{ github.token }} HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: runner: diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index d71458c9bf..e8474d7ab1 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -26,6 +26,7 @@ env: GH_TOKEN: ${{ github.token }} HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} reference_issue: 1689 jobs: diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml index fc3deaf7a3..67e2082834 100644 --- a/.github/workflows/_linux_transformers.yml +++ b/.github/workflows/_linux_transformers.yml @@ -56,6 +56,7 @@ concurrency: cancel-in-progress: true env: HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} HF_HUB_ETAG_TIMEOUT: 120 
HF_HUB_DOWNLOAD_TIMEOUT: 120 python: ${{ inputs.python != '' && inputs.python || '3.10' }} diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 6da0abadd1..e895a959b7 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -33,6 +33,7 @@ env: GH_TOKEN: ${{ github.token }} HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} UT_SKIP_ISSUE: 1624 jobs: diff --git a/.github/workflows/bisect_search.yml b/.github/workflows/bisect_search.yml index c2512acb33..d45c0a83f8 100644 --- a/.github/workflows/bisect_search.yml +++ b/.github/workflows/bisect_search.yml @@ -40,6 +40,11 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + GH_TOKEN: ${{ github.token }} + HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: get_runner: @@ -74,10 +79,6 @@ jobs: TORCH_XPU_ARCH_LIST: pvc USE_XCCL: 0 USE_KINETO: 0 - env: - GH_TOKEN: ${{ github.token }} - HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 From 26ae9e71332c5f29892339e6aa626e39def363c3 Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com> Date: Thu, 25 Sep 2025 19:50:07 -0700 Subject: [PATCH 18/27] Install xpu internal headers to PyTorch (#2106) # Motivation To resolve https://github.com/intel/torch-xpu-ops/issues/2034, expose all xpu internal headers to PyTorch. --- src/ATen/CMakeLists.txt | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index 37ec7066b7..493675e804 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -1,6 +1,5 @@ # ATen XPU sources -file(GLOB xpu_h "xpu/*.h") file(GLOB xpu_cpp "xpu/*.cpp") file(GLOB xpu_mkl "native/xpu/mkl/*.cpp") file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp" "native/sparse/xpu/*.cpp" "native/nested/*.cpp" "native/nested/xpu/*.cpp" "native/transformers/*.cpp" "native/quantized/*.cpp") @@ -18,10 +17,28 @@ set(ATen_XPU_MKL_SRCS ${ATen_XPU_MKL_SRCS} PARENT_SCOPE) set(ATen_XPU_NATIVE_CPP_SRCS ${ATen_XPU_NATIVE_CPP_SRCS} PARENT_SCOPE) set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE) -foreach(HEADER ${xpu_h}) - install(FILES ${HEADER} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/ATen/xpu") -endforeach() +# ATen XPU headers -foreach(HEADER ${xpu_ops_generated_headers}) - install(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/ops) -endforeach() +macro(install_xpu_headers glob_pattern dest_subdir) + file(GLOB headers ${glob_pattern}) + if(headers) + install(FILES ${headers} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/${dest_subdir}") + endif() +endmacro() + +install_xpu_headers("xpu/*.h" "ATen/xpu") +install_xpu_headers("native/xpu/*.h" "ATen/native/xpu") +install_xpu_headers("native/xpu/sycl/*.h" "ATen/native/xpu/sycl") +install_xpu_headers("native/xpu/mkl/*.h" "ATen/native/xpu/mkl") +install_xpu_headers("native/nested/xpu/*.h" "ATen/native/nested/xpu") +install_xpu_headers("native/nested/xpu/sycl/*.h" "ATen/native/nested/xpu/sycl") +install_xpu_headers("native/quantized/*.h" "ATen/native/quantized/xpu") +install_xpu_headers("native/quantized/sycl/*.h" "ATen/native/quantized/xpu/sycl") 
+install_xpu_headers("native/sparse/xpu/*.h" "ATen/native/sparse/xpu") +install_xpu_headers("native/sparse/xpu/sycl/*.h" "ATen/native/sparse/xpu/sycl") +install_xpu_headers("native/transformers/*.h" "ATen/native/transformers/xpu") +install_xpu_headers("native/transformers/sycl/*.h" "ATen/native/transformers/xpu/sycl") + +if(xpu_ops_generated_headers) + install(FILES ${xpu_ops_generated_headers} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/ops) +endif() From eed1b8f22f2d470c9812e792520c5156947c0f27 Mon Sep 17 00:00:00 2001 From: "Cui, Yifeng" Date: Thu, 25 Sep 2025 19:55:49 -0700 Subject: [PATCH 19/27] Fix error handling for BatchLinearAlgebra Ops (#2073) This PR is to fix the access to exception vector in error handling. Original implementation may cause a segmentation fault due to out-of-bounds access. --- src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp index 29facbaa3d..26e80fa4d0 100644 --- a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp +++ b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp @@ -54,7 +54,7 @@ void error_handle( return; } - for (auto& i : ids) { + for (size_t i = 0; i < errs.size(); ++i) { try { std::rethrow_exception(errs[i]); } catch (const oneapi::mkl::lapack::exception& e) { From 9e95ad0be226f13375f90a4cd5574311611b5a37 Mon Sep 17 00:00:00 2001 From: Zhong Ruijie <109201212+RUIJIEZHONG66166@users.noreply.github.com> Date: Fri, 26 Sep 2025 13:29:49 +0800 Subject: [PATCH 20/27] Add dynamic skip template (#2115) - Add template for dynamic skip, makes it easier to create issues that require skipping disable_all --------- Co-authored-by: Wang, Chuanqi --- .github/ISSUE_TEMPLATE/dynamic-skip.yml | 63 +++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/dynamic-skip.yml diff --git a/.github/ISSUE_TEMPLATE/dynamic-skip.yml b/.github/ISSUE_TEMPLATE/dynamic-skip.yml new file mode 100644 index 0000000000..bdde3c5809 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/dynamic-skip.yml @@ -0,0 +1,63 @@ +name: ๐Ÿ› Dynamic skip +description: Create an issue to skip PR unrelated failures dynamically +title: "[Bug Skip]: " +labels: ["skipped"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/pytorch/pytorch/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: ๐Ÿ› Describe the bug with skip template + description: | + Please provide a clear and concise description of what the bug is. + The template for dynamic skip as below: + + ```python + # Template(Check in the github action summary) + Cases: + [Category],[Class name],[Test name] + ``` + + ```python + # example + Cases: + op_ut,third_party.torch-xpu-ops.test.xpu.test_transformers_xpu.TestTransformersXPU,test_scaled + ``` + + If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. + + Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. + placeholder: | + A clear and concise description of what the bug is and also align the dynamic template. 
+ ``` + # Skippped cases with dynamic template + ``` + + ```python + # Sample code to reproduce the problem + ``` + + ``` + The error message you got, with the full traceback. + ``` + validations: + required: true +- type: textarea + attributes: + label: Versions + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py + # For security purposes, please check the contents of collect_env.py before running it. + python collect_env.py + ``` + validations: + required: true +- type: markdown + attributes: + value: > + Thanks for contributing ๐ŸŽ‰! From 09edbeeb138fd8504ef42bcd3bb2402360b1b95f Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 26 Sep 2025 15:16:00 +0800 Subject: [PATCH 21/27] [CI] Modify ci test workflow (#2116) 1. Use `timm_nfnet` for CI test instead of `timm_regnet`, which has known accuracy issue https://github.com/intel/torch-xpu-ops/issues/1334 2. fix ondemand test issue disable_build disable_ut disable_distributed --- .github/actions/linux-testenv/action.yml | 2 +- .github/workflows/nightly_ondemand.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index cbf2a60a5c..a4c77e5f8a 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -77,7 +77,7 @@ runs: # apply extra PRs for stock pytorch pip install requests if [ "${{ github.event_name }}" == "pull_request" ];then - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/mengfei25/pytorch/pull/27 else python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py fi diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index d408622efe..5c6207b963 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -111,7 +111,7 @@ jobs: else test_type="build-ondemand" fi - ut="$(echo ${{ inputs.ut }} |sed 's/,/","/g;s/^/["/;s/$/"]/')" + ut="$(echo ${{ inputs.ut }} |sed -E 's/(microbench,|,microbench)//g;s/(,windows|windows,)//g;s/,/","/g;s/^/["/;s/$/"]/')" suite="$(echo ${{ inputs.suite }} |sed 's/,/","/g;s/^/["/;s/$/"]/')" triton="${{ inputs.triton }}" python="${{ inputs.python }}" From fa1e391a447223d4d659d7ade09827d5ed31c8ce Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com> Date: Sun, 28 Sep 2025 22:33:43 -0700 Subject: [PATCH 22/27] Fix unnecessary double data type conversion (#2114) # Motivation The original issue occurs on some old iGPU running the following code on Windows: ```python import torch import torch.nn.functional as F print(torch.xpu.get_device_properties()) arr = torch.rand(1, 2, 5, 5, device='xpu') pts = torch.rand(1, 3, 3, 2, device='xpu') out = F.grid_sample(arr, pts, align_corners=False) ``` The failure output is: ```bash Traceback (most recent call last): File "C:\Vesuvius\urerr\urerr.py", line 22, in out = F.grid_sample(arr, pts, align_corners=False) File "C:\Anaconda3\envs\pytn\Lib\site-packages\torch\nn\functional.py", line 5118, in grid_sample return torch.grid_sampler(input, grid, mode_enum, padding_mode_enum, align_corners) ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ RuntimeError: UR error ``` The Driver team analysis located the crash in the generated spriv IR 
code. ```asm ; Function Attrs: nounwind define internal spir_func float @_ZN2at6native3xpuL19compute_coordinatesIfEET_S3_iNS0_6detail18GridSamplerPaddingEb(float %0, i32 %1, i32 %2, i1 zeroext %3) #0 !spirv.ParameterDecorations !393 { %5 = alloca double, align 8, !spirv.Decorations !394 switch i32 %2, label %58 [ i32 1, label %6 i32 2, label %14 ] 6: ; preds = %4 %7 = sext i32 %1 to i64 %8 = add nsw i64 %7, -1, !spirv.Decorations !387 %9 = sitofp i64 %8 to float %10 = fcmp olt float %0, 0.000000e+00 %11 = select i1 %10, float 0.000000e+00, float %0 %12 = fcmp olt float %11, %9 %13 = select i1 %12, float %11, float %9 br label %58 14: ; preds = %4 br i1 %3, label %15, label %32 15: ; preds = %14 %16 = shl i32 %1, 1 %17 = add i32 %16, -2 %18 = icmp eq i32 %17, 0 br i1 %18, label %49, label %19 19: ; preds = %15 %20 = sitofp i32 %17 to float %21 = fmul float %20, 5.000000e-01 %22 = call spir_func float @_Z16__spirv_ocl_fabsf(float %0) #0 %23 = call spir_func float @fmodf(float %22, float %21) #3 %24 = fdiv float %22, %21 %25 = call spir_func float @_Z17__spirv_ocl_floorf(float %24) #0 %26 = fptosi float %25 to i32 %27 = and i32 %26, 1 %28 = icmp eq i32 %27, 0 %29 = fsub float %21, %23 %30 = select i1 %28, float %23, float %29 %31 = fadd float %30, 0.000000e+00 br label %49 32: ; preds = %14 %33 = icmp eq i32 %1, 0 br i1 %33, label %49, label %34 34: ; preds = %32 %35 = shl nsw i32 %1, 1, !spirv.Decorations !387 %36 = sitofp i32 %35 to float %37 = fmul float %36, 5.000000e-01 %38 = fadd float %0, 5.000000e-01 %39 = call spir_func float @_Z16__spirv_ocl_fabsf(float %38) #0 %40 = call spir_func float @fmodf(float %39, float %37) #3 %41 = fdiv float %39, %37 %42 = call spir_func float @_Z17__spirv_ocl_floorf(float %41) #0 %43 = fptosi float %42 to i32 %44 = and i32 %43, 1 %45 = icmp eq i32 %44, 0 %46 = fsub float %37, %40 %47 = select i1 %45, float %40, float %46 %48 = fadd float %47, -5.000000e-01 br label %49 49: ; preds = %34, %32, %19, %15 %50 = phi float [ %31, %19 ], [ 0.000000e+00, %15 ], [ %48, %34 ], [ 0.000000e+00, %32 ] %51 = sext i32 %1 to i64 %52 = add nsw i64 %51, -1, !spirv.Decorations !387 %53 = sitofp i64 %52 to float %54 = fcmp olt float %50, 0.000000e+00 %55 = select i1 %54, float 0.000000e+00, float %50 %56 = fcmp olt float %55, %53 %57 = select i1 %56, float %55, float %53 br label %58 58: ; preds = %49, %6, %4 %59 = phi float [ %13, %6 ], [ %57, %49 ], [ %0, %4 ] %60 = fptosi float %59 to i64 %61 = icmp sgt i64 %60, 2147483646 %62 = fcmp olt float %59, 0xC1E0000000000000 %63 = or i1 %61, %62 br i1 %63, label %72, label %64 64: ; preds = %58 %65 = fpext float %59 to double %66 = bitcast double* %5 to i8* call void @llvm.lifetime.start.p0i8(i64 8, i8* %66) %67 = addrspacecast double* %5 to double addrspace(4)* store double %65, double* %5, align 8 %68 = call spir_func signext i16 @_dtest(double addrspace(4)* %67) #3 %69 = bitcast double* %5 to i8* call void @llvm.lifetime.end.p0i8(i64 8, i8* %69) %70 = icmp slt i16 %68, 1 %71 = select i1 %70, float %59, float -1.000000e+02 br label %72 72: ; preds = %64, %58 %73 = phi float [ %71, %64 ], [ -1.000000e+02, %58 ] ret float %73 } ``` We can see that spirv IR code uses a double type and calls the @_dtest function in block 64. Accroding to [MSVC document](https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/floating-point-primitives?view=msvc-170#_dtest-_ldtest-_fdtest), _dtest is used to detect whether a number is `Nan` or `INFINITE`. 
This allows us to locate the root cause of the crash, which corresponds to the following C++ logic: ```cpp if (static_cast(x) > INT_MAX - 1 || x < INT_MIN || !std::isfinite(static_cast(x))) return static_cast(-100.0); return x; ``` In other words, the crash occurs when the GPU executes code that tries to convert a floating-point value (Half or BFloat16) to a double and check whether it is finite. # Solution - For Half and BFloat16, `std::isfinite(x)` promot `x` to `float`, providing enough precision for finiteness checks. Casting to double is redundant and can be safely removed. - Explicitly return `-100.0f` instead of double type. # Additional Context I can't find the iGPU that could verify the fix, but it is unlikely to introduce any additional error. --- src/ATen/native/xpu/sycl/GridSampler.h | 5 +++-- test/regressions/test_grid_sample.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 test/regressions/test_grid_sample.py diff --git a/src/ATen/native/xpu/sycl/GridSampler.h b/src/ATen/native/xpu/sycl/GridSampler.h index 33192f4cd7..cfc6283d2f 100644 --- a/src/ATen/native/xpu/sycl/GridSampler.h +++ b/src/ATen/native/xpu/sycl/GridSampler.h @@ -49,9 +49,10 @@ static inline scalar_t safe_downgrade_to_int_range(scalar_t x) { // -100.0 does not have special meaning. This is just to make sure // it's not within_bounds_2d or within_bounds_3d, and does not cause // undefined behavior. + // We avoid using double here because some platforms may not support it. if (static_cast(x) > INT_MAX - 1 || x < INT_MIN || - !std::isfinite(static_cast(x))) - return static_cast(-100.0); + !std::isfinite(x)) + return static_cast(-100.0f); return x; } diff --git a/test/regressions/test_grid_sample.py b/test/regressions/test_grid_sample.py new file mode 100644 index 0000000000..a7c28c10a9 --- /dev/null +++ b/test/regressions/test_grid_sample.py @@ -0,0 +1,19 @@ +# Owner(s): ["module: intel"] +import torch +import torch.nn.functional as F +from torch.testing._internal.common_utils import TestCase + +cpu_device = torch.device("cpu") +xpu_device = torch.device("xpu") + + +class TestSimpleCopy(TestCase): + # Refer to https://github.com/pytorch/pytorch/issues/153996 + def test_grid_sample(self, dtype=torch.float): + input_cpu = torch.rand(1, 2, 5, 5, device=cpu_device) + grid_cpu = torch.rand(1, 3, 3, 2, device=cpu_device) + out_cpu = F.grid_sample(input_cpu, grid_cpu, align_corners=False) + input_xpu = input_cpu.to(xpu_device) + grid_xpu = grid_cpu.to(xpu_device) + out_xpu = F.grid_sample(input_xpu, grid_xpu, align_corners=False) + self.assertEqual(out_cpu, out_xpu.to(cpu_device)) From d5a81e058f1fd5abf2f77101c11d375a5dadb8a1 Mon Sep 17 00:00:00 2001 From: "Cui, Yifeng" Date: Sun, 28 Sep 2025 22:58:44 -0700 Subject: [PATCH 23/27] Fix overflow when calculating workgroups count (#2104) To fix #2070. This PR updates several SYCL kernel launch functions in `src/ATen/native/xpu/sycl/Loops.h` to use `int64_t` for workgroup size and number of workgroups calculations. This change prevents overflow issues when handling large tensor sizes. 
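A small standalone sketch of the overflow (hypothetical work-group size; `ceil_div` here is a local helper, not the ATen one): once a kernel has to cover 2**31 or more work-items, the product of the work-group size and the work-group count no longer fits in a 32-bit `int`, so doing that arithmetic in `int` wraps, while `int64_t` keeps the launched global range exact.

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

// Local helper for the sketch.
static int64_t ceil_div(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}

int main() {
  const int64_t N = (1LL << 31) - 1;  // 2**31 - 1 elements, as in test_zeros_large
  const int64_t wg_sz = 1024;         // hypothetical work-group size

  const int64_t num_wg = ceil_div(N, wg_sz);    // 2097152
  const int64_t global_range = wg_sz * num_wg;  // 2147483648 == 2**31

  std::cout << "global range: " << global_range << "\n";
  std::cout << "fits in int32: "
            << (global_range <= std::numeric_limits<int>::max()) << "\n";  // prints 0
  // If wg_sz and num_wg were plain `int`, wg_sz * num_wg would overflow here,
  // which is exactly what the switch to int64_t in Loops.h avoids.
  return 0;
}
```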
--- src/ATen/native/xpu/sycl/Loops.h | 26 ++++++++++++------------ test/xpu/test_tensor_creation_ops_xpu.py | 8 ++++++++ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/ATen/native/xpu/sycl/Loops.h b/src/ATen/native/xpu/sycl/Loops.h index 8c4fbbdef5..05a2f6b468 100644 --- a/src/ATen/native/xpu/sycl/Loops.h +++ b/src/ATen/native/xpu/sycl/Loops.h @@ -314,8 +314,8 @@ static void launch_legacy_group_range_kernel(int64_t N, const func_t& f) { auto ker = ElementwiseGroupRangeKernel(N, f); - int wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, wg_sz * vec_size); + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, wg_sz * vec_size); sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker); } @@ -328,9 +328,9 @@ static void launch_legacy_global_range_kernel(int64_t N, const func_t& f) { auto ker = ElementwiseGlobalRangeKernel(N, f); - int wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, wg_sz); - int hw_max_num_wg = syclMaxWorkItemsPerTile() / wg_sz; + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, wg_sz); + int64_t hw_max_num_wg = syclMaxWorkItemsPerTile() / wg_sz; num_wg = num_wg > hw_max_num_wg ? hw_max_num_wg : num_wg; sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker); } @@ -355,8 +355,8 @@ static inline void launch_unrolled_kernel( auto ker = UnrolledElementwiseKernel(N, f, data, ic, oc, l, s); using ker_t = decltype(ker); - auto wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, wg_sz * ker_t::item_work_size); + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, wg_sz * ker_t::item_work_size); sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker); } @@ -393,13 +393,13 @@ static inline void launch_vectorized_kernel( #define VEC_KER(vec_size) \ { \ - TORCH_CHECK(max_scalar_bytes* vec_size <= 16); \ + TORCH_CHECK(max_scalar_bytes * vec_size <= 16); \ if constexpr (max_scalar_bytes * vec_size <= 16) { \ auto ker = \ VectorizedElementwiseKernel( \ N, f, data, input_calc); \ - int num_wg = ceil_div(N, wg_sz * vec_size); \ - sycl_kernel_submit(wg_sz* num_wg, wg_sz, getCurrentSYCLQueue(), ker); \ + int64_t num_wg = ceil_div(N, wg_sz * vec_size); \ + sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker); \ } \ } @@ -426,7 +426,7 @@ static inline void launch_vectorized_kernel( N, f, data, input_calc, output_calc, loader, storer); using ker_t = decltype(ker); - int num_wg = ceil_div(N, wg_sz * ker_t::item_work_size); + int64_t num_wg = ceil_div(N, wg_sz * ker_t::item_work_size); sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker); break; } @@ -457,8 +457,8 @@ static inline void launch_unrolled_kernel_for_multi_outputs( out_calc_t>(N, f, data, ic, oc); using ker_t = decltype(ker); - int wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, ker_t::item_work_size * wg_sz); + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, ker_t::item_work_size * wg_sz); sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker); } diff --git a/test/xpu/test_tensor_creation_ops_xpu.py b/test/xpu/test_tensor_creation_ops_xpu.py index 95a30716ea..077fbb6ad5 100644 --- a/test/xpu/test_tensor_creation_ops_xpu.py +++ b/test/xpu/test_tensor_creation_ops_xpu.py @@ -4371,6 +4371,14 @@ def test_full_like_inference(self, device): torch.full_like(like, 1.0, dtype=torch.complex64).dtype, torch.complex64 ) + 
@dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_zeros_large(self, device, dtype): + output = torch.zeros(2**31 - 1, device=device, dtype=dtype) + + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_ones_large(self, device, dtype): + output = torch.ones(2**31 - 1, device=device, dtype=dtype) + # Tests for the `frombuffer` function (only work on CPU): # Constructs tensors from Python objects that implement the buffer protocol, From f301733b03758ccd67642d2c202f2d589bd231a4 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 30 Sep 2025 09:30:05 +0800 Subject: [PATCH 24/27] Fix segmentation fault and calculation error in AveragePool2dKernel (#2091) Fixed the following issues found by test/test_nn.py::TestNNDeviceTypeXPU::test_avg_pool_large_tensor2_xpu 1. A segmentation fault caused by a data type conversion error that invalidated the memory address. 2. A calculation error caused by data overflow. --------- Co-authored-by: Cui, Yifeng --- .../native/xpu/sycl/AveragePool2dKernels.cpp | 67 +++++++++---------- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp index 4caa4a7ead..cf856d6177 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -25,9 +26,7 @@ inline int max(int a, int b) { template struct AvgPool2dKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - - if (index < total_elements_) { + XPU_KERNEL_LOOP(item, index, total_elements_) { const int pw = index % pooled_width_; const int ph = (index / pooled_width_) % pooled_height_; const int c = (index / pooled_width_ / pooled_height_) % channels_; @@ -73,19 +72,19 @@ struct AvgPool2dKernelFunctor { AvgPool2dKernelFunctor( scalar_t* top_data, const scalar_t* bottom_data, - index_t total_elements, - index_t channels, - index_t height, - index_t width, - int pooled_height, - int pooled_width, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - int pad_h, - int pad_w, - int divisor_override, + const int total_elements, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t pooled_height, + const int pooled_width, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int divisor_override, bool count_include_pad, bool use_divisor) : top_data_(top_data), @@ -109,19 +108,19 @@ struct AvgPool2dKernelFunctor { private: scalar_t* top_data_; const scalar_t* bottom_data_; - index_t total_elements_; - index_t channels_; - index_t height_; - index_t width_; - int pooled_height_; - int pooled_width_; - int kernel_h_; - int kernel_w_; - int stride_h_; - int stride_w_; - int pad_h_; - int pad_w_; - int divisor_override_; + const int total_elements_; + const int64_t channels_; + const int64_t height_; + const int64_t width_; + const int64_t pooled_height_; + const int pooled_width_; + const int kernel_h_; + const int kernel_w_; + const int stride_h_; + const int stride_w_; + const int pad_h_; + const int pad_w_; + const int divisor_override_; bool count_include_pad_; bool use_divisor_; }; @@ -129,9 +128,7 @@ struct AvgPool2dKernelFunctor { template struct AvgPool2dChannelsLastKernelFunctor { void 
operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - - if (index < total_elements_) { + XPU_KERNEL_LOOP(item, index, total_elements_) { const int c = index % channels_; const int pw = (index / channels_) % pooled_width_; const int ph = (index / channels_ / pooled_width_) % pooled_height_; @@ -327,8 +324,7 @@ void launch_avg_pool2d_kernel( template struct AvgPool2dChannelsLastBackwardKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - if (index < total_elements_) { + XPU_KERNEL_LOOP_TYPE(item, index, total_elements_, index_t) { const int c = index % channels_; const int w = (index / channels_) % width_ + pad_w_; const int h = (index / channels_ / width_) % height_ + pad_h_; @@ -431,8 +427,7 @@ struct AvgPool2dChannelsLastBackwardKernelFunctor { template struct AvgPool2dBackwarKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - if (index < total_elements_) { + XPU_KERNEL_LOOP_TYPE(item, index, total_elements_, index_t) { // find out the local index // find out the local offset const int w = index % width_ + pad_w_; From 086f20a0ce7e090ab403f23b35c14267752bab91 Mon Sep 17 00:00:00 2001 From: Frost Mitchell Date: Thu, 2 Oct 2025 20:08:03 -0400 Subject: [PATCH 25/27] Fix test_barrier hang by using static global rank in ProcessGroupXCCL (#2036) Fixes #1978 In ProcessGroupNCCL, `globalRank()` returns a static int globalRank, which is first initialized by the ProcessGroup setup, so the globalRank assigned to each thread matches the id of the CUDA device. However, we were not using this same pattern for XCCL. Instead, we were just using the assigned rank of the thread, which were not necessarily the same as the globalRank. The failing test `test_barrier` created two separate groups of 2 ranks each, and then 4 threads called barrier, but all on the same 2-thread group. Since initially the device id is not specified in this barrier call, the thread attempts to "guess" the device index. In the previous code, this guess would be 0 or 1, since the rank of each thread was not actually the globalRank. In `barrier`, this guessed id was used to initialize XCCLComm objects, and then call allreduce on a single element tensor. However, this resulted in an allreduce call two times on each device, which could result in a hang based on the execution order of the 4 threads. With the update in this PR, PGXCCL now uses the static globalRank in the same places as ProcessGroupNCCL, so the initialized XCCLComm objects are for unique devices and allreduce doesn't call on the same device multiple times. 
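A stripped-down illustration of the pattern described above (hypothetical `FakeBackend` class, not the real ProcessGroupXCCL API): a function-local static is initialized exactly once per process, by whichever backend instance calls it first, normally the default (world) group, so later sub-groups reuse the process-wide rank instead of their 0-based sub-group rank when a device index has to be guessed.

```cpp
#include <iostream>

class FakeBackend {
 public:
  explicit FakeBackend(int rank) : rank_(rank) {}

  int globalRank() const {
    static int global_rank = rank_;  // bound on the first call in this process
    return global_rank;
  }

  int guessDeviceId(int num_gpus) const {
    // Using rank_ here would map global ranks 2 and 3 (sub-group ranks 0 and 1)
    // back onto devices 0 and 1, colliding with the other sub-group.
    return globalRank() % num_gpus;
  }

 private:
  int rank_;
};

int main() {
  // Pretend this process hosts global rank 2, which sits at rank 0 of its
  // 2-rank sub-group.
  FakeBackend world(/*rank=*/2);      // default group: rank == global rank
  FakeBackend subgroup(/*rank=*/0);   // sub-group created afterwards

  std::cout << world.guessDeviceId(4) << "\n";     // 2
  std::cout << subgroup.guessDeviceId(4) << "\n";  // 2, not 0
  return 0;
}
```

With four ranks and two 2-rank sub-groups, that difference is what keeps two threads from initializing communicators on the same device in `test_barrier`.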
--- src/xccl/ProcessGroupXCCL.cpp | 11 ++++++++--- src/xccl/ProcessGroupXCCL.hpp | 1 + src/xccl/ProcessGroupXCCLMonitor.cpp | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index 514cdaf3a7..aca5a928aa 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -352,6 +352,11 @@ const std::string& ProcessGroupXCCL::logPrefix() const { return logPrefix_; } +const int& ProcessGroupXCCL::globalRank() const { + static int globalRank = rank_; + return globalRank; +} + ProcessGroupXCCL::ProcessGroupXCCL( c10::intrusive_ptr store, int rank, @@ -379,7 +384,7 @@ ProcessGroupXCCL::ProcessGroupXCCL( std::string torch_distributed_debug = getCvarString({"TORCH_DISTRIBUTED_DEBUG"}, OFF.c_str()); LOG(INFO) << logPrefix() << "ProcessGroupXCCL initialization options: " - << "size: " << size << ", global rank: " << rank_ + << "size: " << size << ", global rank: " << globalRank() << ", USE_HIGH_PRIORITY_STREAM: " << options_->is_high_priority_stream << ", PG Name: " << options_->group_name; @@ -410,7 +415,7 @@ bool ProcessGroupXCCL::dumpDebuggingInfo(bool includeStackTrace /*=true*/) { if (traceBufferSize_ > 0) { // TODO: dump_xccl_trace auto xcclTrace = dump_xccl_trace(true, includeStackTrace, false); - DebugInfoWriter& writer = DebugInfoWriter::getWriter(rank_); + DebugInfoWriter& writer = DebugInfoWriter::getWriter(globalRank()); LOG(INFO) << logPrefix() << "ProcessGroupXCCL dumping xccl trace to " << writer.getWriterTarget(); writer.write(xcclTrace); @@ -2021,7 +2026,7 @@ c10::DeviceIndex ProcessGroupXCCL::guessDeviceId() const { return *usedDeviceIdxs_.begin(); } int devIdx = - static_cast(rank_ % at::detail::getXPUHooks().getNumGPUs()); + static_cast(globalRank() % at::detail::getXPUHooks().getNumGPUs()); LOG(WARNING) << logPrefix() << c10::str( diff --git a/src/xccl/ProcessGroupXCCL.hpp b/src/xccl/ProcessGroupXCCL.hpp index 8e4604fbfb..7074354c64 100644 --- a/src/xccl/ProcessGroupXCCL.hpp +++ b/src/xccl/ProcessGroupXCCL.hpp @@ -423,6 +423,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::DeviceIndex guessDeviceId() const; const std::vector& groupRanks() const; + const int& globalRank() const; void setEnqueuedPgStatus(c10::intrusive_ptr work); bool dumpDebuggingInfo(bool includeStackTrace = true); diff --git a/src/xccl/ProcessGroupXCCLMonitor.cpp b/src/xccl/ProcessGroupXCCLMonitor.cpp index 68fdd402c1..246bdf3a59 100644 --- a/src/xccl/ProcessGroupXCCLMonitor.cpp +++ b/src/xccl/ProcessGroupXCCLMonitor.cpp @@ -39,7 +39,7 @@ void HeartbeatMonitorXCCL::runLoop() { // We only need to dump once per PG, so we use local_id_ == 0 for the first PG if (pg_->local_id_ == 0) { // DumpPipe is one per-trainer process - dumpPipe.emplace(pg_->getRank()); + dumpPipe.emplace(pg_->globalRank()); while (true) { std::unique_lock lock(monitorMutex_); if (monitorWakeUpCV_.wait_for( From cae6ba3511ed03df00e245eb8f72ebef6c06dbfa Mon Sep 17 00:00:00 2001 From: Frost Mitchell Date: Fri, 3 Oct 2025 11:42:52 -0400 Subject: [PATCH 26/27] Add FlightRecorder tests (#1971) As a follow-up to #1867 , this PR includes tests for the FlightRecorder on XCCL, as well as moving some definitions from ProcessGroupXCCL::Options to Backend::Options. 
These tests are largely based on `pytorch/test/distributed/test_c10d_nccl.py`, but doesn't include some tests: - `test_short_json` since json dumps are not supported in ProcessGroupXCCL - `test_trace_while_all_works_retired`: `_wait_for_pending_works` isn't supported by XCCL - `test_trace_while_active`: XCCL hangs when op is called on only one rank - `test_trace_while_stuck`: XCCL hangs when op is called on only one rank --------- Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com> --- src/xccl/ProcessGroupXCCL.cpp | 51 ++- src/xccl/ProcessGroupXCCL.hpp | 3 +- test/xpu/distributed/test_c10d_xccl.py | 565 ++++++++++++++++++++++++- 3 files changed, 598 insertions(+), 21 deletions(-) diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index aca5a928aa..ba25869a2a 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -459,7 +459,8 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( bool isP2P, const char* profilingTitle, const std::vector& inputs, - const std::vector& outputs) { + const std::vector& outputs, + bool record) { auto r = c10::make_intrusive( device, rank, @@ -470,20 +471,22 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( profilingTitle != nullptr ? std::optional>(inputs) : std::nullopt); - r->trace_id_ = FlightRecorderXCCL::get()->record( - local_id_, - std::make_tuple(pg_uid_, pg_desc_), // PG name tuple - seqCollective_, - seqP2P_, - op_id_, - profilingTitle ? profilingTitle : "", - inputs, - outputs, - nullptr, - r->xcclEndEvent_.get(), - options_->timeout, - pgStatus_, - isP2P); + if (record) { + r->trace_id_ = FlightRecorderXCCL::get()->record( + local_id_, + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + seqCollective_, + seqP2P_, + op_id_, + profilingTitle ? 
profilingTitle : "", + inputs, + outputs, + nullptr, + r->xcclEndEvent_.get(), + options_->timeout, + pgStatus_, + isP2P); + } return r; } @@ -664,16 +667,19 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( const char* profilingTitle, bool nanCheck) { nanCheck &= enableNanCheck_; - seqCollective_++; auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); + if (!coalescing_state_) { + seqCollective_++; + } + op_id_++; + if (coalescing_state_ & CoalActive) { if ((coalescing_state_ & CoalColl) == 0) { seqCollective_++; } - op_id_++; coalescing_state_ |= CoalColl; if (coalescedDevice_.index() < 0) { coalescedDevice_ = device; @@ -714,8 +720,15 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( } c10::intrusive_ptr work; - work = - initWork(device, rank_, opType, false, profilingTitle, inputs, outputs); + work = initWork( + device, + rank_, + opType, + false, + profilingTitle, + inputs, + outputs, + !coalescing_state_); if (coalescing_state_) { FlightRecorderXCCL::get()->record( local_id_, diff --git a/src/xccl/ProcessGroupXCCL.hpp b/src/xccl/ProcessGroupXCCL.hpp index 7074354c64..bbc1f7cca8 100644 --- a/src/xccl/ProcessGroupXCCL.hpp +++ b/src/xccl/ProcessGroupXCCL.hpp @@ -180,7 +180,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { bool isP2P, const char* profilingTitle = nullptr, const std::vector& inputs = {}, - const std::vector& outputs = {}); + const std::vector& outputs = {}, + bool record = false); template c10::intrusive_ptr collective( diff --git a/test/xpu/distributed/test_c10d_xccl.py b/test/xpu/distributed/test_c10d_xccl.py index 57b33ae29d..062b502781 100644 --- a/test/xpu/distributed/test_c10d_xccl.py +++ b/test/xpu/distributed/test_c10d_xccl.py @@ -1,16 +1,21 @@ # Owner(s): ["oncall: distributed"] +import json import math import os +import pickle import random import signal import sys +import tempfile +import threading import time -from datetime import timedelta +from datetime import datetime, timedelta from enum import auto, Enum from unittest import mock import torch +import torch._C._distributed_c10d import torch.distributed as c10d import torch.distributed._functional_collectives as _functional_collectives @@ -733,6 +738,564 @@ def test_wait_tensor(self) -> None: self.assertEqual(input1, input2) +class XCCLTraceTestBase(MultiProcessTestCase): + def setUp(self): + super().setUp() + os.environ["TORCH_FR_BUFFER_SIZE"] = "1000" + self.tempdir = tempfile.TemporaryDirectory() + os.environ["TORCH_FR_DUMP_TEMP_FILE"] = self._trace_basename() + os.environ["TORCH_FR_DEBUG_INFO_PIPE_FILE"] = self._trace_basename() + self._spawn_processes() + + @classmethod + def _run( + cls, + parent_conn, + rank: int, + test_name: str, + file_name: str, + parent_pipe, + **kwargs, + ) -> None: + cls.parent = parent_conn + super()._run(rank, test_name, file_name, parent_pipe) + + @property + def local_device(self): + return torch.device("xpu", self.rank_to_GPU[self.rank][0]) + + def _join_processes(self, fn): + # We need to patch sys.exit() as skip_if will use sys.exit() and + # the exit code from the this process will not be caught. 
+ with mock.patch("sys.exit"): + fn() + super()._join_processes(fn) + + def _spawn_processes(self) -> None: + proc = torch.multiprocessing.get_context("spawn").Process + self.children_pipes = [] + parent_pipes = [] + for _ in range(self.world_size): + parent_conn, child_conn = torch.multiprocessing.Pipe() + self.children_pipes.append(child_conn) + parent_pipes.append(parent_conn) + piter = iter(parent_pipes) + + def wrap(*positional, args, **kwargs): + args = (next(piter), *args) + return proc(*positional, args=args, **kwargs) + + self._start_processes(wrap) + + def _create_process_group_xccl( + self, timeout=timedelta(seconds=600), device_id=None + ): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + "xccl", + world_size=self.world_size, + rank=self.rank, + store=store, + timeout=timeout, + device_id=device_id, + ) + pg = c10d.distributed_c10d._get_default_group() + return pg + + def tearDown(self): + super().tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + @property + def world_size(self): + return 2 + + @property + def rank_to_GPU(self): + # return rank to GPU map + return init_multigpu_helper(self.world_size, "xccl") + + def _trace_basename(self): + # we pass the base to the env, and the dump util will append rank + return os.path.join(self.tempdir.name, "trace_") + + def _trace_name(self, rank): + return self._trace_basename() + str(rank) + + def started_or_scheduled(self, timing_enabled=False): + return "started" if timing_enabled else "scheduled" + + +class XCCLTraceTest(XCCLTraceTestBase): + def _verify_trace(self, t, include_collectives, is_json, timing_enabled=False): + ver = t["version"] + self.assertEqual(ver, "2.10") + comm_lib_version = t["comm_lib_version"] + torch_comm_lib_version = torch._C._distributed_c10d.get_xccl_version() + self.assertEqual(comm_lib_version, torch_comm_lib_version) + pg_config = t["pg_config"] + self.assertEqual(len(pg_config), 1) + default_pg_info = pg_config["0"] + self.assertIn("name", default_pg_info) + self.assertIn("desc", default_pg_info) + self.assertIn("ranks", default_pg_info) + pg_status = t["pg_status"] + self.assertEqual(len(pg_status), 1) + self.assertEqual(str(pg_status["0"]["last_enqueued_collective"]), "2") + self.assertEqual(str(pg_status["0"]["last_completed_collective"]), "2") + self.assertEqual( + str(pg_status["0"]["last_started_collective"]), + "2" if timing_enabled else "-1", + ) + global_ranks = pg_config["0"]["ranks"] + self.assertEqual(len(json.loads(global_ranks)), self.world_size) + if include_collectives: + self.assertEqual(len(t["entries"]), 2) + t = t["entries"] + last = t[-1] + self.assertEqual(last["thread_id"], str(threading.current_thread().ident)) + self.assertEqual(last["thread_name"], "fr_test_thread") + self.assertEqual(last["process_group"], ("0", "default_pg")) + # self.assertEqual(last["state"], "completed") # Watchdog will fix marking works completed + s = last["time_discovered_started_ns"] + f = last["time_discovered_completed_ns"] + self.assertEqual(last["record_id"], 1) + # self.assertIsNotNone(f) + if timing_enabled: + self.assertIsNotNone(s) + self.assertTrue(s <= f) + # we don't collect stack traces in JSON at the moment + if not is_json: + self.assertIn("test_c10d_xccl.py", str(last["frames"])) + self.assertEqual(last["input_sizes"], ((3, 4),)) + self.assertEqual(last["input_dtypes"], ["Float"]) + self.assertEqual(last["output_sizes"], ((3, 4),)) + self.assertEqual(last["output_dtypes"], ["Float"]) + 
self.assertEqual(last["collective_seq_id"], 2) + self.assertEqual(last["timeout_ms"], 600000) + now = datetime.now() + event_created_time = datetime.fromtimestamp( + last["time_created_ns"] / 1000000000 + ) + before_test = now - timedelta(minutes=1) + self.assertTrue(before_test < event_created_time < now) + if timing_enabled: + # very loose bounds, measured 0.036 ms on devgpu + self.assertTrue(0 < last["duration_ms"] < 100) + else: + self.assertTrue("duration_ms" not in last) + else: + self.assertTrue("entries" not in t) + + def load_libpthread_or_libc(self): + import ctypes.util + + for base in ("pthread", "c"): + path = ctypes.util.find_library(base) + if path: + try: + return ctypes.CDLL(path) + except OSError: + continue + raise RuntimeError("Could not load pthread or libc") + + # Directly set thread name using threading.current_thread().name does not work + # because we use pthread_getname_np to get the threadโ€™s OS-level name in C++ + def set_thread_name(self, name): + import ctypes + + lib = self.load_libpthread_or_libc() + pthread_self = lib.pthread_self + pthread_self.restype = ctypes.c_void_p + pthread_setname_np = lib.pthread_setname_np + pthread_setname_np.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + + # Get current pthread handle + tid = pthread_self() + + # Set name + pthread_setname_np(tid, name.encode()) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize("include_collectives", [True, False]) + def test_short_pickle(self, include_collectives, timing_enabled=False): + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + device = self.local_device + self.set_thread_name("fr_test_thread") + a = torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + # gah ok so now the duration_ms is populated best-effort since it can only happen outside "dump()" api + time.sleep(1) + t = pickle.loads( + torch._C._distributed_c10d._dump_xccl_trace( + includeCollectives=include_collectives + ) + ) + self._verify_trace( + t, + include_collectives=include_collectives, + is_json=True, + timing_enabled=timing_enabled, + ) + dist.destroy_process_group() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_dump_pipe(self): + def open_file_with_timeout(file_path, mode, timeout=1.0): + start_time = time.time() + while time.time() - start_time < timeout: + if os.path.exists(file_path): + return open(file_path, mode) + time.sleep(0.1) + raise FileNotFoundError + + if self.rank == self.MAIN_PROCESS_RANK: + for c in self.children_pipes: + self.assertEqual(c.recv(), "next") + + dump_file = self._trace_name(rank=0) + pipe_file = dump_file + ".pipe" + with open_file_with_timeout(pipe_file, "w") as f: + f.write("1\n") + with open_file_with_timeout(dump_file, "rb", timeout=10.0) as f: + self.assertTrue("all_reduce" in str(pickle.load(f))) + + for c in self.children_pipes: + c.send("next") + return + + pg = self._create_process_group_xccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + self.parent.send("next") + self.parent.recv() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_long(self): + os.environ["TORCH_FR_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + device = self.local_device + a = torch.full((3, 4), 
float(self.rank), device=device) + for _ in range(2): + # test some other primitives to make sure + # their strings are valid + xs = [torch.ones(3, 4, device=device)] + pg.broadcast(xs).wait() + pg.allreduce(xs).wait() + # pg.reduce(xs).wait() //Currently failing on XPU + ys = [[torch.empty(3, 4, device=device) for _ in range(self.world_size)]] + pg.allgather(ys, xs).wait() + pg.reduce_scatter(xs, ys).wait() + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 10) + first = t[0] + last = t[-1] + self.assertEqual(last["profiling_name"], "xccl:all_reduce") + # self.assertEqual(last["state"], "completed") # Watchdog will fix marking works completed + self.assertIn("test_c10d_xccl.py", str(last["frames"])) + self.assertEqual(last["input_sizes"], ((3, 4),)) + self.assertEqual(last["input_dtypes"], ["Float"]) + self.assertEqual(last["output_sizes"], ((3, 4),)) + self.assertEqual(last["output_dtypes"], ["Float"]) + self.assertEqual(last["timeout_ms"], 600000) + self.assertEqual(last["collective_seq_id"] - first["collective_seq_id"], 9) + dist.destroy_process_group() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_barrier_profiling(self): + os.environ["TORCH_FR_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + f = pg.barrier() + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 2) + first = t[0] + last = t[-1] + self.assertEqual(first["profiling_name"], "xccl:all_reduce_barrier") + self.assertEqual(last["profiling_name"], "xccl:all_reduce") + dist.destroy_process_group() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize( + "op_sizes_per_coalesce", + [ + [(2, 3)], + [(2, 3), (5, 5), (1,)], + ], + ) + @parametrize("timing_enabled", [False]) + def test_batched_send_recv(self, op_sizes_per_coalesce, timing_enabled): + """ + 'WorkEnqueue' was skipped for isendirecv, leading to segfault on dump_entries when update_state tried to use + a destructed Work obj's xpu events + """ + + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + + num_coalesced_ops = 20 + ops_per_coalesce = len(op_sizes_per_coalesce) + for _ in range(num_coalesced_ops): + ops = [] + for input_sizes in op_sizes_per_coalesce: + tensor = torch.zeros(input_sizes).to(self.local_device) + if self.rank == 0: + ops.append(dist.P2POp(dist.irecv, tensor, 1)) + elif self.rank == 1: + tensor *= 2 + ops.append(dist.P2POp(dist.isend, tensor, 0)) + + dist.batch_isend_irecv(ops).pop().wait() + + torch.xpu.synchronize(device=self.local_device) + + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + self.assertEqual(len(t["entries"]), num_coalesced_ops * ops_per_coalesce) + + expected_record_id = 0 + expected_seq = 1 + expected_op_id = 1 + for seq in range(num_coalesced_ops): + first_op = seq * (ops_per_coalesce) + coalesced_op = first_op + ops_per_coalesce + for p2p_op_idx, input_sizes in zip( + range(first_op, coalesced_op, 1), op_sizes_per_coalesce + ): + # the indivudal ops inside the coalescing group the individual op 
metadata, + # but not the timing info coming from the actual coalesced kernel + profiling_name = ( + "xccl:recv 0<-1" if self.rank == 0 else "xccl:send 1->0" + ) + self.assertEqual( + t["entries"][p2p_op_idx]["record_id"], expected_record_id + ) + expected_record_id += 1 + self.assertEqual( + t["entries"][p2p_op_idx]["profiling_name"], profiling_name + ) + # we don't increment collective_seq_id for p2p ops. + self.assertEqual(t["entries"][p2p_op_idx]["collective_seq_id"], 0) + self.assertEqual(t["entries"][p2p_op_idx]["p2p_seq_id"], expected_seq) + self.assertEqual(t["entries"][p2p_op_idx]["op_id"], expected_op_id) + expected_op_id += 1 + self.assertEqual(t["entries"][p2p_op_idx]["input_sizes"], [input_sizes]) + self.assertEqual( + t["entries"][p2p_op_idx]["output_sizes"], [input_sizes] + ) + # duration doesn't get tagged onto individual ops yet, nor is their state updated + self.assertEqual(t["entries"][p2p_op_idx]["state"], "scheduled") + self.assertTrue("duration_ms" not in t["entries"][p2p_op_idx]) + + # coalesced ops not yet supported in FR + expected_seq += 1 + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize( + "op_sizes", + [ + [(2, 3)], + [(2, 3), (5, 5), (1,)], + ], + ) + @parametrize("timing_enabled", [False]) + def test_individual_send_recv(self, op_sizes, timing_enabled): + """ + 'WorkEnqueue' was skipped for isendirecv, leading to segfault on dump_entries when update_state tried to use + a destructed Work obj's xpu events + """ + + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + num_repeats = 10 + ops_per_repeat = len(op_sizes) + for _ in range(num_repeats): + for input_sizes in op_sizes: + tensor = torch.zeros(input_sizes).to(self.local_device) + if self.rank == 0: + dist.recv(tensor, 1) + elif self.rank == 1: + tensor *= 2 + dist.send(tensor, 0) + + torch.xpu.synchronize(device=self.local_device) + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + self.assertEqual(len(t["entries"]), num_repeats * (ops_per_repeat)) + expected_seq = 1 + expected_op_id = 1 + for seq in range(num_repeats * ops_per_repeat): + input_sizes = op_sizes[seq % ops_per_repeat] + profiling_name = "xccl:recv 0<-1" if self.rank == 0 else "xccl:send 1->0" + self.assertEqual(t["entries"][seq]["profiling_name"], profiling_name) + # we don't increment collective_seq_id for p2p ops. 
+ self.assertEqual(t["entries"][seq]["collective_seq_id"], 0) + self.assertEqual(t["entries"][seq]["p2p_seq_id"], expected_seq) + expected_seq += 1 + self.assertEqual(t["entries"][seq]["op_id"], expected_op_id) + expected_op_id += 1 + self.assertEqual(t["entries"][seq]["input_sizes"], [input_sizes]) + self.assertEqual(t["entries"][seq]["output_sizes"], [input_sizes]) + self.assertEqual(t["entries"][seq]["state"], "completed") + + if timing_enabled: + duration = t["entries"][seq]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][seq]) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize("timing_enabled", [False]) + def test_allgather_uneven(self, timing_enabled): + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + + output_split_sizes = [i + 1 for i in range(self.world_size)] + sum_len = sum(output_split_sizes) + output_tensor = torch.zeros(sum_len, 2).to(self.rank) + expected_tensor = torch.ones(sum_len, 2).to(self.rank) + input_tensor = torch.ones(output_split_sizes[self.rank], 2).to(self.rank) + + dist.all_gather( + list(torch.split(output_tensor, output_split_sizes)), input_tensor + ) + torch.xpu.synchronize(device=self.rank) + self.assertEqual(output_tensor, expected_tensor) + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + self.assertEqual(len(t["entries"]), self.world_size) + for i in range(self.world_size): + self.assertEqual(t["entries"][i]["profiling_name"], "xccl:_broadcast_oop") + # collective_seq_id should be incremented once. + self.assertEqual(t["entries"][i]["collective_seq_id"], 1) + self.assertEqual(t["entries"][i]["input_sizes"], [[i + 1, 2]]) + self.assertEqual( + t["entries"][i]["output_sizes"], + [[i + 1, 2]], + ) + self.assertEqual(t["entries"][i]["state"], "scheduled") + # No event is recorded for individual ops + self.assertTrue("time_discovered_completed_ns" in t["entries"][i]) + # TODO: (frost-intel) Add coalesced op recording for FR + + # TODO(whc) test out other ops (And combinations of ops, if that's valid?) + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize("timing_enabled", [False]) + def test_coalescing_manager_collective(self, timing_enabled): + """ + The coalescing manager api works by accumulating operations in python via a contextmanager, and then making + one call into c++ to an _coalesced API. It has limited support for ops and has been added recently to + avoid overheads of making individual py-cpp calls. This complicates flight recording.. + + For now, flight recording of coalescing_manager collectives is less detailed than cpp coalesced collectives. 
+ """ + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + + output_tensors = torch.zeros(2, 2).to(self.rank) + input_tensors = [torch.ones(2, 2).to(self.rank) for _ in range(self.world_size)] + + # TODO(whc) make this work with bigger world or something + self.assertEqual(self.world_size, 2, self.world_size) + + with dist._coalescing_manager(): + for i in range(self.world_size): + dist.reduce_scatter_tensor(output_tensors[i], input_tensors[i]) + self.assertEqual(output_tensors, input_tensors[self.rank] * self.world_size) + + torch.xpu.synchronize(device=self.rank) + + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + + self.assertEqual( + len(t["entries"]), 1 + ) # one for the reduce_scatter_tensor_coalesced + self.assertEqual( + t["entries"][0]["profiling_name"], "xccl:reduce_scatter_tensor_coalesced" + ) + # collective_seq_id should be incremented once. + self.assertEqual(t["entries"][0]["collective_seq_id"], 1) + self.assertEqual(t["entries"][0]["input_sizes"], [[2, 2], [2, 2]]) + self.assertEqual( + t["entries"][0]["output_sizes"], + [ + [ + 2, + ], + [ + 2, + ], + ], + ) + self.assertEqual(t["entries"][0]["state"], "completed") + if timing_enabled: + duration = t["entries"][0]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][0]) + + +instantiate_parametrized_tests(XCCLTraceTest) instantiate_parametrized_tests(ProcessGroupXCCLTest) From fe324d3fe9e95f202f7d3a5d07385a59aa786129 Mon Sep 17 00:00:00 2001 From: Pawel Swider <137768238+PawelSwider2000@users.noreply.github.com> Date: Thu, 9 Oct 2025 03:12:20 +0200 Subject: [PATCH 27/27] Modify files in install_xpu_headers only if they changes (#2138) Observed that recompilations are triggered by updating files by install_xpu_headers.py script. Turns out that script does not change the files in any way but rewriting the same content into files updating their timestamp causing multiple dependent files to recompile. This PR makes sure that `install_xpu_headers.py` changes or creates files only when content should change. This allow to speedup recompilations several times, by my experience from few minutes to few seconds. 
This fixes: https://github.com/intel/torch-xpu-ops/issues/2093 --------- Co-authored-by: Pawel Swider --- tools/codegen/install_xpu_headers.py | 103 ++++++++++++++++++++------- 1 file changed, 76 insertions(+), 27 deletions(-) diff --git a/tools/codegen/install_xpu_headers.py b/tools/codegen/install_xpu_headers.py index 807a509bc6..0b5ad70660 100644 --- a/tools/codegen/install_xpu_headers.py +++ b/tools/codegen/install_xpu_headers.py @@ -4,6 +4,7 @@ import shutil from pathlib import Path +VERBOSE = False parser = argparse.ArgumentParser(description="Utils for append ops headers") parser.add_argument( @@ -20,34 +21,53 @@ def append_xpu_function_header(src, dst): r""" Cleans trailing empty lines from the destination file, then appends #include lines from the source file that match `#include \s*\r?\n" matches = re.findall(pattern, src_text, re.MULTILINE) if not matches: return - with open(dst, "r+", encoding="utf-8") as f: - dst_lines = f.readlines() - dst_text = "".join(dst_lines) - missing_headers = [match for match in matches if match not in dst_text] - if not missing_headers: - return + try: + with open(dst, encoding="utf-8") as f: + dst_lines = f.readlines() + dst_text = "".join(dst_lines) + except OSError as e: + if VERBOSE: + print(f"Warning: Could not read destination file {dst}: {e}") + return + + missing_headers = [match for match in matches if match not in dst_text] + if not missing_headers: + return + + new_dst_lines = dst_lines.copy() + + while new_dst_lines and not new_dst_lines[-1].strip(): + new_dst_lines.pop() + new_dst_lines.extend(missing_headers) - # Remove trailing empty lines from dst_lines - while dst_lines and not dst_lines[-1].strip(): - dst_lines.pop() + new_content = "".join(new_dst_lines) + old_content = "".join(dst_lines) - f.seek(0) - f.truncate() - f.writelines(dst_lines) - # Append missing headers to the end of the file - f.writelines(missing_headers) + if new_content != old_content: + try: + with open(dst, "w", encoding="utf-8") as f: + f.writelines(new_dst_lines) + except OSError as e: + if VERBOSE: + print(f"Error: Could not write to {dst}: {e}") def parse_ops_headers(src): @@ -78,18 +98,36 @@ def classify_ops_headers(src_dir, dst_dir): def generate_xpu_ops_headers_cmake(src_dir, dst_dir, xpu_ops_headers): r""" - Generate XPU ops headers xpu_ops_generated_headers.cmake + Generate XPU ops headers xpu_ops_generated_headers.cmake only if content changes """ - with open(os.path.join(src_dir, "xpu_ops_generated_headers.cmake"), "w", encoding="utf-8") as fw: - fw.write("set(xpu_ops_generated_headers\n") - for header in xpu_ops_headers: - fw.write(f' "{Path(os.path.join(dst_dir, header)).as_posix()}"\n') - fw.write(")\n") + output_file = os.path.join(src_dir, "xpu_ops_generated_headers.cmake") + + # Generate new content + new_content = "set(xpu_ops_generated_headers\n" + for header in xpu_ops_headers: + new_content += f' "{Path(os.path.join(dst_dir, header)).as_posix()}"\n' + new_content += ")\n" + + # Check if file exists and has same content + should_write = True + if os.path.exists(output_file): + try: + with open(output_file, encoding="utf-8") as f: + existing_content = f.read() + should_write = existing_content != new_content + except OSError: + # If we can't read the file, write it anyway + should_write = True + + if should_write: + with open(output_file, "w", encoding="utf-8") as fw: + fw.write(new_content) def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): r""" For XPU-specific ops headers, copy them to destination build and 
append XPU declarations to common headers. + Copies and appends are done only if leading to file changes to prevent unnecessary recompilations. """ if args.dry_run: return @@ -99,7 +137,16 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): # assert "xpu" in f, f"Error: The function signature or namespace in '{f}' is incorrect. Expected 'xpu' to be present." src = os.path.join(src_dir, f) dst = os.path.join(dst_dir, f) - shutil.copy(src, dst) + # Only copy if src and dst differ or dst does not exist + should_copy = True + if os.path.exists(dst): + try: + with open(src, "rb") as fsrc, open(dst, "rb") as fdst: + should_copy = fsrc.read() != fdst.read() + except OSError: + should_copy = True + if should_copy: + shutil.copy(src, dst) for f in common_headers: src = os.path.join(src_dir, f) @@ -118,6 +165,7 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): with open(dst, "r+", encoding="utf-8") as f: dst_lines = f.readlines() dst_text = "".join(dst_lines) + old_content = "".join(dst_lines) missing_declarations = [] insertion_index = None for index, line in enumerate(dst_lines): @@ -133,9 +181,10 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): break assert (insertion_index is not None), f"Error: No TORCH_API declaration found in {dst}." - f.seek(0) - f.writelines(dst_lines) - f.truncate() + if old_content != "".join(dst_lines): + f.seek(0) + f.writelines(dst_lines) + f.truncate() def main():