diff --git a/.github/ISSUE_TEMPLATE/dynamic-skip.yml b/.github/ISSUE_TEMPLATE/dynamic-skip.yml new file mode 100644 index 0000000000..bdde3c5809 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/dynamic-skip.yml @@ -0,0 +1,63 @@ +name: 🐛 Dynamic skip +description: Create an issue to dynamically skip failures unrelated to a PR +title: "[Bug Skip]: " +labels: ["skipped"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting a bug, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/pytorch/pytorch/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: 🐛 Describe the bug with the skip template + description: | + Please provide a clear and concise description of what the bug is. + The template for a dynamic skip is shown below: + + ```python + # Template (check the GitHub Actions summary) + Cases: + [Category],[Class name],[Test name] + ``` + + ```python + # example + Cases: + op_ut,third_party.torch-xpu-ops.test.xpu.test_transformers_xpu.TestTransformersXPU,test_scaled + ``` + + If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. + + Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple backtick blocks``` ````. + placeholder: | + A clear and concise description of what the bug is, following the dynamic skip template. + ``` + # Skipped cases with the dynamic template + ``` + + ```python + # Sample code to reproduce the problem + ``` + + ``` + The error message you got, with the full traceback. + ``` + validations: + required: true +- type: textarea + attributes: + label: Versions + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py + # For security purposes, please check the contents of collect_env.py before running it. + python collect_env.py + ``` + validations: + required: true +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! 
diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index cc5b9e0a70..a4c77e5f8a 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -57,7 +57,7 @@ runs: run: | # install pytorch if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" -c) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" -c) -ne 0 ];then pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" -c) -ne 0 ];then @@ -77,7 +77,7 @@ runs: # apply extra PRs for stock pytorch pip install requests if [ "${{ github.event_name }}" == "pull_request" ];then - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/mengfei25/pytorch/pull/27 else python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py fi @@ -99,7 +99,7 @@ runs: TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi fi - if [ "${{ github.event_name }}" == "pull_request" ];then + if [ "${{ github.event_name }}" == "pull_request" ] && [[ "${{ inputs.pytorch }}" != *"_wheel" ]];then cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops cd third_party/torch-xpu-ops else @@ -115,6 +115,8 @@ runs: pip install pandas psutil scipy pyyaml cd pytorch if [[ "${{ inputs.suite }}" == *"huggingface"* ]];then + # for new LLM models + pip install accelerate pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt || pip install transformers==4.54.0 soxr==0.5.0 TRANSFORMERS_VERSION_ID="$(python -c 'import os; os.chdir("/tmp"); import transformers; print(transformers.__version__)')" elif [[ "${{ inputs.suite }}" == *"timm_models"* ]];then @@ -139,7 +141,7 @@ runs: fi # for dlrm pip install pyre-extensions - curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install --no-deps + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install # for soft_actor_critic, temp fix pip install git+https://github.com/nocoding03/gym@fix-np cd ../pytorch @@ -152,10 +154,13 @@ runs: TORCHBENCH_COMMIT_ID="$(git rev-parse --short HEAD)" sed -i 's/^ *pynvml.*//' requirements.txt pip install -r requirements.txt - python install.py --continue_on_fail + # python install.py --continue_on_fail + echo "PYTHONPATH=${PWD}:${PYTHONPATH}" >> ${GITHUB_ENV} + pip install dominate + python install.py Super_SloMo # for dlrm pip install pyre-extensions - curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install --no-deps + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install cd ../pytorch else pip install -r ./.ci/docker/requirements-ci.txt @@ -170,6 +175,11 @@ runs: else pip install torchao --pre --index-url https://download.pytorch.org/whl/nightly/xpu fi + if [ "${{ inputs.suite }}" != "None" ];then + # To install numpy 1.x for benchmarks as CUDA + # yolov requires numpy>=1.23 + pip install -U numpy==1.26.4 + fi - name: Torch Config shell: bash -xe {0} run: | diff --git 
a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index fb79182006..6e2cf2ceeb 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -155,7 +155,7 @@ runs: tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log - name: xpu_distributed - shell: timeout 3600 bash -xeu -o pipefail {0} + shell: timeout 36000 bash -xeu -o pipefail {0} if: ${{ inputs.ut_name == 'xpu_distributed' }} run: | xpu-smi topology -m @@ -166,9 +166,13 @@ runs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi + export CCL_ROOT=$(dirname $(which python))/../ + export PATH="${CCL_ROOT}/bin/libfabric:${PATH}" + export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}" python run_distributed.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log + find ../ -type f -name "*.xml" -exec cp {} ${{ github.workspace }}/ut_log/ \; # Summary - name: UT Test Results Summary diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index 8704ed0fc6..60954ffd29 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -29,6 +29,7 @@ runs: shell: bash -xe {0} run: | # dataset + dataset_dir="${RUNNER_TEMP}/_datasets/imagenet" if [ ! -d ${dataset_dir} ];then rm -rf ${dataset_dir} && mkdir -p ${dataset_dir} && cd ${dataset_dir} wget -O valprep.sh https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh @@ -36,6 +37,7 @@ runs: tar -xf ILSVRC2012_img_val.tar bash valprep.sh fi + echo "dataset_dir=${dataset_dir}" >> ${GITHUB_ENV} - name: PT2E Test (${{ inputs.dt }} ${{ inputs.scenario }}) shell: bash -xe {0} run: | diff --git a/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv b/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv index a75d3d2253..fe1e531308 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv @@ -45,3 +45,7 @@ TrOCRForCausalLM,pass,pass,pass,pass,pass XGLMForCausalLM,pass,pass,pass,pass,pass XLNetLMHeadModel,pass,pass,pass,pass,pass YituTechConvBert,pass,pass,pass,pass,pass +Qwen/Qwen3-0.6B,pass,pass,pass,pass,pass +google/gemma-2-2b,pass,pass,pass,pass,pass +meta-llama/Llama-3.2-1B,pass,pass,pass,pass,pass +openai/whisper-tiny,pass,pass,pass,pass,pass diff --git a/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv b/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv index a75d3d2253..6d3726bdfb 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv @@ -45,3 +45,8 @@ TrOCRForCausalLM,pass,pass,pass,pass,pass XGLMForCausalLM,pass,pass,pass,pass,pass XLNetLMHeadModel,pass,pass,pass,pass,pass YituTechConvBert,pass,pass,pass,pass,pass +# https://github.com/intel/torch-xpu-ops/issues/2055 +Qwen/Qwen3-0.6B,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run +google/gemma-2-2b,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run +meta-llama/Llama-3.2-1B,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run +openai/whisper-tiny,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run diff --git 
a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 0e6b95ec45..1aac532497 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -198,7 +198,8 @@ if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' fi if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log + grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log + sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" diff --git a/.github/workflows/_linux_accelerate.yml b/.github/workflows/_linux_accelerate.yml index decf4e612f..fbaa0c3f07 100644 --- a/.github/workflows/_linux_accelerate.yml +++ b/.github/workflows/_linux_accelerate.yml @@ -48,6 +48,9 @@ concurrency: defaults: run: shell: bash {0} +env: + GH_TOKEN: ${{ github.token }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: conditions-filter: @@ -55,8 +58,6 @@ jobs: if: ${{ github.event.pull_request.draft == false }} runs-on: ubuntu-24.04 timeout-minutes: 10 - env: - GH_TOKEN: ${{ github.token }} outputs: disabled_tests: ${{ steps.check-pr-desc.outputs.disabled_tests }} steps: diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 286d124b7b..77caf298b4 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -31,6 +31,8 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: runner: @@ -94,6 +96,7 @@ jobs: - name: Build Pytorch on ${{ needs.runner.outputs.hostname }} run: | export USE_XCCL=1 + export IS_XPU_CI=1 # only build pvc for CI if [ "${{ github.event_name }}" == "pull_request" ];then export TORCH_XPU_ARCH_LIST='pvc' @@ -200,7 +203,6 @@ jobs: python -c "import torchaudio; print(torchaudio.__version__)" python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' - chmod 777 /__w /github ./ -R - name: Upload Torch XPU Wheel if: ${{ success() }} uses: actions/upload-artifact@v4 @@ -213,3 +215,8 @@ jobs: with: name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} path: ${{ github.workspace }}/build_*.log + - name: Cleanup workspace + if: ${{ always() }} + run: | + chmod 777 /__w /github ./ -R + find ./ |grep -v "^\./$" |xargs rm -rf diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index ab49d28490..08f88462e6 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -50,6 +50,7 @@ env: GH_TOKEN: ${{ github.token }} HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: runner: @@ -87,7 +88,6 @@ jobs: cpus_per_xpu: ${{ needs.runner.outputs.cpus_per_xpu }} MODEL_ONLY_NAME: ${{ inputs.model }} AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - dataset_dir: ${{ runner.temp }}/../_datasets/imagenet steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 diff --git a/.github/workflows/_linux_e2e_summary.yml b/.github/workflows/_linux_e2e_summary.yml index 427a35ca82..9824c8c458 100644 --- 
a/.github/workflows/_linux_e2e_summary.yml +++ b/.github/workflows/_linux_e2e_summary.yml @@ -30,14 +30,15 @@ jobs: steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Install gh-cli - run: | - sudo apt-get update - sudo apt-get install gh rsync ca-certificates -y - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: python-version: ${{ inputs.python }} + - name: Install gh-cli + run: | + sudo apt-get update + sudo apt-get install gh rsync ca-certificates -y + pip install pandas requests - name: Download Target Artifact run: | mkdir target/ @@ -64,7 +65,7 @@ jobs: - name: Get summary if: ${{ ! cancelled() }} run: | - pip install pandas requests + exit_label=0 e2e_summary_csv="$(find ./target/ -name "inductor_*.csv" |head -n 1)" if [ -f "${e2e_summary_csv}" ];then bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} @@ -72,22 +73,22 @@ jobs: if [ ${exit_label} -ne 0 ];then grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} fi fi pt2e_summary_csv="$(find ./target/ -name "summary.csv")" if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} - failed_num=$(grep -c ',failed' ${pt2e_summary_csv}) + failed_num=$(grep -c ',failed' ${pt2e_summary_csv} || true) if [ ${failed_num} -ne 0 ];then echo "[Warning] PT2E has failures!" + grep 'failed' ${pt2e_summary_csv} fi fi + exit ${exit_label} - name: Upload Reference Run ID if: ${{ endsWith(inputs.test_type, 'ly') }} run: | gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee new_body.txt 2>&1 - has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt)" + has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt || true)" if [ ${has_or_not} -ne 0 ];then sed -i "s/Inductor-${{ inputs.test_type }}-LTS2:.*/Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}/" new_body.txt else diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index b65d985008..e8474d7ab1 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -26,6 +26,7 @@ env: GH_TOKEN: ${{ github.token }} HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} reference_issue: 1689 jobs: @@ -50,8 +51,6 @@ jobs: op_benchmark: needs: runner runs-on: ${{ needs.runner.outputs.runner_id }} - permissions: - issues: write timeout-minutes: 900 container: image: mengfeili/intel-pvc-driver:1146-1136 @@ -93,6 +92,8 @@ jobs: op_benchmark_test_results_check: needs: op_benchmark runs-on: ubuntu-24.04 + permissions: + issues: write steps: - name: Install gh-cli run: | diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml index fc3deaf7a3..67e2082834 100644 --- a/.github/workflows/_linux_transformers.yml +++ b/.github/workflows/_linux_transformers.yml @@ -56,6 +56,7 @@ concurrency: cancel-in-progress: true env: HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} HF_HUB_ETAG_TIMEOUT: 120 HF_HUB_DOWNLOAD_TIMEOUT: 120 python: ${{ inputs.python != '' && inputs.python || '3.10' }} diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 
1751d9d972..e895a959b7 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -33,6 +33,7 @@ env: GH_TOKEN: ${{ github.token }} HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} UT_SKIP_ISSUE: 1624 jobs: @@ -99,11 +100,12 @@ jobs: test-in-baremetal: needs: runner + timeout-minutes: 600 if: ${{ contains(inputs.ut, 'distributed') }} runs-on: ${{ needs.runner.outputs.runner_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread -n 1 + PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1 steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 diff --git a/.github/workflows/bisect_search.yml b/.github/workflows/bisect_search.yml index 4adba0df6c..d45c0a83f8 100644 --- a/.github/workflows/bisect_search.yml +++ b/.github/workflows/bisect_search.yml @@ -40,47 +40,36 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + GH_TOKEN: ${{ github.token }} + HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: get_runner: runs-on: ${{ inputs.runner }} outputs: - test_host: ${{ steps.runner-info.outputs.test_host }} - test_user: ${{ steps.runner-info.outputs.test_user }} - test_group: ${{ steps.runner-info.outputs.test_group }} + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} steps: - - name: Get runner info + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner id: runner-info - run: | - # get test runner - echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT} - echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} - echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} - # show host info - cat /etc/os-release - uname -a - source /opt/intel/oneapi/setvars.sh - sycl-ls - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - - name: Cleanup workspace - if: ${{ always() }} - run: | - # clean docker cache - docker stop $(docker ps -aq) || true - docker system prune -af || true - # clean files - ls -al - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + uses: ./.github/actions/get-runner biisect-search: needs: get_runner - runs-on: ${{ needs.get_runner.outputs.test_host }} + runs-on: ${{ needs.get_runner.outputs.runner_id }} container: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g - -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} + -u ${{ needs.get_runner.outputs.user_id }} --group-add ${{ needs.get_runner.outputs.render_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool SEARCH_COMMITS: ${{ inputs.search_commits }} @@ -90,38 +79,20 @@ jobs: TORCH_XPU_ARCH_LIST: pvc USE_XCCL: 0 USE_KINETO: 0 - env: - GH_TOKEN: ${{ github.token }} - HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - - name: Check runner - run: | - ls -al - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - sudo rm -rf /tmp/xpu-tool - - name: Setup python-${{ inputs.python }} - uses: actions/setup-python@v5 + - name: 
Checkout torch-xpu-ops + uses: actions/checkout@v4 with: - python-version: ${{ inputs.python }} - - name: Check runner - run: | - hostname && whoami && id - clinfo --list - gcc -v && g++ -v - which python && which pip - python -V - pip install -U pip wheel setuptools - pip list - uname -a - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - pip install cmake ninja pandas psutil scipy requests pybind11 - mkdir gs-logs gs-search - echo "Status,Acc,Perf,PyTorch,Torch-xpu-ops" > gs-logs/summary.csv + path: gs-scripts + - name: Prepare test env + uses: ./gs-scripts/.github/actions/linux-testenv + with: + pytorch: nightly_wheel + python: ${{ inputs.python }} - name: Install oneAPI DLE if: ${{ inputs.oneapi != 'installed' }} run: | - rm -rf ~/intel ~/.intel /tmp/intel + rm -rf ~/intel ~/.intel ${HOME}/intel if [ "${{ inputs.oneapi }}" == "2025.1" ];then ONEAPI_URL="https://registrationcenter-download.intel.com/akdlm/IRC_NAS/3435dc45-055e-4f7a-86b1-779931772404/intel-deep-learning-essentials-2025.1.3.7_offline.sh" elif [ "${{ inputs.oneapi }}" == "2025.2" ];then @@ -130,42 +101,42 @@ jobs: ONEAPI_URL="${{ inputs.oneapi }}" fi wget -q -O oneapi.sh "${ONEAPI_URL}" - bash oneapi.sh -a -s --eula accept --action install --install-dir /tmp/intel/oneapi - echo "XPU_ONEAPI_PATH=/tmp/intel/oneapi" >> ${GITHUB_ENV} - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - with: - path: gs-scripts - - name: Prepare source code + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} + + - name: Summary file run: | - git clone https://github.com/pytorch/pytorch gs-pytorch - cd gs-pytorch + mkdir -p gs-logs + echo "Status,Acc,Perf,PyTorch,Torch-xpu-ops" > gs-logs/summary.csv + - name: Get latest versions + run: | + cd pytorch LATEST_PT_COMMIT="$(git rev-parse HEAD)" - cd .. - git clone https://github.com/intel/torch-xpu-ops gs-torch-xpu-ops - cd gs-torch-xpu-ops + cd third_party/torch-xpu-ops LATEST_XPU_COMMIT="$(git rev-parse HEAD)" - cd .. echo "LATEST_PT_COMMIT=${LATEST_PT_COMMIT}" >> ${GITHUB_ENV} echo "LATEST_XPU_COMMIT=${LATEST_XPU_COMMIT}" >> ${GITHUB_ENV} - name: Prepare test env run: | - pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu if [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/huggingface.py"* ]];then - pip install transformers==4.44.2 + pip install transformers elif [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/timm_models.py"* ]];then - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) + pip install timm elif [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/torchbench.py"* ]];then model_name="$(echo ${{ inputs.search_case }} |sed 's+.*\--only *++;s/ .*//')" git clone https://github.com/pytorch/benchmark gs-benchmark cd gs-benchmark + pip install -r requirements.txt echo "PYTHONPATH=${PWD}:${PYTHONPATH}" >> ${GITHUB_ENV} python install.py ${model_name} - else - pip install -r gs-pytorch/.ci/docker/requirements-ci.txt + cd .. fi - pip uninstall -y torch && pip uninstall -y torch + pip uninstall -y torchvision torchaudio + git clone https://github.com/pytorch/vision gs-vision + cd gs-vision + python setup.py install + cd .. 
+ pip uninstall -y torch - name: Bisect search pytorch if: ${{ contains(inputs.search_commits, 'pytorch') }} run: | @@ -186,7 +157,7 @@ jobs: > ${{ github.workspace }}/gs-logs/search-${new_commit}-${LATEST_XPU_COMMIT}.log 2>&1 && echo $? || echo $?)" new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)" if [ "${old_status}" != "${new_status}" ];then - cd gs-pytorch + cd pytorch git reset --hard rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/ git bisect start ${new_commit} ${old_commit} @@ -219,7 +190,7 @@ jobs: > ${{ github.workspace }}/gs-logs/search-${LATEST_PT_COMMIT}-${new_commit}.log && echo $? || echo $?)" new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)" if [ "${old_status}" != "${new_status}" ];then - cd gs-pytorch + cd pytorch git reset --hard rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/ git bisect start ${new_commit} ${old_commit} diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index d408622efe..5c6207b963 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -111,7 +111,7 @@ jobs: else test_type="build-ondemand" fi - ut="$(echo ${{ inputs.ut }} |sed 's/,/","/g;s/^/["/;s/$/"]/')" + ut="$(echo ${{ inputs.ut }} |sed -E 's/(microbench,|,microbench)//g;s/(,windows|windows,)//g;s/,/","/g;s/^/["/;s/$/"]/')" suite="$(echo ${{ inputs.suite }} |sed 's/,/","/g;s/^/["/;s/$/"]/')" triton="${{ inputs.triton }}" python="${{ inputs.python }}" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index be6b260734..b5e8606ac8 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -115,6 +115,7 @@ jobs: with: runner: pvc_rolling pytorch: ${{ needs.conditions-filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }} ut: ${{ matrix.ut_name }} linux-distributed: @@ -129,6 +130,7 @@ jobs: with: runner: pvc_rolling pytorch: ${{ needs.conditions-filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }} ut: ${{ matrix.ut_name }} linux-e2e: diff --git a/CMakeLists.txt b/CMakeLists.txt index 277f1f067d..063c728411 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,9 @@ set(PROJECT_VERSION "2.3.0") # Avoid SYCL compiler error if(NOT WIN32) string(APPEND CMAKE_CXX_FLAGS " -Wno-error") + if("$ENV{IS_XPU_CI}" STREQUAL "1") + string(APPEND CMAKE_CXX_FLAGS " -Werror=unused-variable") + endif() endif() cmake_policy(SET CMP0048 NEW) diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake index 23956d710a..c0afe2d8f1 100644 --- a/cmake/BuildFlags.cmake +++ b/cmake/BuildFlags.cmake @@ -36,10 +36,16 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" list(APPEND SYCL_HOST_FLAGS -fPIC) list(APPEND SYCL_HOST_FLAGS -std=c++17) list(APPEND SYCL_HOST_FLAGS -Wunused-variable) - # SYCL headers warnings - list(APPEND SYCL_HOST_FLAGS -Wno-deprecated-declarations) - list(APPEND SYCL_HOST_FLAGS -Wno-deprecated) - list(APPEND SYCL_HOST_FLAGS -Wno-attributes) + # Some versions of DPC++ compiler pass paths to SYCL headers as user include paths (`-I`) rather + # than system paths (`-isystem`). This makes host compiler to report warnings encountered in the + # SYCL headers, such as deprecated warnings, even if warned API is not actually used in the program. 
+ # We expect that this issue will be addressed in the later version of DPC++ compiler. To workaround + # the issue we wrap paths to SYCL headers in `-isystem`. + foreach(FLAGS IN LISTS SYCL_INCLUDE_DIR) + list(APPEND SYCL_HOST_FLAGS "-isystem ${FLAGS}") + endforeach() + # Excluding warnings which flood the compilation output + # TODO: fix warnings in the source code and then reenable them in compilation list(APPEND SYCL_HOST_FLAGS -Wno-sign-compare) endif() diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index 37ec7066b7..493675e804 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -1,6 +1,5 @@ # ATen XPU sources -file(GLOB xpu_h "xpu/*.h") file(GLOB xpu_cpp "xpu/*.cpp") file(GLOB xpu_mkl "native/xpu/mkl/*.cpp") file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp" "native/sparse/xpu/*.cpp" "native/nested/*.cpp" "native/nested/xpu/*.cpp" "native/transformers/*.cpp" "native/quantized/*.cpp") @@ -18,10 +17,28 @@ set(ATen_XPU_MKL_SRCS ${ATen_XPU_MKL_SRCS} PARENT_SCOPE) set(ATen_XPU_NATIVE_CPP_SRCS ${ATen_XPU_NATIVE_CPP_SRCS} PARENT_SCOPE) set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE) -foreach(HEADER ${xpu_h}) - install(FILES ${HEADER} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/ATen/xpu") -endforeach() +# ATen XPU headers -foreach(HEADER ${xpu_ops_generated_headers}) - install(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/ops) -endforeach() +macro(install_xpu_headers glob_pattern dest_subdir) + file(GLOB headers ${glob_pattern}) + if(headers) + install(FILES ${headers} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/${dest_subdir}") + endif() +endmacro() + +install_xpu_headers("xpu/*.h" "ATen/xpu") +install_xpu_headers("native/xpu/*.h" "ATen/native/xpu") +install_xpu_headers("native/xpu/sycl/*.h" "ATen/native/xpu/sycl") +install_xpu_headers("native/xpu/mkl/*.h" "ATen/native/xpu/mkl") +install_xpu_headers("native/nested/xpu/*.h" "ATen/native/nested/xpu") +install_xpu_headers("native/nested/xpu/sycl/*.h" "ATen/native/nested/xpu/sycl") +install_xpu_headers("native/quantized/*.h" "ATen/native/quantized/xpu") +install_xpu_headers("native/quantized/sycl/*.h" "ATen/native/quantized/xpu/sycl") +install_xpu_headers("native/sparse/xpu/*.h" "ATen/native/sparse/xpu") +install_xpu_headers("native/sparse/xpu/sycl/*.h" "ATen/native/sparse/xpu/sycl") +install_xpu_headers("native/transformers/*.h" "ATen/native/transformers/xpu") +install_xpu_headers("native/transformers/sycl/*.h" "ATen/native/transformers/xpu/sycl") + +if(xpu_ops_generated_headers) + install(FILES ${xpu_ops_generated_headers} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/ops) +endif() diff --git a/src/ATen/native/xpu/Nonzero.cpp b/src/ATen/native/xpu/Nonzero.cpp index 9988631d3b..82cfaa8275 100644 --- a/src/ATen/native/xpu/Nonzero.cpp +++ b/src/ATen/native/xpu/Nonzero.cpp @@ -1,22 +1,18 @@ #include +#include +#include #include - #include -#include +#include namespace at { namespace native { -Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { + +void nonzero_common_checks(const Tensor& self, Tensor& out, const std::string& op_name) { TORCH_CHECK( self.numel() < std::numeric_limits::max(), - "nonzero is not supported for tensors with more than INT_MAX elements, \ - See https://github.com/pytorch/pytorch/issues/51871"); - TORCH_CHECK( - out.dtype() == at::kLong, - "Expected object of scalar type ", - at::kLong, - " as out, but got ", - out.dtype()); + op_name, " is not supported for tensors with more than INT_MAX elements, \ + See https://github.com/pytorch/pytorch/issues/51871"); 
TORCH_CHECK( self.device() == out.device(), "expected self and out to be on the same device, but got out on ", @@ -24,11 +20,24 @@ Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { " and self on ", self.device()); TORCH_CHECK( - self.dim() <= MAX_DIMS, - "nonzero is not supported for tensor with more than ", - MAX_DIMS, + self.dim() <= XPU_MAX_TENSORINFO_DIMS, + op_name, " is not supported for tensor with more than ", + XPU_MAX_TENSORINFO_DIMS, " dimensions"); +} +Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { + TORCH_CHECK( + out.dtype() == at::kLong, + "Expected object of scalar type ", + at::kLong, + " as out, but got ", + out.dtype()); + nonzero_common_checks(self, out, "nonzero"); + if (self.numel() == 0) { + out = at::detail::empty_xpu({0, self.dim()}, out.options()); + return out; + } xpu::nonzero_kernel(self, out); return out; } @@ -38,5 +47,49 @@ Tensor nonzero_xpu(const Tensor& self) { nonzero_out_xpu(self, out); return out; } + +Tensor& nonzero_static_out_xpu( + const Tensor& self, + int64_t size, + int64_t fill_value, + Tensor& out) { + TORCH_CHECK( + size >= 0, "nonzero_static: 'size' must be an non-negative integer"); + TORCH_CHECK( + out.dtype() == at::kLong, + "nonzero_static: Expected out tensor to have scalar type Long"); + nonzero_common_checks(self, out, "nonzero_static"); + if (self.numel() == 0) { + out = at::full({size, self.dim()}, fill_value, out.options()); + return out; + } + + Tensor nonzero_out = at::detail::empty_xpu({0}, self.options().dtype(kLong)); + xpu::nonzero_kernel(self, nonzero_out); + auto nonzero_size = nonzero_out.size(0); + out.resize_({size, self.dim()}); + + if (nonzero_size > size) { + out.copy_(nonzero_out.narrow(0, 0, size)); + } else if (nonzero_size < size) { + auto padding = at::full({size - nonzero_size, self.dim()}, fill_value, out.options()); + out.copy_(at::cat({nonzero_out, padding}, 0)); + } else { + out.copy_(nonzero_out); + } + return out; +} + +Tensor nonzero_static_xpu( + const Tensor& self, + int64_t size, + int64_t fill_value) { + TORCH_CHECK( + size >= 0, "nonzero_static: 'size' must be an non-negative integer"); + Tensor out = at::detail::empty_xpu({size, self.dim()}, self.options().dtype(kLong)); + nonzero_static_out_xpu(self, size, fill_value, out); + return out; +} + } // namespace native -} // namespace at \ No newline at end of file +} // namespace at diff --git a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp index 29facbaa3d..26e80fa4d0 100644 --- a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp +++ b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp @@ -54,7 +54,7 @@ void error_handle( return; } - for (auto& i : ids) { + for (size_t i = 0; i < errs.size(); ++i) { try { std::rethrow_exception(errs[i]); } catch (const oneapi::mkl::lapack::exception& e) { diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp index 8c6e47f774..2c2c60ada9 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp @@ -35,9 +35,9 @@ struct HardswishBackwardFunctor { const opmath_t one_half(0.5f); opmath_t grad_val = static_cast(grad_val_); opmath_t self_val = static_cast(self_val_); - if (self_val < neg_three) { + if (self_val <= neg_three) { return zero; - } else if (self_val <= three) { + } else if (self_val < three) { return grad_val * ((self_val / three) + one_half); } else { return grad_val; diff --git 
a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp index 4caa4a7ead..cf856d6177 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -25,9 +26,7 @@ inline int max(int a, int b) { template struct AvgPool2dKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - - if (index < total_elements_) { + XPU_KERNEL_LOOP(item, index, total_elements_) { const int pw = index % pooled_width_; const int ph = (index / pooled_width_) % pooled_height_; const int c = (index / pooled_width_ / pooled_height_) % channels_; @@ -73,19 +72,19 @@ struct AvgPool2dKernelFunctor { AvgPool2dKernelFunctor( scalar_t* top_data, const scalar_t* bottom_data, - index_t total_elements, - index_t channels, - index_t height, - index_t width, - int pooled_height, - int pooled_width, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - int pad_h, - int pad_w, - int divisor_override, + const int total_elements, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t pooled_height, + const int pooled_width, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int divisor_override, bool count_include_pad, bool use_divisor) : top_data_(top_data), @@ -109,19 +108,19 @@ struct AvgPool2dKernelFunctor { private: scalar_t* top_data_; const scalar_t* bottom_data_; - index_t total_elements_; - index_t channels_; - index_t height_; - index_t width_; - int pooled_height_; - int pooled_width_; - int kernel_h_; - int kernel_w_; - int stride_h_; - int stride_w_; - int pad_h_; - int pad_w_; - int divisor_override_; + const int total_elements_; + const int64_t channels_; + const int64_t height_; + const int64_t width_; + const int64_t pooled_height_; + const int pooled_width_; + const int kernel_h_; + const int kernel_w_; + const int stride_h_; + const int stride_w_; + const int pad_h_; + const int pad_w_; + const int divisor_override_; bool count_include_pad_; bool use_divisor_; }; @@ -129,9 +128,7 @@ struct AvgPool2dKernelFunctor { template struct AvgPool2dChannelsLastKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - - if (index < total_elements_) { + XPU_KERNEL_LOOP(item, index, total_elements_) { const int c = index % channels_; const int pw = (index / channels_) % pooled_width_; const int ph = (index / channels_ / pooled_width_) % pooled_height_; @@ -327,8 +324,7 @@ void launch_avg_pool2d_kernel( template struct AvgPool2dChannelsLastBackwardKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - if (index < total_elements_) { + XPU_KERNEL_LOOP_TYPE(item, index, total_elements_, index_t) { const int c = index % channels_; const int w = (index / channels_) % width_ + pad_w_; const int h = (index / channels_ / width_) % height_ + pad_h_; @@ -431,8 +427,7 @@ struct AvgPool2dChannelsLastBackwardKernelFunctor { template struct AvgPool2dBackwarKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - if (index < total_elements_) { + XPU_KERNEL_LOOP_TYPE(item, index, total_elements_, index_t) { // find out the local index // find out the local offset const int w = index % width_ + pad_w_; diff --git 
a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index f8e1b6906e..4124890274 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -185,8 +185,7 @@ int get_num_threads_by_dev_max_group_size( int get_prefer_simd(int numPlane, int nHw) { // decide SIMD: SIMD32 or SIMD16 - auto dev_id = at::xpu::getDeviceIndexOfCurrentQueue(); - + auto dev_id = at::xpu::current_device(); auto* dev_prop = at::xpu::getDeviceProperties(dev_id); auto sub_group_size = dev_prop->sub_group_sizes; int simd = sub_group_size[1]; diff --git a/src/ATen/native/xpu/sycl/GridSampler.h b/src/ATen/native/xpu/sycl/GridSampler.h index 33192f4cd7..cfc6283d2f 100644 --- a/src/ATen/native/xpu/sycl/GridSampler.h +++ b/src/ATen/native/xpu/sycl/GridSampler.h @@ -49,9 +49,10 @@ static inline scalar_t safe_downgrade_to_int_range(scalar_t x) { // -100.0 does not have special meaning. This is just to make sure // it's not within_bounds_2d or within_bounds_3d, and does not cause // undefined behavior. + // We avoid using double here because some platforms may not support it. if (static_cast(x) > INT_MAX - 1 || x < INT_MIN || - !std::isfinite(static_cast(x))) - return static_cast(-100.0); + !std::isfinite(x)) + return static_cast(-100.0f); return x; } diff --git a/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp b/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp index 4862af9a8f..bffd6117b5 100644 --- a/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp +++ b/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp @@ -9,17 +9,15 @@ namespace at::native::xpu { template struct HermitePolynomialHFunctor { scalar_t operator()(scalar_t x, scalar_t n_) const { - int64_t n = static_cast(n_); + auto n = static_cast(n_); if (n < 0) { return scalar_t(0.0); - } - - if (n == 0) { + } else if (n == 0) { return scalar_t(1.0); - } - - if (n == 1) { + } else if (n == 1) { return x + x; + } else if (n > getHermitianLimit()) { + return std::numeric_limits::quiet_NaN(); } scalar_t p = scalar_t(1.0); diff --git a/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp b/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp index 6c37e87202..8ba1d8bfff 100644 --- a/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp +++ b/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp @@ -9,17 +9,15 @@ namespace at::native::xpu { template struct HermitePolynomialHeFunctor { scalar_t operator()(scalar_t x, scalar_t n_) const { - int64_t n = static_cast(n_); + auto n = static_cast(n_); if (n < 0) { return scalar_t(0.0); - } - - if (n == 0) { + } else if (n == 0) { return scalar_t(1.0); - } - - if (n == 1) { + } else if (n == 1) { return x; + } else if (n > getHermitianLimit()) { + return std::numeric_limits::quiet_NaN(); } scalar_t p = scalar_t(1.0); diff --git a/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp b/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp index 8688f80456..869d64492a 100644 --- a/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp +++ b/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp @@ -30,7 +30,7 @@ struct LaguerrePolynomialLFunctor { scalar_t q = scalar_t(1.0) - x; scalar_t r; - for (int64_t k = 1; k < n; k++) { + for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { r = (((k + k) + (scalar_t(1.0) - x)) * q - k * p) / (k + 1); p = q; q = r; diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index 92a4195d01..e4a8444c94 100644 --- 
a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -1103,9 +1103,6 @@ void _layer_norm_backward_kernel( // affecting performance and behavior. const scalar_t* dY_data = dY.const_data_ptr(); const scalar_t* X_data = X.const_data_ptr(); - weight_t* dg_data = - dgamma.defined() ? dgamma.data_ptr() : nullptr; - weight_t* db_data = dbeta.defined() ? dbeta.data_ptr() : nullptr; Tensor dgamma_blocks; Tensor dbeta_blocks; weight_t* dgamma_blocks_ptr = nullptr; diff --git a/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp b/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp index 2176c4e79c..caf1d8ac16 100644 --- a/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp +++ b/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp @@ -34,7 +34,7 @@ struct LegendrePolynomialPFunctor { scalar_t q = x; scalar_t r; - for (int64_t k = 1; k < n; k++) { + for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { r = ((k + k + 1) * x * q - k * p) / (k + 1); p = q; q = r; diff --git a/src/ATen/native/xpu/sycl/Loops.h b/src/ATen/native/xpu/sycl/Loops.h index d9c4ede284..b6d677b78d 100644 --- a/src/ATen/native/xpu/sycl/Loops.h +++ b/src/ATen/native/xpu/sycl/Loops.h @@ -228,8 +228,8 @@ static void launch_legacy_group_range_kernel(int64_t N, const func_t& f) { return; } - int wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, wg_sz * vec_size); + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, wg_sz * vec_size); sycl_kernel_submit>( wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), 0, N, f); } @@ -241,9 +241,9 @@ static void launch_legacy_global_range_kernel(int64_t N, const func_t& f) { return; } - int wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, wg_sz); - int hw_max_num_wg = syclMaxWorkItemsPerTile() / wg_sz; + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, wg_sz); + int64_t hw_max_num_wg = syclMaxWorkItemsPerTile() / wg_sz; num_wg = num_wg > hw_max_num_wg ? 
hw_max_num_wg : num_wg; sycl_kernel_submit>( wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), 0, N, f); @@ -267,8 +267,8 @@ static inline void launch_unrolled_kernel( TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); static constexpr int item_work_size = 4; - auto wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, wg_sz * item_work_size); + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, wg_sz * item_work_size); sycl_kernel_submit; auto wg_sz = syclMaxWorkItemsPerSubSlice(); -#define VEC_KER(vec_size) \ - { \ - TORCH_CHECK(max_scalar_bytes* vec_size <= 16); \ - if constexpr (max_scalar_bytes * vec_size <= 16) { \ - int num_wg = ceil_div(N, wg_sz * vec_size); \ - sycl_kernel_submit>( \ - wg_sz * num_wg, \ - wg_sz, \ - getCurrentSYCLQueue(), \ - 0, \ - N, \ - f, \ - data, \ - input_calc); \ - } \ +#define VEC_KER(vec_size) \ + { \ + TORCH_CHECK(max_scalar_bytes* vec_size <= 16); \ + if constexpr (max_scalar_bytes * vec_size <= 16) { \ + int64_t num_wg = ceil_div(N, wg_sz * vec_size); \ + sycl_kernel_submit>( \ + wg_sz * num_wg, \ + wg_sz, \ + getCurrentSYCLQueue(), \ + 0, \ + N, \ + f, \ + data, \ + input_calc); \ + } \ } switch (vec_size) { @@ -367,7 +367,7 @@ static inline void launch_vectorized_kernel( auto storer = storer_t(); static constexpr int item_work_size = 4; - int num_wg = ceil_div(N, wg_sz * item_work_size); + int64_t num_wg = ceil_div(N, wg_sz * item_work_size); sycl_kernel_submit 0 && N <= std::numeric_limits::max()); static constexpr int item_work_size = 4; - int wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, item_work_size * wg_sz); + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, item_work_size * wg_sz); sycl_kernel_submit= 0; t--) { - item.barrier(sycl_local_fence); + item.barrier(sycl_global_and_local_fence); if (valid && (t < input_length - 1) && (s < 2 * target_length + 1)) { scalar_t lb1 = log_beta_data_ [lb_batch_offset + lb_input_stride_ * (t + 1) + diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp index 72df4b8ee8..d9c166dbcb 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp @@ -17,7 +17,7 @@ struct FlattenIdxtoRealIdxKernelFunctor { if (global_id < N_) { auto dim = global_id / num_nonzeros_; auto index = global_id % num_nonzeros_; - tensor_begin_[global_id] = + out_begin_[global_id] = idx_flat_begin_[index] / divisor_[dim] % sizes_[dim]; } } @@ -25,14 +25,14 @@ struct FlattenIdxtoRealIdxKernelFunctor { int64_t N, const int64_t num_dim, const int64_t num_nonzeros, - int64_t* tensor_begin, + int64_t* out_begin, int64_t* idx_flat_begin, int64_t* divisor, int64_t* sizes) : N_(N), num_dim_(num_dim), num_nonzeros_(num_nonzeros), - tensor_begin_(tensor_begin), + out_begin_(out_begin), idx_flat_begin_(idx_flat_begin) { for (auto dim = num_dim - 1; dim >= 0; dim--) { sizes_[dim] = sizes[dim]; @@ -44,7 +44,7 @@ struct FlattenIdxtoRealIdxKernelFunctor { int64_t N_; const int64_t num_dim_; const int64_t num_nonzeros_; - int64_t* tensor_begin_; + int64_t* out_begin_; int64_t* idx_flat_begin_; int64_t divisor_[XPU_MAX_TENSORINFO_DIMS]; int64_t sizes_[XPU_MAX_TENSORINFO_DIMS]; @@ -79,77 +79,68 @@ struct CopyIfFunc { }; template -void nonzero_template(const Tensor& self_, Tensor& tensor) { +void nonzero_template(const Tensor& self_, Tensor& out) { Tensor self = self_.contiguous(); const int64_t num_dim = self.dim(); - TORCH_CHECK(num_dim <= XPU_MAX_TENSORINFO_DIMS, "dim 
exceed max allowed dim"); - - int64_t N = self.numel(); - - if (N > 0) { - Tensor idx_flat = at::empty( - {N}, tensor.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); - Tensor range = at::empty( - {N}, tensor.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); - - const scalar_t* self_begin = self.const_data_ptr(); - int64_t* idx_flat_begin = idx_flat.data_ptr(); - int64_t* range_begin = nullptr; - - CopyIfFunc f(self_begin); - auto idx_flat_end = - pstl::copy_if(range_begin, range_begin + N, idx_flat_begin, f); - - auto num_nonzeros = std::distance(idx_flat_begin, idx_flat_end); - - bool need_to_copy = tensor.dim() == 2 && - tensor.sizes()[0] == num_nonzeros && tensor.sizes()[1] == self_.dim() && - !tensor.t().is_contiguous(); - at::Tensor tensor_ = need_to_copy - ? Tensor(at::detail::empty_xpu( - {self_.dim(), num_nonzeros}, tensor.options())) - : tensor.resize_({self_.dim(), num_nonzeros}); - - if (num_nonzeros > 0 && num_dim > 0) { - int64_t* tensor_begin = tensor_.data_ptr(); - - // preload sizes tensor for index calculation - int64_t sizes[XPU_MAX_TENSORINFO_DIMS]; - int64_t divisor[XPU_MAX_TENSORINFO_DIMS]; - sizes[num_dim - 1] = self.size(num_dim - 1); - divisor[num_dim - 1] = 1; - for (auto dim = num_dim - 2; dim >= 0; dim--) { - sizes[dim] = self.size(dim); - divisor[dim] = sizes[dim + 1] * divisor[dim + 1]; - } - - const int64_t N = num_nonzeros * num_dim; - // restore flatten idx to indices - FlattenIdxtoRealIdxKernelFunctor kfn( - N, - num_dim, - num_nonzeros, - tensor_begin, - idx_flat_begin, - divisor, - sizes); - - const auto wg_sz = std::min(syclMaxWorkGroupSize(kfn), N); - const auto num_wg = (N + wg_sz - 1) / wg_sz; - - sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), kfn); - } - if (need_to_copy) { - tensor.copy_(tensor_.t()); - } else { - // transpose out so it is correct size - Tensor tensor_temp = tensor_.t(); - tensor.set_(tensor_temp); + const int64_t N = self.numel(); + + Tensor idx_flat = at::empty( + {N}, out.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); + + const scalar_t* self_begin = self.const_data_ptr(); + int64_t* idx_flat_begin = idx_flat.data_ptr(); + int64_t* range_begin = nullptr; + + CopyIfFunc f(self_begin); + auto idx_flat_end = + pstl::copy_if(range_begin, range_begin + N, idx_flat_begin, f); + + auto num_nonzeros = std::distance(idx_flat_begin, idx_flat_end); + + bool need_to_copy = out.dim() == 2 && + out.sizes()[0] == num_nonzeros && out.sizes()[1] == num_dim && + !out.t().is_contiguous(); + Tensor out_ = need_to_copy + ? 
Tensor(at::detail::empty_xpu( + {num_dim, num_nonzeros}, out.options())) + : out.resize_({num_dim, num_nonzeros}); + + if (num_nonzeros > 0 && num_dim > 0) { + int64_t* out_begin = out_.data_ptr(); + + // preload sizes tensor for index calculation + int64_t sizes[XPU_MAX_TENSORINFO_DIMS]; + int64_t divisor[XPU_MAX_TENSORINFO_DIMS]; + sizes[num_dim - 1] = self.size(num_dim - 1); + divisor[num_dim - 1] = 1; + for (auto dim = num_dim - 2; dim >= 0; dim--) { + sizes[dim] = self.size(dim); + divisor[dim] = sizes[dim + 1] * divisor[dim + 1]; } + const int64_t N = num_nonzeros * num_dim; + // restore flatten idx to indices + FlattenIdxtoRealIdxKernelFunctor kfn( + N, + num_dim, + num_nonzeros, + out_begin, + idx_flat_begin, + divisor, + sizes); + + const auto wg_sz = std::min(syclMaxWorkGroupSize(kfn), N); + const auto num_wg = (N + wg_sz - 1) / wg_sz; + + sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), kfn); + } + if (need_to_copy) { + out.copy_(out_.t()); } else { - tensor = tensor.resize_({num_dim, N}).contiguous().t(); + // transpose out so it is correct size + Tensor out_temp = out_.t(); + out.set_(out_temp); } } diff --git a/src/ATen/native/xpu/sycl/Norm.h b/src/ATen/native/xpu/sycl/Norm.h index 6117b6d261..8b887a443a 100644 --- a/src/ATen/native/xpu/sycl/Norm.h +++ b/src/ATen/native/xpu/sycl/Norm.h @@ -269,8 +269,7 @@ class NormConfig { } void get_max_vec_size() { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int total_resource = syclMaxWorkItemsPerTile(dev_id); + int64_t total_resource = syclMaxWorkItemsPerTile(); constexpr int float4_size = sizeof(float) * 4; max_vec_size = float4_size / element_size_bytes; diff --git a/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp b/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp index f0188ce0ee..869a2b6627 100644 --- a/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp +++ b/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp @@ -440,7 +440,6 @@ Tensor roi_align_kernel( at::Tensor output = at::zeros( {num_rois, channels, pooled_height, pooled_width}, input.options()); - auto output_size = num_rois * pooled_height * pooled_width * channels; if (output.numel() == 0) { return output; diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index fdedf5fb09..020fe4a1bc 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -1559,8 +1559,7 @@ void spatial_softmax_forward( canUse32BitIndexMath(input) && canUse32BitIndexMath(output); // decide SIMD: SIMD32 or SIMD16 - auto dev_id = at::xpu::getDeviceIndexOfCurrentQueue(); - auto* dev_prop = at::xpu::getDeviceProperties(dev_id); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -1749,8 +1748,7 @@ void spatial_softmax_backward( canUse32BitIndexMath(output) && canUse32BitIndexMath(gradOutput); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -1901,8 +1899,7 @@ Tensor& masked_softmax_forward( canUse32BitIndexMath(input) && canUse32BitIndexMath(output); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = 
dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -2026,8 +2023,7 @@ void masked_softmax_backward( canUse32BitIndexMath(output) && canUse32BitIndexMath(gradOutput); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { diff --git a/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp b/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp index 8c46658e7a..fed5fee904 100644 --- a/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp @@ -669,8 +669,7 @@ void split_with_sizes_copy_out_xpu_contiguous_no_cast( num_groups += div_up(split_chunk_size, GROUP_SIZE * BYTES_PER_THREAD); } - auto dev_id = getDeviceIndexOfCurrentQueue(); - int64_t tile_size = syclMaxWorkItemsPerTile(dev_id); + int64_t tile_size = syclMaxWorkItemsPerTile(); const int64_t max_groups = tile_size / GROUP_SIZE * 2.0; // Make each thread process BYTES_PER_THREAD * iter_factor bytes to regulate diff --git a/src/comm/DeviceProperties.h b/src/comm/DeviceProperties.h index 1620ea1b7f..b4532fe59a 100644 --- a/src/comm/DeviceProperties.h +++ b/src/comm/DeviceProperties.h @@ -4,7 +4,6 @@ #include #include -#include namespace syclext = sycl::ext::oneapi; namespace syclexp = sycl::ext::oneapi::experimental; @@ -14,10 +13,9 @@ namespace sycl { template static int64_t syclMaxWorkGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { - auto q = c10::xpu::getCurrentXPUStream(dev_id).queue(); - auto ctx = q.get_context(); - auto dev = q.get_device(); + at::DeviceIndex dev_id = at::xpu::current_device()) { + auto& ctx = c10::xpu::get_device_context(); + auto& dev = c10::xpu::get_raw_device(dev_id); auto kid = ::sycl::get_kernel_id(); // The kernel won't be built for devices except for the first device. 
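For context on the `syclMaxWorkGroupSize` hunks above, here is a minimal standalone sketch of the SYCL 2020 kernel-bundle query that such a helper presumably wraps. The `ctx`/`dev` handles and the kernel name `MyKernel` are placeholders introduced for illustration only; they are not part of this patch, and the kernel is assumed to have been built into the program so the bundle lookup succeeds.

```cpp
#include <sycl/sycl.hpp>

class MyKernel; // hypothetical kernel name, for illustration only

// Query the largest work-group size a specific kernel can be launched with on
// a given device: look up the kernel id, fetch an executable kernel bundle
// containing it, then ask the kernel object for its device-specific limit.
size_t max_wg_size_for(const sycl::context& ctx, const sycl::device& dev) {
  auto kid = sycl::get_kernel_id<MyKernel>();
  auto bundle =
      sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx, {kid});
  auto kernel = bundle.get_kernel(kid);
  return kernel
      .get_info<sycl::info::kernel_device_specific::work_group_size>(dev);
}
```

This per-kernel limit can be smaller than the device-wide `max_work_group_size`, which is why a kernel-specific query is used for launch configuration.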
@@ -34,15 +32,15 @@ static int64_t syclMaxWorkGroupSize( template static int64_t syclMaxWorkGroupSize( - KernelClass /*kfn*/, - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + const KernelClass& /*kfn*/, + at::DeviceIndex dev_id = at::xpu::current_device()) { return syclMaxWorkGroupSize(dev_id); } // For SYCL free function template static int64_t syclMaxWorkGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto q = c10::xpu::getCurrentXPUStream(dev_id).queue(); auto ctxt = q.get_context(); auto dev = q.get_device(); @@ -54,67 +52,63 @@ static int64_t syclMaxWorkGroupSize( } static inline int64_t syclDeviceMaxWorkGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_work_group_size; } static inline int64_t syclMaxSubGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); - auto subgroup_sizes = dev_prop->sub_group_sizes; - uint64_t max_val = 0; - for (auto i : subgroup_sizes) { - if (i > max_val) - max_val = i; - } - return max_val; + const auto& subgroup_sizes = dev_prop->sub_group_sizes; + TORCH_CHECK( + !subgroup_sizes.empty(), + "The device subgroup sizes is empty, please check the device status."); + return *std::max_element(subgroup_sizes.begin(), subgroup_sizes.end()); } static inline int64_t syclMinSubGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); - auto subgroup_sizes = dev_prop->sub_group_sizes; - uint64_t min_val = dev_prop->max_work_group_size; - for (auto i : subgroup_sizes) { - if (i < min_val) - min_val = i; - } - return min_val; + const auto& subgroup_sizes = dev_prop->sub_group_sizes; + TORCH_CHECK( + !subgroup_sizes.empty(), + "The device subgroup sizes is empty, please check the device status."); + return *std::min_element(subgroup_sizes.begin(), subgroup_sizes.end()); } static inline int64_t syclMaxComputeUnitSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_compute_units; } static inline int64_t syclGpuEuCount( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_eu_count; } static inline int64_t syclGpuEuSimdWidth( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_eu_simd_width; } static inline int64_t syclGpuHWThreadsPerEU( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_hw_threads_per_eu; } static inline int64_t syclGpuEUCountPerSubslice( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return 
dev_prop->gpu_eu_count_per_subslice; } static inline int64_t syclMaxWorkItemsPerTile( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); int64_t eu_cnt = dev_prop->gpu_eu_count; int64_t simd_width = syclMaxSubGroupSize(dev_id); @@ -123,7 +117,7 @@ static inline int64_t syclMaxWorkItemsPerTile( } static inline int64_t syclMaxWorkItemsPerSubSlice( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); int64_t simd_width = syclMaxSubGroupSize(dev_id); int64_t eu_count = dev_prop->gpu_eu_count_per_subslice; @@ -131,7 +125,7 @@ static inline int64_t syclMaxWorkItemsPerSubSlice( } static inline int64_t syclMaxWorkItemsPerEU( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); int64_t simd_width = syclMaxSubGroupSize(dev_id); int64_t hw_threads = dev_prop->gpu_hw_threads_per_eu; @@ -139,94 +133,76 @@ static inline int64_t syclMaxWorkItemsPerEU( } static inline int64_t syclMaxNumSubGroups( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_num_sub_groups; } static inline int64_t syclMaxDSSNum( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { int64_t dss_num = syclMaxComputeUnitSize(dev_id) / syclGpuEUCountPerSubslice(dev_id); return dss_num; } static inline size_t syclGlobalMemSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->global_mem_size; } static inline int64_t syclLocalMemSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->local_mem_size; } template uint32_t syclPrefVectorWidth( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { (void)dev_id; // Suppress unused variable warning // Hot fix. This is the preferred vector width for GPUs up to LNL/BMG. 
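// For example, with the 16-byte preferred width below, syclPrefVectorWidth<float>()
// evaluates to 16 / sizeof(float) == 4 lanes and syclPrefVectorWidth<sycl::half>()
// to 16 / sizeof(sycl::half) == 8; the template argument only selects the element size.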
- uint32_t vec_width = 16; + constexpr uint32_t vec_width = 16; - if (std::is_same<T, char>::value) { - return vec_width / sizeof(char); - } - if (std::is_same<T, short>::value) { - return vec_width / sizeof(short); - } - if (std::is_same<T, int>::value) { - return vec_width / sizeof(int); - } - if (std::is_same<T, int64_t>::value) { - return vec_width / sizeof(int64_t); - } - if (std::is_same<T, float>::value) { - return vec_width / sizeof(float); + if constexpr ( + std::is_same_v<T, char> || std::is_same_v<T, short> || + std::is_same_v<T, int> || std::is_same_v<T, int64_t> || + std::is_same_v<T, float> || std::is_same_v<T, double> || + std::is_same_v<T, ::sycl::half>) { + return vec_width / sizeof(T); + } else { + throw std::invalid_argument( + "Invalid data type to fetch preferred vector width!"); } - if (std::is_same<T, double>::value) { - return vec_width / sizeof(double); - } - if (std::is_same<T, ::sycl::half>::value) { - return vec_width / sizeof(::sycl::half); - } - throw std::invalid_argument( - "Invalid data type to fetch preferred vector width!"); } template <typename T> uint32_t syclNativeVectorWidth( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); - if (std::is_same<T, char>::value) { + if constexpr (std::is_same_v<T, char>) { return dev_prop->native_vector_width_char; - } - if (std::is_same<T, short>::value) { + } else if constexpr (std::is_same_v<T, short>) { return dev_prop->native_vector_width_short; - } - if (std::is_same<T, int>::value) { + } else if constexpr (std::is_same_v<T, int>) { return dev_prop->native_vector_width_int; - } - if (std::is_same<T, int64_t>::value) { + } else if constexpr (std::is_same_v<T, int64_t>) { return dev_prop->native_vector_width_long; - } - if (std::is_same<T, float>::value) { + } else if constexpr (std::is_same_v<T, float>) { return dev_prop->native_vector_width_float; - } - if (std::is_same<T, double>::value) { + } else if constexpr (std::is_same_v<T, double>) { return dev_prop->native_vector_width_double; - } - if (std::is_same<T, ::sycl::half>::value) { + } else if constexpr (std::is_same_v<T, ::sycl::half>) { return dev_prop->native_vector_width_half; + } else { + throw std::invalid_argument( + "Invalid data type to fetch native vector width!"); } - throw std::invalid_argument( - "Invalid data type to fetch native vector width!"); } static inline bool syclHasFloat64( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->has_fp64; } diff --git a/src/comm/Runtime.h b/src/comm/Runtime.h index fa7daaf125..4fd44d08e5 100644 --- a/src/comm/Runtime.h +++ b/src/comm/Runtime.h @@ -4,10 +4,6 @@ namespace at::xpu { -static inline at::DeviceIndex getDeviceIndexOfCurrentQueue() { - return c10::xpu::getCurrentXPUStream().device_index(); -} - static inline sycl::queue& getCurrentSYCLQueue() { return c10::xpu::getCurrentXPUStream().queue(); } diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index c820a1c486..ba25869a2a 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -97,17 +97,17 @@ void checkSingleTensor( ) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); + } - // Skip the following requirements for P2P operations - if (!tensor.is_contiguous(tensor.suggest_memory_format())) { - if (p2p) { - TORCH_WARN_ONCE( - "Detected non-contiguous tensor in P2P operations. 
It is user " - "responsibility to guarantee that source and destination tensors have " - "the same contiguity format."); - } else { - C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); - } + // Skip the following requirements for P2P operations + if (!tensor.is_contiguous(tensor.suggest_memory_format())) { + if (p2p) { + TORCH_WARN_ONCE( + "Detected non-contiguous tensor in P2P operations. It is user " + "responsibility to guarantee that source and destination tensors have " + "the same contiguity format."); + } else { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); } } } @@ -322,7 +322,9 @@ bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { return true; } -ProcessGroupXCCL::Options::Options() : Backend::Options(XCCL_BACKEND_NAME) {} +ProcessGroupXCCL::Options::Options(bool is_high_priority_stream) + : Backend::Options(XCCL_BACKEND_NAME), + is_high_priority_stream(is_high_priority_stream) {} static std::atomic process_group_id = 0; @@ -350,8 +352,13 @@ const std::string& ProcessGroupXCCL::logPrefix() const { return logPrefix_; } +const int& ProcessGroupXCCL::globalRank() const { + static int globalRank = rank_; + return globalRank; +} + ProcessGroupXCCL::ProcessGroupXCCL( - const c10::intrusive_ptr& store, + c10::intrusive_ptr store, int rank, int size, c10::intrusive_ptr options) @@ -377,7 +384,10 @@ ProcessGroupXCCL::ProcessGroupXCCL( std::string torch_distributed_debug = getCvarString({"TORCH_DISTRIBUTED_DEBUG"}, OFF.c_str()); LOG(INFO) << logPrefix() << "ProcessGroupXCCL initialization options: " - << "size: " << size << ", global rank: " << rank_; + << "size: " << size << ", global rank: " << globalRank() + << ", USE_HIGH_PRIORITY_STREAM: " + << options_->is_high_priority_stream + << ", PG Name: " << options_->group_name; LOG(INFO) << logPrefix() << "ProcessGroupXCCL environments: " << "XCCL version: " << XcclVersion @@ -405,7 +415,7 @@ bool ProcessGroupXCCL::dumpDebuggingInfo(bool includeStackTrace /*=true*/) { if (traceBufferSize_ > 0) { // TODO: dump_xccl_trace auto xcclTrace = dump_xccl_trace(true, includeStackTrace, false); - DebugInfoWriter& writer = DebugInfoWriter::getWriter(rank_); + DebugInfoWriter& writer = DebugInfoWriter::getWriter(globalRank()); LOG(INFO) << logPrefix() << "ProcessGroupXCCL dumping xccl trace to " << writer.getWriterTarget(); writer.write(xcclTrace); @@ -432,17 +442,6 @@ void ProcessGroupXCCL::setEnqueuedPgStatus( pgStatus_->lastEnqueuedNumelOut = work->numelOut_; } -void ProcessGroupXCCL::setCompletedPgStatus( - c10::intrusive_ptr work) { - pgStatus_->lastCompletedSeq = static_cast(work->getSequencenumber()); - pgStatus_->lastCompletedWorkName = opTypeToString(work->opType_); - pgStatus_->lastCompletedNumelIn = work->numelIn_; - pgStatus_->lastCompletedNumelOut = work->numelOut_; - // To avoid complexity, we're not computing duration. - FlightRecorderXCCL::get()->retire_id( - work->trace_id_, /*compute_duration*/ false); -} - void ProcessGroupXCCL::setSequenceNumberForGroup() {} uint64_t ProcessGroupXCCL::getSequenceNumberForGroup() { @@ -460,7 +459,8 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( bool isP2P, const char* profilingTitle, const std::vector& inputs, - const std::vector& outputs) { + const std::vector& outputs, + bool record) { auto r = c10::make_intrusive( device, rank, @@ -471,20 +471,22 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( profilingTitle != nullptr ? 
std::optional>(inputs) : std::nullopt); - r->trace_id_ = FlightRecorderXCCL::get()->record( - local_id_, - std::make_tuple(pg_uid_, pg_desc_), // PG name tuple - seqCollective_, - seqP2P_, - op_id_, - profilingTitle ? profilingTitle : "", - inputs, - outputs, - nullptr, - r->xcclEndEvent_.get(), - options_->timeout, - pgStatus_, - isP2P); + if (record) { + r->trace_id_ = FlightRecorderXCCL::get()->record( + local_id_, + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + seqCollective_, + seqP2P_, + op_id_, + profilingTitle ? profilingTitle : "", + inputs, + outputs, + nullptr, + r->xcclEndEvent_.get(), + options_->timeout, + pgStatus_, + isP2P); + } return r; } @@ -534,9 +536,9 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( rank = p2pRank; } - c10::impl::VirtualGuardImpl impl(device.type()); - c10::Stream stream = - impl.getStreamFromGlobalPool(device, /*isHighPriority=*/false); + bool force_high = getCvarBool(TORCH_XCCL_HIGH_PRIORITY, false); + c10::Stream stream = at::xpu::getStreamFromPool( + options_->is_high_priority_stream || force_high); sycl::queue& q = c10::xpu::XPUStream(stream).queue(); auto ctx = ccl::create_context(q.get_context()); @@ -665,16 +667,19 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( const char* profilingTitle, bool nanCheck) { nanCheck &= enableNanCheck_; - seqCollective_++; auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); + if (!coalescing_state_) { + seqCollective_++; + } + op_id_++; + if (coalescing_state_ & CoalActive) { if ((coalescing_state_ & CoalColl) == 0) { seqCollective_++; } - op_id_++; coalescing_state_ |= CoalColl; if (coalescedDevice_.index() < 0) { coalescedDevice_ = device; @@ -715,8 +720,15 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( } c10::intrusive_ptr work; - work = - initWork(device, rank_, opType, false, profilingTitle, inputs, outputs); + work = initWork( + device, + rank_, + opType, + false, + profilingTitle, + inputs, + outputs, + !coalescing_state_); if (coalescing_state_) { FlightRecorderXCCL::get()->record( local_id_, @@ -772,8 +784,12 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); + auto id = work->trace_id_; work->future_->addCallback( - [this, work](at::ivalue::Future&) { this->setCompletedPgStatus(work); }); + [id](at::ivalue::Future&) { + FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); + }, + /*use_future*/ false); work->blockingWait_ = blockingWait_; work->numelIn_ = 0; @@ -884,9 +900,12 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); - work->future_->addCallback([this, work](at::ivalue::Future&) { - this->setCompletedPgStatus(work); - }); + auto id = work->trace_id_; + work->future_->addCallback( + [id](at::ivalue::Future&) { + FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); + }, + /*use_future*/ false); work->numelIn_ = work->numelOut_ = tensor.numel(); setEnqueuedPgStatus(work); @@ -2020,7 +2039,7 @@ c10::DeviceIndex ProcessGroupXCCL::guessDeviceId() const { return *usedDeviceIdxs_.begin(); } int devIdx = - static_cast(rank_ % at::detail::getXPUHooks().getNumGPUs()); + static_cast(globalRank() % at::detail::getXPUHooks().getNumGPUs()); LOG(WARNING) << logPrefix() << 
c10::str( diff --git a/src/xccl/ProcessGroupXCCL.hpp b/src/xccl/ProcessGroupXCCL.hpp index e7aa39c82d..bbc1f7cca8 100644 --- a/src/xccl/ProcessGroupXCCL.hpp +++ b/src/xccl/ProcessGroupXCCL.hpp @@ -24,6 +24,9 @@ #include namespace c10d { +static std::vector TORCH_XCCL_HIGH_PRIORITY = { + "TORCH_XCCL_HIGH_PRIORITY"}; + static std::vector TORCH_XCCL_BLOCKING_WAIT = { "TORCH_XCCL_BLOCKING_WAIT", "XCCL_BLOCKING_WAIT"}; @@ -118,18 +121,19 @@ class TORCH_API ProcessGroupXCCL : public Backend { }; struct Options : public Backend::Options { - explicit Options(); + explicit Options(bool is_high_priority_stream = false); - static c10::intrusive_ptr create() { - return c10::make_intrusive(); + static c10::intrusive_ptr create( + bool is_high_priority_stream = false) { + return c10::make_intrusive(is_high_priority_stream); } - + bool is_high_priority_stream; std::vector global_ranks_in_group; std::string group_name; }; ProcessGroupXCCL( - const c10::intrusive_ptr& store, + c10::intrusive_ptr store, int rank, int size, c10::intrusive_ptr options = Options::create()); @@ -138,11 +142,16 @@ class TORCH_API ProcessGroupXCCL : public Backend { const c10::intrusive_ptr& store, int rank, int size, - const std::string& groupName) - : ProcessGroupXCCL(store, rank, size) {} + const std::string& groupName, + c10::intrusive_ptr options = Options::create()) + : ProcessGroupXCCL(store, rank, size, std::move(options)) {} ~ProcessGroupXCCL() override; + c10::intrusive_ptr getOptions() { + return options_; + } + const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); } @@ -171,7 +180,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { bool isP2P, const char* profilingTitle = nullptr, const std::vector& inputs = {}, - const std::vector& outputs = {}); + const std::vector& outputs = {}, + bool record = false); template c10::intrusive_ptr collective( @@ -414,9 +424,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::DeviceIndex guessDeviceId() const; const std::vector& groupRanks() const; + const int& globalRank() const; void setEnqueuedPgStatus(c10::intrusive_ptr work); - void setCompletedPgStatus( - c10::intrusive_ptr work); bool dumpDebuggingInfo(bool includeStackTrace = true); protected: diff --git a/src/xccl/ProcessGroupXCCLMonitor.cpp b/src/xccl/ProcessGroupXCCLMonitor.cpp index 68fdd402c1..246bdf3a59 100644 --- a/src/xccl/ProcessGroupXCCLMonitor.cpp +++ b/src/xccl/ProcessGroupXCCLMonitor.cpp @@ -39,7 +39,7 @@ void HeartbeatMonitorXCCL::runLoop() { // We only need to dump once per PG, so we use local_id_ == 0 for the first PG if (pg_->local_id_ == 0) { // DumpPipe is one per-trainer process - dumpPipe.emplace(pg_->getRank()); + dumpPipe.emplace(pg_->globalRank()); while (true) { std::unique_lock lock(monitorMutex_); if (monitorWakeUpCV_.wait_for( diff --git a/test/regressions/test_grid_sample.py b/test/regressions/test_grid_sample.py new file mode 100644 index 0000000000..a7c28c10a9 --- /dev/null +++ b/test/regressions/test_grid_sample.py @@ -0,0 +1,19 @@ +# Owner(s): ["module: intel"] +import torch +import torch.nn.functional as F +from torch.testing._internal.common_utils import TestCase + +cpu_device = torch.device("cpu") +xpu_device = torch.device("xpu") + + +class TestSimpleCopy(TestCase): + # Refer to https://github.com/pytorch/pytorch/issues/153996 + def test_grid_sample(self, dtype=torch.float): + input_cpu = torch.rand(1, 2, 5, 5, device=cpu_device) + grid_cpu = torch.rand(1, 3, 3, 2, device=cpu_device) + out_cpu = F.grid_sample(input_cpu, grid_cpu, 
align_corners=False) + input_xpu = input_cpu.to(xpu_device) + grid_xpu = grid_cpu.to(xpu_device) + out_xpu = F.grid_sample(input_xpu, grid_xpu, align_corners=False) + self.assertEqual(out_cpu, out_xpu.to(cpu_device)) diff --git a/test/xpu/distributed/test_c10d_xccl.py b/test/xpu/distributed/test_c10d_xccl.py index 44a3ac148a..062b502781 100644 --- a/test/xpu/distributed/test_c10d_xccl.py +++ b/test/xpu/distributed/test_c10d_xccl.py @@ -1,16 +1,21 @@ # Owner(s): ["oncall: distributed"] +import json import math import os +import pickle import random import signal import sys +import tempfile +import threading import time -from datetime import timedelta +from datetime import datetime, timedelta from enum import auto, Enum from unittest import mock import torch +import torch._C._distributed_c10d import torch.distributed as c10d import torch.distributed._functional_collectives as _functional_collectives @@ -365,6 +370,31 @@ def test_nan_assert(self, type): # reset env os.environ["TORCH_XCCL_NAN_CHECK"] = "0" + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_oom(self): + pg = self._create_process_group_xccl() + dp_ranks = range(0, self.world_size) + dp_group = c10d.new_group(dp_ranks) + device = torch.device(f"xpu:{self.rank}") + torch.xpu.set_device(device) + + shape = (16384 * 2, 16384 * 2) + weight = torch.ones(shape, device=device).half() + gradient = torch.zeros(shape, device=device).half() + ret = torch.randn(shape, device=device).half() + + for iter in range(50): + output = torch.empty_like(ret) + output = ret + weight + gradient + ret = torch.nn.functional.linear(output, weight=ret) + dist.all_reduce(ret, op=dist.ReduceOp.SUM) + torch.xpu.synchronize() + self.assertLess( + torch.xpu.max_memory_allocated(), + torch.xpu.max_memory_reserved() * 2, + ) + class CommTest(MultiProcessTestCase): @property @@ -641,7 +671,7 @@ def test_unwaited(self) -> None: with _functional_collectives.allow_inflight_collective_as_graph_input_ctx(): self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) input = torch.full( - (10240, 10240), float(self.rank), device=f"xpu:{self.rank}" + (1024, 1024), float(self.rank), device=f"xpu:{self.rank}" ) dist.all_reduce(input, op=dist.ReduceOp.SUM, async_op=True) # Non-functional collectives run under the context manager is registered in the work registry. @@ -653,7 +683,7 @@ def test_unwaited(self) -> None: # Case 2: Run collectives not under context manager, and don't call wait on them. # NOTE: Here we intentionally test memory-stressed case. 
self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 2) - for _ in range(50000): + for _ in range(500): input = torch.full( (1024, 1024), float(self.rank), device=f"xpu:{self.rank}" ) @@ -708,6 +738,564 @@ def test_wait_tensor(self) -> None: self.assertEqual(input1, input2) +class XCCLTraceTestBase(MultiProcessTestCase): + def setUp(self): + super().setUp() + os.environ["TORCH_FR_BUFFER_SIZE"] = "1000" + self.tempdir = tempfile.TemporaryDirectory() + os.environ["TORCH_FR_DUMP_TEMP_FILE"] = self._trace_basename() + os.environ["TORCH_FR_DEBUG_INFO_PIPE_FILE"] = self._trace_basename() + self._spawn_processes() + + @classmethod + def _run( + cls, + parent_conn, + rank: int, + test_name: str, + file_name: str, + parent_pipe, + **kwargs, + ) -> None: + cls.parent = parent_conn + super()._run(rank, test_name, file_name, parent_pipe) + + @property + def local_device(self): + return torch.device("xpu", self.rank_to_GPU[self.rank][0]) + + def _join_processes(self, fn): + # We need to patch sys.exit() as skip_if will use sys.exit() and + # the exit code from the this process will not be caught. + with mock.patch("sys.exit"): + fn() + super()._join_processes(fn) + + def _spawn_processes(self) -> None: + proc = torch.multiprocessing.get_context("spawn").Process + self.children_pipes = [] + parent_pipes = [] + for _ in range(self.world_size): + parent_conn, child_conn = torch.multiprocessing.Pipe() + self.children_pipes.append(child_conn) + parent_pipes.append(parent_conn) + piter = iter(parent_pipes) + + def wrap(*positional, args, **kwargs): + args = (next(piter), *args) + return proc(*positional, args=args, **kwargs) + + self._start_processes(wrap) + + def _create_process_group_xccl( + self, timeout=timedelta(seconds=600), device_id=None + ): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + "xccl", + world_size=self.world_size, + rank=self.rank, + store=store, + timeout=timeout, + device_id=device_id, + ) + pg = c10d.distributed_c10d._get_default_group() + return pg + + def tearDown(self): + super().tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + @property + def world_size(self): + return 2 + + @property + def rank_to_GPU(self): + # return rank to GPU map + return init_multigpu_helper(self.world_size, "xccl") + + def _trace_basename(self): + # we pass the base to the env, and the dump util will append rank + return os.path.join(self.tempdir.name, "trace_") + + def _trace_name(self, rank): + return self._trace_basename() + str(rank) + + def started_or_scheduled(self, timing_enabled=False): + return "started" if timing_enabled else "scheduled" + + +class XCCLTraceTest(XCCLTraceTestBase): + def _verify_trace(self, t, include_collectives, is_json, timing_enabled=False): + ver = t["version"] + self.assertEqual(ver, "2.10") + comm_lib_version = t["comm_lib_version"] + torch_comm_lib_version = torch._C._distributed_c10d.get_xccl_version() + self.assertEqual(comm_lib_version, torch_comm_lib_version) + pg_config = t["pg_config"] + self.assertEqual(len(pg_config), 1) + default_pg_info = pg_config["0"] + self.assertIn("name", default_pg_info) + self.assertIn("desc", default_pg_info) + self.assertIn("ranks", default_pg_info) + pg_status = t["pg_status"] + self.assertEqual(len(pg_status), 1) + self.assertEqual(str(pg_status["0"]["last_enqueued_collective"]), "2") + self.assertEqual(str(pg_status["0"]["last_completed_collective"]), "2") + self.assertEqual( + str(pg_status["0"]["last_started_collective"]), + "2" if timing_enabled 
else "-1", + ) + global_ranks = pg_config["0"]["ranks"] + self.assertEqual(len(json.loads(global_ranks)), self.world_size) + if include_collectives: + self.assertEqual(len(t["entries"]), 2) + t = t["entries"] + last = t[-1] + self.assertEqual(last["thread_id"], str(threading.current_thread().ident)) + self.assertEqual(last["thread_name"], "fr_test_thread") + self.assertEqual(last["process_group"], ("0", "default_pg")) + # self.assertEqual(last["state"], "completed") # Watchdog will fix marking works completed + s = last["time_discovered_started_ns"] + f = last["time_discovered_completed_ns"] + self.assertEqual(last["record_id"], 1) + # self.assertIsNotNone(f) + if timing_enabled: + self.assertIsNotNone(s) + self.assertTrue(s <= f) + # we don't collect stack traces in JSON at the moment + if not is_json: + self.assertIn("test_c10d_xccl.py", str(last["frames"])) + self.assertEqual(last["input_sizes"], ((3, 4),)) + self.assertEqual(last["input_dtypes"], ["Float"]) + self.assertEqual(last["output_sizes"], ((3, 4),)) + self.assertEqual(last["output_dtypes"], ["Float"]) + self.assertEqual(last["collective_seq_id"], 2) + self.assertEqual(last["timeout_ms"], 600000) + now = datetime.now() + event_created_time = datetime.fromtimestamp( + last["time_created_ns"] / 1000000000 + ) + before_test = now - timedelta(minutes=1) + self.assertTrue(before_test < event_created_time < now) + if timing_enabled: + # very loose bounds, measured 0.036 ms on devgpu + self.assertTrue(0 < last["duration_ms"] < 100) + else: + self.assertTrue("duration_ms" not in last) + else: + self.assertTrue("entries" not in t) + + def load_libpthread_or_libc(self): + import ctypes.util + + for base in ("pthread", "c"): + path = ctypes.util.find_library(base) + if path: + try: + return ctypes.CDLL(path) + except OSError: + continue + raise RuntimeError("Could not load pthread or libc") + + # Directly set thread name using threading.current_thread().name does not work + # because we use pthread_getname_np to get the threadโ€™s OS-level name in C++ + def set_thread_name(self, name): + import ctypes + + lib = self.load_libpthread_or_libc() + pthread_self = lib.pthread_self + pthread_self.restype = ctypes.c_void_p + pthread_setname_np = lib.pthread_setname_np + pthread_setname_np.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + + # Get current pthread handle + tid = pthread_self() + + # Set name + pthread_setname_np(tid, name.encode()) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize("include_collectives", [True, False]) + def test_short_pickle(self, include_collectives, timing_enabled=False): + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + device = self.local_device + self.set_thread_name("fr_test_thread") + a = torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + # gah ok so now the duration_ms is populated best-effort since it can only happen outside "dump()" api + time.sleep(1) + t = pickle.loads( + torch._C._distributed_c10d._dump_xccl_trace( + includeCollectives=include_collectives + ) + ) + self._verify_trace( + t, + include_collectives=include_collectives, + is_json=True, + timing_enabled=timing_enabled, + ) + dist.destroy_process_group() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_dump_pipe(self): + def open_file_with_timeout(file_path, mode, timeout=1.0): + start_time = time.time() + while time.time() - start_time < 
timeout: + if os.path.exists(file_path): + return open(file_path, mode) + time.sleep(0.1) + raise FileNotFoundError + + if self.rank == self.MAIN_PROCESS_RANK: + for c in self.children_pipes: + self.assertEqual(c.recv(), "next") + + dump_file = self._trace_name(rank=0) + pipe_file = dump_file + ".pipe" + with open_file_with_timeout(pipe_file, "w") as f: + f.write("1\n") + with open_file_with_timeout(dump_file, "rb", timeout=10.0) as f: + self.assertTrue("all_reduce" in str(pickle.load(f))) + + for c in self.children_pipes: + c.send("next") + return + + pg = self._create_process_group_xccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + self.parent.send("next") + self.parent.recv() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_long(self): + os.environ["TORCH_FR_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + # test some other primitives to make sure + # their strings are valid + xs = [torch.ones(3, 4, device=device)] + pg.broadcast(xs).wait() + pg.allreduce(xs).wait() + # pg.reduce(xs).wait() //Currently failing on XPU + ys = [[torch.empty(3, 4, device=device) for _ in range(self.world_size)]] + pg.allgather(ys, xs).wait() + pg.reduce_scatter(xs, ys).wait() + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 10) + first = t[0] + last = t[-1] + self.assertEqual(last["profiling_name"], "xccl:all_reduce") + # self.assertEqual(last["state"], "completed") # Watchdog will fix marking works completed + self.assertIn("test_c10d_xccl.py", str(last["frames"])) + self.assertEqual(last["input_sizes"], ((3, 4),)) + self.assertEqual(last["input_dtypes"], ["Float"]) + self.assertEqual(last["output_sizes"], ((3, 4),)) + self.assertEqual(last["output_dtypes"], ["Float"]) + self.assertEqual(last["timeout_ms"], 600000) + self.assertEqual(last["collective_seq_id"] - first["collective_seq_id"], 9) + dist.destroy_process_group() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_barrier_profiling(self): + os.environ["TORCH_FR_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + f = pg.barrier() + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 2) + first = t[0] + last = t[-1] + self.assertEqual(first["profiling_name"], "xccl:all_reduce_barrier") + self.assertEqual(last["profiling_name"], "xccl:all_reduce") + dist.destroy_process_group() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize( + "op_sizes_per_coalesce", + [ + [(2, 3)], + [(2, 3), (5, 5), (1,)], + ], + ) + @parametrize("timing_enabled", [False]) + def test_batched_send_recv(self, op_sizes_per_coalesce, timing_enabled): + """ + 'WorkEnqueue' was skipped for isendirecv, leading to segfault on dump_entries when update_state tried to use + a destructed Work obj's xpu events + """ + + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + 
pg._enable_collectives_timing() + + num_coalesced_ops = 20 + ops_per_coalesce = len(op_sizes_per_coalesce) + for _ in range(num_coalesced_ops): + ops = [] + for input_sizes in op_sizes_per_coalesce: + tensor = torch.zeros(input_sizes).to(self.local_device) + if self.rank == 0: + ops.append(dist.P2POp(dist.irecv, tensor, 1)) + elif self.rank == 1: + tensor *= 2 + ops.append(dist.P2POp(dist.isend, tensor, 0)) + + dist.batch_isend_irecv(ops).pop().wait() + + torch.xpu.synchronize(device=self.local_device) + + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + self.assertEqual(len(t["entries"]), num_coalesced_ops * ops_per_coalesce) + + expected_record_id = 0 + expected_seq = 1 + expected_op_id = 1 + for seq in range(num_coalesced_ops): + first_op = seq * (ops_per_coalesce) + coalesced_op = first_op + ops_per_coalesce + for p2p_op_idx, input_sizes in zip( + range(first_op, coalesced_op, 1), op_sizes_per_coalesce + ): + # the indivudal ops inside the coalescing group the individual op metadata, + # but not the timing info coming from the actual coalesced kernel + profiling_name = ( + "xccl:recv 0<-1" if self.rank == 0 else "xccl:send 1->0" + ) + self.assertEqual( + t["entries"][p2p_op_idx]["record_id"], expected_record_id + ) + expected_record_id += 1 + self.assertEqual( + t["entries"][p2p_op_idx]["profiling_name"], profiling_name + ) + # we don't increment collective_seq_id for p2p ops. + self.assertEqual(t["entries"][p2p_op_idx]["collective_seq_id"], 0) + self.assertEqual(t["entries"][p2p_op_idx]["p2p_seq_id"], expected_seq) + self.assertEqual(t["entries"][p2p_op_idx]["op_id"], expected_op_id) + expected_op_id += 1 + self.assertEqual(t["entries"][p2p_op_idx]["input_sizes"], [input_sizes]) + self.assertEqual( + t["entries"][p2p_op_idx]["output_sizes"], [input_sizes] + ) + # duration doesn't get tagged onto individual ops yet, nor is their state updated + self.assertEqual(t["entries"][p2p_op_idx]["state"], "scheduled") + self.assertTrue("duration_ms" not in t["entries"][p2p_op_idx]) + + # coalesced ops not yet supported in FR + expected_seq += 1 + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize( + "op_sizes", + [ + [(2, 3)], + [(2, 3), (5, 5), (1,)], + ], + ) + @parametrize("timing_enabled", [False]) + def test_individual_send_recv(self, op_sizes, timing_enabled): + """ + 'WorkEnqueue' was skipped for isendirecv, leading to segfault on dump_entries when update_state tried to use + a destructed Work obj's xpu events + """ + + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + num_repeats = 10 + ops_per_repeat = len(op_sizes) + for _ in range(num_repeats): + for input_sizes in op_sizes: + tensor = torch.zeros(input_sizes).to(self.local_device) + if self.rank == 0: + dist.recv(tensor, 1) + elif self.rank == 1: + tensor *= 2 + dist.send(tensor, 0) + + torch.xpu.synchronize(device=self.local_device) + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + self.assertEqual(len(t["entries"]), num_repeats * (ops_per_repeat)) + expected_seq = 1 + expected_op_id = 1 + for seq in range(num_repeats * ops_per_repeat): + input_sizes = op_sizes[seq % ops_per_repeat] + profiling_name = "xccl:recv 0<-1" if self.rank == 0 else "xccl:send 1->0" + 
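# Flight recorder entries are named from each rank's own perspective:
# rank 0 records the receive as "xccl:recv 0<-1", while rank 1 records
# the matching send as "xccl:send 1->0".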
self.assertEqual(t["entries"][seq]["profiling_name"], profiling_name) + # we don't increment collective_seq_id for p2p ops. + self.assertEqual(t["entries"][seq]["collective_seq_id"], 0) + self.assertEqual(t["entries"][seq]["p2p_seq_id"], expected_seq) + expected_seq += 1 + self.assertEqual(t["entries"][seq]["op_id"], expected_op_id) + expected_op_id += 1 + self.assertEqual(t["entries"][seq]["input_sizes"], [input_sizes]) + self.assertEqual(t["entries"][seq]["output_sizes"], [input_sizes]) + self.assertEqual(t["entries"][seq]["state"], "completed") + + if timing_enabled: + duration = t["entries"][seq]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][seq]) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize("timing_enabled", [False]) + def test_allgather_uneven(self, timing_enabled): + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + + output_split_sizes = [i + 1 for i in range(self.world_size)] + sum_len = sum(output_split_sizes) + output_tensor = torch.zeros(sum_len, 2).to(self.rank) + expected_tensor = torch.ones(sum_len, 2).to(self.rank) + input_tensor = torch.ones(output_split_sizes[self.rank], 2).to(self.rank) + + dist.all_gather( + list(torch.split(output_tensor, output_split_sizes)), input_tensor + ) + torch.xpu.synchronize(device=self.rank) + self.assertEqual(output_tensor, expected_tensor) + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + self.assertEqual(len(t["entries"]), self.world_size) + for i in range(self.world_size): + self.assertEqual(t["entries"][i]["profiling_name"], "xccl:_broadcast_oop") + # collective_seq_id should be incremented once. + self.assertEqual(t["entries"][i]["collective_seq_id"], 1) + self.assertEqual(t["entries"][i]["input_sizes"], [[i + 1, 2]]) + self.assertEqual( + t["entries"][i]["output_sizes"], + [[i + 1, 2]], + ) + self.assertEqual(t["entries"][i]["state"], "scheduled") + # No event is recorded for individual ops + self.assertTrue("time_discovered_completed_ns" in t["entries"][i]) + # TODO: (frost-intel) Add coalesced op recording for FR + + # TODO(whc) test out other ops (And combinations of ops, if that's valid?) + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize("timing_enabled", [False]) + def test_coalescing_manager_collective(self, timing_enabled): + """ + The coalescing manager api works by accumulating operations in python via a contextmanager, and then making + one call into c++ to an _coalesced API. It has limited support for ops and has been added recently to + avoid overheads of making individual py-cpp calls. This complicates flight recording.. + + For now, flight recording of coalescing_manager collectives is less detailed than cpp coalesced collectives. 
+ """ + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + + output_tensors = torch.zeros(2, 2).to(self.rank) + input_tensors = [torch.ones(2, 2).to(self.rank) for _ in range(self.world_size)] + + # TODO(whc) make this work with bigger world or something + self.assertEqual(self.world_size, 2, self.world_size) + + with dist._coalescing_manager(): + for i in range(self.world_size): + dist.reduce_scatter_tensor(output_tensors[i], input_tensors[i]) + self.assertEqual(output_tensors, input_tensors[self.rank] * self.world_size) + + torch.xpu.synchronize(device=self.rank) + + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + + self.assertEqual( + len(t["entries"]), 1 + ) # one for the reduce_scatter_tensor_coalesced + self.assertEqual( + t["entries"][0]["profiling_name"], "xccl:reduce_scatter_tensor_coalesced" + ) + # collective_seq_id should be incremented once. + self.assertEqual(t["entries"][0]["collective_seq_id"], 1) + self.assertEqual(t["entries"][0]["input_sizes"], [[2, 2], [2, 2]]) + self.assertEqual( + t["entries"][0]["output_sizes"], + [ + [ + 2, + ], + [ + 2, + ], + ], + ) + self.assertEqual(t["entries"][0]["state"], "completed") + if timing_enabled: + duration = t["entries"][0]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][0]) + + +instantiate_parametrized_tests(XCCLTraceTest) instantiate_parametrized_tests(ProcessGroupXCCLTest) diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py index ddde5f8c8a..cd04712091 100644 --- a/test/xpu/run_distributed.py +++ b/test/xpu/run_distributed.py @@ -9,9 +9,41 @@ res2 = 0 fail_test = [] -# libfabric WA to avoid hang issue -os.environ["FI_PROVIDER"] = "tcp" -# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3" +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) # run python test @@ -26,6 +58,10 @@ def run(test_command): test_command = ["python", "distributed/test_c10d_ops_xccl.py"] res += run(test_command) +test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"] +res += run(test_command) +test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"] +res += run(test_command) # run pytest with skiplist for key in skip_dict: @@ -38,8 +74,4 @@ def run(test_command): if fail_test: print(",".join(fail_test) + " have failures") -exit_code = os.WEXITSTATUS(res2) -if exit_code == 0: - sys.exit(res) -else: - 
sys.exit(exit_code) +sys.exit(res) diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py index ae3a8b8a93..9eeb6b44c4 100644 --- a/test/xpu/skip_list_common.py +++ b/test/xpu/skip_list_common.py @@ -74,12 +74,6 @@ "test_compare_cpu_linalg_lu_factor_ex_xpu_float32", "test_compare_cpu_linalg_lu_factor_xpu_float32", "test_compare_cpu_linalg_lu_xpu_float32", - # XPU hang. CUDA hang as well. - # https://github.com/pytorch/pytorch/issues/79528 - "test_compare_cpu_special_hermite_polynomial_he_xpu_float32", - "test_compare_cpu_special_hermite_polynomial_h_xpu_float32", - "test_compare_cpu_special_laguerre_polynomial_l_xpu_float32", - "test_compare_cpu_special_legendre_polynomial_p_xpu_float32", # core dump "test_dtypes__refs_nn_functional_pdist_xpu", # XFAIL of CUDA and XPU, unexpected success in fallback @@ -1022,10 +1016,6 @@ "test_ctc_loss_cudnn_tensor", # want "xpu" in function name # RuntimeError: reflection_pad2d_backward_xpu does not have a deterministic implementation, but you set 'torch.use_deterministic_algorithms(True)'. "test_ReflectionPad2d_large_deterministic_xpu", - # Case updated in pytorch commit 97272e4 - "test_hardswish_grad_corner_xpu_bfloat16", - "test_hardswish_grad_corner_xpu_float16", - "test_hardswish_grad_corner_xpu_float32", # x_cuda = x.clone().detach().to("cuda").requires_grad_(): Torch not compiled with CUDA enabled "test_layer_norm_backwards_eps", ), diff --git a/test/xpu/skip_list_dist.py b/test/xpu/skip_list_dist.py index 1210896ec6..cf5ed5cd78 100644 --- a/test/xpu/skip_list_dist.py +++ b/test/xpu/skip_list_dist.py @@ -1,105 +1,36 @@ skip_dict = { - "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( - "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", - "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", - "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", - "test_checkpoint_submodule_use_reentrant_False_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( "test_ddp_parity_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, - "../../../../test/distributed/fsdp/test_fsdp_core.py": ( - "test_delayed_optim_step_offload_false_no_shard_xpu", - "test_delayed_optim_step_offload_false_none_xpu", - "test_delayed_optim_step_offload_false_shard_grad_op_xpu", - "test_delayed_optim_step_offload_true_none_xpu", - "test_delayed_optim_step_offload_true_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_false_no_shard_xpu", - "test_delayed_reduce_scatter_offload_false_none_xpu", - "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_true_none_xpu", - "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_offload_false_no_shard_xpu", - "test_mixture_of_experts_offload_false_none_xpu", - "test_mixture_of_experts_offload_false_shard_grad_op_xpu", - "test_mixture_of_experts_offload_true_no_shard_xpu", - "test_mixture_of_experts_offload_true_none_xpu", - "test_mixture_of_experts_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", - 
"test_mixture_of_experts_with_delay_before_free_offload_true_no_shard_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_false_no_shard_xpu", - "test_nested_always_wrap_model_offload_false_none_xpu", - "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_true_none_xpu", - "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_false_no_shard_xpu", - "test_nested_wrapped_model_offload_false_none_xpu", - "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_true_none_xpu", - "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", - "test_transformer_offload_false_no_shard_xpu", - "test_transformer_offload_false_none_xpu", - "test_transformer_offload_false_shard_grad_op_xpu", - "test_transformer_offload_true_none_xpu", - "test_transformer_offload_true_shard_grad_op_xpu", - # https://github.com/intel/torch-xpu-ops/issues/1475 - "test_transformer_no_grad_mixed_precision_True_xpu", - "test_transformer_no_grad_mixed_precision_False_xpu", - ), - # Will add them back after debugging - # "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_fsdp_init_with_device_mesh_is_even_sharded_model_False_xpu", - # "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu", - # "test_raises_warning_or_errors_xpu", - # ), - "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": ( - "test_invalid_first_iter_order_sharding_strategy1_xpu", - "test_train_eval_sharding_strategy1_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( "test_parity_with_non_frozen_fsdp_xpu", "test_parity_with_ddp_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, - # will bring back after oneccl upgrade to 2021.16.1 - # "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, 
"../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": ( "test_transformer_no_grad_mixed_precision_True_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, - # Will add them back after debugging - # "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": ( - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_xpu", - # "test_hsdp_init_with_device_mesh_xpu", - # "test_root_module_is_not_FSDP_xpu", - # ), "../../../../test/distributed/fsdp/test_utils.py": None, "distributed/test_c10d_xccl.py": ( - # will bring back after oneccl upgrade to 2021.16.1 - "test_xccl_barrier", + # https://github.com/intel/torch-xpu-ops/issues/2046 + "test_unwaited", + ), + "distributed/test_c10d_ops_xccl.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # depends on https://github.com/pytorch/pytorch/pull/159473 + "test_tracing_with_fakepg_xpu", ), + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_memory_tracker.py": None, } diff --git a/test/xpu/test_tensor_creation_ops_xpu.py b/test/xpu/test_tensor_creation_ops_xpu.py index 95a30716ea..077fbb6ad5 100644 --- a/test/xpu/test_tensor_creation_ops_xpu.py +++ b/test/xpu/test_tensor_creation_ops_xpu.py @@ -4371,6 +4371,14 @@ def test_full_like_inference(self, device): torch.full_like(like, 1.0, dtype=torch.complex64).dtype, torch.complex64 ) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_zeros_large(self, device, dtype): + output = torch.zeros(2**31 - 1, device=device, dtype=dtype) + + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_ones_large(self, device, dtype): + output = torch.ones(2**31 - 1, device=device, dtype=dtype) + # Tests for the `frombuffer` function (only work on CPU): # Constructs tensors from Python objects that implement the buffer protocol, diff --git a/tools/codegen/install_xpu_headers.py b/tools/codegen/install_xpu_headers.py index 807a509bc6..0b5ad70660 100644 --- a/tools/codegen/install_xpu_headers.py +++ b/tools/codegen/install_xpu_headers.py @@ -4,6 +4,7 @@ import shutil from pathlib import Path +VERBOSE = False parser = argparse.ArgumentParser(description="Utils for append ops headers") parser.add_argument( @@ -20,34 +21,53 @@ def append_xpu_function_header(src, dst): r""" Cleans trailing empty lines from the destination file, then appends #include lines from the source file that match `#include \s*\r?\n" matches = re.findall(pattern, src_text, re.MULTILINE) if not matches: return - with open(dst, "r+", encoding="utf-8") as f: - dst_lines = f.readlines() - dst_text = "".join(dst_lines) - missing_headers = [match for match in matches if match not in dst_text] - if not missing_headers: - return + try: + with open(dst, encoding="utf-8") as f: + dst_lines = f.readlines() + dst_text = "".join(dst_lines) + except OSError as e: + if VERBOSE: + 
print(f"Warning: Could not read destination file {dst}: {e}") + return + + missing_headers = [match for match in matches if match not in dst_text] + if not missing_headers: + return + + new_dst_lines = dst_lines.copy() + + while new_dst_lines and not new_dst_lines[-1].strip(): + new_dst_lines.pop() + new_dst_lines.extend(missing_headers) - # Remove trailing empty lines from dst_lines - while dst_lines and not dst_lines[-1].strip(): - dst_lines.pop() + new_content = "".join(new_dst_lines) + old_content = "".join(dst_lines) - f.seek(0) - f.truncate() - f.writelines(dst_lines) - # Append missing headers to the end of the file - f.writelines(missing_headers) + if new_content != old_content: + try: + with open(dst, "w", encoding="utf-8") as f: + f.writelines(new_dst_lines) + except OSError as e: + if VERBOSE: + print(f"Error: Could not write to {dst}: {e}") def parse_ops_headers(src): @@ -78,18 +98,36 @@ def classify_ops_headers(src_dir, dst_dir): def generate_xpu_ops_headers_cmake(src_dir, dst_dir, xpu_ops_headers): r""" - Generate XPU ops headers xpu_ops_generated_headers.cmake + Generate XPU ops headers xpu_ops_generated_headers.cmake only if content changes """ - with open(os.path.join(src_dir, "xpu_ops_generated_headers.cmake"), "w", encoding="utf-8") as fw: - fw.write("set(xpu_ops_generated_headers\n") - for header in xpu_ops_headers: - fw.write(f' "{Path(os.path.join(dst_dir, header)).as_posix()}"\n') - fw.write(")\n") + output_file = os.path.join(src_dir, "xpu_ops_generated_headers.cmake") + + # Generate new content + new_content = "set(xpu_ops_generated_headers\n" + for header in xpu_ops_headers: + new_content += f' "{Path(os.path.join(dst_dir, header)).as_posix()}"\n' + new_content += ")\n" + + # Check if file exists and has same content + should_write = True + if os.path.exists(output_file): + try: + with open(output_file, encoding="utf-8") as f: + existing_content = f.read() + should_write = existing_content != new_content + except OSError: + # If we can't read the file, write it anyway + should_write = True + + if should_write: + with open(output_file, "w", encoding="utf-8") as fw: + fw.write(new_content) def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): r""" For XPU-specific ops headers, copy them to destination build and append XPU declarations to common headers. + Copies and appends are done only if leading to file changes to prevent unnecessary recompilations. """ if args.dry_run: return @@ -99,7 +137,16 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): # assert "xpu" in f, f"Error: The function signature or namespace in '{f}' is incorrect. Expected 'xpu' to be present." 
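# Copy each XPU-specific header only when the destination is missing or its
# bytes differ; unchanged headers keep their timestamps and do not trigger
# needless recompilation.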
src = os.path.join(src_dir, f) dst = os.path.join(dst_dir, f) - shutil.copy(src, dst) + # Only copy if src and dst differ or dst does not exist + should_copy = True + if os.path.exists(dst): + try: + with open(src, "rb") as fsrc, open(dst, "rb") as fdst: + should_copy = fsrc.read() != fdst.read() + except OSError: + should_copy = True + if should_copy: + shutil.copy(src, dst) for f in common_headers: src = os.path.join(src_dir, f) @@ -118,6 +165,7 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): with open(dst, "r+", encoding="utf-8") as f: dst_lines = f.readlines() dst_text = "".join(dst_lines) + old_content = "".join(dst_lines) missing_declarations = [] insertion_index = None for index, line in enumerate(dst_lines): @@ -133,9 +181,10 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): break assert (insertion_index is not None), f"Error: No TORCH_API declaration found in {dst}." - f.seek(0) - f.writelines(dst_lines) - f.truncate() + if old_content != "".join(dst_lines): + f.seek(0) + f.writelines(dst_lines) + f.truncate() def main(): diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml index 2df8464262..a3281791de 100644 --- a/yaml/native/native_functions.yaml +++ b/yaml/native/native_functions.yaml @@ -2071,6 +2071,15 @@ XPU: nonzero_xpu tags: [dynamic_output_shape, core] +- func: nonzero_static.out(Tensor self, *, SymInt size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!) + dispatch: + XPU: nonzero_static_out_xpu + +- func: nonzero_static(Tensor self, *, SymInt size, int fill_value=-1) -> Tensor + variants: method, function + dispatch: + XPU: nonzero_static_xpu + - func: maximum(Tensor self, Tensor other) -> Tensor structured_delegate: maximum.out device_check: NoCheck # TensorIterator
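For context, a minimal usage sketch (not part of the patch) of the `nonzero_static` overloads that the YAML entries above register for XPU; it assumes a PyTorch build with XPU support and these kernels available:

```python
import torch

# Sketch only: assumes an XPU-enabled PyTorch build that dispatches
# nonzero_static to the nonzero_static_xpu kernels registered above.
x = torch.tensor([[0.0, 1.5], [2.0, 0.0]], device="xpu")

# nonzero_static returns exactly `size` index rows regardless of how many
# nonzero elements exist, padding any missing rows with `fill_value`.
idx = torch.nonzero_static(x, size=4, fill_value=-1)
print(idx.cpu())  # expected: [[0, 1], [1, 0], [-1, -1], [-1, -1]]

# The .out overload writes into a preallocated index tensor instead.
out = torch.empty((4, 2), dtype=torch.long, device="xpu")
torch.nonzero_static(x, size=4, fill_value=-1, out=out)
```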