From 4d38b5ef1da9c2e3fb6861b0737fa847a72b70b0 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 16 Sep 2025 09:38:12 +0800 Subject: [PATCH 01/27] [CI] Fix CICD test issues (#2027) 1. torchbench issues caused by deps installation 2. pt2e test dataset path and deps installation 3. op benchmark github access permission 4. enhance bisect search disable_distributed --- .github/actions/linux-testenv/action.yml | 16 +++- .github/actions/pt2e/action.yml | 2 + .github/workflows/_linux_e2e.yml | 1 - .github/workflows/_linux_e2e_summary.yml | 15 +-- .github/workflows/_linux_op_benchmark.yml | 4 +- .github/workflows/bisect_search.yml | 112 ++++++++-------------- .github/workflows/pull.yml | 2 + 7 files changed, 67 insertions(+), 85 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index cc5b9e0a70..246b94c919 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -99,7 +99,7 @@ runs: TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi fi - if [ "${{ github.event_name }}" == "pull_request" ];then + if [ "${{ github.event_name }}" == "pull_request" ] && [[ "${{ inputs.pytorch }}" != *"_wheel" ]];then cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops cd third_party/torch-xpu-ops else @@ -139,7 +139,7 @@ runs: fi # for dlrm pip install pyre-extensions - curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install --no-deps + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install # for soft_actor_critic, temp fix pip install git+https://github.com/nocoding03/gym@fix-np cd ../pytorch @@ -152,10 +152,13 @@ runs: TORCHBENCH_COMMIT_ID="$(git rev-parse --short HEAD)" sed -i 's/^ *pynvml.*//' requirements.txt pip install -r requirements.txt - python install.py --continue_on_fail + # python install.py --continue_on_fail + echo "PYTHONPATH=${PWD}:${PYTHONPATH}" >> ${GITHUB_ENV} + pip install dominate + python install.py Super_SloMo # for dlrm pip install pyre-extensions - curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install --no-deps + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install cd ../pytorch else pip install -r ./.ci/docker/requirements-ci.txt @@ -170,6 +173,11 @@ runs: else pip install torchao --pre --index-url https://download.pytorch.org/whl/nightly/xpu fi + if [ "${{ inputs.suite }}" != "None" ];then + # To install numpy 1.x for benchmarks as CUDA + # yolov requires numpy>=1.23 + pip install -U numpy==1.26.4 + fi - name: Torch Config shell: bash -xe {0} run: | diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index 8704ed0fc6..60954ffd29 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -29,6 +29,7 @@ runs: shell: bash -xe {0} run: | # dataset + dataset_dir="${RUNNER_TEMP}/_datasets/imagenet" if [ ! 
-d ${dataset_dir} ];then rm -rf ${dataset_dir} && mkdir -p ${dataset_dir} && cd ${dataset_dir} wget -O valprep.sh https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh @@ -36,6 +37,7 @@ runs: tar -xf ILSVRC2012_img_val.tar bash valprep.sh fi + echo "dataset_dir=${dataset_dir}" >> ${GITHUB_ENV} - name: PT2E Test (${{ inputs.dt }} ${{ inputs.scenario }}) shell: bash -xe {0} run: | diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index ab49d28490..cd2056ff36 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -87,7 +87,6 @@ jobs: cpus_per_xpu: ${{ needs.runner.outputs.cpus_per_xpu }} MODEL_ONLY_NAME: ${{ inputs.model }} AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - dataset_dir: ${{ runner.temp }}/../_datasets/imagenet steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 diff --git a/.github/workflows/_linux_e2e_summary.yml b/.github/workflows/_linux_e2e_summary.yml index 427a35ca82..f2026a9f5a 100644 --- a/.github/workflows/_linux_e2e_summary.yml +++ b/.github/workflows/_linux_e2e_summary.yml @@ -30,14 +30,15 @@ jobs: steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Install gh-cli - run: | - sudo apt-get update - sudo apt-get install gh rsync ca-certificates -y - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: python-version: ${{ inputs.python }} + - name: Install gh-cli + run: | + sudo apt-get update + sudo apt-get install gh rsync ca-certificates -y + pip install pandas requests - name: Download Target Artifact run: | mkdir target/ @@ -64,7 +65,7 @@ jobs: - name: Get summary if: ${{ ! cancelled() }} run: | - pip install pandas requests + exit_label=0 e2e_summary_csv="$(find ./target/ -name "inductor_*.csv" |head -n 1)" if [ -f "${e2e_summary_csv}" ];then bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} @@ -72,17 +73,17 @@ jobs: if [ ${exit_label} -ne 0 ];then grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} fi fi pt2e_summary_csv="$(find ./target/ -name "summary.csv")" if [ -f "${pt2e_summary_csv}" ];then cat ${pt2e_summary_csv} - failed_num=$(grep -c ',failed' ${pt2e_summary_csv}) + failed_num=$(grep -c ',failed' ${pt2e_summary_csv} || true) if [ ${failed_num} -ne 0 ];then echo "[Warning] PT2E has failures!" 
fi fi + exit ${exit_label} - name: Upload Reference Run ID if: ${{ endsWith(inputs.test_type, 'ly') }} run: | diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index b65d985008..d71458c9bf 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -50,8 +50,6 @@ jobs: op_benchmark: needs: runner runs-on: ${{ needs.runner.outputs.runner_id }} - permissions: - issues: write timeout-minutes: 900 container: image: mengfeili/intel-pvc-driver:1146-1136 @@ -93,6 +91,8 @@ jobs: op_benchmark_test_results_check: needs: op_benchmark runs-on: ubuntu-24.04 + permissions: + issues: write steps: - name: Install gh-cli run: | diff --git a/.github/workflows/bisect_search.yml b/.github/workflows/bisect_search.yml index 4adba0df6c..c2512acb33 100644 --- a/.github/workflows/bisect_search.yml +++ b/.github/workflows/bisect_search.yml @@ -45,42 +45,26 @@ jobs: get_runner: runs-on: ${{ inputs.runner }} outputs: - test_host: ${{ steps.runner-info.outputs.test_host }} - test_user: ${{ steps.runner-info.outputs.test_user }} - test_group: ${{ steps.runner-info.outputs.test_group }} + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} steps: - - name: Get runner info + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner id: runner-info - run: | - # get test runner - echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT} - echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} - echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} - # show host info - cat /etc/os-release - uname -a - source /opt/intel/oneapi/setvars.sh - sycl-ls - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - - name: Cleanup workspace - if: ${{ always() }} - run: | - # clean docker cache - docker stop $(docker ps -aq) || true - docker system prune -af || true - # clean files - ls -al - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + uses: ./.github/actions/get-runner biisect-search: needs: get_runner - runs-on: ${{ needs.get_runner.outputs.test_host }} + runs-on: ${{ needs.get_runner.outputs.runner_id }} container: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g - -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} + -u ${{ needs.get_runner.outputs.user_id }} --group-add ${{ needs.get_runner.outputs.render_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool SEARCH_COMMITS: ${{ inputs.search_commits }} @@ -95,33 +79,19 @@ jobs: HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - - name: Check runner - run: | - ls -al - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - sudo rm -rf /tmp/xpu-tool - - name: Setup python-${{ inputs.python }} - uses: actions/setup-python@v5 + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 with: - python-version: ${{ inputs.python }} - - name: Check runner - run: | - hostname && whoami && id - clinfo --list - gcc -v && g++ -v - which python && which pip - python -V - pip install -U pip wheel setuptools - pip list - uname -a - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - pip install cmake ninja pandas psutil scipy requests pybind11 - mkdir 
gs-logs gs-search - echo "Status,Acc,Perf,PyTorch,Torch-xpu-ops" > gs-logs/summary.csv + path: gs-scripts + - name: Prepare test env + uses: ./gs-scripts/.github/actions/linux-testenv + with: + pytorch: nightly_wheel + python: ${{ inputs.python }} - name: Install oneAPI DLE if: ${{ inputs.oneapi != 'installed' }} run: | - rm -rf ~/intel ~/.intel /tmp/intel + rm -rf ~/intel ~/.intel ${HOME}/intel if [ "${{ inputs.oneapi }}" == "2025.1" ];then ONEAPI_URL="https://registrationcenter-download.intel.com/akdlm/IRC_NAS/3435dc45-055e-4f7a-86b1-779931772404/intel-deep-learning-essentials-2025.1.3.7_offline.sh" elif [ "${{ inputs.oneapi }}" == "2025.2" ];then @@ -130,42 +100,42 @@ jobs: ONEAPI_URL="${{ inputs.oneapi }}" fi wget -q -O oneapi.sh "${ONEAPI_URL}" - bash oneapi.sh -a -s --eula accept --action install --install-dir /tmp/intel/oneapi - echo "XPU_ONEAPI_PATH=/tmp/intel/oneapi" >> ${GITHUB_ENV} - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - with: - path: gs-scripts - - name: Prepare source code + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} + + - name: Summary file run: | - git clone https://github.com/pytorch/pytorch gs-pytorch - cd gs-pytorch + mkdir -p gs-logs + echo "Status,Acc,Perf,PyTorch,Torch-xpu-ops" > gs-logs/summary.csv + - name: Get latest versions + run: | + cd pytorch LATEST_PT_COMMIT="$(git rev-parse HEAD)" - cd .. - git clone https://github.com/intel/torch-xpu-ops gs-torch-xpu-ops - cd gs-torch-xpu-ops + cd third_party/torch-xpu-ops LATEST_XPU_COMMIT="$(git rev-parse HEAD)" - cd .. echo "LATEST_PT_COMMIT=${LATEST_PT_COMMIT}" >> ${GITHUB_ENV} echo "LATEST_XPU_COMMIT=${LATEST_XPU_COMMIT}" >> ${GITHUB_ENV} - name: Prepare test env run: | - pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu if [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/huggingface.py"* ]];then - pip install transformers==4.44.2 + pip install transformers elif [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/timm_models.py"* ]];then - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) + pip install timm elif [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/torchbench.py"* ]];then model_name="$(echo ${{ inputs.search_case }} |sed 's+.*\--only *++;s/ .*//')" git clone https://github.com/pytorch/benchmark gs-benchmark cd gs-benchmark + pip install -r requirements.txt echo "PYTHONPATH=${PWD}:${PYTHONPATH}" >> ${GITHUB_ENV} python install.py ${model_name} - else - pip install -r gs-pytorch/.ci/docker/requirements-ci.txt + cd .. fi - pip uninstall -y torch && pip uninstall -y torch + pip uninstall -y torchvision torchaudio + git clone https://github.com/pytorch/vision gs-vision + cd gs-vision + python setup.py install + cd .. + pip uninstall -y torch - name: Bisect search pytorch if: ${{ contains(inputs.search_commits, 'pytorch') }} run: | @@ -186,7 +156,7 @@ jobs: > ${{ github.workspace }}/gs-logs/search-${new_commit}-${LATEST_XPU_COMMIT}.log 2>&1 && echo $? 
|| echo $?)" new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)" if [ "${old_status}" != "${new_status}" ];then - cd gs-pytorch + cd pytorch git reset --hard rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/ git bisect start ${new_commit} ${old_commit} @@ -219,7 +189,7 @@ jobs: > ${{ github.workspace }}/gs-logs/search-${LATEST_PT_COMMIT}-${new_commit}.log && echo $? || echo $?)" new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)" if [ "${old_status}" != "${new_status}" ];then - cd gs-pytorch + cd pytorch git reset --hard rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/ git bisect start ${new_commit} ${old_commit} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index be6b260734..b5e8606ac8 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -115,6 +115,7 @@ jobs: with: runner: pvc_rolling pytorch: ${{ needs.conditions-filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }} ut: ${{ matrix.ut_name }} linux-distributed: @@ -129,6 +130,7 @@ jobs: with: runner: pvc_rolling pytorch: ${{ needs.conditions-filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }} ut: ${{ matrix.ut_name }} linux-e2e: From df1d7ad08c124f86db2fc9bdf05111322c7655b6 Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:57:02 +0800 Subject: [PATCH 02/27] [CI] Add ported distributed cases (#1945) This PR intends to add some ported distributed cases in torch-xpu-ops CI. - Add ZE_AFFINITY_MASK to ensure using Xelink. - Add CCL_ROOT for Xelink, this WA can be removed after oneCCL upgrade to 2021.16.2 - Increase distributed test time limit. Currently, the test part needs about 1 hour after add ported cases. 
disable_e2e disable_ut --- .github/actions/linux-uttest/action.yml | 6 +- .github/scripts/ut_result_check.sh | 3 +- .github/workflows/_linux_ut.yml | 3 +- test/xpu/run_distributed.py | 48 +++++++++-- test/xpu/skip_list_dist.py | 101 ++++-------------------- 5 files changed, 65 insertions(+), 96 deletions(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index fb79182006..6e2cf2ceeb 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -155,7 +155,7 @@ runs: tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log - name: xpu_distributed - shell: timeout 3600 bash -xeu -o pipefail {0} + shell: timeout 36000 bash -xeu -o pipefail {0} if: ${{ inputs.ut_name == 'xpu_distributed' }} run: | xpu-smi topology -m @@ -166,9 +166,13 @@ runs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi + export CCL_ROOT=$(dirname $(which python))/../ + export PATH="${CCL_ROOT}/bin/libfabric:${PATH}" + export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}" python run_distributed.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log + find ../ -type f -name "*.xml" -exec cp {} ${{ github.workspace }}/ut_log/ \; # Summary - name: UT Test Results Summary diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 0e6b95ec45..1aac532497 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -198,7 +198,8 @@ if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' fi if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log + grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log + sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 1751d9d972..6da0abadd1 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -99,11 +99,12 @@ jobs: test-in-baremetal: needs: runner + timeout-minutes: 600 if: ${{ contains(inputs.ut, 'distributed') }} runs-on: ${{ needs.runner.outputs.runner_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread -n 1 + PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1 steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py index ddde5f8c8a..cd04712091 100644 --- a/test/xpu/run_distributed.py +++ b/test/xpu/run_distributed.py @@ -9,9 +9,41 @@ res2 = 0 fail_test = [] -# libfabric WA to avoid hang issue -os.environ["FI_PROVIDER"] = "tcp" -# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3" +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = 
[x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) # run python test @@ -26,6 +58,10 @@ def run(test_command): test_command = ["python", "distributed/test_c10d_ops_xccl.py"] res += run(test_command) +test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"] +res += run(test_command) +test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"] +res += run(test_command) # run pytest with skiplist for key in skip_dict: @@ -38,8 +74,4 @@ def run(test_command): if fail_test: print(",".join(fail_test) + " have failures") -exit_code = os.WEXITSTATUS(res2) -if exit_code == 0: - sys.exit(res) -else: - sys.exit(exit_code) +sys.exit(res) diff --git a/test/xpu/skip_list_dist.py b/test/xpu/skip_list_dist.py index 1210896ec6..cf5ed5cd78 100644 --- a/test/xpu/skip_list_dist.py +++ b/test/xpu/skip_list_dist.py @@ -1,105 +1,36 @@ skip_dict = { - "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( - "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", - "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", - "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", - "test_checkpoint_submodule_use_reentrant_False_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( "test_ddp_parity_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, - "../../../../test/distributed/fsdp/test_fsdp_core.py": ( - "test_delayed_optim_step_offload_false_no_shard_xpu", - "test_delayed_optim_step_offload_false_none_xpu", - "test_delayed_optim_step_offload_false_shard_grad_op_xpu", - "test_delayed_optim_step_offload_true_none_xpu", - "test_delayed_optim_step_offload_true_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_false_no_shard_xpu", - "test_delayed_reduce_scatter_offload_false_none_xpu", - "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_true_none_xpu", - "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_offload_false_no_shard_xpu", - "test_mixture_of_experts_offload_false_none_xpu", - "test_mixture_of_experts_offload_false_shard_grad_op_xpu", - "test_mixture_of_experts_offload_true_no_shard_xpu", - "test_mixture_of_experts_offload_true_none_xpu", - "test_mixture_of_experts_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_no_shard_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", - 
"test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_false_no_shard_xpu", - "test_nested_always_wrap_model_offload_false_none_xpu", - "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_true_none_xpu", - "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_false_no_shard_xpu", - "test_nested_wrapped_model_offload_false_none_xpu", - "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_true_none_xpu", - "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", - "test_transformer_offload_false_no_shard_xpu", - "test_transformer_offload_false_none_xpu", - "test_transformer_offload_false_shard_grad_op_xpu", - "test_transformer_offload_true_none_xpu", - "test_transformer_offload_true_shard_grad_op_xpu", - # https://github.com/intel/torch-xpu-ops/issues/1475 - "test_transformer_no_grad_mixed_precision_True_xpu", - "test_transformer_no_grad_mixed_precision_False_xpu", - ), - # Will add them back after debugging - # "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_fsdp_init_with_device_mesh_is_even_sharded_model_False_xpu", - # "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu", - # "test_raises_warning_or_errors_xpu", - # ), - "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": ( - "test_invalid_first_iter_order_sharding_strategy1_xpu", - "test_train_eval_sharding_strategy1_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( "test_parity_with_non_frozen_fsdp_xpu", "test_parity_with_ddp_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, - # will bring back after oneccl upgrade to 2021.16.1 - # "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": ( 
"test_transformer_no_grad_mixed_precision_True_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, - # Will add them back after debugging - # "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": ( - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_xpu", - # "test_hsdp_init_with_device_mesh_xpu", - # "test_root_module_is_not_FSDP_xpu", - # ), "../../../../test/distributed/fsdp/test_utils.py": None, "distributed/test_c10d_xccl.py": ( - # will bring back after oneccl upgrade to 2021.16.1 - "test_xccl_barrier", + # https://github.com/intel/torch-xpu-ops/issues/2046 + "test_unwaited", + ), + "distributed/test_c10d_ops_xccl.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # depends on https://github.com/pytorch/pytorch/pull/159473 + "test_tracing_with_fakepg_xpu", ), + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_memory_tracker.py": None, } From 74b11bff8e68ff8f170277ffdae8082a3845f200 Mon Sep 17 00:00:00 2001 From: Chao Han Date: Wed, 17 Sep 2025 08:57:21 +0800 Subject: [PATCH 03/27] support high priority stream (#1715) Support high priority stream for xccl, test case add in https://github.com/intel/torch-xpu-ops/pull/2049 We need merge this pr first and upstream op register https://github.com/pytorch/pytorch/pull/163049 and then test case could be pass --------- Co-authored-by: mengfei25 --- src/xccl/ProcessGroupXCCL.cpp | 17 +++++++++++------ src/xccl/ProcessGroupXCCL.hpp | 23 ++++++++++++++++------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index c820a1c486..ba7c8561be 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -322,7 +322,9 @@ bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { return true; } -ProcessGroupXCCL::Options::Options() : Backend::Options(XCCL_BACKEND_NAME) {} +ProcessGroupXCCL::Options::Options(bool is_high_priority_stream) + : Backend::Options(XCCL_BACKEND_NAME), + is_high_priority_stream(is_high_priority_stream) {} static std::atomic process_group_id = 0; @@ -351,7 +353,7 @@ const std::string& ProcessGroupXCCL::logPrefix() const { } ProcessGroupXCCL::ProcessGroupXCCL( - const c10::intrusive_ptr& store, + c10::intrusive_ptr store, int rank, int size, c10::intrusive_ptr options) @@ -377,7 +379,10 @@ ProcessGroupXCCL::ProcessGroupXCCL( std::string torch_distributed_debug = getCvarString({"TORCH_DISTRIBUTED_DEBUG"}, OFF.c_str()); LOG(INFO) << logPrefix() << "ProcessGroupXCCL initialization options: " - << "size: " << size << ", global rank: " << rank_; + << "size: " << size << ", global rank: " << rank_ + << ", USE_HIGH_PRIORITY_STREAM: " + << options_->is_high_priority_stream + << ", PG Name: " << options_->group_name; LOG(INFO) << logPrefix() << "ProcessGroupXCCL environments: " << "XCCL version: " << XcclVersion @@ -534,9 +539,9 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( rank = 
p2pRank; } - c10::impl::VirtualGuardImpl impl(device.type()); - c10::Stream stream = - impl.getStreamFromGlobalPool(device, /*isHighPriority=*/false); + bool force_high = getCvarBool(TORCH_XCCL_HIGH_PRIORITY, false); + c10::Stream stream = at::xpu::getStreamFromPool( + options_->is_high_priority_stream || force_high); sycl::queue& q = c10::xpu::XPUStream(stream).queue(); auto ctx = ccl::create_context(q.get_context()); diff --git a/src/xccl/ProcessGroupXCCL.hpp b/src/xccl/ProcessGroupXCCL.hpp index e7aa39c82d..2516c0b912 100644 --- a/src/xccl/ProcessGroupXCCL.hpp +++ b/src/xccl/ProcessGroupXCCL.hpp @@ -24,6 +24,9 @@ #include namespace c10d { +static std::vector TORCH_XCCL_HIGH_PRIORITY = { + "TORCH_XCCL_HIGH_PRIORITY"}; + static std::vector TORCH_XCCL_BLOCKING_WAIT = { "TORCH_XCCL_BLOCKING_WAIT", "XCCL_BLOCKING_WAIT"}; @@ -118,18 +121,19 @@ class TORCH_API ProcessGroupXCCL : public Backend { }; struct Options : public Backend::Options { - explicit Options(); + explicit Options(bool is_high_priority_stream = false); - static c10::intrusive_ptr create() { - return c10::make_intrusive(); + static c10::intrusive_ptr create( + bool is_high_priority_stream = false) { + return c10::make_intrusive(is_high_priority_stream); } - + bool is_high_priority_stream; std::vector global_ranks_in_group; std::string group_name; }; ProcessGroupXCCL( - const c10::intrusive_ptr& store, + c10::intrusive_ptr store, int rank, int size, c10::intrusive_ptr options = Options::create()); @@ -138,11 +142,16 @@ class TORCH_API ProcessGroupXCCL : public Backend { const c10::intrusive_ptr& store, int rank, int size, - const std::string& groupName) - : ProcessGroupXCCL(store, rank, size) {} + const std::string& groupName, + c10::intrusive_ptr options = Options::create()) + : ProcessGroupXCCL(store, rank, size, std::move(options)) {} ~ProcessGroupXCCL() override; + c10::intrusive_ptr getOptions() { + return options_; + } + const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); } From 6d1a47672546d0ab115e14da1cf4d9dd5903fe4d Mon Sep 17 00:00:00 2001 From: Chao Han Date: Wed, 17 Sep 2025 08:59:04 +0800 Subject: [PATCH 04/27] Reduce tensor shape avoid timeout (#2051) --- test/xpu/distributed/test_c10d_xccl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/xpu/distributed/test_c10d_xccl.py b/test/xpu/distributed/test_c10d_xccl.py index 44a3ac148a..226501d7ac 100644 --- a/test/xpu/distributed/test_c10d_xccl.py +++ b/test/xpu/distributed/test_c10d_xccl.py @@ -641,7 +641,7 @@ def test_unwaited(self) -> None: with _functional_collectives.allow_inflight_collective_as_graph_input_ctx(): self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) input = torch.full( - (10240, 10240), float(self.rank), device=f"xpu:{self.rank}" + (1024, 1024), float(self.rank), device=f"xpu:{self.rank}" ) dist.all_reduce(input, op=dist.ReduceOp.SUM, async_op=True) # Non-functional collectives run under the context manager is registered in the work registry. @@ -653,7 +653,7 @@ def test_unwaited(self) -> None: # Case 2: Run collectives not under context manager, and don't call wait on them. # NOTE: Here we intentionally test memory-stressed case. 
self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 2) - for _ in range(50000): + for _ in range(500): input = torch.full( (1024, 1024), float(self.rank), device=f"xpu:{self.rank}" ) From a5f73c1c761ce2150856f120007b374bac3ec724 Mon Sep 17 00:00:00 2001 From: Slawomir Siwek Date: Wed, 17 Sep 2025 04:35:31 +0200 Subject: [PATCH 05/27] Move checks from nonzero kernel to operator (#1991) - Early return if input to nonzero has no elements, - Change pytorch `MAX_DIMS` to `XPU_MAX_TENSORINFO_DIMS` xpu equivalent, - Rename `tensor` to `out` to match op schema. --------- Co-authored-by: Tadeusz Krawiec <72737249+tadkrawiec@users.noreply.github.com> --- src/ATen/native/xpu/Nonzero.cpp | 14 ++- src/ATen/native/xpu/sycl/NonzeroKernel.cpp | 129 ++++++++++----------- 2 files changed, 69 insertions(+), 74 deletions(-) diff --git a/src/ATen/native/xpu/Nonzero.cpp b/src/ATen/native/xpu/Nonzero.cpp index 9988631d3b..a1f9b1d09f 100644 --- a/src/ATen/native/xpu/Nonzero.cpp +++ b/src/ATen/native/xpu/Nonzero.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include namespace at { namespace native { @@ -10,7 +10,7 @@ Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { TORCH_CHECK( self.numel() < std::numeric_limits::max(), "nonzero is not supported for tensors with more than INT_MAX elements, \ - See https://github.com/pytorch/pytorch/issues/51871"); + See https://github.com/pytorch/pytorch/issues/51871"); TORCH_CHECK( out.dtype() == at::kLong, "Expected object of scalar type ", @@ -24,11 +24,15 @@ Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { " and self on ", self.device()); TORCH_CHECK( - self.dim() <= MAX_DIMS, + self.dim() <= XPU_MAX_TENSORINFO_DIMS, "nonzero is not supported for tensor with more than ", - MAX_DIMS, + XPU_MAX_TENSORINFO_DIMS, " dimensions"); + if (self.numel() == 0) { + out = at::detail::empty_xpu({0, self.dim()}, out.options()); + return out; + } xpu::nonzero_kernel(self, out); return out; } @@ -39,4 +43,4 @@ Tensor nonzero_xpu(const Tensor& self) { return out; } } // namespace native -} // namespace at \ No newline at end of file +} // namespace at diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp index 72df4b8ee8..d9c166dbcb 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp @@ -17,7 +17,7 @@ struct FlattenIdxtoRealIdxKernelFunctor { if (global_id < N_) { auto dim = global_id / num_nonzeros_; auto index = global_id % num_nonzeros_; - tensor_begin_[global_id] = + out_begin_[global_id] = idx_flat_begin_[index] / divisor_[dim] % sizes_[dim]; } } @@ -25,14 +25,14 @@ struct FlattenIdxtoRealIdxKernelFunctor { int64_t N, const int64_t num_dim, const int64_t num_nonzeros, - int64_t* tensor_begin, + int64_t* out_begin, int64_t* idx_flat_begin, int64_t* divisor, int64_t* sizes) : N_(N), num_dim_(num_dim), num_nonzeros_(num_nonzeros), - tensor_begin_(tensor_begin), + out_begin_(out_begin), idx_flat_begin_(idx_flat_begin) { for (auto dim = num_dim - 1; dim >= 0; dim--) { sizes_[dim] = sizes[dim]; @@ -44,7 +44,7 @@ struct FlattenIdxtoRealIdxKernelFunctor { int64_t N_; const int64_t num_dim_; const int64_t num_nonzeros_; - int64_t* tensor_begin_; + int64_t* out_begin_; int64_t* idx_flat_begin_; int64_t divisor_[XPU_MAX_TENSORINFO_DIMS]; int64_t sizes_[XPU_MAX_TENSORINFO_DIMS]; @@ -79,77 +79,68 @@ struct CopyIfFunc { }; template -void nonzero_template(const Tensor& self_, Tensor& tensor) { +void nonzero_template(const Tensor& self_, Tensor& out) { Tensor self = 
self_.contiguous(); const int64_t num_dim = self.dim(); - TORCH_CHECK(num_dim <= XPU_MAX_TENSORINFO_DIMS, "dim exceed max allowed dim"); - - int64_t N = self.numel(); - - if (N > 0) { - Tensor idx_flat = at::empty( - {N}, tensor.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); - Tensor range = at::empty( - {N}, tensor.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); - - const scalar_t* self_begin = self.const_data_ptr(); - int64_t* idx_flat_begin = idx_flat.data_ptr(); - int64_t* range_begin = nullptr; - - CopyIfFunc f(self_begin); - auto idx_flat_end = - pstl::copy_if(range_begin, range_begin + N, idx_flat_begin, f); - - auto num_nonzeros = std::distance(idx_flat_begin, idx_flat_end); - - bool need_to_copy = tensor.dim() == 2 && - tensor.sizes()[0] == num_nonzeros && tensor.sizes()[1] == self_.dim() && - !tensor.t().is_contiguous(); - at::Tensor tensor_ = need_to_copy - ? Tensor(at::detail::empty_xpu( - {self_.dim(), num_nonzeros}, tensor.options())) - : tensor.resize_({self_.dim(), num_nonzeros}); - - if (num_nonzeros > 0 && num_dim > 0) { - int64_t* tensor_begin = tensor_.data_ptr(); - - // preload sizes tensor for index calculation - int64_t sizes[XPU_MAX_TENSORINFO_DIMS]; - int64_t divisor[XPU_MAX_TENSORINFO_DIMS]; - sizes[num_dim - 1] = self.size(num_dim - 1); - divisor[num_dim - 1] = 1; - for (auto dim = num_dim - 2; dim >= 0; dim--) { - sizes[dim] = self.size(dim); - divisor[dim] = sizes[dim + 1] * divisor[dim + 1]; - } - - const int64_t N = num_nonzeros * num_dim; - // restore flatten idx to indices - FlattenIdxtoRealIdxKernelFunctor kfn( - N, - num_dim, - num_nonzeros, - tensor_begin, - idx_flat_begin, - divisor, - sizes); - - const auto wg_sz = std::min(syclMaxWorkGroupSize(kfn), N); - const auto num_wg = (N + wg_sz - 1) / wg_sz; - - sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), kfn); - } - if (need_to_copy) { - tensor.copy_(tensor_.t()); - } else { - // transpose out so it is correct size - Tensor tensor_temp = tensor_.t(); - tensor.set_(tensor_temp); + const int64_t N = self.numel(); + + Tensor idx_flat = at::empty( + {N}, out.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); + + const scalar_t* self_begin = self.const_data_ptr(); + int64_t* idx_flat_begin = idx_flat.data_ptr(); + int64_t* range_begin = nullptr; + + CopyIfFunc f(self_begin); + auto idx_flat_end = + pstl::copy_if(range_begin, range_begin + N, idx_flat_begin, f); + + auto num_nonzeros = std::distance(idx_flat_begin, idx_flat_end); + + bool need_to_copy = out.dim() == 2 && + out.sizes()[0] == num_nonzeros && out.sizes()[1] == num_dim && + !out.t().is_contiguous(); + Tensor out_ = need_to_copy + ? 
Tensor(at::detail::empty_xpu( + {num_dim, num_nonzeros}, out.options())) + : out.resize_({num_dim, num_nonzeros}); + + if (num_nonzeros > 0 && num_dim > 0) { + int64_t* out_begin = out_.data_ptr(); + + // preload sizes tensor for index calculation + int64_t sizes[XPU_MAX_TENSORINFO_DIMS]; + int64_t divisor[XPU_MAX_TENSORINFO_DIMS]; + sizes[num_dim - 1] = self.size(num_dim - 1); + divisor[num_dim - 1] = 1; + for (auto dim = num_dim - 2; dim >= 0; dim--) { + sizes[dim] = self.size(dim); + divisor[dim] = sizes[dim + 1] * divisor[dim + 1]; } + const int64_t N = num_nonzeros * num_dim; + // restore flatten idx to indices + FlattenIdxtoRealIdxKernelFunctor kfn( + N, + num_dim, + num_nonzeros, + out_begin, + idx_flat_begin, + divisor, + sizes); + + const auto wg_sz = std::min(syclMaxWorkGroupSize(kfn), N); + const auto num_wg = (N + wg_sz - 1) / wg_sz; + + sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), kfn); + } + if (need_to_copy) { + out.copy_(out_.t()); } else { - tensor = tensor.resize_({num_dim, N}).contiguous().t(); + // transpose out so it is correct size + Tensor out_temp = out_.t(); + out.set_(out_temp); } } From 9f08551751b3214c10075ea77966a0912665881e Mon Sep 17 00:00:00 2001 From: Cherry Zhang Date: Wed, 17 Sep 2025 15:12:43 +0800 Subject: [PATCH 06/27] Check input contiguous before collectives call to low-level communication library (#2048) Co-authored-by: Chao Han --- src/xccl/ProcessGroupXCCL.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index ba7c8561be..923771a13f 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -97,17 +97,17 @@ void checkSingleTensor( ) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); + } - // Skip the following requirements for P2P operations - if (!tensor.is_contiguous(tensor.suggest_memory_format())) { - if (p2p) { - TORCH_WARN_ONCE( - "Detected non-contiguous tensor in P2P operations. It is user " - "responsibility to guarantee that source and destination tensors have " - "the same contiguity format."); - } else { - C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); - } + // Skip the following requirements for P2P operations + if (!tensor.is_contiguous(tensor.suggest_memory_format())) { + if (p2p) { + TORCH_WARN_ONCE( + "Detected non-contiguous tensor in P2P operations. 
It is user " + "responsibility to guarantee that source and destination tensors have " + "the same contiguity format."); + } else { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); } } } From 6508096a46812e712374aae68c798ef267773172 Mon Sep 17 00:00:00 2001 From: Slawomir Siwek Date: Wed, 17 Sep 2025 15:17:59 +0200 Subject: [PATCH 07/27] Fix hardswish gradients corner case (#2050) Fixes https://github.com/intel/torch-xpu-ops/issues/2014 --- src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp | 4 ++-- test/xpu/skip_list_common.py | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp index 8c6e47f774..2c2c60ada9 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp @@ -35,9 +35,9 @@ struct HardswishBackwardFunctor { const opmath_t one_half(0.5f); opmath_t grad_val = static_cast(grad_val_); opmath_t self_val = static_cast(self_val_); - if (self_val < neg_three) { + if (self_val <= neg_three) { return zero; - } else if (self_val <= three) { + } else if (self_val < three) { return grad_val * ((self_val / three) + one_half); } else { return grad_val; diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py index ae3a8b8a93..5e70eebf44 100644 --- a/test/xpu/skip_list_common.py +++ b/test/xpu/skip_list_common.py @@ -1022,10 +1022,6 @@ "test_ctc_loss_cudnn_tensor", # want "xpu" in function name # RuntimeError: reflection_pad2d_backward_xpu does not have a deterministic implementation, but you set 'torch.use_deterministic_algorithms(True)'. "test_ReflectionPad2d_large_deterministic_xpu", - # Case updated in pytorch commit 97272e4 - "test_hardswish_grad_corner_xpu_bfloat16", - "test_hardswish_grad_corner_xpu_float16", - "test_hardswish_grad_corner_xpu_float32", # x_cuda = x.clone().detach().to("cuda").requires_grad_(): Torch not compiled with CUDA enabled "test_layer_norm_backwards_eps", ), From 24fab67b6ecf7620d0cf776047a3056c5b518bab Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com> Date: Wed, 17 Sep 2025 18:43:49 -0700 Subject: [PATCH 08/27] Clean up getDeviceIndexOfCurrentQueue (#2060) # Motivation Clean up `getDeviceIndexOfCurrentQueue` since it duplicates `current_device` and provides no additional functionality. 
--- src/ATen/native/xpu/sycl/BatchNormKernels.cpp | 3 +- src/ATen/native/xpu/sycl/Norm.h | 3 +- src/ATen/native/xpu/sycl/SoftMaxKernels.cpp | 12 +- .../native/xpu/sycl/TensorShapeKernels.cpp | 3 +- src/comm/DeviceProperties.h | 130 +++++++----------- src/comm/Runtime.h | 4 - 6 files changed, 60 insertions(+), 95 deletions(-) diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index f8e1b6906e..4124890274 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -185,8 +185,7 @@ int get_num_threads_by_dev_max_group_size( int get_prefer_simd(int numPlane, int nHw) { // decide SIMD: SIMD32 or SIMD16 - auto dev_id = at::xpu::getDeviceIndexOfCurrentQueue(); - + auto dev_id = at::xpu::current_device(); auto* dev_prop = at::xpu::getDeviceProperties(dev_id); auto sub_group_size = dev_prop->sub_group_sizes; int simd = sub_group_size[1]; diff --git a/src/ATen/native/xpu/sycl/Norm.h b/src/ATen/native/xpu/sycl/Norm.h index 6117b6d261..8b887a443a 100644 --- a/src/ATen/native/xpu/sycl/Norm.h +++ b/src/ATen/native/xpu/sycl/Norm.h @@ -269,8 +269,7 @@ class NormConfig { } void get_max_vec_size() { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int total_resource = syclMaxWorkItemsPerTile(dev_id); + int64_t total_resource = syclMaxWorkItemsPerTile(); constexpr int float4_size = sizeof(float) * 4; max_vec_size = float4_size / element_size_bytes; diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index fdedf5fb09..020fe4a1bc 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -1559,8 +1559,7 @@ void spatial_softmax_forward( canUse32BitIndexMath(input) && canUse32BitIndexMath(output); // decide SIMD: SIMD32 or SIMD16 - auto dev_id = at::xpu::getDeviceIndexOfCurrentQueue(); - auto* dev_prop = at::xpu::getDeviceProperties(dev_id); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -1749,8 +1748,7 @@ void spatial_softmax_backward( canUse32BitIndexMath(output) && canUse32BitIndexMath(gradOutput); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -1901,8 +1899,7 @@ Tensor& masked_softmax_forward( canUse32BitIndexMath(input) && canUse32BitIndexMath(output); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -2026,8 +2023,7 @@ void masked_softmax_backward( canUse32BitIndexMath(output) && canUse32BitIndexMath(gradOutput); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { diff --git a/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp b/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp index 8c46658e7a..fed5fee904 100644 --- a/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp +++ 
b/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp @@ -669,8 +669,7 @@ void split_with_sizes_copy_out_xpu_contiguous_no_cast( num_groups += div_up(split_chunk_size, GROUP_SIZE * BYTES_PER_THREAD); } - auto dev_id = getDeviceIndexOfCurrentQueue(); - int64_t tile_size = syclMaxWorkItemsPerTile(dev_id); + int64_t tile_size = syclMaxWorkItemsPerTile(); const int64_t max_groups = tile_size / GROUP_SIZE * 2.0; // Make each thread process BYTES_PER_THREAD * iter_factor bytes to regulate diff --git a/src/comm/DeviceProperties.h b/src/comm/DeviceProperties.h index ee0d285eaf..5d1f0c8f77 100644 --- a/src/comm/DeviceProperties.h +++ b/src/comm/DeviceProperties.h @@ -3,17 +3,15 @@ #include #include -#include namespace xpu { namespace sycl { template static int64_t syclMaxWorkGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { - auto q = c10::xpu::getCurrentXPUStream(dev_id).queue(); - auto ctx = q.get_context(); - auto dev = q.get_device(); + at::DeviceIndex dev_id = at::xpu::current_device()) { + auto& ctx = c10::xpu::get_device_context(); + auto& dev = c10::xpu::get_raw_device(dev_id); auto kid = ::sycl::get_kernel_id(); // The kernel won't be built for devices except for the first device. @@ -30,73 +28,69 @@ static int64_t syclMaxWorkGroupSize( template static int64_t syclMaxWorkGroupSize( - KernelClass /*kfn*/, - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + const KernelClass& /*kfn*/, + at::DeviceIndex dev_id = at::xpu::current_device()) { return syclMaxWorkGroupSize(dev_id); } static inline int64_t syclDeviceMaxWorkGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_work_group_size; } static inline int64_t syclMaxSubGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); - auto subgroup_sizes = dev_prop->sub_group_sizes; - uint64_t max_val = 0; - for (auto i : subgroup_sizes) { - if (i > max_val) - max_val = i; - } - return max_val; + const auto& subgroup_sizes = dev_prop->sub_group_sizes; + TORCH_CHECK( + !subgroup_sizes.empty(), + "The device subgroup sizes is empty, please check the device status."); + return *std::max_element(subgroup_sizes.begin(), subgroup_sizes.end()); } static inline int64_t syclMinSubGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); - auto subgroup_sizes = dev_prop->sub_group_sizes; - uint64_t min_val = dev_prop->max_work_group_size; - for (auto i : subgroup_sizes) { - if (i < min_val) - min_val = i; - } - return min_val; + const auto& subgroup_sizes = dev_prop->sub_group_sizes; + TORCH_CHECK( + !subgroup_sizes.empty(), + "The device subgroup sizes is empty, please check the device status."); + return *std::min_element(subgroup_sizes.begin(), subgroup_sizes.end()); } static inline int64_t syclMaxComputeUnitSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_compute_units; } static inline int64_t syclGpuEuCount( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { 
auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_eu_count; } static inline int64_t syclGpuEuSimdWidth( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_eu_simd_width; } static inline int64_t syclGpuHWThreadsPerEU( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_hw_threads_per_eu; } static inline int64_t syclGpuEUCountPerSubslice( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_eu_count_per_subslice; } static inline int64_t syclMaxWorkItemsPerTile( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); int64_t eu_cnt = dev_prop->gpu_eu_count; int64_t simd_width = syclMaxSubGroupSize(dev_id); @@ -105,7 +99,7 @@ static inline int64_t syclMaxWorkItemsPerTile( } static inline int64_t syclMaxWorkItemsPerSubSlice( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); int64_t simd_width = syclMaxSubGroupSize(dev_id); int64_t eu_count = dev_prop->gpu_eu_count_per_subslice; @@ -113,7 +107,7 @@ static inline int64_t syclMaxWorkItemsPerSubSlice( } static inline int64_t syclMaxWorkItemsPerEU( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); int64_t simd_width = syclMaxSubGroupSize(dev_id); int64_t hw_threads = dev_prop->gpu_hw_threads_per_eu; @@ -121,94 +115,76 @@ static inline int64_t syclMaxWorkItemsPerEU( } static inline int64_t syclMaxNumSubGroups( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_num_sub_groups; } static inline int64_t syclMaxDSSNum( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { int64_t dss_num = syclMaxComputeUnitSize(dev_id) / syclGpuEUCountPerSubslice(dev_id); return dss_num; } static inline size_t syclGlobalMemSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->global_mem_size; } static inline int64_t syclLocalMemSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->local_mem_size; } template uint32_t syclPrefVectorWidth( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { (void)dev_id; // Suppress unused variable warning // Hot fix. This is the preferred vector width for GPUs up to LNL/BMG. 
- uint32_t vec_width = 16; + constexpr uint32_t vec_width = 16; - if (std::is_same::value) { - return vec_width / sizeof(char); - } - if (std::is_same::value) { - return vec_width / sizeof(short); - } - if (std::is_same::value) { - return vec_width / sizeof(int); - } - if (std::is_same::value) { - return vec_width / sizeof(int64_t); - } - if (std::is_same::value) { - return vec_width / sizeof(float); - } - if (std::is_same::value) { - return vec_width / sizeof(double); + if constexpr ( + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v) { + return vec_width / sizeof(T); + } else { + throw std::invalid_argument( + "Invalid data type to fetch preferred vector width!"); } - if (std::is_same::value) { - return vec_width / sizeof(::sycl::half); - } - throw std::invalid_argument( - "Invalid data type to fetch preferred vector width!"); } template uint32_t syclNativeVectorWidth( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); - if (std::is_same::value) { + if constexpr (std::is_same_v) { return dev_prop->native_vector_width_char; - } - if (std::is_same::value) { + } else if constexpr (std::is_same_v) { return dev_prop->native_vector_width_short; - } - if (std::is_same::value) { + } else if constexpr (std::is_same_v) { return dev_prop->native_vector_width_int; - } - if (std::is_same::value) { + } else if constexpr (std::is_same_v) { return dev_prop->native_vector_width_long; - } - if (std::is_same::value) { + } else if constexpr (std::is_same_v) { return dev_prop->native_vector_width_float; - } - if (std::is_same::value) { + } else if constexpr (std::is_same_v) { return dev_prop->native_vector_width_double; - } - if (std::is_same::value) { + } else if constexpr (std::is_same_v) { return dev_prop->native_vector_width_half; + } else { + throw std::invalid_argument( + "Invalid data type to fetch native vector width!"); } - throw std::invalid_argument( - "Invalid data type to fetch native vector width!"); } static inline bool syclHasFloat64( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->has_fp64; } diff --git a/src/comm/Runtime.h b/src/comm/Runtime.h index fa7daaf125..4fd44d08e5 100644 --- a/src/comm/Runtime.h +++ b/src/comm/Runtime.h @@ -4,10 +4,6 @@ namespace at::xpu { -static inline at::DeviceIndex getDeviceIndexOfCurrentQueue() { - return c10::xpu::getCurrentXPUStream().device_index(); -} - static inline sycl::queue& getCurrentSYCLQueue() { return c10::xpu::getCurrentXPUStream().queue(); } From bc52e6303989c5e8baa7ad97f39040251434b828 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Mon, 22 Sep 2025 10:07:34 +0800 Subject: [PATCH 09/27] [CI] Cleanup after build to avoid permission issue (#2088) disable_ut disable_e2e disable_distributed --- .github/workflows/_linux_build.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 286d124b7b..6fcaf5c954 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -200,7 +200,6 @@ jobs: python -c "import torchaudio; print(torchaudio.__version__)" python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' - chmod 777 /__w /github ./ -R - name: Upload Torch 
XPU Wheel if: ${{ success() }} uses: actions/upload-artifact@v4 @@ -213,3 +212,8 @@ jobs: with: name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} path: ${{ github.workspace }}/build_*.log + - name: Cleanup workspace + if: ${{ always() }} + run: | + chmod 777 /__w /github ./ -R + find ./ |grep -v "^\./$" |xargs rm -rf From 6e5af1ee2f2560426e74f6c842fb46d5ae254f75 Mon Sep 17 00:00:00 2001 From: Frost Mitchell Date: Mon, 22 Sep 2025 01:46:17 -0400 Subject: [PATCH 10/27] Revert tracking of Work status for FlightRecorder in ProcessGroupXCCL (#2076) The callback used to track the work status in ProcessGroupXCCL was causing an unintended memory leak by keeping the work objects, and therefore the stashed tensors, alive. For now, I'm removing the callback, and I have added a unit test to ensure this memory leak doesn't return. Fix https://github.com/intel/torch-xpu-ops/issues/2084 --- src/xccl/ProcessGroupXCCL.cpp | 26 +++++++++++--------------- src/xccl/ProcessGroupXCCL.hpp | 2 -- test/xpu/distributed/test_c10d_xccl.py | 25 +++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index 923771a13f..514cdaf3a7 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -437,17 +437,6 @@ void ProcessGroupXCCL::setEnqueuedPgStatus( pgStatus_->lastEnqueuedNumelOut = work->numelOut_; } -void ProcessGroupXCCL::setCompletedPgStatus( - c10::intrusive_ptr work) { - pgStatus_->lastCompletedSeq = static_cast(work->getSequencenumber()); - pgStatus_->lastCompletedWorkName = opTypeToString(work->opType_); - pgStatus_->lastCompletedNumelIn = work->numelIn_; - pgStatus_->lastCompletedNumelOut = work->numelOut_; - // To avoid complexity, we're not computing duration.
- FlightRecorderXCCL::get()->retire_id( - work->trace_id_, /*compute_duration*/ false); -} - void ProcessGroupXCCL::setSequenceNumberForGroup() {} uint64_t ProcessGroupXCCL::getSequenceNumberForGroup() { @@ -777,8 +766,12 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); + auto id = work->trace_id_; work->future_->addCallback( - [this, work](at::ivalue::Future&) { this->setCompletedPgStatus(work); }); + [id](at::ivalue::Future&) { + FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); + }, + /*use_future*/ false); work->blockingWait_ = blockingWait_; work->numelIn_ = 0; @@ -889,9 +882,12 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); - work->future_->addCallback([this, work](at::ivalue::Future&) { - this->setCompletedPgStatus(work); - }); + auto id = work->trace_id_; + work->future_->addCallback( + [id](at::ivalue::Future&) { + FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); + }, + /*use_future*/ false); work->numelIn_ = work->numelOut_ = tensor.numel(); setEnqueuedPgStatus(work); diff --git a/src/xccl/ProcessGroupXCCL.hpp b/src/xccl/ProcessGroupXCCL.hpp index 2516c0b912..8e4604fbfb 100644 --- a/src/xccl/ProcessGroupXCCL.hpp +++ b/src/xccl/ProcessGroupXCCL.hpp @@ -424,8 +424,6 @@ class TORCH_API ProcessGroupXCCL : public Backend { const std::vector& groupRanks() const; void setEnqueuedPgStatus(c10::intrusive_ptr work); - void setCompletedPgStatus( - c10::intrusive_ptr work); bool dumpDebuggingInfo(bool includeStackTrace = true); protected: diff --git a/test/xpu/distributed/test_c10d_xccl.py b/test/xpu/distributed/test_c10d_xccl.py index 226501d7ac..57b33ae29d 100644 --- a/test/xpu/distributed/test_c10d_xccl.py +++ b/test/xpu/distributed/test_c10d_xccl.py @@ -365,6 +365,31 @@ def test_nan_assert(self, type): # reset env os.environ["TORCH_XCCL_NAN_CHECK"] = "0" + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_oom(self): + pg = self._create_process_group_xccl() + dp_ranks = range(0, self.world_size) + dp_group = c10d.new_group(dp_ranks) + device = torch.device(f"xpu:{self.rank}") + torch.xpu.set_device(device) + + shape = (16384 * 2, 16384 * 2) + weight = torch.ones(shape, device=device).half() + gradient = torch.zeros(shape, device=device).half() + ret = torch.randn(shape, device=device).half() + + for iter in range(50): + output = torch.empty_like(ret) + output = ret + weight + gradient + ret = torch.nn.functional.linear(output, weight=ret) + dist.all_reduce(ret, op=dist.ReduceOp.SUM) + torch.xpu.synchronize() + self.assertLess( + torch.xpu.max_memory_allocated(), + torch.xpu.max_memory_reserved() * 2, + ) + class CommTest(MultiProcessTestCase): @property From 65234bdfba4a70fc5eb3c73fae421fcd8249351e Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Mon, 22 Sep 2025 22:44:13 -0700 Subject: [PATCH 11/27] build: enable SYCL warnings on Linux (#2096) Some versions of DPC++ compiler pass paths to SYCL headers as user include paths (`-I`) rather than system paths (`-isystem`). This makes host compiler to report warnings encountered in the SYCL headers, such as deprecated warnings, even if warned API is not actually used in the program. We expect that this issue will be addressed in the later version of DPC++ compiler. 
To workaround the issue we wrap paths to SYCL headers in `-isystem`. disable_ut disable_e2e disable_distributed CC: @EikanWang @chuanqi129 Signed-off-by: Dmitry Rogozhkin --- cmake/BuildFlags.cmake | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake index 23956d710a..c0afe2d8f1 100644 --- a/cmake/BuildFlags.cmake +++ b/cmake/BuildFlags.cmake @@ -36,10 +36,16 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" list(APPEND SYCL_HOST_FLAGS -fPIC) list(APPEND SYCL_HOST_FLAGS -std=c++17) list(APPEND SYCL_HOST_FLAGS -Wunused-variable) - # SYCL headers warnings - list(APPEND SYCL_HOST_FLAGS -Wno-deprecated-declarations) - list(APPEND SYCL_HOST_FLAGS -Wno-deprecated) - list(APPEND SYCL_HOST_FLAGS -Wno-attributes) + # Some versions of DPC++ compiler pass paths to SYCL headers as user include paths (`-I`) rather + # than system paths (`-isystem`). This makes host compiler to report warnings encountered in the + # SYCL headers, such as deprecated warnings, even if warned API is not actually used in the program. + # We expect that this issue will be addressed in the later version of DPC++ compiler. To workaround + # the issue we wrap paths to SYCL headers in `-isystem`. + foreach(FLAGS IN LISTS SYCL_INCLUDE_DIR) + list(APPEND SYCL_HOST_FLAGS "-isystem ${FLAGS}") + endforeach() + # Excluding warnings which flood the compilation output + # TODO: fix warnings in the source code and then reenable them in compilation list(APPEND SYCL_HOST_FLAGS -Wno-sign-compare) endif() From b755d3cb8de018e35c6ea104440bc1b1de0c0aaa Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 23 Sep 2025 15:17:25 +0800 Subject: [PATCH 12/27] [CI] Add deps for new LLM models (#2054) Add new LLM models deps `accelerate` disable_ut disable_distributed --- .github/actions/linux-testenv/action.yml | 4 +++- .../rolling/inductor_huggingface_inference.csv | 4 ++++ .../rolling/inductor_huggingface_training.csv | 5 +++++ .github/workflows/_linux_e2e_summary.yml | 4 ++-- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 246b94c919..cbf2a60a5c 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -57,7 +57,7 @@ runs: run: | # install pytorch if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" -c) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" -c) -ne 0 ];then pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" -c) -ne 0 ];then @@ -115,6 +115,8 @@ runs: pip install pandas psutil scipy pyyaml cd pytorch if [[ "${{ inputs.suite }}" == *"huggingface"* ]];then + # for new LLM models + pip install accelerate pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt || pip install transformers==4.54.0 soxr==0.5.0 TRANSFORMERS_VERSION_ID="$(python -c 'import os; os.chdir("/tmp"); import transformers; print(transformers.__version__)')" elif [[ "${{ inputs.suite }}" == *"timm_models"* ]];then diff --git a/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv b/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv 
index a75d3d2253..fe1e531308 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv @@ -45,3 +45,7 @@ TrOCRForCausalLM,pass,pass,pass,pass,pass XGLMForCausalLM,pass,pass,pass,pass,pass XLNetLMHeadModel,pass,pass,pass,pass,pass YituTechConvBert,pass,pass,pass,pass,pass +Qwen/Qwen3-0.6B,pass,pass,pass,pass,pass +google/gemma-2-2b,pass,pass,pass,pass,pass +meta-llama/Llama-3.2-1B,pass,pass,pass,pass,pass +openai/whisper-tiny,pass,pass,pass,pass,pass diff --git a/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv b/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv index a75d3d2253..6d3726bdfb 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv @@ -45,3 +45,8 @@ TrOCRForCausalLM,pass,pass,pass,pass,pass XGLMForCausalLM,pass,pass,pass,pass,pass XLNetLMHeadModel,pass,pass,pass,pass,pass YituTechConvBert,pass,pass,pass,pass,pass +# https://github.com/intel/torch-xpu-ops/issues/2055 +Qwen/Qwen3-0.6B,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run +google/gemma-2-2b,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run +meta-llama/Llama-3.2-1B,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run +openai/whisper-tiny,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run diff --git a/.github/workflows/_linux_e2e_summary.yml b/.github/workflows/_linux_e2e_summary.yml index f2026a9f5a..9824c8c458 100644 --- a/.github/workflows/_linux_e2e_summary.yml +++ b/.github/workflows/_linux_e2e_summary.yml @@ -77,10 +77,10 @@ jobs: fi pt2e_summary_csv="$(find ./target/ -name "summary.csv")" if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} failed_num=$(grep -c ',failed' ${pt2e_summary_csv} || true) if [ ${failed_num} -ne 0 ];then echo "[Warning] PT2E has failures!" + grep 'failed' ${pt2e_summary_csv} fi fi exit ${exit_label} @@ -88,7 +88,7 @@ jobs: if: ${{ endsWith(inputs.test_type, 'ly') }} run: | gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee new_body.txt 2>&1 - has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt)" + has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt || true)" if [ ${has_or_not} -ne 0 ];then sed -i "s/Inductor-${{ inputs.test_type }}-LTS2:.*/Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}/" new_body.txt else From 9eed218770fc9f9ba6dcbbb3ee7480c6fb247d7a Mon Sep 17 00:00:00 2001 From: Konrad Drozd Date: Tue, 23 Sep 2025 09:58:06 +0200 Subject: [PATCH 13/27] Fix accuracy issues with CTC loss (#2074) Fixes #1737 and the flaky backward accuracy issue uncovered by fixing the forward pass, using sycl_global_and_local_fence to sync data in global memory - log_alpha_data_. 
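For context, a minimal self-contained SYCL sketch of the synchronization issue (illustrative only, not the CTC kernel itself): work-items write partial results to a buffer in global memory and then read each other's values on the next time step. Assuming `sycl_local_fence` and `sycl_global_and_local_fence` correspond to `sycl::access::fence_space::local_space` and `::global_and_local`, a barrier with a local-only fence does not guarantee the global-memory writes are visible to the rest of the work-group, which is what produced the flaky accuracy.

```cpp
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
  constexpr size_t n = 128;
  sycl::queue q;
  // Global-memory buffer handed between "time steps", like log_alpha_data_.
  float* data = sycl::malloc_shared<float>(n, q);
  for (size_t i = 0; i < n; ++i) data[i] = static_cast<float>(i);

  // One work-group so the barrier covers every participating work-item.
  q.parallel_for(sycl::nd_range<1>{{n}, {n}}, [=](sycl::nd_item<1> it) {
    const size_t i = it.get_global_id(0);
    data[i] *= 2.0f;  // time step t: write to global memory
    // local_space alone would only order local (shared) memory accesses here;
    // global_and_local also publishes the global-memory writes to the group.
    it.barrier(sycl::access::fence_space::global_and_local);
    // time step t+1: read a neighbour's freshly written value
    data[i] += data[(i + 1) % n];
  }).wait();

  std::printf("data[0] = %f\n", data[0]);  // 0*2 + 1*2 = 2
  sycl::free(data, q);
  return 0;
}
```

In the CTC kernels the buffer carried between time steps (`log_alpha_data_` / `log_beta_data_`) lives in global memory, hence the switch to the stronger fence.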
--- src/ATen/native/xpu/sycl/LossCTCKernels.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ATen/native/xpu/sycl/LossCTCKernels.cpp b/src/ATen/native/xpu/sycl/LossCTCKernels.cpp index 6acaf2e6cb..90c090e38e 100644 --- a/src/ATen/native/xpu/sycl/LossCTCKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossCTCKernels.cpp @@ -115,7 +115,7 @@ struct CTCLossLogAlphaKernelFunctor { have_three = false; } for (int64_t t = 1; t < max_input_length_; t++) { - item.barrier(sycl_local_fence); + item.barrier(sycl_global_and_local_fence); if (valid && (t < input_length) && (s < 2 * target_length + 1)) { // only for valid t, s. This is equation (6) and (7), la1, la2, la3 // are the three summands, lamax is the maximum for the logsumexp @@ -165,7 +165,7 @@ struct CTCLossLogAlphaKernelFunctor { } } } - item.barrier(sycl_local_fence); + item.barrier(sycl_global_and_local_fence); if (!valid) return; @@ -502,7 +502,7 @@ struct CTCLossBackwardLogBetaKernelFunctor { // now go backward in t. Note that we need to skip the last timestep that // we did above. for (int64_t t = max_input_length_ - 2; t >= 0; t--) { - item.barrier(sycl_local_fence); + item.barrier(sycl_global_and_local_fence); if (valid && (t < input_length - 1) && (s < 2 * target_length + 1)) { scalar_t lb1 = log_beta_data_ [lb_batch_offset + lb_input_stride_ * (t + 1) + From 0df6a62ddba9928f4524e485ec9386aa420aa8ce Mon Sep 17 00:00:00 2001 From: Slawomir Siwek Date: Tue, 23 Sep 2025 10:32:43 +0200 Subject: [PATCH 14/27] Implement aten::nonzero_static on XPU backend (#2061) Fixes https://github.com/intel/torch-xpu-ops/issues/1943 --- src/ATen/native/xpu/Nonzero.cpp | 69 ++++++++++++++++++++++++++----- yaml/native/native_functions.yaml | 9 ++++ 2 files changed, 68 insertions(+), 10 deletions(-) diff --git a/src/ATen/native/xpu/Nonzero.cpp b/src/ATen/native/xpu/Nonzero.cpp index a1f9b1d09f..82cfaa8275 100644 --- a/src/ATen/native/xpu/Nonzero.cpp +++ b/src/ATen/native/xpu/Nonzero.cpp @@ -1,22 +1,18 @@ #include +#include +#include #include - #include #include namespace at { namespace native { -Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { + +void nonzero_common_checks(const Tensor& self, Tensor& out, const std::string& op_name) { TORCH_CHECK( self.numel() < std::numeric_limits::max(), - "nonzero is not supported for tensors with more than INT_MAX elements, \ + op_name, " is not supported for tensors with more than INT_MAX elements, \ See https://github.com/pytorch/pytorch/issues/51871"); - TORCH_CHECK( - out.dtype() == at::kLong, - "Expected object of scalar type ", - at::kLong, - " as out, but got ", - out.dtype()); TORCH_CHECK( self.device() == out.device(), "expected self and out to be on the same device, but got out on ", @@ -25,10 +21,19 @@ Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { self.device()); TORCH_CHECK( self.dim() <= XPU_MAX_TENSORINFO_DIMS, - "nonzero is not supported for tensor with more than ", + op_name, " is not supported for tensor with more than ", XPU_MAX_TENSORINFO_DIMS, " dimensions"); +} +Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { + TORCH_CHECK( + out.dtype() == at::kLong, + "Expected object of scalar type ", + at::kLong, + " as out, but got ", + out.dtype()); + nonzero_common_checks(self, out, "nonzero"); if (self.numel() == 0) { out = at::detail::empty_xpu({0, self.dim()}, out.options()); return out; @@ -42,5 +47,49 @@ Tensor nonzero_xpu(const Tensor& self) { nonzero_out_xpu(self, out); return out; } + +Tensor& nonzero_static_out_xpu( + const Tensor& self, + 
int64_t size, + int64_t fill_value, + Tensor& out) { + TORCH_CHECK( + size >= 0, "nonzero_static: 'size' must be an non-negative integer"); + TORCH_CHECK( + out.dtype() == at::kLong, + "nonzero_static: Expected out tensor to have scalar type Long"); + nonzero_common_checks(self, out, "nonzero_static"); + if (self.numel() == 0) { + out = at::full({size, self.dim()}, fill_value, out.options()); + return out; + } + + Tensor nonzero_out = at::detail::empty_xpu({0}, self.options().dtype(kLong)); + xpu::nonzero_kernel(self, nonzero_out); + auto nonzero_size = nonzero_out.size(0); + out.resize_({size, self.dim()}); + + if (nonzero_size > size) { + out.copy_(nonzero_out.narrow(0, 0, size)); + } else if (nonzero_size < size) { + auto padding = at::full({size - nonzero_size, self.dim()}, fill_value, out.options()); + out.copy_(at::cat({nonzero_out, padding}, 0)); + } else { + out.copy_(nonzero_out); + } + return out; +} + +Tensor nonzero_static_xpu( + const Tensor& self, + int64_t size, + int64_t fill_value) { + TORCH_CHECK( + size >= 0, "nonzero_static: 'size' must be an non-negative integer"); + Tensor out = at::detail::empty_xpu({size, self.dim()}, self.options().dtype(kLong)); + nonzero_static_out_xpu(self, size, fill_value, out); + return out; +} + } // namespace native } // namespace at diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml index 2df8464262..a3281791de 100644 --- a/yaml/native/native_functions.yaml +++ b/yaml/native/native_functions.yaml @@ -2071,6 +2071,15 @@ XPU: nonzero_xpu tags: [dynamic_output_shape, core] +- func: nonzero_static.out(Tensor self, *, SymInt size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!) + dispatch: + XPU: nonzero_static_out_xpu + +- func: nonzero_static(Tensor self, *, SymInt size, int fill_value=-1) -> Tensor + variants: method, function + dispatch: + XPU: nonzero_static_xpu + - func: maximum(Tensor self, Tensor other) -> Tensor structured_delegate: maximum.out device_check: NoCheck # TensorIterator From 229e8ba104e3b0a6e88c36effefa3ea914b25673 Mon Sep 17 00:00:00 2001 From: Slawomir Siwek Date: Tue, 23 Sep 2025 15:22:38 +0200 Subject: [PATCH 15/27] Stop recursive calculations in polynomial kernels if tensor has NaNs (#2062) Fixes https://github.com/intel/torch-xpu-ops/issues/1823 --- .../native/xpu/sycl/HermitePolynomialHKernel.cpp | 12 +++++------- .../native/xpu/sycl/HermitePolynomialHeKernel.cpp | 12 +++++------- .../native/xpu/sycl/LaguerrePolynomialLKernel.cpp | 2 +- .../native/xpu/sycl/LegendrePolynomialPKernel.cpp | 2 +- test/xpu/skip_list_common.py | 6 ------ 5 files changed, 12 insertions(+), 22 deletions(-) diff --git a/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp b/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp index 4862af9a8f..bffd6117b5 100644 --- a/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp +++ b/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp @@ -9,17 +9,15 @@ namespace at::native::xpu { template struct HermitePolynomialHFunctor { scalar_t operator()(scalar_t x, scalar_t n_) const { - int64_t n = static_cast(n_); + auto n = static_cast(n_); if (n < 0) { return scalar_t(0.0); - } - - if (n == 0) { + } else if (n == 0) { return scalar_t(1.0); - } - - if (n == 1) { + } else if (n == 1) { return x + x; + } else if (n > getHermitianLimit()) { + return std::numeric_limits::quiet_NaN(); } scalar_t p = scalar_t(1.0); diff --git a/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp b/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp index 6c37e87202..8ba1d8bfff 100644 --- 
a/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp +++ b/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp @@ -9,17 +9,15 @@ namespace at::native::xpu { template struct HermitePolynomialHeFunctor { scalar_t operator()(scalar_t x, scalar_t n_) const { - int64_t n = static_cast(n_); + auto n = static_cast(n_); if (n < 0) { return scalar_t(0.0); - } - - if (n == 0) { + } else if (n == 0) { return scalar_t(1.0); - } - - if (n == 1) { + } else if (n == 1) { return x; + } else if (n > getHermitianLimit()) { + return std::numeric_limits::quiet_NaN(); } scalar_t p = scalar_t(1.0); diff --git a/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp b/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp index 8688f80456..869d64492a 100644 --- a/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp +++ b/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp @@ -30,7 +30,7 @@ struct LaguerrePolynomialLFunctor { scalar_t q = scalar_t(1.0) - x; scalar_t r; - for (int64_t k = 1; k < n; k++) { + for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { r = (((k + k) + (scalar_t(1.0) - x)) * q - k * p) / (k + 1); p = q; q = r; diff --git a/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp b/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp index 2176c4e79c..caf1d8ac16 100644 --- a/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp +++ b/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp @@ -34,7 +34,7 @@ struct LegendrePolynomialPFunctor { scalar_t q = x; scalar_t r; - for (int64_t k = 1; k < n; k++) { + for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { r = ((k + k + 1) * x * q - k * p) / (k + 1); p = q; q = r; diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py index 5e70eebf44..9eeb6b44c4 100644 --- a/test/xpu/skip_list_common.py +++ b/test/xpu/skip_list_common.py @@ -74,12 +74,6 @@ "test_compare_cpu_linalg_lu_factor_ex_xpu_float32", "test_compare_cpu_linalg_lu_factor_xpu_float32", "test_compare_cpu_linalg_lu_xpu_float32", - # XPU hang. CUDA hang as well. 
- # https://github.com/pytorch/pytorch/issues/79528 - "test_compare_cpu_special_hermite_polynomial_he_xpu_float32", - "test_compare_cpu_special_hermite_polynomial_h_xpu_float32", - "test_compare_cpu_special_laguerre_polynomial_l_xpu_float32", - "test_compare_cpu_special_legendre_polynomial_p_xpu_float32", # core dump "test_dtypes__refs_nn_functional_pdist_xpu", # XFAIL of CUDA and XPU, unexpected success in fallback From 23cd5841a06e88431a35e4827b70ff35e78d6da6 Mon Sep 17 00:00:00 2001 From: Slawomir Siwek Date: Thu, 25 Sep 2025 08:10:11 +0200 Subject: [PATCH 16/27] Remove unused variables (#2044) - remove unused variables, - add compiler flag to prevent this in the future --- .github/workflows/_linux_build.yml | 1 + CMakeLists.txt | 3 +++ src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 3 --- src/ATen/native/xpu/sycl/RoiAlignKernels.cpp | 1 - 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 6fcaf5c954..6f209ae0b1 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -94,6 +94,7 @@ jobs: - name: Build Pytorch on ${{ needs.runner.outputs.hostname }} run: | export USE_XCCL=1 + export IS_XPU_CI=1 # only build pvc for CI if [ "${{ github.event_name }}" == "pull_request" ];then export TORCH_XPU_ARCH_LIST='pvc' diff --git a/CMakeLists.txt b/CMakeLists.txt index 277f1f067d..063c728411 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,9 @@ set(PROJECT_VERSION "2.3.0") # Avoid SYCL compiler error if(NOT WIN32) string(APPEND CMAKE_CXX_FLAGS " -Wno-error") + if("$ENV{IS_XPU_CI}" STREQUAL "1") + string(APPEND CMAKE_CXX_FLAGS " -Werror=unused-variable") + endif() endif() cmake_policy(SET CMP0048 NEW) diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index 92a4195d01..e4a8444c94 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -1103,9 +1103,6 @@ void _layer_norm_backward_kernel( // affecting performance and behavior. const scalar_t* dY_data = dY.const_data_ptr(); const scalar_t* X_data = X.const_data_ptr(); - weight_t* dg_data = - dgamma.defined() ? dgamma.data_ptr() : nullptr; - weight_t* db_data = dbeta.defined() ? 
dbeta.data_ptr() : nullptr; Tensor dgamma_blocks; Tensor dbeta_blocks; weight_t* dgamma_blocks_ptr = nullptr; diff --git a/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp b/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp index f0188ce0ee..869a2b6627 100644 --- a/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp +++ b/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp @@ -440,7 +440,6 @@ Tensor roi_align_kernel( at::Tensor output = at::zeros( {num_rois, channels, pooled_height, pooled_width}, input.options()); - auto output_size = num_rois * pooled_height * pooled_width * channels; if (output.numel() == 0) { return output; From 304983a659a10abe9d6946c54b50221b9299a389 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 26 Sep 2025 09:44:40 +0800 Subject: [PATCH 17/27] [CI] Login docker hub to enlarge pull limitation (#2107) --- .github/workflows/_linux_accelerate.yml | 5 +++-- .github/workflows/_linux_build.yml | 2 ++ .github/workflows/_linux_e2e.yml | 1 + .github/workflows/_linux_op_benchmark.yml | 1 + .github/workflows/_linux_transformers.yml | 1 + .github/workflows/_linux_ut.yml | 1 + .github/workflows/bisect_search.yml | 9 +++++---- 7 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.github/workflows/_linux_accelerate.yml b/.github/workflows/_linux_accelerate.yml index decf4e612f..fbaa0c3f07 100644 --- a/.github/workflows/_linux_accelerate.yml +++ b/.github/workflows/_linux_accelerate.yml @@ -48,6 +48,9 @@ concurrency: defaults: run: shell: bash {0} +env: + GH_TOKEN: ${{ github.token }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: conditions-filter: @@ -55,8 +58,6 @@ jobs: if: ${{ github.event.pull_request.draft == false }} runs-on: ubuntu-24.04 timeout-minutes: 10 - env: - GH_TOKEN: ${{ github.token }} outputs: disabled_tests: ${{ steps.check-pr-desc.outputs.disabled_tests }} steps: diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 6f209ae0b1..77caf298b4 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -31,6 +31,8 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: runner: diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index cd2056ff36..08f88462e6 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -50,6 +50,7 @@ env: GH_TOKEN: ${{ github.token }} HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: runner: diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index d71458c9bf..e8474d7ab1 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -26,6 +26,7 @@ env: GH_TOKEN: ${{ github.token }} HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} reference_issue: 1689 jobs: diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml index fc3deaf7a3..67e2082834 100644 --- a/.github/workflows/_linux_transformers.yml +++ b/.github/workflows/_linux_transformers.yml @@ -56,6 +56,7 @@ concurrency: cancel-in-progress: true env: HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} HF_HUB_ETAG_TIMEOUT: 120 
HF_HUB_DOWNLOAD_TIMEOUT: 120 python: ${{ inputs.python != '' && inputs.python || '3.10' }} diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 6da0abadd1..e895a959b7 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -33,6 +33,7 @@ env: GH_TOKEN: ${{ github.token }} HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} UT_SKIP_ISSUE: 1624 jobs: diff --git a/.github/workflows/bisect_search.yml b/.github/workflows/bisect_search.yml index c2512acb33..d45c0a83f8 100644 --- a/.github/workflows/bisect_search.yml +++ b/.github/workflows/bisect_search.yml @@ -40,6 +40,11 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + GH_TOKEN: ${{ github.token }} + HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: get_runner: @@ -74,10 +79,6 @@ jobs: TORCH_XPU_ARCH_LIST: pvc USE_XCCL: 0 USE_KINETO: 0 - env: - GH_TOKEN: ${{ github.token }} - HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 From 26ae9e71332c5f29892339e6aa626e39def363c3 Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com> Date: Thu, 25 Sep 2025 19:50:07 -0700 Subject: [PATCH 18/27] Install xpu internal headers to PyTorch (#2106) # Motivation To resolve https://github.com/intel/torch-xpu-ops/issues/2034, expose all xpu internal headers to PyTorch. --- src/ATen/CMakeLists.txt | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index 37ec7066b7..493675e804 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -1,6 +1,5 @@ # ATen XPU sources -file(GLOB xpu_h "xpu/*.h") file(GLOB xpu_cpp "xpu/*.cpp") file(GLOB xpu_mkl "native/xpu/mkl/*.cpp") file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp" "native/sparse/xpu/*.cpp" "native/nested/*.cpp" "native/nested/xpu/*.cpp" "native/transformers/*.cpp" "native/quantized/*.cpp") @@ -18,10 +17,28 @@ set(ATen_XPU_MKL_SRCS ${ATen_XPU_MKL_SRCS} PARENT_SCOPE) set(ATen_XPU_NATIVE_CPP_SRCS ${ATen_XPU_NATIVE_CPP_SRCS} PARENT_SCOPE) set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE) -foreach(HEADER ${xpu_h}) - install(FILES ${HEADER} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/ATen/xpu") -endforeach() +# ATen XPU headers -foreach(HEADER ${xpu_ops_generated_headers}) - install(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/ops) -endforeach() +macro(install_xpu_headers glob_pattern dest_subdir) + file(GLOB headers ${glob_pattern}) + if(headers) + install(FILES ${headers} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/${dest_subdir}") + endif() +endmacro() + +install_xpu_headers("xpu/*.h" "ATen/xpu") +install_xpu_headers("native/xpu/*.h" "ATen/native/xpu") +install_xpu_headers("native/xpu/sycl/*.h" "ATen/native/xpu/sycl") +install_xpu_headers("native/xpu/mkl/*.h" "ATen/native/xpu/mkl") +install_xpu_headers("native/nested/xpu/*.h" "ATen/native/nested/xpu") +install_xpu_headers("native/nested/xpu/sycl/*.h" "ATen/native/nested/xpu/sycl") +install_xpu_headers("native/quantized/*.h" "ATen/native/quantized/xpu") +install_xpu_headers("native/quantized/sycl/*.h" "ATen/native/quantized/xpu/sycl") 
+install_xpu_headers("native/sparse/xpu/*.h" "ATen/native/sparse/xpu") +install_xpu_headers("native/sparse/xpu/sycl/*.h" "ATen/native/sparse/xpu/sycl") +install_xpu_headers("native/transformers/*.h" "ATen/native/transformers/xpu") +install_xpu_headers("native/transformers/sycl/*.h" "ATen/native/transformers/xpu/sycl") + +if(xpu_ops_generated_headers) + install(FILES ${xpu_ops_generated_headers} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/ops) +endif() From eed1b8f22f2d470c9812e792520c5156947c0f27 Mon Sep 17 00:00:00 2001 From: "Cui, Yifeng" Date: Thu, 25 Sep 2025 19:55:49 -0700 Subject: [PATCH 19/27] Fix error handling for BatchLinearAlgebra Ops (#2073) This PR is to fix the access to exception vector in error handling. Original implementation may cause a segmentation fault due to out-of-bounds access. --- src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp index 29facbaa3d..26e80fa4d0 100644 --- a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp +++ b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp @@ -54,7 +54,7 @@ void error_handle( return; } - for (auto& i : ids) { + for (size_t i = 0; i < errs.size(); ++i) { try { std::rethrow_exception(errs[i]); } catch (const oneapi::mkl::lapack::exception& e) { From 9e95ad0be226f13375f90a4cd5574311611b5a37 Mon Sep 17 00:00:00 2001 From: Zhong Ruijie <109201212+RUIJIEZHONG66166@users.noreply.github.com> Date: Fri, 26 Sep 2025 13:29:49 +0800 Subject: [PATCH 20/27] Add dynamic skip template (#2115) - Add template for dynamic skip, makes it easier to create issues that require skipping disable_all --------- Co-authored-by: Wang, Chuanqi --- .github/ISSUE_TEMPLATE/dynamic-skip.yml | 63 +++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/dynamic-skip.yml diff --git a/.github/ISSUE_TEMPLATE/dynamic-skip.yml b/.github/ISSUE_TEMPLATE/dynamic-skip.yml new file mode 100644 index 0000000000..bdde3c5809 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/dynamic-skip.yml @@ -0,0 +1,63 @@ +name: ๐Ÿ› Dynamic skip +description: Create an issue to skip PR unrelated failures dynamically +title: "[Bug Skip]: " +labels: ["skipped"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/pytorch/pytorch/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: ๐Ÿ› Describe the bug with skip template + description: | + Please provide a clear and concise description of what the bug is. + The template for dynamic skip as below: + + ```python + # Template(Check in the github action summary) + Cases: + [Category],[Class name],[Test name] + ``` + + ```python + # example + Cases: + op_ut,third_party.torch-xpu-ops.test.xpu.test_transformers_xpu.TestTransformersXPU,test_scaled + ``` + + If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. + + Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. + placeholder: | + A clear and concise description of what the bug is and also align the dynamic template. 
+ ``` + # Skippped cases with dynamic template + ``` + + ```python + # Sample code to reproduce the problem + ``` + + ``` + The error message you got, with the full traceback. + ``` + validations: + required: true +- type: textarea + attributes: + label: Versions + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py + # For security purposes, please check the contents of collect_env.py before running it. + python collect_env.py + ``` + validations: + required: true +- type: markdown + attributes: + value: > + Thanks for contributing ๐ŸŽ‰! From 09edbeeb138fd8504ef42bcd3bb2402360b1b95f Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 26 Sep 2025 15:16:00 +0800 Subject: [PATCH 21/27] [CI] Modify ci test workflow (#2116) 1. Use `timm_nfnet` for CI test instead of `timm_regnet`, which has known accuracy issue https://github.com/intel/torch-xpu-ops/issues/1334 2. fix ondemand test issue disable_build disable_ut disable_distributed --- .github/actions/linux-testenv/action.yml | 2 +- .github/workflows/nightly_ondemand.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index cbf2a60a5c..a4c77e5f8a 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -77,7 +77,7 @@ runs: # apply extra PRs for stock pytorch pip install requests if [ "${{ github.event_name }}" == "pull_request" ];then - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/mengfei25/pytorch/pull/27 else python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py fi diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index d408622efe..5c6207b963 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -111,7 +111,7 @@ jobs: else test_type="build-ondemand" fi - ut="$(echo ${{ inputs.ut }} |sed 's/,/","/g;s/^/["/;s/$/"]/')" + ut="$(echo ${{ inputs.ut }} |sed -E 's/(microbench,|,microbench)//g;s/(,windows|windows,)//g;s/,/","/g;s/^/["/;s/$/"]/')" suite="$(echo ${{ inputs.suite }} |sed 's/,/","/g;s/^/["/;s/$/"]/')" triton="${{ inputs.triton }}" python="${{ inputs.python }}" From fa1e391a447223d4d659d7ade09827d5ed31c8ce Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com> Date: Sun, 28 Sep 2025 22:33:43 -0700 Subject: [PATCH 22/27] Fix unnecessary double data type conversion (#2114) # Motivation The original issue occurs on some old iGPU running the following code on Windows: ```python import torch import torch.nn.functional as F print(torch.xpu.get_device_properties()) arr = torch.rand(1, 2, 5, 5, device='xpu') pts = torch.rand(1, 3, 3, 2, device='xpu') out = F.grid_sample(arr, pts, align_corners=False) ``` The failure output is: ```bash Traceback (most recent call last): File "C:\Vesuvius\urerr\urerr.py", line 22, in out = F.grid_sample(arr, pts, align_corners=False) File "C:\Anaconda3\envs\pytn\Lib\site-packages\torch\nn\functional.py", line 5118, in grid_sample return torch.grid_sampler(input, grid, mode_enum, padding_mode_enum, align_corners) ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ RuntimeError: UR error ``` The Driver team analysis located the crash in the generated spriv IR 
code. ```asm ; Function Attrs: nounwind define internal spir_func float @_ZN2at6native3xpuL19compute_coordinatesIfEET_S3_iNS0_6detail18GridSamplerPaddingEb(float %0, i32 %1, i32 %2, i1 zeroext %3) #0 !spirv.ParameterDecorations !393 { %5 = alloca double, align 8, !spirv.Decorations !394 switch i32 %2, label %58 [ i32 1, label %6 i32 2, label %14 ] 6: ; preds = %4 %7 = sext i32 %1 to i64 %8 = add nsw i64 %7, -1, !spirv.Decorations !387 %9 = sitofp i64 %8 to float %10 = fcmp olt float %0, 0.000000e+00 %11 = select i1 %10, float 0.000000e+00, float %0 %12 = fcmp olt float %11, %9 %13 = select i1 %12, float %11, float %9 br label %58 14: ; preds = %4 br i1 %3, label %15, label %32 15: ; preds = %14 %16 = shl i32 %1, 1 %17 = add i32 %16, -2 %18 = icmp eq i32 %17, 0 br i1 %18, label %49, label %19 19: ; preds = %15 %20 = sitofp i32 %17 to float %21 = fmul float %20, 5.000000e-01 %22 = call spir_func float @_Z16__spirv_ocl_fabsf(float %0) #0 %23 = call spir_func float @fmodf(float %22, float %21) #3 %24 = fdiv float %22, %21 %25 = call spir_func float @_Z17__spirv_ocl_floorf(float %24) #0 %26 = fptosi float %25 to i32 %27 = and i32 %26, 1 %28 = icmp eq i32 %27, 0 %29 = fsub float %21, %23 %30 = select i1 %28, float %23, float %29 %31 = fadd float %30, 0.000000e+00 br label %49 32: ; preds = %14 %33 = icmp eq i32 %1, 0 br i1 %33, label %49, label %34 34: ; preds = %32 %35 = shl nsw i32 %1, 1, !spirv.Decorations !387 %36 = sitofp i32 %35 to float %37 = fmul float %36, 5.000000e-01 %38 = fadd float %0, 5.000000e-01 %39 = call spir_func float @_Z16__spirv_ocl_fabsf(float %38) #0 %40 = call spir_func float @fmodf(float %39, float %37) #3 %41 = fdiv float %39, %37 %42 = call spir_func float @_Z17__spirv_ocl_floorf(float %41) #0 %43 = fptosi float %42 to i32 %44 = and i32 %43, 1 %45 = icmp eq i32 %44, 0 %46 = fsub float %37, %40 %47 = select i1 %45, float %40, float %46 %48 = fadd float %47, -5.000000e-01 br label %49 49: ; preds = %34, %32, %19, %15 %50 = phi float [ %31, %19 ], [ 0.000000e+00, %15 ], [ %48, %34 ], [ 0.000000e+00, %32 ] %51 = sext i32 %1 to i64 %52 = add nsw i64 %51, -1, !spirv.Decorations !387 %53 = sitofp i64 %52 to float %54 = fcmp olt float %50, 0.000000e+00 %55 = select i1 %54, float 0.000000e+00, float %50 %56 = fcmp olt float %55, %53 %57 = select i1 %56, float %55, float %53 br label %58 58: ; preds = %49, %6, %4 %59 = phi float [ %13, %6 ], [ %57, %49 ], [ %0, %4 ] %60 = fptosi float %59 to i64 %61 = icmp sgt i64 %60, 2147483646 %62 = fcmp olt float %59, 0xC1E0000000000000 %63 = or i1 %61, %62 br i1 %63, label %72, label %64 64: ; preds = %58 %65 = fpext float %59 to double %66 = bitcast double* %5 to i8* call void @llvm.lifetime.start.p0i8(i64 8, i8* %66) %67 = addrspacecast double* %5 to double addrspace(4)* store double %65, double* %5, align 8 %68 = call spir_func signext i16 @_dtest(double addrspace(4)* %67) #3 %69 = bitcast double* %5 to i8* call void @llvm.lifetime.end.p0i8(i64 8, i8* %69) %70 = icmp slt i16 %68, 1 %71 = select i1 %70, float %59, float -1.000000e+02 br label %72 72: ; preds = %64, %58 %73 = phi float [ %71, %64 ], [ -1.000000e+02, %58 ] ret float %73 } ``` We can see that spirv IR code uses a double type and calls the @_dtest function in block 64. Accroding to [MSVC document](https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/floating-point-primitives?view=msvc-170#_dtest-_ldtest-_fdtest), _dtest is used to detect whether a number is `Nan` or `INFINITE`. 
This allows us to locate the root cause of the crash, which corresponds to the following C++ logic: ```cpp if (static_cast(x) > INT_MAX - 1 || x < INT_MIN || !std::isfinite(static_cast(x))) return static_cast(-100.0); return x; ``` In other words, the crash occurs when the GPU executes code that tries to convert a floating-point value (Half or BFloat16) to a double and check whether it is finite. # Solution - For Half and BFloat16, `std::isfinite(x)` promot `x` to `float`, providing enough precision for finiteness checks. Casting to double is redundant and can be safely removed. - Explicitly return `-100.0f` instead of double type. # Additional Context I can't find the iGPU that could verify the fix, but it is unlikely to introduce any additional error. --- src/ATen/native/xpu/sycl/GridSampler.h | 5 +++-- test/regressions/test_grid_sample.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 test/regressions/test_grid_sample.py diff --git a/src/ATen/native/xpu/sycl/GridSampler.h b/src/ATen/native/xpu/sycl/GridSampler.h index 33192f4cd7..cfc6283d2f 100644 --- a/src/ATen/native/xpu/sycl/GridSampler.h +++ b/src/ATen/native/xpu/sycl/GridSampler.h @@ -49,9 +49,10 @@ static inline scalar_t safe_downgrade_to_int_range(scalar_t x) { // -100.0 does not have special meaning. This is just to make sure // it's not within_bounds_2d or within_bounds_3d, and does not cause // undefined behavior. + // We avoid using double here because some platforms may not support it. if (static_cast(x) > INT_MAX - 1 || x < INT_MIN || - !std::isfinite(static_cast(x))) - return static_cast(-100.0); + !std::isfinite(x)) + return static_cast(-100.0f); return x; } diff --git a/test/regressions/test_grid_sample.py b/test/regressions/test_grid_sample.py new file mode 100644 index 0000000000..a7c28c10a9 --- /dev/null +++ b/test/regressions/test_grid_sample.py @@ -0,0 +1,19 @@ +# Owner(s): ["module: intel"] +import torch +import torch.nn.functional as F +from torch.testing._internal.common_utils import TestCase + +cpu_device = torch.device("cpu") +xpu_device = torch.device("xpu") + + +class TestSimpleCopy(TestCase): + # Refer to https://github.com/pytorch/pytorch/issues/153996 + def test_grid_sample(self, dtype=torch.float): + input_cpu = torch.rand(1, 2, 5, 5, device=cpu_device) + grid_cpu = torch.rand(1, 3, 3, 2, device=cpu_device) + out_cpu = F.grid_sample(input_cpu, grid_cpu, align_corners=False) + input_xpu = input_cpu.to(xpu_device) + grid_xpu = grid_cpu.to(xpu_device) + out_xpu = F.grid_sample(input_xpu, grid_xpu, align_corners=False) + self.assertEqual(out_cpu, out_xpu.to(cpu_device)) From d5a81e058f1fd5abf2f77101c11d375a5dadb8a1 Mon Sep 17 00:00:00 2001 From: "Cui, Yifeng" Date: Sun, 28 Sep 2025 22:58:44 -0700 Subject: [PATCH 23/27] Fix overflow when calculating workgroups count (#2104) To fix #2070. This PR updates several SYCL kernel launch functions in `src/ATen/native/xpu/sycl/Loops.h` to use `int64_t` for workgroup size and number of workgroups calculations. This change prevents overflow issues when handling large tensor sizes. 
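A small standalone sketch of the overflow (hypothetical work-group size; `ceil_div` here is a local helper, not the ATen one): once a kernel has to cover 2**31 or more work-items, the product of the work-group size and the work-group count no longer fits in a 32-bit `int`, so doing that arithmetic in `int` wraps, while `int64_t` keeps the launched global range exact.

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

// Local helper for the sketch.
static int64_t ceil_div(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}

int main() {
  const int64_t N = (1LL << 31) - 1;  // 2**31 - 1 elements, as in test_zeros_large
  const int64_t wg_sz = 1024;         // hypothetical work-group size

  const int64_t num_wg = ceil_div(N, wg_sz);    // 2097152
  const int64_t global_range = wg_sz * num_wg;  // 2147483648 == 2**31

  std::cout << "global range: " << global_range << "\n";
  std::cout << "fits in int32: "
            << (global_range <= std::numeric_limits<int>::max()) << "\n";  // prints 0
  // If wg_sz and num_wg were plain `int`, wg_sz * num_wg would overflow here,
  // which is exactly what the switch to int64_t in Loops.h avoids.
  return 0;
}
```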
--- src/ATen/native/xpu/sycl/Loops.h | 26 ++++++++++++------------ test/xpu/test_tensor_creation_ops_xpu.py | 8 ++++++++ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/ATen/native/xpu/sycl/Loops.h b/src/ATen/native/xpu/sycl/Loops.h index 8c4fbbdef5..05a2f6b468 100644 --- a/src/ATen/native/xpu/sycl/Loops.h +++ b/src/ATen/native/xpu/sycl/Loops.h @@ -314,8 +314,8 @@ static void launch_legacy_group_range_kernel(int64_t N, const func_t& f) { auto ker = ElementwiseGroupRangeKernel(N, f); - int wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, wg_sz * vec_size); + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, wg_sz * vec_size); sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker); } @@ -328,9 +328,9 @@ static void launch_legacy_global_range_kernel(int64_t N, const func_t& f) { auto ker = ElementwiseGlobalRangeKernel(N, f); - int wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, wg_sz); - int hw_max_num_wg = syclMaxWorkItemsPerTile() / wg_sz; + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, wg_sz); + int64_t hw_max_num_wg = syclMaxWorkItemsPerTile() / wg_sz; num_wg = num_wg > hw_max_num_wg ? hw_max_num_wg : num_wg; sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker); } @@ -355,8 +355,8 @@ static inline void launch_unrolled_kernel( auto ker = UnrolledElementwiseKernel(N, f, data, ic, oc, l, s); using ker_t = decltype(ker); - auto wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, wg_sz * ker_t::item_work_size); + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, wg_sz * ker_t::item_work_size); sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker); } @@ -393,13 +393,13 @@ static inline void launch_vectorized_kernel( #define VEC_KER(vec_size) \ { \ - TORCH_CHECK(max_scalar_bytes* vec_size <= 16); \ + TORCH_CHECK(max_scalar_bytes * vec_size <= 16); \ if constexpr (max_scalar_bytes * vec_size <= 16) { \ auto ker = \ VectorizedElementwiseKernel( \ N, f, data, input_calc); \ - int num_wg = ceil_div(N, wg_sz * vec_size); \ - sycl_kernel_submit(wg_sz* num_wg, wg_sz, getCurrentSYCLQueue(), ker); \ + int64_t num_wg = ceil_div(N, wg_sz * vec_size); \ + sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker); \ } \ } @@ -426,7 +426,7 @@ static inline void launch_vectorized_kernel( N, f, data, input_calc, output_calc, loader, storer); using ker_t = decltype(ker); - int num_wg = ceil_div(N, wg_sz * ker_t::item_work_size); + int64_t num_wg = ceil_div(N, wg_sz * ker_t::item_work_size); sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker); break; } @@ -457,8 +457,8 @@ static inline void launch_unrolled_kernel_for_multi_outputs( out_calc_t>(N, f, data, ic, oc); using ker_t = decltype(ker); - int wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, ker_t::item_work_size * wg_sz); + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, ker_t::item_work_size * wg_sz); sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), ker); } diff --git a/test/xpu/test_tensor_creation_ops_xpu.py b/test/xpu/test_tensor_creation_ops_xpu.py index 95a30716ea..077fbb6ad5 100644 --- a/test/xpu/test_tensor_creation_ops_xpu.py +++ b/test/xpu/test_tensor_creation_ops_xpu.py @@ -4371,6 +4371,14 @@ def test_full_like_inference(self, device): torch.full_like(like, 1.0, dtype=torch.complex64).dtype, torch.complex64 ) + 
@dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_zeros_large(self, device, dtype): + output = torch.zeros(2**31 - 1, device=device, dtype=dtype) + + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_ones_large(self, device, dtype): + output = torch.ones(2**31 - 1, device=device, dtype=dtype) + # Tests for the `frombuffer` function (only work on CPU): # Constructs tensors from Python objects that implement the buffer protocol, From f301733b03758ccd67642d2c202f2d589bd231a4 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 30 Sep 2025 09:30:05 +0800 Subject: [PATCH 24/27] Fix segmentation fault and calculation error in AveragePool2dKernel (#2091) Fixed the following issues found by test/test_nn.py::TestNNDeviceTypeXPU::test_avg_pool_large_tensor2_xpu 1. A segmentation fault caused by a data type conversion error that invalidated the memory address. 2. A calculation error caused by data overflow. --------- Co-authored-by: Cui, Yifeng --- .../native/xpu/sycl/AveragePool2dKernels.cpp | 67 +++++++++---------- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp index 4caa4a7ead..cf856d6177 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -25,9 +26,7 @@ inline int max(int a, int b) { template struct AvgPool2dKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - - if (index < total_elements_) { + XPU_KERNEL_LOOP(item, index, total_elements_) { const int pw = index % pooled_width_; const int ph = (index / pooled_width_) % pooled_height_; const int c = (index / pooled_width_ / pooled_height_) % channels_; @@ -73,19 +72,19 @@ struct AvgPool2dKernelFunctor { AvgPool2dKernelFunctor( scalar_t* top_data, const scalar_t* bottom_data, - index_t total_elements, - index_t channels, - index_t height, - index_t width, - int pooled_height, - int pooled_width, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - int pad_h, - int pad_w, - int divisor_override, + const int total_elements, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t pooled_height, + const int pooled_width, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int divisor_override, bool count_include_pad, bool use_divisor) : top_data_(top_data), @@ -109,19 +108,19 @@ struct AvgPool2dKernelFunctor { private: scalar_t* top_data_; const scalar_t* bottom_data_; - index_t total_elements_; - index_t channels_; - index_t height_; - index_t width_; - int pooled_height_; - int pooled_width_; - int kernel_h_; - int kernel_w_; - int stride_h_; - int stride_w_; - int pad_h_; - int pad_w_; - int divisor_override_; + const int total_elements_; + const int64_t channels_; + const int64_t height_; + const int64_t width_; + const int64_t pooled_height_; + const int pooled_width_; + const int kernel_h_; + const int kernel_w_; + const int stride_h_; + const int stride_w_; + const int pad_h_; + const int pad_w_; + const int divisor_override_; bool count_include_pad_; bool use_divisor_; }; @@ -129,9 +128,7 @@ struct AvgPool2dKernelFunctor { template struct AvgPool2dChannelsLastKernelFunctor { void 
operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - - if (index < total_elements_) { + XPU_KERNEL_LOOP(item, index, total_elements_) { const int c = index % channels_; const int pw = (index / channels_) % pooled_width_; const int ph = (index / channels_ / pooled_width_) % pooled_height_; @@ -327,8 +324,7 @@ void launch_avg_pool2d_kernel( template struct AvgPool2dChannelsLastBackwardKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - if (index < total_elements_) { + XPU_KERNEL_LOOP_TYPE(item, index, total_elements_, index_t) { const int c = index % channels_; const int w = (index / channels_) % width_ + pad_w_; const int h = (index / channels_ / width_) % height_ + pad_h_; @@ -431,8 +427,7 @@ struct AvgPool2dChannelsLastBackwardKernelFunctor { template struct AvgPool2dBackwarKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - if (index < total_elements_) { + XPU_KERNEL_LOOP_TYPE(item, index, total_elements_, index_t) { // find out the local index // find out the local offset const int w = index % width_ + pad_w_; From 086f20a0ce7e090ab403f23b35c14267752bab91 Mon Sep 17 00:00:00 2001 From: Frost Mitchell Date: Thu, 2 Oct 2025 20:08:03 -0400 Subject: [PATCH 25/27] Fix test_barrier hang by using static global rank in ProcessGroupXCCL (#2036) Fixes #1978 In ProcessGroupNCCL, `globalRank()` returns a static int globalRank, which is first initialized by the ProcessGroup setup, so the globalRank assigned to each thread matches the id of the CUDA device. However, we were not using this same pattern for XCCL. Instead, we were just using the assigned rank of the thread, which were not necessarily the same as the globalRank. The failing test `test_barrier` created two separate groups of 2 ranks each, and then 4 threads called barrier, but all on the same 2-thread group. Since initially the device id is not specified in this barrier call, the thread attempts to "guess" the device index. In the previous code, this guess would be 0 or 1, since the rank of each thread was not actually the globalRank. In `barrier`, this guessed id was used to initialize XCCLComm objects, and then call allreduce on a single element tensor. However, this resulted in an allreduce call two times on each device, which could result in a hang based on the execution order of the 4 threads. With the update in this PR, PGXCCL now uses the static globalRank in the same places as ProcessGroupNCCL, so the initialized XCCLComm objects are for unique devices and allreduce doesn't call on the same device multiple times. 
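A stripped-down illustration of the pattern described above (hypothetical `FakeBackend` class, not the real ProcessGroupXCCL API): a function-local static is initialized exactly once per process, by whichever backend instance calls it first, normally the default (world) group, so later sub-groups reuse the process-wide rank instead of their 0-based sub-group rank when a device index has to be guessed.

```cpp
#include <iostream>

class FakeBackend {
 public:
  explicit FakeBackend(int rank) : rank_(rank) {}

  int globalRank() const {
    static int global_rank = rank_;  // bound on the first call in this process
    return global_rank;
  }

  int guessDeviceId(int num_gpus) const {
    // Using rank_ here would map global ranks 2 and 3 (sub-group ranks 0 and 1)
    // back onto devices 0 and 1, colliding with the other sub-group.
    return globalRank() % num_gpus;
  }

 private:
  int rank_;
};

int main() {
  // Pretend this process hosts global rank 2, which sits at rank 0 of its
  // 2-rank sub-group.
  FakeBackend world(/*rank=*/2);      // default group: rank == global rank
  FakeBackend subgroup(/*rank=*/0);   // sub-group created afterwards

  std::cout << world.guessDeviceId(4) << "\n";     // 2
  std::cout << subgroup.guessDeviceId(4) << "\n";  // 2, not 0
  return 0;
}
```

With four ranks and two 2-rank sub-groups, that difference is what keeps two threads from initializing communicators on the same device in `test_barrier`.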
--- src/xccl/ProcessGroupXCCL.cpp | 11 ++++++++--- src/xccl/ProcessGroupXCCL.hpp | 1 + src/xccl/ProcessGroupXCCLMonitor.cpp | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index 514cdaf3a7..aca5a928aa 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -352,6 +352,11 @@ const std::string& ProcessGroupXCCL::logPrefix() const { return logPrefix_; } +const int& ProcessGroupXCCL::globalRank() const { + static int globalRank = rank_; + return globalRank; +} + ProcessGroupXCCL::ProcessGroupXCCL( c10::intrusive_ptr store, int rank, @@ -379,7 +384,7 @@ ProcessGroupXCCL::ProcessGroupXCCL( std::string torch_distributed_debug = getCvarString({"TORCH_DISTRIBUTED_DEBUG"}, OFF.c_str()); LOG(INFO) << logPrefix() << "ProcessGroupXCCL initialization options: " - << "size: " << size << ", global rank: " << rank_ + << "size: " << size << ", global rank: " << globalRank() << ", USE_HIGH_PRIORITY_STREAM: " << options_->is_high_priority_stream << ", PG Name: " << options_->group_name; @@ -410,7 +415,7 @@ bool ProcessGroupXCCL::dumpDebuggingInfo(bool includeStackTrace /*=true*/) { if (traceBufferSize_ > 0) { // TODO: dump_xccl_trace auto xcclTrace = dump_xccl_trace(true, includeStackTrace, false); - DebugInfoWriter& writer = DebugInfoWriter::getWriter(rank_); + DebugInfoWriter& writer = DebugInfoWriter::getWriter(globalRank()); LOG(INFO) << logPrefix() << "ProcessGroupXCCL dumping xccl trace to " << writer.getWriterTarget(); writer.write(xcclTrace); @@ -2021,7 +2026,7 @@ c10::DeviceIndex ProcessGroupXCCL::guessDeviceId() const { return *usedDeviceIdxs_.begin(); } int devIdx = - static_cast(rank_ % at::detail::getXPUHooks().getNumGPUs()); + static_cast(globalRank() % at::detail::getXPUHooks().getNumGPUs()); LOG(WARNING) << logPrefix() << c10::str( diff --git a/src/xccl/ProcessGroupXCCL.hpp b/src/xccl/ProcessGroupXCCL.hpp index 8e4604fbfb..7074354c64 100644 --- a/src/xccl/ProcessGroupXCCL.hpp +++ b/src/xccl/ProcessGroupXCCL.hpp @@ -423,6 +423,7 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::DeviceIndex guessDeviceId() const; const std::vector& groupRanks() const; + const int& globalRank() const; void setEnqueuedPgStatus(c10::intrusive_ptr work); bool dumpDebuggingInfo(bool includeStackTrace = true); diff --git a/src/xccl/ProcessGroupXCCLMonitor.cpp b/src/xccl/ProcessGroupXCCLMonitor.cpp index 68fdd402c1..246bdf3a59 100644 --- a/src/xccl/ProcessGroupXCCLMonitor.cpp +++ b/src/xccl/ProcessGroupXCCLMonitor.cpp @@ -39,7 +39,7 @@ void HeartbeatMonitorXCCL::runLoop() { // We only need to dump once per PG, so we use local_id_ == 0 for the first PG if (pg_->local_id_ == 0) { // DumpPipe is one per-trainer process - dumpPipe.emplace(pg_->getRank()); + dumpPipe.emplace(pg_->globalRank()); while (true) { std::unique_lock lock(monitorMutex_); if (monitorWakeUpCV_.wait_for( From cae6ba3511ed03df00e245eb8f72ebef6c06dbfa Mon Sep 17 00:00:00 2001 From: Frost Mitchell Date: Fri, 3 Oct 2025 11:42:52 -0400 Subject: [PATCH 26/27] Add FlightRecorder tests (#1971) As a follow-up to #1867 , this PR includes tests for the FlightRecorder on XCCL, as well as moving some definitions from ProcessGroupXCCL::Options to Backend::Options. 
These tests are largely based on `pytorch/test/distributed/test_c10d_nccl.py`, but doesn't include some tests: - `test_short_json` since json dumps are not supported in ProcessGroupXCCL - `test_trace_while_all_works_retired`: `_wait_for_pending_works` isn't supported by XCCL - `test_trace_while_active`: XCCL hangs when op is called on only one rank - `test_trace_while_stuck`: XCCL hangs when op is called on only one rank --------- Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com> --- src/xccl/ProcessGroupXCCL.cpp | 51 ++- src/xccl/ProcessGroupXCCL.hpp | 3 +- test/xpu/distributed/test_c10d_xccl.py | 565 ++++++++++++++++++++++++- 3 files changed, 598 insertions(+), 21 deletions(-) diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index aca5a928aa..ba25869a2a 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -459,7 +459,8 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( bool isP2P, const char* profilingTitle, const std::vector& inputs, - const std::vector& outputs) { + const std::vector& outputs, + bool record) { auto r = c10::make_intrusive( device, rank, @@ -470,20 +471,22 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( profilingTitle != nullptr ? std::optional>(inputs) : std::nullopt); - r->trace_id_ = FlightRecorderXCCL::get()->record( - local_id_, - std::make_tuple(pg_uid_, pg_desc_), // PG name tuple - seqCollective_, - seqP2P_, - op_id_, - profilingTitle ? profilingTitle : "", - inputs, - outputs, - nullptr, - r->xcclEndEvent_.get(), - options_->timeout, - pgStatus_, - isP2P); + if (record) { + r->trace_id_ = FlightRecorderXCCL::get()->record( + local_id_, + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + seqCollective_, + seqP2P_, + op_id_, + profilingTitle ? 
profilingTitle : "", + inputs, + outputs, + nullptr, + r->xcclEndEvent_.get(), + options_->timeout, + pgStatus_, + isP2P); + } return r; } @@ -664,16 +667,19 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( const char* profilingTitle, bool nanCheck) { nanCheck &= enableNanCheck_; - seqCollective_++; auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); + if (!coalescing_state_) { + seqCollective_++; + } + op_id_++; + if (coalescing_state_ & CoalActive) { if ((coalescing_state_ & CoalColl) == 0) { seqCollective_++; } - op_id_++; coalescing_state_ |= CoalColl; if (coalescedDevice_.index() < 0) { coalescedDevice_ = device; @@ -714,8 +720,15 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( } c10::intrusive_ptr work; - work = - initWork(device, rank_, opType, false, profilingTitle, inputs, outputs); + work = initWork( + device, + rank_, + opType, + false, + profilingTitle, + inputs, + outputs, + !coalescing_state_); if (coalescing_state_) { FlightRecorderXCCL::get()->record( local_id_, diff --git a/src/xccl/ProcessGroupXCCL.hpp b/src/xccl/ProcessGroupXCCL.hpp index 7074354c64..bbc1f7cca8 100644 --- a/src/xccl/ProcessGroupXCCL.hpp +++ b/src/xccl/ProcessGroupXCCL.hpp @@ -180,7 +180,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { bool isP2P, const char* profilingTitle = nullptr, const std::vector& inputs = {}, - const std::vector& outputs = {}); + const std::vector& outputs = {}, + bool record = false); template c10::intrusive_ptr collective( diff --git a/test/xpu/distributed/test_c10d_xccl.py b/test/xpu/distributed/test_c10d_xccl.py index 57b33ae29d..062b502781 100644 --- a/test/xpu/distributed/test_c10d_xccl.py +++ b/test/xpu/distributed/test_c10d_xccl.py @@ -1,16 +1,21 @@ # Owner(s): ["oncall: distributed"] +import json import math import os +import pickle import random import signal import sys +import tempfile +import threading import time -from datetime import timedelta +from datetime import datetime, timedelta from enum import auto, Enum from unittest import mock import torch +import torch._C._distributed_c10d import torch.distributed as c10d import torch.distributed._functional_collectives as _functional_collectives @@ -733,6 +738,564 @@ def test_wait_tensor(self) -> None: self.assertEqual(input1, input2) +class XCCLTraceTestBase(MultiProcessTestCase): + def setUp(self): + super().setUp() + os.environ["TORCH_FR_BUFFER_SIZE"] = "1000" + self.tempdir = tempfile.TemporaryDirectory() + os.environ["TORCH_FR_DUMP_TEMP_FILE"] = self._trace_basename() + os.environ["TORCH_FR_DEBUG_INFO_PIPE_FILE"] = self._trace_basename() + self._spawn_processes() + + @classmethod + def _run( + cls, + parent_conn, + rank: int, + test_name: str, + file_name: str, + parent_pipe, + **kwargs, + ) -> None: + cls.parent = parent_conn + super()._run(rank, test_name, file_name, parent_pipe) + + @property + def local_device(self): + return torch.device("xpu", self.rank_to_GPU[self.rank][0]) + + def _join_processes(self, fn): + # We need to patch sys.exit() as skip_if will use sys.exit() and + # the exit code from the this process will not be caught. 
+ with mock.patch("sys.exit"): + fn() + super()._join_processes(fn) + + def _spawn_processes(self) -> None: + proc = torch.multiprocessing.get_context("spawn").Process + self.children_pipes = [] + parent_pipes = [] + for _ in range(self.world_size): + parent_conn, child_conn = torch.multiprocessing.Pipe() + self.children_pipes.append(child_conn) + parent_pipes.append(parent_conn) + piter = iter(parent_pipes) + + def wrap(*positional, args, **kwargs): + args = (next(piter), *args) + return proc(*positional, args=args, **kwargs) + + self._start_processes(wrap) + + def _create_process_group_xccl( + self, timeout=timedelta(seconds=600), device_id=None + ): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + "xccl", + world_size=self.world_size, + rank=self.rank, + store=store, + timeout=timeout, + device_id=device_id, + ) + pg = c10d.distributed_c10d._get_default_group() + return pg + + def tearDown(self): + super().tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + @property + def world_size(self): + return 2 + + @property + def rank_to_GPU(self): + # return rank to GPU map + return init_multigpu_helper(self.world_size, "xccl") + + def _trace_basename(self): + # we pass the base to the env, and the dump util will append rank + return os.path.join(self.tempdir.name, "trace_") + + def _trace_name(self, rank): + return self._trace_basename() + str(rank) + + def started_or_scheduled(self, timing_enabled=False): + return "started" if timing_enabled else "scheduled" + + +class XCCLTraceTest(XCCLTraceTestBase): + def _verify_trace(self, t, include_collectives, is_json, timing_enabled=False): + ver = t["version"] + self.assertEqual(ver, "2.10") + comm_lib_version = t["comm_lib_version"] + torch_comm_lib_version = torch._C._distributed_c10d.get_xccl_version() + self.assertEqual(comm_lib_version, torch_comm_lib_version) + pg_config = t["pg_config"] + self.assertEqual(len(pg_config), 1) + default_pg_info = pg_config["0"] + self.assertIn("name", default_pg_info) + self.assertIn("desc", default_pg_info) + self.assertIn("ranks", default_pg_info) + pg_status = t["pg_status"] + self.assertEqual(len(pg_status), 1) + self.assertEqual(str(pg_status["0"]["last_enqueued_collective"]), "2") + self.assertEqual(str(pg_status["0"]["last_completed_collective"]), "2") + self.assertEqual( + str(pg_status["0"]["last_started_collective"]), + "2" if timing_enabled else "-1", + ) + global_ranks = pg_config["0"]["ranks"] + self.assertEqual(len(json.loads(global_ranks)), self.world_size) + if include_collectives: + self.assertEqual(len(t["entries"]), 2) + t = t["entries"] + last = t[-1] + self.assertEqual(last["thread_id"], str(threading.current_thread().ident)) + self.assertEqual(last["thread_name"], "fr_test_thread") + self.assertEqual(last["process_group"], ("0", "default_pg")) + # self.assertEqual(last["state"], "completed") # Watchdog will fix marking works completed + s = last["time_discovered_started_ns"] + f = last["time_discovered_completed_ns"] + self.assertEqual(last["record_id"], 1) + # self.assertIsNotNone(f) + if timing_enabled: + self.assertIsNotNone(s) + self.assertTrue(s <= f) + # we don't collect stack traces in JSON at the moment + if not is_json: + self.assertIn("test_c10d_xccl.py", str(last["frames"])) + self.assertEqual(last["input_sizes"], ((3, 4),)) + self.assertEqual(last["input_dtypes"], ["Float"]) + self.assertEqual(last["output_sizes"], ((3, 4),)) + self.assertEqual(last["output_dtypes"], ["Float"]) + 
self.assertEqual(last["collective_seq_id"], 2) + self.assertEqual(last["timeout_ms"], 600000) + now = datetime.now() + event_created_time = datetime.fromtimestamp( + last["time_created_ns"] / 1000000000 + ) + before_test = now - timedelta(minutes=1) + self.assertTrue(before_test < event_created_time < now) + if timing_enabled: + # very loose bounds, measured 0.036 ms on devgpu + self.assertTrue(0 < last["duration_ms"] < 100) + else: + self.assertTrue("duration_ms" not in last) + else: + self.assertTrue("entries" not in t) + + def load_libpthread_or_libc(self): + import ctypes.util + + for base in ("pthread", "c"): + path = ctypes.util.find_library(base) + if path: + try: + return ctypes.CDLL(path) + except OSError: + continue + raise RuntimeError("Could not load pthread or libc") + + # Directly set thread name using threading.current_thread().name does not work + # because we use pthread_getname_np to get the threadโ€™s OS-level name in C++ + def set_thread_name(self, name): + import ctypes + + lib = self.load_libpthread_or_libc() + pthread_self = lib.pthread_self + pthread_self.restype = ctypes.c_void_p + pthread_setname_np = lib.pthread_setname_np + pthread_setname_np.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + + # Get current pthread handle + tid = pthread_self() + + # Set name + pthread_setname_np(tid, name.encode()) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize("include_collectives", [True, False]) + def test_short_pickle(self, include_collectives, timing_enabled=False): + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + device = self.local_device + self.set_thread_name("fr_test_thread") + a = torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + # gah ok so now the duration_ms is populated best-effort since it can only happen outside "dump()" api + time.sleep(1) + t = pickle.loads( + torch._C._distributed_c10d._dump_xccl_trace( + includeCollectives=include_collectives + ) + ) + self._verify_trace( + t, + include_collectives=include_collectives, + is_json=True, + timing_enabled=timing_enabled, + ) + dist.destroy_process_group() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_dump_pipe(self): + def open_file_with_timeout(file_path, mode, timeout=1.0): + start_time = time.time() + while time.time() - start_time < timeout: + if os.path.exists(file_path): + return open(file_path, mode) + time.sleep(0.1) + raise FileNotFoundError + + if self.rank == self.MAIN_PROCESS_RANK: + for c in self.children_pipes: + self.assertEqual(c.recv(), "next") + + dump_file = self._trace_name(rank=0) + pipe_file = dump_file + ".pipe" + with open_file_with_timeout(pipe_file, "w") as f: + f.write("1\n") + with open_file_with_timeout(dump_file, "rb", timeout=10.0) as f: + self.assertTrue("all_reduce" in str(pickle.load(f))) + + for c in self.children_pipes: + c.send("next") + return + + pg = self._create_process_group_xccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + self.parent.send("next") + self.parent.recv() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_long(self): + os.environ["TORCH_FR_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + device = self.local_device + a = torch.full((3, 4), 
float(self.rank), device=device) + for _ in range(2): + # test some other primitives to make sure + # their strings are valid + xs = [torch.ones(3, 4, device=device)] + pg.broadcast(xs).wait() + pg.allreduce(xs).wait() + # pg.reduce(xs).wait() //Currently failing on XPU + ys = [[torch.empty(3, 4, device=device) for _ in range(self.world_size)]] + pg.allgather(ys, xs).wait() + pg.reduce_scatter(xs, ys).wait() + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 10) + first = t[0] + last = t[-1] + self.assertEqual(last["profiling_name"], "xccl:all_reduce") + # self.assertEqual(last["state"], "completed") # Watchdog will fix marking works completed + self.assertIn("test_c10d_xccl.py", str(last["frames"])) + self.assertEqual(last["input_sizes"], ((3, 4),)) + self.assertEqual(last["input_dtypes"], ["Float"]) + self.assertEqual(last["output_sizes"], ((3, 4),)) + self.assertEqual(last["output_dtypes"], ["Float"]) + self.assertEqual(last["timeout_ms"], 600000) + self.assertEqual(last["collective_seq_id"] - first["collective_seq_id"], 9) + dist.destroy_process_group() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_barrier_profiling(self): + os.environ["TORCH_FR_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + f = pg.barrier() + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 2) + first = t[0] + last = t[-1] + self.assertEqual(first["profiling_name"], "xccl:all_reduce_barrier") + self.assertEqual(last["profiling_name"], "xccl:all_reduce") + dist.destroy_process_group() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize( + "op_sizes_per_coalesce", + [ + [(2, 3)], + [(2, 3), (5, 5), (1,)], + ], + ) + @parametrize("timing_enabled", [False]) + def test_batched_send_recv(self, op_sizes_per_coalesce, timing_enabled): + """ + 'WorkEnqueue' was skipped for isendirecv, leading to segfault on dump_entries when update_state tried to use + a destructed Work obj's xpu events + """ + + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + + num_coalesced_ops = 20 + ops_per_coalesce = len(op_sizes_per_coalesce) + for _ in range(num_coalesced_ops): + ops = [] + for input_sizes in op_sizes_per_coalesce: + tensor = torch.zeros(input_sizes).to(self.local_device) + if self.rank == 0: + ops.append(dist.P2POp(dist.irecv, tensor, 1)) + elif self.rank == 1: + tensor *= 2 + ops.append(dist.P2POp(dist.isend, tensor, 0)) + + dist.batch_isend_irecv(ops).pop().wait() + + torch.xpu.synchronize(device=self.local_device) + + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + self.assertEqual(len(t["entries"]), num_coalesced_ops * ops_per_coalesce) + + expected_record_id = 0 + expected_seq = 1 + expected_op_id = 1 + for seq in range(num_coalesced_ops): + first_op = seq * (ops_per_coalesce) + coalesced_op = first_op + ops_per_coalesce + for p2p_op_idx, input_sizes in zip( + range(first_op, coalesced_op, 1), op_sizes_per_coalesce + ): + # the indivudal ops inside the coalescing group the individual op 
metadata, + # but not the timing info coming from the actual coalesced kernel + profiling_name = ( + "xccl:recv 0<-1" if self.rank == 0 else "xccl:send 1->0" + ) + self.assertEqual( + t["entries"][p2p_op_idx]["record_id"], expected_record_id + ) + expected_record_id += 1 + self.assertEqual( + t["entries"][p2p_op_idx]["profiling_name"], profiling_name + ) + # we don't increment collective_seq_id for p2p ops. + self.assertEqual(t["entries"][p2p_op_idx]["collective_seq_id"], 0) + self.assertEqual(t["entries"][p2p_op_idx]["p2p_seq_id"], expected_seq) + self.assertEqual(t["entries"][p2p_op_idx]["op_id"], expected_op_id) + expected_op_id += 1 + self.assertEqual(t["entries"][p2p_op_idx]["input_sizes"], [input_sizes]) + self.assertEqual( + t["entries"][p2p_op_idx]["output_sizes"], [input_sizes] + ) + # duration doesn't get tagged onto individual ops yet, nor is their state updated + self.assertEqual(t["entries"][p2p_op_idx]["state"], "scheduled") + self.assertTrue("duration_ms" not in t["entries"][p2p_op_idx]) + + # coalesced ops not yet supported in FR + expected_seq += 1 + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize( + "op_sizes", + [ + [(2, 3)], + [(2, 3), (5, 5), (1,)], + ], + ) + @parametrize("timing_enabled", [False]) + def test_individual_send_recv(self, op_sizes, timing_enabled): + """ + 'WorkEnqueue' was skipped for isendirecv, leading to segfault on dump_entries when update_state tried to use + a destructed Work obj's xpu events + """ + + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + num_repeats = 10 + ops_per_repeat = len(op_sizes) + for _ in range(num_repeats): + for input_sizes in op_sizes: + tensor = torch.zeros(input_sizes).to(self.local_device) + if self.rank == 0: + dist.recv(tensor, 1) + elif self.rank == 1: + tensor *= 2 + dist.send(tensor, 0) + + torch.xpu.synchronize(device=self.local_device) + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + self.assertEqual(len(t["entries"]), num_repeats * (ops_per_repeat)) + expected_seq = 1 + expected_op_id = 1 + for seq in range(num_repeats * ops_per_repeat): + input_sizes = op_sizes[seq % ops_per_repeat] + profiling_name = "xccl:recv 0<-1" if self.rank == 0 else "xccl:send 1->0" + self.assertEqual(t["entries"][seq]["profiling_name"], profiling_name) + # we don't increment collective_seq_id for p2p ops. 
+ self.assertEqual(t["entries"][seq]["collective_seq_id"], 0) + self.assertEqual(t["entries"][seq]["p2p_seq_id"], expected_seq) + expected_seq += 1 + self.assertEqual(t["entries"][seq]["op_id"], expected_op_id) + expected_op_id += 1 + self.assertEqual(t["entries"][seq]["input_sizes"], [input_sizes]) + self.assertEqual(t["entries"][seq]["output_sizes"], [input_sizes]) + self.assertEqual(t["entries"][seq]["state"], "completed") + + if timing_enabled: + duration = t["entries"][seq]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][seq]) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize("timing_enabled", [False]) + def test_allgather_uneven(self, timing_enabled): + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + + output_split_sizes = [i + 1 for i in range(self.world_size)] + sum_len = sum(output_split_sizes) + output_tensor = torch.zeros(sum_len, 2).to(self.rank) + expected_tensor = torch.ones(sum_len, 2).to(self.rank) + input_tensor = torch.ones(output_split_sizes[self.rank], 2).to(self.rank) + + dist.all_gather( + list(torch.split(output_tensor, output_split_sizes)), input_tensor + ) + torch.xpu.synchronize(device=self.rank) + self.assertEqual(output_tensor, expected_tensor) + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + self.assertEqual(len(t["entries"]), self.world_size) + for i in range(self.world_size): + self.assertEqual(t["entries"][i]["profiling_name"], "xccl:_broadcast_oop") + # collective_seq_id should be incremented once. + self.assertEqual(t["entries"][i]["collective_seq_id"], 1) + self.assertEqual(t["entries"][i]["input_sizes"], [[i + 1, 2]]) + self.assertEqual( + t["entries"][i]["output_sizes"], + [[i + 1, 2]], + ) + self.assertEqual(t["entries"][i]["state"], "scheduled") + # No event is recorded for individual ops + self.assertTrue("time_discovered_completed_ns" in t["entries"][i]) + # TODO: (frost-intel) Add coalesced op recording for FR + + # TODO(whc) test out other ops (And combinations of ops, if that's valid?) + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize("timing_enabled", [False]) + def test_coalescing_manager_collective(self, timing_enabled): + """ + The coalescing manager api works by accumulating operations in python via a contextmanager, and then making + one call into c++ to an _coalesced API. It has limited support for ops and has been added recently to + avoid overheads of making individual py-cpp calls. This complicates flight recording.. + + For now, flight recording of coalescing_manager collectives is less detailed than cpp coalesced collectives. 
+ """ + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + + output_tensors = torch.zeros(2, 2).to(self.rank) + input_tensors = [torch.ones(2, 2).to(self.rank) for _ in range(self.world_size)] + + # TODO(whc) make this work with bigger world or something + self.assertEqual(self.world_size, 2, self.world_size) + + with dist._coalescing_manager(): + for i in range(self.world_size): + dist.reduce_scatter_tensor(output_tensors[i], input_tensors[i]) + self.assertEqual(output_tensors, input_tensors[self.rank] * self.world_size) + + torch.xpu.synchronize(device=self.rank) + + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + + self.assertEqual( + len(t["entries"]), 1 + ) # one for the reduce_scatter_tensor_coalesced + self.assertEqual( + t["entries"][0]["profiling_name"], "xccl:reduce_scatter_tensor_coalesced" + ) + # collective_seq_id should be incremented once. + self.assertEqual(t["entries"][0]["collective_seq_id"], 1) + self.assertEqual(t["entries"][0]["input_sizes"], [[2, 2], [2, 2]]) + self.assertEqual( + t["entries"][0]["output_sizes"], + [ + [ + 2, + ], + [ + 2, + ], + ], + ) + self.assertEqual(t["entries"][0]["state"], "completed") + if timing_enabled: + duration = t["entries"][0]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][0]) + + +instantiate_parametrized_tests(XCCLTraceTest) instantiate_parametrized_tests(ProcessGroupXCCLTest) From fe324d3fe9e95f202f7d3a5d07385a59aa786129 Mon Sep 17 00:00:00 2001 From: Pawel Swider <137768238+PawelSwider2000@users.noreply.github.com> Date: Thu, 9 Oct 2025 03:12:20 +0200 Subject: [PATCH 27/27] Modify files in install_xpu_headers only if they changes (#2138) Observed that recompilations are triggered by updating files by install_xpu_headers.py script. Turns out that script does not change the files in any way but rewriting the same content into files updating their timestamp causing multiple dependent files to recompile. This PR makes sure that `install_xpu_headers.py` changes or creates files only when content should change. This allow to speedup recompilations several times, by my experience from few minutes to few seconds. 
This fixes: https://github.com/intel/torch-xpu-ops/issues/2093 --------- Co-authored-by: Pawel Swider --- tools/codegen/install_xpu_headers.py | 103 ++++++++++++++++++++------- 1 file changed, 76 insertions(+), 27 deletions(-) diff --git a/tools/codegen/install_xpu_headers.py b/tools/codegen/install_xpu_headers.py index 807a509bc6..0b5ad70660 100644 --- a/tools/codegen/install_xpu_headers.py +++ b/tools/codegen/install_xpu_headers.py @@ -4,6 +4,7 @@ import shutil from pathlib import Path +VERBOSE = False parser = argparse.ArgumentParser(description="Utils for append ops headers") parser.add_argument( @@ -20,34 +21,53 @@ def append_xpu_function_header(src, dst): r""" Cleans trailing empty lines from the destination file, then appends #include lines from the source file that match `#include \s*\r?\n" matches = re.findall(pattern, src_text, re.MULTILINE) if not matches: return - with open(dst, "r+", encoding="utf-8") as f: - dst_lines = f.readlines() - dst_text = "".join(dst_lines) - missing_headers = [match for match in matches if match not in dst_text] - if not missing_headers: - return + try: + with open(dst, encoding="utf-8") as f: + dst_lines = f.readlines() + dst_text = "".join(dst_lines) + except OSError as e: + if VERBOSE: + print(f"Warning: Could not read destination file {dst}: {e}") + return + + missing_headers = [match for match in matches if match not in dst_text] + if not missing_headers: + return + + new_dst_lines = dst_lines.copy() + + while new_dst_lines and not new_dst_lines[-1].strip(): + new_dst_lines.pop() + new_dst_lines.extend(missing_headers) - # Remove trailing empty lines from dst_lines - while dst_lines and not dst_lines[-1].strip(): - dst_lines.pop() + new_content = "".join(new_dst_lines) + old_content = "".join(dst_lines) - f.seek(0) - f.truncate() - f.writelines(dst_lines) - # Append missing headers to the end of the file - f.writelines(missing_headers) + if new_content != old_content: + try: + with open(dst, "w", encoding="utf-8") as f: + f.writelines(new_dst_lines) + except OSError as e: + if VERBOSE: + print(f"Error: Could not write to {dst}: {e}") def parse_ops_headers(src): @@ -78,18 +98,36 @@ def classify_ops_headers(src_dir, dst_dir): def generate_xpu_ops_headers_cmake(src_dir, dst_dir, xpu_ops_headers): r""" - Generate XPU ops headers xpu_ops_generated_headers.cmake + Generate XPU ops headers xpu_ops_generated_headers.cmake only if content changes """ - with open(os.path.join(src_dir, "xpu_ops_generated_headers.cmake"), "w", encoding="utf-8") as fw: - fw.write("set(xpu_ops_generated_headers\n") - for header in xpu_ops_headers: - fw.write(f' "{Path(os.path.join(dst_dir, header)).as_posix()}"\n') - fw.write(")\n") + output_file = os.path.join(src_dir, "xpu_ops_generated_headers.cmake") + + # Generate new content + new_content = "set(xpu_ops_generated_headers\n" + for header in xpu_ops_headers: + new_content += f' "{Path(os.path.join(dst_dir, header)).as_posix()}"\n' + new_content += ")\n" + + # Check if file exists and has same content + should_write = True + if os.path.exists(output_file): + try: + with open(output_file, encoding="utf-8") as f: + existing_content = f.read() + should_write = existing_content != new_content + except OSError: + # If we can't read the file, write it anyway + should_write = True + + if should_write: + with open(output_file, "w", encoding="utf-8") as fw: + fw.write(new_content) def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): r""" For XPU-specific ops headers, copy them to destination build and 
append XPU declarations to common headers. + Copies and appends are done only if leading to file changes to prevent unnecessary recompilations. """ if args.dry_run: return @@ -99,7 +137,16 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): # assert "xpu" in f, f"Error: The function signature or namespace in '{f}' is incorrect. Expected 'xpu' to be present." src = os.path.join(src_dir, f) dst = os.path.join(dst_dir, f) - shutil.copy(src, dst) + # Only copy if src and dst differ or dst does not exist + should_copy = True + if os.path.exists(dst): + try: + with open(src, "rb") as fsrc, open(dst, "rb") as fdst: + should_copy = fsrc.read() != fdst.read() + except OSError: + should_copy = True + if should_copy: + shutil.copy(src, dst) for f in common_headers: src = os.path.join(src_dir, f) @@ -118,6 +165,7 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): with open(dst, "r+", encoding="utf-8") as f: dst_lines = f.readlines() dst_text = "".join(dst_lines) + old_content = "".join(dst_lines) missing_declarations = [] insertion_index = None for index, line in enumerate(dst_lines): @@ -133,9 +181,10 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): break assert (insertion_index is not None), f"Error: No TORCH_API declaration found in {dst}." - f.seek(0) - f.writelines(dst_lines) - f.truncate() + if old_content != "".join(dst_lines): + f.seek(0) + f.writelines(dst_lines) + f.truncate() def main():