diff --git a/.github/ISSUE_TEMPLATE/dynamic-skip.yml b/.github/ISSUE_TEMPLATE/dynamic-skip.yml new file mode 100644 index 0000000000..bdde3c5809 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/dynamic-skip.yml @@ -0,0 +1,63 @@ +name: 🐛 Dynamic skip +description: Create an issue to dynamically skip failures unrelated to a PR +title: "[Bug Skip]: " +labels: ["skipped"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting a bug, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/pytorch/pytorch/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: 🐛 Describe the bug with the skip template + description: | + Please provide a clear and concise description of what the bug is. + The template for a dynamic skip is shown below: + + ```python + # Template (check the GitHub Actions summary) + Cases: + [Category],[Class name],[Test name] + ``` + + ```python + # example + Cases: + op_ut,third_party.torch-xpu-ops.test.xpu.test_transformers_xpu.TestTransformersXPU,test_scaled + ``` + + If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. + + Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple backtick blocks``` ````. + placeholder: | + A clear and concise description of what the bug is, following the dynamic skip template. + ``` + # Skipped cases with the dynamic template + ``` + + ```python + # Sample code to reproduce the problem + ``` + + ``` + The error message you got, with the full traceback. + ``` + validations: + required: true +- type: textarea + attributes: + label: Versions + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py + # For security purposes, please check the contents of collect_env.py before running it. + python collect_env.py + ``` + validations: + required: true +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! 
diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index cc5b9e0a70..a4c77e5f8a 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -57,7 +57,7 @@ runs: run: | # install pytorch if [ $(echo "${{ inputs.pytorch }}" |grep -w "release_wheel" -c) -ne 0 ];then - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/xpu + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu elif [ $(echo "${{ inputs.pytorch }}" |grep -w "test_wheel" -c) -ne 0 ];then pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/test/xpu elif [ $(echo "${{ inputs.pytorch }}" |grep -w "nightly_wheel" -c) -ne 0 ];then @@ -77,7 +77,7 @@ runs: # apply extra PRs for stock pytorch pip install requests if [ "${{ github.event_name }}" == "pull_request" ];then - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/mengfei25/pytorch/pull/27 else python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py fi @@ -99,7 +99,7 @@ runs: TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi fi - if [ "${{ github.event_name }}" == "pull_request" ];then + if [ "${{ github.event_name }}" == "pull_request" ] && [[ "${{ inputs.pytorch }}" != *"_wheel" ]];then cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops cd third_party/torch-xpu-ops else @@ -115,6 +115,8 @@ runs: pip install pandas psutil scipy pyyaml cd pytorch if [[ "${{ inputs.suite }}" == *"huggingface"* ]];then + # for new LLM models + pip install accelerate pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt || pip install transformers==4.54.0 soxr==0.5.0 TRANSFORMERS_VERSION_ID="$(python -c 'import os; os.chdir("/tmp"); import transformers; print(transformers.__version__)')" elif [[ "${{ inputs.suite }}" == *"timm_models"* ]];then @@ -139,7 +141,7 @@ runs: fi # for dlrm pip install pyre-extensions - curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install --no-deps + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install # for soft_actor_critic, temp fix pip install git+https://github.com/nocoding03/gym@fix-np cd ../pytorch @@ -152,10 +154,13 @@ runs: TORCHBENCH_COMMIT_ID="$(git rev-parse --short HEAD)" sed -i 's/^ *pynvml.*//' requirements.txt pip install -r requirements.txt - python install.py --continue_on_fail + # python install.py --continue_on_fail + echo "PYTHONPATH=${PWD}:${PYTHONPATH}" >> ${GITHUB_ENV} + pip install dominate + python install.py Super_SloMo # for dlrm pip install pyre-extensions - curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install --no-deps + curl -fsSL https://raw.githubusercontent.com/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install cd ../pytorch else pip install -r ./.ci/docker/requirements-ci.txt @@ -170,6 +175,11 @@ runs: else pip install torchao --pre --index-url https://download.pytorch.org/whl/nightly/xpu fi + if [ "${{ inputs.suite }}" != "None" ];then + # To install numpy 1.x for benchmarks as CUDA + # yolov requires numpy>=1.23 + pip install -U numpy==1.26.4 + fi - name: Torch Config shell: bash -xe {0} run: | diff --git 
a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index fb79182006..6e2cf2ceeb 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -155,7 +155,7 @@ runs: tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log - name: xpu_distributed - shell: timeout 3600 bash -xeu -o pipefail {0} + shell: timeout 36000 bash -xeu -o pipefail {0} if: ${{ inputs.ut_name == 'xpu_distributed' }} run: | xpu-smi topology -m @@ -166,9 +166,13 @@ runs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi + export CCL_ROOT=$(dirname $(which python))/../ + export PATH="${CCL_ROOT}/bin/libfabric:${PATH}" + export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}" python run_distributed.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log + find ../ -type f -name "*.xml" -exec cp {} ${{ github.workspace }}/ut_log/ \; # Summary - name: UT Test Results Summary diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml index 8704ed0fc6..60954ffd29 100644 --- a/.github/actions/pt2e/action.yml +++ b/.github/actions/pt2e/action.yml @@ -29,6 +29,7 @@ runs: shell: bash -xe {0} run: | # dataset + dataset_dir="${RUNNER_TEMP}/_datasets/imagenet" if [ ! -d ${dataset_dir} ];then rm -rf ${dataset_dir} && mkdir -p ${dataset_dir} && cd ${dataset_dir} wget -O valprep.sh https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh @@ -36,6 +37,7 @@ runs: tar -xf ILSVRC2012_img_val.tar bash valprep.sh fi + echo "dataset_dir=${dataset_dir}" >> ${GITHUB_ENV} - name: PT2E Test (${{ inputs.dt }} ${{ inputs.scenario }}) shell: bash -xe {0} run: | diff --git a/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv b/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv index a75d3d2253..fe1e531308 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_huggingface_inference.csv @@ -45,3 +45,7 @@ TrOCRForCausalLM,pass,pass,pass,pass,pass XGLMForCausalLM,pass,pass,pass,pass,pass XLNetLMHeadModel,pass,pass,pass,pass,pass YituTechConvBert,pass,pass,pass,pass,pass +Qwen/Qwen3-0.6B,pass,pass,pass,pass,pass +google/gemma-2-2b,pass,pass,pass,pass,pass +meta-llama/Llama-3.2-1B,pass,pass,pass,pass,pass +openai/whisper-tiny,pass,pass,pass,pass,pass diff --git a/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv b/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv index a75d3d2253..6d3726bdfb 100644 --- a/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv +++ b/.github/ci_expected_accuracy/rolling/inductor_huggingface_training.csv @@ -45,3 +45,8 @@ TrOCRForCausalLM,pass,pass,pass,pass,pass XGLMForCausalLM,pass,pass,pass,pass,pass XLNetLMHeadModel,pass,pass,pass,pass,pass YituTechConvBert,pass,pass,pass,pass,pass +# https://github.com/intel/torch-xpu-ops/issues/2055 +Qwen/Qwen3-0.6B,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run +google/gemma-2-2b,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run +meta-llama/Llama-3.2-1B,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run +openai/whisper-tiny,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run diff --git 
a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index 0e6b95ec45..1aac532497 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -198,7 +198,8 @@ if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' fi if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log + grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log + sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log echo -e "=========================================================================" echo -e "Show Failed cases in ${ut_suite} xpu distributed" diff --git a/.github/workflows/_linux_accelerate.yml b/.github/workflows/_linux_accelerate.yml index decf4e612f..fbaa0c3f07 100644 --- a/.github/workflows/_linux_accelerate.yml +++ b/.github/workflows/_linux_accelerate.yml @@ -48,6 +48,9 @@ concurrency: defaults: run: shell: bash {0} +env: + GH_TOKEN: ${{ github.token }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: conditions-filter: @@ -55,8 +58,6 @@ jobs: if: ${{ github.event.pull_request.draft == false }} runs-on: ubuntu-24.04 timeout-minutes: 10 - env: - GH_TOKEN: ${{ github.token }} outputs: disabled_tests: ${{ steps.check-pr-desc.outputs.disabled_tests }} steps: diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 286d124b7b..77caf298b4 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -31,6 +31,8 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: runner: @@ -94,6 +96,7 @@ jobs: - name: Build Pytorch on ${{ needs.runner.outputs.hostname }} run: | export USE_XCCL=1 + export IS_XPU_CI=1 # only build pvc for CI if [ "${{ github.event_name }}" == "pull_request" ];then export TORCH_XPU_ARCH_LIST='pvc' @@ -200,7 +203,6 @@ jobs: python -c "import torchaudio; print(torchaudio.__version__)" python pytorch/torch/utils/collect_env.py pip list |grep -E 'torch|intel' - chmod 777 /__w /github ./ -R - name: Upload Torch XPU Wheel if: ${{ success() }} uses: actions/upload-artifact@v4 @@ -213,3 +215,8 @@ jobs: with: name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }} path: ${{ github.workspace }}/build_*.log + - name: Cleanup workspace + if: ${{ always() }} + run: | + chmod 777 /__w /github ./ -R + find ./ |grep -v "^\./$" |xargs rm -rf diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml index ab49d28490..08f88462e6 100644 --- a/.github/workflows/_linux_e2e.yml +++ b/.github/workflows/_linux_e2e.yml @@ -50,6 +50,7 @@ env: GH_TOKEN: ${{ github.token }} HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: runner: @@ -87,7 +88,6 @@ jobs: cpus_per_xpu: ${{ needs.runner.outputs.cpus_per_xpu }} MODEL_ONLY_NAME: ${{ inputs.model }} AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - dataset_dir: ${{ runner.temp }}/../_datasets/imagenet steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 diff --git a/.github/workflows/_linux_e2e_summary.yml b/.github/workflows/_linux_e2e_summary.yml index 427a35ca82..9824c8c458 100644 --- 
a/.github/workflows/_linux_e2e_summary.yml +++ b/.github/workflows/_linux_e2e_summary.yml @@ -30,14 +30,15 @@ jobs: steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 - - name: Install gh-cli - run: | - sudo apt-get update - sudo apt-get install gh rsync ca-certificates -y - name: Setup python-${{ inputs.python }} uses: actions/setup-python@v5 with: python-version: ${{ inputs.python }} + - name: Install gh-cli + run: | + sudo apt-get update + sudo apt-get install gh rsync ca-certificates -y + pip install pandas requests - name: Download Target Artifact run: | mkdir target/ @@ -64,7 +65,7 @@ jobs: - name: Get summary if: ${{ ! cancelled() }} run: | - pip install pandas requests + exit_label=0 e2e_summary_csv="$(find ./target/ -name "inductor_*.csv" |head -n 1)" if [ -f "${e2e_summary_csv}" ];then bash ./.github/scripts/e2e_summary.sh ./target ./baseline >> ${GITHUB_STEP_SUMMARY} @@ -72,22 +73,22 @@ jobs: if [ ${exit_label} -ne 0 ];then grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} fi fi pt2e_summary_csv="$(find ./target/ -name "summary.csv")" if [ -f "${pt2e_summary_csv}" ];then - cat ${pt2e_summary_csv} - failed_num=$(grep -c ',failed' ${pt2e_summary_csv}) + failed_num=$(grep -c ',failed' ${pt2e_summary_csv} || true) if [ ${failed_num} -ne 0 ];then echo "[Warning] PT2E has failures!" + grep 'failed' ${pt2e_summary_csv} fi fi + exit ${exit_label} - name: Upload Reference Run ID if: ${{ endsWith(inputs.test_type, 'ly') }} run: | gh --repo ${GITHUB_REPOSITORY} issue view ${REFERENCE_ISSUE_ID} --json body -q .body 2>&1 |tee new_body.txt 2>&1 - has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt)" + has_or_not="$(grep -c 'Inductor-${{ inputs.test_type }}-LTS2' new_body.txt || true)" if [ ${has_or_not} -ne 0 ];then sed -i "s/Inductor-${{ inputs.test_type }}-LTS2:.*/Inductor-${{ inputs.test_type }}-LTS2: ${GITHUB_RUN_ID}/" new_body.txt else diff --git a/.github/workflows/_linux_op_benchmark.yml b/.github/workflows/_linux_op_benchmark.yml index b65d985008..e8474d7ab1 100644 --- a/.github/workflows/_linux_op_benchmark.yml +++ b/.github/workflows/_linux_op_benchmark.yml @@ -26,6 +26,7 @@ env: GH_TOKEN: ${{ github.token }} HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} reference_issue: 1689 jobs: @@ -50,8 +51,6 @@ jobs: op_benchmark: needs: runner runs-on: ${{ needs.runner.outputs.runner_id }} - permissions: - issues: write timeout-minutes: 900 container: image: mengfeili/intel-pvc-driver:1146-1136 @@ -93,6 +92,8 @@ jobs: op_benchmark_test_results_check: needs: op_benchmark runs-on: ubuntu-24.04 + permissions: + issues: write steps: - name: Install gh-cli run: | diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml index fc3deaf7a3..67e2082834 100644 --- a/.github/workflows/_linux_transformers.yml +++ b/.github/workflows/_linux_transformers.yml @@ -56,6 +56,7 @@ concurrency: cancel-in-progress: true env: HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} HF_HUB_ETAG_TIMEOUT: 120 HF_HUB_DOWNLOAD_TIMEOUT: 120 python: ${{ inputs.python != '' && inputs.python || '3.10' }} diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 
1751d9d972..e895a959b7 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -33,6 +33,7 @@ env: GH_TOKEN: ${{ github.token }} HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} UT_SKIP_ISSUE: 1624 jobs: @@ -99,11 +100,12 @@ jobs: test-in-baremetal: needs: runner + timeout-minutes: 600 if: ${{ contains(inputs.ut, 'distributed') }} runs-on: ${{ needs.runner.outputs.runner_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - PYTEST_ADDOPTS: -v --timeout 600 --timeout_method=thread -n 1 + PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1 steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 diff --git a/.github/workflows/bisect_search.yml b/.github/workflows/bisect_search.yml index 4adba0df6c..d45c0a83f8 100644 --- a/.github/workflows/bisect_search.yml +++ b/.github/workflows/bisect_search.yml @@ -40,47 +40,36 @@ permissions: read-all defaults: run: shell: bash -xe {0} +env: + GH_TOKEN: ${{ github.token }} + HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }} jobs: get_runner: runs-on: ${{ inputs.runner }} outputs: - test_host: ${{ steps.runner-info.outputs.test_host }} - test_user: ${{ steps.runner-info.outputs.test_user }} - test_group: ${{ steps.runner-info.outputs.test_group }} + runner_id: ${{ steps.runner-info.outputs.runner_id }} + user_id: ${{ steps.runner-info.outputs.user_id }} + render_id: ${{ steps.runner-info.outputs.render_id }} + hostname: ${{ steps.runner-info.outputs.hostname }} steps: - - name: Get runner info + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Get runner id: runner-info - run: | - # get test runner - echo "test_host=${RUNNER_NAME}" |tee -a ${GITHUB_OUTPUT} - echo "test_user=$(id -u)" |tee -a ${GITHUB_OUTPUT} - echo "test_group=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} - # show host info - cat /etc/os-release - uname -a - source /opt/intel/oneapi/setvars.sh - sycl-ls - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - - name: Cleanup workspace - if: ${{ always() }} - run: | - # clean docker cache - docker stop $(docker ps -aq) || true - docker system prune -af || true - # clean files - ls -al - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf + uses: ./.github/actions/get-runner biisect-search: needs: get_runner - runs-on: ${{ needs.get_runner.outputs.test_host }} + runs-on: ${{ needs.get_runner.outputs.runner_id }} container: image: mengfeili/intel-pvc-driver:1146-1136 volumes: - ${{ github.workspace }}:${{ github.workspace }} options: --device=/dev/mem --device=/dev/dri --group-add video --privileged --shm-size=8g - -u ${{ needs.get_runner.outputs.test_user }}:${{ needs.get_runner.outputs.test_group }} + -u ${{ needs.get_runner.outputs.user_id }} --group-add ${{ needs.get_runner.outputs.render_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool SEARCH_COMMITS: ${{ inputs.search_commits }} @@ -90,38 +79,20 @@ jobs: TORCH_XPU_ARCH_LIST: pvc USE_XCCL: 0 USE_KINETO: 0 - env: - GH_TOKEN: ${{ github.token }} - HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} steps: - - name: Check runner - run: | - ls -al - sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf - sudo rm -rf /tmp/xpu-tool - - name: Setup python-${{ inputs.python }} - uses: actions/setup-python@v5 + - name: 
Checkout torch-xpu-ops + uses: actions/checkout@v4 with: - python-version: ${{ inputs.python }} - - name: Check runner - run: | - hostname && whoami && id - clinfo --list - gcc -v && g++ -v - which python && which pip - python -V - pip install -U pip wheel setuptools - pip list - uname -a - dpkg -l |grep -E 'libigc-dev|libze-dev|level-zero-dev' - pip install cmake ninja pandas psutil scipy requests pybind11 - mkdir gs-logs gs-search - echo "Status,Acc,Perf,PyTorch,Torch-xpu-ops" > gs-logs/summary.csv + path: gs-scripts + - name: Prepare test env + uses: ./gs-scripts/.github/actions/linux-testenv + with: + pytorch: nightly_wheel + python: ${{ inputs.python }} - name: Install oneAPI DLE if: ${{ inputs.oneapi != 'installed' }} run: | - rm -rf ~/intel ~/.intel /tmp/intel + rm -rf ~/intel ~/.intel ${HOME}/intel if [ "${{ inputs.oneapi }}" == "2025.1" ];then ONEAPI_URL="https://registrationcenter-download.intel.com/akdlm/IRC_NAS/3435dc45-055e-4f7a-86b1-779931772404/intel-deep-learning-essentials-2025.1.3.7_offline.sh" elif [ "${{ inputs.oneapi }}" == "2025.2" ];then @@ -130,42 +101,42 @@ jobs: ONEAPI_URL="${{ inputs.oneapi }}" fi wget -q -O oneapi.sh "${ONEAPI_URL}" - bash oneapi.sh -a -s --eula accept --action install --install-dir /tmp/intel/oneapi - echo "XPU_ONEAPI_PATH=/tmp/intel/oneapi" >> ${GITHUB_ENV} - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - with: - path: gs-scripts - - name: Prepare source code + bash oneapi.sh -a -s --eula accept --action install --install-dir ${HOME}/intel/oneapi + echo "XPU_ONEAPI_PATH=${HOME}/intel/oneapi" >> ${GITHUB_ENV} + + - name: Summary file run: | - git clone https://github.com/pytorch/pytorch gs-pytorch - cd gs-pytorch + mkdir -p gs-logs + echo "Status,Acc,Perf,PyTorch,Torch-xpu-ops" > gs-logs/summary.csv + - name: Get latest versions + run: | + cd pytorch LATEST_PT_COMMIT="$(git rev-parse HEAD)" - cd .. - git clone https://github.com/intel/torch-xpu-ops gs-torch-xpu-ops - cd gs-torch-xpu-ops + cd third_party/torch-xpu-ops LATEST_XPU_COMMIT="$(git rev-parse HEAD)" - cd .. echo "LATEST_PT_COMMIT=${LATEST_PT_COMMIT}" >> ${GITHUB_ENV} echo "LATEST_XPU_COMMIT=${LATEST_XPU_COMMIT}" >> ${GITHUB_ENV} - name: Prepare test env run: | - pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu if [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/huggingface.py"* ]];then - pip install transformers==4.44.2 + pip install transformers elif [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/timm_models.py"* ]];then - pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14 - pip install $(curl -sSL https://raw.githubusercontent.com/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch) + pip install timm elif [[ "${{ inputs.search_case }}" == *"benchmarks/dynamo/torchbench.py"* ]];then model_name="$(echo ${{ inputs.search_case }} |sed 's+.*\--only *++;s/ .*//')" git clone https://github.com/pytorch/benchmark gs-benchmark cd gs-benchmark + pip install -r requirements.txt echo "PYTHONPATH=${PWD}:${PYTHONPATH}" >> ${GITHUB_ENV} python install.py ${model_name} - else - pip install -r gs-pytorch/.ci/docker/requirements-ci.txt + cd .. fi - pip uninstall -y torch && pip uninstall -y torch + pip uninstall -y torchvision torchaudio + git clone https://github.com/pytorch/vision gs-vision + cd gs-vision + python setup.py install + cd .. 
+ pip uninstall -y torch - name: Bisect search pytorch if: ${{ contains(inputs.search_commits, 'pytorch') }} run: | @@ -186,7 +157,7 @@ jobs: > ${{ github.workspace }}/gs-logs/search-${new_commit}-${LATEST_XPU_COMMIT}.log 2>&1 && echo $? || echo $?)" new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)" if [ "${old_status}" != "${new_status}" ];then - cd gs-pytorch + cd pytorch git reset --hard rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/ git bisect start ${new_commit} ${old_commit} @@ -219,7 +190,7 @@ jobs: > ${{ github.workspace }}/gs-logs/search-${LATEST_PT_COMMIT}-${new_commit}.log && echo $? || echo $?)" new_result="$(tail -n 1 ${{ github.workspace }}/gs-search/result.csv)" if [ "${old_status}" != "${new_status}" ];then - cd gs-pytorch + cd pytorch git reset --hard rsync -avz --delete ${{ github.workspace }}/gs-scripts/ gs-scripts/ git bisect start ${new_commit} ${old_commit} diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index d408622efe..5c6207b963 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -111,7 +111,7 @@ jobs: else test_type="build-ondemand" fi - ut="$(echo ${{ inputs.ut }} |sed 's/,/","/g;s/^/["/;s/$/"]/')" + ut="$(echo ${{ inputs.ut }} |sed -E 's/(microbench,|,microbench)//g;s/(,windows|windows,)//g;s/,/","/g;s/^/["/;s/$/"]/')" suite="$(echo ${{ inputs.suite }} |sed 's/,/","/g;s/^/["/;s/$/"]/')" triton="${{ inputs.triton }}" python="${{ inputs.python }}" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index be6b260734..b5e8606ac8 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -115,6 +115,7 @@ jobs: with: runner: pvc_rolling pytorch: ${{ needs.conditions-filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }} ut: ${{ matrix.ut_name }} linux-distributed: @@ -129,6 +130,7 @@ jobs: with: runner: pvc_rolling pytorch: ${{ needs.conditions-filter.outputs.pytorch }} + torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }} ut: ${{ matrix.ut_name }} linux-e2e: diff --git a/CMakeLists.txt b/CMakeLists.txt index 277f1f067d..063c728411 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,9 @@ set(PROJECT_VERSION "2.3.0") # Avoid SYCL compiler error if(NOT WIN32) string(APPEND CMAKE_CXX_FLAGS " -Wno-error") + if("$ENV{IS_XPU_CI}" STREQUAL "1") + string(APPEND CMAKE_CXX_FLAGS " -Werror=unused-variable") + endif() endif() cmake_policy(SET CMP0048 NEW) diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake index 23956d710a..c0afe2d8f1 100644 --- a/cmake/BuildFlags.cmake +++ b/cmake/BuildFlags.cmake @@ -36,10 +36,16 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" list(APPEND SYCL_HOST_FLAGS -fPIC) list(APPEND SYCL_HOST_FLAGS -std=c++17) list(APPEND SYCL_HOST_FLAGS -Wunused-variable) - # SYCL headers warnings - list(APPEND SYCL_HOST_FLAGS -Wno-deprecated-declarations) - list(APPEND SYCL_HOST_FLAGS -Wno-deprecated) - list(APPEND SYCL_HOST_FLAGS -Wno-attributes) + # Some versions of DPC++ compiler pass paths to SYCL headers as user include paths (`-I`) rather + # than system paths (`-isystem`). This makes host compiler to report warnings encountered in the + # SYCL headers, such as deprecated warnings, even if warned API is not actually used in the program. 
+ # We expect that this issue will be addressed in the later version of DPC++ compiler. To workaround + # the issue we wrap paths to SYCL headers in `-isystem`. + foreach(FLAGS IN LISTS SYCL_INCLUDE_DIR) + list(APPEND SYCL_HOST_FLAGS "-isystem ${FLAGS}") + endforeach() + # Excluding warnings which flood the compilation output + # TODO: fix warnings in the source code and then reenable them in compilation list(APPEND SYCL_HOST_FLAGS -Wno-sign-compare) endif() diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index 37ec7066b7..493675e804 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -1,6 +1,5 @@ # ATen XPU sources -file(GLOB xpu_h "xpu/*.h") file(GLOB xpu_cpp "xpu/*.cpp") file(GLOB xpu_mkl "native/xpu/mkl/*.cpp") file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp" "native/sparse/xpu/*.cpp" "native/nested/*.cpp" "native/nested/xpu/*.cpp" "native/transformers/*.cpp" "native/quantized/*.cpp") @@ -18,10 +17,28 @@ set(ATen_XPU_MKL_SRCS ${ATen_XPU_MKL_SRCS} PARENT_SCOPE) set(ATen_XPU_NATIVE_CPP_SRCS ${ATen_XPU_NATIVE_CPP_SRCS} PARENT_SCOPE) set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE) -foreach(HEADER ${xpu_h}) - install(FILES ${HEADER} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/ATen/xpu") -endforeach() +# ATen XPU headers -foreach(HEADER ${xpu_ops_generated_headers}) - install(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/ops) -endforeach() +macro(install_xpu_headers glob_pattern dest_subdir) + file(GLOB headers ${glob_pattern}) + if(headers) + install(FILES ${headers} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/${dest_subdir}") + endif() +endmacro() + +install_xpu_headers("xpu/*.h" "ATen/xpu") +install_xpu_headers("native/xpu/*.h" "ATen/native/xpu") +install_xpu_headers("native/xpu/sycl/*.h" "ATen/native/xpu/sycl") +install_xpu_headers("native/xpu/mkl/*.h" "ATen/native/xpu/mkl") +install_xpu_headers("native/nested/xpu/*.h" "ATen/native/nested/xpu") +install_xpu_headers("native/nested/xpu/sycl/*.h" "ATen/native/nested/xpu/sycl") +install_xpu_headers("native/quantized/*.h" "ATen/native/quantized/xpu") +install_xpu_headers("native/quantized/sycl/*.h" "ATen/native/quantized/xpu/sycl") +install_xpu_headers("native/sparse/xpu/*.h" "ATen/native/sparse/xpu") +install_xpu_headers("native/sparse/xpu/sycl/*.h" "ATen/native/sparse/xpu/sycl") +install_xpu_headers("native/transformers/*.h" "ATen/native/transformers/xpu") +install_xpu_headers("native/transformers/sycl/*.h" "ATen/native/transformers/xpu/sycl") + +if(xpu_ops_generated_headers) + install(FILES ${xpu_ops_generated_headers} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/ops) +endif() diff --git a/src/ATen/native/xpu/Nonzero.cpp b/src/ATen/native/xpu/Nonzero.cpp index 9988631d3b..82cfaa8275 100644 --- a/src/ATen/native/xpu/Nonzero.cpp +++ b/src/ATen/native/xpu/Nonzero.cpp @@ -1,22 +1,18 @@ #include +#include +#include #include - #include -#include +#include namespace at { namespace native { -Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { + +void nonzero_common_checks(const Tensor& self, Tensor& out, const std::string& op_name) { TORCH_CHECK( self.numel() < std::numeric_limits::max(), - "nonzero is not supported for tensors with more than INT_MAX elements, \ - See https://github.com/pytorch/pytorch/issues/51871"); - TORCH_CHECK( - out.dtype() == at::kLong, - "Expected object of scalar type ", - at::kLong, - " as out, but got ", - out.dtype()); + op_name, " is not supported for tensors with more than INT_MAX elements, \ + See https://github.com/pytorch/pytorch/issues/51871"); 
TORCH_CHECK( self.device() == out.device(), "expected self and out to be on the same device, but got out on ", @@ -24,11 +20,24 @@ Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { " and self on ", self.device()); TORCH_CHECK( - self.dim() <= MAX_DIMS, - "nonzero is not supported for tensor with more than ", - MAX_DIMS, + self.dim() <= XPU_MAX_TENSORINFO_DIMS, + op_name, " is not supported for tensor with more than ", + XPU_MAX_TENSORINFO_DIMS, " dimensions"); +} +Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) { + TORCH_CHECK( + out.dtype() == at::kLong, + "Expected object of scalar type ", + at::kLong, + " as out, but got ", + out.dtype()); + nonzero_common_checks(self, out, "nonzero"); + if (self.numel() == 0) { + out = at::detail::empty_xpu({0, self.dim()}, out.options()); + return out; + } xpu::nonzero_kernel(self, out); return out; } @@ -38,5 +47,49 @@ Tensor nonzero_xpu(const Tensor& self) { nonzero_out_xpu(self, out); return out; } + +Tensor& nonzero_static_out_xpu( + const Tensor& self, + int64_t size, + int64_t fill_value, + Tensor& out) { + TORCH_CHECK( + size >= 0, "nonzero_static: 'size' must be an non-negative integer"); + TORCH_CHECK( + out.dtype() == at::kLong, + "nonzero_static: Expected out tensor to have scalar type Long"); + nonzero_common_checks(self, out, "nonzero_static"); + if (self.numel() == 0) { + out = at::full({size, self.dim()}, fill_value, out.options()); + return out; + } + + Tensor nonzero_out = at::detail::empty_xpu({0}, self.options().dtype(kLong)); + xpu::nonzero_kernel(self, nonzero_out); + auto nonzero_size = nonzero_out.size(0); + out.resize_({size, self.dim()}); + + if (nonzero_size > size) { + out.copy_(nonzero_out.narrow(0, 0, size)); + } else if (nonzero_size < size) { + auto padding = at::full({size - nonzero_size, self.dim()}, fill_value, out.options()); + out.copy_(at::cat({nonzero_out, padding}, 0)); + } else { + out.copy_(nonzero_out); + } + return out; +} + +Tensor nonzero_static_xpu( + const Tensor& self, + int64_t size, + int64_t fill_value) { + TORCH_CHECK( + size >= 0, "nonzero_static: 'size' must be an non-negative integer"); + Tensor out = at::detail::empty_xpu({size, self.dim()}, self.options().dtype(kLong)); + nonzero_static_out_xpu(self, size, fill_value, out); + return out; +} + } // namespace native -} // namespace at \ No newline at end of file +} // namespace at diff --git a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp index 29facbaa3d..26e80fa4d0 100644 --- a/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp +++ b/src/ATen/native/xpu/mkl/BatchLinearAlgebra.cpp @@ -54,7 +54,7 @@ void error_handle( return; } - for (auto& i : ids) { + for (size_t i = 0; i < errs.size(); ++i) { try { std::rethrow_exception(errs[i]); } catch (const oneapi::mkl::lapack::exception& e) { diff --git a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp index 8c6e47f774..2c2c60ada9 100644 --- a/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationHardswishKernels.cpp @@ -35,9 +35,9 @@ struct HardswishBackwardFunctor { const opmath_t one_half(0.5f); opmath_t grad_val = static_cast(grad_val_); opmath_t self_val = static_cast(self_val_); - if (self_val < neg_three) { + if (self_val <= neg_three) { return zero; - } else if (self_val <= three) { + } else if (self_val < three) { return grad_val * ((self_val / three) + one_half); } else { return grad_val; diff --git 
a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp index 4caa4a7ead..cf856d6177 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -25,9 +26,7 @@ inline int max(int a, int b) { template struct AvgPool2dKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - - if (index < total_elements_) { + XPU_KERNEL_LOOP(item, index, total_elements_) { const int pw = index % pooled_width_; const int ph = (index / pooled_width_) % pooled_height_; const int c = (index / pooled_width_ / pooled_height_) % channels_; @@ -73,19 +72,19 @@ struct AvgPool2dKernelFunctor { AvgPool2dKernelFunctor( scalar_t* top_data, const scalar_t* bottom_data, - index_t total_elements, - index_t channels, - index_t height, - index_t width, - int pooled_height, - int pooled_width, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - int pad_h, - int pad_w, - int divisor_override, + const int total_elements, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t pooled_height, + const int pooled_width, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int divisor_override, bool count_include_pad, bool use_divisor) : top_data_(top_data), @@ -109,19 +108,19 @@ struct AvgPool2dKernelFunctor { private: scalar_t* top_data_; const scalar_t* bottom_data_; - index_t total_elements_; - index_t channels_; - index_t height_; - index_t width_; - int pooled_height_; - int pooled_width_; - int kernel_h_; - int kernel_w_; - int stride_h_; - int stride_w_; - int pad_h_; - int pad_w_; - int divisor_override_; + const int total_elements_; + const int64_t channels_; + const int64_t height_; + const int64_t width_; + const int64_t pooled_height_; + const int pooled_width_; + const int kernel_h_; + const int kernel_w_; + const int stride_h_; + const int stride_w_; + const int pad_h_; + const int pad_w_; + const int divisor_override_; bool count_include_pad_; bool use_divisor_; }; @@ -129,9 +128,7 @@ struct AvgPool2dKernelFunctor { template struct AvgPool2dChannelsLastKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - - if (index < total_elements_) { + XPU_KERNEL_LOOP(item, index, total_elements_) { const int c = index % channels_; const int pw = (index / channels_) % pooled_width_; const int ph = (index / channels_ / pooled_width_) % pooled_height_; @@ -327,8 +324,7 @@ void launch_avg_pool2d_kernel( template struct AvgPool2dChannelsLastBackwardKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - if (index < total_elements_) { + XPU_KERNEL_LOOP_TYPE(item, index, total_elements_, index_t) { const int c = index % channels_; const int w = (index / channels_) % width_ + pad_w_; const int h = (index / channels_ / width_) % height_ + pad_h_; @@ -431,8 +427,7 @@ struct AvgPool2dChannelsLastBackwardKernelFunctor { template struct AvgPool2dBackwarKernelFunctor { void operator()(sycl::nd_item<1> item) const { - index_t index = item.get_global_linear_id(); - if (index < total_elements_) { + XPU_KERNEL_LOOP_TYPE(item, index, total_elements_, index_t) { // find out the local index // find out the local offset const int w = index % width_ + pad_w_; diff --git 
a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index f8e1b6906e..4124890274 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -185,8 +185,7 @@ int get_num_threads_by_dev_max_group_size( int get_prefer_simd(int numPlane, int nHw) { // decide SIMD: SIMD32 or SIMD16 - auto dev_id = at::xpu::getDeviceIndexOfCurrentQueue(); - + auto dev_id = at::xpu::current_device(); auto* dev_prop = at::xpu::getDeviceProperties(dev_id); auto sub_group_size = dev_prop->sub_group_sizes; int simd = sub_group_size[1]; diff --git a/src/ATen/native/xpu/sycl/GridSampler.h b/src/ATen/native/xpu/sycl/GridSampler.h index 33192f4cd7..cfc6283d2f 100644 --- a/src/ATen/native/xpu/sycl/GridSampler.h +++ b/src/ATen/native/xpu/sycl/GridSampler.h @@ -49,9 +49,10 @@ static inline scalar_t safe_downgrade_to_int_range(scalar_t x) { // -100.0 does not have special meaning. This is just to make sure // it's not within_bounds_2d or within_bounds_3d, and does not cause // undefined behavior. + // We avoid using double here because some platforms may not support it. if (static_cast(x) > INT_MAX - 1 || x < INT_MIN || - !std::isfinite(static_cast(x))) - return static_cast(-100.0); + !std::isfinite(x)) + return static_cast(-100.0f); return x; } diff --git a/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp b/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp index 4862af9a8f..bffd6117b5 100644 --- a/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp +++ b/src/ATen/native/xpu/sycl/HermitePolynomialHKernel.cpp @@ -9,17 +9,15 @@ namespace at::native::xpu { template struct HermitePolynomialHFunctor { scalar_t operator()(scalar_t x, scalar_t n_) const { - int64_t n = static_cast(n_); + auto n = static_cast(n_); if (n < 0) { return scalar_t(0.0); - } - - if (n == 0) { + } else if (n == 0) { return scalar_t(1.0); - } - - if (n == 1) { + } else if (n == 1) { return x + x; + } else if (n > getHermitianLimit()) { + return std::numeric_limits::quiet_NaN(); } scalar_t p = scalar_t(1.0); diff --git a/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp b/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp index 6c37e87202..8ba1d8bfff 100644 --- a/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp +++ b/src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.cpp @@ -9,17 +9,15 @@ namespace at::native::xpu { template struct HermitePolynomialHeFunctor { scalar_t operator()(scalar_t x, scalar_t n_) const { - int64_t n = static_cast(n_); + auto n = static_cast(n_); if (n < 0) { return scalar_t(0.0); - } - - if (n == 0) { + } else if (n == 0) { return scalar_t(1.0); - } - - if (n == 1) { + } else if (n == 1) { return x; + } else if (n > getHermitianLimit()) { + return std::numeric_limits::quiet_NaN(); } scalar_t p = scalar_t(1.0); diff --git a/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp b/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp index 8688f80456..869d64492a 100644 --- a/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp +++ b/src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.cpp @@ -30,7 +30,7 @@ struct LaguerrePolynomialLFunctor { scalar_t q = scalar_t(1.0) - x; scalar_t r; - for (int64_t k = 1; k < n; k++) { + for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { r = (((k + k) + (scalar_t(1.0) - x)) * q - k * p) / (k + 1); p = q; q = r; diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index 92a4195d01..e4a8444c94 100644 --- 
a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -1103,9 +1103,6 @@ void _layer_norm_backward_kernel( // affecting performance and behavior. const scalar_t* dY_data = dY.const_data_ptr(); const scalar_t* X_data = X.const_data_ptr(); - weight_t* dg_data = - dgamma.defined() ? dgamma.data_ptr() : nullptr; - weight_t* db_data = dbeta.defined() ? dbeta.data_ptr() : nullptr; Tensor dgamma_blocks; Tensor dbeta_blocks; weight_t* dgamma_blocks_ptr = nullptr; diff --git a/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp b/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp index 2176c4e79c..caf1d8ac16 100644 --- a/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp +++ b/src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.cpp @@ -34,7 +34,7 @@ struct LegendrePolynomialPFunctor { scalar_t q = x; scalar_t r; - for (int64_t k = 1; k < n; k++) { + for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { r = ((k + k + 1) * x * q - k * p) / (k + 1); p = q; q = r; diff --git a/src/ATen/native/xpu/sycl/Loops.h b/src/ATen/native/xpu/sycl/Loops.h index d9c4ede284..b6d677b78d 100644 --- a/src/ATen/native/xpu/sycl/Loops.h +++ b/src/ATen/native/xpu/sycl/Loops.h @@ -228,8 +228,8 @@ static void launch_legacy_group_range_kernel(int64_t N, const func_t& f) { return; } - int wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, wg_sz * vec_size); + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, wg_sz * vec_size); sycl_kernel_submit>( wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), 0, N, f); } @@ -241,9 +241,9 @@ static void launch_legacy_global_range_kernel(int64_t N, const func_t& f) { return; } - int wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, wg_sz); - int hw_max_num_wg = syclMaxWorkItemsPerTile() / wg_sz; + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, wg_sz); + int64_t hw_max_num_wg = syclMaxWorkItemsPerTile() / wg_sz; num_wg = num_wg > hw_max_num_wg ? 
hw_max_num_wg : num_wg; sycl_kernel_submit>( wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), 0, N, f); @@ -267,8 +267,8 @@ static inline void launch_unrolled_kernel( TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); static constexpr int item_work_size = 4; - auto wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, wg_sz * item_work_size); + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, wg_sz * item_work_size); sycl_kernel_submit; auto wg_sz = syclMaxWorkItemsPerSubSlice(); -#define VEC_KER(vec_size) \ - { \ - TORCH_CHECK(max_scalar_bytes* vec_size <= 16); \ - if constexpr (max_scalar_bytes * vec_size <= 16) { \ - int num_wg = ceil_div(N, wg_sz * vec_size); \ - sycl_kernel_submit>( \ - wg_sz * num_wg, \ - wg_sz, \ - getCurrentSYCLQueue(), \ - 0, \ - N, \ - f, \ - data, \ - input_calc); \ - } \ +#define VEC_KER(vec_size) \ + { \ + TORCH_CHECK(max_scalar_bytes* vec_size <= 16); \ + if constexpr (max_scalar_bytes * vec_size <= 16) { \ + int64_t num_wg = ceil_div(N, wg_sz * vec_size); \ + sycl_kernel_submit>( \ + wg_sz * num_wg, \ + wg_sz, \ + getCurrentSYCLQueue(), \ + 0, \ + N, \ + f, \ + data, \ + input_calc); \ + } \ } switch (vec_size) { @@ -367,7 +367,7 @@ static inline void launch_vectorized_kernel( auto storer = storer_t(); static constexpr int item_work_size = 4; - int num_wg = ceil_div(N, wg_sz * item_work_size); + int64_t num_wg = ceil_div(N, wg_sz * item_work_size); sycl_kernel_submit 0 && N <= std::numeric_limits::max()); static constexpr int item_work_size = 4; - int wg_sz = syclMaxWorkItemsPerSubSlice(); - int num_wg = ceil_div(N, item_work_size * wg_sz); + int64_t wg_sz = syclMaxWorkItemsPerSubSlice(); + int64_t num_wg = ceil_div(N, item_work_size * wg_sz); sycl_kernel_submit= 0; t--) { - item.barrier(sycl_local_fence); + item.barrier(sycl_global_and_local_fence); if (valid && (t < input_length - 1) && (s < 2 * target_length + 1)) { scalar_t lb1 = log_beta_data_ [lb_batch_offset + lb_input_stride_ * (t + 1) + diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp index 72df4b8ee8..d9c166dbcb 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp @@ -17,7 +17,7 @@ struct FlattenIdxtoRealIdxKernelFunctor { if (global_id < N_) { auto dim = global_id / num_nonzeros_; auto index = global_id % num_nonzeros_; - tensor_begin_[global_id] = + out_begin_[global_id] = idx_flat_begin_[index] / divisor_[dim] % sizes_[dim]; } } @@ -25,14 +25,14 @@ struct FlattenIdxtoRealIdxKernelFunctor { int64_t N, const int64_t num_dim, const int64_t num_nonzeros, - int64_t* tensor_begin, + int64_t* out_begin, int64_t* idx_flat_begin, int64_t* divisor, int64_t* sizes) : N_(N), num_dim_(num_dim), num_nonzeros_(num_nonzeros), - tensor_begin_(tensor_begin), + out_begin_(out_begin), idx_flat_begin_(idx_flat_begin) { for (auto dim = num_dim - 1; dim >= 0; dim--) { sizes_[dim] = sizes[dim]; @@ -44,7 +44,7 @@ struct FlattenIdxtoRealIdxKernelFunctor { int64_t N_; const int64_t num_dim_; const int64_t num_nonzeros_; - int64_t* tensor_begin_; + int64_t* out_begin_; int64_t* idx_flat_begin_; int64_t divisor_[XPU_MAX_TENSORINFO_DIMS]; int64_t sizes_[XPU_MAX_TENSORINFO_DIMS]; @@ -79,77 +79,68 @@ struct CopyIfFunc { }; template -void nonzero_template(const Tensor& self_, Tensor& tensor) { +void nonzero_template(const Tensor& self_, Tensor& out) { Tensor self = self_.contiguous(); const int64_t num_dim = self.dim(); - TORCH_CHECK(num_dim <= XPU_MAX_TENSORINFO_DIMS, "dim 
exceed max allowed dim"); - - int64_t N = self.numel(); - - if (N > 0) { - Tensor idx_flat = at::empty( - {N}, tensor.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); - Tensor range = at::empty( - {N}, tensor.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); - - const scalar_t* self_begin = self.const_data_ptr(); - int64_t* idx_flat_begin = idx_flat.data_ptr(); - int64_t* range_begin = nullptr; - - CopyIfFunc f(self_begin); - auto idx_flat_end = - pstl::copy_if(range_begin, range_begin + N, idx_flat_begin, f); - - auto num_nonzeros = std::distance(idx_flat_begin, idx_flat_end); - - bool need_to_copy = tensor.dim() == 2 && - tensor.sizes()[0] == num_nonzeros && tensor.sizes()[1] == self_.dim() && - !tensor.t().is_contiguous(); - at::Tensor tensor_ = need_to_copy - ? Tensor(at::detail::empty_xpu( - {self_.dim(), num_nonzeros}, tensor.options())) - : tensor.resize_({self_.dim(), num_nonzeros}); - - if (num_nonzeros > 0 && num_dim > 0) { - int64_t* tensor_begin = tensor_.data_ptr(); - - // preload sizes tensor for index calculation - int64_t sizes[XPU_MAX_TENSORINFO_DIMS]; - int64_t divisor[XPU_MAX_TENSORINFO_DIMS]; - sizes[num_dim - 1] = self.size(num_dim - 1); - divisor[num_dim - 1] = 1; - for (auto dim = num_dim - 2; dim >= 0; dim--) { - sizes[dim] = self.size(dim); - divisor[dim] = sizes[dim + 1] * divisor[dim + 1]; - } - - const int64_t N = num_nonzeros * num_dim; - // restore flatten idx to indices - FlattenIdxtoRealIdxKernelFunctor kfn( - N, - num_dim, - num_nonzeros, - tensor_begin, - idx_flat_begin, - divisor, - sizes); - - const auto wg_sz = std::min(syclMaxWorkGroupSize(kfn), N); - const auto num_wg = (N + wg_sz - 1) / wg_sz; - - sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), kfn); - } - if (need_to_copy) { - tensor.copy_(tensor_.t()); - } else { - // transpose out so it is correct size - Tensor tensor_temp = tensor_.t(); - tensor.set_(tensor_temp); + const int64_t N = self.numel(); + + Tensor idx_flat = at::empty( + {N}, out.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); + + const scalar_t* self_begin = self.const_data_ptr(); + int64_t* idx_flat_begin = idx_flat.data_ptr(); + int64_t* range_begin = nullptr; + + CopyIfFunc f(self_begin); + auto idx_flat_end = + pstl::copy_if(range_begin, range_begin + N, idx_flat_begin, f); + + auto num_nonzeros = std::distance(idx_flat_begin, idx_flat_end); + + bool need_to_copy = out.dim() == 2 && + out.sizes()[0] == num_nonzeros && out.sizes()[1] == num_dim && + !out.t().is_contiguous(); + Tensor out_ = need_to_copy + ? 
Tensor(at::detail::empty_xpu( + {num_dim, num_nonzeros}, out.options())) + : out.resize_({num_dim, num_nonzeros}); + + if (num_nonzeros > 0 && num_dim > 0) { + int64_t* out_begin = out_.data_ptr(); + + // preload sizes tensor for index calculation + int64_t sizes[XPU_MAX_TENSORINFO_DIMS]; + int64_t divisor[XPU_MAX_TENSORINFO_DIMS]; + sizes[num_dim - 1] = self.size(num_dim - 1); + divisor[num_dim - 1] = 1; + for (auto dim = num_dim - 2; dim >= 0; dim--) { + sizes[dim] = self.size(dim); + divisor[dim] = sizes[dim + 1] * divisor[dim + 1]; } + const int64_t N = num_nonzeros * num_dim; + // restore flatten idx to indices + FlattenIdxtoRealIdxKernelFunctor kfn( + N, + num_dim, + num_nonzeros, + out_begin, + idx_flat_begin, + divisor, + sizes); + + const auto wg_sz = std::min(syclMaxWorkGroupSize(kfn), N); + const auto num_wg = (N + wg_sz - 1) / wg_sz; + + sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), kfn); + } + if (need_to_copy) { + out.copy_(out_.t()); } else { - tensor = tensor.resize_({num_dim, N}).contiguous().t(); + // transpose out so it is correct size + Tensor out_temp = out_.t(); + out.set_(out_temp); } } diff --git a/src/ATen/native/xpu/sycl/Norm.h b/src/ATen/native/xpu/sycl/Norm.h index 6117b6d261..8b887a443a 100644 --- a/src/ATen/native/xpu/sycl/Norm.h +++ b/src/ATen/native/xpu/sycl/Norm.h @@ -269,8 +269,7 @@ class NormConfig { } void get_max_vec_size() { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int total_resource = syclMaxWorkItemsPerTile(dev_id); + int64_t total_resource = syclMaxWorkItemsPerTile(); constexpr int float4_size = sizeof(float) * 4; max_vec_size = float4_size / element_size_bytes; diff --git a/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp b/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp index f0188ce0ee..869a2b6627 100644 --- a/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp +++ b/src/ATen/native/xpu/sycl/RoiAlignKernels.cpp @@ -440,7 +440,6 @@ Tensor roi_align_kernel( at::Tensor output = at::zeros( {num_rois, channels, pooled_height, pooled_width}, input.options()); - auto output_size = num_rois * pooled_height * pooled_width * channels; if (output.numel() == 0) { return output; diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index fdedf5fb09..020fe4a1bc 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -1559,8 +1559,7 @@ void spatial_softmax_forward( canUse32BitIndexMath(input) && canUse32BitIndexMath(output); // decide SIMD: SIMD32 or SIMD16 - auto dev_id = at::xpu::getDeviceIndexOfCurrentQueue(); - auto* dev_prop = at::xpu::getDeviceProperties(dev_id); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -1749,8 +1748,7 @@ void spatial_softmax_backward( canUse32BitIndexMath(output) && canUse32BitIndexMath(gradOutput); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -1901,8 +1899,7 @@ Tensor& masked_softmax_forward( canUse32BitIndexMath(input) && canUse32BitIndexMath(output); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = 
dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -2026,8 +2023,7 @@ void masked_softmax_backward( canUse32BitIndexMath(output) && canUse32BitIndexMath(gradOutput); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto* dev_prop = at::xpu::getCurrentDeviceProperties(); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { diff --git a/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp b/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp index 8c46658e7a..fed5fee904 100644 --- a/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorShapeKernels.cpp @@ -669,8 +669,7 @@ void split_with_sizes_copy_out_xpu_contiguous_no_cast( num_groups += div_up(split_chunk_size, GROUP_SIZE * BYTES_PER_THREAD); } - auto dev_id = getDeviceIndexOfCurrentQueue(); - int64_t tile_size = syclMaxWorkItemsPerTile(dev_id); + int64_t tile_size = syclMaxWorkItemsPerTile(); const int64_t max_groups = tile_size / GROUP_SIZE * 2.0; // Make each thread process BYTES_PER_THREAD * iter_factor bytes to regulate diff --git a/src/comm/DeviceProperties.h b/src/comm/DeviceProperties.h index 1620ea1b7f..b4532fe59a 100644 --- a/src/comm/DeviceProperties.h +++ b/src/comm/DeviceProperties.h @@ -4,7 +4,6 @@ #include #include -#include namespace syclext = sycl::ext::oneapi; namespace syclexp = sycl::ext::oneapi::experimental; @@ -14,10 +13,9 @@ namespace sycl { template static int64_t syclMaxWorkGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { - auto q = c10::xpu::getCurrentXPUStream(dev_id).queue(); - auto ctx = q.get_context(); - auto dev = q.get_device(); + at::DeviceIndex dev_id = at::xpu::current_device()) { + auto& ctx = c10::xpu::get_device_context(); + auto& dev = c10::xpu::get_raw_device(dev_id); auto kid = ::sycl::get_kernel_id(); // The kernel won't be built for devices except for the first device. 
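For context on the `syclMaxWorkGroupSize` hunks above, here is a minimal standalone sketch of the SYCL 2020 kernel-bundle query that such a helper presumably wraps. The `ctx`/`dev` handles and the kernel name `MyKernel` are placeholders introduced for illustration only; they are not part of this patch, and the kernel is assumed to have been built into the program so the bundle lookup succeeds.

```cpp
#include <sycl/sycl.hpp>

class MyKernel; // hypothetical kernel name, for illustration only

// Query the largest work-group size a specific kernel can be launched with on
// a given device: look up the kernel id, fetch an executable kernel bundle
// containing it, then ask the kernel object for its device-specific limit.
size_t max_wg_size_for(const sycl::context& ctx, const sycl::device& dev) {
  auto kid = sycl::get_kernel_id<MyKernel>();
  auto bundle =
      sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx, {kid});
  auto kernel = bundle.get_kernel(kid);
  return kernel
      .get_info<sycl::info::kernel_device_specific::work_group_size>(dev);
}
```

This per-kernel limit can be smaller than the device-wide `max_work_group_size`, which is why a kernel-specific query is used for launch configuration.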
@@ -34,15 +32,15 @@ static int64_t syclMaxWorkGroupSize( template static int64_t syclMaxWorkGroupSize( - KernelClass /*kfn*/, - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + const KernelClass& /*kfn*/, + at::DeviceIndex dev_id = at::xpu::current_device()) { return syclMaxWorkGroupSize(dev_id); } // For SYCL free function template static int64_t syclMaxWorkGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto q = c10::xpu::getCurrentXPUStream(dev_id).queue(); auto ctxt = q.get_context(); auto dev = q.get_device(); @@ -54,67 +52,63 @@ static int64_t syclMaxWorkGroupSize( } static inline int64_t syclDeviceMaxWorkGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_work_group_size; } static inline int64_t syclMaxSubGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); - auto subgroup_sizes = dev_prop->sub_group_sizes; - uint64_t max_val = 0; - for (auto i : subgroup_sizes) { - if (i > max_val) - max_val = i; - } - return max_val; + const auto& subgroup_sizes = dev_prop->sub_group_sizes; + TORCH_CHECK( + !subgroup_sizes.empty(), + "The device subgroup sizes is empty, please check the device status."); + return *std::max_element(subgroup_sizes.begin(), subgroup_sizes.end()); } static inline int64_t syclMinSubGroupSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); - auto subgroup_sizes = dev_prop->sub_group_sizes; - uint64_t min_val = dev_prop->max_work_group_size; - for (auto i : subgroup_sizes) { - if (i < min_val) - min_val = i; - } - return min_val; + const auto& subgroup_sizes = dev_prop->sub_group_sizes; + TORCH_CHECK( + !subgroup_sizes.empty(), + "The device subgroup sizes is empty, please check the device status."); + return *std::min_element(subgroup_sizes.begin(), subgroup_sizes.end()); } static inline int64_t syclMaxComputeUnitSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_compute_units; } static inline int64_t syclGpuEuCount( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_eu_count; } static inline int64_t syclGpuEuSimdWidth( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_eu_simd_width; } static inline int64_t syclGpuHWThreadsPerEU( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->gpu_hw_threads_per_eu; } static inline int64_t syclGpuEUCountPerSubslice( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return 
dev_prop->gpu_eu_count_per_subslice; } static inline int64_t syclMaxWorkItemsPerTile( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); int64_t eu_cnt = dev_prop->gpu_eu_count; int64_t simd_width = syclMaxSubGroupSize(dev_id); @@ -123,7 +117,7 @@ static inline int64_t syclMaxWorkItemsPerTile( } static inline int64_t syclMaxWorkItemsPerSubSlice( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); int64_t simd_width = syclMaxSubGroupSize(dev_id); int64_t eu_count = dev_prop->gpu_eu_count_per_subslice; @@ -131,7 +125,7 @@ static inline int64_t syclMaxWorkItemsPerSubSlice( } static inline int64_t syclMaxWorkItemsPerEU( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); int64_t simd_width = syclMaxSubGroupSize(dev_id); int64_t hw_threads = dev_prop->gpu_hw_threads_per_eu; @@ -139,94 +133,76 @@ static inline int64_t syclMaxWorkItemsPerEU( } static inline int64_t syclMaxNumSubGroups( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_num_sub_groups; } static inline int64_t syclMaxDSSNum( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { int64_t dss_num = syclMaxComputeUnitSize(dev_id) / syclGpuEUCountPerSubslice(dev_id); return dss_num; } static inline size_t syclGlobalMemSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->global_mem_size; } static inline int64_t syclLocalMemSize( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->local_mem_size; } template uint32_t syclPrefVectorWidth( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { (void)dev_id; // Suppress unused variable warning // Hot fix. This is the preferred vector width for GPUs up to LNL/BMG. 
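// For example, with the 16-byte preferred width below, syclPrefVectorWidth<float>()
// evaluates to 16 / sizeof(float) == 4 lanes and syclPrefVectorWidth<sycl::half>()
// to 16 / sizeof(sycl::half) == 8; the template argument only selects the element size.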
- uint32_t vec_width = 16; + constexpr uint32_t vec_width = 16; - if (std::is_same<T, char>::value) { - return vec_width / sizeof(char); - } - if (std::is_same<T, short>::value) { - return vec_width / sizeof(short); - } - if (std::is_same<T, int>::value) { - return vec_width / sizeof(int); - } - if (std::is_same<T, int64_t>::value) { - return vec_width / sizeof(int64_t); - } - if (std::is_same<T, float>::value) { - return vec_width / sizeof(float); + if constexpr ( + std::is_same_v<T, char> || std::is_same_v<T, short> || + std::is_same_v<T, int> || std::is_same_v<T, int64_t> || + std::is_same_v<T, float> || std::is_same_v<T, double> || + std::is_same_v<T, ::sycl::half>) { + return vec_width / sizeof(T); + } else { + throw std::invalid_argument( + "Invalid data type to fetch preferred vector width!"); } - if (std::is_same<T, double>::value) { - return vec_width / sizeof(double); - } - if (std::is_same<T, ::sycl::half>::value) { - return vec_width / sizeof(::sycl::half); - } - throw std::invalid_argument( - "Invalid data type to fetch preferred vector width!"); } template <typename T> uint32_t syclNativeVectorWidth( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); - if (std::is_same<T, char>::value) { + if constexpr (std::is_same_v<T, char>) { return dev_prop->native_vector_width_char; - } - if (std::is_same<T, short>::value) { + } else if constexpr (std::is_same_v<T, short>) { return dev_prop->native_vector_width_short; - } - if (std::is_same<T, int>::value) { + } else if constexpr (std::is_same_v<T, int>) { return dev_prop->native_vector_width_int; - } - if (std::is_same<T, int64_t>::value) { + } else if constexpr (std::is_same_v<T, int64_t>) { return dev_prop->native_vector_width_long; - } - if (std::is_same<T, float>::value) { + } else if constexpr (std::is_same_v<T, float>) { return dev_prop->native_vector_width_float; - } - if (std::is_same<T, double>::value) { + } else if constexpr (std::is_same_v<T, double>) { return dev_prop->native_vector_width_double; - } - if (std::is_same<T, ::sycl::half>::value) { + } else if constexpr (std::is_same_v<T, ::sycl::half>) { return dev_prop->native_vector_width_half; + } else { + throw std::invalid_argument( + "Invalid data type to fetch native vector width!"); } - throw std::invalid_argument( - "Invalid data type to fetch native vector width!"); } static inline bool syclHasFloat64( - at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + at::DeviceIndex dev_id = at::xpu::current_device()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->has_fp64; } diff --git a/src/comm/Runtime.h b/src/comm/Runtime.h index fa7daaf125..4fd44d08e5 100644 --- a/src/comm/Runtime.h +++ b/src/comm/Runtime.h @@ -4,10 +4,6 @@ namespace at::xpu { -static inline at::DeviceIndex getDeviceIndexOfCurrentQueue() { - return c10::xpu::getCurrentXPUStream().device_index(); -} - static inline sycl::queue& getCurrentSYCLQueue() { return c10::xpu::getCurrentXPUStream().queue(); } diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp index c820a1c486..ba25869a2a 100644 --- a/src/xccl/ProcessGroupXCCL.cpp +++ b/src/xccl/ProcessGroupXCCL.cpp @@ -97,17 +97,17 @@ void checkSingleTensor( ) { if (!tensor.is_xpu() || tensor.is_sparse()) { C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense"); + } - // Skip the following requirements for P2P operations - if (!tensor.is_contiguous(tensor.suggest_memory_format())) { - if (p2p) { - TORCH_WARN_ONCE( - "Detected non-contiguous tensor in P2P operations. 
It is user " - "responsibility to guarantee that source and destination tensors have " - "the same contiguity format."); - } else { - C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); - } + // Skip the following requirements for P2P operations + if (!tensor.is_contiguous(tensor.suggest_memory_format())) { + if (p2p) { + TORCH_WARN_ONCE( + "Detected non-contiguous tensor in P2P operations. It is user " + "responsibility to guarantee that source and destination tensors have " + "the same contiguity format."); + } else { + C10_THROW_ERROR(ValueError, "Tensors must be contiguous"); } } } @@ -322,7 +322,9 @@ bool ProcessGroupXCCL::WorkXCCL::wait(std::chrono::milliseconds timeout) { return true; } -ProcessGroupXCCL::Options::Options() : Backend::Options(XCCL_BACKEND_NAME) {} +ProcessGroupXCCL::Options::Options(bool is_high_priority_stream) + : Backend::Options(XCCL_BACKEND_NAME), + is_high_priority_stream(is_high_priority_stream) {} static std::atomic process_group_id = 0; @@ -350,8 +352,13 @@ const std::string& ProcessGroupXCCL::logPrefix() const { return logPrefix_; } +const int& ProcessGroupXCCL::globalRank() const { + static int globalRank = rank_; + return globalRank; +} + ProcessGroupXCCL::ProcessGroupXCCL( - const c10::intrusive_ptr& store, + c10::intrusive_ptr store, int rank, int size, c10::intrusive_ptr options) @@ -377,7 +384,10 @@ ProcessGroupXCCL::ProcessGroupXCCL( std::string torch_distributed_debug = getCvarString({"TORCH_DISTRIBUTED_DEBUG"}, OFF.c_str()); LOG(INFO) << logPrefix() << "ProcessGroupXCCL initialization options: " - << "size: " << size << ", global rank: " << rank_; + << "size: " << size << ", global rank: " << globalRank() + << ", USE_HIGH_PRIORITY_STREAM: " + << options_->is_high_priority_stream + << ", PG Name: " << options_->group_name; LOG(INFO) << logPrefix() << "ProcessGroupXCCL environments: " << "XCCL version: " << XcclVersion @@ -405,7 +415,7 @@ bool ProcessGroupXCCL::dumpDebuggingInfo(bool includeStackTrace /*=true*/) { if (traceBufferSize_ > 0) { // TODO: dump_xccl_trace auto xcclTrace = dump_xccl_trace(true, includeStackTrace, false); - DebugInfoWriter& writer = DebugInfoWriter::getWriter(rank_); + DebugInfoWriter& writer = DebugInfoWriter::getWriter(globalRank()); LOG(INFO) << logPrefix() << "ProcessGroupXCCL dumping xccl trace to " << writer.getWriterTarget(); writer.write(xcclTrace); @@ -432,17 +442,6 @@ void ProcessGroupXCCL::setEnqueuedPgStatus( pgStatus_->lastEnqueuedNumelOut = work->numelOut_; } -void ProcessGroupXCCL::setCompletedPgStatus( - c10::intrusive_ptr work) { - pgStatus_->lastCompletedSeq = static_cast(work->getSequencenumber()); - pgStatus_->lastCompletedWorkName = opTypeToString(work->opType_); - pgStatus_->lastCompletedNumelIn = work->numelIn_; - pgStatus_->lastCompletedNumelOut = work->numelOut_; - // To avoid complexity, we're not computing duration. - FlightRecorderXCCL::get()->retire_id( - work->trace_id_, /*compute_duration*/ false); -} - void ProcessGroupXCCL::setSequenceNumberForGroup() {} uint64_t ProcessGroupXCCL::getSequenceNumberForGroup() { @@ -460,7 +459,8 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( bool isP2P, const char* profilingTitle, const std::vector& inputs, - const std::vector& outputs) { + const std::vector& outputs, + bool record) { auto r = c10::make_intrusive( device, rank, @@ -471,20 +471,22 @@ c10::intrusive_ptr ProcessGroupXCCL::initWork( profilingTitle != nullptr ? 
std::optional>(inputs) : std::nullopt); - r->trace_id_ = FlightRecorderXCCL::get()->record( - local_id_, - std::make_tuple(pg_uid_, pg_desc_), // PG name tuple - seqCollective_, - seqP2P_, - op_id_, - profilingTitle ? profilingTitle : "", - inputs, - outputs, - nullptr, - r->xcclEndEvent_.get(), - options_->timeout, - pgStatus_, - isP2P); + if (record) { + r->trace_id_ = FlightRecorderXCCL::get()->record( + local_id_, + std::make_tuple(pg_uid_, pg_desc_), // PG name tuple + seqCollective_, + seqP2P_, + op_id_, + profilingTitle ? profilingTitle : "", + inputs, + outputs, + nullptr, + r->xcclEndEvent_.get(), + options_->timeout, + pgStatus_, + isP2P); + } return r; } @@ -534,9 +536,9 @@ std::shared_ptr ProcessGroupXCCL::getXCCLComm( rank = p2pRank; } - c10::impl::VirtualGuardImpl impl(device.type()); - c10::Stream stream = - impl.getStreamFromGlobalPool(device, /*isHighPriority=*/false); + bool force_high = getCvarBool(TORCH_XCCL_HIGH_PRIORITY, false); + c10::Stream stream = at::xpu::getStreamFromPool( + options_->is_high_priority_stream || force_high); sycl::queue& q = c10::xpu::XPUStream(stream).queue(); auto ctx = ccl::create_context(q.get_context()); @@ -665,16 +667,19 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( const char* profilingTitle, bool nanCheck) { nanCheck &= enableNanCheck_; - seqCollective_++; auto device = inputs[0].device(); const auto key = std::to_string(device.index()); auto comm = getXCCLComm(key, device, opType); + if (!coalescing_state_) { + seqCollective_++; + } + op_id_++; + if (coalescing_state_ & CoalActive) { if ((coalescing_state_ & CoalColl) == 0) { seqCollective_++; } - op_id_++; coalescing_state_ |= CoalColl; if (coalescedDevice_.index() < 0) { coalescedDevice_ = device; @@ -715,8 +720,15 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( } c10::intrusive_ptr work; - work = - initWork(device, rank_, opType, false, profilingTitle, inputs, outputs); + work = initWork( + device, + rank_, + opType, + false, + profilingTitle, + inputs, + outputs, + !coalescing_state_); if (coalescing_state_) { FlightRecorderXCCL::get()->record( local_id_, @@ -772,8 +784,12 @@ c10::intrusive_ptr ProcessGroupXCCL::collective( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); + auto id = work->trace_id_; work->future_->addCallback( - [this, work](at::ivalue::Future&) { this->setCompletedPgStatus(work); }); + [id](at::ivalue::Future&) { + FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); + }, + /*use_future*/ false); work->blockingWait_ = blockingWait_; work->numelIn_ = 0; @@ -884,9 +900,12 @@ c10::intrusive_ptr ProcessGroupXCCL::pointToPoint( work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); work->future_->markCompleted(at::IValue(*work->outputs_)); - work->future_->addCallback([this, work](at::ivalue::Future&) { - this->setCompletedPgStatus(work); - }); + auto id = work->trace_id_; + work->future_->addCallback( + [id](at::ivalue::Future&) { + FlightRecorderXCCL::get()->retire_id(id, /*compute_duration*/ false); + }, + /*use_future*/ false); work->numelIn_ = work->numelOut_ = tensor.numel(); setEnqueuedPgStatus(work); @@ -2020,7 +2039,7 @@ c10::DeviceIndex ProcessGroupXCCL::guessDeviceId() const { return *usedDeviceIdxs_.begin(); } int devIdx = - static_cast(rank_ % at::detail::getXPUHooks().getNumGPUs()); + static_cast(globalRank() % at::detail::getXPUHooks().getNumGPUs()); LOG(WARNING) << logPrefix() << 
c10::str( diff --git a/src/xccl/ProcessGroupXCCL.hpp b/src/xccl/ProcessGroupXCCL.hpp index e7aa39c82d..bbc1f7cca8 100644 --- a/src/xccl/ProcessGroupXCCL.hpp +++ b/src/xccl/ProcessGroupXCCL.hpp @@ -24,6 +24,9 @@ #include namespace c10d { +static std::vector TORCH_XCCL_HIGH_PRIORITY = { + "TORCH_XCCL_HIGH_PRIORITY"}; + static std::vector TORCH_XCCL_BLOCKING_WAIT = { "TORCH_XCCL_BLOCKING_WAIT", "XCCL_BLOCKING_WAIT"}; @@ -118,18 +121,19 @@ class TORCH_API ProcessGroupXCCL : public Backend { }; struct Options : public Backend::Options { - explicit Options(); + explicit Options(bool is_high_priority_stream = false); - static c10::intrusive_ptr create() { - return c10::make_intrusive(); + static c10::intrusive_ptr create( + bool is_high_priority_stream = false) { + return c10::make_intrusive(is_high_priority_stream); } - + bool is_high_priority_stream; std::vector global_ranks_in_group; std::string group_name; }; ProcessGroupXCCL( - const c10::intrusive_ptr& store, + c10::intrusive_ptr store, int rank, int size, c10::intrusive_ptr options = Options::create()); @@ -138,11 +142,16 @@ class TORCH_API ProcessGroupXCCL : public Backend { const c10::intrusive_ptr& store, int rank, int size, - const std::string& groupName) - : ProcessGroupXCCL(store, rank, size) {} + const std::string& groupName, + c10::intrusive_ptr options = Options::create()) + : ProcessGroupXCCL(store, rank, size, std::move(options)) {} ~ProcessGroupXCCL() override; + c10::intrusive_ptr getOptions() { + return options_; + } + const std::string getBackendName() const override { return std::string(XCCL_BACKEND_NAME); } @@ -171,7 +180,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { bool isP2P, const char* profilingTitle = nullptr, const std::vector& inputs = {}, - const std::vector& outputs = {}); + const std::vector& outputs = {}, + bool record = false); template c10::intrusive_ptr collective( @@ -414,9 +424,8 @@ class TORCH_API ProcessGroupXCCL : public Backend { c10::DeviceIndex guessDeviceId() const; const std::vector& groupRanks() const; + const int& globalRank() const; void setEnqueuedPgStatus(c10::intrusive_ptr work); - void setCompletedPgStatus( - c10::intrusive_ptr work); bool dumpDebuggingInfo(bool includeStackTrace = true); protected: diff --git a/src/xccl/ProcessGroupXCCLMonitor.cpp b/src/xccl/ProcessGroupXCCLMonitor.cpp index 68fdd402c1..246bdf3a59 100644 --- a/src/xccl/ProcessGroupXCCLMonitor.cpp +++ b/src/xccl/ProcessGroupXCCLMonitor.cpp @@ -39,7 +39,7 @@ void HeartbeatMonitorXCCL::runLoop() { // We only need to dump once per PG, so we use local_id_ == 0 for the first PG if (pg_->local_id_ == 0) { // DumpPipe is one per-trainer process - dumpPipe.emplace(pg_->getRank()); + dumpPipe.emplace(pg_->globalRank()); while (true) { std::unique_lock lock(monitorMutex_); if (monitorWakeUpCV_.wait_for( diff --git a/test/regressions/test_grid_sample.py b/test/regressions/test_grid_sample.py new file mode 100644 index 0000000000..a7c28c10a9 --- /dev/null +++ b/test/regressions/test_grid_sample.py @@ -0,0 +1,19 @@ +# Owner(s): ["module: intel"] +import torch +import torch.nn.functional as F +from torch.testing._internal.common_utils import TestCase + +cpu_device = torch.device("cpu") +xpu_device = torch.device("xpu") + + +class TestSimpleCopy(TestCase): + # Refer to https://github.com/pytorch/pytorch/issues/153996 + def test_grid_sample(self, dtype=torch.float): + input_cpu = torch.rand(1, 2, 5, 5, device=cpu_device) + grid_cpu = torch.rand(1, 3, 3, 2, device=cpu_device) + out_cpu = F.grid_sample(input_cpu, grid_cpu, 
align_corners=False) + input_xpu = input_cpu.to(xpu_device) + grid_xpu = grid_cpu.to(xpu_device) + out_xpu = F.grid_sample(input_xpu, grid_xpu, align_corners=False) + self.assertEqual(out_cpu, out_xpu.to(cpu_device)) diff --git a/test/xpu/distributed/test_c10d_xccl.py b/test/xpu/distributed/test_c10d_xccl.py index 44a3ac148a..062b502781 100644 --- a/test/xpu/distributed/test_c10d_xccl.py +++ b/test/xpu/distributed/test_c10d_xccl.py @@ -1,16 +1,21 @@ # Owner(s): ["oncall: distributed"] +import json import math import os +import pickle import random import signal import sys +import tempfile +import threading import time -from datetime import timedelta +from datetime import datetime, timedelta from enum import auto, Enum from unittest import mock import torch +import torch._C._distributed_c10d import torch.distributed as c10d import torch.distributed._functional_collectives as _functional_collectives @@ -365,6 +370,31 @@ def test_nan_assert(self, type): # reset env os.environ["TORCH_XCCL_NAN_CHECK"] = "0" + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_oom(self): + pg = self._create_process_group_xccl() + dp_ranks = range(0, self.world_size) + dp_group = c10d.new_group(dp_ranks) + device = torch.device(f"xpu:{self.rank}") + torch.xpu.set_device(device) + + shape = (16384 * 2, 16384 * 2) + weight = torch.ones(shape, device=device).half() + gradient = torch.zeros(shape, device=device).half() + ret = torch.randn(shape, device=device).half() + + for iter in range(50): + output = torch.empty_like(ret) + output = ret + weight + gradient + ret = torch.nn.functional.linear(output, weight=ret) + dist.all_reduce(ret, op=dist.ReduceOp.SUM) + torch.xpu.synchronize() + self.assertLess( + torch.xpu.max_memory_allocated(), + torch.xpu.max_memory_reserved() * 2, + ) + class CommTest(MultiProcessTestCase): @property @@ -641,7 +671,7 @@ def test_unwaited(self) -> None: with _functional_collectives.allow_inflight_collective_as_graph_input_ctx(): self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) input = torch.full( - (10240, 10240), float(self.rank), device=f"xpu:{self.rank}" + (1024, 1024), float(self.rank), device=f"xpu:{self.rank}" ) dist.all_reduce(input, op=dist.ReduceOp.SUM, async_op=True) # Non-functional collectives run under the context manager is registered in the work registry. @@ -653,7 +683,7 @@ def test_unwaited(self) -> None: # Case 2: Run collectives not under context manager, and don't call wait on them. # NOTE: Here we intentionally test memory-stressed case. 
self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 2) - for _ in range(50000): + for _ in range(500): input = torch.full( (1024, 1024), float(self.rank), device=f"xpu:{self.rank}" ) @@ -708,6 +738,564 @@ def test_wait_tensor(self) -> None: self.assertEqual(input1, input2) +class XCCLTraceTestBase(MultiProcessTestCase): + def setUp(self): + super().setUp() + os.environ["TORCH_FR_BUFFER_SIZE"] = "1000" + self.tempdir = tempfile.TemporaryDirectory() + os.environ["TORCH_FR_DUMP_TEMP_FILE"] = self._trace_basename() + os.environ["TORCH_FR_DEBUG_INFO_PIPE_FILE"] = self._trace_basename() + self._spawn_processes() + + @classmethod + def _run( + cls, + parent_conn, + rank: int, + test_name: str, + file_name: str, + parent_pipe, + **kwargs, + ) -> None: + cls.parent = parent_conn + super()._run(rank, test_name, file_name, parent_pipe) + + @property + def local_device(self): + return torch.device("xpu", self.rank_to_GPU[self.rank][0]) + + def _join_processes(self, fn): + # We need to patch sys.exit() as skip_if will use sys.exit() and + # the exit code from the this process will not be caught. + with mock.patch("sys.exit"): + fn() + super()._join_processes(fn) + + def _spawn_processes(self) -> None: + proc = torch.multiprocessing.get_context("spawn").Process + self.children_pipes = [] + parent_pipes = [] + for _ in range(self.world_size): + parent_conn, child_conn = torch.multiprocessing.Pipe() + self.children_pipes.append(child_conn) + parent_pipes.append(parent_conn) + piter = iter(parent_pipes) + + def wrap(*positional, args, **kwargs): + args = (next(piter), *args) + return proc(*positional, args=args, **kwargs) + + self._start_processes(wrap) + + def _create_process_group_xccl( + self, timeout=timedelta(seconds=600), device_id=None + ): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + "xccl", + world_size=self.world_size, + rank=self.rank, + store=store, + timeout=timeout, + device_id=device_id, + ) + pg = c10d.distributed_c10d._get_default_group() + return pg + + def tearDown(self): + super().tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + @property + def world_size(self): + return 2 + + @property + def rank_to_GPU(self): + # return rank to GPU map + return init_multigpu_helper(self.world_size, "xccl") + + def _trace_basename(self): + # we pass the base to the env, and the dump util will append rank + return os.path.join(self.tempdir.name, "trace_") + + def _trace_name(self, rank): + return self._trace_basename() + str(rank) + + def started_or_scheduled(self, timing_enabled=False): + return "started" if timing_enabled else "scheduled" + + +class XCCLTraceTest(XCCLTraceTestBase): + def _verify_trace(self, t, include_collectives, is_json, timing_enabled=False): + ver = t["version"] + self.assertEqual(ver, "2.10") + comm_lib_version = t["comm_lib_version"] + torch_comm_lib_version = torch._C._distributed_c10d.get_xccl_version() + self.assertEqual(comm_lib_version, torch_comm_lib_version) + pg_config = t["pg_config"] + self.assertEqual(len(pg_config), 1) + default_pg_info = pg_config["0"] + self.assertIn("name", default_pg_info) + self.assertIn("desc", default_pg_info) + self.assertIn("ranks", default_pg_info) + pg_status = t["pg_status"] + self.assertEqual(len(pg_status), 1) + self.assertEqual(str(pg_status["0"]["last_enqueued_collective"]), "2") + self.assertEqual(str(pg_status["0"]["last_completed_collective"]), "2") + self.assertEqual( + str(pg_status["0"]["last_started_collective"]), + "2" if timing_enabled 
else "-1", + ) + global_ranks = pg_config["0"]["ranks"] + self.assertEqual(len(json.loads(global_ranks)), self.world_size) + if include_collectives: + self.assertEqual(len(t["entries"]), 2) + t = t["entries"] + last = t[-1] + self.assertEqual(last["thread_id"], str(threading.current_thread().ident)) + self.assertEqual(last["thread_name"], "fr_test_thread") + self.assertEqual(last["process_group"], ("0", "default_pg")) + # self.assertEqual(last["state"], "completed") # Watchdog will fix marking works completed + s = last["time_discovered_started_ns"] + f = last["time_discovered_completed_ns"] + self.assertEqual(last["record_id"], 1) + # self.assertIsNotNone(f) + if timing_enabled: + self.assertIsNotNone(s) + self.assertTrue(s <= f) + # we don't collect stack traces in JSON at the moment + if not is_json: + self.assertIn("test_c10d_xccl.py", str(last["frames"])) + self.assertEqual(last["input_sizes"], ((3, 4),)) + self.assertEqual(last["input_dtypes"], ["Float"]) + self.assertEqual(last["output_sizes"], ((3, 4),)) + self.assertEqual(last["output_dtypes"], ["Float"]) + self.assertEqual(last["collective_seq_id"], 2) + self.assertEqual(last["timeout_ms"], 600000) + now = datetime.now() + event_created_time = datetime.fromtimestamp( + last["time_created_ns"] / 1000000000 + ) + before_test = now - timedelta(minutes=1) + self.assertTrue(before_test < event_created_time < now) + if timing_enabled: + # very loose bounds, measured 0.036 ms on devgpu + self.assertTrue(0 < last["duration_ms"] < 100) + else: + self.assertTrue("duration_ms" not in last) + else: + self.assertTrue("entries" not in t) + + def load_libpthread_or_libc(self): + import ctypes.util + + for base in ("pthread", "c"): + path = ctypes.util.find_library(base) + if path: + try: + return ctypes.CDLL(path) + except OSError: + continue + raise RuntimeError("Could not load pthread or libc") + + # Directly set thread name using threading.current_thread().name does not work + # because we use pthread_getname_np to get the threadโ€™s OS-level name in C++ + def set_thread_name(self, name): + import ctypes + + lib = self.load_libpthread_or_libc() + pthread_self = lib.pthread_self + pthread_self.restype = ctypes.c_void_p + pthread_setname_np = lib.pthread_setname_np + pthread_setname_np.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + + # Get current pthread handle + tid = pthread_self() + + # Set name + pthread_setname_np(tid, name.encode()) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize("include_collectives", [True, False]) + def test_short_pickle(self, include_collectives, timing_enabled=False): + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + device = self.local_device + self.set_thread_name("fr_test_thread") + a = torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + # gah ok so now the duration_ms is populated best-effort since it can only happen outside "dump()" api + time.sleep(1) + t = pickle.loads( + torch._C._distributed_c10d._dump_xccl_trace( + includeCollectives=include_collectives + ) + ) + self._verify_trace( + t, + include_collectives=include_collectives, + is_json=True, + timing_enabled=timing_enabled, + ) + dist.destroy_process_group() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_dump_pipe(self): + def open_file_with_timeout(file_path, mode, timeout=1.0): + start_time = time.time() + while time.time() - start_time < 
timeout: + if os.path.exists(file_path): + return open(file_path, mode) + time.sleep(0.1) + raise FileNotFoundError + + if self.rank == self.MAIN_PROCESS_RANK: + for c in self.children_pipes: + self.assertEqual(c.recv(), "next") + + dump_file = self._trace_name(rank=0) + pipe_file = dump_file + ".pipe" + with open_file_with_timeout(pipe_file, "w") as f: + f.write("1\n") + with open_file_with_timeout(dump_file, "rb", timeout=10.0) as f: + self.assertTrue("all_reduce" in str(pickle.load(f))) + + for c in self.children_pipes: + c.send("next") + return + + pg = self._create_process_group_xccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + self.parent.send("next") + self.parent.recv() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_long(self): + os.environ["TORCH_FR_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + # test some other primitives to make sure + # their strings are valid + xs = [torch.ones(3, 4, device=device)] + pg.broadcast(xs).wait() + pg.allreduce(xs).wait() + # pg.reduce(xs).wait() //Currently failing on XPU + ys = [[torch.empty(3, 4, device=device) for _ in range(self.world_size)]] + pg.allgather(ys, xs).wait() + pg.reduce_scatter(xs, ys).wait() + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 10) + first = t[0] + last = t[-1] + self.assertEqual(last["profiling_name"], "xccl:all_reduce") + # self.assertEqual(last["state"], "completed") # Watchdog will fix marking works completed + self.assertIn("test_c10d_xccl.py", str(last["frames"])) + self.assertEqual(last["input_sizes"], ((3, 4),)) + self.assertEqual(last["input_dtypes"], ["Float"]) + self.assertEqual(last["output_sizes"], ((3, 4),)) + self.assertEqual(last["output_dtypes"], ["Float"]) + self.assertEqual(last["timeout_ms"], 600000) + self.assertEqual(last["collective_seq_id"] - first["collective_seq_id"], 9) + dist.destroy_process_group() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + def test_barrier_profiling(self): + os.environ["TORCH_FR_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + f = pg.barrier() + f = pg.allreduce(a) + f.wait() + torch.xpu.synchronize(device=device) + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 2) + first = t[0] + last = t[-1] + self.assertEqual(first["profiling_name"], "xccl:all_reduce_barrier") + self.assertEqual(last["profiling_name"], "xccl:all_reduce") + dist.destroy_process_group() + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize( + "op_sizes_per_coalesce", + [ + [(2, 3)], + [(2, 3), (5, 5), (1,)], + ], + ) + @parametrize("timing_enabled", [False]) + def test_batched_send_recv(self, op_sizes_per_coalesce, timing_enabled): + """ + 'WorkEnqueue' was skipped for isendirecv, leading to segfault on dump_entries when update_state tried to use + a destructed Work obj's xpu events + """ + + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + 
pg._enable_collectives_timing() + + num_coalesced_ops = 20 + ops_per_coalesce = len(op_sizes_per_coalesce) + for _ in range(num_coalesced_ops): + ops = [] + for input_sizes in op_sizes_per_coalesce: + tensor = torch.zeros(input_sizes).to(self.local_device) + if self.rank == 0: + ops.append(dist.P2POp(dist.irecv, tensor, 1)) + elif self.rank == 1: + tensor *= 2 + ops.append(dist.P2POp(dist.isend, tensor, 0)) + + dist.batch_isend_irecv(ops).pop().wait() + + torch.xpu.synchronize(device=self.local_device) + + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + self.assertEqual(len(t["entries"]), num_coalesced_ops * ops_per_coalesce) + + expected_record_id = 0 + expected_seq = 1 + expected_op_id = 1 + for seq in range(num_coalesced_ops): + first_op = seq * (ops_per_coalesce) + coalesced_op = first_op + ops_per_coalesce + for p2p_op_idx, input_sizes in zip( + range(first_op, coalesced_op, 1), op_sizes_per_coalesce + ): + # the indivudal ops inside the coalescing group the individual op metadata, + # but not the timing info coming from the actual coalesced kernel + profiling_name = ( + "xccl:recv 0<-1" if self.rank == 0 else "xccl:send 1->0" + ) + self.assertEqual( + t["entries"][p2p_op_idx]["record_id"], expected_record_id + ) + expected_record_id += 1 + self.assertEqual( + t["entries"][p2p_op_idx]["profiling_name"], profiling_name + ) + # we don't increment collective_seq_id for p2p ops. + self.assertEqual(t["entries"][p2p_op_idx]["collective_seq_id"], 0) + self.assertEqual(t["entries"][p2p_op_idx]["p2p_seq_id"], expected_seq) + self.assertEqual(t["entries"][p2p_op_idx]["op_id"], expected_op_id) + expected_op_id += 1 + self.assertEqual(t["entries"][p2p_op_idx]["input_sizes"], [input_sizes]) + self.assertEqual( + t["entries"][p2p_op_idx]["output_sizes"], [input_sizes] + ) + # duration doesn't get tagged onto individual ops yet, nor is their state updated + self.assertEqual(t["entries"][p2p_op_idx]["state"], "scheduled") + self.assertTrue("duration_ms" not in t["entries"][p2p_op_idx]) + + # coalesced ops not yet supported in FR + expected_seq += 1 + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize( + "op_sizes", + [ + [(2, 3)], + [(2, 3), (5, 5), (1,)], + ], + ) + @parametrize("timing_enabled", [False]) + def test_individual_send_recv(self, op_sizes, timing_enabled): + """ + 'WorkEnqueue' was skipped for isendirecv, leading to segfault on dump_entries when update_state tried to use + a destructed Work obj's xpu events + """ + + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + num_repeats = 10 + ops_per_repeat = len(op_sizes) + for _ in range(num_repeats): + for input_sizes in op_sizes: + tensor = torch.zeros(input_sizes).to(self.local_device) + if self.rank == 0: + dist.recv(tensor, 1) + elif self.rank == 1: + tensor *= 2 + dist.send(tensor, 0) + + torch.xpu.synchronize(device=self.local_device) + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + self.assertEqual(len(t["entries"]), num_repeats * (ops_per_repeat)) + expected_seq = 1 + expected_op_id = 1 + for seq in range(num_repeats * ops_per_repeat): + input_sizes = op_sizes[seq % ops_per_repeat] + profiling_name = "xccl:recv 0<-1" if self.rank == 0 else "xccl:send 1->0" + 
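# Flight recorder entries are named from each rank's own perspective:
# rank 0 records the receive as "xccl:recv 0<-1", while rank 1 records
# the matching send as "xccl:send 1->0".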
self.assertEqual(t["entries"][seq]["profiling_name"], profiling_name) + # we don't increment collective_seq_id for p2p ops. + self.assertEqual(t["entries"][seq]["collective_seq_id"], 0) + self.assertEqual(t["entries"][seq]["p2p_seq_id"], expected_seq) + expected_seq += 1 + self.assertEqual(t["entries"][seq]["op_id"], expected_op_id) + expected_op_id += 1 + self.assertEqual(t["entries"][seq]["input_sizes"], [input_sizes]) + self.assertEqual(t["entries"][seq]["output_sizes"], [input_sizes]) + self.assertEqual(t["entries"][seq]["state"], "completed") + + if timing_enabled: + duration = t["entries"][seq]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][seq]) + + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize("timing_enabled", [False]) + def test_allgather_uneven(self, timing_enabled): + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + + output_split_sizes = [i + 1 for i in range(self.world_size)] + sum_len = sum(output_split_sizes) + output_tensor = torch.zeros(sum_len, 2).to(self.rank) + expected_tensor = torch.ones(sum_len, 2).to(self.rank) + input_tensor = torch.ones(output_split_sizes[self.rank], 2).to(self.rank) + + dist.all_gather( + list(torch.split(output_tensor, output_split_sizes)), input_tensor + ) + torch.xpu.synchronize(device=self.rank) + self.assertEqual(output_tensor, expected_tensor) + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + self.assertEqual(len(t["entries"]), self.world_size) + for i in range(self.world_size): + self.assertEqual(t["entries"][i]["profiling_name"], "xccl:_broadcast_oop") + # collective_seq_id should be incremented once. + self.assertEqual(t["entries"][i]["collective_seq_id"], 1) + self.assertEqual(t["entries"][i]["input_sizes"], [[i + 1, 2]]) + self.assertEqual( + t["entries"][i]["output_sizes"], + [[i + 1, 2]], + ) + self.assertEqual(t["entries"][i]["state"], "scheduled") + # No event is recorded for individual ops + self.assertTrue("time_discovered_completed_ns" in t["entries"][i]) + # TODO: (frost-intel) Add coalesced op recording for FR + + # TODO(whc) test out other ops (And combinations of ops, if that's valid?) + @requires_xccl() + @skip_if_lt_x_gpu(2) + @parametrize("timing_enabled", [False]) + def test_coalescing_manager_collective(self, timing_enabled): + """ + The coalescing manager api works by accumulating operations in python via a contextmanager, and then making + one call into c++ to an _coalesced API. It has limited support for ops and has been added recently to + avoid overheads of making individual py-cpp calls. This complicates flight recording.. + + For now, flight recording of coalescing_manager collectives is less detailed than cpp coalesced collectives. 
+ """ + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_xccl() + if timing_enabled: + pg._enable_collectives_timing() + + output_tensors = torch.zeros(2, 2).to(self.rank) + input_tensors = [torch.ones(2, 2).to(self.rank) for _ in range(self.world_size)] + + # TODO(whc) make this work with bigger world or something + self.assertEqual(self.world_size, 2, self.world_size) + + with dist._coalescing_manager(): + for i in range(self.world_size): + dist.reduce_scatter_tensor(output_tensors[i], input_tensors[i]) + self.assertEqual(output_tensors, input_tensors[self.rank] * self.world_size) + + torch.xpu.synchronize(device=self.rank) + + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch._C._distributed_c10d._dump_xccl_trace()) + + self.assertEqual( + len(t["entries"]), 1 + ) # one for the reduce_scatter_tensor_coalesced + self.assertEqual( + t["entries"][0]["profiling_name"], "xccl:reduce_scatter_tensor_coalesced" + ) + # collective_seq_id should be incremented once. + self.assertEqual(t["entries"][0]["collective_seq_id"], 1) + self.assertEqual(t["entries"][0]["input_sizes"], [[2, 2], [2, 2]]) + self.assertEqual( + t["entries"][0]["output_sizes"], + [ + [ + 2, + ], + [ + 2, + ], + ], + ) + self.assertEqual(t["entries"][0]["state"], "completed") + if timing_enabled: + duration = t["entries"][0]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][0]) + + +instantiate_parametrized_tests(XCCLTraceTest) instantiate_parametrized_tests(ProcessGroupXCCLTest) diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py index ddde5f8c8a..cd04712091 100644 --- a/test/xpu/run_distributed.py +++ b/test/xpu/run_distributed.py @@ -9,9 +9,41 @@ res2 = 0 fail_test = [] -# libfabric WA to avoid hang issue -os.environ["FI_PROVIDER"] = "tcp" -# os.environ["ZE_AFFINITY_MASK"] = "0,1,2,3" +# Get the xelink group card affinity +ret = os.system("xpu-smi topology -m 2>&1|tee topology.log") +if ret == 0: + gpu_dict = {} + with open("topology.log") as file: + lines = file.readlines() + for line in lines: + if "CPU Affinity" in line: + continue + line = line.strip() + if line.startswith("GPU "): + items = line.split(" ") + items = [x for x in items if x] + gpu_id = items[1] + i = gpu_id.split("/")[0] + affinity = "" + for j, item in enumerate(items): + if "SYS" not in item and ("XL" in item or "S" in item): + if len(affinity) == 0: + affinity = str(j - 2) + else: + affinity = affinity + "," + str(j - 2) + gpu_dict[i] = affinity + + max_affinity = "" + for key, value in gpu_dict.items(): + if len(value) > len(max_affinity): + max_affinity = value + + os.environ["ZE_AFFINITY_MASK"] = str(max_affinity) + print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK"))) + +else: + print("xpu-smi topology failed") + sys.exit(255) # run python test @@ -26,6 +58,10 @@ def run(test_command): test_command = ["python", "distributed/test_c10d_ops_xccl.py"] res += run(test_command) +test_command = ["python", "../../../../test/distributed/pipelining/test_backward.py"] +res += run(test_command) +test_command = ["python", "../../../../test/distributed/pipelining/test_microbatch.py"] +res += run(test_command) # run pytest with skiplist for key in skip_dict: @@ -38,8 +74,4 @@ def run(test_command): if fail_test: print(",".join(fail_test) + " have failures") -exit_code = os.WEXITSTATUS(res2) -if exit_code == 0: - sys.exit(res) -else: - 
sys.exit(exit_code) +sys.exit(res) diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py index ae3a8b8a93..9eeb6b44c4 100644 --- a/test/xpu/skip_list_common.py +++ b/test/xpu/skip_list_common.py @@ -74,12 +74,6 @@ "test_compare_cpu_linalg_lu_factor_ex_xpu_float32", "test_compare_cpu_linalg_lu_factor_xpu_float32", "test_compare_cpu_linalg_lu_xpu_float32", - # XPU hang. CUDA hang as well. - # https://github.com/pytorch/pytorch/issues/79528 - "test_compare_cpu_special_hermite_polynomial_he_xpu_float32", - "test_compare_cpu_special_hermite_polynomial_h_xpu_float32", - "test_compare_cpu_special_laguerre_polynomial_l_xpu_float32", - "test_compare_cpu_special_legendre_polynomial_p_xpu_float32", # core dump "test_dtypes__refs_nn_functional_pdist_xpu", # XFAIL of CUDA and XPU, unexpected success in fallback @@ -1022,10 +1016,6 @@ "test_ctc_loss_cudnn_tensor", # want "xpu" in function name # RuntimeError: reflection_pad2d_backward_xpu does not have a deterministic implementation, but you set 'torch.use_deterministic_algorithms(True)'. "test_ReflectionPad2d_large_deterministic_xpu", - # Case updated in pytorch commit 97272e4 - "test_hardswish_grad_corner_xpu_bfloat16", - "test_hardswish_grad_corner_xpu_float16", - "test_hardswish_grad_corner_xpu_float32", # x_cuda = x.clone().detach().to("cuda").requires_grad_(): Torch not compiled with CUDA enabled "test_layer_norm_backwards_eps", ), diff --git a/test/xpu/skip_list_dist.py b/test/xpu/skip_list_dist.py index 1210896ec6..cf5ed5cd78 100644 --- a/test/xpu/skip_list_dist.py +++ b/test/xpu/skip_list_dist.py @@ -1,105 +1,36 @@ skip_dict = { - "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": ( - "test_checkpoint_fsdp_wrapping_cpu_offload0_offload_activations_False_use_orig_params_False", - "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_False_use_orig_params_False", - "test_checkpoint_fsdp_wrapping_cpu_offload1_offload_activations_True_use_orig_params_False", - "test_checkpoint_submodule_use_reentrant_False_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None, "../../../../test/distributed/fsdp/test_fsdp_apply.py": None, "../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": ( "test_ddp_parity_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_comm.py": None, - "../../../../test/distributed/fsdp/test_fsdp_core.py": ( - "test_delayed_optim_step_offload_false_no_shard_xpu", - "test_delayed_optim_step_offload_false_none_xpu", - "test_delayed_optim_step_offload_false_shard_grad_op_xpu", - "test_delayed_optim_step_offload_true_none_xpu", - "test_delayed_optim_step_offload_true_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_false_no_shard_xpu", - "test_delayed_reduce_scatter_offload_false_none_xpu", - "test_delayed_reduce_scatter_offload_false_shard_grad_op_xpu", - "test_delayed_reduce_scatter_offload_true_none_xpu", - "test_delayed_reduce_scatter_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_offload_false_no_shard_xpu", - "test_mixture_of_experts_offload_false_none_xpu", - "test_mixture_of_experts_offload_false_shard_grad_op_xpu", - "test_mixture_of_experts_offload_true_no_shard_xpu", - "test_mixture_of_experts_offload_true_none_xpu", - "test_mixture_of_experts_offload_true_shard_grad_op_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_no_shard_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_none_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_false_shard_grad_op_xpu", - 
"test_mixture_of_experts_with_delay_before_free_offload_true_no_shard_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_none_xpu", - "test_mixture_of_experts_with_delay_before_free_offload_true_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_false_no_shard_xpu", - "test_nested_always_wrap_model_offload_false_none_xpu", - "test_nested_always_wrap_model_offload_false_shard_grad_op_xpu", - "test_nested_always_wrap_model_offload_true_none_xpu", - "test_nested_always_wrap_model_offload_true_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_false_no_shard_xpu", - "test_nested_wrapped_model_offload_false_none_xpu", - "test_nested_wrapped_model_offload_false_shard_grad_op_xpu", - "test_nested_wrapped_model_offload_true_none_xpu", - "test_nested_wrapped_model_offload_true_shard_grad_op_xpu", - "test_transformer_offload_false_no_shard_xpu", - "test_transformer_offload_false_none_xpu", - "test_transformer_offload_false_shard_grad_op_xpu", - "test_transformer_offload_true_none_xpu", - "test_transformer_offload_true_shard_grad_op_xpu", - # https://github.com/intel/torch-xpu-ops/issues/1475 - "test_transformer_no_grad_mixed_precision_True_xpu", - "test_transformer_no_grad_mixed_precision_False_xpu", - ), - # Will add them back after debugging - # "../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": ( - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_is_even_sharded_model_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_is_even_sharded_model_True_xpu", - # "test_fsdp_init_with_device_mesh_is_even_sharded_model_False_xpu", - # "test_fsdp_init_with_device_mesh_is_even_sharded_model_True_xpu", - # "test_raises_warning_or_errors_xpu", - # ), - "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": ( - "test_invalid_first_iter_order_sharding_strategy1_xpu", - "test_train_eval_sharding_strategy1_xpu", - ), + "../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None, + "../../../../test/distributed/fsdp/test_fsdp_core.py": None, + "../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None, "../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": ( "test_parity_with_non_frozen_fsdp_xpu", "test_parity_with_ddp_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_fx.py": None, - # will bring back after oneccl upgrade to 2021.16.1 - # "../../../../test/distributed/fsdp/test_fsdp_input.py": None, + "../../../../test/distributed/fsdp/test_fsdp_input.py": None, 
"../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None, "../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": ( "test_transformer_no_grad_mixed_precision_True_xpu", ), "../../../../test/distributed/fsdp/test_fsdp_uneven.py": None, - # Will add them back after debugging - # "../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": ( - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_model_load_state_dict_offload_to_cpu_True_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_optim_load_state_dict_offload_to_cpu_True_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_False_xpu", - # "test_dtensor_sharded_tensor_state_dict_identical_offload_to_cpu_True_xpu", - # "test_hsdp_init_with_device_mesh_xpu", - # "test_root_module_is_not_FSDP_xpu", - # ), "../../../../test/distributed/fsdp/test_utils.py": None, "distributed/test_c10d_xccl.py": ( - # will bring back after oneccl upgrade to 2021.16.1 - "test_xccl_barrier", + # https://github.com/intel/torch-xpu-ops/issues/2046 + "test_unwaited", + ), + "distributed/test_c10d_ops_xccl.py": None, + "../../../../test/distributed/fsdp/test_fsdp_misc.py": None, + "../../../../test/distributed/test_functional_api.py": ( + # depends on https://github.com/pytorch/pytorch/pull/159473 + "test_tracing_with_fakepg_xpu", ), + "../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_mem_tracker.py": None, + "../../../../test/distributed/_tools/test_memory_tracker.py": None, } diff --git a/test/xpu/test_tensor_creation_ops_xpu.py b/test/xpu/test_tensor_creation_ops_xpu.py index 95a30716ea..077fbb6ad5 100644 --- a/test/xpu/test_tensor_creation_ops_xpu.py +++ b/test/xpu/test_tensor_creation_ops_xpu.py @@ -4371,6 +4371,14 @@ def test_full_like_inference(self, device): torch.full_like(like, 1.0, dtype=torch.complex64).dtype, torch.complex64 ) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_zeros_large(self, device, dtype): + output = torch.zeros(2**31 - 1, device=device, dtype=dtype) + + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_ones_large(self, device, dtype): + output = torch.ones(2**31 - 1, device=device, dtype=dtype) + # Tests for the `frombuffer` function (only work on CPU): # Constructs tensors from Python objects that implement the buffer protocol, diff --git a/tools/codegen/install_xpu_headers.py b/tools/codegen/install_xpu_headers.py index 807a509bc6..0b5ad70660 100644 --- a/tools/codegen/install_xpu_headers.py +++ b/tools/codegen/install_xpu_headers.py @@ -4,6 +4,7 @@ import shutil from pathlib import Path +VERBOSE = False parser = argparse.ArgumentParser(description="Utils for append ops headers") parser.add_argument( @@ -20,34 +21,53 @@ def append_xpu_function_header(src, dst): r""" Cleans trailing empty lines from the destination file, then appends #include lines from the source file that match `#include \s*\r?\n" matches = re.findall(pattern, src_text, re.MULTILINE) if not matches: return - with open(dst, "r+", encoding="utf-8") as f: - dst_lines = f.readlines() - dst_text = "".join(dst_lines) - missing_headers = [match for match in matches if match not in dst_text] - if not missing_headers: - return + try: + with open(dst, encoding="utf-8") as f: + dst_lines = f.readlines() + dst_text = "".join(dst_lines) + except OSError as e: + if VERBOSE: + 
print(f"Warning: Could not read destination file {dst}: {e}") + return + + missing_headers = [match for match in matches if match not in dst_text] + if not missing_headers: + return + + new_dst_lines = dst_lines.copy() + + while new_dst_lines and not new_dst_lines[-1].strip(): + new_dst_lines.pop() + new_dst_lines.extend(missing_headers) - # Remove trailing empty lines from dst_lines - while dst_lines and not dst_lines[-1].strip(): - dst_lines.pop() + new_content = "".join(new_dst_lines) + old_content = "".join(dst_lines) - f.seek(0) - f.truncate() - f.writelines(dst_lines) - # Append missing headers to the end of the file - f.writelines(missing_headers) + if new_content != old_content: + try: + with open(dst, "w", encoding="utf-8") as f: + f.writelines(new_dst_lines) + except OSError as e: + if VERBOSE: + print(f"Error: Could not write to {dst}: {e}") def parse_ops_headers(src): @@ -78,18 +98,36 @@ def classify_ops_headers(src_dir, dst_dir): def generate_xpu_ops_headers_cmake(src_dir, dst_dir, xpu_ops_headers): r""" - Generate XPU ops headers xpu_ops_generated_headers.cmake + Generate XPU ops headers xpu_ops_generated_headers.cmake only if content changes """ - with open(os.path.join(src_dir, "xpu_ops_generated_headers.cmake"), "w", encoding="utf-8") as fw: - fw.write("set(xpu_ops_generated_headers\n") - for header in xpu_ops_headers: - fw.write(f' "{Path(os.path.join(dst_dir, header)).as_posix()}"\n') - fw.write(")\n") + output_file = os.path.join(src_dir, "xpu_ops_generated_headers.cmake") + + # Generate new content + new_content = "set(xpu_ops_generated_headers\n" + for header in xpu_ops_headers: + new_content += f' "{Path(os.path.join(dst_dir, header)).as_posix()}"\n' + new_content += ")\n" + + # Check if file exists and has same content + should_write = True + if os.path.exists(output_file): + try: + with open(output_file, encoding="utf-8") as f: + existing_content = f.read() + should_write = existing_content != new_content + except OSError: + # If we can't read the file, write it anyway + should_write = True + + if should_write: + with open(output_file, "w", encoding="utf-8") as fw: + fw.write(new_content) def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): r""" For XPU-specific ops headers, copy them to destination build and append XPU declarations to common headers. + Copies and appends are done only if leading to file changes to prevent unnecessary recompilations. """ if args.dry_run: return @@ -99,7 +137,16 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): # assert "xpu" in f, f"Error: The function signature or namespace in '{f}' is incorrect. Expected 'xpu' to be present." 
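# Copy each XPU-specific header only when the destination is missing or its
# bytes differ; unchanged headers keep their timestamps and do not trigger
# needless recompilation.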
src = os.path.join(src_dir, f) dst = os.path.join(dst_dir, f) - shutil.copy(src, dst) + # Only copy if src and dst differ or dst does not exist + should_copy = True + if os.path.exists(dst): + try: + with open(src, "rb") as fsrc, open(dst, "rb") as fdst: + should_copy = fsrc.read() != fdst.read() + except OSError: + should_copy = True + if should_copy: + shutil.copy(src, dst) for f in common_headers: src = os.path.join(src_dir, f) @@ -118,6 +165,7 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): with open(dst, "r+", encoding="utf-8") as f: dst_lines = f.readlines() dst_text = "".join(dst_lines) + old_content = "".join(dst_lines) missing_declarations = [] insertion_index = None for index, line in enumerate(dst_lines): @@ -133,9 +181,10 @@ def append_xpu_ops_headers(src_dir, dst_dir, common_headers, xpu_ops_headers): break assert (insertion_index is not None), f"Error: No TORCH_API declaration found in {dst}." - f.seek(0) - f.writelines(dst_lines) - f.truncate() + if old_content != "".join(dst_lines): + f.seek(0) + f.writelines(dst_lines) + f.truncate() def main(): diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml index 2df8464262..a3281791de 100644 --- a/yaml/native/native_functions.yaml +++ b/yaml/native/native_functions.yaml @@ -2071,6 +2071,15 @@ XPU: nonzero_xpu tags: [dynamic_output_shape, core] +- func: nonzero_static.out(Tensor self, *, SymInt size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!) + dispatch: + XPU: nonzero_static_out_xpu + +- func: nonzero_static(Tensor self, *, SymInt size, int fill_value=-1) -> Tensor + variants: method, function + dispatch: + XPU: nonzero_static_xpu + - func: maximum(Tensor self, Tensor other) -> Tensor structured_delegate: maximum.out device_check: NoCheck # TensorIterator
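For context, a minimal usage sketch (not part of the patch) of the `nonzero_static` overloads that the YAML entries above register for XPU; it assumes a PyTorch build with XPU support and these kernels available:

```python
import torch

# Sketch only: assumes an XPU-enabled PyTorch build that dispatches
# nonzero_static to the nonzero_static_xpu kernels registered above.
x = torch.tensor([[0.0, 1.5], [2.0, 0.0]], device="xpu")

# nonzero_static returns exactly `size` index rows regardless of how many
# nonzero elements exist, padding any missing rows with `fill_value`.
idx = torch.nonzero_static(x, size=4, fill_value=-1)
print(idx.cpu())  # expected: [[0, 1], [1, 0], [-1, -1], [-1, -1]]

# The .out overload writes into a preallocated index tensor instead.
out = torch.empty((4, 2), dtype=torch.long, device="xpu")
torch.nonzero_static(x, size=4, fill_value=-1, out=out)
```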