diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 1d97b8c5de21..b005093420d8 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -34,6 +34,10 @@ runners: cpu: 16 family: ["c6g", "c7g"] image: linux-arm64 + linux-arm64-gpu: + family: ["g5g.xlarge"] + image: linux-arm64 + spot: false windows-gpu: family: ["g4dn.2xlarge"] image: windows-amd64 diff --git a/.github/workflows/cuda13.yml b/.github/workflows/cuda13.yml index 5ea448f25cce..b6f33291d178 100644 --- a/.github/workflows/cuda13.yml +++ b/.github/workflows/cuda13.yml @@ -36,6 +36,29 @@ jobs: --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ --prefix cache/${{ github.run_id }}/build-cuda13 \ build/testxgboost python-package/dist/*.whl + + build-cuda13-arm64: + name: Build CUDA 13 (ARM64) + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-arm64-cpu + - tag=cuda13-build-cuda13-arm64 + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh + - run: | + bash ops/pipeline/build-cuda13.sh + - name: Stash files + run: | + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/build-cuda13-arm64 \ + python-package/dist/*.whl test-cpp-cuda13: name: Google Test (C++) with CUDA 13 needs: [build-cuda13] @@ -62,12 +85,22 @@ jobs: - run: | bash ops/pipeline/test-cpp-cuda13.sh test-python-cuda13: - name: Run Python tests with CUDA 13 - needs: [build-cuda13] + name: Run Python tests with CUDA 13 (${{ matrix.description }}) + needs: [build-cuda13, build-cuda13-arm64] runs-on: - runs-on=${{ github.run_id }} - - runner=linux-amd64-gpu - - tag=cuda13-test-python-cuda13 + - runner=${{ matrix.runner }} + - tag=cuda13-test-python-cuda13-${{ matrix.description }} + strategy: + fail-fast: false + matrix: + include: 
+ - description: amd64 + runner: linux-amd64-gpu + artifact_from: build-cuda13 + - description: arm64 + runner: linux-arm64-gpu + artifact_from: build-cuda13-arm64 steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -80,7 +113,7 @@ jobs: run: | python3 ops/pipeline/manage-artifacts.py download \ --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ - --prefix cache/${{ github.run_id }}/build-cuda13 \ + --prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \ --dest-dir wheelhouse \ *.whl - name: Run Python tests diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 11fb4ff0a7df..5f7aa9aa95dd 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -94,6 +94,28 @@ jobs: bash ops/pipeline/build-cuda.sh \ xgb-ci.gpu_build_rockylinux8_dev_ver enable-rmm + build-cuda-arm64: + name: Build CUDA + manylinux_2_28_aarch64 wheel + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-arm64-cpu + - tag=main-build-cuda-arm64 + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Log into Docker registry (AWS ECR) + run: bash ops/pipeline/login-docker-registry.sh + - run: bash ops/pipeline/build-cuda-arm64.sh + - name: Stash files + run: | + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix cache/${{ github.run_id }}/build-cuda-arm64 \ + python-package/dist/*.whl + build-python-wheels-arm64: name: Build manylinux_2_28_aarch64 wheel runs-on: @@ -211,7 +233,7 @@ jobs: test-python-wheel: name: Run Python tests (${{ matrix.description }}) - needs: [build-cuda, build-python-wheels-arm64] + needs: [build-cuda, build-cuda-arm64, build-python-wheels-arm64] runs-on: - runs-on - runner=${{ matrix.runner }} @@ -242,6 +264,11 @@ jobs: suite: cpu-arm64 runner: linux-arm64-cpu artifact_from: 
build-python-wheels-arm64 + - description: gpu-arm64 + image_repo: xgb-ci.gpu_aarch64 + suite: gpu-arm64 + runner: linux-arm64-gpu + artifact_from: build-cuda-arm64 steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst index 964d833e4756..1e9319b1593d 100644 --- a/doc/contrib/ci.rst +++ b/doc/contrib/ci.rst @@ -198,6 +198,15 @@ Examples: useful tasks for local development --image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_build_rockylinux8:main \ -- ops/pipeline/build-cuda-impl.sh +* Build XGBoost with GPU support on Linux ARM64 + + .. code-block:: bash + + export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com + python3 ops/docker_run.py \ + --image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_build_rockylinux8_aarch64:main \ + -- ops/pipeline/build-cuda-impl.sh + * Run Python tests .. code-block:: bash @@ -217,6 +226,16 @@ Examples: useful tasks for local development --use-gpus \ -- ops/pipeline/test-python-wheel-impl.sh gpu +* Run Python tests with GPU algorithm on Linux ARM64 + + .. code-block:: bash + + export DOCKER_REGISTRY=492475357299.dkr.ecr.us-west-2.amazonaws.com + python3 ops/docker_run.py \ + --image-uri ${DOCKER_REGISTRY}/xgb-ci.gpu_aarch64:main \ + --use-gpus \ + -- ops/pipeline/test-python-wheel-impl.sh gpu-arm64 + * Run Python tests with GPU algorithm, with multiple GPUs .. code-block:: bash @@ -287,6 +306,8 @@ To opt into self-hosted runners (enabled by RunsOn), we use the following specia - tag=[unique tag that uniquely identifies the job in the GH Action workflow] where the runner is defined in ``.github/runs-on.yml``. +For CUDA-enabled ARM64 builds and tests we rely on the ``linux-arm64-gpu`` runner, +which provisions a Graviton + NVIDIA GPU instance. 
=================================================================== The Lay of the Land: how CI pipelines are organized in the codebase diff --git a/doc/install.rst b/doc/install.rst index 7fcea0d3b68c..ea466c624acf 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -67,7 +67,7 @@ Capabilities of binary wheels for each platform: +=====================+=========+======================+ | Linux x86_64 | |tick| | |tick| | +---------------------+---------+----------------------+ -| Linux aarch64 | |cross| | |cross| | +| Linux aarch64 | |tick| | |cross| | +---------------------+---------+----------------------+ | MacOS x86_64 | |cross| | |cross| | +---------------------+---------+----------------------+ @@ -76,6 +76,11 @@ Capabilities of binary wheels for each platform: | Windows | |tick| | |cross| | +---------------------+---------+----------------------+ +Linux aarch64 wheels now ship with CUDA support, so ``pip install xgboost`` on +modern Jetson or Graviton machines provides the same GPU functionality as the +Linux x86_64 wheel. Multi-node and multi-GPU training remain experimental on +ARM64 at this time. + Minimal installation (CPU-only) ******************************* The default installation with ``pip`` will install the full XGBoost package, including the support for the GPU algorithms and federated learning. 
diff --git a/ops/pipeline/build-cuda-arm64.sh b/ops/pipeline/build-cuda-arm64.sh new file mode 100755 index 000000000000..f6078cba5298 --- /dev/null +++ b/ops/pipeline/build-cuda-arm64.sh @@ -0,0 +1,75 @@ +#!/bin/bash +## Build XGBoost with CUDA for Linux ARM64 + +set -euo pipefail + +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi + +IMAGE_REPO="xgb-ci.gpu_build_rockylinux8_aarch64" +export USE_FEDERATED=1 +export USE_RMM=0 + +source ops/pipeline/classify-git-branch.sh +source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh + +WHEEL_TAG=manylinux_2_28_aarch64 +BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}" +MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}" + +echo "--- Build with CUDA (ARM64)" + +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then + export BUILD_ONLY_SM75=1 +else + export BUILD_ONLY_SM75=0 +fi + +set -x + +python3 ops/docker_run.py \ + --image-uri ${BUILD_IMAGE_URI} \ + --run-args='-e BUILD_ONLY_SM75 -e USE_RMM -e USE_FEDERATED' \ + -- ops/pipeline/build-cuda-impl.sh + +echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" +python3 ops/docker_run.py \ + --image-uri ${MANYLINUX_IMAGE_URI} \ + -- auditwheel repair --only-plat \ + --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 -m wheel tags --python-tag py3 --abi-tag none --platform ${WHEEL_TAG} --remove \ + wheelhouse/*.whl +mv -v wheelhouse/*.whl python-package/dist/ +if ! 
unzip -l ./python-package/dist/*.whl | grep libgomp > /dev/null; then + echo "error: libgomp.so was not vendored in the wheel" + exit 1 +fi + +# Check size of wheel +pydistcheck --config python-package/pyproject.toml python-package/dist/*.whl + +echo "--- Generate meta info" +python3 ops/script/format_wheel_meta.py \ + --wheel-path python-package/dist/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} \ + --meta-path python-package/dist/ + +echo "--- Upload Python wheel" +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket xgboost-nightly-builds \ + --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \ + python-package/dist/*.whl + python3 ops/pipeline/manage-artifacts.py upload \ + --s3-bucket xgboost-nightly-builds \ + --prefix ${BRANCH_NAME} --make-public \ + python-package/dist/meta.json +fi + diff --git a/ops/pipeline/build-cuda13.sh b/ops/pipeline/build-cuda13.sh index 8e24e8147b70..36caec14ae31 100755 --- a/ops/pipeline/build-cuda13.sh +++ b/ops/pipeline/build-cuda13.sh @@ -9,15 +9,29 @@ then exit 1 fi -IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8" export USE_RMM=0 export USE_FEDERATED=0 +ARCH=$(uname -m) +case "${ARCH}" in + x86_64) + IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8" + WHEEL_TAG=manylinux_2_28_x86_64 + ;; + aarch64) + IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8_aarch64" + WHEEL_TAG=manylinux_2_28_aarch64 + ;; + *) + echo "Unsupported architecture: ${ARCH}" + exit 1 + ;; +esac + source ops/pipeline/classify-git-branch.sh source ops/pipeline/get-docker-registry-details.sh source ops/pipeline/get-image-tag.sh -WHEEL_TAG=manylinux_2_28_x86_64 BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}" MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}" diff --git a/ops/pipeline/test-python-wheel-cuda13.sh b/ops/pipeline/test-python-wheel-cuda13.sh index 279411779927..495fe5672aa5 100755 --- 
a/ops/pipeline/test-python-wheel-cuda13.sh +++ b/ops/pipeline/test-python-wheel-cuda13.sh @@ -6,7 +6,20 @@ set -euo pipefail source ops/pipeline/get-docker-registry-details.sh source ops/pipeline/get-image-tag.sh -IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8" +ARCH=$(uname -m) +case "${ARCH}" in + x86_64) + IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8" + ;; + aarch64) + IMAGE_REPO="xgb-ci.gpu_build_cuda13_rockylinux8_aarch64" + ;; + *) + echo "Unsupported architecture: ${ARCH}" + exit 1 + ;; +esac + IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}" set -x diff --git a/ops/pipeline/test-python-wheel-impl.sh b/ops/pipeline/test-python-wheel-impl.sh index 5c24e31210d2..88270e85bf5a 100755 --- a/ops/pipeline/test-python-wheel-impl.sh +++ b/ops/pipeline/test-python-wheel-impl.sh @@ -13,7 +13,7 @@ suite="$1" # Cannot set -u before Conda env activation case "$suite" in - gpu|mgpu) + gpu|mgpu|gpu-arm64) source activate gpu_test ;; cpu) @@ -42,6 +42,11 @@ case "$suite" in python -c 'from cupy.cuda import jitify; jitify._init_module()' pytest -v -s -rxXs --durations=0 -m 'not mgpu' tests/python-gpu ;; + gpu-arm64) + echo "-- Run Python tests, using a single GPU (ARM64)" + python -c 'from cupy.cuda import jitify; jitify._init_module()' + pytest -v -s -rxXs --durations=0 -m 'not mgpu' tests/python-gpu + ;; mgpu) echo "-- Run Python tests, using multiple GPUs" python -c 'from cupy.cuda import jitify; jitify._init_module()' diff --git a/ops/pipeline/test-python-wheel.sh b/ops/pipeline/test-python-wheel.sh index 9ccdc42042d5..bc83504f2fba 100755 --- a/ops/pipeline/test-python-wheel.sh +++ b/ops/pipeline/test-python-wheel.sh @@ -5,14 +5,14 @@ set -euo pipefail if [[ "$#" -lt 2 ]] then - echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} [image_repo]" + echo "Usage: $0 {gpu|mgpu|gpu-arm64|cpu|cpu-arm64} [image_repo]" exit 1 fi suite="$1" image_repo="$2" -if [[ "$suite" == "gpu" || "$suite" == "mgpu" ]] +if [[ "$suite" == "gpu" || "$suite" == "mgpu" || "$suite" == 
"gpu-arm64" ]] then gpu_option="--use-gpus" else diff --git a/ops/script/release_artifacts.py b/ops/script/release_artifacts.py index ef05a71420ac..a26f5c3ba449 100644 --- a/ops/script/release_artifacts.py +++ b/ops/script/release_artifacts.py @@ -154,6 +154,7 @@ def download_python_wheels(branch: str, commit_hash: str, outdir: Path) -> None: ] cu13_platforms = [ "manylinux_2_28_x86_64", + "manylinux_2_28_aarch64", ] minimal_platforms = [ "win_amd64",