diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 34c1f0d45..597b2ee5c 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -13,14 +13,20 @@ shift
 echo "Building ${IMAGE_NAME} Docker image"
 
 OS=ubuntu
-OS_VERSION=20.04
 CLANG_VERSION=""
 PYTHON_VERSION=3.11
 MINICONDA_VERSION=24.3.0-0
 
 case "${IMAGE_NAME}" in
   torchtitan-ubuntu-20.04-clang12)
+    OS_VERSION=20.04
     CLANG_VERSION=12
+    BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
+    ;;
+  torchtitan-rocm-ubuntu-22.04-clang12)
+    OS_VERSION=22.04
+    CLANG_VERSION=12
+    BASE_IMAGE=rocm/dev-ubuntu-${OS_VERSION}:latest
     ;;
   *)
     echo "Invalid image name ${IMAGE_NAME}"
@@ -30,6 +36,7 @@ esac
 docker build \
   --no-cache \
   --progress=plain \
+  --build-arg "BASE_IMAGE=${BASE_IMAGE}" \
   --build-arg "OS_VERSION=${OS_VERSION}" \
   --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
@@ -38,3 +45,4 @@ docker build \
   -f "${OS}"/Dockerfile \
   "$@" \
   .
+
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
index 39e4d8ec5..5d10c01b7 100644
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -1,6 +1,6 @@
-ARG OS_VERSION
+ARG BASE_IMAGE
 
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
+FROM ${BASE_IMAGE}
 
 ARG OS_VERSION
 
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 11ff5390c..d5f52824d 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -22,13 +22,16 @@ concurrency:
 
 jobs:
   docker-build:
-    runs-on: [self-hosted, linux.2xlarge]
-    timeout-minutes: 240
     strategy:
       fail-fast: false
       matrix:
         include:
           - docker-image-name: torchtitan-ubuntu-20.04-clang12
+            runner: [self-hosted, linux.2xlarge]
+          - docker-image-name: torchtitan-rocm-ubuntu-22.04-clang12
+            runner: [linux.2xlarge]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 240
     env:
       DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/torchtitan/${{ matrix.docker-image-name }}
     steps:
diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml
index 4f3421fce..85ddbb580 100644
--- a/.github/workflows/integration_test_8gpu_features.yaml
+++ b/.github/workflows/integration_test_8gpu_features.yaml
@@ -1,4 +1,5 @@
 name: 8 GPU Feature Tests
+
 on:
   push:
     branches: [ main ]
@@ -19,18 +20,40 @@ defaults:
   run:
     shell: bash -l -eo pipefail {0}
 
+permissions:
+  id-token: write
+  contents: read
+
 jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      matrix:
+        include:
+          - name: cuda
+            runner: linux.g5.48xlarge.nvidia.gpu
+            gpu-arch-type: cuda
+            gpu-arch-version: "12.6"
+            # This image is faster to clone than the default, but it lacks CC needed by triton
+            # (1m25s vs 2m37s).
+            docker-image: torchtitan-ubuntu-20.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/cu126
+            is-rocm: 0
+          - name: rocm
+            runner: linux.rocm.gpu.gfx942.4
+            gpu-arch-type: rocm
+            gpu-arch-version: "6.4"
+            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/rocm6.4
+            is-rocm: 1
     with:
-      runner: linux.g5.48xlarge.nvidia.gpu
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
-      # This image is faster to clone than the default, but it lacks CC needed by triton
-      # (1m25s vs 2m37s).
-      docker-image: torchtitan-ubuntu-20.04-clang12
+      runner: ${{ matrix.runner }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       repository: pytorch/torchtitan
       upload-artifact: outputs
+      timeout: 45
       script: |
         set -eux
 
@@ -44,9 +67,9 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
 
         mkdir artifacts-to-be-uploaded
-        python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 8
+        TEST_WITH_ROCM=${{ matrix.is-rocm }} python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 4
diff --git a/tests/integration_tests/run_tests.py b/tests/integration_tests/run_tests.py
index a64c69eb6..57b9fca2b 100644
--- a/tests/integration_tests/run_tests.py
+++ b/tests/integration_tests/run_tests.py
@@ -24,6 +24,13 @@
 }
 
 
+# tests skipped for ROCm
+skip_for_rocm_test_list = [
+    "model_only_hf_checkpoint",
+]
+TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1"
+
+
 def _run_cmd(cmd):
     return subprocess.run([cmd], text=True, shell=True)
 
@@ -87,6 +94,10 @@ def run_tests(args, test_list: list[OverrideDefinitions]):
         if test_flavor.disabled:
             continue
 
+        # Skip the test for ROCm
+        if TEST_WITH_ROCM and test_flavor.test_name in skip_for_rocm_test_list:
+            continue
+
         # Check if we have enough GPUs
         if args.ngpu < test_flavor.ngpu:
             logger.info(
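For readers skimming the patch, the new ROCm gating in `tests/integration_tests/run_tests.py` reduces to an environment-variable check against a small skip list: the CI matrix exports `TEST_WITH_ROCM` (1 on the ROCm runner, 0 on CUDA), and any test whose name appears in `skip_for_rocm_test_list` is skipped. The sketch below illustrates that pattern in isolation; the `TestFlavor` dataclass, the `select_tests` helper, and the second sample test name are hypothetical stand-ins for torchtitan's `OverrideDefinitions` entries, while `TEST_WITH_ROCM` and `skip_for_rocm_test_list` mirror the patch above.

```python
import os
from dataclasses import dataclass

# Mirrors the patch: tests that are skipped when running on ROCm runners.
skip_for_rocm_test_list = [
    "model_only_hf_checkpoint",
]
# The workflow matrix exports TEST_WITH_ROCM=1 for the ROCm job and 0 for CUDA.
TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1"


@dataclass
class TestFlavor:
    """Hypothetical stand-in for torchtitan's OverrideDefinitions."""

    test_name: str
    ngpu: int
    disabled: bool = False


def select_tests(test_list: list[TestFlavor], ngpu: int) -> list[TestFlavor]:
    """Return the tests that would actually run, applying the same gates as run_tests()."""
    selected = []
    for test_flavor in test_list:
        if test_flavor.disabled:
            continue
        # Skip the test for ROCm, as in the patched run_tests()
        if TEST_WITH_ROCM and test_flavor.test_name in skip_for_rocm_test_list:
            continue
        # Skip tests that need more GPUs than the runner provides
        if ngpu < test_flavor.ngpu:
            continue
        selected.append(test_flavor)
    return selected


if __name__ == "__main__":
    tests = [
        TestFlavor("model_only_hf_checkpoint", ngpu=4),
        TestFlavor("gradient_accumulation", ngpu=4),  # hypothetical test name
    ]
    # With TEST_WITH_ROCM=1 only the second test survives; with 0 both run.
    print([t.test_name for t in select_tests(tests, ngpu=4)])
```

Gating via an environment variable keeps the skip list in one place inside the test runner, so the same workflow file can drive both the CUDA and ROCm matrix entries without duplicating the test invocation.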