From e7a9e0b81ccf2805e8633ff99b0c901bcad8fdc1 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 3 Jun 2025 23:45:19 -0500 Subject: [PATCH 01/33] Added support to run torchtitan tests on ROCm. --- .ci/docker/build.sh | 26 +++++++++++++++----- .ci/docker/ubuntu-cuda/Dockerfile | 41 +++++++++++++++++++++++++++++++ .ci/docker/ubuntu-rocm/Dockerfile | 16 ++++++++++++ 3 files changed, 77 insertions(+), 6 deletions(-) create mode 100644 .ci/docker/ubuntu-cuda/Dockerfile create mode 100644 .ci/docker/ubuntu-rocm/Dockerfile diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 34c1f0d45..5511acabf 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -12,15 +12,29 @@ shift echo "Building ${IMAGE_NAME} Docker image" +# set operating system OS=ubuntu -OS_VERSION=20.04 -CLANG_VERSION="" -PYTHON_VERSION=3.11 -MINICONDA_VERSION=24.3.0-0 + +# set Dockerfile +DOCKERFILE="${OS}/Dockerfile" +if [[ "$IMAGE_NAME" == *cuda* ]]; then + DOCKERFILE="${OS}-cuda/Dockerfile" +elif [[ "$IMAGE_NAME" == *rocm* ]]; then + DOCKERFILE="${OS}-rocm/Dockerfile" +fi case "${IMAGE_NAME}" in torchtitan-ubuntu-20.04-clang12) + OS_VERSION=20.04 CLANG_VERSION=12 + PYTHON_VERSION=3.11 + MINICONDA_VERSION=24.3.0-0 + ;; + torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3) + OS_VERSION=22.04 + CLANG_VERSION=19 + PYTHON_VERSION=3.10 + MINICONDA_VERSION=25.3.1-0 ;; *) echo "Invalid image name ${IMAGE_NAME}" @@ -34,7 +48,7 @@ docker build \ --build-arg "CLANG_VERSION=${CLANG_VERSION}" \ --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \ - --shm-size=1g \ - -f "${OS}"/Dockerfile \ + -f $(dirname ${DOCKERFILE})/Dockerfile \ "$@" \ . 
+ diff --git a/.ci/docker/ubuntu-cuda/Dockerfile b/.ci/docker/ubuntu-cuda/Dockerfile new file mode 100644 index 000000000..39e4d8ec5 --- /dev/null +++ b/.ci/docker/ubuntu-cuda/Dockerfile @@ -0,0 +1,41 @@ +ARG OS_VERSION + +FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION} + +ARG OS_VERSION + +ENV DEBIAN_FRONTEND noninteractive + +# Install common dependencies +COPY ./common/install_base.sh install_base.sh +RUN bash ./install_base.sh && rm install_base.sh + +# Install clang +ARG CLANG_VERSION +COPY ./common/install_clang.sh install_clang.sh +RUN bash ./install_clang.sh && rm install_clang.sh + +# Install gcc +ARG GCC_VERSION +COPY ./common/install_gcc.sh install_gcc.sh +RUN bash ./install_gcc.sh && rm install_gcc.sh + +# Setup user +COPY ./common/install_user.sh install_user.sh +RUN bash ./install_user.sh && rm install_user.sh + +# Install conda and other dependencies +ARG MINICONDA_VERSION +ARG PYTHON_VERSION +ENV PYTHON_VERSION=$PYTHON_VERSION +ENV PATH /opt/conda/envs/py_$PYTHON_VERSION/bin:/opt/conda/bin:$PATH +COPY requirements-dev.txt /opt/conda/ +COPY requirements.txt /opt/conda/ +COPY requirements-flux.txt /opt/conda/ +COPY conda-env-ci.txt /opt/conda/ +COPY ./common/install_conda.sh install_conda.sh +COPY ./common/utils.sh utils.sh +RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/conda-env-ci.txt + +USER ci-user +CMD ["bash"] diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile new file mode 100644 index 000000000..6f292dfdd --- /dev/null +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -0,0 +1,16 @@ +# base image +FROM rocm/pytorch-nightly:latest + +# args +ARG OS_VERSION +ARG CLANG_VERSION +ARG GCC_VERSION +ARG MINICONDA_VERSION +ARG PYTHON_VERSION + +# install dependencies +COPY requirements.txt requirements.txt +RUN pip install -r ./requirements.txt + +CMD ["bash"] + From 
04a17180ed86a32cdf31594032e7f2a381866a73 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 4 Jun 2025 22:53:57 -0500 Subject: [PATCH 02/33] Added rocm ci support for integration_test_h100. --- .../integration_test_8gpu_h100_rocm.yaml | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .github/workflows/integration_test_8gpu_h100_rocm.yaml diff --git a/.github/workflows/integration_test_8gpu_h100_rocm.yaml b/.github/workflows/integration_test_8gpu_h100_rocm.yaml new file mode 100644 index 000000000..3debb1760 --- /dev/null +++ b/.github/workflows/integration_test_8gpu_h100_rocm.yaml @@ -0,0 +1,37 @@ +name: 8 GPU Integration Test at H100 + +on: + push: + branches: [ main ] + pull_request: + schedule: + # Runs every 6 hours + - cron: '0 */6 * * *' +concurrency: + group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash -l -eo pipefail {0} + +jobs: + build-test: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.rocm.gpu.mi300.8 + gpu-arch-type: rocm + gpu-arch-version: "6.4" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). + docker-image: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 + repository: pytorch/torchtitan + upload-artifact: outputs + script: | + set -eux + + USE_CPP=0 python -m pip install --pre torchao + + mkdir artifacts-to-be-uploaded + python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8 + From 7894f3fb2c46fafe347aab25ef36632aab9c7422 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Sat, 7 Jun 2025 00:40:28 -0500 Subject: [PATCH 03/33] Fixed a bug in build script. Removed ubuntu-cuda folder, instead using ubuntu folder for cuda Dockerfile. 
--- .ci/docker/build.sh | 4 +- .ci/docker/ubuntu-cuda/Dockerfile | 41 ------------------- ...m.yaml => integration_test_8gpu_rocm.yaml} | 8 ++-- 3 files changed, 4 insertions(+), 49 deletions(-) delete mode 100644 .ci/docker/ubuntu-cuda/Dockerfile rename .github/workflows/{integration_test_8gpu_h100_rocm.yaml => integration_test_8gpu_rocm.yaml} (70%) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 5511acabf..a1aafe3d3 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -17,9 +17,7 @@ OS=ubuntu # set Dockerfile DOCKERFILE="${OS}/Dockerfile" -if [[ "$IMAGE_NAME" == *cuda* ]]; then - DOCKERFILE="${OS}-cuda/Dockerfile" -elif [[ "$IMAGE_NAME" == *rocm* ]]; then +if [[ "$IMAGE_NAME" == *rocm* ]]; then DOCKERFILE="${OS}-rocm/Dockerfile" fi diff --git a/.ci/docker/ubuntu-cuda/Dockerfile b/.ci/docker/ubuntu-cuda/Dockerfile deleted file mode 100644 index 39e4d8ec5..000000000 --- a/.ci/docker/ubuntu-cuda/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -ARG OS_VERSION - -FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION} - -ARG OS_VERSION - -ENV DEBIAN_FRONTEND noninteractive - -# Install common dependencies -COPY ./common/install_base.sh install_base.sh -RUN bash ./install_base.sh && rm install_base.sh - -# Install clang -ARG CLANG_VERSION -COPY ./common/install_clang.sh install_clang.sh -RUN bash ./install_clang.sh && rm install_clang.sh - -# Install gcc -ARG GCC_VERSION -COPY ./common/install_gcc.sh install_gcc.sh -RUN bash ./install_gcc.sh && rm install_gcc.sh - -# Setup user -COPY ./common/install_user.sh install_user.sh -RUN bash ./install_user.sh && rm install_user.sh - -# Install conda and other dependencies -ARG MINICONDA_VERSION -ARG PYTHON_VERSION -ENV PYTHON_VERSION=$PYTHON_VERSION -ENV PATH /opt/conda/envs/py_$PYTHON_VERSION/bin:/opt/conda/bin:$PATH -COPY requirements-dev.txt /opt/conda/ -COPY requirements.txt /opt/conda/ -COPY requirements-flux.txt /opt/conda/ -COPY conda-env-ci.txt /opt/conda/ -COPY ./common/install_conda.sh 
install_conda.sh -COPY ./common/utils.sh utils.sh -RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/conda-env-ci.txt - -USER ci-user -CMD ["bash"] diff --git a/.github/workflows/integration_test_8gpu_h100_rocm.yaml b/.github/workflows/integration_test_8gpu_rocm.yaml similarity index 70% rename from .github/workflows/integration_test_8gpu_h100_rocm.yaml rename to .github/workflows/integration_test_8gpu_rocm.yaml index 3debb1760..bb64ad528 100644 --- a/.github/workflows/integration_test_8gpu_h100_rocm.yaml +++ b/.github/workflows/integration_test_8gpu_rocm.yaml @@ -1,4 +1,4 @@ -name: 8 GPU Integration Test at H100 +name: 8 GPU Integration Test on: push: @@ -17,13 +17,11 @@ defaults: jobs: build-test: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.rocm.gpu.mi300.8 gpu-arch-type: rocm gpu-arch-version: "6.4" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). docker-image: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 repository: pytorch/torchtitan upload-artifact: outputs @@ -33,5 +31,5 @@ jobs: USE_CPP=0 python -m pip install --pre torchao mkdir artifacts-to-be-uploaded - python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8 + python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 From 041c04bcb9462a6f4b2b78df6aa5fc9903c6ad41 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 10 Jun 2025 20:07:10 -0500 Subject: [PATCH 04/33] Added tests.integration_tests.features during rebase. 
--- tests/integration_tests/features.py | 71 +++++++++++++++++------------ 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py index 31c15017d..20ae39af4 100755 --- a/tests/integration_tests/features.py +++ b/tests/integration_tests/features.py @@ -7,6 +7,8 @@ from tests.integration_tests import OverrideDefinitions +test_with_rocm = os.getenv("TEST_WITH_ROCM", "0") + def build_features_test_list() -> list[OverrideDefinitions]: """ @@ -129,28 +131,34 @@ def build_features_test_list() -> list[OverrideDefinitions]: "Checkpoint Integration Test - Save Model Only bf16", "last_save_model_only_bf16", ), - OverrideDefinitions( - [ + ] + # check test_with_rocm + if test_with_rocm != "1": + integration_tests_flavors["debug_model.toml"].extend([ + OverrideDefinitions( [ - "--parallelism.pipeline_parallel_degree 4", - "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble", + [ + "--parallelism.pipeline_parallel_degree 4", + "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble", + ], ], - ], - "PP looped zero bubble test", - "pp_looped_zero_bubble", - ngpu=4, - ), - OverrideDefinitions( - [ + "PP looped zero bubble test", + "pp_looped_zero_bubble", + ngpu=4, + ), + OverrideDefinitions( [ - "--parallelism.pipeline_parallel_degree 2", - "--parallelism.pipeline_parallel_schedule ZBVZeroBubble", + [ + "--parallelism.pipeline_parallel_degree 2", + "--parallelism.pipeline_parallel_schedule ZBVZeroBubble", + ], ], - ], - "PP zero bubble test (v shaped)", - "pp_zbv", - ngpu=2, - ), + "PP zero bubble test (v shaped)", + "pp_zbv", + ngpu=2, + ), + ]) + integration_tests_flavors["debug_model.toml"].extend([ OverrideDefinitions( [ [ @@ -262,18 +270,24 @@ def build_features_test_list() -> list[OverrideDefinitions]: "pp_looped_1f1b", ngpu=4, ), - OverrideDefinitions( - [ + ]) + # check test_with_rocm + if test_with_rocm != "1": + integration_tests_flavors["debug_model.toml"].extend( + 
OverrideDefinitions( [ - "--parallelism.pipeline_parallel_degree 2", - "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti", - "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv", + [ + "--parallelism.pipeline_parallel_degree 2", + "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti", + "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv", + ], ], - ], - "PP with custom pipeline schedule loaded from CSV file", - "pp_custom_csv", - ngpu=2, - ), + "PP with custom pipeline schedule loaded from CSV file", + "pp_custom_csv", + ngpu=2, + ), + ) + integration_tests_flavors["debug_model.toml"].extend([ OverrideDefinitions( [ [ @@ -528,5 +542,4 @@ def build_features_test_list() -> list[OverrideDefinitions]: ngpu=8, ), ] - return integration_tests_flavors From 19863fbd8edd957fb38a5d3b7478a70907926ef7 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Fri, 13 Jun 2025 13:51:04 -0500 Subject: [PATCH 05/33] Modified docker-builds.yml to build rocm docker image for torchtitan. 
--- .github/workflows/docker-builds.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 11ff5390c..75d5082c5 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -22,13 +22,16 @@ concurrency: jobs: docker-build: - runs-on: [self-hosted, linux.2xlarge] + runs-on: [self-hosted, linux.2xlarge, linux.rocm.gpu.mi300.8] timeout-minutes: 240 strategy: fail-fast: false matrix: include: - - docker-image-name: torchtitan-ubuntu-20.04-clang12 + - docker-image-name: [ + torchtitan-ubuntu-20.04-clang12, + torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 + ] env: DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/torchtitan/${{ matrix.docker-image-name }} steps: From cacfd759594c95ee356b6448ef754e7e1f80e3ba Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 18 Jun 2025 13:04:59 -0500 Subject: [PATCH 06/33] Fixed runner for cuda and rocm images in docker-builds.yml. 
--- .github/workflows/docker-builds.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 75d5082c5..4289a07ab 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -22,16 +22,16 @@ concurrency: jobs: docker-build: - runs-on: [self-hosted, linux.2xlarge, linux.rocm.gpu.mi300.8] - timeout-minutes: 240 strategy: fail-fast: false matrix: include: - - docker-image-name: [ - torchtitan-ubuntu-20.04-clang12, - torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 - ] + - docker-image-name: torchtitan-ubuntu-20.04-clang12 + runner: [self-hosted, linux.2xlarge] + - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 + runner: linux.rocm.gpu.mi300.8 + runs-on: ${{ matrix.runner }} + timeout-minutes: 240 env: DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/torchtitan/${{ matrix.docker-image-name }} steps: From 0f89cb6b30c9f585c4443ce1f1776ba4cd39d9c8 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 19 Jun 2025 01:01:43 -0500 Subject: [PATCH 07/33] Added TEST_WITH_ROCM environment variable for running tests on rocm. Fixed error in integration_tests.py. Fixed lint errors. 
--- .ci/docker/ubuntu-rocm/Dockerfile | 1 - .github/workflows/integration_test_8gpu_rocm.yaml | 3 +-- tests/integration_tests/features.py | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile index 6f292dfdd..ae944279c 100644 --- a/.ci/docker/ubuntu-rocm/Dockerfile +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -13,4 +13,3 @@ COPY requirements.txt requirements.txt RUN pip install -r ./requirements.txt CMD ["bash"] - diff --git a/.github/workflows/integration_test_8gpu_rocm.yaml b/.github/workflows/integration_test_8gpu_rocm.yaml index bb64ad528..c5069164d 100644 --- a/.github/workflows/integration_test_8gpu_rocm.yaml +++ b/.github/workflows/integration_test_8gpu_rocm.yaml @@ -31,5 +31,4 @@ jobs: USE_CPP=0 python -m pip install --pre torchao mkdir artifacts-to-be-uploaded - python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 - + python TEST_WITH_ROCM=1 ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py index 20ae39af4..6a43f940c 100755 --- a/tests/integration_tests/features.py +++ b/tests/integration_tests/features.py @@ -273,7 +273,7 @@ def build_features_test_list() -> list[OverrideDefinitions]: ]) # check test_with_rocm if test_with_rocm != "1": - integration_tests_flavors["debug_model.toml"].extend( + integration_tests_flavors["debug_model.toml"].extend([ OverrideDefinitions( [ [ @@ -286,7 +286,7 @@ def build_features_test_list() -> list[OverrideDefinitions]: "pp_custom_csv", ngpu=2, ), - ) + ]) integration_tests_flavors["debug_model.toml"].extend([ OverrideDefinitions( [ From 21838e0dd0c3ff1e975894b6cfdc816c4b32d52b Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 24 Jun 2025 13:38:01 -0500 Subject: [PATCH 08/33] Made additional changes to tests.integration_tests.features during rebase. 
--- tests/integration_tests/features.py | 82 ++++++++++++++--------------- 1 file changed, 39 insertions(+), 43 deletions(-) mode change 100755 => 100644 tests/integration_tests/features.py diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py old mode 100755 new mode 100644 index 6a43f940c..6fce80052 --- a/tests/integration_tests/features.py +++ b/tests/integration_tests/features.py @@ -7,7 +7,15 @@ from tests.integration_tests import OverrideDefinitions -test_with_rocm = os.getenv("TEST_WITH_ROCM", "0") +# tests skipped for ROCm +skip_for_rocm_test_list = [ + "pp_looped_zero_bubble", + "pp_zbv", + "pp_custom_csv", + "last_save_model_weights_only_bf16", + "last_save_model_weights_only_fp32", +] +TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1" def build_features_test_list() -> list[OverrideDefinitions]: @@ -131,34 +139,28 @@ def build_features_test_list() -> list[OverrideDefinitions]: "Checkpoint Integration Test - Save Model Only bf16", "last_save_model_only_bf16", ), - ] - # check test_with_rocm - if test_with_rocm != "1": - integration_tests_flavors["debug_model.toml"].extend([ - OverrideDefinitions( + OverrideDefinitions( + [ [ - [ - "--parallelism.pipeline_parallel_degree 4", - "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble", - ], + "--parallelism.pipeline_parallel_degree 4", + "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble", ], - "PP looped zero bubble test", - "pp_looped_zero_bubble", - ngpu=4, - ), - OverrideDefinitions( + ], + "PP looped zero bubble test", + "pp_looped_zero_bubble", + ngpu=4, + ), + OverrideDefinitions( + [ [ - [ - "--parallelism.pipeline_parallel_degree 2", - "--parallelism.pipeline_parallel_schedule ZBVZeroBubble", - ], + "--parallelism.pipeline_parallel_degree 2", + "--parallelism.pipeline_parallel_schedule ZBVZeroBubble", ], - "PP zero bubble test (v shaped)", - "pp_zbv", - ngpu=2, - ), - ]) - integration_tests_flavors["debug_model.toml"].extend([ + ], + "PP 
zero bubble test (v shaped)", + "pp_zbv", + ngpu=2, + ), OverrideDefinitions( [ [ @@ -270,24 +272,18 @@ def build_features_test_list() -> list[OverrideDefinitions]: "pp_looped_1f1b", ngpu=4, ), - ]) - # check test_with_rocm - if test_with_rocm != "1": - integration_tests_flavors["debug_model.toml"].extend([ - OverrideDefinitions( - [ - [ - "--parallelism.pipeline_parallel_degree 2", - "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti", - "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv", - ], - ], - "PP with custom pipeline schedule loaded from CSV file", - "pp_custom_csv", - ngpu=2, - ), - ]) - integration_tests_flavors["debug_model.toml"].extend([ + OverrideDefinitions( + [ + [ + "--parallelism.pipeline_parallel_degree 2", + "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti", + "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv", + ], + ], + "PP with custom pipeline schedule loaded from CSV file", + "pp_custom_csv", + ngpu=2, + ), OverrideDefinitions( [ [ From 98c7a65efd4adc938e93111082a9f8f522aca056 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Sun, 29 Jun 2025 02:14:23 -0500 Subject: [PATCH 09/33] Changed runner to i-0962598bd0e8298b3 for building ROCm docker image. 
--- .github/workflows/docker-builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 4289a07ab..bfe2fac82 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -29,7 +29,7 @@ jobs: - docker-image-name: torchtitan-ubuntu-20.04-clang12 runner: [self-hosted, linux.2xlarge] - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 - runner: linux.rocm.gpu.mi300.8 + runner: i-0962598bd0e8298b3 runs-on: ${{ matrix.runner }} timeout-minutes: 240 env: From 9a287760db402f05f424cfef95acff0b000ca37a Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Mon, 30 Jun 2025 10:59:25 -0500 Subject: [PATCH 10/33] Changed runner to linux.12xlarge for building ROCm docker image. --- .github/workflows/docker-builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index bfe2fac82..be12cceaa 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -29,7 +29,7 @@ jobs: - docker-image-name: torchtitan-ubuntu-20.04-clang12 runner: [self-hosted, linux.2xlarge] - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 - runner: i-0962598bd0e8298b3 + runner: [linux.12xlarge] runs-on: ${{ matrix.runner }} timeout-minutes: 240 env: From ab45e78ac1107f3194fcfc9c1c80073c2ae833a2 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Mon, 30 Jun 2025 11:21:03 -0500 Subject: [PATCH 11/33] Changed runner to linux.2xlarge for building ROCm docker image. 
--- .github/workflows/docker-builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index be12cceaa..84afe8bd2 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -29,7 +29,7 @@ jobs: - docker-image-name: torchtitan-ubuntu-20.04-clang12 runner: [self-hosted, linux.2xlarge] - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 - runner: [linux.12xlarge] + runner: [linux.2xlarge] runs-on: ${{ matrix.runner }} timeout-minutes: 240 env: From 56bf9303c3616fa4b1d2dd0d85abd3e2b7d62ec0 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 3 Jul 2025 00:14:42 -0500 Subject: [PATCH 12/33] Resolved conflict in .github.workflows.integration_test_8gpu_models during rebase. --- .ci/docker/build.sh | 24 ++++++------- .ci/docker/ubuntu-rocm/Dockerfile | 15 -------- .ci/docker/ubuntu/Dockerfile | 4 +-- .../workflows/integration_test_8gpu_h100.yaml | 31 ++++++++++++----- .../integration_test_8gpu_models.yaml | 33 +++++++++++++----- .../workflows/integration_test_8gpu_rocm.yaml | 34 ------------------- tests/integration_tests/features.py | 0 7 files changed, 60 insertions(+), 81 deletions(-) delete mode 100644 .ci/docker/ubuntu-rocm/Dockerfile delete mode 100644 .github/workflows/integration_test_8gpu_rocm.yaml mode change 100644 => 100755 tests/integration_tests/features.py diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index a1aafe3d3..597b2ee5c 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -12,27 +12,21 @@ shift echo "Building ${IMAGE_NAME} Docker image" -# set operating system OS=ubuntu - -# set Dockerfile -DOCKERFILE="${OS}/Dockerfile" -if [[ "$IMAGE_NAME" == *rocm* ]]; then - DOCKERFILE="${OS}-rocm/Dockerfile" -fi +CLANG_VERSION="" +PYTHON_VERSION=3.11 +MINICONDA_VERSION=24.3.0-0 case "${IMAGE_NAME}" in torchtitan-ubuntu-20.04-clang12) OS_VERSION=20.04 CLANG_VERSION=12 - PYTHON_VERSION=3.11 - 
MINICONDA_VERSION=24.3.0-0 + BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION} ;; - torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3) + torchtitan-rocm-ubuntu-22.04-clang12) OS_VERSION=22.04 - CLANG_VERSION=19 - PYTHON_VERSION=3.10 - MINICONDA_VERSION=25.3.1-0 + CLANG_VERSION=12 + BASE_IMAGE=rocm/dev-ubuntu-${OS_VERSION}:latest ;; *) echo "Invalid image name ${IMAGE_NAME}" @@ -42,11 +36,13 @@ esac docker build \ --no-cache \ --progress=plain \ + --build-arg "BASE_IMAGE=${BASE_IMAGE}" \ --build-arg "OS_VERSION=${OS_VERSION}" \ --build-arg "CLANG_VERSION=${CLANG_VERSION}" \ --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \ - -f $(dirname ${DOCKERFILE})/Dockerfile \ + --shm-size=1g \ + -f "${OS}"/Dockerfile \ "$@" \ . diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile deleted file mode 100644 index ae944279c..000000000 --- a/.ci/docker/ubuntu-rocm/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -# base image -FROM rocm/pytorch-nightly:latest - -# args -ARG OS_VERSION -ARG CLANG_VERSION -ARG GCC_VERSION -ARG MINICONDA_VERSION -ARG PYTHON_VERSION - -# install dependencies -COPY requirements.txt requirements.txt -RUN pip install -r ./requirements.txt - -CMD ["bash"] diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 39e4d8ec5..5d10c01b7 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -1,6 +1,6 @@ -ARG OS_VERSION +ARG BASE_IMAGE -FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION} +FROM ${BASE_IMAGE} ARG OS_VERSION diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 94a3c298b..535bbe189 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -24,13 +24,28 @@ defaults: jobs: build-test: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + strategy: + matrix: + 
include: + - name: cuda + runner: linux.aws.h100.8 + gpu-arch-type: cuda + gpu-arch-version: "12.6" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). + docker-image: torchtitan-ubuntu-20.04-clang12 + index-url: https://download.pytorch.org/whl/nightly/cu126 + - name: rocm + runner: linux.rocm.gpu.mi300.8 + gpu-arch-type: rocm + gpu-arch-version: "6.4" + docker-image: torchtitan-rocm-ubuntu-22.04-clang12 + index-url: https://download.pytorch.org/whl/nightly/rocm6.4 with: - runner: linux.aws.h100.8 - gpu-arch-type: cuda - gpu-arch-version: "12.6" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). - docker-image: torchtitan-ubuntu-20.04-clang12 + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + docker-image: ${{ matrix.docker-image }} repository: pytorch/torchtitan upload-artifact: outputs script: | @@ -46,9 +61,9 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 + python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} - USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} mkdir artifacts-to-be-uploaded diff --git a/.github/workflows/integration_test_8gpu_models.yaml b/.github/workflows/integration_test_8gpu_models.yaml index ed044b0c0..0100dc727 100644 --- a/.github/workflows/integration_test_8gpu_models.yaml +++ b/.github/workflows/integration_test_8gpu_models.yaml @@ -23,13 +23,30 @@ defaults: jobs: build-test: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + strategy: + matrix: + include: + - name: cuda + runner: linux.g5.48xlarge.nvidia.gpu + gpu-arch-type: cuda + 
gpu-arch-version: "12.6" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). + docker-image: torchtitan-ubuntu-20.04-clang12 + index-url: https://download.pytorch.org/whl/nightly/cu126 + is-rocm: 0 + - name: rocm + runner: linux.rocm.gpu.mi300.8 + gpu-arch-type: rocm + gpu-arch-version: "6.4" + docker-image: torchtitan-rocm-ubuntu-22.04-clang12 + index-url: https://download.pytorch.org/whl/nightly/rocm6.4 + is-rocm: 1 with: - runner: linux.g5.48xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.6" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). - docker-image: torchtitan-ubuntu-20.04-clang12 + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + docker-image: ${{ matrix.docker-image }} repository: pytorch/torchtitan upload-artifact: outputs script: | @@ -45,9 +62,9 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 + python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} - USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} mkdir artifacts-to-be-uploaded python -m tests.integration_tests.run_tests --test_suite models artifacts-to-be-uploaded --ngpu 8 diff --git a/.github/workflows/integration_test_8gpu_rocm.yaml b/.github/workflows/integration_test_8gpu_rocm.yaml deleted file mode 100644 index c5069164d..000000000 --- a/.github/workflows/integration_test_8gpu_rocm.yaml +++ /dev/null @@ -1,34 +0,0 @@ -name: 8 GPU Integration Test - -on: - push: - branches: [ main ] - pull_request: - schedule: - # Runs every 6 hours - - cron: '0 */6 * * *' -concurrency: - group: unit-test${{ 
github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} - cancel-in-progress: true - -defaults: - run: - shell: bash -l -eo pipefail {0} - -jobs: - build-test: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.rocm.gpu.mi300.8 - gpu-arch-type: rocm - gpu-arch-version: "6.4" - docker-image: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 - repository: pytorch/torchtitan - upload-artifact: outputs - script: | - set -eux - - USE_CPP=0 python -m pip install --pre torchao - - mkdir artifacts-to-be-uploaded - python TEST_WITH_ROCM=1 ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py old mode 100644 new mode 100755 From 74dbc4a5785c8f75c5600916504245b79faf906e Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 3 Jul 2025 00:35:08 -0500 Subject: [PATCH 13/33] Changed rocm docker image name in docker-builds.yml. --- .github/workflows/docker-builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 84afe8bd2..d5f52824d 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -28,7 +28,7 @@ jobs: include: - docker-image-name: torchtitan-ubuntu-20.04-clang12 runner: [self-hosted, linux.2xlarge] - - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 + - docker-image-name: torchtitan-rocm-ubuntu-22.04-clang12 runner: [linux.2xlarge] runs-on: ${{ matrix.runner }} timeout-minutes: 240 From 07a4a736ec4ee0213c45455b3ad3c243d8c5d879 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 9 Jul 2025 00:54:14 -0500 Subject: [PATCH 14/33] Reverted the changes to integration_test_8gpu_h100.yaml. 
--- .../workflows/integration_test_8gpu_h100.yaml | 31 +++++-------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 535bbe189..94a3c298b 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -24,28 +24,13 @@ defaults: jobs: build-test: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - matrix: - include: - - name: cuda - runner: linux.aws.h100.8 - gpu-arch-type: cuda - gpu-arch-version: "12.6" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). - docker-image: torchtitan-ubuntu-20.04-clang12 - index-url: https://download.pytorch.org/whl/nightly/cu126 - - name: rocm - runner: linux.rocm.gpu.mi300.8 - gpu-arch-type: rocm - gpu-arch-version: "6.4" - docker-image: torchtitan-rocm-ubuntu-22.04-clang12 - index-url: https://download.pytorch.org/whl/nightly/rocm6.4 with: - runner: ${{ matrix.runner }} - gpu-arch-type: ${{ matrix.gpu-arch-type }} - gpu-arch-version: ${{ matrix.gpu-arch-version }} - docker-image: ${{ matrix.docker-image }} + runner: linux.aws.h100.8 + gpu-arch-type: cuda + gpu-arch-version: "12.6" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). 
+ docker-image: torchtitan-ubuntu-20.04-clang12 repository: pytorch/torchtitan upload-artifact: outputs script: | @@ -61,9 +46,9 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 - USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} + USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded From be0ecb5c8979091f37d3d361809e0673d30575ff Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 16 Jul 2025 13:47:13 -0500 Subject: [PATCH 15/33] Empty dummy commit. From 0f5048e5233aad7591c013589098690e401a8bd9 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 16 Jul 2025 20:30:01 -0500 Subject: [PATCH 16/33] Increased the timeout to 45 minutes to override timeout used in linux_job_v2.yml for integration_test_8gpu.yaml. --- .github/workflows/integration_test_8gpu_models.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration_test_8gpu_models.yaml b/.github/workflows/integration_test_8gpu_models.yaml index 0100dc727..ba5180686 100644 --- a/.github/workflows/integration_test_8gpu_models.yaml +++ b/.github/workflows/integration_test_8gpu_models.yaml @@ -49,6 +49,7 @@ jobs: docker-image: ${{ matrix.docker-image }} repository: pytorch/torchtitan upload-artifact: outputs + timeout: 45 script: | set -eux From 7b5dcdffbf614313c778c8b682245c0737a69245 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 17 Jul 2025 13:29:35 -0500 Subject: [PATCH 17/33] Empty dummy commit. From 2512cf581e4d1293d68393f316bcbaccd2dd3109 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 23 Sep 2025 00:29:54 -0500 Subject: [PATCH 18/33] Added aws setup in the integration_test_8gpu workflow. 
--- .../workflows/integration_test_8gpu_models.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/integration_test_8gpu_models.yaml b/.github/workflows/integration_test_8gpu_models.yaml index ba5180686..3848843c2 100644 --- a/.github/workflows/integration_test_8gpu_models.yaml +++ b/.github/workflows/integration_test_8gpu_models.yaml @@ -21,6 +21,23 @@ defaults: shell: bash -l -eo pipefail {0} jobs: + # aws setup + aws-setup: + runs-on: linux.rocm.gpu.mi300.8 + steps: + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Login to Amazon ECR + id: login-ecr + continue-on-error: true + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + build-test: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main strategy: From c23e65bcac793cd77d4df931d4e94b18344ce242 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 25 Sep 2025 23:44:59 -0500 Subject: [PATCH 19/33] Performed rebase and made changes to include code refactoring done upstream. 
--- .github/workflows/integration_test_8gpu.yaml | 88 +++++++++++++++++++ .../integration_test_8gpu_models.yaml | 51 ++--------- tests/integration_tests/features.py | 11 +-- tests/integration_tests/run_tests.py | 11 +++ 4 files changed, 108 insertions(+), 53 deletions(-) create mode 100644 .github/workflows/integration_test_8gpu.yaml diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml new file mode 100644 index 000000000..441177c44 --- /dev/null +++ b/.github/workflows/integration_test_8gpu.yaml @@ -0,0 +1,88 @@ +name: 8 GPU Model Tests + +on: + push: + branches: [ main ] + paths-ignore: + - 'torchtitan/experiments/**' + pull_request: + paths-ignore: + - 'torchtitan/experiments/**' + schedule: + # Runs every 6 hours + - cron: '0 */6 * * *' + +concurrency: + group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash -l -eo pipefail {0} + +jobs: + # aws setup + aws-setup: + runs-on: linux.rocm.gpu.mi300.8 + steps: + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Login to Amazon ECR + id: login-ecr + continue-on-error: true + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + + build-test: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + strategy: + matrix: + include: + - name: cuda + runner: linux.g5.48xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). 
+ docker-image: torchtitan-ubuntu-20.04-clang12 + index-url: https://download.pytorch.org/whl/nightly/cu126 + is-rocm: 0 + - name: rocm + runner: linux.rocm.gpu.mi300.8 + gpu-arch-type: rocm + gpu-arch-version: "6.4" + docker-image: torchtitan-rocm-ubuntu-22.04-clang12 + index-url: https://download.pytorch.org/whl/nightly/rocm6.4 + is-rocm: 1 + with: + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + docker-image: ${{ matrix.docker-image }} + repository: pytorch/torchtitan + upload-artifact: outputs + timeout: 45 + script: | + set -eux + + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + # Log CUDA driver version for debugging. + DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true) + echo "CUDA driver version: ${DRIVER_VERSION}" + + pip config --user set global.progress_bar off + + python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + + USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} + + mkdir artifacts-to-be-uploaded + TEST_WITH_ROCM=${{ matrix.is-rocm }} python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 8 diff --git a/.github/workflows/integration_test_8gpu_models.yaml b/.github/workflows/integration_test_8gpu_models.yaml index 3848843c2..ed044b0c0 100644 --- a/.github/workflows/integration_test_8gpu_models.yaml +++ b/.github/workflows/integration_test_8gpu_models.yaml @@ -21,52 +21,17 @@ defaults: shell: bash -l -eo pipefail {0} jobs: - # aws setup - aws-setup: - runs-on: linux.rocm.gpu.mi300.8 - steps: - - name: configure aws credentials - id: aws_creds - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 - with: - role-to-assume: 
arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - role-duration-seconds: 18000 - - - name: Login to Amazon ECR - id: login-ecr - continue-on-error: true - uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 - build-test: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - strategy: - matrix: - include: - - name: cuda - runner: linux.g5.48xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.6" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). - docker-image: torchtitan-ubuntu-20.04-clang12 - index-url: https://download.pytorch.org/whl/nightly/cu126 - is-rocm: 0 - - name: rocm - runner: linux.rocm.gpu.mi300.8 - gpu-arch-type: rocm - gpu-arch-version: "6.4" - docker-image: torchtitan-rocm-ubuntu-22.04-clang12 - index-url: https://download.pytorch.org/whl/nightly/rocm6.4 - is-rocm: 1 with: - runner: ${{ matrix.runner }} - gpu-arch-type: ${{ matrix.gpu-arch-type }} - gpu-arch-version: ${{ matrix.gpu-arch-version }} - docker-image: ${{ matrix.docker-image }} + runner: linux.g5.48xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). 
+ docker-image: torchtitan-ubuntu-20.04-clang12 repository: pytorch/torchtitan upload-artifact: outputs - timeout: 45 script: | set -eux @@ -80,9 +45,9 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 - USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} + USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded python -m tests.integration_tests.run_tests --test_suite models artifacts-to-be-uploaded --ngpu 8 diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py index 6fce80052..31c15017d 100755 --- a/tests/integration_tests/features.py +++ b/tests/integration_tests/features.py @@ -7,16 +7,6 @@ from tests.integration_tests import OverrideDefinitions -# tests skipped for ROCm -skip_for_rocm_test_list = [ - "pp_looped_zero_bubble", - "pp_zbv", - "pp_custom_csv", - "last_save_model_weights_only_bf16", - "last_save_model_weights_only_fp32", -] -TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1" - def build_features_test_list() -> list[OverrideDefinitions]: """ @@ -538,4 +528,5 @@ def build_features_test_list() -> list[OverrideDefinitions]: ngpu=8, ), ] + return integration_tests_flavors diff --git a/tests/integration_tests/run_tests.py b/tests/integration_tests/run_tests.py index a64c69eb6..57b9fca2b 100644 --- a/tests/integration_tests/run_tests.py +++ b/tests/integration_tests/run_tests.py @@ -24,6 +24,13 @@ } +# tests skipped for ROCm +skip_for_rocm_test_list = [ + "model_only_hf_checkpoint", +] +TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1" + + def _run_cmd(cmd): return subprocess.run([cmd], text=True, shell=True) @@ -87,6 +94,10 @@ def run_tests(args, test_list: list[OverrideDefinitions]): if 
test_flavor.disabled: continue + # Skip the test for ROCm + if TEST_WITH_ROCM and test_flavor.test_name in skip_for_rocm_test_list: + continue + # Check if we have enough GPUs if args.ngpu < test_flavor.ngpu: logger.info( From a99db9f2b08b41c9ac6c9ef7e52caffd1dfc5414 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Fri, 26 Sep 2025 11:00:30 -0500 Subject: [PATCH 20/33] Changed rocm runner name. --- .github/workflows/integration_test_8gpu.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml index 441177c44..b89d148bd 100644 --- a/.github/workflows/integration_test_8gpu.yaml +++ b/.github/workflows/integration_test_8gpu.yaml @@ -23,7 +23,7 @@ defaults: jobs: # aws setup aws-setup: - runs-on: linux.rocm.gpu.mi300.8 + runs-on: linux.rocm.gpu.gfx942.8 steps: - name: configure aws credentials id: aws_creds @@ -53,7 +53,7 @@ jobs: index-url: https://download.pytorch.org/whl/nightly/cu126 is-rocm: 0 - name: rocm - runner: linux.rocm.gpu.mi300.8 + runner: linux.rocm.gpu.gfx942.8 gpu-arch-type: rocm gpu-arch-version: "6.4" docker-image: torchtitan-rocm-ubuntu-22.04-clang12 From 3d331bc1fd40bf0db62d884c4e338c06a9379c50 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Fri, 26 Sep 2025 14:03:05 -0500 Subject: [PATCH 21/33] Added a change to run build-test after aws-setup. 
--- .github/workflows/integration_test_8gpu.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml index b89d148bd..0fcb965e9 100644 --- a/.github/workflows/integration_test_8gpu.yaml +++ b/.github/workflows/integration_test_8gpu.yaml @@ -39,6 +39,7 @@ jobs: uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 build-test: + needs: aws-setup uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main strategy: matrix: From 7d359dd99ef90c922baa285e70da4a11c72403eb Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Fri, 26 Sep 2025 14:10:42 -0500 Subject: [PATCH 22/33] Changed the test name in integration_test_8gpu.yaml workflow file. --- .github/workflows/integration_test_8gpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml index 0fcb965e9..40b8c6568 100644 --- a/.github/workflows/integration_test_8gpu.yaml +++ b/.github/workflows/integration_test_8gpu.yaml @@ -1,4 +1,4 @@ -name: 8 GPU Model Tests +name: 8 GPU Integration Test on: push: From 0f5c57f1f1640d641aae53c44932bd8373c8837a Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Mon, 29 Sep 2025 12:20:04 -0500 Subject: [PATCH 23/33] Fixed id-token permission issue in integration_test_8gpu.yaml. 
--- .github/workflows/integration_test_8gpu.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml index 40b8c6568..851011b1a 100644 --- a/.github/workflows/integration_test_8gpu.yaml +++ b/.github/workflows/integration_test_8gpu.yaml @@ -20,6 +20,10 @@ defaults: run: shell: bash -l -eo pipefail {0} +permissions: + id-token: write + contents: read + jobs: # aws setup aws-setup: From a8368a2248dac5b87a6bde8e27766eb26d7b606f Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Mon, 29 Sep 2025 12:37:51 -0500 Subject: [PATCH 24/33] Moved the id-token permissions inside the aws-setup job in integration_test_8gpu.yaml. --- .github/workflows/integration_test_8gpu.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml index 851011b1a..b5ed4fc93 100644 --- a/.github/workflows/integration_test_8gpu.yaml +++ b/.github/workflows/integration_test_8gpu.yaml @@ -20,13 +20,12 @@ defaults: run: shell: bash -l -eo pipefail {0} -permissions: - id-token: write - contents: read - jobs: # aws setup aws-setup: + permissions: + id-token: write + contents: read runs-on: linux.rocm.gpu.gfx942.8 steps: - name: configure aws credentials From 36fb0e53550a8192e605f71e0857257053fcda3d Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Mon, 29 Sep 2025 14:39:25 -0500 Subject: [PATCH 25/33] To test the workflow, switched to a 4 GPU runner, as those are more readily available to run the workflow.
--- .github/workflows/integration_test_8gpu.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml index b5ed4fc93..0ff2ff509 100644 --- a/.github/workflows/integration_test_8gpu.yaml +++ b/.github/workflows/integration_test_8gpu.yaml @@ -26,7 +26,7 @@ jobs: permissions: id-token: write contents: read - runs-on: linux.rocm.gpu.gfx942.8 + runs-on: linux.rocm.gpu.gfx942.4 steps: - name: configure aws credentials id: aws_creds @@ -57,7 +57,7 @@ jobs: index-url: https://download.pytorch.org/whl/nightly/cu126 is-rocm: 0 - name: rocm - runner: linux.rocm.gpu.gfx942.8 + runner: linux.rocm.gpu.gfx942.4 gpu-arch-type: rocm gpu-arch-version: "6.4" docker-image: torchtitan-rocm-ubuntu-22.04-clang12 @@ -89,4 +89,4 @@ jobs: USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} mkdir artifacts-to-be-uploaded - TEST_WITH_ROCM=${{ matrix.is-rocm }} python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 8 + TEST_WITH_ROCM=${{ matrix.is-rocm }} python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 4 From 1fba2ab8a183dffea901a391bd08e59fc81d1fb6 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 30 Sep 2025 12:38:36 -0500 Subject: [PATCH 26/33] Moved permissions section for id-token outside the aws-setup job. 
--- .github/workflows/integration_test_8gpu.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml index 0ff2ff509..898cfd529 100644 --- a/.github/workflows/integration_test_8gpu.yaml +++ b/.github/workflows/integration_test_8gpu.yaml @@ -12,6 +12,10 @@ on: # Runs every 6 hours - cron: '0 */6 * * *' +permissions: + id-token: write + contents: read + concurrency: group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} cancel-in-progress: true @@ -23,9 +27,6 @@ defaults: jobs: # aws setup aws-setup: - permissions: - id-token: write - contents: read runs-on: linux.rocm.gpu.gfx942.4 steps: - name: configure aws credentials From acbedff8136ecf55c4cc462e455a67b462b7084f Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Mon, 6 Oct 2025 16:19:31 -0500 Subject: [PATCH 27/33] Using move_aws_steps_inside_setup_rocm branch to do aws authentication in linux_job_v2.yml. 
--- .github/workflows/integration_test_8gpu.yaml | 28 ++++---------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml index 898cfd529..a65ca937c 100644 --- a/.github/workflows/integration_test_8gpu.yaml +++ b/.github/workflows/integration_test_8gpu.yaml @@ -12,10 +12,6 @@ on: # Runs every 6 hours - cron: '0 */6 * * *' -permissions: - id-token: write - contents: read - concurrency: group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} cancel-in-progress: true @@ -24,27 +20,13 @@ defaults: run: shell: bash -l -eo pipefail {0} -jobs: - # aws setup - aws-setup: - runs-on: linux.rocm.gpu.gfx942.4 - steps: - - name: configure aws credentials - id: aws_creds - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - role-duration-seconds: 18000 - - - name: Login to Amazon ECR - id: login-ecr - continue-on-error: true - uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 +permissions: + id-token: write + contents: read +jobs: build-test: - needs: aws-setup - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@move_aws_steps_inside_setup_rocm strategy: matrix: include: From f8577d7fd3f3d69bef68737ab4a6e8531870a5a1 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 7 Oct 2025 00:03:11 -0500 Subject: [PATCH 28/33] Using linux_job_v2.yml from the akashveramd fork, which has the aws setup only for rocm.
--- .github/workflows/integration_test_8gpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml index a65ca937c..830ddef56 100644 --- a/.github/workflows/integration_test_8gpu.yaml +++ b/.github/workflows/integration_test_8gpu.yaml @@ -26,7 +26,7 @@ permissions: jobs: build-test: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@move_aws_steps_inside_setup_rocm + uses: akashveramd/test-infra/.github/workflows/linux_job_v2.yml@linux_job_v2_main_rocm_aws strategy: matrix: include: From 962608347e336e534d2565671f635e0479cb9425 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 7 Oct 2025 14:42:13 -0500 Subject: [PATCH 29/33] Using linux_job_v2.yml having id-token write permissions from pytorch and move_aws_steps_inside_setup_rocm branch. --- .github/workflows/integration_test_8gpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml index 830ddef56..a65ca937c 100644 --- a/.github/workflows/integration_test_8gpu.yaml +++ b/.github/workflows/integration_test_8gpu.yaml @@ -26,7 +26,7 @@ permissions: jobs: build-test: - uses: akashveramd/test-infra/.github/workflows/linux_job_v2.yml@linux_job_v2_main_rocm_aws + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@move_aws_steps_inside_setup_rocm strategy: matrix: include: From 6985eec00f5fb9b125b3dbbf01b85aee6df24eaa Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 8 Oct 2025 15:16:01 -0500 Subject: [PATCH 30/33] Removed integration_test_8gpu.yaml and added ROCm workflow to run features tests inside integration_test_8gpu_features.yaml. Using linux_job_v2.yml from the main branch. Rolled back to using 8 GPU runner for ROCm. 
--- .github/workflows/integration_test_8gpu.yaml | 75 ------------------- .../integration_test_8gpu_features.yaml | 41 +++++++--- 2 files changed, 32 insertions(+), 84 deletions(-) delete mode 100644 .github/workflows/integration_test_8gpu.yaml diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml deleted file mode 100644 index a65ca937c..000000000 --- a/.github/workflows/integration_test_8gpu.yaml +++ /dev/null @@ -1,75 +0,0 @@ -name: 8 GPU Integration Test - -on: - push: - branches: [ main ] - paths-ignore: - - 'torchtitan/experiments/**' - pull_request: - paths-ignore: - - 'torchtitan/experiments/**' - schedule: - # Runs every 6 hours - - cron: '0 */6 * * *' - -concurrency: - group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} - cancel-in-progress: true - -defaults: - run: - shell: bash -l -eo pipefail {0} - -permissions: - id-token: write - contents: read - -jobs: - build-test: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@move_aws_steps_inside_setup_rocm - strategy: - matrix: - include: - - name: cuda - runner: linux.g5.48xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.6" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). 
- docker-image: torchtitan-ubuntu-20.04-clang12 - index-url: https://download.pytorch.org/whl/nightly/cu126 - is-rocm: 0 - - name: rocm - runner: linux.rocm.gpu.gfx942.4 - gpu-arch-type: rocm - gpu-arch-version: "6.4" - docker-image: torchtitan-rocm-ubuntu-22.04-clang12 - index-url: https://download.pytorch.org/whl/nightly/rocm6.4 - is-rocm: 1 - with: - runner: ${{ matrix.runner }} - gpu-arch-type: ${{ matrix.gpu-arch-type }} - gpu-arch-version: ${{ matrix.gpu-arch-version }} - docker-image: ${{ matrix.docker-image }} - repository: pytorch/torchtitan - upload-artifact: outputs - timeout: 45 - script: | - set -eux - - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # Log CUDA driver version for debugging. - DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true) - echo "CUDA driver version: ${DRIVER_VERSION}" - - pip config --user set global.progress_bar off - - python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} - - USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} - - mkdir artifacts-to-be-uploaded - TEST_WITH_ROCM=${{ matrix.is-rocm }} python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 4 diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index 4f3421fce..475bbd960 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -1,4 +1,5 @@ name: 8 GPU Feature Tests + on: push: branches: [ main ] @@ -19,18 +20,40 @@ defaults: run: shell: bash -l -eo pipefail {0} +permissions: + id-token: write + contents: read + jobs: build-test: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + strategy: + matrix: + include: + - name: cuda + 
runner: linux.g5.48xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). + docker-image: torchtitan-ubuntu-20.04-clang12 + index-url: https://download.pytorch.org/whl/nightly/cu126 + is-rocm: 0 + - name: rocm + runner: linux.rocm.gpu.gfx942.8 + gpu-arch-type: rocm + gpu-arch-version: "6.4" + docker-image: torchtitan-rocm-ubuntu-22.04-clang12 + index-url: https://download.pytorch.org/whl/nightly/rocm6.4 + is-rocm: 1 with: - runner: linux.g5.48xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.6" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). - docker-image: torchtitan-ubuntu-20.04-clang12 + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + docker-image: ${{ matrix.docker-image }} repository: pytorch/torchtitan upload-artifact: outputs + timeout: 45 script: | set -eux @@ -44,9 +67,9 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 + python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} - USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} mkdir artifacts-to-be-uploaded - python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 8 + TEST_WITH_ROCM=${{ matrix.is-rocm }} python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 4 From b9ab5c278e43c9cdd903858bf2518886ab1ccd2d Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 9 Oct 2025 11:37:54 -0500 Subject: [PATCH 31/33] Using 7311 branch for linux_job_v2.yml. 
--- .github/workflows/integration_test_8gpu_features.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index 475bbd960..91875ae2d 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -26,7 +26,7 @@ permissions: jobs: build-test: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@7311 strategy: matrix: include: From 184e8be4b74ffc87d1dcb9e3a58013cb05cb02a5 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 9 Oct 2025 18:13:41 -0500 Subject: [PATCH 32/33] Using 4 GPU runner for ROCm. --- .github/workflows/integration_test_8gpu_features.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index 91875ae2d..a0275dc4d 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -40,7 +40,7 @@ jobs: index-url: https://download.pytorch.org/whl/nightly/cu126 is-rocm: 0 - name: rocm - runner: linux.rocm.gpu.gfx942.8 + runner: linux.rocm.gpu.gfx942.4 gpu-arch-type: rocm gpu-arch-version: "6.4" docker-image: torchtitan-rocm-ubuntu-22.04-clang12 From 11cb25d3a62f2088a940113166e163aad26b31e1 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Fri, 10 Oct 2025 23:51:26 -0500 Subject: [PATCH 33/33] Switched back to main branch for linux_job_v2.yml. 
--- .github/workflows/integration_test_8gpu_features.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index a0275dc4d..85ddbb580 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -26,7 +26,7 @@ permissions: jobs: build-test: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@7311 + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main strategy: matrix: include: