From e7a9e0b81ccf2805e8633ff99b0c901bcad8fdc1 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Tue, 3 Jun 2025 23:45:19 -0500
Subject: [PATCH 01/33] Added support to run torchtitan tests on ROCm.

---
 .ci/docker/build.sh               | 26 +++++++++++++++-----
 .ci/docker/ubuntu-cuda/Dockerfile | 41 +++++++++++++++++++++++++++++++
 .ci/docker/ubuntu-rocm/Dockerfile | 16 ++++++++++++
 3 files changed, 77 insertions(+), 6 deletions(-)
 create mode 100644 .ci/docker/ubuntu-cuda/Dockerfile
 create mode 100644 .ci/docker/ubuntu-rocm/Dockerfile

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 34c1f0d45..5511acabf 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -12,15 +12,29 @@ shift
 
 echo "Building ${IMAGE_NAME} Docker image"
 
+# set operating system
 OS=ubuntu
-OS_VERSION=20.04
-CLANG_VERSION=""
-PYTHON_VERSION=3.11
-MINICONDA_VERSION=24.3.0-0
+
+# set Dockerfile
+DOCKERFILE="${OS}/Dockerfile"
+if [[ "$IMAGE_NAME" == *cuda* ]]; then
+  DOCKERFILE="${OS}-cuda/Dockerfile"
+elif [[ "$IMAGE_NAME" == *rocm* ]]; then
+  DOCKERFILE="${OS}-rocm/Dockerfile"
+fi
 
 case "${IMAGE_NAME}" in
   torchtitan-ubuntu-20.04-clang12)
+    OS_VERSION=20.04
     CLANG_VERSION=12
+    PYTHON_VERSION=3.11
+    MINICONDA_VERSION=24.3.0-0
+    ;;
+  torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3)
+    OS_VERSION=22.04
+    CLANG_VERSION=19
+    PYTHON_VERSION=3.10
+    MINICONDA_VERSION=25.3.1-0
     ;;
   *)
     echo "Invalid image name ${IMAGE_NAME}"
@@ -34,7 +48,7 @@ docker build \
   --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
   --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
-  --shm-size=1g \
-  -f "${OS}"/Dockerfile \
+  -f $(dirname ${DOCKERFILE})/Dockerfile \
   "$@" \
   .
+
diff --git a/.ci/docker/ubuntu-cuda/Dockerfile b/.ci/docker/ubuntu-cuda/Dockerfile
new file mode 100644
index 000000000..39e4d8ec5
--- /dev/null
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@@ -0,0 +1,41 @@
+ARG OS_VERSION
+
+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
+
+ARG OS_VERSION
+
+ENV DEBIAN_FRONTEND noninteractive
+
+# Install common dependencies
+COPY ./common/install_base.sh install_base.sh
+RUN bash ./install_base.sh && rm install_base.sh
+
+# Install clang
+ARG CLANG_VERSION
+COPY ./common/install_clang.sh install_clang.sh
+RUN bash ./install_clang.sh && rm install_clang.sh
+
+# Install gcc
+ARG GCC_VERSION
+COPY ./common/install_gcc.sh install_gcc.sh
+RUN bash ./install_gcc.sh && rm install_gcc.sh
+
+# Setup user
+COPY ./common/install_user.sh install_user.sh
+RUN bash ./install_user.sh && rm install_user.sh
+
+# Install conda and other dependencies
+ARG MINICONDA_VERSION
+ARG PYTHON_VERSION
+ENV PYTHON_VERSION=$PYTHON_VERSION
+ENV PATH /opt/conda/envs/py_$PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+COPY requirements-dev.txt /opt/conda/
+COPY requirements.txt /opt/conda/
+COPY requirements-flux.txt /opt/conda/
+COPY conda-env-ci.txt /opt/conda/
+COPY ./common/install_conda.sh install_conda.sh
+COPY ./common/utils.sh utils.sh
+RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/conda-env-ci.txt
+
+USER ci-user
+CMD ["bash"]
diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile
new file mode 100644
index 000000000..6f292dfdd
--- /dev/null
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@@ -0,0 +1,16 @@
+# base image
+FROM rocm/pytorch-nightly:latest
+
+# args
+ARG OS_VERSION
+ARG CLANG_VERSION
+ARG GCC_VERSION
+ARG MINICONDA_VERSION
+ARG PYTHON_VERSION
+
+# install dependencies
+COPY requirements.txt requirements.txt
+RUN pip install -r ./requirements.txt
+
+CMD ["bash"]
+

From 04a17180ed86a32cdf31594032e7f2a381866a73 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Wed, 4 Jun 2025 22:53:57 -0500
Subject: [PATCH 02/33] Added ROCm CI support for integration_test_h100.

---
 .../integration_test_8gpu_h100_rocm.yaml      | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 .github/workflows/integration_test_8gpu_h100_rocm.yaml

diff --git a/.github/workflows/integration_test_8gpu_h100_rocm.yaml b/.github/workflows/integration_test_8gpu_h100_rocm.yaml
new file mode 100644
index 000000000..3debb1760
--- /dev/null
+++ b/.github/workflows/integration_test_8gpu_h100_rocm.yaml
@@ -0,0 +1,37 @@
+name: 8 GPU Integration Test at H100
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+  schedule:
+    # Runs every 6 hours
+    - cron: '0 */6 * * *'
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.rocm.gpu.mi300.8
+      gpu-arch-type: rocm
+      gpu-arch-version: "6.4"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      script: |
+        set -eux
+
+        USE_CPP=0 python -m pip install --pre torchao
+
+        mkdir artifacts-to-be-uploaded
+        python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8
+

From 7894f3fb2c46fafe347aab25ef36632aab9c7422 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Sat, 7 Jun 2025 00:40:28 -0500
Subject: [PATCH 03/33] Fixed a bug in the build script. Removed the ubuntu-cuda
 folder; the CUDA Dockerfile now lives in the ubuntu folder.

---
 .ci/docker/build.sh                           |  4 +-
 .ci/docker/ubuntu-cuda/Dockerfile             | 41 -------------------
 ...m.yaml => integration_test_8gpu_rocm.yaml} |  8 ++--
 3 files changed, 4 insertions(+), 49 deletions(-)
 delete mode 100644 .ci/docker/ubuntu-cuda/Dockerfile
 rename .github/workflows/{integration_test_8gpu_h100_rocm.yaml => integration_test_8gpu_rocm.yaml} (70%)

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 5511acabf..a1aafe3d3 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -17,9 +17,7 @@ OS=ubuntu
 
 # set Dockerfile
 DOCKERFILE="${OS}/Dockerfile"
-if [[ "$IMAGE_NAME" == *cuda* ]]; then
-  DOCKERFILE="${OS}-cuda/Dockerfile"
-elif [[ "$IMAGE_NAME" == *rocm* ]]; then
+if [[ "$IMAGE_NAME" == *rocm* ]]; then
   DOCKERFILE="${OS}-rocm/Dockerfile"
 fi
 
diff --git a/.ci/docker/ubuntu-cuda/Dockerfile b/.ci/docker/ubuntu-cuda/Dockerfile
deleted file mode 100644
index 39e4d8ec5..000000000
--- a/.ci/docker/ubuntu-cuda/Dockerfile
+++ /dev/null
@@ -1,41 +0,0 @@
-ARG OS_VERSION
-
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
-
-ARG OS_VERSION
-
-ENV DEBIAN_FRONTEND noninteractive
-
-# Install common dependencies
-COPY ./common/install_base.sh install_base.sh
-RUN bash ./install_base.sh && rm install_base.sh
-
-# Install clang
-ARG CLANG_VERSION
-COPY ./common/install_clang.sh install_clang.sh
-RUN bash ./install_clang.sh && rm install_clang.sh
-
-# Install gcc
-ARG GCC_VERSION
-COPY ./common/install_gcc.sh install_gcc.sh
-RUN bash ./install_gcc.sh && rm install_gcc.sh
-
-# Setup user
-COPY ./common/install_user.sh install_user.sh
-RUN bash ./install_user.sh && rm install_user.sh
-
-# Install conda and other dependencies
-ARG MINICONDA_VERSION
-ARG PYTHON_VERSION
-ENV PYTHON_VERSION=$PYTHON_VERSION
-ENV PATH /opt/conda/envs/py_$PYTHON_VERSION/bin:/opt/conda/bin:$PATH
-COPY requirements-dev.txt /opt/conda/
-COPY requirements.txt /opt/conda/
-COPY requirements-flux.txt /opt/conda/
-COPY conda-env-ci.txt /opt/conda/
-COPY ./common/install_conda.sh install_conda.sh
-COPY ./common/utils.sh utils.sh
-RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/conda-env-ci.txt
-
-USER ci-user
-CMD ["bash"]
diff --git a/.github/workflows/integration_test_8gpu_h100_rocm.yaml b/.github/workflows/integration_test_8gpu_rocm.yaml
similarity index 70%
rename from .github/workflows/integration_test_8gpu_h100_rocm.yaml
rename to .github/workflows/integration_test_8gpu_rocm.yaml
index 3debb1760..bb64ad528 100644
--- a/.github/workflows/integration_test_8gpu_h100_rocm.yaml
+++ b/.github/workflows/integration_test_8gpu_rocm.yaml
@@ -1,4 +1,4 @@
-name: 8 GPU Integration Test at H100
+name: 8 GPU Integration Test
 
 on:
   push:
@@ -17,13 +17,11 @@ defaults:
 
 jobs:
   build-test:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.rocm.gpu.mi300.8
       gpu-arch-type: rocm
       gpu-arch-version: "6.4"
-      # This image is faster to clone than the default, but it lacks CC needed by triton
-      # (1m25s vs 2m37s).
       docker-image: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
       repository: pytorch/torchtitan
       upload-artifact: outputs
@@ -33,5 +31,5 @@ jobs:
         USE_CPP=0 python -m pip install --pre torchao
 
         mkdir artifacts-to-be-uploaded
-        python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8
+        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
 

From 041c04bcb9462a6f4b2b78df6aa5fc9903c6ad41 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Tue, 10 Jun 2025 20:07:10 -0500
Subject: [PATCH 04/33] Updated tests/integration_tests/features.py during rebase.

---
 tests/integration_tests/features.py | 71 +++++++++++++++++------------
 1 file changed, 42 insertions(+), 29 deletions(-)

diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py
index 31c15017d..20ae39af4 100755
--- a/tests/integration_tests/features.py
+++ b/tests/integration_tests/features.py
@@ -7,6 +7,8 @@
 
 from tests.integration_tests import OverrideDefinitions
 
+test_with_rocm = os.getenv("TEST_WITH_ROCM", "0")
+
 
 def build_features_test_list() -> list[OverrideDefinitions]:
     """
@@ -129,28 +131,34 @@ def build_features_test_list() -> list[OverrideDefinitions]:
             "Checkpoint Integration Test - Save Model Only bf16",
             "last_save_model_only_bf16",
         ),
-        OverrideDefinitions(
-            [
+    ]
+    # check test_with_rocm
+    if test_with_rocm != "1":
+        integration_tests_flavors["debug_model.toml"].extend([
+            OverrideDefinitions(
                 [
-                    "--parallelism.pipeline_parallel_degree 4",
-                    "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble",
+                    [
+                        "--parallelism.pipeline_parallel_degree 4",
+                        "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble",
+                    ],
                 ],
-            ],
-            "PP looped zero bubble test",
-            "pp_looped_zero_bubble",
-            ngpu=4,
-        ),
-        OverrideDefinitions(
-            [
+                "PP looped zero bubble test",
+                "pp_looped_zero_bubble",
+                ngpu=4,
+            ),
+            OverrideDefinitions(
                 [
-                    "--parallelism.pipeline_parallel_degree 2",
-                    "--parallelism.pipeline_parallel_schedule ZBVZeroBubble",
+                    [
+                        "--parallelism.pipeline_parallel_degree 2",
+                        "--parallelism.pipeline_parallel_schedule ZBVZeroBubble",
+                    ],
                 ],
-            ],
-            "PP zero bubble test (v shaped)",
-            "pp_zbv",
-            ngpu=2,
-        ),
+                "PP zero bubble test (v shaped)",
+                "pp_zbv",
+                ngpu=2,
+            ),
+        ])
+    integration_tests_flavors["debug_model.toml"].extend([
         OverrideDefinitions(
             [
                 [
@@ -262,18 +270,24 @@ def build_features_test_list() -> list[OverrideDefinitions]:
             "pp_looped_1f1b",
             ngpu=4,
         ),
-        OverrideDefinitions(
-            [
+    ])
+    # check test_with_rocm
+    if test_with_rocm != "1":
+        integration_tests_flavors["debug_model.toml"].extend(
+            OverrideDefinitions(
                 [
-                    "--parallelism.pipeline_parallel_degree 2",
-                    "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti",
-                    "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv",
+                    [
+                        "--parallelism.pipeline_parallel_degree 2",
+                        "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti",
+                        "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv",
+                    ],
                 ],
-            ],
-            "PP with custom pipeline schedule loaded from CSV file",
-            "pp_custom_csv",
-            ngpu=2,
-        ),
+                "PP with custom pipeline schedule loaded from CSV file",
+                "pp_custom_csv",
+                ngpu=2,
+            ),
+        )
+    integration_tests_flavors["debug_model.toml"].extend([
         OverrideDefinitions(
             [
                 [
@@ -528,5 +542,4 @@ def build_features_test_list() -> list[OverrideDefinitions]:
             ngpu=8,
         ),
     ]
-
     return integration_tests_flavors

From 19863fbd8edd957fb38a5d3b7478a70907926ef7 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Fri, 13 Jun 2025 13:51:04 -0500
Subject: [PATCH 05/33] Modified docker-builds.yml to build the ROCm Docker image
 for torchtitan.

---
 .github/workflows/docker-builds.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 11ff5390c..75d5082c5 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -22,13 +22,16 @@ concurrency:
 
 jobs:
   docker-build:
-    runs-on: [self-hosted, linux.2xlarge]
+    runs-on: [self-hosted, linux.2xlarge, linux.rocm.gpu.mi300.8]
     timeout-minutes: 240
     strategy:
       fail-fast: false
       matrix:
         include:
-          - docker-image-name: torchtitan-ubuntu-20.04-clang12
+          - docker-image-name: [
+            torchtitan-ubuntu-20.04-clang12,
+            torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
+          ]
     env:
       DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/torchtitan/${{ matrix.docker-image-name }}
     steps:

From cacfd759594c95ee356b6448ef754e7e1f80e3ba Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Wed, 18 Jun 2025 13:04:59 -0500
Subject: [PATCH 06/33] Fixed runner for cuda and rocm images in
 docker-builds.yml.

---
 .github/workflows/docker-builds.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 75d5082c5..4289a07ab 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -22,16 +22,16 @@ concurrency:
 
 jobs:
   docker-build:
-    runs-on: [self-hosted, linux.2xlarge, linux.rocm.gpu.mi300.8]
-    timeout-minutes: 240
     strategy:
       fail-fast: false
       matrix:
         include:
-          - docker-image-name: [
-            torchtitan-ubuntu-20.04-clang12,
-            torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
-          ]
+          - docker-image-name: torchtitan-ubuntu-20.04-clang12
+            runner: [self-hosted, linux.2xlarge]
+          - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
+            runner: linux.rocm.gpu.mi300.8
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 240
     env:
       DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/torchtitan/${{ matrix.docker-image-name }}
     steps:

From 0f89cb6b30c9f585c4443ce1f1776ba4cd39d9c8 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Thu, 19 Jun 2025 01:01:43 -0500
Subject: [PATCH 07/33] Added TEST_WITH_ROCM environment variable for running
 tests on ROCm. Fixed error in integration_tests/features.py. Fixed lint errors.

---
 .ci/docker/ubuntu-rocm/Dockerfile                 | 1 -
 .github/workflows/integration_test_8gpu_rocm.yaml | 3 +--
 tests/integration_tests/features.py               | 4 ++--
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile
index 6f292dfdd..ae944279c 100644
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@@ -13,4 +13,3 @@ COPY requirements.txt requirements.txt
 RUN pip install -r ./requirements.txt
 
 CMD ["bash"]
-
diff --git a/.github/workflows/integration_test_8gpu_rocm.yaml b/.github/workflows/integration_test_8gpu_rocm.yaml
index bb64ad528..c5069164d 100644
--- a/.github/workflows/integration_test_8gpu_rocm.yaml
+++ b/.github/workflows/integration_test_8gpu_rocm.yaml
@@ -31,5 +31,4 @@ jobs:
         USE_CPP=0 python -m pip install --pre torchao
 
         mkdir artifacts-to-be-uploaded
-        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
-
+        python TEST_WITH_ROCM=1 ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py
index 20ae39af4..6a43f940c 100755
--- a/tests/integration_tests/features.py
+++ b/tests/integration_tests/features.py
@@ -273,7 +273,7 @@ def build_features_test_list() -> list[OverrideDefinitions]:
     ])
     # check test_with_rocm
     if test_with_rocm != "1":
-        integration_tests_flavors["debug_model.toml"].extend(
+        integration_tests_flavors["debug_model.toml"].extend([
             OverrideDefinitions(
                 [
                     [
@@ -286,7 +286,7 @@ def build_features_test_list() -> list[OverrideDefinitions]:
                 "pp_custom_csv",
                 ngpu=2,
             ),
-        )
+        ])
     integration_tests_flavors["debug_model.toml"].extend([
         OverrideDefinitions(
             [

From 21838e0dd0c3ff1e975894b6cfdc816c4b32d52b Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Tue, 24 Jun 2025 13:38:01 -0500
Subject: [PATCH 08/33] Made additional changes to
 tests.integration_tests.features during rebase.

---
 tests/integration_tests/features.py | 82 ++++++++++++++---------------
 1 file changed, 39 insertions(+), 43 deletions(-)
 mode change 100755 => 100644 tests/integration_tests/features.py

diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py
old mode 100755
new mode 100644
index 6a43f940c..6fce80052
--- a/tests/integration_tests/features.py
+++ b/tests/integration_tests/features.py
@@ -7,7 +7,15 @@
 
 from tests.integration_tests import OverrideDefinitions
 
-test_with_rocm = os.getenv("TEST_WITH_ROCM", "0")
+# tests skipped for ROCm
+skip_for_rocm_test_list = [
+    "pp_looped_zero_bubble",
+    "pp_zbv",
+    "pp_custom_csv",
+    "last_save_model_weights_only_bf16",
+    "last_save_model_weights_only_fp32",
+]
+TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1"
 
 
 def build_features_test_list() -> list[OverrideDefinitions]:
@@ -131,34 +139,28 @@ def build_features_test_list() -> list[OverrideDefinitions]:
             "Checkpoint Integration Test - Save Model Only bf16",
             "last_save_model_only_bf16",
         ),
-    ]
-    # check test_with_rocm
-    if test_with_rocm != "1":
-        integration_tests_flavors["debug_model.toml"].extend([
-            OverrideDefinitions(
+        OverrideDefinitions(
+            [
                 [
-                    [
-                        "--parallelism.pipeline_parallel_degree 4",
-                        "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble",
-                    ],
+                    "--parallelism.pipeline_parallel_degree 4",
+                    "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble",
                 ],
-                "PP looped zero bubble test",
-                "pp_looped_zero_bubble",
-                ngpu=4,
-            ),
-            OverrideDefinitions(
+            ],
+            "PP looped zero bubble test",
+            "pp_looped_zero_bubble",
+            ngpu=4,
+        ),
+        OverrideDefinitions(
+            [
                 [
-                    [
-                        "--parallelism.pipeline_parallel_degree 2",
-                        "--parallelism.pipeline_parallel_schedule ZBVZeroBubble",
-                    ],
+                    "--parallelism.pipeline_parallel_degree 2",
+                    "--parallelism.pipeline_parallel_schedule ZBVZeroBubble",
                 ],
-                "PP zero bubble test (v shaped)",
-                "pp_zbv",
-                ngpu=2,
-            ),
-        ])
-    integration_tests_flavors["debug_model.toml"].extend([
+            ],
+            "PP zero bubble test (v shaped)",
+            "pp_zbv",
+            ngpu=2,
+        ),
         OverrideDefinitions(
             [
                 [
@@ -270,24 +272,18 @@ def build_features_test_list() -> list[OverrideDefinitions]:
             "pp_looped_1f1b",
             ngpu=4,
         ),
-    ])
-    # check test_with_rocm
-    if test_with_rocm != "1":
-        integration_tests_flavors["debug_model.toml"].extend([
-            OverrideDefinitions(
-                [
-                    [
-                        "--parallelism.pipeline_parallel_degree 2",
-                        "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti",
-                        "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv",
-                    ],
-                ],
-                "PP with custom pipeline schedule loaded from CSV file",
-                "pp_custom_csv",
-                ngpu=2,
-            ),
-        ])
-    integration_tests_flavors["debug_model.toml"].extend([
+        OverrideDefinitions(
+            [
+                [
+                    "--parallelism.pipeline_parallel_degree 2",
+                    "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti",
+                    "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv",
+                ],
+            ],
+            "PP with custom pipeline schedule loaded from CSV file",
+            "pp_custom_csv",
+            ngpu=2,
+        ),
         OverrideDefinitions(
             [
                 [

From 98c7a65efd4adc938e93111082a9f8f522aca056 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Sun, 29 Jun 2025 02:14:23 -0500
Subject: [PATCH 09/33] Changed runner to i-0962598bd0e8298b3 for building ROCm
 docker image.

---
 .github/workflows/docker-builds.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 4289a07ab..bfe2fac82 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -29,7 +29,7 @@ jobs:
           - docker-image-name: torchtitan-ubuntu-20.04-clang12
             runner: [self-hosted, linux.2xlarge]
           - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
-            runner: linux.rocm.gpu.mi300.8
+            runner: i-0962598bd0e8298b3
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 240
     env:

From 9a287760db402f05f424cfef95acff0b000ca37a Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Mon, 30 Jun 2025 10:59:25 -0500
Subject: [PATCH 10/33] Changed runner to linux.12xlarge for building ROCm
 docker image.

---
 .github/workflows/docker-builds.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index bfe2fac82..be12cceaa 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -29,7 +29,7 @@ jobs:
           - docker-image-name: torchtitan-ubuntu-20.04-clang12
             runner: [self-hosted, linux.2xlarge]
           - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
-            runner: i-0962598bd0e8298b3
+            runner: [linux.12xlarge]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 240
     env:

From ab45e78ac1107f3194fcfc9c1c80073c2ae833a2 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Mon, 30 Jun 2025 11:21:03 -0500
Subject: [PATCH 11/33] Changed runner to linux.2xlarge for building ROCm
 docker image.

---
 .github/workflows/docker-builds.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index be12cceaa..84afe8bd2 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -29,7 +29,7 @@ jobs:
           - docker-image-name: torchtitan-ubuntu-20.04-clang12
             runner: [self-hosted, linux.2xlarge]
           - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
-            runner: [linux.12xlarge]
+            runner: [linux.2xlarge]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 240
     env:

From 56bf9303c3616fa4b1d2dd0d85abd3e2b7d62ec0 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Thu, 3 Jul 2025 00:14:42 -0500
Subject: [PATCH 12/33] Resolved conflict in
 .github/workflows/integration_test_8gpu_models.yaml during rebase.

---
 .ci/docker/build.sh                           | 24 ++++++-------
 .ci/docker/ubuntu-rocm/Dockerfile             | 15 --------
 .ci/docker/ubuntu/Dockerfile                  |  4 +--
 .../workflows/integration_test_8gpu_h100.yaml | 31 ++++++++++++-----
 .../integration_test_8gpu_models.yaml         | 33 +++++++++++++-----
 .../workflows/integration_test_8gpu_rocm.yaml | 34 -------------------
 tests/integration_tests/features.py           |  0
 7 files changed, 60 insertions(+), 81 deletions(-)
 delete mode 100644 .ci/docker/ubuntu-rocm/Dockerfile
 delete mode 100644 .github/workflows/integration_test_8gpu_rocm.yaml
 mode change 100644 => 100755 tests/integration_tests/features.py

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index a1aafe3d3..597b2ee5c 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -12,27 +12,21 @@ shift
 
 echo "Building ${IMAGE_NAME} Docker image"
 
-# set operating system
 OS=ubuntu
-
-# set Dockerfile
-DOCKERFILE="${OS}/Dockerfile"
-if [[ "$IMAGE_NAME" == *rocm* ]]; then
-  DOCKERFILE="${OS}-rocm/Dockerfile"
-fi
+CLANG_VERSION=""
+PYTHON_VERSION=3.11
+MINICONDA_VERSION=24.3.0-0
 
 case "${IMAGE_NAME}" in
   torchtitan-ubuntu-20.04-clang12)
     OS_VERSION=20.04
     CLANG_VERSION=12
-    PYTHON_VERSION=3.11
-    MINICONDA_VERSION=24.3.0-0
+    BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
     ;;
-  torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3)
+  torchtitan-rocm-ubuntu-22.04-clang12)
     OS_VERSION=22.04
-    CLANG_VERSION=19
-    PYTHON_VERSION=3.10
-    MINICONDA_VERSION=25.3.1-0
+    CLANG_VERSION=12
+    BASE_IMAGE=rocm/dev-ubuntu-${OS_VERSION}:latest
     ;;
   *)
     echo "Invalid image name ${IMAGE_NAME}"
@@ -42,11 +36,13 @@ esac
 docker build \
   --no-cache \
   --progress=plain \
+  --build-arg "BASE_IMAGE=${BASE_IMAGE}" \
   --build-arg "OS_VERSION=${OS_VERSION}" \
   --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
   --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
-  -f $(dirname ${DOCKERFILE})/Dockerfile \
+  --shm-size=1g \
+  -f "${OS}"/Dockerfile \
   "$@" \
   .
 
diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile
deleted file mode 100644
index ae944279c..000000000
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ /dev/null
@@ -1,15 +0,0 @@
-# base image
-FROM rocm/pytorch-nightly:latest
-
-# args
-ARG OS_VERSION
-ARG CLANG_VERSION
-ARG GCC_VERSION
-ARG MINICONDA_VERSION
-ARG PYTHON_VERSION
-
-# install dependencies
-COPY requirements.txt requirements.txt
-RUN pip install -r ./requirements.txt
-
-CMD ["bash"]
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
index 39e4d8ec5..5d10c01b7 100644
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -1,6 +1,6 @@
-ARG OS_VERSION
+ARG BASE_IMAGE
 
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
+FROM ${BASE_IMAGE}
 
 ARG OS_VERSION
 
diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 94a3c298b..535bbe189 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -24,13 +24,28 @@ defaults:
 jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      matrix:
+        include:
+          - name: cuda
+            runner: linux.aws.h100.8
+            gpu-arch-type: cuda
+            gpu-arch-version: "12.6"
+            # This image is faster to clone than the default, but it lacks CC needed by triton
+            # (1m25s vs 2m37s).
+            docker-image: torchtitan-ubuntu-20.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/cu126
+          - name: rocm
+            runner: linux.rocm.gpu.mi300.8
+            gpu-arch-type: rocm
+            gpu-arch-version: "6.4"
+            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/rocm6.4
     with:
-      runner: linux.aws.h100.8
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
-      # This image is faster to clone than the default, but it lacks CC needed by triton
-      # (1m25s vs 2m37s).
-      docker-image: torchtitan-ubuntu-20.04-clang12
+      runner: ${{ matrix.runner }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       repository: pytorch/torchtitan
       upload-artifact: outputs
       script: |
@@ -46,9 +61,9 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
 
         mkdir artifacts-to-be-uploaded
 
diff --git a/.github/workflows/integration_test_8gpu_models.yaml b/.github/workflows/integration_test_8gpu_models.yaml
index ed044b0c0..0100dc727 100644
--- a/.github/workflows/integration_test_8gpu_models.yaml
+++ b/.github/workflows/integration_test_8gpu_models.yaml
@@ -23,13 +23,30 @@ defaults:
 jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      matrix:
+        include:
+          - name: cuda
+            runner: linux.g5.48xlarge.nvidia.gpu
+            gpu-arch-type: cuda
+            gpu-arch-version: "12.6"
+            # This image is faster to clone than the default, but it lacks CC needed by triton
+            # (1m25s vs 2m37s).
+            docker-image: torchtitan-ubuntu-20.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/cu126
+            is-rocm: 0
+          - name: rocm
+            runner: linux.rocm.gpu.mi300.8
+            gpu-arch-type: rocm
+            gpu-arch-version: "6.4"
+            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/rocm6.4
+            is-rocm: 1
     with:
-      runner: linux.g5.48xlarge.nvidia.gpu
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
-      # This image is faster to clone than the default, but it lacks CC needed by triton
-      # (1m25s vs 2m37s).
-      docker-image: torchtitan-ubuntu-20.04-clang12
+      runner: ${{ matrix.runner }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       repository: pytorch/torchtitan
       upload-artifact: outputs
       script: |
@@ -45,9 +62,9 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
 
         mkdir artifacts-to-be-uploaded
         python -m tests.integration_tests.run_tests --test_suite models artifacts-to-be-uploaded --ngpu 8
diff --git a/.github/workflows/integration_test_8gpu_rocm.yaml b/.github/workflows/integration_test_8gpu_rocm.yaml
deleted file mode 100644
index c5069164d..000000000
--- a/.github/workflows/integration_test_8gpu_rocm.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-name: 8 GPU Integration Test
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-  schedule:
-    # Runs every 6 hours
-    - cron: '0 */6 * * *'
-concurrency:
-  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
-  cancel-in-progress: true
-
-defaults:
-  run:
-    shell: bash -l -eo pipefail {0}
-
-jobs:
-  build-test:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    with:
-      runner: linux.rocm.gpu.mi300.8
-      gpu-arch-type: rocm
-      gpu-arch-version: "6.4"
-      docker-image: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
-      repository: pytorch/torchtitan
-      upload-artifact: outputs
-      script: |
-        set -eux
-
-        USE_CPP=0 python -m pip install --pre torchao
-
-        mkdir artifacts-to-be-uploaded
-        python TEST_WITH_ROCM=1 ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py
old mode 100644
new mode 100755

From 74dbc4a5785c8f75c5600916504245b79faf906e Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Thu, 3 Jul 2025 00:35:08 -0500
Subject: [PATCH 13/33] Changed rocm docker image name in docker-builds.yml.

---
 .github/workflows/docker-builds.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 84afe8bd2..d5f52824d 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -28,7 +28,7 @@ jobs:
         include:
           - docker-image-name: torchtitan-ubuntu-20.04-clang12
             runner: [self-hosted, linux.2xlarge]
-          - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
+          - docker-image-name: torchtitan-rocm-ubuntu-22.04-clang12
             runner: [linux.2xlarge]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 240

From 07a4a736ec4ee0213c45455b3ad3c243d8c5d879 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Wed, 9 Jul 2025 00:54:14 -0500
Subject: [PATCH 14/33] Reverted the changes to
 integration_test_8gpu_h100.yaml.

---
 .../workflows/integration_test_8gpu_h100.yaml | 31 +++++--------------
 1 file changed, 8 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 535bbe189..94a3c298b 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -24,28 +24,13 @@ defaults:
 jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    strategy:
-      matrix:
-        include:
-          - name: cuda
-            runner: linux.aws.h100.8
-            gpu-arch-type: cuda
-            gpu-arch-version: "12.6"
-            # This image is faster to clone than the default, but it lacks CC needed by triton
-            # (1m25s vs 2m37s).
-            docker-image: torchtitan-ubuntu-20.04-clang12
-            index-url: https://download.pytorch.org/whl/nightly/cu126
-          - name: rocm
-            runner: linux.rocm.gpu.mi300.8
-            gpu-arch-type: rocm
-            gpu-arch-version: "6.4"
-            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
-            index-url: https://download.pytorch.org/whl/nightly/rocm6.4
     with:
-      runner: ${{ matrix.runner }}
-      gpu-arch-type: ${{ matrix.gpu-arch-type }}
-      gpu-arch-version: ${{ matrix.gpu-arch-version }}
-      docker-image: ${{ matrix.docker-image }}
+      runner: linux.aws.h100.8
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.6"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: torchtitan-ubuntu-20.04-clang12
       repository: pytorch/torchtitan
       upload-artifact: outputs
       script: |
@@ -61,9 +46,9 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
+        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
 
         mkdir artifacts-to-be-uploaded
 

From be0ecb5c8979091f37d3d361809e0673d30575ff Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Wed, 16 Jul 2025 13:47:13 -0500
Subject: [PATCH 15/33] Empty dummy commit.


From 0f5048e5233aad7591c013589098690e401a8bd9 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Wed, 16 Jul 2025 20:30:01 -0500
Subject: [PATCH 16/33] Increased the timeout to 45 minutes to override timeout
 used in linux_job_v2.yml for integration_test_8gpu_models.yaml.

---
 .github/workflows/integration_test_8gpu_models.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/integration_test_8gpu_models.yaml b/.github/workflows/integration_test_8gpu_models.yaml
index 0100dc727..ba5180686 100644
--- a/.github/workflows/integration_test_8gpu_models.yaml
+++ b/.github/workflows/integration_test_8gpu_models.yaml
@@ -49,6 +49,7 @@ jobs:
       docker-image: ${{ matrix.docker-image }}
       repository: pytorch/torchtitan
       upload-artifact: outputs
+      timeout: 45
       script: |
         set -eux
 

From 7b5dcdffbf614313c778c8b682245c0737a69245 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Thu, 17 Jul 2025 13:29:35 -0500
Subject: [PATCH 17/33] Empty dummy commit.


From 2512cf581e4d1293d68393f316bcbaccd2dd3109 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Tue, 23 Sep 2025 00:29:54 -0500
Subject: [PATCH 18/33] Added aws setup in the integration_test_8gpu_models workflow.

---
 .../workflows/integration_test_8gpu_models.yaml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/.github/workflows/integration_test_8gpu_models.yaml b/.github/workflows/integration_test_8gpu_models.yaml
index ba5180686..3848843c2 100644
--- a/.github/workflows/integration_test_8gpu_models.yaml
+++ b/.github/workflows/integration_test_8gpu_models.yaml
@@ -21,6 +21,23 @@ defaults:
     shell: bash -l -eo pipefail {0}
 
 jobs:
+  # aws setup
+  aws-setup:
+    runs-on: linux.rocm.gpu.mi300.8
+    steps:
+      - name: configure aws credentials
+        id: aws_creds
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          aws-region: us-east-1
+          role-duration-seconds: 18000
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        continue-on-error: true
+        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
+
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     strategy:

From c23e65bcac793cd77d4df931d4e94b18344ce242 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Thu, 25 Sep 2025 23:44:59 -0500
Subject: [PATCH 19/33] Performed rebase and made changes to include code
 refactoring done upstream.

---
 .github/workflows/integration_test_8gpu.yaml  | 88 +++++++++++++++++++
 .../integration_test_8gpu_models.yaml         | 51 ++---------
 tests/integration_tests/features.py           | 11 +--
 tests/integration_tests/run_tests.py          | 11 +++
 4 files changed, 108 insertions(+), 53 deletions(-)
 create mode 100644 .github/workflows/integration_test_8gpu.yaml

diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
new file mode 100644
index 000000000..441177c44
--- /dev/null
+++ b/.github/workflows/integration_test_8gpu.yaml
@@ -0,0 +1,88 @@
+name: 8 GPU Model Tests
+
+on:
+  push:
+    branches: [ main ]
+    paths-ignore:
+      - 'torchtitan/experiments/**'
+  pull_request:
+    paths-ignore:
+      - 'torchtitan/experiments/**'
+  schedule:
+    # Runs every 6 hours
+    - cron: '0 */6 * * *'
+
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  # aws setup
+  aws-setup:
+    runs-on: linux.rocm.gpu.mi300.8
+    steps:
+      - name: configure aws credentials
+        id: aws_creds
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          aws-region: us-east-1
+          role-duration-seconds: 18000
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        continue-on-error: true
+        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
+
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      matrix:
+        include:
+          - name: cuda
+            runner: linux.g5.48xlarge.nvidia.gpu
+            gpu-arch-type: cuda
+            gpu-arch-version: "12.6"
+            # This image is faster to clone than the default, but it lacks CC needed by triton
+            # (1m25s vs 2m37s).
+            docker-image: torchtitan-ubuntu-20.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/cu126
+            is-rocm: 0
+          - name: rocm
+            runner: linux.rocm.gpu.mi300.8
+            gpu-arch-type: rocm
+            gpu-arch-version: "6.4"
+            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/rocm6.4
+            is-rocm: 1
+    with:
+      runner: ${{ matrix.runner }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      timeout: 45
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Log CUDA driver version for debugging.
+        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
+        echo "CUDA driver version: ${DRIVER_VERSION}"
+
+        pip config --user set global.progress_bar off
+
+        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
+
+        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
+
+        mkdir artifacts-to-be-uploaded
+        TEST_WITH_ROCM=${{ matrix.is-rocm }} python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 8
diff --git a/.github/workflows/integration_test_8gpu_models.yaml b/.github/workflows/integration_test_8gpu_models.yaml
index 3848843c2..ed044b0c0 100644
--- a/.github/workflows/integration_test_8gpu_models.yaml
+++ b/.github/workflows/integration_test_8gpu_models.yaml
@@ -21,52 +21,17 @@ defaults:
     shell: bash -l -eo pipefail {0}
 
 jobs:
-  # aws setup
-  aws-setup:
-    runs-on: linux.rocm.gpu.mi300.8
-    steps:
-      - name: configure aws credentials
-        id: aws_creds
-        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-          aws-region: us-east-1
-          role-duration-seconds: 18000
-
-      - name: Login to Amazon ECR
-        id: login-ecr
-        continue-on-error: true
-        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
-
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    strategy:
-      matrix:
-        include:
-          - name: cuda
-            runner: linux.g5.48xlarge.nvidia.gpu
-            gpu-arch-type: cuda
-            gpu-arch-version: "12.6"
-            # This image is faster to clone than the default, but it lacks CC needed by triton
-            # (1m25s vs 2m37s).
-            docker-image: torchtitan-ubuntu-20.04-clang12
-            index-url: https://download.pytorch.org/whl/nightly/cu126
-            is-rocm: 0
-          - name: rocm
-            runner: linux.rocm.gpu.mi300.8
-            gpu-arch-type: rocm
-            gpu-arch-version: "6.4"
-            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
-            index-url: https://download.pytorch.org/whl/nightly/rocm6.4
-            is-rocm: 1
     with:
-      runner: ${{ matrix.runner }}
-      gpu-arch-type: ${{ matrix.gpu-arch-type }}
-      gpu-arch-version: ${{ matrix.gpu-arch-version }}
-      docker-image: ${{ matrix.docker-image }}
+      runner: linux.g5.48xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.6"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: torchtitan-ubuntu-20.04-clang12
       repository: pytorch/torchtitan
       upload-artifact: outputs
-      timeout: 45
       script: |
         set -eux
 
@@ -80,9 +45,9 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
+        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
 
         mkdir artifacts-to-be-uploaded
         python -m tests.integration_tests.run_tests --test_suite models artifacts-to-be-uploaded --ngpu 8
diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py
index 6fce80052..31c15017d 100755
--- a/tests/integration_tests/features.py
+++ b/tests/integration_tests/features.py
@@ -7,16 +7,6 @@
 
 from tests.integration_tests import OverrideDefinitions
 
-# tests skipped for ROCm
-skip_for_rocm_test_list = [
-    "pp_looped_zero_bubble",
-    "pp_zbv",
-    "pp_custom_csv",
-    "last_save_model_weights_only_bf16",
-    "last_save_model_weights_only_fp32",
-]
-TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1"
-
 
 def build_features_test_list() -> list[OverrideDefinitions]:
     """
@@ -538,4 +528,5 @@ def build_features_test_list() -> list[OverrideDefinitions]:
             ngpu=8,
         ),
     ]
+
     return integration_tests_flavors
diff --git a/tests/integration_tests/run_tests.py b/tests/integration_tests/run_tests.py
index a64c69eb6..57b9fca2b 100644
--- a/tests/integration_tests/run_tests.py
+++ b/tests/integration_tests/run_tests.py
@@ -24,6 +24,13 @@
 }
 
 
+# tests skipped for ROCm
+skip_for_rocm_test_list = [
+    "model_only_hf_checkpoint",
+]
+TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1"
+
+
 def _run_cmd(cmd):
     return subprocess.run([cmd], text=True, shell=True)
 
@@ -87,6 +94,10 @@ def run_tests(args, test_list: list[OverrideDefinitions]):
         if test_flavor.disabled:
             continue
 
+        # Skip the test for ROCm
+        if TEST_WITH_ROCM and test_flavor.test_name in skip_for_rocm_test_list:
+            continue
+
         # Check if we have enough GPUs
         if args.ngpu < test_flavor.ngpu:
             logger.info(

From a99db9f2b08b41c9ac6c9ef7e52caffd1dfc5414 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Fri, 26 Sep 2025 11:00:30 -0500
Subject: [PATCH 20/33] Changed rocm runner name.

---
 .github/workflows/integration_test_8gpu.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
index 441177c44..b89d148bd 100644
--- a/.github/workflows/integration_test_8gpu.yaml
+++ b/.github/workflows/integration_test_8gpu.yaml
@@ -23,7 +23,7 @@ defaults:
 jobs:
   # aws setup
   aws-setup:
-    runs-on: linux.rocm.gpu.mi300.8
+    runs-on: linux.rocm.gpu.gfx942.8
     steps:
       - name: configure aws credentials
         id: aws_creds
@@ -53,7 +53,7 @@ jobs:
             index-url: https://download.pytorch.org/whl/nightly/cu126
             is-rocm: 0
           - name: rocm
-            runner: linux.rocm.gpu.mi300.8
+            runner: linux.rocm.gpu.gfx942.8
             gpu-arch-type: rocm
             gpu-arch-version: "6.4"
             docker-image: torchtitan-rocm-ubuntu-22.04-clang12

From 3d331bc1fd40bf0db62d884c4e338c06a9379c50 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Fri, 26 Sep 2025 14:03:05 -0500
Subject: [PATCH 21/33] Added a change to run build-test after aws-setup.

---
 .github/workflows/integration_test_8gpu.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
index b89d148bd..0fcb965e9 100644
--- a/.github/workflows/integration_test_8gpu.yaml
+++ b/.github/workflows/integration_test_8gpu.yaml
@@ -39,6 +39,7 @@ jobs:
         uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
 
   build-test:
+    needs: aws-setup
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     strategy:
       matrix:

From 7d359dd99ef90c922baa285e70da4a11c72403eb Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Fri, 26 Sep 2025 14:10:42 -0500
Subject: [PATCH 22/33] Changed the test name in integration_test_8gpu.yaml
 workflow file.

---
 .github/workflows/integration_test_8gpu.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
index 0fcb965e9..40b8c6568 100644
--- a/.github/workflows/integration_test_8gpu.yaml
+++ b/.github/workflows/integration_test_8gpu.yaml
@@ -1,4 +1,4 @@
-name: 8 GPU Model Tests
+name: 8 GPU Integration Test
 
 on:
   push:

From 0f5c57f1f1640d641aae53c44932bd8373c8837a Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Mon, 29 Sep 2025 12:20:04 -0500
Subject: [PATCH 23/33] Fixed id-token permission issue in
 integration_test_8gpu.yaml.

---
 .github/workflows/integration_test_8gpu.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
index 40b8c6568..851011b1a 100644
--- a/.github/workflows/integration_test_8gpu.yaml
+++ b/.github/workflows/integration_test_8gpu.yaml
@@ -20,6 +20,10 @@ defaults:
   run:
     shell: bash -l -eo pipefail {0}
 
+permissions:
+  id-token: write
+  contents: read
+
 jobs:
   # aws setup
   aws-setup:

From a8368a2248dac5b87a6bde8e27766eb26d7b606f Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Mon, 29 Sep 2025 12:37:51 -0500
Subject: [PATCH 24/33] Moved id-token permissions inside the aws-setup job in
 integration_test_8gpu.yaml.

---
 .github/workflows/integration_test_8gpu.yaml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
index 851011b1a..b5ed4fc93 100644
--- a/.github/workflows/integration_test_8gpu.yaml
+++ b/.github/workflows/integration_test_8gpu.yaml
@@ -20,13 +20,12 @@ defaults:
   run:
     shell: bash -l -eo pipefail {0}
 
-permissions:
-  id-token: write
-  contents: read
-
 jobs:
   # aws setup
   aws-setup:
+    permissions:
+      id-token: write
+      contents: read
     runs-on: linux.rocm.gpu.gfx942.8
     steps:
       - name: configure aws credentials

From 36fb0e53550a8192e605f71e0857257053fcda3d Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Mon, 29 Sep 2025 14:39:25 -0500
Subject: [PATCH 25/33] To test the workflow, switched to 4 GPU runners as they
 are more readily available.

---
 .github/workflows/integration_test_8gpu.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
index b5ed4fc93..0ff2ff509 100644
--- a/.github/workflows/integration_test_8gpu.yaml
+++ b/.github/workflows/integration_test_8gpu.yaml
@@ -26,7 +26,7 @@ jobs:
     permissions:
       id-token: write
       contents: read
-    runs-on: linux.rocm.gpu.gfx942.8
+    runs-on: linux.rocm.gpu.gfx942.4
     steps:
       - name: configure aws credentials
         id: aws_creds
@@ -57,7 +57,7 @@ jobs:
             index-url: https://download.pytorch.org/whl/nightly/cu126
             is-rocm: 0
           - name: rocm
-            runner: linux.rocm.gpu.gfx942.8
+            runner: linux.rocm.gpu.gfx942.4
             gpu-arch-type: rocm
             gpu-arch-version: "6.4"
             docker-image: torchtitan-rocm-ubuntu-22.04-clang12
@@ -89,4 +89,4 @@ jobs:
         USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
 
         mkdir artifacts-to-be-uploaded
-        TEST_WITH_ROCM=${{ matrix.is-rocm }} python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 8
+        TEST_WITH_ROCM=${{ matrix.is-rocm }} python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 4

From 1fba2ab8a183dffea901a391bd08e59fc81d1fb6 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Tue, 30 Sep 2025 12:38:36 -0500
Subject: [PATCH 26/33] Moved permissions section for id-token outside the
 aws-setup job.

---
 .github/workflows/integration_test_8gpu.yaml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
index 0ff2ff509..898cfd529 100644
--- a/.github/workflows/integration_test_8gpu.yaml
+++ b/.github/workflows/integration_test_8gpu.yaml
@@ -12,6 +12,10 @@ on:
     # Runs every 6 hours
     - cron: '0 */6 * * *'
 
+permissions:
+      id-token: write
+      contents: read
+
 concurrency:
   group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
@@ -23,9 +27,6 @@ defaults:
 jobs:
   # aws setup
   aws-setup:
-    permissions:
-      id-token: write
-      contents: read
     runs-on: linux.rocm.gpu.gfx942.4
     steps:
       - name: configure aws credentials

From acbedff8136ecf55c4cc462e455a67b462b7084f Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Mon, 6 Oct 2025 16:19:31 -0500
Subject: [PATCH 27/33] Using move_aws_steps_inside_setup_rocm branch to do aws
 authentication in linux_job_v2.yml.

---
 .github/workflows/integration_test_8gpu.yaml | 28 ++++----------------
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
index 898cfd529..a65ca937c 100644
--- a/.github/workflows/integration_test_8gpu.yaml
+++ b/.github/workflows/integration_test_8gpu.yaml
@@ -12,10 +12,6 @@ on:
     # Runs every 6 hours
     - cron: '0 */6 * * *'
 
-permissions:
-      id-token: write
-      contents: read
-
 concurrency:
   group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
@@ -24,27 +20,13 @@ defaults:
   run:
     shell: bash -l -eo pipefail {0}
 
-jobs:
-  # aws setup
-  aws-setup:
-    runs-on: linux.rocm.gpu.gfx942.4
-    steps:
-      - name: configure aws credentials
-        id: aws_creds
-        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-          aws-region: us-east-1
-          role-duration-seconds: 18000
-
-      - name: Login to Amazon ECR
-        id: login-ecr
-        continue-on-error: true
-        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
+permissions:
+      id-token: write
+      contents: read
 
+jobs:
   build-test:
-    needs: aws-setup
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@move_aws_steps_inside_setup_rocm
     strategy:
       matrix:
         include:

From f8577d7fd3f3d69bef68737ab4a6e8531870a5a1 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Tue, 7 Oct 2025 00:03:11 -0500
Subject: [PATCH 28/33] Using linux_job_v2.yml from akashveramd fork having
 aws setup only for rocm.

---
 .github/workflows/integration_test_8gpu.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
index a65ca937c..830ddef56 100644
--- a/.github/workflows/integration_test_8gpu.yaml
+++ b/.github/workflows/integration_test_8gpu.yaml
@@ -26,7 +26,7 @@ permissions:
 
 jobs:
   build-test:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@move_aws_steps_inside_setup_rocm
+    uses: akashveramd/test-infra/.github/workflows/linux_job_v2.yml@linux_job_v2_main_rocm_aws
     strategy:
       matrix:
         include:

From 962608347e336e534d2565671f635e0479cb9425 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Tue, 7 Oct 2025 14:42:13 -0500
Subject: [PATCH 29/33] Using linux_job_v2.yml having id-token write
 permissions from pytorch and move_aws_steps_inside_setup_rocm branch.

---
 .github/workflows/integration_test_8gpu.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
index 830ddef56..a65ca937c 100644
--- a/.github/workflows/integration_test_8gpu.yaml
+++ b/.github/workflows/integration_test_8gpu.yaml
@@ -26,7 +26,7 @@ permissions:
 
 jobs:
   build-test:
-    uses: akashveramd/test-infra/.github/workflows/linux_job_v2.yml@linux_job_v2_main_rocm_aws
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@move_aws_steps_inside_setup_rocm
     strategy:
       matrix:
         include:

From 6985eec00f5fb9b125b3dbbf01b85aee6df24eaa Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Wed, 8 Oct 2025 15:16:01 -0500
Subject: [PATCH 30/33] Removed integration_test_8gpu.yaml and added ROCm
 workflow to run features tests inside integration_test_8gpu_features.yaml.
 Using linux_job_v2.yml from the main branch. Rolled back to using 8 GPU
 runner for ROCm.

---
 .github/workflows/integration_test_8gpu.yaml  | 75 -------------------
 .../integration_test_8gpu_features.yaml       | 41 +++++++---
 2 files changed, 32 insertions(+), 84 deletions(-)
 delete mode 100644 .github/workflows/integration_test_8gpu.yaml

diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
deleted file mode 100644
index a65ca937c..000000000
--- a/.github/workflows/integration_test_8gpu.yaml
+++ /dev/null
@@ -1,75 +0,0 @@
-name: 8 GPU Integration Test
-
-on:
-  push:
-    branches: [ main ]
-    paths-ignore:
-      - 'torchtitan/experiments/**'
-  pull_request:
-    paths-ignore:
-      - 'torchtitan/experiments/**'
-  schedule:
-    # Runs every 6 hours
-    - cron: '0 */6 * * *'
-
-concurrency:
-  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
-  cancel-in-progress: true
-
-defaults:
-  run:
-    shell: bash -l -eo pipefail {0}
-
-permissions:
-      id-token: write
-      contents: read
-
-jobs:
-  build-test:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@move_aws_steps_inside_setup_rocm
-    strategy:
-      matrix:
-        include:
-          - name: cuda
-            runner: linux.g5.48xlarge.nvidia.gpu
-            gpu-arch-type: cuda
-            gpu-arch-version: "12.6"
-            # This image is faster to clone than the default, but it lacks CC needed by triton
-            # (1m25s vs 2m37s).
-            docker-image: torchtitan-ubuntu-20.04-clang12
-            index-url: https://download.pytorch.org/whl/nightly/cu126
-            is-rocm: 0
-          - name: rocm
-            runner: linux.rocm.gpu.gfx942.4
-            gpu-arch-type: rocm
-            gpu-arch-version: "6.4"
-            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
-            index-url: https://download.pytorch.org/whl/nightly/rocm6.4
-            is-rocm: 1
-    with:
-      runner: ${{ matrix.runner }}
-      gpu-arch-type: ${{ matrix.gpu-arch-type }}
-      gpu-arch-version: ${{ matrix.gpu-arch-version }}
-      docker-image: ${{ matrix.docker-image }}
-      repository: pytorch/torchtitan
-      upload-artifact: outputs
-      timeout: 45
-      script: |
-        set -eux
-
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Log CUDA driver version for debugging.
-        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
-        echo "CUDA driver version: ${DRIVER_VERSION}"
-
-        pip config --user set global.progress_bar off
-
-        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
-
-        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
-
-        mkdir artifacts-to-be-uploaded
-        TEST_WITH_ROCM=${{ matrix.is-rocm }} python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 4
diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml
index 4f3421fce..475bbd960 100644
--- a/.github/workflows/integration_test_8gpu_features.yaml
+++ b/.github/workflows/integration_test_8gpu_features.yaml
@@ -1,4 +1,5 @@
 name: 8 GPU Feature Tests
+
 on:
   push:
     branches: [ main ]
@@ -19,18 +20,40 @@ defaults:
   run:
     shell: bash -l -eo pipefail {0}
 
+permissions:
+      id-token: write
+      contents: read
+
 jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      matrix:
+        include:
+          - name: cuda
+            runner: linux.g5.48xlarge.nvidia.gpu
+            gpu-arch-type: cuda
+            gpu-arch-version: "12.6"
+            # This image is faster to clone than the default (1m25s vs 2m37s),
+            # but it lacks CC needed by triton.
+            docker-image: torchtitan-ubuntu-20.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/cu126
+            is-rocm: 0
+          - name: rocm
+            runner: linux.rocm.gpu.gfx942.8
+            gpu-arch-type: rocm
+            gpu-arch-version: "6.4"
+            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/rocm6.4
+            is-rocm: 1
     with:
-      runner: linux.g5.48xlarge.nvidia.gpu
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
-      # This image is faster to clone than the default, but it lacks CC needed by triton
-      # (1m25s vs 2m37s).
-      docker-image: torchtitan-ubuntu-20.04-clang12
+      runner: ${{ matrix.runner }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       repository: pytorch/torchtitan
       upload-artifact: outputs
+      timeout: 45
       script: |
         set -eux
 
@@ -44,9 +67,9 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
 
         mkdir artifacts-to-be-uploaded
-        python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 8
+        TEST_WITH_ROCM=${{ matrix.is-rocm }} python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 4

From b9ab5c278e43c9cdd903858bf2518886ab1ccd2d Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Thu, 9 Oct 2025 11:37:54 -0500
Subject: [PATCH 31/33] Using 7311 branch for linux_job_v2.yml.

---
 .github/workflows/integration_test_8gpu_features.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml
index 475bbd960..91875ae2d 100644
--- a/.github/workflows/integration_test_8gpu_features.yaml
+++ b/.github/workflows/integration_test_8gpu_features.yaml
@@ -26,7 +26,7 @@ permissions:
 
 jobs:
   build-test:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@7311
     strategy:
       matrix:
         include:

From 184e8be4b74ffc87d1dcb9e3a58013cb05cb02a5 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Thu, 9 Oct 2025 18:13:41 -0500
Subject: [PATCH 32/33] Using 4 GPU runner for ROCm.

---
 .github/workflows/integration_test_8gpu_features.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml
index 91875ae2d..a0275dc4d 100644
--- a/.github/workflows/integration_test_8gpu_features.yaml
+++ b/.github/workflows/integration_test_8gpu_features.yaml
@@ -40,7 +40,7 @@ jobs:
             index-url: https://download.pytorch.org/whl/nightly/cu126
             is-rocm: 0
           - name: rocm
-            runner: linux.rocm.gpu.gfx942.8
+            runner: linux.rocm.gpu.gfx942.4
             gpu-arch-type: rocm
             gpu-arch-version: "6.4"
             docker-image: torchtitan-rocm-ubuntu-22.04-clang12

From 11cb25d3a62f2088a940113166e163aad26b31e1 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Fri, 10 Oct 2025 23:51:26 -0500
Subject: [PATCH 33/33] Switched back to main branch for linux_job_v2.yml.

---
 .github/workflows/integration_test_8gpu_features.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml
index a0275dc4d..85ddbb580 100644
--- a/.github/workflows/integration_test_8gpu_features.yaml
+++ b/.github/workflows/integration_test_8gpu_features.yaml
@@ -26,7 +26,7 @@ permissions:
 
 jobs:
   build-test:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@7311
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     strategy:
       matrix:
         include: