diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 34c1f0d45..597b2ee5c 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -13,14 +13,20 @@ shift
 echo "Building ${IMAGE_NAME} Docker image"
 
 OS=ubuntu
-OS_VERSION=20.04
 CLANG_VERSION=""
 PYTHON_VERSION=3.11
 MINICONDA_VERSION=24.3.0-0
 
 case "${IMAGE_NAME}" in
   torchtitan-ubuntu-20.04-clang12)
+    OS_VERSION=20.04
     CLANG_VERSION=12
+    BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
+    ;;
+  torchtitan-rocm-ubuntu-22.04-clang12)
+    OS_VERSION=22.04
+    CLANG_VERSION=12
+    BASE_IMAGE=rocm/dev-ubuntu-${OS_VERSION}:latest
     ;;
   *)
     echo "Invalid image name ${IMAGE_NAME}"
@@ -30,6 +36,7 @@ esac
 docker build \
   --no-cache \
   --progress=plain \
+  --build-arg "BASE_IMAGE=${BASE_IMAGE}" \
   --build-arg "OS_VERSION=${OS_VERSION}" \
   --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
@@ -38,3 +45,4 @@ docker build \
   -f "${OS}"/Dockerfile \
   "$@" \
   .
+
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
index 39e4d8ec5..5d10c01b7 100644
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -1,6 +1,6 @@
-ARG OS_VERSION
+ARG BASE_IMAGE
 
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
+FROM ${BASE_IMAGE}
 
 ARG OS_VERSION
 
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 11ff5390c..d5f52824d 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -22,13 +22,16 @@ concurrency:
 
 jobs:
   docker-build:
-    runs-on: [self-hosted, linux.2xlarge]
-    timeout-minutes: 240
     strategy:
       fail-fast: false
       matrix:
         include:
           - docker-image-name: torchtitan-ubuntu-20.04-clang12
+            runner: [self-hosted, linux.2xlarge]
+          - docker-image-name: torchtitan-rocm-ubuntu-22.04-clang12
+            runner: [linux.2xlarge]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 240
     env:
       DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/torchtitan/${{ matrix.docker-image-name }}
     steps:
diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml
index 4f3421fce..85ddbb580 100644
--- a/.github/workflows/integration_test_8gpu_features.yaml
+++ b/.github/workflows/integration_test_8gpu_features.yaml
@@ -1,4 +1,5 @@
 name: 8 GPU Feature Tests
+
 on:
   push:
     branches: [ main ]
@@ -19,18 +20,40 @@ defaults:
   run:
     shell: bash -l -eo pipefail {0}
 
+permissions:
+  id-token: write
+  contents: read
+
 jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      matrix:
+        include:
+          - name: cuda
+            runner: linux.g5.48xlarge.nvidia.gpu
+            gpu-arch-type: cuda
+            gpu-arch-version: "12.6"
+            # This image is faster to clone than the default, but it lacks CC needed by triton
+            # (1m25s vs 2m37s).
+            docker-image: torchtitan-ubuntu-20.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/cu126
+            is-rocm: 0
+          - name: rocm
+            runner: linux.rocm.gpu.gfx942.4
+            gpu-arch-type: rocm
+            gpu-arch-version: "6.4"
+            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/rocm6.4
+            is-rocm: 1
     with:
-      runner: linux.g5.48xlarge.nvidia.gpu
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
-      # This image is faster to clone than the default, but it lacks CC needed by triton
-      # (1m25s vs 2m37s).
-      docker-image: torchtitan-ubuntu-20.04-clang12
+      runner: ${{ matrix.runner }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       repository: pytorch/torchtitan
       upload-artifact: outputs
+      timeout: 45
       script: |
         set -eux
 
@@ -44,9 +67,9 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
 
         mkdir artifacts-to-be-uploaded
-        python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 8
+        TEST_WITH_ROCM=${{ matrix.is-rocm }} python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 4
diff --git a/tests/integration_tests/run_tests.py b/tests/integration_tests/run_tests.py
index a64c69eb6..57b9fca2b 100644
--- a/tests/integration_tests/run_tests.py
+++ b/tests/integration_tests/run_tests.py
@@ -24,6 +24,13 @@
 }
 
 
+# tests skipped for ROCm
+skip_for_rocm_test_list = [
+    "model_only_hf_checkpoint",
+]
+TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1"
+
+
 def _run_cmd(cmd):
     return subprocess.run([cmd], text=True, shell=True)
 
@@ -87,6 +94,10 @@ def run_tests(args, test_list: list[OverrideDefinitions]):
         if test_flavor.disabled:
             continue
 
+        # Skip the test for ROCm
+        if TEST_WITH_ROCM and test_flavor.test_name in skip_for_rocm_test_list:
+            continue
+
         # Check if we have enough GPUs
         if args.ngpu < test_flavor.ngpu:
             logger.info(
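For readers skimming the patch, the new ROCm gating in `tests/integration_tests/run_tests.py` reduces to an environment-variable check against a small skip list: the CI matrix exports `TEST_WITH_ROCM` (1 on the ROCm runner, 0 on CUDA), and any test whose name appears in `skip_for_rocm_test_list` is skipped. The sketch below illustrates that pattern in isolation; the `TestFlavor` dataclass, the `select_tests` helper, and the second sample test name are hypothetical stand-ins for torchtitan's `OverrideDefinitions` entries, while `TEST_WITH_ROCM` and `skip_for_rocm_test_list` mirror the patch above.

```python
import os
from dataclasses import dataclass

# Mirrors the patch: tests that are skipped when running on ROCm runners.
skip_for_rocm_test_list = [
    "model_only_hf_checkpoint",
]
# The workflow matrix exports TEST_WITH_ROCM=1 for the ROCm job and 0 for CUDA.
TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1"


@dataclass
class TestFlavor:
    """Hypothetical stand-in for torchtitan's OverrideDefinitions."""

    test_name: str
    ngpu: int
    disabled: bool = False


def select_tests(test_list: list[TestFlavor], ngpu: int) -> list[TestFlavor]:
    """Return the tests that would actually run, applying the same gates as run_tests()."""
    selected = []
    for test_flavor in test_list:
        if test_flavor.disabled:
            continue
        # Skip the test for ROCm, as in the patched run_tests()
        if TEST_WITH_ROCM and test_flavor.test_name in skip_for_rocm_test_list:
            continue
        # Skip tests that need more GPUs than the runner provides
        if ngpu < test_flavor.ngpu:
            continue
        selected.append(test_flavor)
    return selected


if __name__ == "__main__":
    tests = [
        TestFlavor("model_only_hf_checkpoint", ngpu=4),
        TestFlavor("gradient_accumulation", ngpu=4),  # hypothetical test name
    ]
    # With TEST_WITH_ROCM=1 only the second test survives; with 0 both run.
    print([t.test_name for t in select_tests(tests, ngpu=4)])
```

Gating via an environment variable keeps the skip list in one place inside the test runner, so the same workflow file can drive both the CUDA and ROCm matrix entries without duplicating the test invocation.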