From fe07fcd121c7488363a5835a185990f12bb612f6 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 11:35:39 -0700
Subject: [PATCH 01/27] [DO NOT REVIEW] Debug Async TP CI: set gpu-arch-version to 12.8

---
 .github/workflows/integration_test_8gpu_h100.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 94a3c298b3..013ded3444 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -27,7 +27,7 @@ jobs:
     with:
       runner: linux.aws.h100.8
       gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
+      gpu-arch-version: "12.8"
       # This image is faster to clone than the default, but it lacks CC needed by triton
       # (1m25s vs 2m37s).
       docker-image: torchtitan-ubuntu-20.04-clang12

From e271e502abd553814ef1606e138a30c30240117f Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 11:45:21 -0700
Subject: [PATCH 02/27] Install torch and torchao nightlies from the cu128 index

---
 .github/workflows/integration_test_8gpu_h100.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 013ded3444..d655c98879 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -46,9 +46,9 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu128
 
         mkdir artifacts-to-be-uploaded
 

From 056e37af3af1f4a30414ebf5559db8303ce2a053 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 13:34:43 -0700
Subject: [PATCH 03/27] Docker build: Ubuntu 24.04, Python 3.12; replace --shm-size with CUDA_VERSION build arg

---
 .ci/docker/build.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 34c1f0d45a..7139cdbedc 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -13,9 +13,9 @@ shift
 echo "Building ${IMAGE_NAME} Docker image"
 
 OS=ubuntu
-OS_VERSION=20.04
+OS_VERSION=24.04
 CLANG_VERSION=""
-PYTHON_VERSION=3.11
+PYTHON_VERSION=3.12
 MINICONDA_VERSION=24.3.0-0
 
 case "${IMAGE_NAME}" in
@@ -34,7 +34,7 @@ docker build \
   --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
   --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
-  --shm-size=1g \
+  --build-arg "CUDA_VERSION=12.8.1" \
   -f "${OS}"/Dockerfile \
   "$@" \
   .

From 0582a7e6fa0178bb66422b186a5fdde3ef1b9642 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 13:44:50 -0700
Subject: [PATCH 04/27] Switch Docker base image to nvidia/cuda 12.8.1

---
 .ci/docker/ubuntu/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
index 39e4d8ec5f..35bedc77f9 100644
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -1,6 +1,6 @@
 ARG OS_VERSION
 
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
+FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu${OS_VERSION}
 
 ARG OS_VERSION
 

From 55703de43506f2e1f83d0bab4189c554ebbd75dc Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 13:52:38 -0700
Subject: [PATCH 05/27] Docker build: change OS_VERSION from 24.04 to 22.04

---
 .ci/docker/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 7139cdbedc..e87f1742dc 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -13,7 +13,7 @@ shift
 echo "Building ${IMAGE_NAME} Docker image"
 
 OS=ubuntu
-OS_VERSION=24.04
+OS_VERSION=22.04
 CLANG_VERSION=""
 PYTHON_VERSION=3.12
 MINICONDA_VERSION=24.3.0-0

From 30cdb4123e8402f75378fac33a527385602490b3 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 14:57:09 -0700
Subject: [PATCH 06/27] Run H100 integration tests with CUDA_LAUNCH_BLOCKING=1

---
 .github/workflows/integration_test_8gpu_h100.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index d655c98879..7055e58b5d 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -53,4 +53,4 @@ jobs:
         mkdir artifacts-to-be-uploaded
 
         # Enable CPP stacktraces for debugging symmetric memory initialization errors.
-        TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8
+        CUDA_LAUNCH_BLOCKING=1 TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8

From 130fae47d2f50705ee5f532ab0aff9af6311636d Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 15:43:35 -0700
Subject: [PATCH 07/27] Run H100 integration tests with USE_PYTORCH_KERNEL_CACHE=0

---
 .github/workflows/integration_test_8gpu_h100.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 7055e58b5d..46809134d7 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -53,4 +53,4 @@ jobs:
         mkdir artifacts-to-be-uploaded
 
         # Enable CPP stacktraces for debugging symmetric memory initialization errors.
-        CUDA_LAUNCH_BLOCKING=1 TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8
+        USE_PYTORCH_KERNEL_CACHE=0 CUDA_LAUNCH_BLOCKING=1 TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8

From 565a79cb0ffa6d206606d14b5d45dfbc9eb66e71 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 15:55:25 -0700
Subject: [PATCH 08/27] Print free, df, and nvidia-smi output before the H100 integration tests

---
 .github/workflows/integration_test_8gpu_h100.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 46809134d7..5f5825f70c 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -52,5 +52,11 @@ jobs:
 
         mkdir artifacts-to-be-uploaded
 
+        free -h
+
+        df -h
+
+        nvidia-smi
+
         # Enable CPP stacktraces for debugging symmetric memory initialization errors.
         USE_PYTORCH_KERNEL_CACHE=0 CUDA_LAUNCH_BLOCKING=1 TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8

From ee7cb527ca5a89ca5fa232cd31deb45653877602 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 16:13:53 -0700
Subject: [PATCH 09/27] h100 tests: run Float8 first and switch async TP compile test to TP degree 8

---
 tests/integration_tests/h100.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/tests/integration_tests/h100.py b/tests/integration_tests/h100.py
index ae1fb5b597..a8478c5a0f 100755
--- a/tests/integration_tests/h100.py
+++ b/tests/integration_tests/h100.py
@@ -22,24 +22,25 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
         OverrideDefinitions(
             [
                 [
-                    "--compile.enable",
-                    "--parallelism.tensor_parallel_degree 2",
-                    "--parallelism.enable_async_tensor_parallel",
+                    "--model.converters float8",
+                    "--float8.enable_fsdp_float8_all_gather",
+                    "--float8.precompute_float8_dynamic_scale_for_fsdp",
                 ],
             ],
-            "2D async TP compile",
-            "2d_asynctp_compile",
+            "Float8 test",
+            "float8",
         ),
         OverrideDefinitions(
             [
                 [
-                    "--model.converters float8",
-                    "--float8.enable_fsdp_float8_all_gather",
-                    "--float8.precompute_float8_dynamic_scale_for_fsdp",
+                    "--compile.enable",
+                    "--parallelism.tensor_parallel_degree 8",
+                    "--parallelism.enable_async_tensor_parallel",
                 ],
             ],
-            "Float8 test",
-            "float8",
+            "8D async TP compile",
+            "8d_asynctp_compile",
+            ngpu=8,
         ),
         OverrideDefinitions(
             [

From 53ed590341c208f803f84320cd428823f75e8efe Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 16:22:01 -0700
Subject: [PATCH 10/27] models tests: add 8-GPU async TP compile test

---
 tests/integration_tests/models.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/integration_tests/models.py b/tests/integration_tests/models.py
index 81336ab09a..85570fefc6 100755
--- a/tests/integration_tests/models.py
+++ b/tests/integration_tests/models.py
@@ -18,6 +18,18 @@ def build_model_tests_list() -> list[OverrideDefinitions]:
         A dictionary where each key is a model name and value is a list of OverrideDefinitions
     """
     model_tests = [
+        OverrideDefinitions(
+            [
+                [
+                    "--compile.enable",
+                    "--parallelism.tensor_parallel_degree 8",
+                    "--parallelism.enable_async_tensor_parallel",
+                ],
+            ],
+            "8D async TP compile",
+            "8d_asynctp_compile",
+            ngpu=8,
+        ),
         # Integration Test Cases for DeepSeek-V3
         OverrideDefinitions(
             [

From 98a379bedc470a00ceef252dfbc644f7ae20d776 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 16:23:24 -0700
Subject: [PATCH 11/27] h100 tests: move async TP compile test ahead of the Float8 test

---
 tests/integration_tests/h100.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tests/integration_tests/h100.py b/tests/integration_tests/h100.py
index a8478c5a0f..a695575379 100755
--- a/tests/integration_tests/h100.py
+++ b/tests/integration_tests/h100.py
@@ -19,17 +19,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
     same root config file.
     """
     integration_tests_flavors = [
-        OverrideDefinitions(
-            [
-                [
-                    "--model.converters float8",
-                    "--float8.enable_fsdp_float8_all_gather",
-                    "--float8.precompute_float8_dynamic_scale_for_fsdp",
-                ],
-            ],
-            "Float8 test",
-            "float8",
-        ),
         OverrideDefinitions(
             [
                 [
@@ -42,6 +31,17 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
             "8d_asynctp_compile",
             ngpu=8,
         ),
+        OverrideDefinitions(
+            [
+                [
+                    "--model.converters float8",
+                    "--float8.enable_fsdp_float8_all_gather",
+                    "--float8.precompute_float8_dynamic_scale_for_fsdp",
+                ],
+            ],
+            "Float8 test",
+            "float8",
+        ),
         OverrideDefinitions(
             [
                 [

From 39a6f6de387ab4653dd6012b591e3ca41a5bd0e0 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 18:25:05 -0700
Subject: [PATCH 12/27] Force-reinstall the torch 2.10.0.dev20250921+cu128 nightly wheel

---
 .github/workflows/integration_test_8gpu_h100.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 5f5825f70c..d8cdf05659 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -47,6 +47,8 @@ jobs:
         pip config --user set global.progress_bar off
 
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
+        
+        python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
 
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu128
 

From bc3ee83a87e635187b13408b7527d496be92a0c6 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 19:54:06 -0700
Subject: [PATCH 13/27] Force-reinstall the pytorch_triton 3.5.0 nightly wheel before torch

---
 .github/workflows/integration_test_8gpu_h100.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index d8cdf05659..b5f0f28a3d 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -48,6 +48,7 @@ jobs:
 
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
         
+        python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
         python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
 
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu128

From a97a0290f9ea2dfc6e914391fe3a7787a8151bd8 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 20:21:25 -0700
Subject: [PATCH 14/27] Comment out the unpinned torch nightly install

---
 .github/workflows/integration_test_8gpu_h100.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index b5f0f28a3d..8799501571 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -46,7 +46,7 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
+        # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
         
         python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
         python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl

From 3b049e7b98b8a7e933d6e0b269872c73e264423f Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 20:31:16 -0700
Subject: [PATCH 15/27] Comment out the direct torch and pytorch_triton wheel installs

---
 .github/workflows/integration_test_8gpu_h100.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 8799501571..7408f0775a 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -48,8 +48,9 @@ jobs:
 
         # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
         
-        python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
-        python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
+        # python -m pip install --force-reinstall torch==2.10.0.dev20250921+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128
+        # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
+        # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
 
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu128
 

From 535b8c317e3819156d023d76ff203a2750e4032b Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 20:31:40 -0700
Subject: [PATCH 16/27] Install torch==2.10.0.dev20250921+cu128 from the nightly index

---
 .github/workflows/integration_test_8gpu_h100.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 7408f0775a..532cb14555 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -48,7 +48,7 @@ jobs:
 
         # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
         
-        # python -m pip install --force-reinstall torch==2.10.0.dev20250921+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128
+        python -m pip install --force-reinstall torch==2.10.0.dev20250921+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128
         # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
         # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
 

From b30b85297188ba2c5633dded433c077c57bc874e Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 20:41:23 -0700
Subject: [PATCH 17/27] Try torch nightly 2.10.0.dev20250920+cu128

---
 .github/workflows/integration_test_8gpu_h100.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 532cb14555..767c5e09fd 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -48,7 +48,7 @@ jobs:
 
         # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
         
-        python -m pip install --force-reinstall torch==2.10.0.dev20250921+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128
+        python -m pip install --force-reinstall torch==2.10.0.dev20250920+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128
         # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
         # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
 

From 81f32d9b569431afcada5cc3b63647661410540b Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 20:48:26 -0700
Subject: [PATCH 18/27] Try torch nightly 2.10.0.dev20250919+cu128

---
 .github/workflows/integration_test_8gpu_h100.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 767c5e09fd..88ade945fa 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -48,7 +48,7 @@ jobs:
 
         # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
         
-        python -m pip install --force-reinstall torch==2.10.0.dev20250920+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128
+        python -m pip install --force-reinstall torch==2.10.0.dev20250919+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128
         # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
         # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
 

From b983c7d43f9220f5782d284eff3a41bea4caf6c1 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 21:01:58 -0700
Subject: [PATCH 19/27] Set gpu-arch-version back to 12.6 and request the +cu126 torch build

---
 .github/workflows/integration_test_8gpu_h100.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 88ade945fa..7f45ea896e 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -27,7 +27,7 @@ jobs:
     with:
       runner: linux.aws.h100.8
       gpu-arch-type: cuda
-      gpu-arch-version: "12.8"
+      gpu-arch-version: "12.6"
       # This image is faster to clone than the default, but it lacks CC needed by triton
       # (1m25s vs 2m37s).
       docker-image: torchtitan-ubuntu-20.04-clang12
@@ -48,7 +48,7 @@ jobs:
 
         # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
         
-        python -m pip install --force-reinstall torch==2.10.0.dev20250919+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128
+        python -m pip install --force-reinstall torch==2.10.0.dev20250919+cu126 --index-url https://download.pytorch.org/whl/nightly/cu128
         # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
         # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
 

From 823fff41dcb8fcdd5624ad2cbecc325ac4afc2ec Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 21:16:09 -0700
Subject: [PATCH 20/27] Docker: Ubuntu 20.04 and CUDA 12.6.1; install torch from the cu126 index

---
 .ci/docker/build.sh                               | 4 ++--
 .ci/docker/ubuntu/Dockerfile                      | 2 +-
 .github/workflows/integration_test_8gpu_h100.yaml | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index e87f1742dc..1d005d3661 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -13,7 +13,7 @@ shift
 echo "Building ${IMAGE_NAME} Docker image"
 
 OS=ubuntu
-OS_VERSION=22.04
+OS_VERSION=20.04
 CLANG_VERSION=""
 PYTHON_VERSION=3.12
 MINICONDA_VERSION=24.3.0-0
@@ -34,7 +34,7 @@ docker build \
   --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
   --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
-  --build-arg "CUDA_VERSION=12.8.1" \
+  --build-arg "CUDA_VERSION=12.6.1" \
   -f "${OS}"/Dockerfile \
   "$@" \
   .
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
index 35bedc77f9..fdfd4cb13a 100644
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -1,6 +1,6 @@
 ARG OS_VERSION
 
-FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu${OS_VERSION}
+FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu${OS_VERSION}
 
 ARG OS_VERSION
 
diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 7f45ea896e..8cf6fcbe6b 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -48,7 +48,7 @@ jobs:
 
         # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
         
-        python -m pip install --force-reinstall torch==2.10.0.dev20250919+cu126 --index-url https://download.pytorch.org/whl/nightly/cu128
+        python -m pip install --force-reinstall torch==2.10.0.dev20250919+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126
         # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
         # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
 

From 8260274ece5d5e1f8450ffcf348bf194a630d90b Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 21:45:57 -0700
Subject: [PATCH 21/27] Try torch nightly 2.10.0.dev20250917+cu126

---
 .github/workflows/integration_test_8gpu_h100.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 8cf6fcbe6b..cf00ec0d9a 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -48,7 +48,7 @@ jobs:
 
         # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
         
-        python -m pip install --force-reinstall torch==2.10.0.dev20250919+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126
+        python -m pip install --force-reinstall torch==2.10.0.dev20250917+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126
         # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
         # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
 

From 25d51097f99c9d7e4ce14c9e6ec4957cc384ab12 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 22:07:07 -0700
Subject: [PATCH 22/27] Install NVIDIA driver 580.65.06 and container toolkit in the H100 CI script

---
 .../workflows/integration_test_8gpu_h100.yaml | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index cf00ec0d9a..5881e0a67a 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -36,6 +36,28 @@ jobs:
       script: |
         set -eux
 
+				echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+
+				sudo killall nvidia-persistenced || true
+				sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-580.65.06.run"
+				set +e
+				sudo /bin/bash /tmp/nvidia_driver -s --no-drm
+				NVIDIA_INSTALLATION_STATUS=$?
+				sudo apt-get install -y nvidia-container-toolkit-1.17.8
+				sudo systemctl restart docker
+
+				# Fix https://github.com/NVIDIA/nvidia-docker/issues/1648 on runners with
+				# more than one GPUs. This just needs to be run once. The command fails
+				# on subsequent runs and complains that the mode is already on, but that's
+				# ok
+				nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
+			  NVIDIA_SMI_STATUS=$?
+
+				nvidia-smi
+				sudo nvidia-persistenced || true
+				# This should show persistence mode ON
+				nvidia-smi
+
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"

From 9580904312f796b36278dd73c757611e894d50c9 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 22:14:45 -0700
Subject: [PATCH 23/27] Replace tab indentation with spaces in the driver install steps

---
 .../workflows/integration_test_8gpu_h100.yaml | 42 +++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 5881e0a67a..12f75da9fc 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -36,27 +36,27 @@ jobs:
       script: |
         set -eux
 
-				echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
-
-				sudo killall nvidia-persistenced || true
-				sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-580.65.06.run"
-				set +e
-				sudo /bin/bash /tmp/nvidia_driver -s --no-drm
-				NVIDIA_INSTALLATION_STATUS=$?
-				sudo apt-get install -y nvidia-container-toolkit-1.17.8
-				sudo systemctl restart docker
-
-				# Fix https://github.com/NVIDIA/nvidia-docker/issues/1648 on runners with
-				# more than one GPUs. This just needs to be run once. The command fails
-				# on subsequent runs and complains that the mode is already on, but that's
-				# ok
-				nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
-			  NVIDIA_SMI_STATUS=$?
-
-				nvidia-smi
-				sudo nvidia-persistenced || true
-				# This should show persistence mode ON
-				nvidia-smi
+        echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+
+        sudo killall nvidia-persistenced || true
+        sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-580.65.06.run"
+        set +e
+        sudo /bin/bash /tmp/nvidia_driver -s --no-drm
+        NVIDIA_INSTALLATION_STATUS=$?
+        sudo apt-get install -y nvidia-container-toolkit-1.17.8
+        sudo systemctl restart docker
+
+        # Fix https://github.com/NVIDIA/nvidia-docker/issues/1648 on runners with
+        # more than one GPU. This just needs to be run once. The command fails
+        # on subsequent runs and complains that the mode is already on, but that's
+        # ok
+        nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
+        NVIDIA_SMI_STATUS=$?
+
+        nvidia-smi
+        sudo nvidia-persistenced || true
+        # This should show persistence mode ON
+        nvidia-smi
 
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")

From ddfada1036d4089af67a7c7c1130686b9f0519d3 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 22:25:16 -0700
Subject: [PATCH 24/27] Prefix the GPU_FLAG echo with sudo

---
 .github/workflows/integration_test_8gpu_h100.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 12f75da9fc..150688f05e 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -36,7 +36,7 @@ jobs:
       script: |
         set -eux
 
-        echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+        sudo echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
 
         sudo killall nvidia-persistenced || true
         sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-580.65.06.run"

From ea1609e17f90ed952ccb29384ed1f39af872c6ad Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 22:42:02 -0700
Subject: [PATCH 25/27] Comment out the GPU_FLAG echo to GITHUB_ENV

---
 .github/workflows/integration_test_8gpu_h100.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 150688f05e..6d21886934 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -36,7 +36,7 @@ jobs:
       script: |
         set -eux
 
-        sudo echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+        # sudo echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
 
         sudo killall nvidia-persistenced || true
         sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-580.65.06.run"

From a831210b3b3d579f643c7a3e9f9133ee95027ce5 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 23:04:02 -0700
Subject: [PATCH 26/27] train.py: call get_symm_mem_workspace on the WORLD group in __init__

---
 torchtitan/train.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torchtitan/train.py b/torchtitan/train.py
index 008a4eebba..02b9cadcc7 100644
--- a/torchtitan/train.py
+++ b/torchtitan/train.py
@@ -12,6 +12,7 @@
 
 import torch
 from torch.distributed.elastic.multiprocessing.errors import record
+from torch.distributed._symmetric_memory import get_symm_mem_workspace
 
 import torchtitan.protocols.train_spec as train_spec_module
 from torchtitan.components.checkpoint import CheckpointManager
@@ -93,6 +94,7 @@ def __init__(self, job_config: JobConfig):
             enable_cpu_backend=job_config.training.enable_cpu_offload,
             base_folder=job_config.job.dump_folder,
         )
+        symm_mem = get_symm_mem_workspace(torch.distributed.group.WORLD.group_name, min_size=100)
         world_size = int(os.environ["WORLD_SIZE"])
         parallelism_config = job_config.parallelism
         self.parallel_dims = parallel_dims = ParallelDims(

From 99c2fe30410810c9b0227965ef505cd04400c6c5 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 24 Sep 2025 23:15:48 -0700
Subject: [PATCH 27/27] train.py: raise symm mem workspace min_size from 100 to 1024*1024*64

---
 torchtitan/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchtitan/train.py b/torchtitan/train.py
index 02b9cadcc7..6406451bd3 100644
--- a/torchtitan/train.py
+++ b/torchtitan/train.py
@@ -94,7 +94,7 @@ def __init__(self, job_config: JobConfig):
             enable_cpu_backend=job_config.training.enable_cpu_offload,
             base_folder=job_config.job.dump_folder,
         )
-        symm_mem = get_symm_mem_workspace(torch.distributed.group.WORLD.group_name, min_size=100)
+        symm_mem = get_symm_mem_workspace(torch.distributed.group.WORLD.group_name, min_size=1024*1024*64)
         world_size = int(os.environ["WORLD_SIZE"])
         parallelism_config = job_config.parallelism
         self.parallel_dims = parallel_dims = ParallelDims(