diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 34c1f0d45a..1d005d3661 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -15,7 +15,7 @@ echo "Building ${IMAGE_NAME} Docker image"
 OS=ubuntu
 OS_VERSION=20.04
 CLANG_VERSION=""
-PYTHON_VERSION=3.11
+PYTHON_VERSION=3.12
 MINICONDA_VERSION=24.3.0-0
 
 case "${IMAGE_NAME}" in
@@ -34,7 +34,7 @@ docker build \
   --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
   --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
-  --shm-size=1g \
+  --build-arg "CUDA_VERSION=12.6.1" \
   -f "${OS}"/Dockerfile \
   "$@" \
   .
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
index 39e4d8ec5f..fdfd4cb13a 100644
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -1,6 +1,6 @@
 ARG OS_VERSION
 
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
+FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu${OS_VERSION}
 
 ARG OS_VERSION
 
diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 94a3c298b3..6d21886934 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -36,6 +36,28 @@ jobs:
       script: |
         set -eux
 
+        # sudo echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+
+        sudo killall nvidia-persistenced || true
+        sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-580.65.06.run"
+        set +e
+        sudo /bin/bash /tmp/nvidia_driver -s --no-drm
+        NVIDIA_INSTALLATION_STATUS=$?
+        sudo apt-get install -y nvidia-container-toolkit-1.17.8
+        sudo systemctl restart docker
+
+        # Fix https://github.com/NVIDIA/nvidia-docker/issues/1648 on runners with
+        # more than one GPU by starting nvidia-persistenced (see below). This only
+        # needs to be run once. The command fails on subsequent runs and complains
+        # that the mode is already on, but that's OK.
+        nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
+        NVIDIA_SMI_STATUS=$?
+
+        nvidia-smi
+        sudo nvidia-persistenced || true
+        # This should show persistence mode ON
+        nvidia-smi
+
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
@@ -46,11 +68,21 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
+        
+        python -m pip install --force-reinstall torch==2.10.0.dev20250917+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126
+        # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
+        # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu128
 
         mkdir artifacts-to-be-uploaded
 
+        free -h
+
+        df -h
+
+        nvidia-smi
+
         # Enable CPP stacktraces for debugging symmetric memory initialization errors.
-        TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8
+        USE_PYTORCH_KERNEL_CACHE=0 CUDA_LAUNCH_BLOCKING=1 TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8
diff --git a/tests/integration_tests/h100.py b/tests/integration_tests/h100.py
index ae1fb5b597..a695575379 100755
--- a/tests/integration_tests/h100.py
+++ b/tests/integration_tests/h100.py
@@ -23,12 +23,13 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
             [
                 [
                     "--compile.enable",
-                    "--parallelism.tensor_parallel_degree 2",
+                    "--parallelism.tensor_parallel_degree 8",
                     "--parallelism.enable_async_tensor_parallel",
                 ],
             ],
-            "2D async TP compile",
-            "2d_asynctp_compile",
+            "8D async TP compile",
+            "8d_asynctp_compile",
+            ngpu=8,
         ),
         OverrideDefinitions(
             [
diff --git a/tests/integration_tests/models.py b/tests/integration_tests/models.py
index 81336ab09a..85570fefc6 100755
--- a/tests/integration_tests/models.py
+++ b/tests/integration_tests/models.py
@@ -18,6 +18,18 @@ def build_model_tests_list() -> list[OverrideDefinitions]:
         A dictionary where each key is a model name and value is a list of OverrideDefinitions
     """
     model_tests = [
+        OverrideDefinitions(
+            [
+                [
+                    "--compile.enable",
+                    "--parallelism.tensor_parallel_degree 8",
+                    "--parallelism.enable_async_tensor_parallel",
+                ],
+            ],
+            "8D async TP compile",
+            "8d_asynctp_compile",
+            ngpu=8,
+        ),
         # Integration Test Cases for DeepSeek-V3
         OverrideDefinitions(
             [
diff --git a/torchtitan/train.py b/torchtitan/train.py
index 008a4eebba..6406451bd3 100644
--- a/torchtitan/train.py
+++ b/torchtitan/train.py
@@ -12,6 +12,7 @@
 
 import torch
 from torch.distributed.elastic.multiprocessing.errors import record
+from torch.distributed._symmetric_memory import get_symm_mem_workspace
 
 import torchtitan.protocols.train_spec as train_spec_module
 from torchtitan.components.checkpoint import CheckpointManager
@@ -93,6 +94,7 @@ def __init__(self, job_config: JobConfig):
             enable_cpu_backend=job_config.training.enable_cpu_offload,
             base_folder=job_config.job.dump_folder,
         )
+        symm_mem = get_symm_mem_workspace(torch.distributed.group.WORLD.group_name, min_size=1024*1024*64)
         world_size = int(os.environ["WORLD_SIZE"])
         parallelism_config = job_config.parallelism
         self.parallel_dims = parallel_dims = ParallelDims(