From fe07fcd121c7488363a5835a185990f12bb612f6 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 11:35:39 -0700 Subject: [PATCH 01/27] [DONT REVIEW] Debug Async TP CI --- .github/workflows/integration_test_8gpu_h100.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 94a3c298b3..013ded3444 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -27,7 +27,7 @@ jobs: with: runner: linux.aws.h100.8 gpu-arch-type: cuda - gpu-arch-version: "12.6" + gpu-arch-version: "12.8" # This image is faster to clone than the default, but it lacks CC needed by triton # (1m25s vs 2m37s). docker-image: torchtitan-ubuntu-20.04-clang12 From e271e502abd553814ef1606e138a30c30240117f Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 11:45:21 -0700 Subject: [PATCH 02/27] misc --- .github/workflows/integration_test_8gpu_h100.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 013ded3444..d655c98879 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -46,9 +46,9 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 + python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 - USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu128 mkdir artifacts-to-be-uploaded From 056e37af3af1f4a30414ebf5559db8303ce2a053 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 13:34:43 -0700 Subject: [PATCH 03/27] test --- .ci/docker/build.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 34c1f0d45a..7139cdbedc 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -13,9 +13,9 @@ shift echo "Building ${IMAGE_NAME} Docker image" OS=ubuntu -OS_VERSION=20.04 +OS_VERSION=24.04 CLANG_VERSION="" -PYTHON_VERSION=3.11 +PYTHON_VERSION=3.12 MINICONDA_VERSION=24.3.0-0 case "${IMAGE_NAME}" in @@ -34,7 +34,7 @@ docker build \ --build-arg "CLANG_VERSION=${CLANG_VERSION}" \ --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \ - --shm-size=1g \ + --build-arg "CUDA_VERSION=12.8.1" \ -f "${OS}"/Dockerfile \ "$@" \ . From 0582a7e6fa0178bb66422b186a5fdde3ef1b9642 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 13:44:50 -0700 Subject: [PATCH 04/27] another test --- .ci/docker/ubuntu/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 39e4d8ec5f..35bedc77f9 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -1,6 +1,6 @@ ARG OS_VERSION -FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION} +FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu${OS_VERSION} ARG OS_VERSION From 55703de43506f2e1f83d0bab4189c554ebbd75dc Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 13:52:38 -0700 Subject: [PATCH 05/27] test --- .ci/docker/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 7139cdbedc..e87f1742dc 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -13,7 +13,7 @@ shift echo "Building ${IMAGE_NAME} Docker image" OS=ubuntu -OS_VERSION=24.04 +OS_VERSION=22.04 CLANG_VERSION="" PYTHON_VERSION=3.12 MINICONDA_VERSION=24.3.0-0 From 30cdb4123e8402f75378fac33a527385602490b3 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 14:57:09 -0700 Subject: [PATCH 06/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index d655c98879..7055e58b5d 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -53,4 +53,4 @@ jobs: mkdir artifacts-to-be-uploaded # Enable CPP stacktraces for debugging symmetric memory initialization errors. - TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8 + CUDA_LAUNCH_BLOCKING=1 TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8 From 130fae47d2f50705ee5f532ab0aff9af6311636d Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 15:43:35 -0700 Subject: [PATCH 07/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 7055e58b5d..46809134d7 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -53,4 +53,4 @@ jobs: mkdir artifacts-to-be-uploaded # Enable CPP stacktraces for debugging symmetric memory initialization errors. - CUDA_LAUNCH_BLOCKING=1 TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8 + USE_PYTORCH_KERNEL_CACHE=0 CUDA_LAUNCH_BLOCKING=1 TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8 From 565a79cb0ffa6d206606d14b5d45dfbc9eb66e71 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 15:55:25 -0700 Subject: [PATCH 08/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 46809134d7..5f5825f70c 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -52,5 +52,11 @@ jobs: mkdir artifacts-to-be-uploaded + free -h + + df -h + + nvidia-smi + # Enable CPP stacktraces for debugging symmetric memory initialization errors. USE_PYTORCH_KERNEL_CACHE=0 CUDA_LAUNCH_BLOCKING=1 TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8 From ee7cb527ca5a89ca5fa232cd31deb45653877602 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 16:13:53 -0700 Subject: [PATCH 09/27] test --- tests/integration_tests/h100.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/integration_tests/h100.py b/tests/integration_tests/h100.py index ae1fb5b597..a8478c5a0f 100755 --- a/tests/integration_tests/h100.py +++ b/tests/integration_tests/h100.py @@ -22,24 +22,25 @@ def build_h100_tests_list() -> list[OverrideDefinitions]: OverrideDefinitions( [ [ - "--compile.enable", - "--parallelism.tensor_parallel_degree 2", - "--parallelism.enable_async_tensor_parallel", + "--model.converters float8", + "--float8.enable_fsdp_float8_all_gather", + "--float8.precompute_float8_dynamic_scale_for_fsdp", ], ], - "2D async TP compile", - "2d_asynctp_compile", + "Float8 test", + "float8", ), OverrideDefinitions( [ [ - "--model.converters float8", - "--float8.enable_fsdp_float8_all_gather", - "--float8.precompute_float8_dynamic_scale_for_fsdp", + "--compile.enable", + "--parallelism.tensor_parallel_degree 8", + "--parallelism.enable_async_tensor_parallel", ], ], - "Float8 test", - "float8", + "8D async TP compile", + "8d_asynctp_compile", + ngpu=8, ), OverrideDefinitions( [ From 53ed590341c208f803f84320cd428823f75e8efe Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 16:22:01 -0700 Subject: [PATCH 10/27] test --- tests/integration_tests/models.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/integration_tests/models.py b/tests/integration_tests/models.py index 81336ab09a..85570fefc6 100755 --- a/tests/integration_tests/models.py +++ b/tests/integration_tests/models.py @@ -18,6 +18,18 @@ def build_model_tests_list() -> list[OverrideDefinitions]: A dictionary where each key is a model name and value is a list of OverrideDefinitions """ model_tests = [ + OverrideDefinitions( + [ + [ + "--compile.enable", + "--parallelism.tensor_parallel_degree 8", + "--parallelism.enable_async_tensor_parallel", + ], + ], + "8D async TP compile", + "8d_asynctp_compile", + ngpu=8, + ), # Integration Test Cases for DeepSeek-V3 OverrideDefinitions( [ From 98a379bedc470a00ceef252dfbc644f7ae20d776 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 16:23:24 -0700 Subject: [PATCH 11/27] test --- tests/integration_tests/h100.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/integration_tests/h100.py b/tests/integration_tests/h100.py index a8478c5a0f..a695575379 100755 --- a/tests/integration_tests/h100.py +++ b/tests/integration_tests/h100.py @@ -19,17 +19,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]: same root config file. """ integration_tests_flavors = [ - OverrideDefinitions( - [ - [ - "--model.converters float8", - "--float8.enable_fsdp_float8_all_gather", - "--float8.precompute_float8_dynamic_scale_for_fsdp", - ], - ], - "Float8 test", - "float8", - ), OverrideDefinitions( [ [ @@ -42,6 +31,17 @@ def build_h100_tests_list() -> list[OverrideDefinitions]: "8d_asynctp_compile", ngpu=8, ), + OverrideDefinitions( + [ + [ + "--model.converters float8", + "--float8.enable_fsdp_float8_all_gather", + "--float8.precompute_float8_dynamic_scale_for_fsdp", + ], + ], + "Float8 test", + "float8", + ), OverrideDefinitions( [ [ From 39a6f6de387ab4653dd6012b591e3ca41a5bd0e0 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 18:25:05 -0700 Subject: [PATCH 12/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 5f5825f70c..d8cdf05659 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -47,6 +47,8 @@ jobs: pip config --user set global.progress_bar off python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 + + python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu128 From bc3ee83a87e635187b13408b7527d496be92a0c6 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 19:54:06 -0700 Subject: [PATCH 13/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index d8cdf05659..b5f0f28a3d 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -48,6 +48,7 @@ jobs: python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 + python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu128 From a97a0290f9ea2dfc6e914391fe3a7787a8151bd8 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 20:21:25 -0700 Subject: [PATCH 14/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index b5f0f28a3d..8799501571 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -46,7 +46,7 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 + # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl From 3b049e7b98b8a7e933d6e0b269872c73e264423f Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 20:31:16 -0700 Subject: [PATCH 15/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 8799501571..7408f0775a 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -48,8 +48,9 @@ jobs: # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 - python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl + # python -m pip install --force-reinstall torch==2.10.0.dev20250921+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128 + # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu128 From 535b8c317e3819156d023d76ff203a2750e4032b Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 20:31:40 -0700 Subject: [PATCH 16/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 7408f0775a..532cb14555 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -48,7 +48,7 @@ jobs: # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 - # python -m pip install --force-reinstall torch==2.10.0.dev20250921+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128 + python -m pip install --force-reinstall torch==2.10.0.dev20250921+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128 # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl From b30b85297188ba2c5633dded433c077c57bc874e Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 20:41:23 -0700 Subject: [PATCH 17/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 532cb14555..767c5e09fd 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -48,7 +48,7 @@ jobs: # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 - python -m pip install --force-reinstall torch==2.10.0.dev20250921+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128 + python -m pip install --force-reinstall torch==2.10.0.dev20250920+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128 # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl From 81f32d9b569431afcada5cc3b63647661410540b Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 20:48:26 -0700 Subject: [PATCH 18/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 767c5e09fd..88ade945fa 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -48,7 +48,7 @@ jobs: # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 - python -m pip install --force-reinstall torch==2.10.0.dev20250920+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128 + python -m pip install --force-reinstall torch==2.10.0.dev20250919+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128 # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl From b983c7d43f9220f5782d284eff3a41bea4caf6c1 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 21:01:58 -0700 Subject: [PATCH 19/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 88ade945fa..7f45ea896e 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -27,7 +27,7 @@ jobs: with: runner: linux.aws.h100.8 gpu-arch-type: cuda - gpu-arch-version: "12.8" + gpu-arch-version: "12.6" # This image is faster to clone than the default, but it lacks CC needed by triton # (1m25s vs 2m37s). docker-image: torchtitan-ubuntu-20.04-clang12 @@ -48,7 +48,7 @@ jobs: # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 - python -m pip install --force-reinstall torch==2.10.0.dev20250919+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128 + python -m pip install --force-reinstall torch==2.10.0.dev20250919+cu126 --index-url https://download.pytorch.org/whl/nightly/cu128 # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl From 823fff41dcb8fcdd5624ad2cbecc325ac4afc2ec Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 21:16:09 -0700 Subject: [PATCH 20/27] test --- .ci/docker/build.sh | 4 ++-- .ci/docker/ubuntu/Dockerfile | 2 +- .github/workflows/integration_test_8gpu_h100.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index e87f1742dc..1d005d3661 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -13,7 +13,7 @@ shift echo "Building ${IMAGE_NAME} Docker image" OS=ubuntu -OS_VERSION=22.04 +OS_VERSION=20.04 CLANG_VERSION="" PYTHON_VERSION=3.12 MINICONDA_VERSION=24.3.0-0 @@ -34,7 +34,7 @@ docker build \ --build-arg "CLANG_VERSION=${CLANG_VERSION}" \ --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \ - --build-arg "CUDA_VERSION=12.8.1" \ + --build-arg "CUDA_VERSION=12.6.1" \ -f "${OS}"/Dockerfile \ "$@" \ . diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 35bedc77f9..fdfd4cb13a 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -1,6 +1,6 @@ ARG OS_VERSION -FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu${OS_VERSION} +FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu${OS_VERSION} ARG OS_VERSION diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 7f45ea896e..8cf6fcbe6b 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -48,7 +48,7 @@ jobs: # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 - python -m pip install --force-reinstall torch==2.10.0.dev20250919+cu126 --index-url https://download.pytorch.org/whl/nightly/cu128 + python -m pip install --force-reinstall torch==2.10.0.dev20250919+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126 # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl From 8260274ece5d5e1f8450ffcf348bf194a630d90b Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 21:45:57 -0700 Subject: [PATCH 21/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 8cf6fcbe6b..cf00ec0d9a 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -48,7 +48,7 @@ jobs: # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 - python -m pip install --force-reinstall torch==2.10.0.dev20250919+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126 + python -m pip install --force-reinstall torch==2.10.0.dev20250917+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126 # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl From 25d51097f99c9d7e4ce14c9e6ec4957cc384ab12 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 22:07:07 -0700 Subject: [PATCH 22/27] test --- .../workflows/integration_test_8gpu_h100.yaml | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index cf00ec0d9a..5881e0a67a 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -36,6 +36,28 @@ jobs: script: | set -eux + echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" + + sudo killall nvidia-persistenced || true + sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-580.65.06.run" + set +e + sudo /bin/bash /tmp/nvidia_driver -s --no-drm + NVIDIA_INSTALLATION_STATUS=$? + sudo apt-get install -y nvidia-container-toolkit-1.17.8 + sudo systemctl restart docker + + # Fix https://github.com/NVIDIA/nvidia-docker/issues/1648 on runners with + # more than one GPUs. This just needs to be run once. The command fails + # on subsequent runs and complains that the mode is already on, but that's + # ok + nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0 + NVIDIA_SMI_STATUS=$? + + nvidia-smi + sudo nvidia-persistenced || true + # This should show persistence mode ON + nvidia-smi + # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" From 9580904312f796b36278dd73c757611e894d50c9 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 22:14:45 -0700 Subject: [PATCH 23/27] test: --- .../workflows/integration_test_8gpu_h100.yaml | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 5881e0a67a..12f75da9fc 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -36,27 +36,27 @@ jobs: script: | set -eux - echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" - - sudo killall nvidia-persistenced || true - sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-580.65.06.run" - set +e - sudo /bin/bash /tmp/nvidia_driver -s --no-drm - NVIDIA_INSTALLATION_STATUS=$? - sudo apt-get install -y nvidia-container-toolkit-1.17.8 - sudo systemctl restart docker - - # Fix https://github.com/NVIDIA/nvidia-docker/issues/1648 on runners with - # more than one GPUs. This just needs to be run once. The command fails - # on subsequent runs and complains that the mode is already on, but that's - # ok - nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0 - NVIDIA_SMI_STATUS=$? - - nvidia-smi - sudo nvidia-persistenced || true - # This should show persistence mode ON - nvidia-smi + echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" + + sudo killall nvidia-persistenced || true + sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-580.65.06.run" + set +e + sudo /bin/bash /tmp/nvidia_driver -s --no-drm + NVIDIA_INSTALLATION_STATUS=$? + sudo apt-get install -y nvidia-container-toolkit-1.17.8 + sudo systemctl restart docker + + # Fix https://github.com/NVIDIA/nvidia-docker/issues/1648 on runners with + # more than one GPUs. This just needs to be run once. The command fails + # on subsequent runs and complains that the mode is already on, but that's + # ok + nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0 + NVIDIA_SMI_STATUS=$? + + nvidia-smi + sudo nvidia-persistenced || true + # This should show persistence mode ON + nvidia-smi # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") From ddfada1036d4089af67a7c7c1130686b9f0519d3 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 22:25:16 -0700 Subject: [PATCH 24/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 12f75da9fc..150688f05e 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -36,7 +36,7 @@ jobs: script: | set -eux - echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" + sudo echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" sudo killall nvidia-persistenced || true sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-580.65.06.run" From ea1609e17f90ed952ccb29384ed1f39af872c6ad Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 22:42:02 -0700 Subject: [PATCH 25/27] test --- .github/workflows/integration_test_8gpu_h100.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 150688f05e..6d21886934 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -36,7 +36,7 @@ jobs: script: | set -eux - sudo echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" + # sudo echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" sudo killall nvidia-persistenced || true sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-580.65.06.run" From a831210b3b3d579f643c7a3e9f9133ee95027ce5 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 23:04:02 -0700 Subject: [PATCH 26/27] test --- torchtitan/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchtitan/train.py b/torchtitan/train.py index 008a4eebba..02b9cadcc7 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -12,6 +12,7 @@ import torch from torch.distributed.elastic.multiprocessing.errors import record +from torch.distributed._symmetric_memory import get_symm_mem_workspace import torchtitan.protocols.train_spec as train_spec_module from torchtitan.components.checkpoint import CheckpointManager @@ -93,6 +94,7 @@ def __init__(self, job_config: JobConfig): enable_cpu_backend=job_config.training.enable_cpu_offload, base_folder=job_config.job.dump_folder, ) + symm_mem = get_symm_mem_workspace(torch.distributed.group.WORLD.group_name, min_size=100) world_size = int(os.environ["WORLD_SIZE"]) parallelism_config = job_config.parallelism self.parallel_dims = parallel_dims = ParallelDims( From 99c2fe30410810c9b0227965ef505cd04400c6c5 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 23:15:48 -0700 Subject: [PATCH 27/27] test --- torchtitan/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtitan/train.py b/torchtitan/train.py index 02b9cadcc7..6406451bd3 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -94,7 +94,7 @@ def __init__(self, job_config: JobConfig): enable_cpu_backend=job_config.training.enable_cpu_offload, base_folder=job_config.job.dump_folder, ) - symm_mem = get_symm_mem_workspace(torch.distributed.group.WORLD.group_name, min_size=100) + symm_mem = get_symm_mem_workspace(torch.distributed.group.WORLD.group_name, min_size=1024*1024*64) world_size = int(os.environ["WORLD_SIZE"]) parallelism_config = job_config.parallelism self.parallel_dims = parallel_dims = ParallelDims(