diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 34c1f0d45a..1d005d3661 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -15,7 +15,7 @@ echo "Building ${IMAGE_NAME} Docker image" OS=ubuntu OS_VERSION=20.04 CLANG_VERSION="" -PYTHON_VERSION=3.11 +PYTHON_VERSION=3.12 MINICONDA_VERSION=24.3.0-0 case "${IMAGE_NAME}" in @@ -34,7 +34,7 @@ docker build \ --build-arg "CLANG_VERSION=${CLANG_VERSION}" \ --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \ - --shm-size=1g \ + --build-arg "CUDA_VERSION=12.6.1" \ -f "${OS}"/Dockerfile \ "$@" \ . diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 39e4d8ec5f..fdfd4cb13a 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -1,6 +1,6 @@ ARG OS_VERSION -FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION} +FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu${OS_VERSION} ARG OS_VERSION diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 94a3c298b3..6d21886934 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -36,6 +36,28 @@ jobs: script: | set -eux + # sudo echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" + + sudo killall nvidia-persistenced || true + sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-580.65.06.run" + set +e + sudo /bin/bash /tmp/nvidia_driver -s --no-drm + NVIDIA_INSTALLATION_STATUS=$? + sudo apt-get install -y nvidia-container-toolkit-1.17.8 + sudo systemctl restart docker + + # Fix https://github.com/NVIDIA/nvidia-docker/issues/1648 on runners with + # more than one GPUs. This just needs to be run once. The command fails + # on subsequent runs and complains that the mode is already on, but that's + # ok + nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0 + NVIDIA_SMI_STATUS=$? + + nvidia-smi + sudo nvidia-persistenced || true + # This should show persistence mode ON + nvidia-smi + # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" @@ -46,11 +68,21 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 + # python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 + + python -m pip install --force-reinstall torch==2.10.0.dev20250917+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126 + # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + # python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl - USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu128 mkdir artifacts-to-be-uploaded + free -h + + df -h + + nvidia-smi + # Enable CPP stacktraces for debugging symmetric memory initialization errors. - TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8 + USE_PYTORCH_KERNEL_CACHE=0 CUDA_LAUNCH_BLOCKING=1 TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8 diff --git a/tests/integration_tests/h100.py b/tests/integration_tests/h100.py index ae1fb5b597..a695575379 100755 --- a/tests/integration_tests/h100.py +++ b/tests/integration_tests/h100.py @@ -23,12 +23,13 @@ def build_h100_tests_list() -> list[OverrideDefinitions]: [ [ "--compile.enable", - "--parallelism.tensor_parallel_degree 2", + "--parallelism.tensor_parallel_degree 8", "--parallelism.enable_async_tensor_parallel", ], ], - "2D async TP compile", - "2d_asynctp_compile", + "8D async TP compile", + "8d_asynctp_compile", + ngpu=8, ), OverrideDefinitions( [ diff --git a/tests/integration_tests/models.py b/tests/integration_tests/models.py index 81336ab09a..85570fefc6 100755 --- a/tests/integration_tests/models.py +++ b/tests/integration_tests/models.py @@ -18,6 +18,18 @@ def build_model_tests_list() -> list[OverrideDefinitions]: A dictionary where each key is a model name and value is a list of OverrideDefinitions """ model_tests = [ + OverrideDefinitions( + [ + [ + "--compile.enable", + "--parallelism.tensor_parallel_degree 8", + "--parallelism.enable_async_tensor_parallel", + ], + ], + "8D async TP compile", + "8d_asynctp_compile", + ngpu=8, + ), # Integration Test Cases for DeepSeek-V3 OverrideDefinitions( [ diff --git a/torchtitan/train.py b/torchtitan/train.py index 008a4eebba..6406451bd3 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -12,6 +12,7 @@ import torch from torch.distributed.elastic.multiprocessing.errors import record +from torch.distributed._symmetric_memory import get_symm_mem_workspace import torchtitan.protocols.train_spec as train_spec_module from torchtitan.components.checkpoint import CheckpointManager @@ -93,6 +94,7 @@ def __init__(self, job_config: JobConfig): enable_cpu_backend=job_config.training.enable_cpu_offload, base_folder=job_config.job.dump_folder, ) + symm_mem = get_symm_mem_workspace(torch.distributed.group.WORLD.group_name, min_size=1024*1024*64) world_size = int(os.environ["WORLD_SIZE"]) parallelism_config = job_config.parallelism self.parallel_dims = parallel_dims = ParallelDims(