Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .ci/docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ echo "Building ${IMAGE_NAME} Docker image"
OS=ubuntu
OS_VERSION=20.04
CLANG_VERSION=""
PYTHON_VERSION=3.11
PYTHON_VERSION=3.12
MINICONDA_VERSION=24.3.0-0

case "${IMAGE_NAME}" in
Expand All @@ -34,7 +34,7 @@ docker build \
--build-arg "CLANG_VERSION=${CLANG_VERSION}" \
--build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
--build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
--shm-size=1g \
--build-arg "CUDA_VERSION=12.6.1" \
-f "${OS}"/Dockerfile \
"$@" \
.
2 changes: 1 addition & 1 deletion .ci/docker/ubuntu/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
ARG OS_VERSION

FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu${OS_VERSION}

ARG OS_VERSION

Expand Down
38 changes: 35 additions & 3 deletions .github/workflows/integration_test_8gpu_h100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,28 @@ jobs:
script: |
set -eux

# sudo echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"

sudo killall nvidia-persistenced || true
sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-580.65.06.run"
set +e
sudo /bin/bash /tmp/nvidia_driver -s --no-drm
NVIDIA_INSTALLATION_STATUS=$?
sudo apt-get install -y nvidia-container-toolkit-1.17.8
sudo systemctl restart docker

# Fix https://github.com/NVIDIA/nvidia-docker/issues/1648 on runners with
# more than one GPU. This only needs to be run once. The command fails
# on subsequent runs and complains that the mode is already on, but that
# is OK.
nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
NVIDIA_SMI_STATUS=$?

nvidia-smi
sudo nvidia-persistenced || true
# This should show persistence mode ON
nvidia-smi

# The generic Linux job uses the base env, not the one set up by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
Expand All @@ -46,11 +68,21 @@ jobs:

pip config --user set global.progress_bar off

python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
# python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128

python -m pip install --force-reinstall torch==2.10.0.dev20250917+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126
# python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/pytorch_triton-3.5.0%2Bgit5ae38bdb-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
# python -m pip install --force-reinstall https://download.pytorch.org/whl/nightly/cu128/torch-2.10.0.dev20250921%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl

USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu128

mkdir artifacts-to-be-uploaded

free -h

df -h

nvidia-smi

# Enable CPP stacktraces for debugging symmetric memory initialization errors.
TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8
USE_PYTORCH_KERNEL_CACHE=0 CUDA_LAUNCH_BLOCKING=1 TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8
7 changes: 4 additions & 3 deletions tests/integration_tests/h100.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,13 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
[
[
"--compile.enable",
"--parallelism.tensor_parallel_degree 2",
"--parallelism.tensor_parallel_degree 8",
"--parallelism.enable_async_tensor_parallel",
],
],
"2D async TP compile",
"2d_asynctp_compile",
"8D async TP compile",
"8d_asynctp_compile",
ngpu=8,
),
OverrideDefinitions(
[
Expand Down
12 changes: 12 additions & 0 deletions tests/integration_tests/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,18 @@ def build_model_tests_list() -> list[OverrideDefinitions]:
A dictionary where each key is a model name and value is a list of OverrideDefinitions
"""
model_tests = [
OverrideDefinitions(
[
[
"--compile.enable",
"--parallelism.tensor_parallel_degree 8",
"--parallelism.enable_async_tensor_parallel",
],
],
"8D async TP compile",
"8d_asynctp_compile",
ngpu=8,
),
# Integration Test Cases for DeepSeek-V3
OverrideDefinitions(
[
Expand Down
2 changes: 2 additions & 0 deletions torchtitan/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import torch
from torch.distributed.elastic.multiprocessing.errors import record
from torch.distributed._symmetric_memory import get_symm_mem_workspace

import torchtitan.protocols.train_spec as train_spec_module
from torchtitan.components.checkpoint import CheckpointManager
Expand Down Expand Up @@ -93,6 +94,7 @@ def __init__(self, job_config: JobConfig):
enable_cpu_backend=job_config.training.enable_cpu_offload,
base_folder=job_config.job.dump_folder,
)
symm_mem = get_symm_mem_workspace(torch.distributed.group.WORLD.group_name, min_size=1024*1024*64)
world_size = int(os.environ["WORLD_SIZE"])
parallelism_config = job_config.parallelism
self.parallel_dims = parallel_dims = ParallelDims(
Expand Down
Loading