1 change: 1 addition & 0 deletions .gitignore
@@ -60,6 +60,7 @@ expected_outputs/
testdata/
actual_outputs/
*.html
*.zip

# IDE/project-specific folders
.vscode/
31 changes: 19 additions & 12 deletions config/multi_node/shampoo_opt_multi_node.yaml
Collaborator comment: We need these same changes in config/multi_node/distributed_multinode.yaml

@@ -6,19 +6,28 @@ logging:
level: INFO

training:
epochs: 10
epochs: 100
batch_size: 512
gradient_accumulation: 2
mixed_precision: bf16
max_steps: 2200
max_steps: 4400
grad_clip_norm: 1.0
output_dir: artifacts/user_shampoo
log_interval: 20
log_interval: 5
additional_compute_streams: 2
lightweight_op_waves: 3

# Warmup settings to prevent RCCL hangs in multi-node training
warmup:
# RCCL communicator warmup - runs all_reduce on process groups before FSDP init
enable_rccl_warmup: true
rccl_warmup_iterations: 5
# Training warmup - runs forward/backward/optimizer steps before main loop
enable_training_warmup: true
training_warmup_steps: 1

optimizer:
name: shampoo
name: adamW
lr: 0.0002
weight_decay: 0.01
betas: [0.9, 0.985]
@@ -35,7 +44,7 @@ dataset:
sparse_features: 64
vocab_size: 350000
num_dense_features: 32
seed: 2025
seed: 42

model:
vocab_size: 350000
@@ -94,19 +103,17 @@ dataloader:
pin_memory: true

profiling:
enabled: true
wait: 2
warmup: 2
active: 6
enabled: false
wait: 0
warmup: 0
active: 20
repeat: 1
record_shapes: true
profile_memory: true
with_stack: false
with_flops: false
# tensorboard: true
# chrome_trace: true
tensorboard: false
chrome_trace: false
chrome_trace: true
trace_filename: user_shampoo.json

tracelens:
3 changes: 2 additions & 1 deletion docker/docker-compose.rocm70_9-1-shampoo.yaml
@@ -22,7 +22,8 @@ services:

volumes:
- /home/manrao:/manrao
- /home/oyazdanb/aorta:/workspace/aorta
- /apps/oyazdanb/aorta:/workspace/aorta
- /apps/oyazdanb/rccl:/rccl
devices:
- /dev/kfd
- /dev/dri
28 changes: 28 additions & 0 deletions scripts/multi_node/README.md
Collaborator comment: You also need to change scripts/multi_node/setup_multi_node.sh to take AORTA_PATH from the scripts directory; currently it expects it at $HOME.

@@ -211,6 +211,34 @@ done
| NCCL timeout | Update `NCCL_SOCKET_IFNAME` in `set_env_variables.sh` |
| World size mismatch | Check `rocm-smi --showid \| wc -l`, adjust `--nproc` |

### Training Hangs at RCCL Initialization

If training hangs at "Warming up global world group..." or during FSDP initialization:

1. **Ensure NCCL environment variables are set** in `set_env_variables.sh` (sourced by `local_launch.sh`):
   - `NCCL_SOCKET_IFNAME` and `TORCH_NCCL_DUMP_ON_TIMEOUT=1` are critical
   - See the full set passed to the container in the `DOCKER_EXEC` section of `local_launch.sh`

2. **Enable warmup settings** in your config YAML:

```yaml
warmup:
# RCCL communicator warmup - runs all_reduce before FSDP init
enable_rccl_warmup: true
rccl_warmup_iterations: 5
# Training warmup - runs forward/backward/optimizer before main loop
enable_training_warmup: true
training_warmup_steps: 1
```

3. **Debug with NCCL logging**:
```bash
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL
```

The warmup settings exercise the RCCL communicators before the main training loop starts, preventing race conditions during inter-node RDMA setup with the HYBRID_SHARD strategy.
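
For reference, a minimal sketch of what such a communicator warmup could look like (the `warmup_rccl` helper and its arguments are illustrative, not this repository's actual implementation):

```python
# Illustrative only: run a tiny all_reduce on each process group so the RCCL
# communicators are created before FSDP initialization touches them.
import torch
import torch.distributed as dist


def warmup_rccl(process_groups, iterations: int = 5) -> None:
    device = torch.device("cuda", torch.cuda.current_device())
    for _ in range(iterations):
        for group in process_groups:
            buf = torch.ones(1, device=device)
            dist.all_reduce(buf, group=group)
    # Ensure all warmup collectives finish before FSDP init proceeds.
    torch.cuda.synchronize()
```

The training warmup is analogous: it runs a small number of forward/backward/optimizer steps before the main loop so that lazily created collectives and buffers are exercised up front.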

---

## NCCL Configuration
4 changes: 4 additions & 0 deletions scripts/multi_node/config_node.sh
@@ -21,6 +21,10 @@ ROCPROF_INPUT=$(echo "${15}" | sed 's/"//g')
DOCKER_CONTAINER="${DOCKER_CONTAINER:-$(echo "${16}" | sed 's/"//g')}"
DOCKER_CONTAINER="${DOCKER_CONTAINER:-training-overlap-bugs-rocm70_9-1}"

echo "============================================"
echo "DEBUG: Received ${16} parameters"
echo "DEBUG: Param 16 (DOCKER_CONTAINER) = '${16}'"
echo "DEBUG: After processing = '$DOCKER_CONTAINER'"
echo "============================================"
echo "Node Configuration"
echo "============================================"
40 changes: 26 additions & 14 deletions scripts/multi_node/local_launch.sh
@@ -1,6 +1,9 @@
#!/bin/bash
# Multi-node local launch script for GEMM training
# Runs on each node with single channel/thread configuration
#
# NCCL/RCCL environment variables are sourced from set_env_variables.sh
# Edit that file to change NCCL configuration - no need to modify this script.

if [[ $# -lt 11 ]]; then
echo "Usage: $0 <NODE_RANK> <NODE_IP> <MASTER_IP> <MASTER_PORT> <NNODES> <WORLD_SIZE> <EXPERIMENT_DIR> <CONFIG_FILE> <NPROC_PER_NODE> <CHANNELS> <THREADS> [ENABLE_ROCPROF] [ROCPROF_STATS] [ROCPROF_INPUT] [DOCKER_CONTAINER]"
@@ -23,6 +26,16 @@ ROCPROF_STATS="${13:-false}"
ROCPROF_INPUT="${14:-}"
DOCKER_CONTAINER="${15:-training-overlap-bugs-rocm70_9-1}"

# Source environment variables (should already be sourced by config_node.sh, but ensure it's loaded)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ -f "$SCRIPT_DIR/set_env_variables.sh" ]]; then
source "$SCRIPT_DIR/set_env_variables.sh"
fi

# Override channel/thread settings from command line arguments
export NCCL_MAX_NCHANNELS="${CHANNELS}"
export RCCL_THREADS_PER_BLOCK="${THREADS}"

echo "=========================================="
echo "Local Launch Configuration"
echo "=========================================="
@@ -37,7 +50,6 @@ echo "Experiment Dir: $EXPERIMENT_DIR"
echo "Config File: $CONFIG_FILE"
echo "Channels: $CHANNELS"
echo "Threads: $THREADS"
echo "Docker Container: $DOCKER_CONTAINER"
echo "rocprof enabled: $ENABLE_ROCPROF"
echo "=========================================="
echo ""
@@ -60,25 +72,22 @@ else
CONFIG_FILE_DOCKER="$CONFIG_FILE"
fi

# Log file
LOG_FILE="${OUTPUT_DIR}/node_${NODE_RANK}_output.log"

# Function to log with timestamp
log() {
local message="$1"
local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
echo "[${timestamp}] [Node ${NODE_RANK}] ${message}" | tee -a "${LOG_FILE}"
echo "[${timestamp}] [Node ${NODE_RANK}] ${message}"
}

# Cleanup function
cleanup() {
echo ""
echo "=== Caught interrupt signal ===" | tee -a "${LOG_FILE}"
echo "=== Caught interrupt signal ==="
log "Cleaning up training processes on node ${NODE_RANK}..."

# Try to kill processes inside Docker container
docker exec "$DOCKER_CONTAINER" pkill -9 -f "train.py" 2>/dev/null || true
docker exec "$DOCKER_CONTAINER" pkill -9 -f "torchrun" 2>/dev/null || true
docker exec training-overlap-bugs-rocm70_9-1 pkill -9 -f "train.py" 2>/dev/null || true
docker exec training-overlap-bugs-rocm70_9-1 pkill -9 -f "torchrun" 2>/dev/null || true

# Also try on host (in case anything leaked)
sudo pkill -9 -f "train.py" 2>/dev/null || true
@@ -109,12 +118,15 @@ BASE_CMD="torchrun --nnodes ${NNODES} --node_rank ${NODE_RANK} --nproc_per_node
BASE_OVERRIDES="--override profiling.tensorboard=false"

# Build docker exec prefix with environment variables
DOCKER_EXEC="docker exec \
-e RCCL_THREADS_PER_BLOCK=${THREADS} \
-e NCCL_MAX_NCHANNELS=${CHANNELS} \
-e HSA_ENABLE_SDMA=0 \
-e PYTORCH_ROCM_PROFILER_ENABLE_TRACING=1 \
${DOCKER_CONTAINER}"
# All NCCL/RCCL variables are defined in set_env_variables.sh
DOCKER_ENV_FLAGS=$(build_docker_env_flags)
DOCKER_EXEC="docker exec ${DOCKER_ENV_FLAGS} ${DOCKER_CONTAINER}"

# Log which env vars are being passed
log "Docker environment variables:"
for var in "${DOCKER_ENV_VARS[@]}"; do
log " ${var}=${!var}"
done

# Run with or without rocprofv3
if [ "${ENABLE_ROCPROF}" = "true" ]; then
5 changes: 0 additions & 5 deletions scripts/multi_node/master_launch.sh
@@ -1,11 +1,6 @@
#!/bin/bash
# Multi-node orchestration script for Aorta GEMM training
# Adapted from DLRM master_launch.sh pattern
#
# TODO: Convert to SLURM-native launch using srun instead of SSH to individual nodes.
# Currently this script runs from a compute node and SSHs to other nodes.
# Ideally, we should run SLURM commands from the login node, which would
# eliminate the need for SSH connectivity checks and branch verification.

usage() {
echo "Usage: $0 [OPTIONS]"
123 changes: 101 additions & 22 deletions scripts/multi_node/set_env_variables.sh
@@ -1,42 +1,121 @@
#!/bin/bash
# =============================================================================
# Global NCCL/RCCL environment variables for multi-node training
# Based on DLRM_set_env_variables.sh
# Configured for MI350X cluster
#
# This file is the SINGLE SOURCE OF TRUTH for all NCCL/RCCL configuration.
# Edit variables here - local_launch.sh will automatically pick them up.
# =============================================================================

# NCCL Debug Settings (use INFO for debugging network issues)
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=INIT,NET
# Try disabling IB if InfiniBand is not properly configured
export NCCL_IB_DISABLE=1
# -----------------------------------------------------------------------------
# NCCL Debug Settings
# -----------------------------------------------------------------------------
export NCCL_DEBUG=WARN
export NCCL_DEBUG_SUBSYS= # Options: COLL,INIT,NET (empty = none)

# IB/RNIC Configuration (commented out when IB is disabled)
# export NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7
# export NCCL_IB_GID_INDEX=3
# -----------------------------------------------------------------------------
# RCCL-Specific Settings (ROCm)
# -----------------------------------------------------------------------------
export RCCL_DIRECT_ALLGATHER_DISABLE=1 # Disable direct allgather
export RCCL_MSCCL_ENABLE=0 # Disable MSCCL
export RCCL_THREADS_PER_BLOCK=256 # Threads per block (override via --threads)

# -----------------------------------------------------------------------------
# IB/RNIC Configuration for MI350X
# -----------------------------------------------------------------------------
export NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7
export NCCL_IB_GID_INDEX=3
export NCCL_NCHANNELS_PER_NET_PEER=8

# -----------------------------------------------------------------------------
# HSA Settings for ROCm
# -----------------------------------------------------------------------------
export HSA_ENABLE_IPC_MODE_LEGACY=1
export HSA_ENABLE_SDMA=0 # Disable SDMA for stability

# NCCL Protocol
# -----------------------------------------------------------------------------
# NCCL Protocol and Channels
# -----------------------------------------------------------------------------
export NCCL_PROTO=Simple

# Channel Configuration (can be overridden by sweep parameters)
export NCCL_MIN_NCHANNELS=40
export NCCL_MAX_NCHANNELS=40
export NCCL_MAX_NCHANNELS=40 # Override via --channels

# Network Interface
# Change this to match your network interface: eth0, ib0, enp49s0f0np0, etc.
# Temporarily commented out for auto-detection:
# export NCCL_SOCKET_IFNAME=enp193s0f0
# -----------------------------------------------------------------------------
# Network Interface for MI350X cluster
# -----------------------------------------------------------------------------
export NCCL_SOCKET_IFNAME=enp49s0f0np0,fenic0

# -----------------------------------------------------------------------------
# Timeout and Error Handling
# -----------------------------------------------------------------------------
export NCCL_TIMEOUT_MS=12000 # 12 second timeout
export TORCH_DIST_INIT_TIMEOUT=60
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export TORCH_NCCL_TRACE_BUFFER_SIZE=10000
export TORCH_NCCL_DUMP_ON_TIMEOUT=1 # Critical for hang debugging

# -----------------------------------------------------------------------------
# PyTorch ROCm Profiler
# -----------------------------------------------------------------------------
export PYTORCH_ROCM_PROFILER_ENABLE_TRACING=1

# Optional: Force non-overlap for debugging
# -----------------------------------------------------------------------------
# List of environment variables to pass to Docker container
# Add/remove variables here to control what gets passed through
# -----------------------------------------------------------------------------
DOCKER_ENV_VARS=(
# NCCL Debug
NCCL_DEBUG
NCCL_DEBUG_SUBSYS
# RCCL
RCCL_DIRECT_ALLGATHER_DISABLE
RCCL_MSCCL_ENABLE
RCCL_THREADS_PER_BLOCK
# IB/RNIC
NCCL_IB_HCA
NCCL_IB_GID_INDEX
NCCL_NCHANNELS_PER_NET_PEER
# HSA
HSA_ENABLE_IPC_MODE_LEGACY
HSA_ENABLE_SDMA
# Protocol/Channels
NCCL_PROTO
NCCL_MIN_NCHANNELS
NCCL_MAX_NCHANNELS
# Network
NCCL_SOCKET_IFNAME
# Timeout/Error Handling
NCCL_TIMEOUT_MS
TORCH_DIST_INIT_TIMEOUT
TORCH_NCCL_ASYNC_ERROR_HANDLING
TORCH_NCCL_TRACE_BUFFER_SIZE
TORCH_NCCL_DUMP_ON_TIMEOUT
# Profiler
PYTORCH_ROCM_PROFILER_ENABLE_TRACING
)
export DOCKER_ENV_VARS

# -----------------------------------------------------------------------------
# Helper function: Build docker -e flags from DOCKER_ENV_VARS
# Usage: DOCKER_ENV_FLAGS=$(build_docker_env_flags)
# -----------------------------------------------------------------------------
build_docker_env_flags() {
local flags=""
for var in "${DOCKER_ENV_VARS[@]}"; do
local value="${!var}"
flags+=" -e ${var}=${value}"
done
echo "$flags"
}
export -f build_docker_env_flags

# =============================================================================
# Optional settings (uncomment to enable)
# =============================================================================

# Force non-overlap for debugging (single HW queue)
# export GPU_MAX_HW_QUEUES=1
# unset TORCH_NCCL_HIGH_PRIORITY

# Optional: Disable SDMA for testing
# export HSA_ENABLE_SDMA=0

# Optional: Disable IB for Ethernet-only testing
# Disable IB for Ethernet-only testing
# export NCCL_IB_DISABLE=1
6 changes: 5 additions & 1 deletion src/aorta/profiling/stream_profiler.py
@@ -37,7 +37,11 @@ class MarkerRecord:
class StreamProfiler:
"""Track activity across multiple CUDA/HIP streams with precise timing."""

def __init__(self, device: torch.device, stream_names: Optional[Iterable[StreamName]] = None) -> None:
def __init__(
self,
device: torch.device,
stream_names: Optional[Iterable[StreamName]] = None,
) -> None:
if not torch.cuda.is_available(): # pragma: no cover - runtime guard
raise RuntimeError("StreamProfiler requires CUDA/HIP availability")
