diff --git a/.gitignore b/.gitignore index 0aaa6d8..b40cab9 100644 --- a/.gitignore +++ b/.gitignore @@ -42,19 +42,25 @@ checkpoints/ runs/ logs/ tensorboard/ +experiments/ +artifacts/ +artifacts_*/ +overlap_debug*/ # Experiment outputs and traces -experiments/ *.json +*.jsonl *.xlsx +*.log +*.logs *_merged_enhanced.json nccl_thread_sweep_*.log nccl_thread_sweep_summary_*.txt expected_outputs/ testdata/ -*.log actual_outputs/ *.html + # IDE/project-specific folders .vscode/ .idea/ @@ -68,7 +74,13 @@ Thumbs.db # Data files *.dat +# Profiler traces tmp_prof.* rocprof_traces overlap.* .*repro.* +*.gz +trace_* + +# Multi-node IP configuration (should be customized per deployment) +node_ip_list.txt diff --git a/config/multi_node/distributed_multinode.yaml b/config/multi_node/distributed_multinode.yaml index 143c680..acd9162 100644 --- a/config/multi_node/distributed_multinode.yaml +++ b/config/multi_node/distributed_multinode.yaml @@ -5,7 +5,7 @@ training: batch_size: 448 gradient_accumulation: 4 mixed_precision: bf16 - max_steps: 2000 + max_steps: 10 grad_clip_norm: 1.0 output_dir: artifacts/optuna_tracelens_sweep_fixed/trial_0000 log_interval: 20 @@ -46,6 +46,8 @@ fsdp: forward_prefetch: true sync_module_states: true param_init_device: meta + # hybrid_shard_gpus_per_node: auto-detects from --nproc (via LOCAL_WORLD_SIZE) + # Only set manually if auto-detection fails distributed: backend: nccl mode: fsdp @@ -54,7 +56,7 @@ distributed: static_graph: true find_unused_parameters: false compile: - enabled: true + enabled: false backend: inductor mode: max-autotune fullgraph: false @@ -82,10 +84,10 @@ dataloader: pin_memory: true profiling: enabled: true - wait: 5 - warmup: 5 - active: 1 - repeat: 1 + wait: 1 + warmup: 2 + active: 5 + repeat: 0 record_shapes: false profile_memory: false with_stack: false diff --git a/config/multi_node/shampoo_opt_multi_node.yaml b/config/multi_node/shampoo_opt_multi_node.yaml new file mode 100644 index 0000000..88fc42b --- /dev/null +++ b/config/multi_node/shampoo_opt_multi_node.yaml @@ -0,0 +1,113 @@ +# Shampoo-only variant of the user workload. +# NOTE: User reports this configuration produces NaNs when using Shampoo, +# while AdamW and other optimizers remain healthy. Monitor loss closely. 
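+# Suggested triage (assumption, not verified on this workload): rerun once with
+# optimizer.name: adamw to confirm the rest of this config is healthy, then
+# re-enable shampoo with a reduced lr before tuning anything else.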
+ +logging: + level: INFO + +training: + epochs: 10 + batch_size: 512 + gradient_accumulation: 2 + mixed_precision: bf16 + max_steps: 2200 + grad_clip_norm: 1.0 + output_dir: artifacts/user_shampoo + log_interval: 20 + additional_compute_streams: 2 + lightweight_op_waves: 3 + +optimizer: + name: shampoo + lr: 0.0002 + weight_decay: 0.01 + betas: [0.9, 0.985] + eps: 1.0e-8 + +scheduler: + warmup_steps: 200 + total_steps: 2200 + +dataset: + num_samples: 200000 + sequence_length: 160 + dense_dim: 256 + sparse_features: 64 + vocab_size: 350000 + num_dense_features: 32 + seed: 2025 + +model: + vocab_size: 350000 + embedding_dim: 256 + num_dense_features: 32 + dense_dim: 256 + model_dim: 1024 + num_heads: 16 + num_layers: 18 + dropout: 0.1 + mlp_hidden_dim: 4096 + +fsdp: + sharding_strategy: hybrid_shard + backward_prefetch: BACKWARD_PRE + use_orig_params: true + limit_all_gathers: true + forward_prefetch: true + sync_module_states: true + param_init_device: meta + +distributed: + backend: nccl + mode: fsdp + bucket_cap_mb: 128 + gradient_as_bucket_view: true + static_graph: true + find_unused_parameters: false + +compile: + enabled: false + backend: inductor + mode: max-autotune + fullgraph: false + dynamic: false + +streams: + num_streams: 4 + high_priority: + - allreduce + - reducescatter + stream_assignments: + compute: + - dev6_stream3 + - dev6_stream9 + communication: + - dev6_stream13 + - dev6_stream17 + reducescatter: + - dev6_stream22 + aux: + - dev6_stream0 + +dataloader: + num_workers: 0 + pin_memory: true + +profiling: + enabled: true + wait: 2 + warmup: 2 + active: 6 + repeat: 1 + record_shapes: true + profile_memory: true + with_stack: false + with_flops: false + # tensorboard: true + # chrome_trace: true + tensorboard: false + chrome_trace: false + trace_filename: user_shampoo.json + +tracelens: + enabled: false diff --git a/docker/Dockerfile.rocm70_9-1-shampoo b/docker/Dockerfile.rocm70_9-1-shampoo new file mode 100644 index 0000000..4ce3c4f --- /dev/null +++ b/docker/Dockerfile.rocm70_9-1-shampoo @@ -0,0 +1,57 @@ +# Start from the existing PyTorch ROCm image +FROM rocm/pytorch-private:20251030_rocm_e2e_phantom_mi350_genai_nightly + +# Switch to root to install packages +USER root + +# Install wget and git if not available +RUN yum install -y wget git 2>/dev/null || apt-get update && apt-get install -y wget git || true + +# Download and install amdgpu-install package for RHEL +RUN wget https://artifactory-cdn.amd.com/artifactory/list/amdgpu-rpm/rhel/amdgpu-install-internal-7.0_9-1.noarch.rpm && \ + yum install -y ./amdgpu-install-internal-7.0_9-1.noarch.rpm || rpm -ivh ./amdgpu-install-internal-7.0_9-1.noarch.rpm && \ + rm amdgpu-install-internal-7.0_9-1.noarch.rpm + +# Update amdgpu-repo with specific builds +RUN amdgpu-repo --amdgpu-build=2247890 --rocm-build=compute-rocm-rel-7.0-meta/7 + +# Since base image already has ROCm, just update the key runtime components +RUN yum update -y --skip-broken \ + rocm-hip \ + rocm-libs \ + rocm-hip-libraries \ + rocm-hip-runtime-devel \ + hip-base \ + hip-dev \ + hip-runtime-amd \ + rocm-core || echo "Updated available ROCm packages" + +RUN python3.10 -m pip install git+https://github.com/AMD-AGI/TraceLens.git +RUN python3.10 -m pip install openpyxl seaborn + +# Download and install rocprof-trace-decoder +RUN wget https://github.com/ROCm/rocprof-trace-decoder/releases/download/0.1.6/rocprof-trace-decoder-manylinux-2.28-0.1.6-Linux.rpm \ + -O /tmp/rocprof-trace-decoder.rpm && \ + rpm -i /tmp/rocprof-trace-decoder.rpm && \ + rm 
/tmp/rocprof-trace-decoder.rpm + +# Install Facebook Distributed Shampoo optimizer +RUN cd /tmp && \ + git clone https://github.com/facebookresearch/optimizers.git && \ + cd optimizers && \ + python3.10 -m pip install . && \ + cd / && \ + rm -rf /tmp/optimizers + +# Verify Shampoo installation +RUN python3.10 -c "import distributed_shampoo; print('[OK] Shampoo optimizer installed successfully')" + +# Update environment variables +ENV ROCM_HOME=/opt/rocm +ENV PATH=$ROCM_HOME/bin:$PATH +ENV LD_LIBRARY_PATH=$ROCM_HOME/lib:$LD_LIBRARY_PATH + +# Set working directory +WORKDIR /workspace/aorta + +CMD ["/bin/bash"] diff --git a/docker/docker-compose.rocm70_9-1-shampoo.yaml b/docker/docker-compose.rocm70_9-1-shampoo.yaml new file mode 100644 index 0000000..232e3a3 --- /dev/null +++ b/docker/docker-compose.rocm70_9-1-shampoo.yaml @@ -0,0 +1,35 @@ +services: + torchenv-rocm70-shampoo: + container_name: training-overlap-bugs-rocm70_9-1-shampoo + build: + context: . + dockerfile: Dockerfile.rocm70_9-1-shampoo + network: host + user: root + privileged: true + network_mode: host + group_add: + - video + ipc: host + cap_add: + - SYS_PTRACE + security_opt: + - seccomp=unconfined + environment: + - RCCL_FOLDER=/rccl + - LD_LIBRARY_PATH=/rccl/build/release:${LD_LIBRARY_PATH:-} + - TORCH_NCCL_HIGH_PRIORITY=1 + + volumes: + - /home/manrao:/manrao + - /home/oyazdanb/aorta:/workspace/aorta + devices: + - /dev/kfd + - /dev/dri + working_dir: /workspace/aorta + shm_size: 17G + ulimits: + memlock: -1 + stack: 67108864 + stdin_open: true + tty: true diff --git a/docker/docker-compose.rocm70_9-1.yaml b/docker/docker-compose.rocm70_9-1.yaml index c4704ae..7f082e2 100644 --- a/docker/docker-compose.rocm70_9-1.yaml +++ b/docker/docker-compose.rocm70_9-1.yaml @@ -4,6 +4,7 @@ services: build: context: . dockerfile: Dockerfile.rocm70_9-1 + network: host user: root privileged: true network_mode: host diff --git a/scripts/multi_node/README.md b/scripts/multi_node/README.md new file mode 100644 index 0000000..d4e1786 --- /dev/null +++ b/scripts/multi_node/README.md @@ -0,0 +1,239 @@ +# Multi-Node Training + +Scripts for multi-node distributed training with custom NCCL channel and thread configurations. 
+ +## Table of Contents + +- [Quick Start](#quick-start) +- [Slurm Setup](#slurm-setup) +- [Usage](#usage) +- [Stopping Training](#stopping-training) +- [Troubleshooting](#troubleshooting) +- [NCCL Configuration](#nccl-configuration) +- [Conductor Setup](#conductor-setup) + +## Prerequisites + +- 2+ machines with ROCm GPUs, Docker, network connectivity (host mode) +- Passwordless SSH between nodes +- `scripts/multi_node/node_ip_list.txt` with node hostnames - master first +- All nodes on same git branch + +## File Structure + +``` +aorta/ +├── scripts/multi_node/ +│ ├── master_launch.sh # Main entrypoint +│ ├── start_docker_all_nodes.sh # Start Docker on all nodes +│ ├── setup_multi_node.sh # Automated setup (2+ nodes) +│ ├── config_node.sh # Per-node setup +│ ├── local_launch.sh # Per-node training (runs in Docker) +│ ├── set_env_variables.sh # NCCL/RCCL config +│ └── node_ip_list.txt # Node hostnames +├── docker/ +│ ├── docker-compose.rocm70_9-1.yaml # Base Docker config +│ └── docker-compose.rocm70_9-1-shampoo.yaml # Docker with Shampoo optimizer +├── config/ +│ ├── multi_node/ +│ │ └── distributed_multinode.yaml # Default config +│ └── shampoo_opt.yaml # Shampoo optimizer config +``` + +## Quick Start + +```bash +# First time setup (once per allocation) +scontrol show hostnames $SLURM_NODELIST > scripts/multi_node/node_ip_list.txt +./scripts/multi_node/start_docker_all_nodes.sh + +# Run training +./scripts/multi_node/master_launch.sh --channels 28 --threads 256 --nproc 8 +``` + +World size: `NPROC_PER_NODE × NUM_NODES` (e.g., 8 GPUs/node × 2 nodes = 16) + +--- + +## Slurm Setup + +### Step 1: Pull Base Docker Image + +```bash +docker pull rocm/pytorch-private:20251030_rocm_e2e_phantom_mi350_genai_nightly + +# If authentication required +docker login +``` + +### Step 2: Allocate Nodes + +```bash +# From head node +salloc -N 3 -p gpu_partition -t 4:00:00 +squeue -u $USER +``` + +### Step 3: Create node_ip_list.txt + +```bash +cd /path/to/aorta/scripts/multi_node +scontrol show hostnames $SLURM_NODELIST > node_ip_list.txt +cat node_ip_list.txt +``` + +### Step 4: SSH to Master and Test Connectivity + +```bash +ssh node1-hostname +cd /path/to/aorta + +# Test worker connectivity +ssh node2-hostname hostname +ssh node3-hostname hostname +``` + +### Step 5: Pull Image on All Nodes + +```bash +for HOST in $(cat scripts/multi_node/node_ip_list.txt); do + ssh $HOST "docker pull rocm/pytorch-private:20251030_rocm_e2e_phantom_mi350_genai_nightly" +done +``` + +### Step 6: Start Docker and Run Training + +```bash +./scripts/multi_node/start_docker_all_nodes.sh + +./scripts/multi_node/master_launch.sh --channels 28 --threads 256 --nproc 8 +``` + +--- + +## Usage + +```bash +# Basic launch (defaults: 28 channels, 256 threads, 8 GPUs/node) +./scripts/multi_node/master_launch.sh + +# Custom parameters +./scripts/multi_node/master_launch.sh -c 28 -t 256 -p 4 -f config/custom.yaml +``` + +### Parameters + +| Flag | Option | Default | Description | +|------|--------|---------|-------------| +| -c | --channels | 28 | NCCL_MAX_NCHANNELS | +| -t | --threads | 256 | RCCL_THREADS_PER_BLOCK | +| -p | --nproc | 8 | GPUs per node | +| -f | --config | config/multi_node/distributed_multinode.yaml | Config file | +| -r | --rocprof | false | Enable rocprofv3 | +| -m | --stats | false | rocprof stats | +| | --rocprof-input | none | rocprof yaml | +| | --master-port | auto | Master port | + +Environment variables: `CHANNELS=42 THREADS=512 ./scripts/multi_node/master_launch.sh` + +GPU subset: Use `-p 4` or `export 
CUDA_VISIBLE_DEVICES=0,2,4,6` + +### Custom Config + +Select a config file from `config/` or `config/multi_node/`: + +```bash +./scripts/multi_node/master_launch.sh \ + --channels 28 --threads 256 \ + --config config/multi_node/distributed_multinode.yaml +``` + +### Monitoring + +```bash +tail -f experiments/multinode_*/logs/node_*.txt # All nodes +tail -f experiments/multinode_*/logs/node_0_*.txt # Master only +cat experiments/multinode_*/outputs/rank_00_metrics.jsonl | tail -n 5 # Metrics +``` + +--- + +## Stopping Training + +`Ctrl+C` stops monitoring but training continues in background. + +To stop training: +```bash +for HOST in $(cat scripts/multi_node/node_ip_list.txt); do + ssh $HOST "docker exec training-overlap-bugs-rocm70_9-1 pkill -9 -f 'train.py|torchrun'" +done +``` + +--- + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| Script hangs | Check last [STAGE] message | +| SSH fails | `ssh-copy-id $USER@` | +| Docker version mismatch | `docker compose version` on each node | +| NCCL timeout | Update `NCCL_SOCKET_IFNAME` in `set_env_variables.sh` | +| World size mismatch | Check `rocm-smi --showid \| wc -l`, adjust `--nproc` | + +--- + +## NCCL Configuration + +Edit `set_env_variables.sh`: + +**InfiniBand:** +```bash +export NCCL_IB_DISABLE=0 +export NCCL_IB_HCA=mlx5_0 # Check: ibstat +export NCCL_IB_GID_INDEX=3 +export NCCL_SOCKET_IFNAME=ib0 +``` + +**Ethernet:** +```bash +export NCCL_IB_DISABLE=1 +export NCCL_SOCKET_IFNAME=eth0 +export NCCL_NSOCKS_PERTHREAD=4 +export NCCL_SOCKET_NTHREADS=2 +``` + +**Debug:** `export NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=ALL` + +--- + +## Conductor Setup + +For Conductor environments with SSH key management: + +### SSH Key Setup + +```bash +ssh-keygen -t rsa -b 4096 -C "conductor-multi-node" -f ~/.ssh/id_rsa_conductor -N '' +cat ~/.ssh/id_rsa_conductor.pub +``` + +Register public key with your cluster's SSH key management system. + +```bash +cat >> ~/.ssh/config << 'EOF' +Host *.dcgpu smci350-* *.zts-gtu.dcgpu + IdentityFile ~/.ssh/id_rsa_conductor + StrictHostKeyChecking no +EOF +chmod 600 ~/.ssh/config +``` + +### Run Setup and Start Docker + +```bash +./scripts/multi_node/setup_multi_node.sh +./scripts/multi_node/start_docker_all_nodes.sh +``` + +Creates `node_ip_list.txt` with hostnames, detects network interfaces, verifies SSH and git branches. 
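+
+To re-check connectivity later without rerunning the full setup, a minimal sketch (assumes the default repo layout and that `node_ip_list.txt` already exists):
+
+```bash
+# Verify passwordless SSH to every node listed in node_ip_list.txt
+while IFS= read -r HOST; do
+  [[ -z "$HOST" ]] && continue
+  if ssh -o BatchMode=yes -o ConnectTimeout=5 "$USER@$HOST" hostname >/dev/null 2>&1; then
+    echo "[OK]   $HOST"
+  else
+    echo "[FAIL] $HOST"
+  fi
+done < scripts/multi_node/node_ip_list.txt
+```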
diff --git a/scripts/multi_node/config_node.sh b/scripts/multi_node/config_node.sh new file mode 100755 index 0000000..ddd631e --- /dev/null +++ b/scripts/multi_node/config_node.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Per-node configuration and launch script for Aorta GEMM training +# This script runs on each node (via SSH or locally) + +NODE_RANK=$(echo "$1" | sed 's/"//g') +NODE_IP=$(echo "$2" | sed 's/"//g') +MASTER_IP=$(echo "$3" | sed 's/"//g') +MASTER_PORT=$(echo "$4" | sed 's/"//g') +NNODES=$(echo "$5" | sed 's/"//g') +WORLD_SIZE=$(echo "$6" | sed 's/"//g') +WORKDIR=$(echo "$7" | sed 's/"//g') +EXPERIMENT_DIR=$(echo "$8" | sed 's/"//g') +CONFIG_FILE=$(echo "$9" | sed 's/"//g') +NPROC_PER_NODE=$(echo "${10}" | sed 's/"//g') +CHANNELS=$(echo "${11}" | sed 's/"//g') +THREADS=$(echo "${12}" | sed 's/"//g') +ENABLE_ROCPROF=$(echo "${13}" | sed 's/"//g') +ROCPROF_STATS=$(echo "${14}" | sed 's/"//g') +ROCPROF_INPUT=$(echo "${15}" | sed 's/"//g') + +echo "============================================" +echo "Node Configuration" +echo "============================================" +echo "Node Rank: $NODE_RANK" +echo "Node IP: $NODE_IP" +echo "Master IP: $MASTER_IP" +echo "Master Port: $MASTER_PORT" +echo "Number of Nodes: $NNODES" +echo "World Size: $WORLD_SIZE GPUs" +echo "Processes per node: $NPROC_PER_NODE" +echo "Work Directory: $WORKDIR" +echo "Experiment Directory: $EXPERIMENT_DIR" +echo "Config File: $CONFIG_FILE" +echo "Channels: $CHANNELS" +echo "Threads: $THREADS" +echo "============================================" +echo "" + +# Change to working directory +cd "$WORKDIR" || exit 1 + +# Activate virtual environment if it exists +VENV_PATH="$WORKDIR/.venv" +if [[ -d "$VENV_PATH" ]]; then + echo "Activating virtual environment at $VENV_PATH" + source "$VENV_PATH/bin/activate" +fi + +# Source common environment variables +if [[ -f "$WORKDIR/scripts/multi_node/set_env_variables.sh" ]]; then + echo "Sourcing set_env_variables.sh" + source "$WORKDIR/scripts/multi_node/set_env_variables.sh" +else + echo "Warning: set_env_variables.sh not found, using default NCCL settings" + export NCCL_DEBUG=WARN + export NCCL_IB_DISABLE=0 + export NCCL_SOCKET_IFNAME=eth0 +fi + +echo "" +echo "Environment configured. Starting GEMM training..." 
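+# local_launch.sh reads its positional arguments in exactly this order:
+#   NODE_RANK NODE_IP MASTER_IP MASTER_PORT NNODES WORLD_SIZE EXPERIMENT_DIR
+#   CONFIG_FILE NPROC_PER_NODE CHANNELS THREADS ENABLE_ROCPROF ROCPROF_STATS ROCPROF_INPUT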
+echo "" + +# Launch local_launch.sh with all parameters +"$WORKDIR/scripts/multi_node/local_launch.sh" \ + "$NODE_RANK" "$NODE_IP" "$MASTER_IP" "$MASTER_PORT" "$NNODES" "$WORLD_SIZE" \ + "$EXPERIMENT_DIR" "$CONFIG_FILE" "$NPROC_PER_NODE" "$CHANNELS" "$THREADS" \ + "$ENABLE_ROCPROF" "$ROCPROF_STATS" "$ROCPROF_INPUT" + +echo "" +echo "============================================" +echo "Node $NODE_RANK training completed" +echo "============================================" diff --git a/scripts/multi_node/local_launch.sh b/scripts/multi_node/local_launch.sh new file mode 100755 index 0000000..f517906 --- /dev/null +++ b/scripts/multi_node/local_launch.sh @@ -0,0 +1,162 @@ +#!/bin/bash +# Multi-node local launch script for GEMM training +# Runs on each node with single channel/thread configuration + +if [[ $# -lt 11 ]]; then + echo "Usage: $0 [ENABLE_ROCPROF] [ROCPROF_STATS] [ROCPROF_INPUT]" + exit 1 +fi + +NODE_RANK="$1" +NODE_IP="$2" +MASTER_IP="$3" +MASTER_PORT="$4" +NNODES="$5" +WORLD_SIZE="$6" +EXPERIMENT_DIR="$7" +CONFIG_FILE="$8" +NPROC_PER_NODE="$9" +CHANNELS="${10}" +THREADS="${11}" +ENABLE_ROCPROF="${12:-false}" +ROCPROF_STATS="${13:-false}" +ROCPROF_INPUT="${14:-}" + +echo "==========================================" +echo "Local Launch Configuration" +echo "==========================================" +echo "Node Rank: $NODE_RANK" +echo "Node IP: $NODE_IP" +echo "Master IP: $MASTER_IP" +echo "Master Port: $MASTER_PORT" +echo "Number of Nodes: $NNODES" +echo "World Size: $WORLD_SIZE" +echo "Processes per node: $NPROC_PER_NODE" +echo "Experiment Dir: $EXPERIMENT_DIR" +echo "Config File: $CONFIG_FILE" +echo "Channels: $CHANNELS" +echo "Threads: $THREADS" +echo "rocprof enabled: $ENABLE_ROCPROF" +echo "==========================================" +echo "" + +# Output directory for this configuration +OUTPUT_DIR="${EXPERIMENT_DIR}/${THREADS}thread_${CHANNELS}channels" +mkdir -p "${OUTPUT_DIR}" + +# Convert host path to Docker path for use inside container +# Docker mounts host aorta directory -> /workspace/aorta +# Extract aorta root from EXPERIMENT_DIR (e.g., /home/user/aorta/experiments/... -> /home/user/aorta) +AORTA_ROOT_FROM_EXP=$(echo "$EXPERIMENT_DIR" | sed 's|/experiments/.*||') +# Replace the aorta root with /workspace/aorta +OUTPUT_DIR_DOCKER=$(echo "$OUTPUT_DIR" | sed "s|^${AORTA_ROOT_FROM_EXP}|/workspace/aorta|") + +# Also convert CONFIG_FILE to Docker path if it's an absolute path +if [[ "$CONFIG_FILE" =~ ^/ ]]; then + CONFIG_FILE_DOCKER=$(echo "$CONFIG_FILE" | sed "s|^${AORTA_ROOT_FROM_EXP}|/workspace/aorta|") +else + CONFIG_FILE_DOCKER="$CONFIG_FILE" +fi + +# Log file +LOG_FILE="${OUTPUT_DIR}/node_${NODE_RANK}_output.log" + +# Function to log with timestamp +log() { + local message="$1" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[${timestamp}] [Node ${NODE_RANK}] ${message}" | tee -a "${LOG_FILE}" +} + +# Cleanup function +cleanup() { + echo "" + echo "=== Caught interrupt signal ===" | tee -a "${LOG_FILE}" + log "Cleaning up training processes on node ${NODE_RANK}..." + + # Try to kill processes inside Docker container + docker exec training-overlap-bugs-rocm70_9-1 pkill -9 -f "train.py" 2>/dev/null || true + docker exec training-overlap-bugs-rocm70_9-1 pkill -9 -f "torchrun" 2>/dev/null || true + + # Also try on host (in case anything leaked) + sudo pkill -9 -f "train.py" 2>/dev/null || true + sudo pkill -9 -f "torchrun" 2>/dev/null || true + + log "Cleanup complete. Exiting." 
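+    # 130 = 128 + SIGINT(2): conventional exit status for a script stopped by Ctrl+C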
+ exit 130 +} + +trap cleanup SIGINT SIGTERM + +log "Starting multi-node training with RCCL_THREADS_PER_BLOCK=${THREADS}, NCCL_MAX_NCHANNELS=${CHANNELS}" +log "Output directory: ${OUTPUT_DIR}" + +START_TIME=$(date +%s) + +# Docker container name (update if different) +DOCKER_CONTAINER="training-overlap-bugs-rocm70_9-1" + +# Check if Docker container is running +if ! docker ps --format '{{.Names}}' | grep -q "^${DOCKER_CONTAINER}$"; then + log "ERROR: Docker container '${DOCKER_CONTAINER}' is not running" + log "Start it with: cd /path/to/aorta/docker && docker compose -f docker-compose.rocm70_9-1.yaml up -d" + exit 1 +fi + +log "Docker container '${DOCKER_CONTAINER}' is running" + +# Base command for torchrun with multi-node parameters +BASE_CMD="torchrun --nnodes ${NNODES} --node_rank ${NODE_RANK} --nproc_per_node ${NPROC_PER_NODE} --master_addr ${MASTER_IP} --master_port ${MASTER_PORT} train.py --config ${CONFIG_FILE_DOCKER}" +BASE_OVERRIDES="--override profiling.tensorboard=false" + +# Build docker exec prefix with environment variables +DOCKER_EXEC="docker exec \ + -e RCCL_THREADS_PER_BLOCK=${THREADS} \ + -e NCCL_MAX_NCHANNELS=${CHANNELS} \ + -e HSA_ENABLE_SDMA=0 \ + -e PYTORCH_ROCM_PROFILER_ENABLE_TRACING=1 \ + ${DOCKER_CONTAINER}" + +# Run with or without rocprofv3 +if [ "${ENABLE_ROCPROF}" = "true" ]; then + ROCPROF_DIR="${OUTPUT_DIR}/rocprof_traces/node_${NODE_RANK}" + mkdir -p "${ROCPROF_DIR}" + + if [ -n "${ROCPROF_INPUT}" ]; then + log "Using rocprofv3 input file: ${ROCPROF_INPUT}" + ${DOCKER_EXEC} bash -c "rocprofv3 -i ${ROCPROF_INPUT} -d ${ROCPROF_DIR} -- \ + ${BASE_CMD} ${BASE_OVERRIDES} \ + --override training.output_dir=${OUTPUT_DIR_DOCKER}" \ + 2>&1 | tee -a "${LOG_FILE}" + else + ROCPROF_ARGS="--kernel-trace" + if [ "${ROCPROF_STATS}" = "true" ]; then + ROCPROF_ARGS="${ROCPROF_ARGS} --stats" + fi + + log "Running with rocprofv3 kernel tracing inside Docker" + ${DOCKER_EXEC} bash -c "rocprofv3 ${ROCPROF_ARGS} -d ${ROCPROF_DIR} -- \ + ${BASE_CMD} ${BASE_OVERRIDES} \ + --override training.output_dir=${OUTPUT_DIR_DOCKER}" \ + 2>&1 | tee -a "${LOG_FILE}" + fi +else + log "Running inside Docker container" + log "Command: ${BASE_CMD} ${BASE_OVERRIDES} --override training.output_dir=${OUTPUT_DIR_DOCKER}" + ${DOCKER_EXEC} bash -c "${BASE_CMD} ${BASE_OVERRIDES} \ + --override training.output_dir=${OUTPUT_DIR_DOCKER}" \ + 2>&1 | tee -a "${LOG_FILE}" +fi + +EXIT_CODE=${PIPESTATUS[0]} +END_TIME=$(date +%s) +DURATION=$((END_TIME - START_TIME)) + +if [ $EXIT_CODE -eq 0 ]; then + log "Training completed successfully (duration: ${DURATION}s)" +else + log "Training failed with exit code: $EXIT_CODE (duration: ${DURATION}s)" +fi + +echo "" +log "Node ${NODE_RANK} finished" diff --git a/scripts/multi_node/master_launch.sh b/scripts/multi_node/master_launch.sh new file mode 100755 index 0000000..f88d2fe --- /dev/null +++ b/scripts/multi_node/master_launch.sh @@ -0,0 +1,228 @@ +#!/bin/bash +# Multi-node orchestration script for Aorta GEMM training +# Adapted from DLRM master_launch.sh pattern +# +# TODO: Convert to SLURM-native launch using srun instead of SSH to individual nodes. +# Currently this script runs from a compute node and SSHs to other nodes. +# Ideally, we should run SLURM commands from the login node, which would +# eliminate the need for SSH connectivity checks and branch verification. 
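+#
+# Rough sketch of that srun-based approach (untested; assumes one launcher task per
+# node and that the repo/config paths are visible on every node):
+#   srun --nodes="$SLURM_NNODES" --ntasks-per-node=1 \
+#     torchrun --nnodes "$SLURM_NNODES" --nproc_per_node 8 \
+#       --rdzv_backend c10d \
+#       --rdzv_endpoint "$(scontrol show hostnames "$SLURM_NODELIST" | head -n1):29500" \
+#       train.py --config config/multi_node/distributed_multinode.yaml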
+ +usage() { + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " -c, --channels CHANNELS NCCL_MAX_NCHANNELS value (default: 28)" + echo " -t, --threads THREADS RCCL_THREADS_PER_BLOCK value (default: 256)" + echo " -f, --config CONFIG Config file path (default: config/multi_node/distributed_multinode.yaml)" + echo " -p, --nproc NPROC Number of processes per node (default: 8)" + echo " -r, --rocprof Enable rocprofv3 tracing" + echo " -m, --stats Enable rocprof stats (CU utilization, occupancy)" + echo " --rocprof-input FILE Use rocprofv3 input yaml/json" + echo " --master-port PORT Master port (default: auto-select)" + echo " -h, --help Show this help message" + echo "" + echo "Examples:" + echo " $0 --channels 28 --threads 256" + echo " $0 -c 28 -t 256 --rocprof" + echo " $0 --channels 28 --config config/my_custom.yaml" + echo "" + echo "Or use environment variables:" + echo " CHANNELS=28 THREADS=256 $0" + exit 1 +} + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +MACHINE_IP_FILE="$SCRIPT_DIR/node_ip_list.txt" # Contains hostnames or IPs + +# Default values (can be overridden by env vars or command-line args) +CONFIG_FILE="${CONFIG_FILE:-config/multi_node/distributed_multinode.yaml}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" +CHANNELS="${CHANNELS:-28}" +THREADS="${THREADS:-256}" +ENABLE_ROCPROF="${ENABLE_ROCPROF:-false}" +ROCPROF_STATS="${ROCPROF_STATS:-false}" +ROCPROF_INPUT="${ROCPROF_INPUT:-}" +MASTER_PORT="${MASTER_PORT:-}" + +# Parse command-line arguments (override env vars) +while [[ $# -gt 0 ]]; do + case $1 in + -c|--channels) + CHANNELS="$2" + shift 2 + ;; + -t|--threads) + THREADS="$2" + shift 2 + ;; + -f|--config) + CONFIG_FILE="$2" + shift 2 + ;; + -p|--nproc) + NPROC_PER_NODE="$2" + shift 2 + ;; + -r|--rocprof) + ENABLE_ROCPROF="true" + shift + ;; + -m|--stats) + ROCPROF_STATS="true" + shift + ;; + --rocprof-input) + ROCPROF_INPUT="$2" + shift 2 + ;; + --master-port) + MASTER_PORT="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + echo "Unknown option: $1" + usage + ;; + esac +done + + +if [[ -z "$MASTER_PORT" ]]; then + if ! MASTER_PORT=$(python3 - <<'PY' +import socket +s=socket.socket() +s.bind(('',0)) +print(s.getsockname()[1]) +s.close() +PY + ); then + echo "Error: Failed to auto-select master port. Set MASTER_PORT manually." + exit 1 + fi +fi + +# Check git branch consistency before launching +echo "=== Checking git branch consistency ===" +MASTER_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "not-a-git-repo") + +if [[ "$MASTER_BRANCH" != "not-a-git-repo" ]]; then + echo "Master node branch: $MASTER_BRANCH" + + node=0 + while IFS= read -r HOST || [[ -n "$HOST" ]]; do # HOST can be hostname or IP + if [[ -z "$HOST" ]]; then continue; fi + + if [[ "$node" -gt 0 ]]; then + WORKER_BRANCH=$(ssh -n -o ConnectTimeout=10 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$USER@$HOST" "cd ~/aorta && git rev-parse --abbrev-ref HEAD 2>/dev/null" || echo "not-a-git-repo") + + if [[ "$WORKER_BRANCH" == "not-a-git-repo" ]]; then + echo "[WARN] Worker node $HOST: Not a git repository" + elif [[ "$MASTER_BRANCH" != "$WORKER_BRANCH" ]]; then + echo "" + echo "[ERROR] Branch mismatch on worker node $HOST!" 
+ echo " Master: $MASTER_BRANCH" + echo " Worker: $WORKER_BRANCH" + echo "" + echo "Fix: ssh $USER@$HOST 'cd ~/aorta && git checkout $MASTER_BRANCH && git pull'" + echo "" + exit 1 + else + echo "Worker node $HOST: $WORKER_BRANCH [OK]" + fi + fi + ((node++)) + done < "$MACHINE_IP_FILE" +else + echo "[WARN] Not a git repository - skipping branch check" +fi +echo "" + +TRACE_TIMESTAMP=$(date +%Y%m%d_%H%M%S) +EXPERIMENT_DIR="$HOME/aorta/experiments/multinode_${CHANNELS}ch_${THREADS}th_${TRACE_TIMESTAMP}" +mkdir -p "$EXPERIMENT_DIR" +mkdir -p "$EXPERIMENT_DIR/logs" + +echo "=== Aorta Multi-Node GEMM Training ===" +echo "Experiment directory: $EXPERIMENT_DIR" +echo "Config file: $CONFIG_FILE" +echo "NCCL Channels: $CHANNELS" +echo "RCCL Threads per block: $THREADS" +echo "Processes per node: $NPROC_PER_NODE" +echo "rocprof enabled: $ENABLE_ROCPROF" + +NUM_NODES=$(awk 'NF' "$MACHINE_IP_FILE" | wc -l) +WORLD_SIZE=$((NPROC_PER_NODE * NUM_NODES)) +NNODES=$NUM_NODES + +echo "Number of nodes: $NUM_NODES" +echo "World size: $WORLD_SIZE (GPUs)" +echo "Using MASTER_PORT: $MASTER_PORT" +echo "" + +node=0 +while IFS= read -r HOST || [[ -n "$HOST" ]]; do # HOST can be hostname or IP + if [[ -z "$HOST" ]]; then + continue + fi + + echo "Setting up Node: $node, Host: $HOST" + + TIME=$(date +"%Y%m%d_%H%M%S") + LOG_FILE="$EXPERIMENT_DIR/logs/node_${node}_${TIME}.txt" + + if [[ "$node" -eq 0 ]]; then + MASTER_ADDR="$HOST" + echo "Master node: $MASTER_ADDR" + echo "" + + ./scripts/multi_node/config_node.sh "$node" "$HOST" "$MASTER_ADDR" "$MASTER_PORT" "$NNODES" "$WORLD_SIZE" "$PWD" "$EXPERIMENT_DIR" \ + "$CONFIG_FILE" "$NPROC_PER_NODE" "$CHANNELS" "$THREADS" "$ENABLE_ROCPROF" "$ROCPROF_STATS" "$ROCPROF_INPUT" \ + > "$LOG_FILE" 2>&1 & + + else + # Note: stdin explicitly redirected from config_node.sh, so -n flag not needed + ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ + "$USER"@"$HOST" bash -s -- "$node" "$HOST" "$MASTER_ADDR" "$MASTER_PORT" "$NNODES" "$WORLD_SIZE" "$PWD" "$EXPERIMENT_DIR" \ + "$CONFIG_FILE" "$NPROC_PER_NODE" "$CHANNELS" "$THREADS" "$ENABLE_ROCPROF" "$ROCPROF_STATS" "$ROCPROF_INPUT" \ + < ./scripts/multi_node/config_node.sh \ + > "$LOG_FILE" 2>&1 & + fi + + ((node++)) + +done < "$MACHINE_IP_FILE" + +echo "" +echo "=== All nodes launched ===" +echo "Monitor logs in: $EXPERIMENT_DIR/logs/" +echo "" +echo "To monitor progress:" +echo " tail -f $EXPERIMENT_DIR/logs/node_0_*.txt" +echo "" +echo "To check all nodes:" +echo " tail -f $EXPERIMENT_DIR/logs/node_*.txt" +echo "" +echo "Waiting for training to complete..." 
+echo "Press Ctrl+C to stop monitoring (training will continue in background)" + +wait + +echo "" +echo "=== Training completed ===" +echo "Results saved to: $EXPERIMENT_DIR" + +# ================================================================================ +# HOW TO STOP TRAINING +# ================================================================================ +# Press Ctrl+C above (stops monitoring, but training continues in background) +# +# To find running processes: +# ps aux | grep -E 'config_node.sh|torchrun.*train.py' | grep -v grep +# +# To stop by PID (replace 12345 with your PID): +# for HOST in $(cat scripts/multi_node/node_ip_list.txt); do +# ssh $USER@$HOST "kill -9 12345" +# done +# ================================================================================ diff --git a/scripts/multi_node/set_env_variables.sh b/scripts/multi_node/set_env_variables.sh new file mode 100755 index 0000000..3e9c070 --- /dev/null +++ b/scripts/multi_node/set_env_variables.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Global NCCL/RCCL environment variables for multi-node training +# Based on DLRM_set_env_variables.sh + +# NCCL Debug Settings (use INFO for debugging network issues) +export NCCL_DEBUG=INFO +export NCCL_DEBUG_SUBSYS=INIT,NET +# Try disabling IB if InfiniBand is not properly configured +export NCCL_IB_DISABLE=1 + +# IB/RNIC Configuration (commented out when IB is disabled) +# export NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 +# export NCCL_IB_GID_INDEX=3 +export NCCL_NCHANNELS_PER_NET_PEER=8 + +# HSA Settings for ROCm +export HSA_ENABLE_IPC_MODE_LEGACY=1 + +# NCCL Protocol +export NCCL_PROTO=Simple + +# Channel Configuration (can be overridden by sweep parameters) +export NCCL_MIN_NCHANNELS=40 +export NCCL_MAX_NCHANNELS=40 + +# Network Interface +# Change this to match your network interface: eth0, ib0, enp49s0f0np0, etc. +# Temporarily commented out for auto-detection: +# export NCCL_SOCKET_IFNAME=enp193s0f0 + +# PyTorch ROCm Profiler +export PYTORCH_ROCM_PROFILER_ENABLE_TRACING=1 + +# Optional: Force non-overlap for debugging +# export GPU_MAX_HW_QUEUES=1 +# unset TORCH_NCCL_HIGH_PRIORITY + +# Optional: Disable SDMA for testing +# export HSA_ENABLE_SDMA=0 + +# Optional: Disable IB for Ethernet-only testing +# export NCCL_IB_DISABLE=1 diff --git a/scripts/multi_node/setup_multi_node.sh b/scripts/multi_node/setup_multi_node.sh new file mode 100755 index 0000000..6270a1a --- /dev/null +++ b/scripts/multi_node/setup_multi_node.sh @@ -0,0 +1,318 @@ +#!/bin/bash +# Interactive setup script for multi-node training +# Run this on Machine 1 (master node) + +set -e + +echo "================================================" +echo " Multi-Node Training Setup" +echo "================================================" +echo "" +echo "Prerequisites:" +echo " - You have access to multiple machines" +echo " - You can SSH into all machines from the master node" +echo " - You have the hostnames/IPs of all machines" +echo "" +read -p "Press Enter to continue..." +echo "" + +# Get current machine info +CURRENT_HOST=$(hostname) +CURRENT_IP=$(hostname -I | awk '{print $1}') + +echo "Current machine (Master Node):" +echo " Hostname: $CURRENT_HOST" +echo " IP: $CURRENT_IP" +echo "" + +# Ask for number of worker nodes +read -p "How many worker nodes? (default: 1): " NUM_WORKERS +NUM_WORKERS=${NUM_WORKERS:-1} + +if ! 
[[ "$NUM_WORKERS" =~ ^[0-9]+$ ]] || [[ "$NUM_WORKERS" -lt 1 ]]; then + echo "Error: Number of workers must be a positive integer" + exit 1 +fi + +echo "Setting up 1 master + $NUM_WORKERS worker(s) = $((NUM_WORKERS + 1)) total nodes" +echo "" + +# Collect worker hostnames +WORKER_HOSTS=() +for i in $(seq 1 $NUM_WORKERS); do + read -p "Enter hostname for worker $i: " WORKER_HOST + + if [[ -z "$WORKER_HOST" ]]; then + echo "Error: Worker hostname cannot be empty" + exit 1 + fi + + WORKER_HOSTS+=("$WORKER_HOST") +done +echo "" + +# Test SSH to all workers and collect IPs +echo "Testing SSH connections to workers..." +WORKER_IPS=() +for i in "${!WORKER_HOSTS[@]}"; do + WORKER_HOST="${WORKER_HOSTS[$i]}" + WORKER_NUM=$((i + 1)) + + echo "Worker $WORKER_NUM: $WORKER_HOST" + + if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$USER@$WORKER_HOST" "hostname" >/dev/null 2>&1; then + echo " [OK] SSH successful" + WORKER_IP=$(ssh "$USER@$WORKER_HOST" "hostname -I | awk '{print \$1}'") + echo " [OK] IP: $WORKER_IP" + WORKER_IPS+=("$WORKER_IP") + else + echo " [FAIL] SSH failed" + echo "" + echo "Fixes:" + echo " 1. Ensure your SSH key is registered with your cluster management system" + echo " 2. Generate and copy SSH key:" + echo " ssh-keygen -t rsa -b 4096 -C 'multi-node' -f ~/.ssh/id_rsa_cluster -N ''" + echo " # Then register the public key (~/.ssh/id_rsa_cluster.pub) with your cluster" + echo " 3. Or use ssh-copy-id if direct access is available:" + echo " ssh-copy-id -i ~/.ssh/id_rsa_cluster.pub $USER@$WORKER_HOST" + exit 1 + fi + echo "" +done + +# Test connectivity and reverse SSH +echo "Testing connectivity and reverse SSH..." +for i in "${!WORKER_HOSTS[@]}"; do + WORKER_HOST="${WORKER_HOSTS[$i]}" + WORKER_IP="${WORKER_IPS[$i]}" + WORKER_NUM=$((i + 1)) + + echo "Worker $WORKER_NUM ($WORKER_HOST):" + + # Test ping + if ping -c 2 "$WORKER_IP" >/dev/null 2>&1; then + echo " [OK] Ping successful" + else + echo " [WARN] Ping failed (might be okay if ICMP is blocked)" + fi + + # Test reverse SSH + if ssh "$USER@$WORKER_HOST" "ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no $USER@$CURRENT_HOST hostname" >/dev/null 2>&1; then + echo " [OK] Reverse SSH successful" + else + echo " [WARN] Reverse SSH failed - setting up passwordless SSH" + + # Generate key on worker if needed + ssh "$USER@$WORKER_HOST" "test -f ~/.ssh/id_rsa || ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa" >/dev/null 2>&1 + + # Copy worker's public key to master + WORKER_PUBKEY=$(ssh "$USER@$WORKER_HOST" "cat ~/.ssh/id_rsa.pub") + mkdir -p ~/.ssh + echo "$WORKER_PUBKEY" >> ~/.ssh/authorized_keys + chmod 600 ~/.ssh/authorized_keys + + # Test again + if ssh "$USER@$WORKER_HOST" "ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no $USER@$CURRENT_HOST hostname" >/dev/null 2>&1; then + echo " [OK] Reverse SSH now working" + else + echo " [FAIL] Reverse SSH still failing - manual setup needed" + fi + fi + echo "" +done + +# Check if code exists on workers +AORTA_PATH="$HOME/aorta" +echo "Checking code availability on workers..." 
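+# Shared-filesystem heuristic used below: if $AORTA_PATH has the same inode on the
+# master and on a worker, treat it as one shared (network) filesystem; otherwise the
+# worker needs a manual clone/rsync (tracked via SHARED_FS).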
+ +MASTER_INODE=$(stat -c %i "$AORTA_PATH" 2>/dev/null || echo "0") +MASTER_BRANCH=$(cd "$AORTA_PATH" && git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "not-a-git-repo") +SHARED_FS=true + +for i in "${!WORKER_HOSTS[@]}"; do + WORKER_HOST="${WORKER_HOSTS[$i]}" + WORKER_NUM=$((i + 1)) + + echo "Worker $WORKER_NUM ($WORKER_HOST):" + + if ssh "$USER@$WORKER_HOST" "test -d $AORTA_PATH" 2>/dev/null; then + echo " [OK] Code found: $AORTA_PATH" + + # Check if it's the same filesystem + WORKER_INODE=$(ssh "$USER@$WORKER_HOST" "stat -c %i $AORTA_PATH" 2>/dev/null || echo "0") + + if [[ "$MASTER_INODE" == "$WORKER_INODE" ]] && [[ "$MASTER_INODE" != "0" ]]; then + echo " [OK] Shared filesystem detected" + else + echo " [WARN] Separate filesystem - manual sync needed" + SHARED_FS=false + fi + + # Check git branch + WORKER_BRANCH=$(ssh "$USER@$WORKER_HOST" "cd $AORTA_PATH && git rev-parse --abbrev-ref HEAD 2>/dev/null" || echo "not-a-git-repo") + + if [[ "$MASTER_BRANCH" == "not-a-git-repo" ]] || [[ "$WORKER_BRANCH" == "not-a-git-repo" ]]; then + echo " [WARN] Not a git repository - cannot verify branch" + elif [[ "$MASTER_BRANCH" != "$WORKER_BRANCH" ]]; then + echo " [ERROR] Branch mismatch!" + echo " Master: $MASTER_BRANCH" + echo " Worker: $WORKER_BRANCH" + echo "" + echo " Fix: ssh $WORKER_HOST 'cd $AORTA_PATH && git checkout $MASTER_BRANCH && git pull'" + exit 1 + else + echo " [OK] Branch: $MASTER_BRANCH" + fi + else + echo " [FAIL] Code not found" + echo " You'll need to clone or rsync the code to: $AORTA_PATH" + SHARED_FS=false + fi + echo "" +done + +# Check GPUs +echo "Checking GPUs on all nodes..." +MASTER_GPUS=$(rocm-smi --showid 2>/dev/null | grep -c "GPU" || echo "unknown") +echo "Master: $MASTER_GPUS GPUs" + +GPU_MISMATCH=false +for i in "${!WORKER_HOSTS[@]}"; do + WORKER_HOST="${WORKER_HOSTS[$i]}" + WORKER_NUM=$((i + 1)) + WORKER_GPUS=$(ssh "$USER@$WORKER_HOST" "rocm-smi --showid 2>/dev/null | grep -c GPU" || echo "unknown") + + echo "Worker $WORKER_NUM: $WORKER_GPUS GPUs" + + if [[ "$MASTER_GPUS" != "$WORKER_GPUS" ]]; then + GPU_MISMATCH=true + fi +done + +if [[ "$GPU_MISMATCH" == "true" ]]; then + echo "[WARN] GPU count mismatch detected" + echo " Use --nproc flag with master_launch.sh to specify GPU count per node" +fi +echo "" + +# Detect network interface +echo "Detecting network interface..." +INTERFACE=$(ifconfig 2>/dev/null | grep -E "^(ib|enp|eth)" | head -1 | cut -d: -f1 || echo "unknown") +if [[ "$INTERFACE" == "unknown" ]]; then + INTERFACE=$(ip addr show 2>/dev/null | grep -E "^[0-9]+: (ib|enp|eth)" | head -1 | awk '{print $2}' | tr -d ':' || echo "eth0") +fi + +echo " Detected interface: $INTERFACE" + +# Ask user to confirm or change +read -p "Network interface for NCCL (press Enter to use detected, or enter different name): " USER_INTERFACE +if [[ -n "$USER_INTERFACE" ]]; then + INTERFACE="$USER_INTERFACE" + echo " Using: $INTERFACE" +else + echo " Using detected interface: $INTERFACE" +fi +echo "" + +# Create node_ip_list.txt (stores hostnames, not IPs) in scripts/multi_node/ +echo "Creating node_ip_list.txt (with hostnames)..." 
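+# Resulting file layout (hostnames only, master first), illustrative names:
+#   master-node
+#   worker-node-1
+#   worker-node-2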
+NODE_IP_FILE="$AORTA_PATH/scripts/multi_node/node_ip_list.txt" + +# Write master hostname first +echo "$CURRENT_HOST" > "$NODE_IP_FILE" + +# Add all worker hostnames +for WORKER_HOST in "${WORKER_HOSTS[@]}"; do + echo "$WORKER_HOST" >> "$NODE_IP_FILE" +done + +echo "[OK] Created $NODE_IP_FILE:" +cat "$NODE_IP_FILE" +echo "" +echo "[INFO] File contains hostnames (not IPs) for SSH compatibility with config files" +echo "" + +# Update network interface in set_env_variables.sh +echo "Updating network interface in set_env_variables.sh..." +if [[ -f "scripts/multi_node/set_env_variables.sh" ]]; then + # Backup original + cp scripts/multi_node/set_env_variables.sh scripts/multi_node/set_env_variables.sh.bak + + # Update interface + sed -i "s/export NCCL_SOCKET_IFNAME=.*/export NCCL_SOCKET_IFNAME=$INTERFACE/" scripts/multi_node/set_env_variables.sh + + echo "[OK] Updated NCCL_SOCKET_IFNAME=$INTERFACE" +else + echo "[WARN] set_env_variables.sh not found - manual configuration needed" +fi +echo "" + +# Summary +echo "================================================" +echo " Setup Complete!" +echo "================================================" +echo "" +echo "Configuration Summary:" +echo " Total Nodes: $((NUM_WORKERS + 1)) (1 master + $NUM_WORKERS workers)" +echo " Network Interface: $INTERFACE" +echo " Shared Filesystem: ${SHARED_FS:-false}" +echo "" +echo " Master: $CURRENT_HOST ($CURRENT_IP) - $MASTER_GPUS GPUs" +for i in "${!WORKER_HOSTS[@]}"; do + WORKER_HOST="${WORKER_HOSTS[$i]}" + WORKER_IP="${WORKER_IPS[$i]}" + WORKER_NUM=$((i + 1)) + WORKER_GPUS=$(ssh "$USER@$WORKER_HOST" "rocm-smi --showid 2>/dev/null | grep -c GPU" || echo "unknown") + echo " Worker $WORKER_NUM: $WORKER_HOST ($WORKER_IP) - $WORKER_GPUS GPUs" +done +echo "" +echo "Node IP list created at:" +echo " $AORTA_PATH/scripts/multi_node/node_ip_list.txt" +echo "" + +if [[ "${SHARED_FS:-false}" == "false" ]]; then + echo "[IMPORTANT] Sync code to all workers before running:" + for WORKER_HOST in "${WORKER_HOSTS[@]}"; do + echo " ssh $WORKER_HOST 'cd ~/ && git clone aorta'" + done + echo " OR use rsync:" + for WORKER_HOST in "${WORKER_HOSTS[@]}"; do + echo " rsync -avz $AORTA_PATH/ $WORKER_HOST:$AORTA_PATH/" + done + echo "" +fi + +echo "Next Steps:" +echo "" +echo "1. Start Docker on all nodes (run once, containers persist):" +echo " cd $AORTA_PATH" +echo " ./scripts/multi_node/start_docker_all_nodes.sh" +echo "" +echo "2. Launch training (run as many times as you want):" +echo " ./scripts/multi_node/master_launch.sh --channels 28 --threads 256" +echo "" +echo "3. 
Monitor training:" +echo " tail -f experiments/multinode_*/logs/node_*.txt" +echo "" +echo "Additional Options:" +echo "" +echo " Different parameters:" +echo " ./scripts/multi_node/master_launch.sh --channels 42 --threads 512 --nproc 8" +echo "" +echo " With profiling:" +echo " ./scripts/multi_node/master_launch.sh --channels 28 --threads 256 --rocprof --stats" +echo "" +echo " Custom config:" +echo " ./scripts/multi_node/master_launch.sh --config config/distributed_two_nodes.yaml" +echo "" +echo "Stop training on all nodes:" +for WORKER_HOST in "${WORKER_HOSTS[@]}"; do + echo " ssh $WORKER_HOST 'pkill -9 -f train.py'" +done +echo " pkill -9 -f train.py # On master" +echo "" +echo "Stop Docker when completely done:" +echo " for IP in \$(cat node_ip_list.txt); do" +echo " ssh \$USER@\$IP 'cd $AORTA_PATH/docker && docker compose -f docker-compose.rocm70_9-1.yaml down'" +echo " done" +echo "" diff --git a/scripts/multi_node/start_docker_all_nodes.sh b/scripts/multi_node/start_docker_all_nodes.sh new file mode 100755 index 0000000..f93ccd7 --- /dev/null +++ b/scripts/multi_node/start_docker_all_nodes.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# Start Docker containers on all nodes for multi-node training +# Usage: ./start_docker_all_nodes.sh [docker-compose-file] [container-name] + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +AORTA_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +MACHINE_IP_FILE="$SCRIPT_DIR/node_ip_list.txt" # Contains hostnames or IPs + +# Allow custom docker-compose file and container name via arguments +DOCKER_COMPOSE_FILE="${1:-docker/docker-compose.rocm70_9-1.yaml}" +DOCKER_CONTAINER="${2:-training-overlap-bugs-rocm70_9-1}" + +if [[ ! -f "$MACHINE_IP_FILE" ]]; then + echo "Error: $MACHINE_IP_FILE not found" + echo "Run setup_multi_node.sh first" + exit 1 +fi + +cd "$AORTA_ROOT" + +# Check git branch consistency before starting Docker +echo "=== Checking git branch consistency ===" +MASTER_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "not-a-git-repo") + +if [[ "$MASTER_BRANCH" != "not-a-git-repo" ]]; then + echo "Master node branch: $MASTER_BRANCH" + TOTAL_NODES=$(grep -c . "$MACHINE_IP_FILE" || echo "0") + echo "Found $TOTAL_NODES nodes in $MACHINE_IP_FILE" + echo "" + + node=0 + while IFS= read -r HOST || [[ -n "$HOST" ]]; do # HOST can be hostname or IP + # Skip empty lines + if [[ -z "$HOST" ]]; then + continue + fi + + if [[ "$node" -gt 0 ]]; then + echo "[STAGE] Checking worker node $node ($HOST)..." + WORKER_BRANCH=$(ssh -n -o ConnectTimeout=10 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$USER@$HOST" "cd ~/aorta && git rev-parse --abbrev-ref HEAD 2>/dev/null" || echo "not-a-git-repo") + + if [[ "$WORKER_BRANCH" == "not-a-git-repo" ]]; then + echo " [WARN] Worker node $HOST: Not a git repository" + elif [[ "$MASTER_BRANCH" != "$WORKER_BRANCH" ]]; then + echo " [ERROR] Branch mismatch on node $HOST!" 
+ echo " Master: $MASTER_BRANCH" + echo " Worker: $WORKER_BRANCH" + echo "" + echo "Fix: ssh $USER@$HOST 'cd ~/aorta && git checkout $MASTER_BRANCH && git pull'" + exit 1 + else + echo " Worker node $HOST: $WORKER_BRANCH [OK]" + fi + fi + ((node++)) || true + done < "$MACHINE_IP_FILE" + echo "" + echo "Branch check complete [OK]" + echo "" +else + echo "[WARN] Not a git repository - skipping branch check" + echo "" +fi + +echo "=== Starting Docker containers on all nodes ===" +TOTAL_NODES=$(wc -l < "$MACHINE_IP_FILE") +echo "Total nodes to process: $TOTAL_NODES" +echo "" + +node=0 +while IFS= read -r HOST || [[ -n "$HOST" ]]; do # HOST can be hostname or IP + if [[ -z "$HOST" ]]; then + continue + fi + + echo "Node $node (Host: $HOST):" + + if [[ "$node" -eq 0 ]]; then + # Master node (local) + echo " [STAGE] Checking existing containers on master..." + if docker ps --format '{{.Names}}' < /dev/null | grep -q "^${DOCKER_CONTAINER}$"; then + echo " [INFO] Container already running, restarting..." + fi + + echo " [STAGE] Running docker compose up -d on master..." + COMPOSE_FILE_PATH="${DOCKER_COMPOSE_FILE#docker/}" + cd docker && docker compose -f "$COMPOSE_FILE_PATH" up -d < /dev/null && cd .. + + echo " [STAGE] Verifying master container..." + if docker ps --format '{{.Names}}' < /dev/null | grep -q "^${DOCKER_CONTAINER}$"; then + echo " [OK] Docker container '${DOCKER_CONTAINER}' is running" + else + echo " [FAIL] Failed to start Docker container" + exit 1 + fi + else + # Worker nodes (via SSH) + echo " [STAGE] Connecting to worker via SSH..." + if ! ssh -n -o ConnectTimeout=10 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$USER@$HOST" "echo 'SSH connection successful'" > /dev/null 2>&1; then + echo " [FAIL] Cannot SSH to worker node $HOST" + exit 1 + fi + echo " [OK] SSH connection successful" + + echo " [STAGE] Checking existing containers on worker..." + if ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$USER@$HOST" "docker ps --format '{{.Names}}'" | grep -q "^${DOCKER_CONTAINER}$"; then + echo " [INFO] Container already running, restarting..." + fi + + echo " [STAGE] Running docker compose up -d on worker..." + COMPOSE_FILE_PATH="${DOCKER_COMPOSE_FILE#docker/}" + ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$USER@$HOST" \ + "cd /home/$USER/aorta/docker && docker compose -f $COMPOSE_FILE_PATH up -d" + + echo " [STAGE] Verifying worker container..." 
+ if ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$USER@$HOST" "docker ps --format '{{.Names}}'" | grep -q "^${DOCKER_CONTAINER}$"; then + echo " [OK] Docker container '${DOCKER_CONTAINER}' is running on worker" + else + echo " [FAIL] Failed to start Docker container on worker" + exit 1 + fi + fi + + echo "" + ((node++)) || true +done < "$MACHINE_IP_FILE" + +echo "=== All Docker containers started successfully ===" +echo "Docker container: $DOCKER_CONTAINER" +echo "" +echo "Verify with:" +echo " docker ps # Check master" +while IFS= read -r HOST || [[ -n "$HOST" ]]; do + if [[ -z "$HOST" ]]; then continue; fi + if [[ "$node" -gt 1 ]]; then + echo " ssh $USER@$HOST 'docker ps' # Check worker" + fi + ((node++)) +done < "$MACHINE_IP_FILE" +echo "" +echo "Ready to launch training:" +echo " ./scripts/multi_node/master_launch.sh --channels 28 --threads 256 --config .yaml" diff --git a/src/aorta/training/fsdp_trainer.py b/src/aorta/training/fsdp_trainer.py index b38f260..eea1923 100644 --- a/src/aorta/training/fsdp_trainer.py +++ b/src/aorta/training/fsdp_trainer.py @@ -70,6 +70,9 @@ class FSDPConfig: forward_prefetch: bool = True sync_module_states: bool = True param_init_device: str = "cpu" + # For HYBRID_SHARD: GPUs per node (None = auto-detect from LOCAL_WORLD_SIZE env var) + # Only set this if auto-detection fails or you want to override + hybrid_shard_gpus_per_node: Optional[int] = None @dataclass @@ -245,9 +248,17 @@ def build_fsdp_model( transformer_auto_wrap_policy, transformer_layer_cls={nn.TransformerEncoderLayer} ) + # Create process groups for hybrid_shard strategy + process_group = None + if sharding == ShardingStrategy.HYBRID_SHARD: + process_group = _create_hybrid_shard_process_groups(fsdp_cfg.hybrid_shard_gpus_per_node) + if process_group is not None: + log.info("Created custom process groups for HYBRID_SHARD strategy") + fsdp_model = FSDP( model.to(device), sharding_strategy=sharding, + process_group=process_group, auto_wrap_policy=auto_wrap_policy, use_orig_params=fsdp_cfg.use_orig_params, backward_prefetch=backward_prefetch, @@ -261,6 +272,81 @@ def build_fsdp_model( return fsdp_model +def _create_hybrid_shard_process_groups(gpus_per_node: Optional[int] = None): + """ + Create process groups for HYBRID_SHARD strategy. + + Args: + gpus_per_node: Number of GPUs per node. If None, auto-detects from LOCAL_WORLD_SIZE. + This should match the --nproc value from torchrun. + + Returns: + Tuple of (shard_group, replicate_group) or None + """ + if not dist.is_initialized(): + return None + + world_size = dist.get_world_size() + rank = dist.get_rank() + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + + # Auto-detect GPUs per node from environment if not provided + if gpus_per_node is None: + # torchrun sets LOCAL_WORLD_SIZE to the number of processes per node + local_world_size_str = os.environ.get("LOCAL_WORLD_SIZE") + if local_world_size_str: + gpus_per_node = int(local_world_size_str) + log.info("Auto-detected gpus_per_node=%d from LOCAL_WORLD_SIZE", gpus_per_node) + else: + log.error( + "Cannot determine gpus_per_node: LOCAL_WORLD_SIZE not set and " + "hybrid_shard_gpus_per_node not configured. Set hybrid_shard_gpus_per_node in config." 
+ ) + return None + + # Validate configuration + if world_size % gpus_per_node != 0: + log.error( + "Invalid HYBRID_SHARD config: world_size=%d not divisible by gpus_per_node=%d", + world_size, gpus_per_node + ) + return None + + num_nodes = world_size // gpus_per_node + node_id = rank // gpus_per_node + + if num_nodes <= 1: + log.warning("HYBRID_SHARD with single node - consider using FULL_SHARD instead") + return None + + log.info( + "Creating HYBRID_SHARD process groups | rank=%d world_size=%d num_nodes=%d gpus_per_node=%d node_id=%d", + rank, world_size, num_nodes, gpus_per_node, node_id + ) + + # Intra-node groups: shard within each node + for i in range(num_nodes): + ranks_in_node = list(range(i * gpus_per_node, (i + 1) * gpus_per_node)) + group = dist.new_group(ranks=ranks_in_node) + if i == node_id: + my_shard_group = group + + # Inter-node groups: replicate across nodes (same local_rank) + for local_r in range(gpus_per_node): + ranks_across_nodes = [node * gpus_per_node + local_r for node in range(num_nodes)] + group = dist.new_group(ranks=ranks_across_nodes) + if local_r == local_rank: + my_replicate_group = group + + log.info( + "Created process groups | shard_group_size=%d replicate_group_size=%d", + dist.get_world_size(my_shard_group), + dist.get_world_size(my_replicate_group), + ) + + return (my_shard_group, my_replicate_group) + + def build_ddp_model( model_cfg: ModelConfig, ddp_cfg: DDPConfig,