diff --git a/.gitignore b/.gitignore index 0aaa6d8..b40cab9 100644 --- a/.gitignore +++ b/.gitignore @@ -42,19 +42,25 @@ checkpoints/ runs/ logs/ tensorboard/ +experiments/ +artifacts/ +artifacts_*/ +overlap_debug*/ # Experiment outputs and traces -experiments/ *.json +*.jsonl *.xlsx +*.log +*.logs *_merged_enhanced.json nccl_thread_sweep_*.log nccl_thread_sweep_summary_*.txt expected_outputs/ testdata/ -*.log actual_outputs/ *.html + # IDE/project-specific folders .vscode/ .idea/ @@ -68,7 +74,13 @@ Thumbs.db # Data files *.dat +# Profiler traces tmp_prof.* rocprof_traces overlap.* .*repro.* +*.gz +trace_* + +# Multi-node IP configuration (should be customized per deployment) +node_ip_list.txt diff --git a/config/multi_node/distributed_multinode.yaml b/config/multi_node/distributed_multinode.yaml index 143c680..acd9162 100644 --- a/config/multi_node/distributed_multinode.yaml +++ b/config/multi_node/distributed_multinode.yaml @@ -5,7 +5,7 @@ training: batch_size: 448 gradient_accumulation: 4 mixed_precision: bf16 - max_steps: 2000 + max_steps: 10 grad_clip_norm: 1.0 output_dir: artifacts/optuna_tracelens_sweep_fixed/trial_0000 log_interval: 20 @@ -46,6 +46,8 @@ fsdp: forward_prefetch: true sync_module_states: true param_init_device: meta + # hybrid_shard_gpus_per_node: auto-detects from --nproc (via LOCAL_WORLD_SIZE) + # Only set manually if auto-detection fails distributed: backend: nccl mode: fsdp @@ -54,7 +56,7 @@ distributed: static_graph: true find_unused_parameters: false compile: - enabled: true + enabled: false backend: inductor mode: max-autotune fullgraph: false @@ -82,10 +84,10 @@ dataloader: pin_memory: true profiling: enabled: true - wait: 5 - warmup: 5 - active: 1 - repeat: 1 + wait: 1 + warmup: 2 + active: 5 + repeat: 0 record_shapes: false profile_memory: false with_stack: false diff --git a/config/multi_node/shampoo_opt_multi_node.yaml b/config/multi_node/shampoo_opt_multi_node.yaml new file mode 100644 index 0000000..88fc42b --- /dev/null +++ b/config/multi_node/shampoo_opt_multi_node.yaml @@ -0,0 +1,113 @@ +# Shampoo-only variant of the user workload. +# NOTE: User reports this configuration produces NaNs when using Shampoo, +# while AdamW and other optimizers remain healthy. Monitor loss closely. 
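+# Suggested triage (assumption, not verified on this workload): rerun once with
+# optimizer.name: adamw to confirm the rest of this config is healthy, then
+# re-enable shampoo with a reduced lr before tuning anything else.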
+ +logging: + level: INFO + +training: + epochs: 10 + batch_size: 512 + gradient_accumulation: 2 + mixed_precision: bf16 + max_steps: 2200 + grad_clip_norm: 1.0 + output_dir: artifacts/user_shampoo + log_interval: 20 + additional_compute_streams: 2 + lightweight_op_waves: 3 + +optimizer: + name: shampoo + lr: 0.0002 + weight_decay: 0.01 + betas: [0.9, 0.985] + eps: 1.0e-8 + +scheduler: + warmup_steps: 200 + total_steps: 2200 + +dataset: + num_samples: 200000 + sequence_length: 160 + dense_dim: 256 + sparse_features: 64 + vocab_size: 350000 + num_dense_features: 32 + seed: 2025 + +model: + vocab_size: 350000 + embedding_dim: 256 + num_dense_features: 32 + dense_dim: 256 + model_dim: 1024 + num_heads: 16 + num_layers: 18 + dropout: 0.1 + mlp_hidden_dim: 4096 + +fsdp: + sharding_strategy: hybrid_shard + backward_prefetch: BACKWARD_PRE + use_orig_params: true + limit_all_gathers: true + forward_prefetch: true + sync_module_states: true + param_init_device: meta + +distributed: + backend: nccl + mode: fsdp + bucket_cap_mb: 128 + gradient_as_bucket_view: true + static_graph: true + find_unused_parameters: false + +compile: + enabled: false + backend: inductor + mode: max-autotune + fullgraph: false + dynamic: false + +streams: + num_streams: 4 + high_priority: + - allreduce + - reducescatter + stream_assignments: + compute: + - dev6_stream3 + - dev6_stream9 + communication: + - dev6_stream13 + - dev6_stream17 + reducescatter: + - dev6_stream22 + aux: + - dev6_stream0 + +dataloader: + num_workers: 0 + pin_memory: true + +profiling: + enabled: true + wait: 2 + warmup: 2 + active: 6 + repeat: 1 + record_shapes: true + profile_memory: true + with_stack: false + with_flops: false + # tensorboard: true + # chrome_trace: true + tensorboard: false + chrome_trace: false + trace_filename: user_shampoo.json + +tracelens: + enabled: false diff --git a/docker/Dockerfile.rocm70_9-1-shampoo b/docker/Dockerfile.rocm70_9-1-shampoo new file mode 100644 index 0000000..4ce3c4f --- /dev/null +++ b/docker/Dockerfile.rocm70_9-1-shampoo @@ -0,0 +1,57 @@ +# Start from the existing PyTorch ROCm image +FROM rocm/pytorch-private:20251030_rocm_e2e_phantom_mi350_genai_nightly + +# Switch to root to install packages +USER root + +# Install wget and git if not available +RUN yum install -y wget git 2>/dev/null || apt-get update && apt-get install -y wget git || true + +# Download and install amdgpu-install package for RHEL +RUN wget https://artifactory-cdn.amd.com/artifactory/list/amdgpu-rpm/rhel/amdgpu-install-internal-7.0_9-1.noarch.rpm && \ + yum install -y ./amdgpu-install-internal-7.0_9-1.noarch.rpm || rpm -ivh ./amdgpu-install-internal-7.0_9-1.noarch.rpm && \ + rm amdgpu-install-internal-7.0_9-1.noarch.rpm + +# Update amdgpu-repo with specific builds +RUN amdgpu-repo --amdgpu-build=2247890 --rocm-build=compute-rocm-rel-7.0-meta/7 + +# Since base image already has ROCm, just update the key runtime components +RUN yum update -y --skip-broken \ + rocm-hip \ + rocm-libs \ + rocm-hip-libraries \ + rocm-hip-runtime-devel \ + hip-base \ + hip-dev \ + hip-runtime-amd \ + rocm-core || echo "Updated available ROCm packages" + +RUN python3.10 -m pip install git+https://github.com/AMD-AGI/TraceLens.git +RUN python3.10 -m pip install openpyxl seaborn + +# Download and install rocprof-trace-decoder +RUN wget https://github.com/ROCm/rocprof-trace-decoder/releases/download/0.1.6/rocprof-trace-decoder-manylinux-2.28-0.1.6-Linux.rpm \ + -O /tmp/rocprof-trace-decoder.rpm && \ + rpm -i /tmp/rocprof-trace-decoder.rpm && \ + rm 
/tmp/rocprof-trace-decoder.rpm + +# Install Facebook Distributed Shampoo optimizer +RUN cd /tmp && \ + git clone https://github.com/facebookresearch/optimizers.git && \ + cd optimizers && \ + python3.10 -m pip install . && \ + cd / && \ + rm -rf /tmp/optimizers + +# Verify Shampoo installation +RUN python3.10 -c "import distributed_shampoo; print('[OK] Shampoo optimizer installed successfully')" + +# Update environment variables +ENV ROCM_HOME=/opt/rocm +ENV PATH=$ROCM_HOME/bin:$PATH +ENV LD_LIBRARY_PATH=$ROCM_HOME/lib:$LD_LIBRARY_PATH + +# Set working directory +WORKDIR /workspace/aorta + +CMD ["/bin/bash"] diff --git a/docker/docker-compose.rocm70_9-1-shampoo.yaml b/docker/docker-compose.rocm70_9-1-shampoo.yaml new file mode 100644 index 0000000..232e3a3 --- /dev/null +++ b/docker/docker-compose.rocm70_9-1-shampoo.yaml @@ -0,0 +1,35 @@ +services: + torchenv-rocm70-shampoo: + container_name: training-overlap-bugs-rocm70_9-1-shampoo + build: + context: . + dockerfile: Dockerfile.rocm70_9-1-shampoo + network: host + user: root + privileged: true + network_mode: host + group_add: + - video + ipc: host + cap_add: + - SYS_PTRACE + security_opt: + - seccomp=unconfined + environment: + - RCCL_FOLDER=/rccl + - LD_LIBRARY_PATH=/rccl/build/release:${LD_LIBRARY_PATH:-} + - TORCH_NCCL_HIGH_PRIORITY=1 + + volumes: + - /home/manrao:/manrao + - /home/oyazdanb/aorta:/workspace/aorta + devices: + - /dev/kfd + - /dev/dri + working_dir: /workspace/aorta + shm_size: 17G + ulimits: + memlock: -1 + stack: 67108864 + stdin_open: true + tty: true diff --git a/docker/docker-compose.rocm70_9-1.yaml b/docker/docker-compose.rocm70_9-1.yaml index c4704ae..7f082e2 100644 --- a/docker/docker-compose.rocm70_9-1.yaml +++ b/docker/docker-compose.rocm70_9-1.yaml @@ -4,6 +4,7 @@ services: build: context: . dockerfile: Dockerfile.rocm70_9-1 + network: host user: root privileged: true network_mode: host diff --git a/scripts/multi_node/README.md b/scripts/multi_node/README.md new file mode 100644 index 0000000..d4e1786 --- /dev/null +++ b/scripts/multi_node/README.md @@ -0,0 +1,239 @@ +# Multi-Node Training + +Scripts for multi-node distributed training with custom NCCL channel and thread configurations. 
+ +## Table of Contents + +- [Quick Start](#quick-start) +- [Slurm Setup](#slurm-setup) +- [Usage](#usage) +- [Stopping Training](#stopping-training) +- [Troubleshooting](#troubleshooting) +- [NCCL Configuration](#nccl-configuration) +- [Conductor Setup](#conductor-setup) + +## Prerequisites + +- 2+ machines with ROCm GPUs, Docker, network connectivity (host mode) +- Passwordless SSH between nodes +- `scripts/multi_node/node_ip_list.txt` with node hostnames - master first +- All nodes on same git branch + +## File Structure + +``` +aorta/ +├── scripts/multi_node/ +│ ├── master_launch.sh # Main entrypoint +│ ├── start_docker_all_nodes.sh # Start Docker on all nodes +│ ├── setup_multi_node.sh # Automated setup (2+ nodes) +│ ├── config_node.sh # Per-node setup +│ ├── local_launch.sh # Per-node training (runs in Docker) +│ ├── set_env_variables.sh # NCCL/RCCL config +│ └── node_ip_list.txt # Node hostnames +├── docker/ +│ ├── docker-compose.rocm70_9-1.yaml # Base Docker config +│ └── docker-compose.rocm70_9-1-shampoo.yaml # Docker with Shampoo optimizer +├── config/ +│ ├── multi_node/ +│ │ └── distributed_multinode.yaml # Default config +│ └── shampoo_opt.yaml # Shampoo optimizer config +``` + +## Quick Start + +```bash +# First time setup (once per allocation) +scontrol show hostnames $SLURM_NODELIST > scripts/multi_node/node_ip_list.txt +./scripts/multi_node/start_docker_all_nodes.sh + +# Run training +./scripts/multi_node/master_launch.sh --channels 28 --threads 256 --nproc 8 +``` + +World size: `NPROC_PER_NODE × NUM_NODES` (e.g., 8 GPUs/node × 2 nodes = 16) + +--- + +## Slurm Setup + +### Step 1: Pull Base Docker Image + +```bash +docker pull rocm/pytorch-private:20251030_rocm_e2e_phantom_mi350_genai_nightly + +# If authentication required +docker login +``` + +### Step 2: Allocate Nodes + +```bash +# From head node +salloc -N 3 -p gpu_partition -t 4:00:00 +squeue -u $USER +``` + +### Step 3: Create node_ip_list.txt + +```bash +cd /path/to/aorta/scripts/multi_node +scontrol show hostnames $SLURM_NODELIST > node_ip_list.txt +cat node_ip_list.txt +``` + +### Step 4: SSH to Master and Test Connectivity + +```bash +ssh node1-hostname +cd /path/to/aorta + +# Test worker connectivity +ssh node2-hostname hostname +ssh node3-hostname hostname +``` + +### Step 5: Pull Image on All Nodes + +```bash +for HOST in $(cat scripts/multi_node/node_ip_list.txt); do + ssh $HOST "docker pull rocm/pytorch-private:20251030_rocm_e2e_phantom_mi350_genai_nightly" +done +``` + +### Step 6: Start Docker and Run Training + +```bash +./scripts/multi_node/start_docker_all_nodes.sh + +./scripts/multi_node/master_launch.sh --channels 28 --threads 256 --nproc 8 +``` + +--- + +## Usage + +```bash +# Basic launch (defaults: 28 channels, 256 threads, 8 GPUs/node) +./scripts/multi_node/master_launch.sh + +# Custom parameters +./scripts/multi_node/master_launch.sh -c 28 -t 256 -p 4 -f config/custom.yaml +``` + +### Parameters + +| Flag | Option | Default | Description | +|------|--------|---------|-------------| +| -c | --channels | 28 | NCCL_MAX_NCHANNELS | +| -t | --threads | 256 | RCCL_THREADS_PER_BLOCK | +| -p | --nproc | 8 | GPUs per node | +| -f | --config | config/multi_node/distributed_multinode.yaml | Config file | +| -r | --rocprof | false | Enable rocprofv3 | +| -m | --stats | false | rocprof stats | +| | --rocprof-input | none | rocprof yaml | +| | --master-port | auto | Master port | + +Environment variables: `CHANNELS=42 THREADS=512 ./scripts/multi_node/master_launch.sh` + +GPU subset: Use `-p 4` or `export 
CUDA_VISIBLE_DEVICES=0,2,4,6` + +### Custom Config + +Select a config file from `config/` or `config/multi_node/`: + +```bash +./scripts/multi_node/master_launch.sh \ + --channels 28 --threads 256 \ + --config config/multi_node/distributed_multinode.yaml +``` + +### Monitoring + +```bash +tail -f experiments/multinode_*/logs/node_*.txt # All nodes +tail -f experiments/multinode_*/logs/node_0_*.txt # Master only +cat experiments/multinode_*/outputs/rank_00_metrics.jsonl | tail -n 5 # Metrics +``` + +--- + +## Stopping Training + +`Ctrl+C` stops monitoring but training continues in background. + +To stop training: +```bash +for HOST in $(cat scripts/multi_node/node_ip_list.txt); do + ssh $HOST "docker exec training-overlap-bugs-rocm70_9-1 pkill -9 -f 'train.py|torchrun'" +done +``` + +--- + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| Script hangs | Check last [STAGE] message | +| SSH fails | `ssh-copy-id $USER@` | +| Docker version mismatch | `docker compose version` on each node | +| NCCL timeout | Update `NCCL_SOCKET_IFNAME` in `set_env_variables.sh` | +| World size mismatch | Check `rocm-smi --showid \| wc -l`, adjust `--nproc` | + +--- + +## NCCL Configuration + +Edit `set_env_variables.sh`: + +**InfiniBand:** +```bash +export NCCL_IB_DISABLE=0 +export NCCL_IB_HCA=mlx5_0 # Check: ibstat +export NCCL_IB_GID_INDEX=3 +export NCCL_SOCKET_IFNAME=ib0 +``` + +**Ethernet:** +```bash +export NCCL_IB_DISABLE=1 +export NCCL_SOCKET_IFNAME=eth0 +export NCCL_NSOCKS_PERTHREAD=4 +export NCCL_SOCKET_NTHREADS=2 +``` + +**Debug:** `export NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=ALL` + +--- + +## Conductor Setup + +For Conductor environments with SSH key management: + +### SSH Key Setup + +```bash +ssh-keygen -t rsa -b 4096 -C "conductor-multi-node" -f ~/.ssh/id_rsa_conductor -N '' +cat ~/.ssh/id_rsa_conductor.pub +``` + +Register public key with your cluster's SSH key management system. + +```bash +cat >> ~/.ssh/config << 'EOF' +Host *.dcgpu smci350-* *.zts-gtu.dcgpu + IdentityFile ~/.ssh/id_rsa_conductor + StrictHostKeyChecking no +EOF +chmod 600 ~/.ssh/config +``` + +### Run Setup and Start Docker + +```bash +./scripts/multi_node/setup_multi_node.sh +./scripts/multi_node/start_docker_all_nodes.sh +``` + +Creates `node_ip_list.txt` with hostnames, detects network interfaces, verifies SSH and git branches. 
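+
+To re-check connectivity later without rerunning the full setup, a minimal sketch (assumes the default repo layout and that `node_ip_list.txt` already exists):
+
+```bash
+# Verify passwordless SSH to every node listed in node_ip_list.txt
+while IFS= read -r HOST; do
+  [[ -z "$HOST" ]] && continue
+  if ssh -o BatchMode=yes -o ConnectTimeout=5 "$USER@$HOST" hostname >/dev/null 2>&1; then
+    echo "[OK]   $HOST"
+  else
+    echo "[FAIL] $HOST"
+  fi
+done < scripts/multi_node/node_ip_list.txt
+```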
diff --git a/scripts/multi_node/config_node.sh b/scripts/multi_node/config_node.sh new file mode 100755 index 0000000..ddd631e --- /dev/null +++ b/scripts/multi_node/config_node.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Per-node configuration and launch script for Aorta GEMM training +# This script runs on each node (via SSH or locally) + +NODE_RANK=$(echo "$1" | sed 's/"//g') +NODE_IP=$(echo "$2" | sed 's/"//g') +MASTER_IP=$(echo "$3" | sed 's/"//g') +MASTER_PORT=$(echo "$4" | sed 's/"//g') +NNODES=$(echo "$5" | sed 's/"//g') +WORLD_SIZE=$(echo "$6" | sed 's/"//g') +WORKDIR=$(echo "$7" | sed 's/"//g') +EXPERIMENT_DIR=$(echo "$8" | sed 's/"//g') +CONFIG_FILE=$(echo "$9" | sed 's/"//g') +NPROC_PER_NODE=$(echo "${10}" | sed 's/"//g') +CHANNELS=$(echo "${11}" | sed 's/"//g') +THREADS=$(echo "${12}" | sed 's/"//g') +ENABLE_ROCPROF=$(echo "${13}" | sed 's/"//g') +ROCPROF_STATS=$(echo "${14}" | sed 's/"//g') +ROCPROF_INPUT=$(echo "${15}" | sed 's/"//g') + +echo "============================================" +echo "Node Configuration" +echo "============================================" +echo "Node Rank: $NODE_RANK" +echo "Node IP: $NODE_IP" +echo "Master IP: $MASTER_IP" +echo "Master Port: $MASTER_PORT" +echo "Number of Nodes: $NNODES" +echo "World Size: $WORLD_SIZE GPUs" +echo "Processes per node: $NPROC_PER_NODE" +echo "Work Directory: $WORKDIR" +echo "Experiment Directory: $EXPERIMENT_DIR" +echo "Config File: $CONFIG_FILE" +echo "Channels: $CHANNELS" +echo "Threads: $THREADS" +echo "============================================" +echo "" + +# Change to working directory +cd "$WORKDIR" || exit 1 + +# Activate virtual environment if it exists +VENV_PATH="$WORKDIR/.venv" +if [[ -d "$VENV_PATH" ]]; then + echo "Activating virtual environment at $VENV_PATH" + source "$VENV_PATH/bin/activate" +fi + +# Source common environment variables +if [[ -f "$WORKDIR/scripts/multi_node/set_env_variables.sh" ]]; then + echo "Sourcing set_env_variables.sh" + source "$WORKDIR/scripts/multi_node/set_env_variables.sh" +else + echo "Warning: set_env_variables.sh not found, using default NCCL settings" + export NCCL_DEBUG=WARN + export NCCL_IB_DISABLE=0 + export NCCL_SOCKET_IFNAME=eth0 +fi + +echo "" +echo "Environment configured. Starting GEMM training..." 
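+# local_launch.sh reads its positional arguments in exactly this order:
+#   NODE_RANK NODE_IP MASTER_IP MASTER_PORT NNODES WORLD_SIZE EXPERIMENT_DIR
+#   CONFIG_FILE NPROC_PER_NODE CHANNELS THREADS ENABLE_ROCPROF ROCPROF_STATS ROCPROF_INPUT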
+echo "" + +# Launch local_launch.sh with all parameters +"$WORKDIR/scripts/multi_node/local_launch.sh" \ + "$NODE_RANK" "$NODE_IP" "$MASTER_IP" "$MASTER_PORT" "$NNODES" "$WORLD_SIZE" \ + "$EXPERIMENT_DIR" "$CONFIG_FILE" "$NPROC_PER_NODE" "$CHANNELS" "$THREADS" \ + "$ENABLE_ROCPROF" "$ROCPROF_STATS" "$ROCPROF_INPUT" + +echo "" +echo "============================================" +echo "Node $NODE_RANK training completed" +echo "============================================" diff --git a/scripts/multi_node/local_launch.sh b/scripts/multi_node/local_launch.sh new file mode 100755 index 0000000..f517906 --- /dev/null +++ b/scripts/multi_node/local_launch.sh @@ -0,0 +1,162 @@ +#!/bin/bash +# Multi-node local launch script for GEMM training +# Runs on each node with single channel/thread configuration + +if [[ $# -lt 11 ]]; then + echo "Usage: $0 [ENABLE_ROCPROF] [ROCPROF_STATS] [ROCPROF_INPUT]" + exit 1 +fi + +NODE_RANK="$1" +NODE_IP="$2" +MASTER_IP="$3" +MASTER_PORT="$4" +NNODES="$5" +WORLD_SIZE="$6" +EXPERIMENT_DIR="$7" +CONFIG_FILE="$8" +NPROC_PER_NODE="$9" +CHANNELS="${10}" +THREADS="${11}" +ENABLE_ROCPROF="${12:-false}" +ROCPROF_STATS="${13:-false}" +ROCPROF_INPUT="${14:-}" + +echo "==========================================" +echo "Local Launch Configuration" +echo "==========================================" +echo "Node Rank: $NODE_RANK" +echo "Node IP: $NODE_IP" +echo "Master IP: $MASTER_IP" +echo "Master Port: $MASTER_PORT" +echo "Number of Nodes: $NNODES" +echo "World Size: $WORLD_SIZE" +echo "Processes per node: $NPROC_PER_NODE" +echo "Experiment Dir: $EXPERIMENT_DIR" +echo "Config File: $CONFIG_FILE" +echo "Channels: $CHANNELS" +echo "Threads: $THREADS" +echo "rocprof enabled: $ENABLE_ROCPROF" +echo "==========================================" +echo "" + +# Output directory for this configuration +OUTPUT_DIR="${EXPERIMENT_DIR}/${THREADS}thread_${CHANNELS}channels" +mkdir -p "${OUTPUT_DIR}" + +# Convert host path to Docker path for use inside container +# Docker mounts host aorta directory -> /workspace/aorta +# Extract aorta root from EXPERIMENT_DIR (e.g., /home/user/aorta/experiments/... -> /home/user/aorta) +AORTA_ROOT_FROM_EXP=$(echo "$EXPERIMENT_DIR" | sed 's|/experiments/.*||') +# Replace the aorta root with /workspace/aorta +OUTPUT_DIR_DOCKER=$(echo "$OUTPUT_DIR" | sed "s|^${AORTA_ROOT_FROM_EXP}|/workspace/aorta|") + +# Also convert CONFIG_FILE to Docker path if it's an absolute path +if [[ "$CONFIG_FILE" =~ ^/ ]]; then + CONFIG_FILE_DOCKER=$(echo "$CONFIG_FILE" | sed "s|^${AORTA_ROOT_FROM_EXP}|/workspace/aorta|") +else + CONFIG_FILE_DOCKER="$CONFIG_FILE" +fi + +# Log file +LOG_FILE="${OUTPUT_DIR}/node_${NODE_RANK}_output.log" + +# Function to log with timestamp +log() { + local message="$1" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[${timestamp}] [Node ${NODE_RANK}] ${message}" | tee -a "${LOG_FILE}" +} + +# Cleanup function +cleanup() { + echo "" + echo "=== Caught interrupt signal ===" | tee -a "${LOG_FILE}" + log "Cleaning up training processes on node ${NODE_RANK}..." + + # Try to kill processes inside Docker container + docker exec training-overlap-bugs-rocm70_9-1 pkill -9 -f "train.py" 2>/dev/null || true + docker exec training-overlap-bugs-rocm70_9-1 pkill -9 -f "torchrun" 2>/dev/null || true + + # Also try on host (in case anything leaked) + sudo pkill -9 -f "train.py" 2>/dev/null || true + sudo pkill -9 -f "torchrun" 2>/dev/null || true + + log "Cleanup complete. Exiting." 
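+    # 130 = 128 + SIGINT(2): conventional exit status for a script stopped by Ctrl+C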
+ exit 130 +} + +trap cleanup SIGINT SIGTERM + +log "Starting multi-node training with RCCL_THREADS_PER_BLOCK=${THREADS}, NCCL_MAX_NCHANNELS=${CHANNELS}" +log "Output directory: ${OUTPUT_DIR}" + +START_TIME=$(date +%s) + +# Docker container name (update if different) +DOCKER_CONTAINER="training-overlap-bugs-rocm70_9-1" + +# Check if Docker container is running +if ! docker ps --format '{{.Names}}' | grep -q "^${DOCKER_CONTAINER}$"; then + log "ERROR: Docker container '${DOCKER_CONTAINER}' is not running" + log "Start it with: cd /path/to/aorta/docker && docker compose -f docker-compose.rocm70_9-1.yaml up -d" + exit 1 +fi + +log "Docker container '${DOCKER_CONTAINER}' is running" + +# Base command for torchrun with multi-node parameters +BASE_CMD="torchrun --nnodes ${NNODES} --node_rank ${NODE_RANK} --nproc_per_node ${NPROC_PER_NODE} --master_addr ${MASTER_IP} --master_port ${MASTER_PORT} train.py --config ${CONFIG_FILE_DOCKER}" +BASE_OVERRIDES="--override profiling.tensorboard=false" + +# Build docker exec prefix with environment variables +DOCKER_EXEC="docker exec \ + -e RCCL_THREADS_PER_BLOCK=${THREADS} \ + -e NCCL_MAX_NCHANNELS=${CHANNELS} \ + -e HSA_ENABLE_SDMA=0 \ + -e PYTORCH_ROCM_PROFILER_ENABLE_TRACING=1 \ + ${DOCKER_CONTAINER}" + +# Run with or without rocprofv3 +if [ "${ENABLE_ROCPROF}" = "true" ]; then + ROCPROF_DIR="${OUTPUT_DIR}/rocprof_traces/node_${NODE_RANK}" + mkdir -p "${ROCPROF_DIR}" + + if [ -n "${ROCPROF_INPUT}" ]; then + log "Using rocprofv3 input file: ${ROCPROF_INPUT}" + ${DOCKER_EXEC} bash -c "rocprofv3 -i ${ROCPROF_INPUT} -d ${ROCPROF_DIR} -- \ + ${BASE_CMD} ${BASE_OVERRIDES} \ + --override training.output_dir=${OUTPUT_DIR_DOCKER}" \ + 2>&1 | tee -a "${LOG_FILE}" + else + ROCPROF_ARGS="--kernel-trace" + if [ "${ROCPROF_STATS}" = "true" ]; then + ROCPROF_ARGS="${ROCPROF_ARGS} --stats" + fi + + log "Running with rocprofv3 kernel tracing inside Docker" + ${DOCKER_EXEC} bash -c "rocprofv3 ${ROCPROF_ARGS} -d ${ROCPROF_DIR} -- \ + ${BASE_CMD} ${BASE_OVERRIDES} \ + --override training.output_dir=${OUTPUT_DIR_DOCKER}" \ + 2>&1 | tee -a "${LOG_FILE}" + fi +else + log "Running inside Docker container" + log "Command: ${BASE_CMD} ${BASE_OVERRIDES} --override training.output_dir=${OUTPUT_DIR_DOCKER}" + ${DOCKER_EXEC} bash -c "${BASE_CMD} ${BASE_OVERRIDES} \ + --override training.output_dir=${OUTPUT_DIR_DOCKER}" \ + 2>&1 | tee -a "${LOG_FILE}" +fi + +EXIT_CODE=${PIPESTATUS[0]} +END_TIME=$(date +%s) +DURATION=$((END_TIME - START_TIME)) + +if [ $EXIT_CODE -eq 0 ]; then + log "Training completed successfully (duration: ${DURATION}s)" +else + log "Training failed with exit code: $EXIT_CODE (duration: ${DURATION}s)" +fi + +echo "" +log "Node ${NODE_RANK} finished" diff --git a/scripts/multi_node/master_launch.sh b/scripts/multi_node/master_launch.sh new file mode 100755 index 0000000..f88d2fe --- /dev/null +++ b/scripts/multi_node/master_launch.sh @@ -0,0 +1,228 @@ +#!/bin/bash +# Multi-node orchestration script for Aorta GEMM training +# Adapted from DLRM master_launch.sh pattern +# +# TODO: Convert to SLURM-native launch using srun instead of SSH to individual nodes. +# Currently this script runs from a compute node and SSHs to other nodes. +# Ideally, we should run SLURM commands from the login node, which would +# eliminate the need for SSH connectivity checks and branch verification. 
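+#
+# Rough sketch of that srun-based approach (untested; assumes one launcher task per
+# node and that the repo/config paths are visible on every node):
+#   srun --nodes="$SLURM_NNODES" --ntasks-per-node=1 \
+#     torchrun --nnodes "$SLURM_NNODES" --nproc_per_node 8 \
+#       --rdzv_backend c10d \
+#       --rdzv_endpoint "$(scontrol show hostnames "$SLURM_NODELIST" | head -n1):29500" \
+#       train.py --config config/multi_node/distributed_multinode.yaml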
+ +usage() { + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " -c, --channels CHANNELS NCCL_MAX_NCHANNELS value (default: 28)" + echo " -t, --threads THREADS RCCL_THREADS_PER_BLOCK value (default: 256)" + echo " -f, --config CONFIG Config file path (default: config/multi_node/distributed_multinode.yaml)" + echo " -p, --nproc NPROC Number of processes per node (default: 8)" + echo " -r, --rocprof Enable rocprofv3 tracing" + echo " -m, --stats Enable rocprof stats (CU utilization, occupancy)" + echo " --rocprof-input FILE Use rocprofv3 input yaml/json" + echo " --master-port PORT Master port (default: auto-select)" + echo " -h, --help Show this help message" + echo "" + echo "Examples:" + echo " $0 --channels 28 --threads 256" + echo " $0 -c 28 -t 256 --rocprof" + echo " $0 --channels 28 --config config/my_custom.yaml" + echo "" + echo "Or use environment variables:" + echo " CHANNELS=28 THREADS=256 $0" + exit 1 +} + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +MACHINE_IP_FILE="$SCRIPT_DIR/node_ip_list.txt" # Contains hostnames or IPs + +# Default values (can be overridden by env vars or command-line args) +CONFIG_FILE="${CONFIG_FILE:-config/multi_node/distributed_multinode.yaml}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" +CHANNELS="${CHANNELS:-28}" +THREADS="${THREADS:-256}" +ENABLE_ROCPROF="${ENABLE_ROCPROF:-false}" +ROCPROF_STATS="${ROCPROF_STATS:-false}" +ROCPROF_INPUT="${ROCPROF_INPUT:-}" +MASTER_PORT="${MASTER_PORT:-}" + +# Parse command-line arguments (override env vars) +while [[ $# -gt 0 ]]; do + case $1 in + -c|--channels) + CHANNELS="$2" + shift 2 + ;; + -t|--threads) + THREADS="$2" + shift 2 + ;; + -f|--config) + CONFIG_FILE="$2" + shift 2 + ;; + -p|--nproc) + NPROC_PER_NODE="$2" + shift 2 + ;; + -r|--rocprof) + ENABLE_ROCPROF="true" + shift + ;; + -m|--stats) + ROCPROF_STATS="true" + shift + ;; + --rocprof-input) + ROCPROF_INPUT="$2" + shift 2 + ;; + --master-port) + MASTER_PORT="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + echo "Unknown option: $1" + usage + ;; + esac +done + + +if [[ -z "$MASTER_PORT" ]]; then + if ! MASTER_PORT=$(python3 - <<'PY' +import socket +s=socket.socket() +s.bind(('',0)) +print(s.getsockname()[1]) +s.close() +PY + ); then + echo "Error: Failed to auto-select master port. Set MASTER_PORT manually." + exit 1 + fi +fi + +# Check git branch consistency before launching +echo "=== Checking git branch consistency ===" +MASTER_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "not-a-git-repo") + +if [[ "$MASTER_BRANCH" != "not-a-git-repo" ]]; then + echo "Master node branch: $MASTER_BRANCH" + + node=0 + while IFS= read -r HOST || [[ -n "$HOST" ]]; do # HOST can be hostname or IP + if [[ -z "$HOST" ]]; then continue; fi + + if [[ "$node" -gt 0 ]]; then + WORKER_BRANCH=$(ssh -n -o ConnectTimeout=10 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$USER@$HOST" "cd ~/aorta && git rev-parse --abbrev-ref HEAD 2>/dev/null" || echo "not-a-git-repo") + + if [[ "$WORKER_BRANCH" == "not-a-git-repo" ]]; then + echo "[WARN] Worker node $HOST: Not a git repository" + elif [[ "$MASTER_BRANCH" != "$WORKER_BRANCH" ]]; then + echo "" + echo "[ERROR] Branch mismatch on worker node $HOST!" 
+ echo " Master: $MASTER_BRANCH" + echo " Worker: $WORKER_BRANCH" + echo "" + echo "Fix: ssh $USER@$HOST 'cd ~/aorta && git checkout $MASTER_BRANCH && git pull'" + echo "" + exit 1 + else + echo "Worker node $HOST: $WORKER_BRANCH [OK]" + fi + fi + ((node++)) + done < "$MACHINE_IP_FILE" +else + echo "[WARN] Not a git repository - skipping branch check" +fi +echo "" + +TRACE_TIMESTAMP=$(date +%Y%m%d_%H%M%S) +EXPERIMENT_DIR="$HOME/aorta/experiments/multinode_${CHANNELS}ch_${THREADS}th_${TRACE_TIMESTAMP}" +mkdir -p "$EXPERIMENT_DIR" +mkdir -p "$EXPERIMENT_DIR/logs" + +echo "=== Aorta Multi-Node GEMM Training ===" +echo "Experiment directory: $EXPERIMENT_DIR" +echo "Config file: $CONFIG_FILE" +echo "NCCL Channels: $CHANNELS" +echo "RCCL Threads per block: $THREADS" +echo "Processes per node: $NPROC_PER_NODE" +echo "rocprof enabled: $ENABLE_ROCPROF" + +NUM_NODES=$(awk 'NF' "$MACHINE_IP_FILE" | wc -l) +WORLD_SIZE=$((NPROC_PER_NODE * NUM_NODES)) +NNODES=$NUM_NODES + +echo "Number of nodes: $NUM_NODES" +echo "World size: $WORLD_SIZE (GPUs)" +echo "Using MASTER_PORT: $MASTER_PORT" +echo "" + +node=0 +while IFS= read -r HOST || [[ -n "$HOST" ]]; do # HOST can be hostname or IP + if [[ -z "$HOST" ]]; then + continue + fi + + echo "Setting up Node: $node, Host: $HOST" + + TIME=$(date +"%Y%m%d_%H%M%S") + LOG_FILE="$EXPERIMENT_DIR/logs/node_${node}_${TIME}.txt" + + if [[ "$node" -eq 0 ]]; then + MASTER_ADDR="$HOST" + echo "Master node: $MASTER_ADDR" + echo "" + + ./scripts/multi_node/config_node.sh "$node" "$HOST" "$MASTER_ADDR" "$MASTER_PORT" "$NNODES" "$WORLD_SIZE" "$PWD" "$EXPERIMENT_DIR" \ + "$CONFIG_FILE" "$NPROC_PER_NODE" "$CHANNELS" "$THREADS" "$ENABLE_ROCPROF" "$ROCPROF_STATS" "$ROCPROF_INPUT" \ + > "$LOG_FILE" 2>&1 & + + else + # Note: stdin explicitly redirected from config_node.sh, so -n flag not needed + ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ + "$USER"@"$HOST" bash -s -- "$node" "$HOST" "$MASTER_ADDR" "$MASTER_PORT" "$NNODES" "$WORLD_SIZE" "$PWD" "$EXPERIMENT_DIR" \ + "$CONFIG_FILE" "$NPROC_PER_NODE" "$CHANNELS" "$THREADS" "$ENABLE_ROCPROF" "$ROCPROF_STATS" "$ROCPROF_INPUT" \ + < ./scripts/multi_node/config_node.sh \ + > "$LOG_FILE" 2>&1 & + fi + + ((node++)) + +done < "$MACHINE_IP_FILE" + +echo "" +echo "=== All nodes launched ===" +echo "Monitor logs in: $EXPERIMENT_DIR/logs/" +echo "" +echo "To monitor progress:" +echo " tail -f $EXPERIMENT_DIR/logs/node_0_*.txt" +echo "" +echo "To check all nodes:" +echo " tail -f $EXPERIMENT_DIR/logs/node_*.txt" +echo "" +echo "Waiting for training to complete..." 
+echo "Press Ctrl+C to stop monitoring (training will continue in background)" + +wait + +echo "" +echo "=== Training completed ===" +echo "Results saved to: $EXPERIMENT_DIR" + +# ================================================================================ +# HOW TO STOP TRAINING +# ================================================================================ +# Press Ctrl+C above (stops monitoring, but training continues in background) +# +# To find running processes: +# ps aux | grep -E 'config_node.sh|torchrun.*train.py' | grep -v grep +# +# To stop by PID (replace 12345 with your PID): +# for HOST in $(cat scripts/multi_node/node_ip_list.txt); do +# ssh $USER@$HOST "kill -9 12345" +# done +# ================================================================================ diff --git a/scripts/multi_node/set_env_variables.sh b/scripts/multi_node/set_env_variables.sh new file mode 100755 index 0000000..3e9c070 --- /dev/null +++ b/scripts/multi_node/set_env_variables.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Global NCCL/RCCL environment variables for multi-node training +# Based on DLRM_set_env_variables.sh + +# NCCL Debug Settings (use INFO for debugging network issues) +export NCCL_DEBUG=INFO +export NCCL_DEBUG_SUBSYS=INIT,NET +# Try disabling IB if InfiniBand is not properly configured +export NCCL_IB_DISABLE=1 + +# IB/RNIC Configuration (commented out when IB is disabled) +# export NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 +# export NCCL_IB_GID_INDEX=3 +export NCCL_NCHANNELS_PER_NET_PEER=8 + +# HSA Settings for ROCm +export HSA_ENABLE_IPC_MODE_LEGACY=1 + +# NCCL Protocol +export NCCL_PROTO=Simple + +# Channel Configuration (can be overridden by sweep parameters) +export NCCL_MIN_NCHANNELS=40 +export NCCL_MAX_NCHANNELS=40 + +# Network Interface +# Change this to match your network interface: eth0, ib0, enp49s0f0np0, etc. +# Temporarily commented out for auto-detection: +# export NCCL_SOCKET_IFNAME=enp193s0f0 + +# PyTorch ROCm Profiler +export PYTORCH_ROCM_PROFILER_ENABLE_TRACING=1 + +# Optional: Force non-overlap for debugging +# export GPU_MAX_HW_QUEUES=1 +# unset TORCH_NCCL_HIGH_PRIORITY + +# Optional: Disable SDMA for testing +# export HSA_ENABLE_SDMA=0 + +# Optional: Disable IB for Ethernet-only testing +# export NCCL_IB_DISABLE=1 diff --git a/scripts/multi_node/setup_multi_node.sh b/scripts/multi_node/setup_multi_node.sh new file mode 100755 index 0000000..6270a1a --- /dev/null +++ b/scripts/multi_node/setup_multi_node.sh @@ -0,0 +1,318 @@ +#!/bin/bash +# Interactive setup script for multi-node training +# Run this on Machine 1 (master node) + +set -e + +echo "================================================" +echo " Multi-Node Training Setup" +echo "================================================" +echo "" +echo "Prerequisites:" +echo " - You have access to multiple machines" +echo " - You can SSH into all machines from the master node" +echo " - You have the hostnames/IPs of all machines" +echo "" +read -p "Press Enter to continue..." +echo "" + +# Get current machine info +CURRENT_HOST=$(hostname) +CURRENT_IP=$(hostname -I | awk '{print $1}') + +echo "Current machine (Master Node):" +echo " Hostname: $CURRENT_HOST" +echo " IP: $CURRENT_IP" +echo "" + +# Ask for number of worker nodes +read -p "How many worker nodes? (default: 1): " NUM_WORKERS +NUM_WORKERS=${NUM_WORKERS:-1} + +if ! 
[[ "$NUM_WORKERS" =~ ^[0-9]+$ ]] || [[ "$NUM_WORKERS" -lt 1 ]]; then + echo "Error: Number of workers must be a positive integer" + exit 1 +fi + +echo "Setting up 1 master + $NUM_WORKERS worker(s) = $((NUM_WORKERS + 1)) total nodes" +echo "" + +# Collect worker hostnames +WORKER_HOSTS=() +for i in $(seq 1 $NUM_WORKERS); do + read -p "Enter hostname for worker $i: " WORKER_HOST + + if [[ -z "$WORKER_HOST" ]]; then + echo "Error: Worker hostname cannot be empty" + exit 1 + fi + + WORKER_HOSTS+=("$WORKER_HOST") +done +echo "" + +# Test SSH to all workers and collect IPs +echo "Testing SSH connections to workers..." +WORKER_IPS=() +for i in "${!WORKER_HOSTS[@]}"; do + WORKER_HOST="${WORKER_HOSTS[$i]}" + WORKER_NUM=$((i + 1)) + + echo "Worker $WORKER_NUM: $WORKER_HOST" + + if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$USER@$WORKER_HOST" "hostname" >/dev/null 2>&1; then + echo " [OK] SSH successful" + WORKER_IP=$(ssh "$USER@$WORKER_HOST" "hostname -I | awk '{print \$1}'") + echo " [OK] IP: $WORKER_IP" + WORKER_IPS+=("$WORKER_IP") + else + echo " [FAIL] SSH failed" + echo "" + echo "Fixes:" + echo " 1. Ensure your SSH key is registered with your cluster management system" + echo " 2. Generate and copy SSH key:" + echo " ssh-keygen -t rsa -b 4096 -C 'multi-node' -f ~/.ssh/id_rsa_cluster -N ''" + echo " # Then register the public key (~/.ssh/id_rsa_cluster.pub) with your cluster" + echo " 3. Or use ssh-copy-id if direct access is available:" + echo " ssh-copy-id -i ~/.ssh/id_rsa_cluster.pub $USER@$WORKER_HOST" + exit 1 + fi + echo "" +done + +# Test connectivity and reverse SSH +echo "Testing connectivity and reverse SSH..." +for i in "${!WORKER_HOSTS[@]}"; do + WORKER_HOST="${WORKER_HOSTS[$i]}" + WORKER_IP="${WORKER_IPS[$i]}" + WORKER_NUM=$((i + 1)) + + echo "Worker $WORKER_NUM ($WORKER_HOST):" + + # Test ping + if ping -c 2 "$WORKER_IP" >/dev/null 2>&1; then + echo " [OK] Ping successful" + else + echo " [WARN] Ping failed (might be okay if ICMP is blocked)" + fi + + # Test reverse SSH + if ssh "$USER@$WORKER_HOST" "ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no $USER@$CURRENT_HOST hostname" >/dev/null 2>&1; then + echo " [OK] Reverse SSH successful" + else + echo " [WARN] Reverse SSH failed - setting up passwordless SSH" + + # Generate key on worker if needed + ssh "$USER@$WORKER_HOST" "test -f ~/.ssh/id_rsa || ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa" >/dev/null 2>&1 + + # Copy worker's public key to master + WORKER_PUBKEY=$(ssh "$USER@$WORKER_HOST" "cat ~/.ssh/id_rsa.pub") + mkdir -p ~/.ssh + echo "$WORKER_PUBKEY" >> ~/.ssh/authorized_keys + chmod 600 ~/.ssh/authorized_keys + + # Test again + if ssh "$USER@$WORKER_HOST" "ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no $USER@$CURRENT_HOST hostname" >/dev/null 2>&1; then + echo " [OK] Reverse SSH now working" + else + echo " [FAIL] Reverse SSH still failing - manual setup needed" + fi + fi + echo "" +done + +# Check if code exists on workers +AORTA_PATH="$HOME/aorta" +echo "Checking code availability on workers..." 
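+# Shared-filesystem heuristic used below: if $AORTA_PATH has the same inode on the
+# master and on a worker, treat it as one shared (network) filesystem; otherwise the
+# worker needs a manual clone/rsync (tracked via SHARED_FS).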
+ +MASTER_INODE=$(stat -c %i "$AORTA_PATH" 2>/dev/null || echo "0") +MASTER_BRANCH=$(cd "$AORTA_PATH" && git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "not-a-git-repo") +SHARED_FS=true + +for i in "${!WORKER_HOSTS[@]}"; do + WORKER_HOST="${WORKER_HOSTS[$i]}" + WORKER_NUM=$((i + 1)) + + echo "Worker $WORKER_NUM ($WORKER_HOST):" + + if ssh "$USER@$WORKER_HOST" "test -d $AORTA_PATH" 2>/dev/null; then + echo " [OK] Code found: $AORTA_PATH" + + # Check if it's the same filesystem + WORKER_INODE=$(ssh "$USER@$WORKER_HOST" "stat -c %i $AORTA_PATH" 2>/dev/null || echo "0") + + if [[ "$MASTER_INODE" == "$WORKER_INODE" ]] && [[ "$MASTER_INODE" != "0" ]]; then + echo " [OK] Shared filesystem detected" + else + echo " [WARN] Separate filesystem - manual sync needed" + SHARED_FS=false + fi + + # Check git branch + WORKER_BRANCH=$(ssh "$USER@$WORKER_HOST" "cd $AORTA_PATH && git rev-parse --abbrev-ref HEAD 2>/dev/null" || echo "not-a-git-repo") + + if [[ "$MASTER_BRANCH" == "not-a-git-repo" ]] || [[ "$WORKER_BRANCH" == "not-a-git-repo" ]]; then + echo " [WARN] Not a git repository - cannot verify branch" + elif [[ "$MASTER_BRANCH" != "$WORKER_BRANCH" ]]; then + echo " [ERROR] Branch mismatch!" + echo " Master: $MASTER_BRANCH" + echo " Worker: $WORKER_BRANCH" + echo "" + echo " Fix: ssh $WORKER_HOST 'cd $AORTA_PATH && git checkout $MASTER_BRANCH && git pull'" + exit 1 + else + echo " [OK] Branch: $MASTER_BRANCH" + fi + else + echo " [FAIL] Code not found" + echo " You'll need to clone or rsync the code to: $AORTA_PATH" + SHARED_FS=false + fi + echo "" +done + +# Check GPUs +echo "Checking GPUs on all nodes..." +MASTER_GPUS=$(rocm-smi --showid 2>/dev/null | grep -c "GPU" || echo "unknown") +echo "Master: $MASTER_GPUS GPUs" + +GPU_MISMATCH=false +for i in "${!WORKER_HOSTS[@]}"; do + WORKER_HOST="${WORKER_HOSTS[$i]}" + WORKER_NUM=$((i + 1)) + WORKER_GPUS=$(ssh "$USER@$WORKER_HOST" "rocm-smi --showid 2>/dev/null | grep -c GPU" || echo "unknown") + + echo "Worker $WORKER_NUM: $WORKER_GPUS GPUs" + + if [[ "$MASTER_GPUS" != "$WORKER_GPUS" ]]; then + GPU_MISMATCH=true + fi +done + +if [[ "$GPU_MISMATCH" == "true" ]]; then + echo "[WARN] GPU count mismatch detected" + echo " Use --nproc flag with master_launch.sh to specify GPU count per node" +fi +echo "" + +# Detect network interface +echo "Detecting network interface..." +INTERFACE=$(ifconfig 2>/dev/null | grep -E "^(ib|enp|eth)" | head -1 | cut -d: -f1 || echo "unknown") +if [[ "$INTERFACE" == "unknown" ]]; then + INTERFACE=$(ip addr show 2>/dev/null | grep -E "^[0-9]+: (ib|enp|eth)" | head -1 | awk '{print $2}' | tr -d ':' || echo "eth0") +fi + +echo " Detected interface: $INTERFACE" + +# Ask user to confirm or change +read -p "Network interface for NCCL (press Enter to use detected, or enter different name): " USER_INTERFACE +if [[ -n "$USER_INTERFACE" ]]; then + INTERFACE="$USER_INTERFACE" + echo " Using: $INTERFACE" +else + echo " Using detected interface: $INTERFACE" +fi +echo "" + +# Create node_ip_list.txt (stores hostnames, not IPs) in scripts/multi_node/ +echo "Creating node_ip_list.txt (with hostnames)..." 
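+# Resulting file layout (hostnames only, master first), illustrative names:
+#   master-node
+#   worker-node-1
+#   worker-node-2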
+NODE_IP_FILE="$AORTA_PATH/scripts/multi_node/node_ip_list.txt" + +# Write master hostname first +echo "$CURRENT_HOST" > "$NODE_IP_FILE" + +# Add all worker hostnames +for WORKER_HOST in "${WORKER_HOSTS[@]}"; do + echo "$WORKER_HOST" >> "$NODE_IP_FILE" +done + +echo "[OK] Created $NODE_IP_FILE:" +cat "$NODE_IP_FILE" +echo "" +echo "[INFO] File contains hostnames (not IPs) for SSH compatibility with config files" +echo "" + +# Update network interface in set_env_variables.sh +echo "Updating network interface in set_env_variables.sh..." +if [[ -f "scripts/multi_node/set_env_variables.sh" ]]; then + # Backup original + cp scripts/multi_node/set_env_variables.sh scripts/multi_node/set_env_variables.sh.bak + + # Update interface + sed -i "s/export NCCL_SOCKET_IFNAME=.*/export NCCL_SOCKET_IFNAME=$INTERFACE/" scripts/multi_node/set_env_variables.sh + + echo "[OK] Updated NCCL_SOCKET_IFNAME=$INTERFACE" +else + echo "[WARN] set_env_variables.sh not found - manual configuration needed" +fi +echo "" + +# Summary +echo "================================================" +echo " Setup Complete!" +echo "================================================" +echo "" +echo "Configuration Summary:" +echo " Total Nodes: $((NUM_WORKERS + 1)) (1 master + $NUM_WORKERS workers)" +echo " Network Interface: $INTERFACE" +echo " Shared Filesystem: ${SHARED_FS:-false}" +echo "" +echo " Master: $CURRENT_HOST ($CURRENT_IP) - $MASTER_GPUS GPUs" +for i in "${!WORKER_HOSTS[@]}"; do + WORKER_HOST="${WORKER_HOSTS[$i]}" + WORKER_IP="${WORKER_IPS[$i]}" + WORKER_NUM=$((i + 1)) + WORKER_GPUS=$(ssh "$USER@$WORKER_HOST" "rocm-smi --showid 2>/dev/null | grep -c GPU" || echo "unknown") + echo " Worker $WORKER_NUM: $WORKER_HOST ($WORKER_IP) - $WORKER_GPUS GPUs" +done +echo "" +echo "Node IP list created at:" +echo " $AORTA_PATH/scripts/multi_node/node_ip_list.txt" +echo "" + +if [[ "${SHARED_FS:-false}" == "false" ]]; then + echo "[IMPORTANT] Sync code to all workers before running:" + for WORKER_HOST in "${WORKER_HOSTS[@]}"; do + echo " ssh $WORKER_HOST 'cd ~/ && git clone aorta'" + done + echo " OR use rsync:" + for WORKER_HOST in "${WORKER_HOSTS[@]}"; do + echo " rsync -avz $AORTA_PATH/ $WORKER_HOST:$AORTA_PATH/" + done + echo "" +fi + +echo "Next Steps:" +echo "" +echo "1. Start Docker on all nodes (run once, containers persist):" +echo " cd $AORTA_PATH" +echo " ./scripts/multi_node/start_docker_all_nodes.sh" +echo "" +echo "2. Launch training (run as many times as you want):" +echo " ./scripts/multi_node/master_launch.sh --channels 28 --threads 256" +echo "" +echo "3. 
Monitor training:" +echo " tail -f experiments/multinode_*/logs/node_*.txt" +echo "" +echo "Additional Options:" +echo "" +echo " Different parameters:" +echo " ./scripts/multi_node/master_launch.sh --channels 42 --threads 512 --nproc 8" +echo "" +echo " With profiling:" +echo " ./scripts/multi_node/master_launch.sh --channels 28 --threads 256 --rocprof --stats" +echo "" +echo " Custom config:" +echo " ./scripts/multi_node/master_launch.sh --config config/distributed_two_nodes.yaml" +echo "" +echo "Stop training on all nodes:" +for WORKER_HOST in "${WORKER_HOSTS[@]}"; do + echo " ssh $WORKER_HOST 'pkill -9 -f train.py'" +done +echo " pkill -9 -f train.py # On master" +echo "" +echo "Stop Docker when completely done:" +echo " for IP in \$(cat node_ip_list.txt); do" +echo " ssh \$USER@\$IP 'cd $AORTA_PATH/docker && docker compose -f docker-compose.rocm70_9-1.yaml down'" +echo " done" +echo "" diff --git a/scripts/multi_node/start_docker_all_nodes.sh b/scripts/multi_node/start_docker_all_nodes.sh new file mode 100755 index 0000000..f93ccd7 --- /dev/null +++ b/scripts/multi_node/start_docker_all_nodes.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# Start Docker containers on all nodes for multi-node training +# Usage: ./start_docker_all_nodes.sh [docker-compose-file] [container-name] + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +AORTA_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +MACHINE_IP_FILE="$SCRIPT_DIR/node_ip_list.txt" # Contains hostnames or IPs + +# Allow custom docker-compose file and container name via arguments +DOCKER_COMPOSE_FILE="${1:-docker/docker-compose.rocm70_9-1.yaml}" +DOCKER_CONTAINER="${2:-training-overlap-bugs-rocm70_9-1}" + +if [[ ! -f "$MACHINE_IP_FILE" ]]; then + echo "Error: $MACHINE_IP_FILE not found" + echo "Run setup_multi_node.sh first" + exit 1 +fi + +cd "$AORTA_ROOT" + +# Check git branch consistency before starting Docker +echo "=== Checking git branch consistency ===" +MASTER_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "not-a-git-repo") + +if [[ "$MASTER_BRANCH" != "not-a-git-repo" ]]; then + echo "Master node branch: $MASTER_BRANCH" + TOTAL_NODES=$(grep -c . "$MACHINE_IP_FILE" || echo "0") + echo "Found $TOTAL_NODES nodes in $MACHINE_IP_FILE" + echo "" + + node=0 + while IFS= read -r HOST || [[ -n "$HOST" ]]; do # HOST can be hostname or IP + # Skip empty lines + if [[ -z "$HOST" ]]; then + continue + fi + + if [[ "$node" -gt 0 ]]; then + echo "[STAGE] Checking worker node $node ($HOST)..." + WORKER_BRANCH=$(ssh -n -o ConnectTimeout=10 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$USER@$HOST" "cd ~/aorta && git rev-parse --abbrev-ref HEAD 2>/dev/null" || echo "not-a-git-repo") + + if [[ "$WORKER_BRANCH" == "not-a-git-repo" ]]; then + echo " [WARN] Worker node $HOST: Not a git repository" + elif [[ "$MASTER_BRANCH" != "$WORKER_BRANCH" ]]; then + echo " [ERROR] Branch mismatch on node $HOST!" 
+ echo " Master: $MASTER_BRANCH" + echo " Worker: $WORKER_BRANCH" + echo "" + echo "Fix: ssh $USER@$HOST 'cd ~/aorta && git checkout $MASTER_BRANCH && git pull'" + exit 1 + else + echo " Worker node $HOST: $WORKER_BRANCH [OK]" + fi + fi + ((node++)) || true + done < "$MACHINE_IP_FILE" + echo "" + echo "Branch check complete [OK]" + echo "" +else + echo "[WARN] Not a git repository - skipping branch check" + echo "" +fi + +echo "=== Starting Docker containers on all nodes ===" +TOTAL_NODES=$(wc -l < "$MACHINE_IP_FILE") +echo "Total nodes to process: $TOTAL_NODES" +echo "" + +node=0 +while IFS= read -r HOST || [[ -n "$HOST" ]]; do # HOST can be hostname or IP + if [[ -z "$HOST" ]]; then + continue + fi + + echo "Node $node (Host: $HOST):" + + if [[ "$node" -eq 0 ]]; then + # Master node (local) + echo " [STAGE] Checking existing containers on master..." + if docker ps --format '{{.Names}}' < /dev/null | grep -q "^${DOCKER_CONTAINER}$"; then + echo " [INFO] Container already running, restarting..." + fi + + echo " [STAGE] Running docker compose up -d on master..." + COMPOSE_FILE_PATH="${DOCKER_COMPOSE_FILE#docker/}" + cd docker && docker compose -f "$COMPOSE_FILE_PATH" up -d < /dev/null && cd .. + + echo " [STAGE] Verifying master container..." + if docker ps --format '{{.Names}}' < /dev/null | grep -q "^${DOCKER_CONTAINER}$"; then + echo " [OK] Docker container '${DOCKER_CONTAINER}' is running" + else + echo " [FAIL] Failed to start Docker container" + exit 1 + fi + else + # Worker nodes (via SSH) + echo " [STAGE] Connecting to worker via SSH..." + if ! ssh -n -o ConnectTimeout=10 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$USER@$HOST" "echo 'SSH connection successful'" > /dev/null 2>&1; then + echo " [FAIL] Cannot SSH to worker node $HOST" + exit 1 + fi + echo " [OK] SSH connection successful" + + echo " [STAGE] Checking existing containers on worker..." + if ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$USER@$HOST" "docker ps --format '{{.Names}}'" | grep -q "^${DOCKER_CONTAINER}$"; then + echo " [INFO] Container already running, restarting..." + fi + + echo " [STAGE] Running docker compose up -d on worker..." + COMPOSE_FILE_PATH="${DOCKER_COMPOSE_FILE#docker/}" + ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$USER@$HOST" \ + "cd /home/$USER/aorta/docker && docker compose -f $COMPOSE_FILE_PATH up -d" + + echo " [STAGE] Verifying worker container..." 
+ if ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$USER@$HOST" "docker ps --format '{{.Names}}'" | grep -q "^${DOCKER_CONTAINER}$"; then + echo " [OK] Docker container '${DOCKER_CONTAINER}' is running on worker" + else + echo " [FAIL] Failed to start Docker container on worker" + exit 1 + fi + fi + + echo "" + ((node++)) || true +done < "$MACHINE_IP_FILE" + +echo "=== All Docker containers started successfully ===" +echo "Docker container: $DOCKER_CONTAINER" +echo "" +echo "Verify with:" +echo " docker ps # Check master" +while IFS= read -r HOST || [[ -n "$HOST" ]]; do + if [[ -z "$HOST" ]]; then continue; fi + if [[ "$node" -gt 1 ]]; then + echo " ssh $USER@$HOST 'docker ps' # Check worker" + fi + ((node++)) +done < "$MACHINE_IP_FILE" +echo "" +echo "Ready to launch training:" +echo " ./scripts/multi_node/master_launch.sh --channels 28 --threads 256 --config .yaml" diff --git a/src/aorta/training/fsdp_trainer.py b/src/aorta/training/fsdp_trainer.py index b38f260..eea1923 100644 --- a/src/aorta/training/fsdp_trainer.py +++ b/src/aorta/training/fsdp_trainer.py @@ -70,6 +70,9 @@ class FSDPConfig: forward_prefetch: bool = True sync_module_states: bool = True param_init_device: str = "cpu" + # For HYBRID_SHARD: GPUs per node (None = auto-detect from LOCAL_WORLD_SIZE env var) + # Only set this if auto-detection fails or you want to override + hybrid_shard_gpus_per_node: Optional[int] = None @dataclass @@ -245,9 +248,17 @@ def build_fsdp_model( transformer_auto_wrap_policy, transformer_layer_cls={nn.TransformerEncoderLayer} ) + # Create process groups for hybrid_shard strategy + process_group = None + if sharding == ShardingStrategy.HYBRID_SHARD: + process_group = _create_hybrid_shard_process_groups(fsdp_cfg.hybrid_shard_gpus_per_node) + if process_group is not None: + log.info("Created custom process groups for HYBRID_SHARD strategy") + fsdp_model = FSDP( model.to(device), sharding_strategy=sharding, + process_group=process_group, auto_wrap_policy=auto_wrap_policy, use_orig_params=fsdp_cfg.use_orig_params, backward_prefetch=backward_prefetch, @@ -261,6 +272,81 @@ def build_fsdp_model( return fsdp_model +def _create_hybrid_shard_process_groups(gpus_per_node: Optional[int] = None): + """ + Create process groups for HYBRID_SHARD strategy. + + Args: + gpus_per_node: Number of GPUs per node. If None, auto-detects from LOCAL_WORLD_SIZE. + This should match the --nproc value from torchrun. + + Returns: + Tuple of (shard_group, replicate_group) or None + """ + if not dist.is_initialized(): + return None + + world_size = dist.get_world_size() + rank = dist.get_rank() + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + + # Auto-detect GPUs per node from environment if not provided + if gpus_per_node is None: + # torchrun sets LOCAL_WORLD_SIZE to the number of processes per node + local_world_size_str = os.environ.get("LOCAL_WORLD_SIZE") + if local_world_size_str: + gpus_per_node = int(local_world_size_str) + log.info("Auto-detected gpus_per_node=%d from LOCAL_WORLD_SIZE", gpus_per_node) + else: + log.error( + "Cannot determine gpus_per_node: LOCAL_WORLD_SIZE not set and " + "hybrid_shard_gpus_per_node not configured. Set hybrid_shard_gpus_per_node in config." 
+ ) + return None + + # Validate configuration + if world_size % gpus_per_node != 0: + log.error( + "Invalid HYBRID_SHARD config: world_size=%d not divisible by gpus_per_node=%d", + world_size, gpus_per_node + ) + return None + + num_nodes = world_size // gpus_per_node + node_id = rank // gpus_per_node + + if num_nodes <= 1: + log.warning("HYBRID_SHARD with single node - consider using FULL_SHARD instead") + return None + + log.info( + "Creating HYBRID_SHARD process groups | rank=%d world_size=%d num_nodes=%d gpus_per_node=%d node_id=%d", + rank, world_size, num_nodes, gpus_per_node, node_id + ) + + # Intra-node groups: shard within each node + for i in range(num_nodes): + ranks_in_node = list(range(i * gpus_per_node, (i + 1) * gpus_per_node)) + group = dist.new_group(ranks=ranks_in_node) + if i == node_id: + my_shard_group = group + + # Inter-node groups: replicate across nodes (same local_rank) + for local_r in range(gpus_per_node): + ranks_across_nodes = [node * gpus_per_node + local_r for node in range(num_nodes)] + group = dist.new_group(ranks=ranks_across_nodes) + if local_r == local_rank: + my_replicate_group = group + + log.info( + "Created process groups | shard_group_size=%d replicate_group_size=%d", + dist.get_world_size(my_shard_group), + dist.get_world_size(my_replicate_group), + ) + + return (my_shard_group, my_replicate_group) + + def build_ddp_model( model_cfg: ModelConfig, ddp_cfg: DDPConfig,