16 changes: 14 additions & 2 deletions .gitignore
@@ -42,19 +42,25 @@ checkpoints/
runs/
logs/
tensorboard/
experiments/
artifacts/
artifacts_*/
overlap_debug*/

# Experiment outputs and traces
experiments/
*.json
*.jsonl
*.xlsx
*.log
*.logs
*_merged_enhanced.json
nccl_thread_sweep_*.log
nccl_thread_sweep_summary_*.txt
expected_outputs/
testdata/
*.log
actual_outputs/
*.html

# IDE/project-specific folders
.vscode/
.idea/
@@ -68,7 +74,13 @@ Thumbs.db
# Data files
*.dat

# Profiler traces
tmp_prof.*
rocprof_traces
overlap.*
.*repro.*
*.gz
trace_*

# Multi-node IP configuration (should be customized per deployment)
node_ip_list.txt
14 changes: 8 additions & 6 deletions config/multi_node/distributed_multinode.yaml
@@ -5,7 +5,7 @@ training:
  batch_size: 448
  gradient_accumulation: 4
  mixed_precision: bf16
  max_steps: 2000
  max_steps: 10
  grad_clip_norm: 1.0
  output_dir: artifacts/optuna_tracelens_sweep_fixed/trial_0000
  log_interval: 20
@@ -46,6 +46,8 @@ fsdp:
  forward_prefetch: true
  sync_module_states: true
  param_init_device: meta
  # hybrid_shard_gpus_per_node: auto-detects from --nproc (via LOCAL_WORLD_SIZE)
  # Only set manually if auto-detection fails
distributed:
  backend: nccl
  mode: fsdp
@@ -54,7 +56,7 @@ distributed:
  static_graph: true
  find_unused_parameters: false
compile:
  enabled: true
  enabled: false
  backend: inductor
  mode: max-autotune
  fullgraph: false
Expand Down Expand Up @@ -82,10 +84,10 @@ dataloader:
  pin_memory: true
profiling:
  enabled: true
  wait: 5
  warmup: 5
  active: 1
  repeat: 1
  wait: 1
  warmup: 2
  active: 5
  repeat: 0
  record_shapes: false
  profile_memory: false
  with_stack: false
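The new comment in the fsdp block says `hybrid_shard_gpus_per_node` is auto-detected from torchrun's `--nproc-per-node` via `LOCAL_WORLD_SIZE`. As a rough illustration of how that fallback chain could work (the helper name and the config handling here are hypothetical, not this repo's actual code):

```python
import os

import torch


def resolve_gpus_per_node(cfg_value=None):
    """Hypothetical helper: prefer an explicit config value, otherwise fall back
    to LOCAL_WORLD_SIZE, which torchrun sets to the --nproc-per-node value."""
    if cfg_value is not None:            # manual override from the YAML
        return int(cfg_value)
    env = os.environ.get("LOCAL_WORLD_SIZE")
    if env is not None:                  # set by torchrun / torch.distributed.run
        return int(env)
    return torch.cuda.device_count()     # last resort: all visible GPUs on this host
```

Per the comment, the key should only be set explicitly in the YAML when this detection returns the wrong value.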
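For reference, the profiler knobs changed here (wait/warmup/active/repeat) map directly onto `torch.profiler.schedule`; the snippet below only illustrates what the new values mean and is not code from this repo (the output directory and loop variables are placeholders):

```python
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

# wait=1: skip 1 step, warmup=2: run the profiler but discard 2 steps,
# active=5: record 5 steps, repeat=0: keep cycling for the whole run.
sched = schedule(wait=1, warmup=2, active=5, repeat=0)

prof = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  # CUDA activity also covers ROCm/HIP builds
    schedule=sched,
    on_trace_ready=tensorboard_trace_handler("./tb_traces"),   # illustrative output dir
    record_shapes=False,
    profile_memory=False,
    with_stack=False,
)

# Typical usage inside the training loop:
# with prof:
#     for step, batch in enumerate(loader):
#         train_step(batch)
#         prof.step()   # advances the wait/warmup/active state machine
```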
113 changes: 113 additions & 0 deletions config/multi_node/shampoo_opt_multi_node.yaml
@@ -0,0 +1,113 @@
# Shampoo-only variant of the user workload.
# NOTE: User reports this configuration produces NaNs when using Shampoo,
# while AdamW and other optimizers remain healthy. Monitor loss closely.

logging:
  level: INFO

training:
  epochs: 10
  batch_size: 512
  gradient_accumulation: 2
  mixed_precision: bf16
  max_steps: 2200
  grad_clip_norm: 1.0
  output_dir: artifacts/user_shampoo
  log_interval: 20
  additional_compute_streams: 2
  lightweight_op_waves: 3

optimizer:
  name: shampoo
  lr: 0.0002
  weight_decay: 0.01
  betas: [0.9, 0.985]
  eps: 1.0e-8

scheduler:
  warmup_steps: 200
  total_steps: 2200

dataset:
  num_samples: 200000
  sequence_length: 160
  dense_dim: 256
  sparse_features: 64
  vocab_size: 350000
  num_dense_features: 32
  seed: 2025

model:
  vocab_size: 350000
  embedding_dim: 256
  num_dense_features: 32
  dense_dim: 256
  model_dim: 1024
  num_heads: 16
  num_layers: 18
  dropout: 0.1
  mlp_hidden_dim: 4096

fsdp:
  sharding_strategy: hybrid_shard
  backward_prefetch: BACKWARD_PRE
  use_orig_params: true
  limit_all_gathers: true
  forward_prefetch: true
  sync_module_states: true
  param_init_device: meta

distributed:
  backend: nccl
  mode: fsdp
  bucket_cap_mb: 128
  gradient_as_bucket_view: true
  static_graph: true
  find_unused_parameters: false

compile:
  enabled: false
  backend: inductor
  mode: max-autotune
  fullgraph: false
  dynamic: false

streams:
  num_streams: 4
  high_priority:
    - allreduce
    - reducescatter
  stream_assignments:
    compute:
      - dev6_stream3
      - dev6_stream9
    communication:
      - dev6_stream13
      - dev6_stream17
    reducescatter:
      - dev6_stream22
    aux:
      - dev6_stream0

dataloader:
  num_workers: 0
  pin_memory: true

profiling:
  enabled: true
  wait: 2
  warmup: 2
  active: 6
  repeat: 1
  record_shapes: true
  profile_memory: true
  with_stack: false
  with_flops: false
  # tensorboard: true
  # chrome_trace: true
  tensorboard: false
  chrome_trace: false
  trace_filename: user_shampoo.json

tracelens:
  enabled: false
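Given the NaN warning in this file's header comment, a lightweight guard in the training loop makes the divergence visible as soon as it starts rather than hundreds of steps later. This is a generic sketch, not this repo's training script; the loss/model/step variables are placeholders, and it would be called after `loss.backward()` and before `optimizer.step()`:

```python
import math

import torch


def check_finite(loss, model, step, log_every=20):
    """Raise as soon as the loss or local gradients stop being finite."""
    if not torch.isfinite(loss):
        raise RuntimeError(f"non-finite loss {loss.item()} at step {step}")
    if step % log_every == 0:
        sq_sum = 0.0
        for p in model.parameters():
            if p.grad is not None:
                # Under FSDP this is the local shard's gradient, which is
                # sufficient for NaN/Inf detection on every rank.
                sq_sum += float(p.grad.detach().float().norm() ** 2)
        grad_norm = math.sqrt(sq_sum)
        if not math.isfinite(grad_norm):
            raise RuntimeError(f"non-finite gradient norm at step {step}")
        print(f"step {step}: loss={loss.item():.4f} grad_norm={grad_norm:.2f}")
```

The gradient-norm check is a cheap early indicator, since the norm usually blows up a few steps before the loss itself turns NaN.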
57 changes: 57 additions & 0 deletions docker/Dockerfile.rocm70_9-1-shampoo
@@ -0,0 +1,57 @@
# Start from the existing PyTorch ROCm image
FROM rocm/pytorch-private:20251030_rocm_e2e_phantom_mi350_genai_nightly

# Switch to root to install packages
USER root

# Install wget and git if not available
RUN (yum install -y wget git 2>/dev/null || (apt-get update && apt-get install -y wget git)) || true

# Download and install amdgpu-install package for RHEL
RUN wget https://artifactory-cdn.amd.com/artifactory/list/amdgpu-rpm/rhel/amdgpu-install-internal-7.0_9-1.noarch.rpm && \
    (yum install -y ./amdgpu-install-internal-7.0_9-1.noarch.rpm || rpm -ivh ./amdgpu-install-internal-7.0_9-1.noarch.rpm) && \
    rm amdgpu-install-internal-7.0_9-1.noarch.rpm

# Update amdgpu-repo with specific builds
RUN amdgpu-repo --amdgpu-build=2247890 --rocm-build=compute-rocm-rel-7.0-meta/7

# Since base image already has ROCm, just update the key runtime components
RUN yum update -y --skip-broken \
    rocm-hip \
    rocm-libs \
    rocm-hip-libraries \
    rocm-hip-runtime-devel \
    hip-base \
    hip-dev \
    hip-runtime-amd \
    rocm-core || echo "Updated available ROCm packages"

RUN python3.10 -m pip install git+https://github.com/AMD-AGI/TraceLens.git
RUN python3.10 -m pip install openpyxl seaborn

# Download and install rocprof-trace-decoder
RUN wget https://github.com/ROCm/rocprof-trace-decoder/releases/download/0.1.6/rocprof-trace-decoder-manylinux-2.28-0.1.6-Linux.rpm \
    -O /tmp/rocprof-trace-decoder.rpm && \
    rpm -i /tmp/rocprof-trace-decoder.rpm && \
    rm /tmp/rocprof-trace-decoder.rpm

# Install Facebook Distributed Shampoo optimizer
RUN cd /tmp && \
    git clone https://github.com/facebookresearch/optimizers.git && \
    cd optimizers && \
    python3.10 -m pip install . && \
    cd / && \
    rm -rf /tmp/optimizers

# Verify Shampoo installation
RUN python3.10 -c "import distributed_shampoo; print('[OK] Shampoo optimizer installed successfully')"

# Update environment variables
ENV ROCM_HOME=/opt/rocm
ENV PATH=$ROCM_HOME/bin:$PATH
ENV LD_LIBRARY_PATH=$ROCM_HOME/lib:$LD_LIBRARY_PATH

# Set working directory
WORKDIR /workspace/aorta

CMD ["/bin/bash"]
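The image above installs the optimizer from facebookresearch/optimizers and only verifies that `distributed_shampoo` imports. For completeness, constructing it with the hyperparameters from `shampoo_opt_multi_node.yaml` might look like the sketch below; the class name and keyword arguments (notably `epsilon` versus the YAML's `eps`) are assumptions taken from the upstream README, to be checked against the installed package rather than treated as a confirmed API:

```python
import torch

# Module name is verified by the Dockerfile's import check; the class name is assumed.
from distributed_shampoo import DistributedShampoo

model = torch.nn.Linear(1024, 1024)  # stand-in for the real FSDP-wrapped model

# Hyperparameters mirror the optimizer block in shampoo_opt_multi_node.yaml.
# Upstream also exposes preconditioner and grafting options that are omitted here.
optimizer = DistributedShampoo(
    model.parameters(),
    lr=2e-4,
    betas=(0.9, 0.985),
    epsilon=1e-8,
    weight_decay=0.01,
)
```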
35 changes: 35 additions & 0 deletions docker/docker-compose.rocm70_9-1-shampoo.yaml
@@ -0,0 +1,35 @@
services:
  torchenv-rocm70-shampoo:
    container_name: training-overlap-bugs-rocm70_9-1-shampoo
    build:
      context: .
      dockerfile: Dockerfile.rocm70_9-1-shampoo
      network: host
    user: root
    privileged: true
    network_mode: host
    group_add:
      - video
    ipc: host
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    environment:
      - RCCL_FOLDER=/rccl
      - LD_LIBRARY_PATH=/rccl/build/release:${LD_LIBRARY_PATH:-}
      - TORCH_NCCL_HIGH_PRIORITY=1

    volumes:
      - /home/manrao:/manrao
      - /home/oyazdanb/aorta:/workspace/aorta
    devices:
      - /dev/kfd
      - /dev/dri
    working_dir: /workspace/aorta
    shm_size: 17G
    ulimits:
      memlock: -1
      stack: 67108864
    stdin_open: true
    tty: true
1 change: 1 addition & 0 deletions docker/docker-compose.rocm70_9-1.yaml
@@ -4,6 +4,7 @@ services:
    build:
      context: .
      dockerfile: Dockerfile.rocm70_9-1
      network: host
    user: root
    privileged: true
    network_mode: host