16 changes: 14 additions & 2 deletions .gitignore
@@ -42,19 +42,25 @@ checkpoints/
runs/
logs/
tensorboard/
experiments/
artifacts/
artifacts_*/
overlap_debug*/

# Experiment outputs and traces
experiments/
*.json
*.jsonl
*.xlsx
*.log
*.logs
*_merged_enhanced.json
nccl_thread_sweep_*.log
nccl_thread_sweep_summary_*.txt
expected_outputs/
testdata/
*.log
actual_outputs/
*.html

# IDE/project-specific folders
.vscode/
.idea/
@@ -68,7 +74,13 @@ Thumbs.db
# Data files
*.dat

# Profiler traces
tmp_prof.*
rocprof_traces
overlap.*
.*repro.*
*.gz
trace_*

# Multi-node IP configuration (should be customized per deployment)
node_ip_list.txt
14 changes: 8 additions & 6 deletions config/multi_node/distributed_multinode.yaml
@@ -5,7 +5,7 @@ training:
  batch_size: 448
  gradient_accumulation: 4
  mixed_precision: bf16
  max_steps: 2000
  max_steps: 10
  grad_clip_norm: 1.0
  output_dir: artifacts/optuna_tracelens_sweep_fixed/trial_0000
  log_interval: 20
@@ -46,6 +46,8 @@ fsdp:
  forward_prefetch: true
  sync_module_states: true
  param_init_device: meta
  # hybrid_shard_gpus_per_node: auto-detects from --nproc (via LOCAL_WORLD_SIZE)
  # Only set manually if auto-detection fails
distributed:
  backend: nccl
  mode: fsdp
@@ -54,7 +56,7 @@ distributed:
  static_graph: true
  find_unused_parameters: false
compile:
  enabled: true
  enabled: false
  backend: inductor
  mode: max-autotune
  fullgraph: false
Expand Down Expand Up @@ -82,10 +84,10 @@ dataloader:
  pin_memory: true
profiling:
  enabled: true
  wait: 5
  warmup: 5
  active: 1
  repeat: 1
  wait: 1
  warmup: 2
  active: 5
  repeat: 0
  record_shapes: false
  profile_memory: false
  with_stack: false
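The new comment in the fsdp block says `hybrid_shard_gpus_per_node` is auto-detected from torchrun's `--nproc-per-node` via `LOCAL_WORLD_SIZE`. As a rough illustration of how that fallback chain could work (the helper name and the config handling here are hypothetical, not this repo's actual code):

```python
import os

import torch


def resolve_gpus_per_node(cfg_value=None):
    """Hypothetical helper: prefer an explicit config value, otherwise fall back
    to LOCAL_WORLD_SIZE, which torchrun sets to the --nproc-per-node value."""
    if cfg_value is not None:            # manual override from the YAML
        return int(cfg_value)
    env = os.environ.get("LOCAL_WORLD_SIZE")
    if env is not None:                  # set by torchrun / torch.distributed.run
        return int(env)
    return torch.cuda.device_count()     # last resort: all visible GPUs on this host
```

Per the comment, the key should only be set explicitly in the YAML when this detection returns the wrong value.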
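For reference, the profiler knobs changed here (wait/warmup/active/repeat) map directly onto `torch.profiler.schedule`; the snippet below only illustrates what the new values mean and is not code from this repo (the output directory and loop variables are placeholders):

```python
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

# wait=1: skip 1 step, warmup=2: run the profiler but discard 2 steps,
# active=5: record 5 steps, repeat=0: keep cycling for the whole run.
sched = schedule(wait=1, warmup=2, active=5, repeat=0)

prof = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  # CUDA activity also covers ROCm/HIP builds
    schedule=sched,
    on_trace_ready=tensorboard_trace_handler("./tb_traces"),   # illustrative output dir
    record_shapes=False,
    profile_memory=False,
    with_stack=False,
)

# Typical usage inside the training loop:
# with prof:
#     for step, batch in enumerate(loader):
#         train_step(batch)
#         prof.step()   # advances the wait/warmup/active state machine
```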
113 changes: 113 additions & 0 deletions config/multi_node/shampoo_opt_multi_node.yaml
@@ -0,0 +1,113 @@
# Shampoo-only variant of the user workload.
# NOTE: User reports this configuration produces NaNs when using Shampoo,
# while AdamW and other optimizers remain healthy. Monitor loss closely.

logging:
  level: INFO

training:
  epochs: 10
  batch_size: 512
  gradient_accumulation: 2
  mixed_precision: bf16
  max_steps: 2200
  grad_clip_norm: 1.0
  output_dir: artifacts/user_shampoo
  log_interval: 20
  additional_compute_streams: 2
  lightweight_op_waves: 3

optimizer:
  name: shampoo
  lr: 0.0002
  weight_decay: 0.01
  betas: [0.9, 0.985]
  eps: 1.0e-8

scheduler:
  warmup_steps: 200
  total_steps: 2200

dataset:
  num_samples: 200000
  sequence_length: 160
  dense_dim: 256
  sparse_features: 64
  vocab_size: 350000
  num_dense_features: 32
  seed: 2025

model:
  vocab_size: 350000
  embedding_dim: 256
  num_dense_features: 32
  dense_dim: 256
  model_dim: 1024
  num_heads: 16
  num_layers: 18
  dropout: 0.1
  mlp_hidden_dim: 4096

fsdp:
  sharding_strategy: hybrid_shard
  backward_prefetch: BACKWARD_PRE
  use_orig_params: true
  limit_all_gathers: true
  forward_prefetch: true
  sync_module_states: true
  param_init_device: meta

distributed:
  backend: nccl
  mode: fsdp
  bucket_cap_mb: 128
  gradient_as_bucket_view: true
  static_graph: true
  find_unused_parameters: false

compile:
  enabled: false
  backend: inductor
  mode: max-autotune
  fullgraph: false
  dynamic: false

streams:
  num_streams: 4
  high_priority:
    - allreduce
    - reducescatter
  stream_assignments:
    compute:
      - dev6_stream3
      - dev6_stream9
    communication:
      - dev6_stream13
      - dev6_stream17
    reducescatter:
      - dev6_stream22
    aux:
      - dev6_stream0

dataloader:
  num_workers: 0
  pin_memory: true

profiling:
  enabled: true
  wait: 2
  warmup: 2
  active: 6
  repeat: 1
  record_shapes: true
  profile_memory: true
  with_stack: false
  with_flops: false
  # tensorboard: true
  # chrome_trace: true
  tensorboard: false
  chrome_trace: false
  trace_filename: user_shampoo.json

tracelens:
  enabled: false
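Given the NaN warning in this file's header comment, a lightweight guard in the training loop makes the divergence visible as soon as it starts rather than hundreds of steps later. This is a generic sketch, not this repo's training script; the loss/model/step variables are placeholders, and it would be called after `loss.backward()` and before `optimizer.step()`:

```python
import math

import torch


def check_finite(loss, model, step, log_every=20):
    """Raise as soon as the loss or local gradients stop being finite."""
    if not torch.isfinite(loss):
        raise RuntimeError(f"non-finite loss {loss.item()} at step {step}")
    if step % log_every == 0:
        sq_sum = 0.0
        for p in model.parameters():
            if p.grad is not None:
                # Under FSDP this is the local shard's gradient, which is
                # sufficient for NaN/Inf detection on every rank.
                sq_sum += float(p.grad.detach().float().norm() ** 2)
        grad_norm = math.sqrt(sq_sum)
        if not math.isfinite(grad_norm):
            raise RuntimeError(f"non-finite gradient norm at step {step}")
        print(f"step {step}: loss={loss.item():.4f} grad_norm={grad_norm:.2f}")
```

The gradient-norm check is a cheap early indicator, since the norm usually blows up a few steps before the loss itself turns NaN.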
57 changes: 57 additions & 0 deletions docker/Dockerfile.rocm70_9-1-shampoo
@@ -0,0 +1,57 @@
# Start from the existing PyTorch ROCm image
FROM rocm/pytorch-private:20251030_rocm_e2e_phantom_mi350_genai_nightly

# Switch to root to install packages
USER root

# Install wget and git if not available
RUN (yum install -y wget git 2>/dev/null || (apt-get update && apt-get install -y wget git)) || true

# Download and install amdgpu-install package for RHEL
RUN wget https://artifactory-cdn.amd.com/artifactory/list/amdgpu-rpm/rhel/amdgpu-install-internal-7.0_9-1.noarch.rpm && \
    (yum install -y ./amdgpu-install-internal-7.0_9-1.noarch.rpm || rpm -ivh ./amdgpu-install-internal-7.0_9-1.noarch.rpm) && \
    rm amdgpu-install-internal-7.0_9-1.noarch.rpm

# Update amdgpu-repo with specific builds
RUN amdgpu-repo --amdgpu-build=2247890 --rocm-build=compute-rocm-rel-7.0-meta/7

# Since base image already has ROCm, just update the key runtime components
RUN yum update -y --skip-broken \
    rocm-hip \
    rocm-libs \
    rocm-hip-libraries \
    rocm-hip-runtime-devel \
    hip-base \
    hip-dev \
    hip-runtime-amd \
    rocm-core || echo "Updated available ROCm packages"

RUN python3.10 -m pip install git+https://github.com/AMD-AGI/TraceLens.git
RUN python3.10 -m pip install openpyxl seaborn

# Download and install rocprof-trace-decoder
RUN wget https://github.com/ROCm/rocprof-trace-decoder/releases/download/0.1.6/rocprof-trace-decoder-manylinux-2.28-0.1.6-Linux.rpm \
    -O /tmp/rocprof-trace-decoder.rpm && \
    rpm -i /tmp/rocprof-trace-decoder.rpm && \
    rm /tmp/rocprof-trace-decoder.rpm

# Install Facebook Distributed Shampoo optimizer
RUN cd /tmp && \
    git clone https://github.com/facebookresearch/optimizers.git && \
    cd optimizers && \
    python3.10 -m pip install . && \
    cd / && \
    rm -rf /tmp/optimizers

# Verify Shampoo installation
RUN python3.10 -c "import distributed_shampoo; print('[OK] Shampoo optimizer installed successfully')"

# Update environment variables
ENV ROCM_HOME=/opt/rocm
ENV PATH=$ROCM_HOME/bin:$PATH
ENV LD_LIBRARY_PATH=$ROCM_HOME/lib:$LD_LIBRARY_PATH

# Set working directory
WORKDIR /workspace/aorta

CMD ["/bin/bash"]
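The image above installs the optimizer from facebookresearch/optimizers and only verifies that `distributed_shampoo` imports. For completeness, constructing it with the hyperparameters from `shampoo_opt_multi_node.yaml` might look like the sketch below; the class name and keyword arguments (notably `epsilon` versus the YAML's `eps`) are assumptions taken from the upstream README, to be checked against the installed package rather than treated as a confirmed API:

```python
import torch

# Module name is verified by the Dockerfile's import check; the class name is assumed.
from distributed_shampoo import DistributedShampoo

model = torch.nn.Linear(1024, 1024)  # stand-in for the real FSDP-wrapped model

# Hyperparameters mirror the optimizer block in shampoo_opt_multi_node.yaml.
# Upstream also exposes preconditioner and grafting options that are omitted here.
optimizer = DistributedShampoo(
    model.parameters(),
    lr=2e-4,
    betas=(0.9, 0.985),
    epsilon=1e-8,
    weight_decay=0.01,
)
```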
35 changes: 35 additions & 0 deletions docker/docker-compose.rocm70_9-1-shampoo.yaml
@@ -0,0 +1,35 @@
services:
  torchenv-rocm70-shampoo:
    container_name: training-overlap-bugs-rocm70_9-1-shampoo
    build:
      context: .
      dockerfile: Dockerfile.rocm70_9-1-shampoo
      network: host
    user: root
    privileged: true
    network_mode: host
    group_add:
      - video
    ipc: host
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    environment:
      - RCCL_FOLDER=/rccl
      - LD_LIBRARY_PATH=/rccl/build/release:${LD_LIBRARY_PATH:-}
      - TORCH_NCCL_HIGH_PRIORITY=1

    volumes:
      - /home/manrao:/manrao
      - /home/oyazdanb/aorta:/workspace/aorta
    devices:
      - /dev/kfd
      - /dev/dri
    working_dir: /workspace/aorta
    shm_size: 17G
    ulimits:
      memlock: -1
      stack: 67108864
    stdin_open: true
    tty: true
1 change: 1 addition & 0 deletions docker/docker-compose.rocm70_9-1.yaml
@@ -4,6 +4,7 @@ services:
    build:
      context: .
      dockerfile: Dockerfile.rocm70_9-1
      network: host
    user: root
    privileged: true
    network_mode: host