diff --git a/docs/performance-summary.md b/docs/performance-summary.md index 3876e485..791eb7b9 100644 --- a/docs/performance-summary.md +++ b/docs/performance-summary.md @@ -46,14 +46,20 @@ The performance data includes: | Model | #-GPUs | GBS | MBS | Sequence Length | FSDP | TP | SP | PP | CP | VP | EP | Model TFLOP / sec / GPU | |-------|--------|-----|-----|-----------------|------|----|----|----|----|----|----|-------------------------| -|Wan 2.1 14B|32|64|1|37440|0|1|0|1|4|0|0|787.59| +|Wan 2.1 14B|32|64|1|37440|1|1|0|1|2|0|0|899.62| #### System: DGX-GB300 | Model | #-GPUs | GBS | MBS | Sequence Length | FSDP | TP | SP | PP | CP | VP | EP | Model TFLOP / sec / GPU | |-------|--------|-----|-----|-----------------|------|----|----|----|----|----|----|-------------------------| -|Wan 2.1 14B|32|64|1|37440|0|1|0|1|2|0|0|1,022.26| +|Wan 2.1 14B|32|64|1|37440|0|1|0|1|2|0|0|1,030.67| + +#### System: DGX-B200 + +| Model | #-GPUs | GBS | MBS | Sequence Length | FSDP | TP | SP | PP | CP | VP | EP | Model TFLOP / sec / GPU | +|-------|--------|-----|-----|-----------------|------|----|----|----|----|----|----|-------------------------| +|Wan 2.1 14B|32|64|1|37440|1|1|0|1|2|0|0|804.02| #### System: DGX-H100 diff --git a/examples/megatron/recipes/wan/README_perf_test.md b/examples/megatron/recipes/wan/README_perf_test.md index d95a4c14..d101ca33 100644 --- a/examples/megatron/recipes/wan/README_perf_test.md +++ b/examples/megatron/recipes/wan/README_perf_test.md @@ -5,7 +5,7 @@ This guide provides concise steps to set up the environment and run WAN pretrain ## Container Launch ```bash -CONT="nvcr.io/nvidia/nemo:25.09.00" +CONT="nvcr.io/nvidia/nemo:25.11" MOUNT="/lustre/fsw/:/lustre/fsw/" srun -t 02:00:00 \ @@ -28,18 +28,18 @@ cd /opt/ # DFM (pinned) git clone --no-checkout https://github.com/NVIDIA-NeMo/DFM.git -git -C DFM checkout 174bb7b34de002ebbbcae1ba8e2b12363c7dee01 +git -C DFM checkout 9eaace14995a724c982fe53726a909be2edc93cb export DFM_PATH=/opt/DFM # 
Megatron-Bridge (pinned) rm -rf /opt/Megatron-Bridge -git clone --no-checkout https://github.com/huvunvidia/Megatron-Bridge.git -git -C Megatron-Bridge checkout 713ab548e4bfee307eb94a7bb3f57c17dbb31b50 +git clone --no-checkout https://github.com/NVIDIA-NeMo/Megatron-Bridge.git +git -C Megatron-Bridge checkout 953aabf75c0500180dc14a6a76cf9e7e7c4baec7 # Megatron-LM (pinned) rm -rf /opt/Megatron-LM git clone --no-checkout https://github.com/NVIDIA/Megatron-LM.git -git -C Megatron-LM checkout ce8185cbbe04f38beb74360e878450f2e8525885 +git -C Megatron-LM checkout 2d398b42fd4237fffb553109563d73ac099751c3 # Python path export PYTHONPATH="${DFM_PATH}/.:/opt/Megatron-Bridge/.:/opt/Megatron-LM" @@ -141,7 +141,13 @@ NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=8 examples/megatron/recipes/wan/pret ### Using mock data (optional, for debugging) - Using `--mock` argument. -- Adjust `video_size` (F_latents, H_latents, W_latents) and `number_packed_samples` of `WanMockDataModuleConfig` in `wan.py`. Total `seq_len = F * H * W * number_packed_samples`. +- Adjust `video_size` (F_latents, H_latents, W_latents) and `number_packed_samples` of `WanMockDataModuleConfig` in `wan.py`. Total `seq_len = (F_latents // patch_temporal) * (H_latents // patch_spatial) * (W_latents // patch_spatial) * number_packed_samples`. + +### Reproducing performance recipes + +- Please use the appropriate system config recipe in `examples/megatron/recipes/wan/conf/<system>_perf_pretrain_mock.yaml` +- Usage example `examples/megatron/recipes/wan/pretrain_wan.py --mock --training-mode pretrain --config-file examples/megatron/recipes/wan/conf/gb300_perf_pretrain_mock.yaml` +- Note that the FLOPs calculation for Wan 2.1 is not currently supported in Megatron-Bridge. Please use a manual calculator until a fix is made. 
## Inference diff --git a/examples/megatron/recipes/wan/conf/b200_perf_pretrain_mock.yaml b/examples/megatron/recipes/wan/conf/b200_perf_pretrain_mock.yaml new file mode 100644 index 00000000..13fc0c8d --- /dev/null +++ b/examples/megatron/recipes/wan/conf/b200_perf_pretrain_mock.yaml @@ -0,0 +1,43 @@ +model: + tensor_model_parallel_size: 1 + sequence_parallel: false + pipeline_model_parallel_size: 1 + context_parallel_size: 2 + crossattn_emb_size: 5120 + hidden_size: 5120 + ffn_hidden_size: 13824 + num_attention_heads: 40 + num_layers: 40 + qkv_format: thd + seq_length: 2048 # This is not used + +train: + global_batch_size: 64 + micro_batch_size: 1 + eval_iters: 0 + +scheduler: + lr_decay_style: constant + lr_warmup_iters: 0 + +optimizer: + lr: 5e-6 + min_lr: 5e-6 + +dataset: + seq_length: 2048 # This is not used + global_batch_size: 64 + micro_batch_size: 1 + +logger: + log_interval: 1 + +ddp: + use_megatron_fsdp: true + data_parallel_sharding_strategy: "optim_grads_params" + +dist: + use_megatron_fsdp: true + +checkpoint: + ckpt_format: "fsdp_dtensor" diff --git a/examples/megatron/recipes/wan/conf/gb200_perf_pretrain_mock.yaml b/examples/megatron/recipes/wan/conf/gb200_perf_pretrain_mock.yaml index 7b170d36..13fc0c8d 100644 --- a/examples/megatron/recipes/wan/conf/gb200_perf_pretrain_mock.yaml +++ b/examples/megatron/recipes/wan/conf/gb200_perf_pretrain_mock.yaml @@ -2,7 +2,7 @@ model: tensor_model_parallel_size: 1 sequence_parallel: false pipeline_model_parallel_size: 1 - context_parallel_size: 4 + context_parallel_size: 2 crossattn_emb_size: 5120 hidden_size: 5120 ffn_hidden_size: 13824 @@ -31,3 +31,13 @@ dataset: logger: log_interval: 1 + +ddp: + use_megatron_fsdp: true + data_parallel_sharding_strategy: "optim_grads_params" + +dist: + use_megatron_fsdp: true + +checkpoint: + ckpt_format: "fsdp_dtensor"