diff --git a/examples/configs/distillation_math.yaml b/examples/configs/distillation_math.yaml index 67ff8a71d2..891976166d 100644 --- a/examples/configs/distillation_math.yaml +++ b/examples/configs/distillation_math.yaml @@ -84,77 +84,6 @@ policy: &POLICY_BASE foreach: False fused: False - megatron_cfg: &MEGATRON_BASE - enabled: false - empty_unused_memory_level: 0 - activation_checkpointing: false - converter_type: "Qwen3ForCausalLM" - tensor_model_parallel_size: 2 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 2 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 2 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - optimizer: - optimizer: "adam" - lr: 2.00001e-5 - min_lr: 2.0e-5 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - clip_grad: ${policy.max_grad_norm} - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 10 - lr_warmup_init: 2.0e-6 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" - scheduler: - name: "torch.optim.lr_scheduler.LinearLR" kwargs: diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml index ae2fbcd3e1..2865707fbb 100644 --- a/examples/configs/distillation_math_megatron.yaml +++ b/examples/configs/distillation_math_megatron.yaml @@ -35,6 +35,8 @@ policy: &POLICY_BASE make_sequence_length_divisible_by: ${mul:${mul:${.megatron_cfg.tensor_model_parallel_size}, ${.megatron_cfg.context_parallel_size}}, 2} + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_1p7b_pretrain_config + megatron_cfg: &MEGATRON_BASE enabled: true empty_unused_memory_level: 0 @@ -140,6 +142,7 @@ policy: &POLICY_BASE teacher: <<: *POLICY_BASE model_name: "Qwen/Qwen3-4B" + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_4b_pretrain_config megatron_cfg: <<: *MEGATRON_BASE context_parallel_size: 2 diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index f2b57b0bbd..ef21c555d2 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -106,78 +106,7 @@ policy: factor: 1.0 total_iters: 10000000000 - milestones: [20] - - ## ignored since enabled=false, but needed for testing purposes - megatron_cfg: - enabled: false - empty_unused_memory_level: 1 - activation_checkpointing: false - 
tensor_model_parallel_size: 2 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: true - freeze_moe_router: false - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - optimizer: - optimizer: "adam" - lr: 5.0e-6 #4.0e-5 - min_lr: 5.0e-6 #4.0e-5 - weight_decay: 0.1 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-8 - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_warmup_iters: 1 - lr_warmup_init: 0.00000001 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - data_parallel_sharding_strategy: "optim_grads_params" - use_custom_fsdp: false - data: max_input_seq_length: ${policy.max_total_sequence_length} shuffle: true diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 90269726d7..35dbe01e79 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -105,81 +105,6 @@ policy: lora_A_init: "xavier" # Initialization method for LoRA A matrix: "xavier" or "uniform" use_triton: true # Use Triton-optimized kernels for LoRA (faster but requires flash-attn). Disable when tensor_parallel_size > 1 - megatron_cfg: - enabled: false - empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. 
- activation_checkpointing: false - converter_type: "Qwen2ForCausalLM" - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - optimizer: - optimizer: "adam" - lr: 5.0e-6 - min_lr: 5.0e-7 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 13 - lr_warmup_init: 5.0e-7 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" - - fp8_cfg: null - - env_vars: null # See docs/design-docs/sequence-packing-and-dynamic-batching.md # for more details on dynamic batching and sequence packing. diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index a9368481ae..671e0cbbb1 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -70,75 +70,9 @@ policy: sequence_length_round: 64 max_grad_norm: 1.0 - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training - make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} - optimizer: null # remove default FSDP optimizer - - megatron_cfg: - enabled: true - empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. 
- activation_checkpointing: false - converter_type: "Qwen2ForCausalLM" - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - - optimizer: - optimizer: "adam" - lr: 5.0e-6 - min_lr: 5.0e-7 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 13 - lr_warmup_init: 5.0e-7 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" + optimizer: null # remove default FSDP optimizer generation: backend: "vllm" diff --git a/examples/configs/grpo_math_70B_megatron.yaml b/examples/configs/grpo_math_70B_megatron.yaml index 4d17fdcea3..c89e4e57b8 100644 --- a/examples/configs/grpo_math_70B_megatron.yaml +++ b/examples/configs/grpo_math_70B_megatron.yaml @@ -22,6 +22,8 @@ policy: scheduler: null # remove default FSDP scheduler + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config + megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/grpo_math_8B_megatron.yaml b/examples/configs/grpo_math_8B_megatron.yaml index 977ab394b5..e52b3d2d3e 100644 --- a/examples/configs/grpo_math_8B_megatron.yaml +++ b/examples/configs/grpo_math_8B_megatron.yaml @@ -28,6 +28,8 @@ policy: scheduler: null # remove default FSDP scheduler + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config + megatron_cfg: enabled: true empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. 
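The `megatron_recipe` values being added throughout these configs are fully qualified Python import paths to Megatron-Bridge recipe functions. A minimal sketch of how such a path resolves (assuming megatron-bridge is installed in the environment; this mirrors the `load_recipe()` helper introduced later in this diff):

```python
import importlib

# Recipe path copied verbatim from grpo_math_8B_megatron.yaml above.
recipe_path = "megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config"

# Split "pkg.module.func" into the module to import and the function to fetch.
module_path, _, func_name = recipe_path.rpartition(".")
recipe_fn = getattr(importlib.import_module(module_path), func_name)

# Calling the recipe function returns a pre-populated Megatron-Bridge
# ConfigContainer (model, optimizer, scheduler, ddp, ...) for this model.
recipe_cfg = recipe_fn()
print(type(recipe_cfg).__name__)
```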
diff --git a/examples/configs/grpo_math_8B_megatron_fp8.yaml b/examples/configs/grpo_math_8B_megatron_fp8.yaml index ba6ee6e5c8..9548979c1c 100644 --- a/examples/configs/grpo_math_8B_megatron_fp8.yaml +++ b/examples/configs/grpo_math_8B_megatron_fp8.yaml @@ -19,4 +19,4 @@ policy: optimizer: use_precision_aware_optimizer: false env_vars: - NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" \ No newline at end of file + NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" diff --git a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml index 37616e32b0..2d4f0f3151 100644 --- a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml +++ b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml @@ -26,6 +26,8 @@ policy: scheduler: null # remove default FSDP scheduler + megatron_recipe: megatron.bridge.recipes.qwen.qwen3_30b_a3b_finetune_config + megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml b/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml index 9035a3598c..8f615b4361 100644 --- a/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml +++ b/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml @@ -34,6 +34,7 @@ policy: dtensor_cfg: _v2: false context_parallel_size: 4 + megatron_recipe: megatron.bridge.recipes.qwen.qwen25_7b_finetune_config megatron_cfg: tensor_model_parallel_size: 4 pipeline_model_parallel_size: 2 diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml index 6fda3fe24e..d8cce7d5d0 100644 --- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml +++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml @@ -22,6 +22,7 @@ policy: ${.megatron_cfg.context_parallel_size}}, 2} megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config teacher: model_name: Qwen/Qwen3-32B dtensor_cfg: @@ -30,6 +31,7 @@ teacher: enabled: false sequence_packing: enabled: true + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml index 8df4bc3fb0..44843ac0c1 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml @@ -18,6 +18,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 8b3a43ea28..8e8b2a8a3d 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -20,6 +20,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: 
megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true pipeline_model_parallel_size: 2 diff --git a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml index 8d19757d54..0523d30ac8 100644 --- a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml @@ -17,6 +17,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.deepseek.deepseek_v3.deepseek_v3_pretrain_config megatron_cfg: enabled: true activation_checkpointing: true diff --git a/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml index b3dec78e98..4f2a8ee3ec 100755 --- a/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml @@ -8,6 +8,7 @@ policy: model_name: openai/gpt-oss-20b train_micro_batch_size: 1 max_total_sequence_length: 4096 + megatron_recipe: megatron.bridge.recipes.openai.gpt_oss.gpt_oss_20b_pretrain_config megatron_cfg: enabled: true expert_model_parallel_size: 8 diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml index dcd791eee6..8d21260fc6 100644 --- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml @@ -16,6 +16,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true converter_type: LlamaForCausalLM diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml index 6411c6fb49..4930f552c2 100644 --- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml @@ -17,6 +17,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true converter_type: LlamaForCausalLM diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml index 333a06d980..3133e9d3eb 100755 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml @@ -10,6 +10,7 @@ policy: tokenizer: name: meta-llama/Llama-3.2-1B-Instruct optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config megatron_cfg: enabled: true scheduler: diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml index bb641388d8..f89d752e81 100644 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml +++ 
b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml @@ -10,6 +10,7 @@ policy: tokenizer: name: meta-llama/Llama-3.2-1B-Instruct optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config megatron_cfg: enabled: true scheduler: diff --git a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml index 92fb87c196..5c8d8594fd 100644 --- a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml +++ b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml @@ -20,6 +20,7 @@ policy: make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null scheduler: null + megatron_recipe: megatron.bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_finetune_config megatron_cfg: enabled: true converter_type: LlamaForCausalLM diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml index 27108c55c7..951bb0371f 100644 --- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml @@ -18,6 +18,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.moonlight.moonlight_16b_pretrain_config megatron_cfg: enabled: true moe_router_dtype: fp32 diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml index 83ea6128ef..8674bdf00a 100644 --- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml @@ -20,6 +20,7 @@ policy: algorithm: modified_ffd make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.moonlight.moonlight_16b_pretrain_config megatron_cfg: enabled: true expert_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml index 86690abcc2..cd7a7c8b96 100644 --- a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml @@ -8,6 +8,7 @@ policy: tokenizer: name: nvidia/NVIDIA-Nemotron-Nano-12B-v2 optimizer: null + megatron_recipe: megatron.bridge.recipes.nemotronh.nemotron_nano_12b_v2_pretrain_config megatron_cfg: enabled: true bias_activation_fusion: false diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml index fd0a48a663..e37c892929 100755 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml @@ -14,6 +14,7 @@ policy: max_total_sequence_length: 4096 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen25_7b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 2 diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml index 
6e0aa5cd81..c7f3eca79f 100755 --- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml @@ -17,6 +17,7 @@ policy: enabled: false algorithm: modified_ffd make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron.bridge.recipes.qwen.qwen3_30b_a3b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml index 69ff4a4229..777100853f 100644 --- a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml @@ -15,6 +15,7 @@ policy: enabled: false optimizer: null scheduler: null + megatron_recipe: megatron.bridge.recipes.qwen.qwen3_8b_pretrain_config megatron_cfg: enabled: true converter_type: Qwen3ForCausalLM diff --git a/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n4g.yaml.swp b/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n4g.yaml.swp deleted file mode 100644 index 287b7b0973..0000000000 Binary files a/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n4g.yaml.swp and /dev/null differ diff --git a/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n8g.yaml.swp b/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n8g.yaml.swp deleted file mode 100644 index 98e5b39f68..0000000000 Binary files a/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n8g.yaml.swp and /dev/null differ diff --git a/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml b/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml index 9c4edd2b30..2bfaf20955 100644 --- a/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml +++ b/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml @@ -40,6 +40,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${mul:${policy.dtensor_cfg.tensor_parallel_size}, ${mul:2, ${policy.dtensor_cfg.context_parallel_size}}} + megatron_recipe: megatron_bridge.recipes.deepseek.deepseek_v3_pretrain_config megatron_cfg: empty_unused_memory_level: 2 enabled: true diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml index 04fc067d6e..890124d3e0 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml @@ -4,14 +4,14 @@ checkpointing: policy: sequence_packing: enabled: false - megatron_cfg: - pipeline_model_parallel_size: 8 - expert_model_parallel_size: 16 - num_layers_in_first_pipeline_stage: 7 - num_layers_in_last_pipeline_stage: 6 generation: vllm_cfg: tensor_parallel_size: 32 + megatron_cfg: + pipeline_model_parallel_size: 8 + expert_model_parallel_size: 16 + num_layers_in_first_pipeline_stage: 7 + num_layers_in_last_pipeline_stage: 6 logger: log_dir: logs/grpo-deepseek-v3-32n4g wandb: diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml index 75457ab802..7965f72764 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml @@ -19,6 +19,7 @@ 
policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron_bridge.recipes.deepseek.deepseek_v3_pretrain_config_32nodes megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml index a99f7c1498..e3c9e25c85 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml @@ -17,6 +17,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron_bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml index afdbf8c414..fb0f103855 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml @@ -17,6 +17,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron_bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml index 1376c8d340..e2e02de396 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml @@ -19,6 +19,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron_bridge.recipes.qwen.qwen3_moe.qwen3_235b_a22b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml index 21b9746f4b..c4749c0faf 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml @@ -14,6 +14,7 @@ policy: optimizer: null scheduler: null make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron_bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml index 2270d5e272..d2a4eb24b5 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml @@ -14,6 +14,7 @@ policy: optimizer: null scheduler: null make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron_bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml index 795764d3ee..6a029c6fde 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml +++ 
b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml @@ -7,6 +7,7 @@ checkpointing: checkpoint_dir: results/grpo-qwen3-30ba3b-4n8g policy: model_name: Qwen/Qwen3-30B-A3B + megatron_recipe: megatron_bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_pretrain_config train_micro_batch_size: 1 max_total_sequence_length: 4096 dtensor_cfg: diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml index 2e441cdb5f..d17dad323a 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml @@ -14,6 +14,7 @@ policy: optimizer: null scheduler: null make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron_bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml index ad780ebc50..7b33ced71a 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml @@ -14,6 +14,7 @@ policy: optimizer: null scheduler: null make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron_bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml index bb43955812..c638f8a85d 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml @@ -17,6 +17,7 @@ policy: max_total_sequence_length: 4096 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml index b2b76c0afd..96ccf66d44 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml @@ -14,6 +14,7 @@ policy: chat_template: default dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true peft: diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml index aa62330e3e..43e358acea 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml @@ -19,6 +19,7 @@ policy: enabled: true make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 2 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml index 7e9452dff7..d3ba4e5a28 100644 --- 
a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml @@ -17,6 +17,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 2 diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml index d3bdd77bb2..0b3388f915 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml @@ -9,6 +9,7 @@ policy: max_total_sequence_length: 16384 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.qwen.qwen25_7b_finetune_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml index d81a58980e..45188bc54e 100644 --- a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml +++ b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml @@ -23,4 +23,4 @@ logger: wandb: name: vlm-grpo-3b-megatron cluster: - gpus_per_node: 8 \ No newline at end of file + gpus_per_node: 8 diff --git a/examples/configs/rm.yaml b/examples/configs/rm.yaml index 4b0936fec5..49e56d11e8 100644 --- a/examples/configs/rm.yaml +++ b/examples/configs/rm.yaml @@ -73,62 +73,6 @@ policy: foreach: false fused: false - ## ignored since enabled=false, but needed for testing purposes - megatron_cfg: - enabled: false - empty_unused_memory_level: 1 - activation_checkpointing: false - tensor_model_parallel_size: 2 - pipeline_model_parallel_size: 2 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: false - - optimizer: - optimizer: "adam" - lr: 2.0e-6 - min_lr: 1.9999e-6 - weight_decay: 0.1 - bf16: false - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-5 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 50 - lr_warmup_init: 1.9999e-6 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: false - data_parallel_sharding_strategy: "optim_grads_params" - - data: max_input_seq_length: ${policy.max_total_sequence_length} shuffle: true diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 6d53d7f606..71d8c1cc84 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -88,93 +88,6 @@ policy: foreach: False fused: False - ## ignored since enabled=false, but needed for testing purposes - megatron_cfg: - enabled: false - env_vars: {} - empty_unused_memory_level: 1 - 
activation_checkpointing: false - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: false - freeze_moe_router: false - moe_router_dtype: null - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - peft: - enabled: false - target_modules: [] - exclude_modules: [] - dim: 8 - alpha: 32 - dropout: 0.0 - dropout_position: "post" - lora_A_init_method: "xavier" - lora_B_init_method: "zero" - a2a_experimental: false - lora_dtype: None - - - optimizer: - optimizer: "adam" # When weight decay is set, it actually uses AdamW - lr: 5.0e-6 - min_lr: 4.9999e-6 - weight_decay: 0.1 # When weight decay is set, it actually uses AdamW - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-5 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 50 - lr_warmup_init: 4.9999e-6 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - data_parallel_sharding_strategy: "optim_grads_params" - use_custom_fsdp: false - data: max_input_seq_length: ${policy.max_total_sequence_length} add_bos: true diff --git a/examples/configs/sft_openmathinstruct2.yaml b/examples/configs/sft_openmathinstruct2.yaml index 63fa6d65e4..00b7bbf8e7 100644 --- a/examples/configs/sft_openmathinstruct2.yaml +++ b/examples/configs/sft_openmathinstruct2.yaml @@ -39,9 +39,6 @@ policy: context_parallel_size: 1 custom_parallel_plan: null - megatron_cfg: - enabled: false - dynamic_batching: enabled: false diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml index faca12e0ae..2d137012ef 100644 --- a/examples/configs/sft_openmathinstruct2_megatron.yaml +++ b/examples/configs/sft_openmathinstruct2_megatron.yaml @@ -32,6 +32,8 @@ policy: dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config + megatron_cfg: activation_checkpointing: false context_parallel_size: 1 diff --git a/examples/configs/vlm_grpo_3B.yaml b/examples/configs/vlm_grpo_3B.yaml index f9612007a4..c28b11add8 100644 --- a/examples/configs/vlm_grpo_3B.yaml +++ b/examples/configs/vlm_grpo_3B.yaml @@ -80,79 +80,6 @@ policy: context_parallel_size: 1 custom_parallel_plan: null - megatron_cfg: - enabled: false - empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning 
generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. - activation_checkpointing: false - converter_type: "Qwen2ForCausalLM" - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - optimizer: - optimizer: "adam" - lr: 5.0e-6 - min_lr: 5.0e-7 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 13 - lr_warmup_init: 5.0e-7 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" - - # dynamic_batching improves performance by ensuring logprob and training microbatches # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length # responses are sorted by sequence length and bucketed into microbatches with a total diff --git a/examples/configs/vlm_grpo_3B_megatron.yaml b/examples/configs/vlm_grpo_3B_megatron.yaml index b32cd7df04..9b0275ca47 100644 --- a/examples/configs/vlm_grpo_3B_megatron.yaml +++ b/examples/configs/vlm_grpo_3B_megatron.yaml @@ -77,7 +77,6 @@ policy: train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} sequence_length_round: 64 - make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} max_grad_norm: 1.0 sequence_packing: enabled: false @@ -123,65 +122,6 @@ policy: resources: gpus_per_node: null num_nodes: null - megatron_cfg: - enabled: true - empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. 
- activation_checkpointing: false - converter_type: Qwen2ForCausalLM - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: fp64 - moe_router_load_balancing_type: none - moe_router_bias_update_rate: 0.0 - moe_permute_fusion: false - apply_rope_fusion: true - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - optimizer: - optimizer: adam - lr: 2.0e-07 - min_lr: 2.0e-07 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: float32 - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1.0e-08 - sgd_momentum: 0.9 - use_distributed_optimizer: true - use_precision_aware_optimizer: true - clip_grad: ${policy.max_grad_norm} - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: constant - lr_decay_style: constant - lr_decay_iters: 1000 - lr_warmup_iters: 50 - lr_warmup_init: 2.0e-08 - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: false - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: optim_grads_params data: max_input_seq_length: ${policy.max_total_sequence_length} shuffle: true diff --git a/nemo_rl/models/megatron/__init__.py b/nemo_rl/models/megatron/__init__.py index 4fc25d0d3c..790146ecaa 100644 --- a/nemo_rl/models/megatron/__init__.py +++ b/nemo_rl/models/megatron/__init__.py @@ -11,3 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from nemo_rl.models.megatron.recipe_config import ( + load_recipe, +) + +__all__ = [ + "load_recipe", +] diff --git a/nemo_rl/models/megatron/data.py b/nemo_rl/models/megatron/data.py index f884e95e1b..87ffd9e83f 100644 --- a/nemo_rl/models/megatron/data.py +++ b/nemo_rl/models/megatron/data.py @@ -128,6 +128,7 @@ def get_microbatch_iterator( mbs: int, straggler_timer: StragglerDetector, seq_length_key: Optional[str] = None, + model_cfg: Optional[Any] = None, ) -> Tuple[Iterator[ProcessedMicrobatch], int, int, int, int]: """Create a processed microbatch iterator from a batch of data. @@ -140,6 +141,8 @@ def get_microbatch_iterator( cfg: Configuration dictionary mbs: Microbatch size seq_length_key: Key for sequence lengths in data dict (auto-detected if None) + model_cfg: Optional Megatron model config (ConfigContainer). When provided, + parallelism settings are read from here instead of the raw config dict. 
Returns: Tuple containing the iterator and metadata @@ -175,6 +178,7 @@ def get_microbatch_iterator( ) = _get_pack_sequence_parameters_for_megatron( cfg["megatron_cfg"], pack_seq_dim_size, + model_cfg=model_cfg, ) micro_batch_size = 1 else: @@ -528,12 +532,15 @@ def _pack_sequences_for_megatron( def _get_pack_sequence_parameters_for_megatron( megatron_cfg: dict, max_seq_len_in_batch: int, + model_cfg: Optional[Any] = None, ): """Get pack sequence parameters for Megatron model processing with optional context parallelism. Args: - megatron_cfg: Megatron configuration + megatron_cfg: Megatron configuration dict (from YAML) max_seq_len_in_batch: Maximum sequence length in batch + model_cfg: Optional Megatron model config (ConfigContainer). When provided, + parallelism settings are read from here instead of the raw config dict. Returns: Tuple of: @@ -541,10 +548,16 @@ def _get_pack_sequence_parameters_for_megatron( - pad_packed_seq_to_multiple_of: Pad packed sequences to a multiple of this value - pad_packed_seq_to: Pad packed sequences to this value (before CP) """ - tp_size = megatron_cfg["tensor_model_parallel_size"] - sp = megatron_cfg["sequence_parallel"] - pp_size = megatron_cfg["pipeline_model_parallel_size"] - cp_size = megatron_cfg["context_parallel_size"] + if model_cfg is not None: + tp_size = model_cfg.tensor_model_parallel_size + sp = model_cfg.sequence_parallel + pp_size = model_cfg.pipeline_model_parallel_size + cp_size = model_cfg.context_parallel_size + else: + tp_size = megatron_cfg["tensor_model_parallel_size"] + sp = megatron_cfg.get("sequence_parallel", False) + pp_size = megatron_cfg["pipeline_model_parallel_size"] + cp_size = megatron_cfg["context_parallel_size"] fp8_cfg = megatron_cfg.get("fp8_cfg", None) or {} use_fp8 = fp8_cfg.get("enabled", False) use_blockwise_fp8 = fp8_cfg.get("fp8_recipe", None) == "blockwise" diff --git a/nemo_rl/models/megatron/recipe_config.py b/nemo_rl/models/megatron/recipe_config.py new file mode 100644 index 0000000000..4bf3d900fd --- /dev/null +++ b/nemo_rl/models/megatron/recipe_config.py @@ -0,0 +1,81 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Recipe-based configuration for NeMo-RL Megatron integration. + +This module provides a clean integration with Megatron-Bridge recipes, +allowing NeMo-RL to use pre-configured training recipes as a base and +layer RL-specific settings on top. + +Recipes are specified via their fully qualified Python import path in the +YAML config under ``policy.megatron_recipe``. For example: + + policy: + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config + megatron_cfg: + ... + +The import path is resolved at runtime using ``load_recipe()``. +""" + +import importlib + +from megatron.bridge.training.config import ConfigContainer + + +def load_recipe(recipe_path: str) -> ConfigContainer: + """ + Dynamically import and call a Megatron-Bridge recipe function. 
+ + Args: + recipe_path: Fully qualified Python import path to the recipe function. + For example: ``megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config`` + + Returns: + A ConfigContainer produced by calling the recipe function. + + Raises: + ValueError: If the recipe path is invalid or the function cannot be found. + TypeError: If the resolved object is not callable. + """ + module_path, _, func_name = recipe_path.rpartition(".") + if not module_path or not func_name: + raise ValueError( + f"Invalid recipe path '{recipe_path}'. " + "Expected a fully qualified Python path like " + "'megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config'" + ) + + try: + module = importlib.import_module(module_path) + except ImportError as e: + raise ValueError( + f"Could not import module '{module_path}' from recipe path '{recipe_path}': {e}" + ) from e + + recipe_fn = getattr(module, func_name, None) + if recipe_fn is None: + raise ValueError( + f"Module '{module_path}' has no attribute '{func_name}'. " + f"Check that the recipe function name is correct in '{recipe_path}'." + ) + + if not callable(recipe_fn): + raise TypeError( + f"'{recipe_path}' resolved to a non-callable object of type {type(recipe_fn).__name__}. " + "Expected a recipe function." + ) + + return recipe_fn() diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index 24bfdb0605..b9eb13c6e3 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -31,6 +31,7 @@ CheckpointConfig, ConfigContainer, DistributedDataParallelConfig, + DistributedInitConfig, LoggerConfig, OptimizerConfig, SchedulerConfig, @@ -68,6 +69,7 @@ from nemo_rl.distributed.named_sharding import NamedSharding from nemo_rl.models.megatron.community_import import import_model_from_hf_name from nemo_rl.models.megatron.config import ModelAndOptimizerState, RuntimeConfig +from nemo_rl.models.megatron.recipe_config import load_recipe from nemo_rl.models.policy import PolicyConfig from nemo_rl.models.policy.utils import ( configure_dynamo_cache, @@ -213,9 +215,7 @@ def validate_and_set_config( } dtype = dtype_map[config["precision"]] - # Optimizer configuration - optimizer_cpu_offload = config["megatron_cfg"]["optimizer"]["optimizer_cpu_offload"] - offload_optimizer_for_logprob = config["offload_optimizer_for_logprob"] + # Reward models are not yet supported with Megatron. 
if "reward_model_cfg" in config and config["reward_model_cfg"]["enabled"]: @@ -225,13 +225,21 @@ def validate_and_set_config( ) megatron_cfg, model_cfg = setup_model_config( - config, rank, dtype, hf_model_name, pretrained_path, weights_path + config=config, + rank=rank, + dtype=dtype, + hf_model_name=hf_model_name, + pretrained_path=pretrained_path, + weights_path=weights_path, ) + # Optimizer configuration + optimizer_cpu_offload = megatron_cfg.optimizer.optimizer_cpu_offload + offload_optimizer_for_logprob = config["offload_optimizer_for_logprob"] final_padded_vocab_size = calculate_padded_vocab_size( megatron_cfg.model.vocab_size, megatron_cfg.model.make_vocab_size_divisible_by, - config["megatron_cfg"]["tensor_model_parallel_size"], + megatron_cfg.model.tensor_model_parallel_size, ) return RuntimeConfig( @@ -262,7 +270,6 @@ def validate_model_paths(config: PolicyConfig) -> tuple[str, str, bool]: return hf_model_name, pretrained_path, pt_checkpoint_exists - def setup_model_config( config: PolicyConfig, rank, @@ -271,39 +278,49 @@ def setup_model_config( pretrained_path: str, weights_path: Optional[str] = None, ) -> tuple[ConfigContainer, Any]: - """Handle all the model configuration logic.""" - # Load pretrained run config - pretrained_run_config = os.path.join( - pretrained_path, "iter_0000000/run_config.yaml" - ) - - if not os.path.exists(pretrained_run_config): - raise FileNotFoundError( - f"Pretrained run config not found at {pretrained_run_config} on rank={rank}. " - "This usually means that the one-time HF->mcore conversion on rank=0 saved to a directory " - "not being mounted on this node. Please check" + """Setup model configuration.""" + model_cfg = None + megatron_recipe = config.get("megatron_recipe") or config.get( + "megatron_cfg", {} + ).get("megatron_recipe") + + if megatron_recipe: + # Use Megatron-Bridge recipe specified in config + print(f"[INFO] Using Megatron-Bridge recipe: {megatron_recipe}") + megatron_cfg = load_recipe(megatron_recipe) + model_cfg = megatron_cfg.model + else: + # Load pretrained run config + pretrained_run_config = os.path.join( + pretrained_path, "iter_0000000/run_config.yaml" ) - try: - cfg_from_pretrained = ConfigContainer.from_yaml( - pretrained_run_config, mode=InstantiationMode.STRICT - ) - except Exception as e: - # Add helpful context as a note to the exception - e.add_note( - f"\n{'=' * 80}\n" - f"NOTE: A common cause of this error is when the HF->mcore converted checkpoint is\n" - f"created with an older version of megatron-bridge.\n" - f"If this checkpoint is old or was generated by a different code version,\n" - f"try deleting it and rerunning the code.\n" - f"The checkpoint will be automatically regenerated with the current version.\n\n" - f"Checkpoint location: {pretrained_path}\n" - f"{'=' * 80}" - ) - raise + if not os.path.exists(pretrained_run_config): + raise FileNotFoundError( + f"Pretrained run config not found at {pretrained_run_config} on rank={rank}. " + "This usually means that the one-time HF->mcore conversion on rank=0 saved to a directory " + "not being mounted on this node. 
Please check" + ) - model_cfg = cfg_from_pretrained.model - cfg_from_pretrained.logger = LoggerConfig() + try: + megatron_cfg = ConfigContainer.from_yaml( + pretrained_run_config, mode=InstantiationMode.STRICT + ) + except Exception as e: + # Add helpful context as a note to the exception + e.add_note( + f"\n{'=' * 80}\n" + f"NOTE: A common cause of this error is when the HF->mcore converted checkpoint is\n" + f"created with an older version of megatron-bridge.\n" + f"If this checkpoint is old or was generated by a different code version,\n" + f"try deleting it and rerunning the code.\n" + f"The checkpoint will be automatically regenerated with the current version.\n\n" + f"Checkpoint location: {pretrained_path}\n" + f"{'=' * 80}" + ) + raise + + model_cfg = megatron_cfg.model # Apply parallelism settings _apply_parallelism_config(model_cfg, config) @@ -317,28 +334,32 @@ def setup_model_config( # Apply performance settings _apply_performance_config(model_cfg, config) - # Validate optimizer configuration - _validate_optimizer_config(config) # Optional layernorm epsilon if "layernorm_epsilon" in config["megatron_cfg"]: model_cfg.layernorm_epsilon = config["megatron_cfg"]["layernorm_epsilon"] + # Create checkpoint configs + checkpoint_config = _create_checkpoint_config(pretrained_path, weights_path) + + # Update megatron config with checkpoint, optimizer, scheduler, etc. + _update_megatron_config(megatron_cfg, checkpoint_config, config, hf_model_name) + + _validate_dtype_config(dtype, megatron_cfg.model, megatron_cfg.optimizer) + # Validate chunking configuration _validate_chunking_config(config) - # Create checkpoint configs - checkpoint_config = _create_checkpoint_config(pretrained_path, weights_path) + # Validate optimizer configuration + _validate_optimizer_config(megatron_cfg) # Validate training configuration - _validate_training_config(config, model_cfg) + _validate_training_config(megatron_cfg, model_cfg) - # Create final megatron config - megatron_cfg = _create_megatron_config( - model_cfg, checkpoint_config, config, hf_model_name, dtype - ) - - _validate_dtype_config(dtype, megatron_cfg.model, megatron_cfg.optimizer) + if "make_sequence_length_divisible_by" not in config: + config["make_sequence_length_divisible_by"] = ( + model_cfg.tensor_model_parallel_size + ) return megatron_cfg, model_cfg @@ -351,13 +372,13 @@ def _apply_parallelism_config(model_cfg: Any, config: PolicyConfig) -> None: model_cfg.pipeline_model_parallel_size = config["megatron_cfg"][ "pipeline_model_parallel_size" ] - model_cfg.num_layers_in_first_pipeline_stage = config["megatron_cfg"][ - "num_layers_in_first_pipeline_stage" - ] - model_cfg.num_layers_in_last_pipeline_stage = config["megatron_cfg"][ - "num_layers_in_last_pipeline_stage" - ] - model_cfg.sequence_parallel = config["megatron_cfg"]["sequence_parallel"] + model_cfg.num_layers_in_first_pipeline_stage = config["megatron_cfg"].get( + "num_layers_in_first_pipeline_stage", None + ) + model_cfg.num_layers_in_last_pipeline_stage = config["megatron_cfg"].get( + "num_layers_in_last_pipeline_stage", None + ) + model_cfg.sequence_parallel = config["megatron_cfg"].get("sequence_parallel", False) model_cfg.context_parallel_size = config["megatron_cfg"]["context_parallel_size"] if model_cfg.context_parallel_size > 1: @@ -368,41 +389,49 @@ def _apply_parallelism_config(model_cfg: Any, config: PolicyConfig) -> None: def _apply_moe_config(model_cfg: Any, config: PolicyConfig) -> None: """Apply Mixture of Experts configuration.""" - 
model_cfg.expert_tensor_parallel_size = config["megatron_cfg"][ - "expert_tensor_parallel_size" - ] - model_cfg.expert_model_parallel_size = config["megatron_cfg"][ - "expert_model_parallel_size" - ] + megatron_cfg = config["megatron_cfg"] + model_cfg.expert_tensor_parallel_size = megatron_cfg.get( + "expert_tensor_parallel_size", 1 + ) + model_cfg.expert_model_parallel_size = megatron_cfg.get( + "expert_model_parallel_size", 1 + ) # MoE stability settings # Setting moe_router_dtype to higher precision (e.g. fp64) can improve numerical stability, # especially when using many experts. - model_cfg.moe_router_dtype = config["megatron_cfg"]["moe_router_dtype"] + if "moe_router_dtype" in megatron_cfg: + model_cfg.moe_router_dtype = megatron_cfg["moe_router_dtype"] # The below two configs (and "freeze_moe_router") are used to stabilize moe training # by preventing updates to the moe router. We found that this is helpful in reducing # logprob error during training. # Set this to "none" to disable load balancing loss. - model_cfg.moe_router_load_balancing_type = config["megatron_cfg"][ - "moe_router_load_balancing_type" - ] + if "moe_router_load_balancing_type" in megatron_cfg: + model_cfg.moe_router_load_balancing_type = megatron_cfg[ + "moe_router_load_balancing_type" + ] # Set this to 0.0 to disable updates to the moe router expert bias - model_cfg.moe_router_bias_update_rate = config["megatron_cfg"][ - "moe_router_bias_update_rate" - ] + if "moe_router_bias_update_rate" in megatron_cfg: + model_cfg.moe_router_bias_update_rate = megatron_cfg[ + "moe_router_bias_update_rate" + ] - model_cfg.moe_enable_deepep = config["megatron_cfg"]["moe_enable_deepep"] - model_cfg.moe_token_dispatcher_type = config["megatron_cfg"][ - "moe_token_dispatcher_type" - ] - model_cfg.moe_shared_expert_overlap = config["megatron_cfg"][ - "moe_shared_expert_overlap" - ] + if "moe_enable_deepep" in megatron_cfg: + model_cfg.moe_enable_deepep = megatron_cfg["moe_enable_deepep"] + if "moe_token_dispatcher_type" in megatron_cfg: + model_cfg.moe_token_dispatcher_type = megatron_cfg[ + "moe_token_dispatcher_type" + ] + if "moe_shared_expert_overlap" in megatron_cfg: + model_cfg.moe_shared_expert_overlap = megatron_cfg[ + "moe_shared_expert_overlap" + ] - model_cfg.moe_permute_fusion = config["megatron_cfg"]["moe_permute_fusion"] + if "moe_permute_fusion" in megatron_cfg: + model_cfg.moe_permute_fusion = megatron_cfg["moe_permute_fusion"] def _apply_precision_config( @@ -433,8 +462,10 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: """Apply performance optimization configuration.""" model_cfg.parallel_output = True + megatron_cfg = config["megatron_cfg"] + # Activation checkpointing - if config["megatron_cfg"]["activation_checkpointing"]: + if megatron_cfg.get("activation_checkpointing", False): model_cfg.recompute_granularity = "full" model_cfg.recompute_method = "uniform" model_cfg.recompute_num_layers = 1 @@ -449,8 +480,10 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: ) # Fusion settings - model_cfg.apply_rope_fusion = config["megatron_cfg"]["apply_rope_fusion"] - model_cfg.bias_activation_fusion = config["megatron_cfg"]["bias_activation_fusion"] + if "apply_rope_fusion" in megatron_cfg: + model_cfg.apply_rope_fusion = megatron_cfg["apply_rope_fusion"] + if "bias_activation_fusion" in megatron_cfg: + model_cfg.bias_activation_fusion = megatron_cfg["bias_activation_fusion"] # FP8 configuration fp8_cfg = config["megatron_cfg"].get("fp8_cfg", None) @@ -469,12 
+502,10 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: ) -def _validate_optimizer_config(config: PolicyConfig) -> None: +def _validate_optimizer_config(megatron_cfg: ConfigContainer) -> None: """Validate optimizer configuration.""" - optimizer_cpu_offload = config["megatron_cfg"]["optimizer"]["optimizer_cpu_offload"] - optimizer_offload_fraction = config["megatron_cfg"]["optimizer"][ - "optimizer_offload_fraction" - ] + optimizer_cpu_offload = megatron_cfg.optimizer.optimizer_cpu_offload + optimizer_offload_fraction = megatron_cfg.optimizer.optimizer_offload_fraction if optimizer_cpu_offload: # Currently, hybrid optimizer (partly on GPU and partly on CPU) is not supported because it conflicts with the way @@ -512,9 +543,9 @@ def _create_checkpoint_config( ) -def _validate_training_config(config: PolicyConfig, model_cfg: Any) -> None: +def _validate_training_config(megatron_cfg: ConfigContainer, model_cfg: Any) -> None: """Validate training configuration.""" - assert "train_iters" in config["megatron_cfg"], ( + assert megatron_cfg.train.train_iters is not None, ( "train_iters must be set in megatron_cfg. For an example, see " "https://github.com/NVIDIA-NeMo/RL/blob/bccbc377705a81a1f4b3c31ad9767bcc15f735a8/nemo_rl/algorithms/sft.py#L175-L179." ) @@ -570,51 +601,94 @@ def _validate_dtype_config( ) -def _create_megatron_config( - model_cfg: Any, +def _update_dataclass_fields(target: Any, updates: dict) -> None: + """Update a dataclass with values from a dictionary. + + Only sets fields that are present in the updates dict. Fields not in + the dict retain their original values. + + Args: + target: A dataclass instance to update + updates: Dictionary of field names to new values + """ + for key, value in updates.items(): + if hasattr(target, key): + setattr(target, key, value) + + +def _update_megatron_config( + megatron_cfg: ConfigContainer, checkpoint_config: CheckpointConfig, config: PolicyConfig, hf_model_name: str, - dtype: torch.dtype, -) -> ConfigContainer: - """Create the final Megatron configuration container.""" - return ConfigContainer( - model=model_cfg, - checkpoint=checkpoint_config, - logger=LoggerConfig(logging_level=0), - train=TrainingConfig( - micro_batch_size=1, # ignored - global_batch_size=config["train_global_batch_size"], # ignored - train_iters=config["megatron_cfg"]["train_iters"], - ), - optimizer=OptimizerConfig(**config["megatron_cfg"]["optimizer"]), - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=config["megatron_cfg"][ - "distributed_data_parallel_config" - ]["grad_reduce_in_fp32"], - overlap_grad_reduce=config["megatron_cfg"][ - "distributed_data_parallel_config" - ]["overlap_grad_reduce"], - overlap_param_gather=config["megatron_cfg"][ - "distributed_data_parallel_config" - ]["overlap_param_gather"], - # we need to set average_in_collective=False with calculate_per_token_loss=T - # otherwise, mcore throws an assertion error. 
- average_in_collective=False, # Required with calculate_per_token_loss=True - use_distributed_optimizer=config["megatron_cfg"]["optimizer"][ - "use_distributed_optimizer" - ], - data_parallel_sharding_strategy=config["megatron_cfg"][ - "distributed_data_parallel_config" - ]["data_parallel_sharding_strategy"], - ), - scheduler=SchedulerConfig(**config["megatron_cfg"]["scheduler"]), - dataset=None, - tokenizer=TokenizerConfig( - tokenizer_type="HuggingFaceTokenizer", - tokenizer_model=hf_model_name, - ), +) -> None: + """Update the existing ConfigContainer with checkpoint, optimizer, scheduler, and other settings. + + This modifies megatron_cfg in-place. For sub-configs (optimizer, ddp, scheduler, etc.), + only fields explicitly provided in the NeMo-RL config are updated; other fields retain + their original values from the recipe or checkpoint. + """ + megatron_cfg_dict = config.get("megatron_cfg", {}) + + # Ensure dist config is initialized (required for validate()) + if megatron_cfg.dist is None: + megatron_cfg.dist = DistributedInitConfig() + + # Always replace checkpoint config (NeMo-RL manages checkpoints) + megatron_cfg.checkpoint = checkpoint_config + + # Always set logger + megatron_cfg.logger = LoggerConfig(logging_level=0) + + # Update training config - these are NeMo-RL specific + if megatron_cfg.train is None: + megatron_cfg.train = TrainingConfig() + megatron_cfg.train.micro_batch_size = 1 # ignored by NeMo-RL + megatron_cfg.train.global_batch_size = config.get("train_global_batch_size", 1) # ignored by NeMo-RL + if "train_iters" in megatron_cfg_dict: + megatron_cfg.train.train_iters = megatron_cfg_dict["train_iters"] + + # Update optimizer config - merge with existing + optimizer_overrides = megatron_cfg_dict.get("optimizer", {}) + if optimizer_overrides: + if megatron_cfg.optimizer is None: + megatron_cfg.optimizer = OptimizerConfig(**optimizer_overrides) + else: + _update_dataclass_fields(megatron_cfg.optimizer, optimizer_overrides) + + # Update DDP config - merge with existing + ddp_overrides = megatron_cfg_dict.get("distributed_data_parallel_config", {}) + if megatron_cfg.ddp is None: + megatron_cfg.ddp = DistributedDataParallelConfig() + + # Apply explicit DDP overrides from config + if ddp_overrides: + _update_dataclass_fields(megatron_cfg.ddp, ddp_overrides) + + # NeMo-RL required DDP settings (always set) + megatron_cfg.ddp.check_for_nan_in_grad = True + # Required with calculate_per_token_loss=True, otherwise mcore throws assertion error + megatron_cfg.ddp.average_in_collective = False + + # Sync use_distributed_optimizer between optimizer and ddp + if megatron_cfg.optimizer is not None: + megatron_cfg.ddp.use_distributed_optimizer = megatron_cfg.optimizer.use_distributed_optimizer + + # Update scheduler config - merge with existing + scheduler_overrides = megatron_cfg_dict.get("scheduler", {}) + if scheduler_overrides: + if megatron_cfg.scheduler is None: + megatron_cfg.scheduler = SchedulerConfig(**scheduler_overrides) + else: + _update_dataclass_fields(megatron_cfg.scheduler, scheduler_overrides) + + # NeMo-RL handles data separately + megatron_cfg.dataset = None + + # Update tokenizer config - always set for HuggingFace tokenizer + megatron_cfg.tokenizer = TokenizerConfig( + tokenizer_type="HuggingFaceTokenizer", + tokenizer_model=hf_model_name, ) @@ -679,7 +753,7 @@ def setup_model_and_optimizer( use_peft = policy_cfg["megatron_cfg"].get("peft", {}).get("enabled", False) mixed_precision_wrapper = Float16Module - if 
policy_cfg["megatron_cfg"]["freeze_moe_router"]: + if policy_cfg["megatron_cfg"].get("freeze_moe_router", False): if use_peft: raise ValueError( "Freezing the MOE router is not currently supported when using PEFT" @@ -946,10 +1020,8 @@ def finalize_megatron_setup( ) should_disable_forward_pre_hook = ( - config["megatron_cfg"]["optimizer"]["use_distributed_optimizer"] - and config["megatron_cfg"]["distributed_data_parallel_config"][ - "overlap_param_gather" - ] + megatron_cfg.optimizer.use_distributed_optimizer + and megatron_cfg.ddp.overlap_param_gather ) return megatron_tokenizer, megatron_bridge, should_disable_forward_pre_hook, dp_size diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index 363399cbca..d83a209f49 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -256,6 +256,11 @@ class PolicyConfig(TypedDict): reward_model_cfg: NotRequired[RewardModelConfig] dtensor_cfg: DTensorConfig | DTensorConfigDisabled megatron_cfg: NotRequired[MegatronConfig | MegatronConfigDisabled] + # Fully qualified Python import path to a Megatron-Bridge recipe function. + # When set, the recipe is loaded at runtime to provide the base model configuration. + # When null/unset, configuration is loaded from the checkpoint's run_config.yaml. + # Example: "megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config" + megatron_recipe: NotRequired[str | None] hf_config_overrides: NotRequired[dict[str, Any]] dynamic_batching: DynamicBatchingConfig | DynamicBatchingConfigDisabled sequence_packing: NotRequired[SequencePackingConfig | SequencePackingConfigDisabled] diff --git a/nemo_rl/models/policy/workers/megatron_policy_worker.py b/nemo_rl/models/policy/workers/megatron_policy_worker.py index 48ba0623e2..ef23ff556e 100644 --- a/nemo_rl/models/policy/workers/megatron_policy_worker.py +++ b/nemo_rl/models/policy/workers/megatron_policy_worker.py @@ -279,6 +279,15 @@ def __init__( self.optimizer, ) + # Dump ConfigContainer to YAML for inspection (only on rank 0) + if self.rank == 0: + config_dump_path = "/lustre/fsw/portfolios/coreai/users/sfawzy/final_megatron_config.yaml" + try: + self.megatron_cfg.to_yaml(config_dump_path) + print(f"[DEBUG] Saved final ConfigContainer to: {config_dump_path}") + except Exception as e: + print(f"[WARNING] Failed to save ConfigContainer to YAML: {e}") + # vars used for refit ## will be initialized in prepare_refit_info # refit_param_info_mcore combines the conversion tasks with the param memory @@ -372,6 +381,7 @@ def train( self.cfg, mbs, straggler_timer=self.mcore_state.straggler_timer, + model_cfg=self.megatron_cfg.model, ) # Track total microbatches for MoE aux-loss averaging total_num_microbatches += int(num_microbatches) @@ -556,6 +566,7 @@ def get_logprobs( self.cfg, logprob_batch_size, straggler_timer=self.mcore_state.straggler_timer, + model_cfg=self.megatron_cfg.model, ) def forward_step_fn( @@ -763,6 +774,7 @@ def get_topk_logits( self.cfg, logprob_batch_size, straggler_timer=self.mcore_state.straggler_timer, + model_cfg=self.megatron_cfg.model, ) def forward_step_fn(
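
Reference note on the new megatron_recipe mechanism: setup_model_config calls load_recipe(megatron_recipe), whose implementation is not part of this diff. The sketch below is only an assumed minimal illustration of how such a dotted import path (e.g. the qwen3 recipe paths added to the YAML configs above) could be resolved into a recipe function; resolve_recipe is a hypothetical name, not the actual NeMo-RL helper.

import importlib
from typing import Any, Callable


def resolve_recipe(path: str) -> Callable[..., Any]:
    # Split "pkg.module.attr" into module path and attribute name,
    # import the module, and return the attribute (the recipe function).
    module_name, _, attr_name = path.rpartition(".")
    module = importlib.import_module(module_name)
    return getattr(module, attr_name)


# Usage (assumes megatron-bridge is installed and exposes this recipe):
# recipe_fn = resolve_recipe("megatron.bridge.recipes.qwen.qwen3.qwen3_1p7b_pretrain_config")
# megatron_cfg = recipe_fn()  # per the diff, the result exposes .model with the base model config

This keeps model-architecture defaults in the Megatron-Bridge recipe, while _update_megatron_config layers only the explicitly provided NeMo-RL overrides (optimizer, ddp, scheduler, checkpoint, tokenizer) on top of it.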