diff --git a/examples/configs/distillation_math.yaml b/examples/configs/distillation_math.yaml index 67ff8a71d2..891976166d 100644 --- a/examples/configs/distillation_math.yaml +++ b/examples/configs/distillation_math.yaml @@ -84,77 +84,6 @@ policy: &POLICY_BASE foreach: False fused: False - megatron_cfg: &MEGATRON_BASE - enabled: false - empty_unused_memory_level: 0 - activation_checkpointing: false - converter_type: "Qwen3ForCausalLM" - tensor_model_parallel_size: 2 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 2 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 2 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - optimizer: - optimizer: "adam" - lr: 2.00001e-5 - min_lr: 2.0e-5 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - clip_grad: ${policy.max_grad_norm} - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 10 - lr_warmup_init: 2.0e-6 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" - scheduler: - name: "torch.optim.lr_scheduler.LinearLR" kwargs: diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml index ae2fbcd3e1..2865707fbb 100644 --- a/examples/configs/distillation_math_megatron.yaml +++ b/examples/configs/distillation_math_megatron.yaml @@ -35,6 +35,8 @@ policy: &POLICY_BASE make_sequence_length_divisible_by: ${mul:${mul:${.megatron_cfg.tensor_model_parallel_size}, ${.megatron_cfg.context_parallel_size}}, 2} + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_1p7b_pretrain_config + megatron_cfg: &MEGATRON_BASE enabled: true empty_unused_memory_level: 0 @@ -140,6 +142,7 @@ policy: &POLICY_BASE teacher: <<: *POLICY_BASE model_name: "Qwen/Qwen3-4B" + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_4b_pretrain_config megatron_cfg: <<: *MEGATRON_BASE context_parallel_size: 2 diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index f2b57b0bbd..ef21c555d2 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -106,78 +106,7 @@ policy: factor: 1.0 total_iters: 10000000000 - milestones: [20] - - ## ignored since enabled=false, but needed for testing purposes - megatron_cfg: - enabled: false - empty_unused_memory_level: 1 - activation_checkpointing: false - 
tensor_model_parallel_size: 2 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: true - freeze_moe_router: false - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - optimizer: - optimizer: "adam" - lr: 5.0e-6 #4.0e-5 - min_lr: 5.0e-6 #4.0e-5 - weight_decay: 0.1 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-8 - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_warmup_iters: 1 - lr_warmup_init: 0.00000001 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - data_parallel_sharding_strategy: "optim_grads_params" - use_custom_fsdp: false - data: max_input_seq_length: ${policy.max_total_sequence_length} shuffle: true diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 90269726d7..35dbe01e79 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -105,81 +105,6 @@ policy: lora_A_init: "xavier" # Initialization method for LoRA A matrix: "xavier" or "uniform" use_triton: true # Use Triton-optimized kernels for LoRA (faster but requires flash-attn). Disable when tensor_parallel_size > 1 - megatron_cfg: - enabled: false - empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. 
- activation_checkpointing: false - converter_type: "Qwen2ForCausalLM" - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - optimizer: - optimizer: "adam" - lr: 5.0e-6 - min_lr: 5.0e-7 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 13 - lr_warmup_init: 5.0e-7 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" - - fp8_cfg: null - - env_vars: null # See docs/design-docs/sequence-packing-and-dynamic-batching.md # for more details on dynamic batching and sequence packing. diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index a9368481ae..671e0cbbb1 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -70,75 +70,9 @@ policy: sequence_length_round: 64 max_grad_norm: 1.0 - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training - make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} - optimizer: null # remove default FSDP optimizer - - megatron_cfg: - enabled: true - empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. 
- activation_checkpointing: false - converter_type: "Qwen2ForCausalLM" - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - - optimizer: - optimizer: "adam" - lr: 5.0e-6 - min_lr: 5.0e-7 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 13 - lr_warmup_init: 5.0e-7 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" + optimizer: null # remove default FSDP optimizer generation: backend: "vllm" diff --git a/examples/configs/grpo_math_70B_megatron.yaml b/examples/configs/grpo_math_70B_megatron.yaml index 4d17fdcea3..c89e4e57b8 100644 --- a/examples/configs/grpo_math_70B_megatron.yaml +++ b/examples/configs/grpo_math_70B_megatron.yaml @@ -22,6 +22,8 @@ policy: scheduler: null # remove default FSDP scheduler + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config + megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/grpo_math_8B_megatron.yaml b/examples/configs/grpo_math_8B_megatron.yaml index 977ab394b5..e52b3d2d3e 100644 --- a/examples/configs/grpo_math_8B_megatron.yaml +++ b/examples/configs/grpo_math_8B_megatron.yaml @@ -28,6 +28,8 @@ policy: scheduler: null # remove default FSDP scheduler + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config + megatron_cfg: enabled: true empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. 
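The `megatron_recipe` values being added throughout these configs are fully qualified Python import paths to Megatron-Bridge recipe functions. A minimal sketch of how such a path resolves (assuming megatron-bridge is installed in the environment; this mirrors the `load_recipe()` helper introduced later in this diff):

```python
import importlib

# Recipe path copied verbatim from grpo_math_8B_megatron.yaml above.
recipe_path = "megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config"

# Split "pkg.module.func" into the module to import and the function to fetch.
module_path, _, func_name = recipe_path.rpartition(".")
recipe_fn = getattr(importlib.import_module(module_path), func_name)

# Calling the recipe function returns a pre-populated Megatron-Bridge
# ConfigContainer (model, optimizer, scheduler, ddp, ...) for this model.
recipe_cfg = recipe_fn()
print(type(recipe_cfg).__name__)
```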
diff --git a/examples/configs/grpo_math_8B_megatron_fp8.yaml b/examples/configs/grpo_math_8B_megatron_fp8.yaml index ba6ee6e5c8..9548979c1c 100644 --- a/examples/configs/grpo_math_8B_megatron_fp8.yaml +++ b/examples/configs/grpo_math_8B_megatron_fp8.yaml @@ -19,4 +19,4 @@ policy: optimizer: use_precision_aware_optimizer: false env_vars: - NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" \ No newline at end of file + NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" diff --git a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml index 37616e32b0..2d4f0f3151 100644 --- a/examples/configs/grpo_math_qwen30ba3b_megatron.yaml +++ b/examples/configs/grpo_math_qwen30ba3b_megatron.yaml @@ -26,6 +26,8 @@ policy: scheduler: null # remove default FSDP scheduler + megatron_recipe: megatron.bridge.recipes.qwen.qwen3_30b_a3b_finetune_config + megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml b/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml index 9035a3598c..8f615b4361 100644 --- a/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml +++ b/examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml @@ -34,6 +34,7 @@ policy: dtensor_cfg: _v2: false context_parallel_size: 4 + megatron_recipe: megatron.bridge.recipes.qwen.qwen25_7b_finetune_config megatron_cfg: tensor_model_parallel_size: 4 pipeline_model_parallel_size: 2 diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml index 6fda3fe24e..d8cce7d5d0 100644 --- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml +++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml @@ -22,6 +22,7 @@ policy: ${.megatron_cfg.context_parallel_size}}, 2} megatron_cfg: enabled: true + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config teacher: model_name: Qwen/Qwen3-32B dtensor_cfg: @@ -30,6 +31,7 @@ teacher: enabled: false sequence_packing: enabled: true + megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml index 8df4bc3fb0..44843ac0c1 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml @@ -18,6 +18,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 8b3a43ea28..8e8b2a8a3d 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -20,6 +20,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: 
megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true pipeline_model_parallel_size: 2 diff --git a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml index 8d19757d54..0523d30ac8 100644 --- a/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-dapomath17k-dsv3-megatron.yaml @@ -17,6 +17,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.deepseek.deepseek_v3.deepseek_v3_pretrain_config megatron_cfg: enabled: true activation_checkpointing: true diff --git a/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml index b3dec78e98..4f2a8ee3ec 100755 --- a/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-gptoss-20b-8n8g-megatron.yaml @@ -8,6 +8,7 @@ policy: model_name: openai/gpt-oss-20b train_micro_batch_size: 1 max_total_sequence_length: 4096 + megatron_recipe: megatron.bridge.recipes.openai.gpt_oss.gpt_oss_20b_pretrain_config megatron_cfg: enabled: true expert_model_parallel_size: 8 diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml index dcd791eee6..8d21260fc6 100644 --- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-rollouts.v3.yaml @@ -16,6 +16,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true converter_type: LlamaForCausalLM diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml index 6411c6fb49..4930f552c2 100644 --- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml @@ -17,6 +17,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true converter_type: LlamaForCausalLM diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml index 333a06d980..3133e9d3eb 100755 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml @@ -10,6 +10,7 @@ policy: tokenizer: name: meta-llama/Llama-3.2-1B-Instruct optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config megatron_cfg: enabled: true scheduler: diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml index bb641388d8..f89d752e81 100644 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml +++ 
b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.yaml @@ -10,6 +10,7 @@ policy: tokenizer: name: meta-llama/Llama-3.2-1B-Instruct optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama32_1b_pretrain_config megatron_cfg: enabled: true scheduler: diff --git a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml index 92fb87c196..5c8d8594fd 100644 --- a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml +++ b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml @@ -20,6 +20,7 @@ policy: make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null scheduler: null + megatron_recipe: megatron.bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_finetune_config megatron_cfg: enabled: true converter_type: LlamaForCausalLM diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml index 27108c55c7..951bb0371f 100644 --- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml @@ -18,6 +18,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.moonlight.moonlight_16b_pretrain_config megatron_cfg: enabled: true moe_router_dtype: fp32 diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml index 83ea6128ef..8674bdf00a 100644 --- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml @@ -20,6 +20,7 @@ policy: algorithm: modified_ffd make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.moonlight.moonlight_16b_pretrain_config megatron_cfg: enabled: true expert_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml index 86690abcc2..cd7a7c8b96 100644 --- a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml @@ -8,6 +8,7 @@ policy: tokenizer: name: nvidia/NVIDIA-Nemotron-Nano-12B-v2 optimizer: null + megatron_recipe: megatron.bridge.recipes.nemotronh.nemotron_nano_12b_v2_pretrain_config megatron_cfg: enabled: true bias_activation_fusion: false diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml index fd0a48a663..e37c892929 100755 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml @@ -14,6 +14,7 @@ policy: max_total_sequence_length: 4096 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.qwen.qwen2.qwen25_7b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 2 diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml index 
6e0aa5cd81..c7f3eca79f 100755 --- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml @@ -17,6 +17,7 @@ policy: enabled: false algorithm: modified_ffd make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron.bridge.recipes.qwen.qwen3_30b_a3b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml index 69ff4a4229..777100853f 100644 --- a/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-8b-base-1n8g-fp8-kvcache-megatron.yaml @@ -15,6 +15,7 @@ policy: enabled: false optimizer: null scheduler: null + megatron_recipe: megatron.bridge.recipes.qwen.qwen3_8b_pretrain_config megatron_cfg: enabled: true converter_type: Qwen3ForCausalLM diff --git a/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n4g.yaml.swp b/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n4g.yaml.swp deleted file mode 100644 index 287b7b0973..0000000000 Binary files a/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n4g.yaml.swp and /dev/null differ diff --git a/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n8g.yaml.swp b/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n8g.yaml.swp deleted file mode 100644 index 98e5b39f68..0000000000 Binary files a/examples/configs/recipes/llm/performance/.grpo-deepseek-v3-32n8g.yaml.swp and /dev/null differ diff --git a/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml b/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml index 9c4edd2b30..2bfaf20955 100644 --- a/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml +++ b/examples/configs/recipes/llm/performance/dapo-deepseek-v3-64n8g.yaml @@ -40,6 +40,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${mul:${policy.dtensor_cfg.tensor_parallel_size}, ${mul:2, ${policy.dtensor_cfg.context_parallel_size}}} + megatron_recipe: megatron_bridge.recipes.deepseek.deepseek_v3_pretrain_config megatron_cfg: empty_unused_memory_level: 2 enabled: true diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml index 04fc067d6e..890124d3e0 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml @@ -4,14 +4,14 @@ checkpointing: policy: sequence_packing: enabled: false - megatron_cfg: - pipeline_model_parallel_size: 8 - expert_model_parallel_size: 16 - num_layers_in_first_pipeline_stage: 7 - num_layers_in_last_pipeline_stage: 6 generation: vllm_cfg: tensor_parallel_size: 32 + megatron_cfg: + pipeline_model_parallel_size: 8 + expert_model_parallel_size: 16 + num_layers_in_first_pipeline_stage: 7 + num_layers_in_last_pipeline_stage: 6 logger: log_dir: logs/grpo-deepseek-v3-32n4g wandb: diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml index 75457ab802..7965f72764 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n8g.yaml @@ -19,6 +19,7 @@ 
policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron_bridge.recipes.deepseek.deepseek_v3_pretrain_config_32nodes megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml index a99f7c1498..e3c9e25c85 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml @@ -17,6 +17,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron_bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml index afdbf8c414..fb0f103855 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g.yaml @@ -17,6 +17,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron_bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml index 1376c8d340..e2e02de396 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n8g.yaml @@ -19,6 +19,7 @@ policy: make_sequence_length_divisible_by: 1 dtensor_cfg: enabled: false + megatron_recipe: megatron_bridge.recipes.qwen.qwen3_moe.qwen3_235b_a22b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml index 21b9746f4b..c4749c0faf 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml @@ -14,6 +14,7 @@ policy: optimizer: null scheduler: null make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron_bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml index 2270d5e272..d2a4eb24b5 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml @@ -14,6 +14,7 @@ policy: optimizer: null scheduler: null make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron_bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml index 795764d3ee..6a029c6fde 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml +++ 
b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g.yaml @@ -7,6 +7,7 @@ checkpointing: checkpoint_dir: results/grpo-qwen3-30ba3b-4n8g policy: model_name: Qwen/Qwen3-30B-A3B + megatron_recipe: megatron_bridge.recipes.qwen.qwen3_moe.qwen3_30b_a3b_pretrain_config train_micro_batch_size: 1 max_total_sequence_length: 4096 dtensor_cfg: diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml index 2e441cdb5f..d17dad323a 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml @@ -14,6 +14,7 @@ policy: optimizer: null scheduler: null make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron_bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml index ad780ebc50..7b33ced71a 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml @@ -14,6 +14,7 @@ policy: optimizer: null scheduler: null make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_recipe: megatron_bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config megatron_cfg: enabled: true empty_unused_memory_level: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml index bb43955812..c638f8a85d 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml @@ -17,6 +17,7 @@ policy: max_total_sequence_length: 4096 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml index b2b76c0afd..96ccf66d44 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-lora.yaml @@ -14,6 +14,7 @@ policy: chat_template: default dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true peft: diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml index aa62330e3e..43e358acea 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml @@ -19,6 +19,7 @@ policy: enabled: true make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 2 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml index 7e9452dff7..d3ba4e5a28 100644 --- 
a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml @@ -17,6 +17,7 @@ policy: enabled: false make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config megatron_cfg: enabled: true tensor_model_parallel_size: 2 diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml index d3bdd77bb2..0b3388f915 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml @@ -9,6 +9,7 @@ policy: max_total_sequence_length: 16384 dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.qwen.qwen25_7b_finetune_config megatron_cfg: enabled: true tensor_model_parallel_size: 4 diff --git a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml index d81a58980e..45188bc54e 100644 --- a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml +++ b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.yaml @@ -23,4 +23,4 @@ logger: wandb: name: vlm-grpo-3b-megatron cluster: - gpus_per_node: 8 \ No newline at end of file + gpus_per_node: 8 diff --git a/examples/configs/rm.yaml b/examples/configs/rm.yaml index 4b0936fec5..49e56d11e8 100644 --- a/examples/configs/rm.yaml +++ b/examples/configs/rm.yaml @@ -73,62 +73,6 @@ policy: foreach: false fused: false - ## ignored since enabled=false, but needed for testing purposes - megatron_cfg: - enabled: false - empty_unused_memory_level: 1 - activation_checkpointing: false - tensor_model_parallel_size: 2 - pipeline_model_parallel_size: 2 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: false - - optimizer: - optimizer: "adam" - lr: 2.0e-6 - min_lr: 1.9999e-6 - weight_decay: 0.1 - bf16: false - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-5 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 50 - lr_warmup_init: 1.9999e-6 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: false - data_parallel_sharding_strategy: "optim_grads_params" - - data: max_input_seq_length: ${policy.max_total_sequence_length} shuffle: true diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 6d53d7f606..71d8c1cc84 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -88,93 +88,6 @@ policy: foreach: False fused: False - ## ignored since enabled=false, but needed for testing purposes - megatron_cfg: - enabled: false - env_vars: {} - empty_unused_memory_level: 1 - 
activation_checkpointing: false - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: false - freeze_moe_router: false - moe_router_dtype: null - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - peft: - enabled: false - target_modules: [] - exclude_modules: [] - dim: 8 - alpha: 32 - dropout: 0.0 - dropout_position: "post" - lora_A_init_method: "xavier" - lora_B_init_method: "zero" - a2a_experimental: false - lora_dtype: None - - - optimizer: - optimizer: "adam" # When weight decay is set, it actually uses AdamW - lr: 5.0e-6 - min_lr: 4.9999e-6 - weight_decay: 0.1 # When weight decay is set, it actually uses AdamW - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-5 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 50 - lr_warmup_init: 4.9999e-6 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - data_parallel_sharding_strategy: "optim_grads_params" - use_custom_fsdp: false - data: max_input_seq_length: ${policy.max_total_sequence_length} add_bos: true diff --git a/examples/configs/sft_openmathinstruct2.yaml b/examples/configs/sft_openmathinstruct2.yaml index 63fa6d65e4..00b7bbf8e7 100644 --- a/examples/configs/sft_openmathinstruct2.yaml +++ b/examples/configs/sft_openmathinstruct2.yaml @@ -39,9 +39,6 @@ policy: context_parallel_size: 1 custom_parallel_plan: null - megatron_cfg: - enabled: false - dynamic_batching: enabled: false diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml index faca12e0ae..2d137012ef 100644 --- a/examples/configs/sft_openmathinstruct2_megatron.yaml +++ b/examples/configs/sft_openmathinstruct2_megatron.yaml @@ -32,6 +32,8 @@ policy: dtensor_cfg: enabled: false + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config + megatron_cfg: activation_checkpointing: false context_parallel_size: 1 diff --git a/examples/configs/vlm_grpo_3B.yaml b/examples/configs/vlm_grpo_3B.yaml index f9612007a4..c28b11add8 100644 --- a/examples/configs/vlm_grpo_3B.yaml +++ b/examples/configs/vlm_grpo_3B.yaml @@ -80,79 +80,6 @@ policy: context_parallel_size: 1 custom_parallel_plan: null - megatron_cfg: - enabled: false - empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning 
generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. - activation_checkpointing: false - converter_type: "Qwen2ForCausalLM" - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - - optimizer: - optimizer: "adam" - lr: 5.0e-6 - min_lr: 5.0e-7 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 13 - lr_warmup_init: 5.0e-7 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" - - # dynamic_batching improves performance by ensuring logprob and training microbatches # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length # responses are sorted by sequence length and bucketed into microbatches with a total diff --git a/examples/configs/vlm_grpo_3B_megatron.yaml b/examples/configs/vlm_grpo_3B_megatron.yaml index b32cd7df04..9b0275ca47 100644 --- a/examples/configs/vlm_grpo_3B_megatron.yaml +++ b/examples/configs/vlm_grpo_3B_megatron.yaml @@ -77,7 +77,6 @@ policy: train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} sequence_length_round: 64 - make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} max_grad_norm: 1.0 sequence_packing: enabled: false @@ -123,65 +122,6 @@ policy: resources: gpus_per_node: null num_nodes: null - megatron_cfg: - enabled: true - empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. 
- activation_checkpointing: false - converter_type: Qwen2ForCausalLM - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - sequence_parallel: false - freeze_moe_router: true - moe_router_dtype: fp64 - moe_router_load_balancing_type: none - moe_router_bias_update_rate: 0.0 - moe_permute_fusion: false - apply_rope_fusion: true - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: False - moe_per_layer_logging: False - moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" - moe_shared_expert_overlap: false - optimizer: - optimizer: adam - lr: 2.0e-07 - min_lr: 2.0e-07 - weight_decay: 0.01 - bf16: true - fp16: false - params_dtype: float32 - adam_beta1: 0.9 - adam_beta2: 0.999 - adam_eps: 1.0e-08 - sgd_momentum: 0.9 - use_distributed_optimizer: true - use_precision_aware_optimizer: true - clip_grad: ${policy.max_grad_norm} - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: constant - lr_decay_style: constant - lr_decay_iters: 1000 - lr_warmup_iters: 50 - lr_warmup_init: 2.0e-08 - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: false - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: optim_grads_params data: max_input_seq_length: ${policy.max_total_sequence_length} shuffle: true diff --git a/nemo_rl/models/megatron/__init__.py b/nemo_rl/models/megatron/__init__.py index 4fc25d0d3c..790146ecaa 100644 --- a/nemo_rl/models/megatron/__init__.py +++ b/nemo_rl/models/megatron/__init__.py @@ -11,3 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from nemo_rl.models.megatron.recipe_config import ( + load_recipe, +) + +__all__ = [ + "load_recipe", +] diff --git a/nemo_rl/models/megatron/data.py b/nemo_rl/models/megatron/data.py index f884e95e1b..87ffd9e83f 100644 --- a/nemo_rl/models/megatron/data.py +++ b/nemo_rl/models/megatron/data.py @@ -128,6 +128,7 @@ def get_microbatch_iterator( mbs: int, straggler_timer: StragglerDetector, seq_length_key: Optional[str] = None, + model_cfg: Optional[Any] = None, ) -> Tuple[Iterator[ProcessedMicrobatch], int, int, int, int]: """Create a processed microbatch iterator from a batch of data. @@ -140,6 +141,8 @@ def get_microbatch_iterator( cfg: Configuration dictionary mbs: Microbatch size seq_length_key: Key for sequence lengths in data dict (auto-detected if None) + model_cfg: Optional Megatron model config (ConfigContainer). When provided, + parallelism settings are read from here instead of the raw config dict. 
Returns: Tuple containing the iterator and metadata @@ -175,6 +178,7 @@ def get_microbatch_iterator( ) = _get_pack_sequence_parameters_for_megatron( cfg["megatron_cfg"], pack_seq_dim_size, + model_cfg=model_cfg, ) micro_batch_size = 1 else: @@ -528,12 +532,15 @@ def _pack_sequences_for_megatron( def _get_pack_sequence_parameters_for_megatron( megatron_cfg: dict, max_seq_len_in_batch: int, + model_cfg: Optional[Any] = None, ): """Get pack sequence parameters for Megatron model processing with optional context parallelism. Args: - megatron_cfg: Megatron configuration + megatron_cfg: Megatron configuration dict (from YAML) max_seq_len_in_batch: Maximum sequence length in batch + model_cfg: Optional Megatron model config (ConfigContainer). When provided, + parallelism settings are read from here instead of the raw config dict. Returns: Tuple of: @@ -541,10 +548,16 @@ def _get_pack_sequence_parameters_for_megatron( - pad_packed_seq_to_multiple_of: Pad packed sequences to a multiple of this value - pad_packed_seq_to: Pad packed sequences to this value (before CP) """ - tp_size = megatron_cfg["tensor_model_parallel_size"] - sp = megatron_cfg["sequence_parallel"] - pp_size = megatron_cfg["pipeline_model_parallel_size"] - cp_size = megatron_cfg["context_parallel_size"] + if model_cfg is not None: + tp_size = model_cfg.tensor_model_parallel_size + sp = model_cfg.sequence_parallel + pp_size = model_cfg.pipeline_model_parallel_size + cp_size = model_cfg.context_parallel_size + else: + tp_size = megatron_cfg["tensor_model_parallel_size"] + sp = megatron_cfg.get("sequence_parallel", False) + pp_size = megatron_cfg["pipeline_model_parallel_size"] + cp_size = megatron_cfg["context_parallel_size"] fp8_cfg = megatron_cfg.get("fp8_cfg", None) or {} use_fp8 = fp8_cfg.get("enabled", False) use_blockwise_fp8 = fp8_cfg.get("fp8_recipe", None) == "blockwise" diff --git a/nemo_rl/models/megatron/recipe_config.py b/nemo_rl/models/megatron/recipe_config.py new file mode 100644 index 0000000000..4bf3d900fd --- /dev/null +++ b/nemo_rl/models/megatron/recipe_config.py @@ -0,0 +1,81 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Recipe-based configuration for NeMo-RL Megatron integration. + +This module provides a clean integration with Megatron-Bridge recipes, +allowing NeMo-RL to use pre-configured training recipes as a base and +layer RL-specific settings on top. + +Recipes are specified via their fully qualified Python import path in the +YAML config under ``policy.megatron_recipe``. For example: + + policy: + megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config + megatron_cfg: + ... + +The import path is resolved at runtime using ``load_recipe()``. +""" + +import importlib + +from megatron.bridge.training.config import ConfigContainer + + +def load_recipe(recipe_path: str) -> ConfigContainer: + """ + Dynamically import and call a Megatron-Bridge recipe function. 
+ + Args: + recipe_path: Fully qualified Python import path to the recipe function. + For example: ``megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config`` + + Returns: + A ConfigContainer produced by calling the recipe function. + + Raises: + ValueError: If the recipe path is invalid or the function cannot be found. + TypeError: If the resolved object is not callable. + """ + module_path, _, func_name = recipe_path.rpartition(".") + if not module_path or not func_name: + raise ValueError( + f"Invalid recipe path '{recipe_path}'. " + "Expected a fully qualified Python path like " + "'megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config'" + ) + + try: + module = importlib.import_module(module_path) + except ImportError as e: + raise ValueError( + f"Could not import module '{module_path}' from recipe path '{recipe_path}': {e}" + ) from e + + recipe_fn = getattr(module, func_name, None) + if recipe_fn is None: + raise ValueError( + f"Module '{module_path}' has no attribute '{func_name}'. " + f"Check that the recipe function name is correct in '{recipe_path}'." + ) + + if not callable(recipe_fn): + raise TypeError( + f"'{recipe_path}' resolved to a non-callable object of type {type(recipe_fn).__name__}. " + "Expected a recipe function." + ) + + return recipe_fn() diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index 24bfdb0605..b9eb13c6e3 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -31,6 +31,7 @@ CheckpointConfig, ConfigContainer, DistributedDataParallelConfig, + DistributedInitConfig, LoggerConfig, OptimizerConfig, SchedulerConfig, @@ -68,6 +69,7 @@ from nemo_rl.distributed.named_sharding import NamedSharding from nemo_rl.models.megatron.community_import import import_model_from_hf_name from nemo_rl.models.megatron.config import ModelAndOptimizerState, RuntimeConfig +from nemo_rl.models.megatron.recipe_config import load_recipe from nemo_rl.models.policy import PolicyConfig from nemo_rl.models.policy.utils import ( configure_dynamo_cache, @@ -213,9 +215,7 @@ def validate_and_set_config( } dtype = dtype_map[config["precision"]] - # Optimizer configuration - optimizer_cpu_offload = config["megatron_cfg"]["optimizer"]["optimizer_cpu_offload"] - offload_optimizer_for_logprob = config["offload_optimizer_for_logprob"] + # Reward models are not yet supported with Megatron. 
if "reward_model_cfg" in config and config["reward_model_cfg"]["enabled"]: @@ -225,13 +225,21 @@ def validate_and_set_config( ) megatron_cfg, model_cfg = setup_model_config( - config, rank, dtype, hf_model_name, pretrained_path, weights_path + config=config, + rank=rank, + dtype=dtype, + hf_model_name=hf_model_name, + pretrained_path=pretrained_path, + weights_path=weights_path, ) + # Optimizer configuration + optimizer_cpu_offload = megatron_cfg.optimizer.optimizer_cpu_offload + offload_optimizer_for_logprob = config["offload_optimizer_for_logprob"] final_padded_vocab_size = calculate_padded_vocab_size( megatron_cfg.model.vocab_size, megatron_cfg.model.make_vocab_size_divisible_by, - config["megatron_cfg"]["tensor_model_parallel_size"], + megatron_cfg.model.tensor_model_parallel_size, ) return RuntimeConfig( @@ -262,7 +270,6 @@ def validate_model_paths(config: PolicyConfig) -> tuple[str, str, bool]: return hf_model_name, pretrained_path, pt_checkpoint_exists - def setup_model_config( config: PolicyConfig, rank, @@ -271,39 +278,49 @@ def setup_model_config( pretrained_path: str, weights_path: Optional[str] = None, ) -> tuple[ConfigContainer, Any]: - """Handle all the model configuration logic.""" - # Load pretrained run config - pretrained_run_config = os.path.join( - pretrained_path, "iter_0000000/run_config.yaml" - ) - - if not os.path.exists(pretrained_run_config): - raise FileNotFoundError( - f"Pretrained run config not found at {pretrained_run_config} on rank={rank}. " - "This usually means that the one-time HF->mcore conversion on rank=0 saved to a directory " - "not being mounted on this node. Please check" + """Setup model configuration.""" + model_cfg = None + megatron_recipe = config.get("megatron_recipe") or config.get( + "megatron_cfg", {} + ).get("megatron_recipe") + + if megatron_recipe: + # Use Megatron-Bridge recipe specified in config + print(f"[INFO] Using Megatron-Bridge recipe: {megatron_recipe}") + megatron_cfg = load_recipe(megatron_recipe) + model_cfg = megatron_cfg.model + else: + # Load pretrained run config + pretrained_run_config = os.path.join( + pretrained_path, "iter_0000000/run_config.yaml" ) - try: - cfg_from_pretrained = ConfigContainer.from_yaml( - pretrained_run_config, mode=InstantiationMode.STRICT - ) - except Exception as e: - # Add helpful context as a note to the exception - e.add_note( - f"\n{'=' * 80}\n" - f"NOTE: A common cause of this error is when the HF->mcore converted checkpoint is\n" - f"created with an older version of megatron-bridge.\n" - f"If this checkpoint is old or was generated by a different code version,\n" - f"try deleting it and rerunning the code.\n" - f"The checkpoint will be automatically regenerated with the current version.\n\n" - f"Checkpoint location: {pretrained_path}\n" - f"{'=' * 80}" - ) - raise + if not os.path.exists(pretrained_run_config): + raise FileNotFoundError( + f"Pretrained run config not found at {pretrained_run_config} on rank={rank}. " + "This usually means that the one-time HF->mcore conversion on rank=0 saved to a directory " + "not being mounted on this node. 
Please check" + ) - model_cfg = cfg_from_pretrained.model - cfg_from_pretrained.logger = LoggerConfig() + try: + megatron_cfg = ConfigContainer.from_yaml( + pretrained_run_config, mode=InstantiationMode.STRICT + ) + except Exception as e: + # Add helpful context as a note to the exception + e.add_note( + f"\n{'=' * 80}\n" + f"NOTE: A common cause of this error is when the HF->mcore converted checkpoint is\n" + f"created with an older version of megatron-bridge.\n" + f"If this checkpoint is old or was generated by a different code version,\n" + f"try deleting it and rerunning the code.\n" + f"The checkpoint will be automatically regenerated with the current version.\n\n" + f"Checkpoint location: {pretrained_path}\n" + f"{'=' * 80}" + ) + raise + + model_cfg = megatron_cfg.model # Apply parallelism settings _apply_parallelism_config(model_cfg, config) @@ -317,28 +334,32 @@ def setup_model_config( # Apply performance settings _apply_performance_config(model_cfg, config) - # Validate optimizer configuration - _validate_optimizer_config(config) # Optional layernorm epsilon if "layernorm_epsilon" in config["megatron_cfg"]: model_cfg.layernorm_epsilon = config["megatron_cfg"]["layernorm_epsilon"] + # Create checkpoint configs + checkpoint_config = _create_checkpoint_config(pretrained_path, weights_path) + + # Update megatron config with checkpoint, optimizer, scheduler, etc. + _update_megatron_config(megatron_cfg, checkpoint_config, config, hf_model_name) + + _validate_dtype_config(dtype, megatron_cfg.model, megatron_cfg.optimizer) + # Validate chunking configuration _validate_chunking_config(config) - # Create checkpoint configs - checkpoint_config = _create_checkpoint_config(pretrained_path, weights_path) + # Validate optimizer configuration + _validate_optimizer_config(megatron_cfg) # Validate training configuration - _validate_training_config(config, model_cfg) + _validate_training_config(megatron_cfg, model_cfg) - # Create final megatron config - megatron_cfg = _create_megatron_config( - model_cfg, checkpoint_config, config, hf_model_name, dtype - ) - - _validate_dtype_config(dtype, megatron_cfg.model, megatron_cfg.optimizer) + if "make_sequence_length_divisible_by" not in config: + config["make_sequence_length_divisible_by"] = ( + model_cfg.tensor_model_parallel_size + ) return megatron_cfg, model_cfg @@ -351,13 +372,13 @@ def _apply_parallelism_config(model_cfg: Any, config: PolicyConfig) -> None: model_cfg.pipeline_model_parallel_size = config["megatron_cfg"][ "pipeline_model_parallel_size" ] - model_cfg.num_layers_in_first_pipeline_stage = config["megatron_cfg"][ - "num_layers_in_first_pipeline_stage" - ] - model_cfg.num_layers_in_last_pipeline_stage = config["megatron_cfg"][ - "num_layers_in_last_pipeline_stage" - ] - model_cfg.sequence_parallel = config["megatron_cfg"]["sequence_parallel"] + model_cfg.num_layers_in_first_pipeline_stage = config["megatron_cfg"].get( + "num_layers_in_first_pipeline_stage", None + ) + model_cfg.num_layers_in_last_pipeline_stage = config["megatron_cfg"].get( + "num_layers_in_last_pipeline_stage", None + ) + model_cfg.sequence_parallel = config["megatron_cfg"].get("sequence_parallel", False) model_cfg.context_parallel_size = config["megatron_cfg"]["context_parallel_size"] if model_cfg.context_parallel_size > 1: @@ -368,41 +389,49 @@ def _apply_parallelism_config(model_cfg: Any, config: PolicyConfig) -> None: def _apply_moe_config(model_cfg: Any, config: PolicyConfig) -> None: """Apply Mixture of Experts configuration.""" - 
model_cfg.expert_tensor_parallel_size = config["megatron_cfg"][ - "expert_tensor_parallel_size" - ] - model_cfg.expert_model_parallel_size = config["megatron_cfg"][ - "expert_model_parallel_size" - ] + megatron_cfg = config["megatron_cfg"] + model_cfg.expert_tensor_parallel_size = megatron_cfg.get( + "expert_tensor_parallel_size", 1 + ) + model_cfg.expert_model_parallel_size = megatron_cfg.get( + "expert_model_parallel_size", 1 + ) # MoE stability settings # Setting moe_router_dtype to higher precision (e.g. fp64) can improve numerical stability, # especially when using many experts. - model_cfg.moe_router_dtype = config["megatron_cfg"]["moe_router_dtype"] + if "moe_router_dtype" in megatron_cfg: + model_cfg.moe_router_dtype = megatron_cfg["moe_router_dtype"] # The below two configs (and "freeze_moe_router") are used to stabilize moe training # by preventing updates to the moe router. We found that this is helpful in reducing # logprob error during training. # Set this to "none" to disable load balancing loss. - model_cfg.moe_router_load_balancing_type = config["megatron_cfg"][ - "moe_router_load_balancing_type" - ] + if "moe_router_load_balancing_type" in megatron_cfg: + model_cfg.moe_router_load_balancing_type = megatron_cfg[ + "moe_router_load_balancing_type" + ] # Set this to 0.0 to disable updates to the moe router expert bias - model_cfg.moe_router_bias_update_rate = config["megatron_cfg"][ - "moe_router_bias_update_rate" - ] + if "moe_router_bias_update_rate" in megatron_cfg: + model_cfg.moe_router_bias_update_rate = megatron_cfg[ + "moe_router_bias_update_rate" + ] - model_cfg.moe_enable_deepep = config["megatron_cfg"]["moe_enable_deepep"] - model_cfg.moe_token_dispatcher_type = config["megatron_cfg"][ - "moe_token_dispatcher_type" - ] - model_cfg.moe_shared_expert_overlap = config["megatron_cfg"][ - "moe_shared_expert_overlap" - ] + if "moe_enable_deepep" in megatron_cfg: + model_cfg.moe_enable_deepep = megatron_cfg["moe_enable_deepep"] + if "moe_token_dispatcher_type" in megatron_cfg: + model_cfg.moe_token_dispatcher_type = megatron_cfg[ + "moe_token_dispatcher_type" + ] + if "moe_shared_expert_overlap" in megatron_cfg: + model_cfg.moe_shared_expert_overlap = megatron_cfg[ + "moe_shared_expert_overlap" + ] - model_cfg.moe_permute_fusion = config["megatron_cfg"]["moe_permute_fusion"] + if "moe_permute_fusion" in megatron_cfg: + model_cfg.moe_permute_fusion = megatron_cfg["moe_permute_fusion"] def _apply_precision_config( @@ -433,8 +462,10 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: """Apply performance optimization configuration.""" model_cfg.parallel_output = True + megatron_cfg = config["megatron_cfg"] + # Activation checkpointing - if config["megatron_cfg"]["activation_checkpointing"]: + if megatron_cfg.get("activation_checkpointing", False): model_cfg.recompute_granularity = "full" model_cfg.recompute_method = "uniform" model_cfg.recompute_num_layers = 1 @@ -449,8 +480,10 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: ) # Fusion settings - model_cfg.apply_rope_fusion = config["megatron_cfg"]["apply_rope_fusion"] - model_cfg.bias_activation_fusion = config["megatron_cfg"]["bias_activation_fusion"] + if "apply_rope_fusion" in megatron_cfg: + model_cfg.apply_rope_fusion = megatron_cfg["apply_rope_fusion"] + if "bias_activation_fusion" in megatron_cfg: + model_cfg.bias_activation_fusion = megatron_cfg["bias_activation_fusion"] # FP8 configuration fp8_cfg = config["megatron_cfg"].get("fp8_cfg", None) @@ -469,12 
+502,10 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: ) -def _validate_optimizer_config(config: PolicyConfig) -> None: +def _validate_optimizer_config(megatron_cfg: ConfigContainer) -> None: """Validate optimizer configuration.""" - optimizer_cpu_offload = config["megatron_cfg"]["optimizer"]["optimizer_cpu_offload"] - optimizer_offload_fraction = config["megatron_cfg"]["optimizer"][ - "optimizer_offload_fraction" - ] + optimizer_cpu_offload = megatron_cfg.optimizer.optimizer_cpu_offload + optimizer_offload_fraction = megatron_cfg.optimizer.optimizer_offload_fraction if optimizer_cpu_offload: # Currently, hybrid optimizer (partly on GPU and partly on CPU) is not supported because it conflicts with the way @@ -512,9 +543,9 @@ def _create_checkpoint_config( ) -def _validate_training_config(config: PolicyConfig, model_cfg: Any) -> None: +def _validate_training_config(megatron_cfg: ConfigContainer, model_cfg: Any) -> None: """Validate training configuration.""" - assert "train_iters" in config["megatron_cfg"], ( + assert megatron_cfg.train.train_iters is not None, ( "train_iters must be set in megatron_cfg. For an example, see " "https://github.com/NVIDIA-NeMo/RL/blob/bccbc377705a81a1f4b3c31ad9767bcc15f735a8/nemo_rl/algorithms/sft.py#L175-L179." ) @@ -570,51 +601,94 @@ def _validate_dtype_config( ) -def _create_megatron_config( - model_cfg: Any, +def _update_dataclass_fields(target: Any, updates: dict) -> None: + """Update a dataclass with values from a dictionary. + + Only sets fields that are present in the updates dict. Fields not in + the dict retain their original values. + + Args: + target: A dataclass instance to update + updates: Dictionary of field names to new values + """ + for key, value in updates.items(): + if hasattr(target, key): + setattr(target, key, value) + + +def _update_megatron_config( + megatron_cfg: ConfigContainer, checkpoint_config: CheckpointConfig, config: PolicyConfig, hf_model_name: str, - dtype: torch.dtype, -) -> ConfigContainer: - """Create the final Megatron configuration container.""" - return ConfigContainer( - model=model_cfg, - checkpoint=checkpoint_config, - logger=LoggerConfig(logging_level=0), - train=TrainingConfig( - micro_batch_size=1, # ignored - global_batch_size=config["train_global_batch_size"], # ignored - train_iters=config["megatron_cfg"]["train_iters"], - ), - optimizer=OptimizerConfig(**config["megatron_cfg"]["optimizer"]), - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=config["megatron_cfg"][ - "distributed_data_parallel_config" - ]["grad_reduce_in_fp32"], - overlap_grad_reduce=config["megatron_cfg"][ - "distributed_data_parallel_config" - ]["overlap_grad_reduce"], - overlap_param_gather=config["megatron_cfg"][ - "distributed_data_parallel_config" - ]["overlap_param_gather"], - # we need to set average_in_collective=False with calculate_per_token_loss=T - # otherwise, mcore throws an assertion error. 
- average_in_collective=False, # Required with calculate_per_token_loss=True - use_distributed_optimizer=config["megatron_cfg"]["optimizer"][ - "use_distributed_optimizer" - ], - data_parallel_sharding_strategy=config["megatron_cfg"][ - "distributed_data_parallel_config" - ]["data_parallel_sharding_strategy"], - ), - scheduler=SchedulerConfig(**config["megatron_cfg"]["scheduler"]), - dataset=None, - tokenizer=TokenizerConfig( - tokenizer_type="HuggingFaceTokenizer", - tokenizer_model=hf_model_name, - ), +) -> None: + """Update the existing ConfigContainer with checkpoint, optimizer, scheduler, and other settings. + + This modifies megatron_cfg in-place. For sub-configs (optimizer, ddp, scheduler, etc.), + only fields explicitly provided in the NeMo-RL config are updated; other fields retain + their original values from the recipe or checkpoint. + """ + megatron_cfg_dict = config.get("megatron_cfg", {}) + + # Ensure dist config is initialized (required for validate()) + if megatron_cfg.dist is None: + megatron_cfg.dist = DistributedInitConfig() + + # Always replace checkpoint config (NeMo-RL manages checkpoints) + megatron_cfg.checkpoint = checkpoint_config + + # Always set logger + megatron_cfg.logger = LoggerConfig(logging_level=0) + + # Update training config - these are NeMo-RL specific + if megatron_cfg.train is None: + megatron_cfg.train = TrainingConfig() + megatron_cfg.train.micro_batch_size = 1 # ignored by NeMo-RL + megatron_cfg.train.global_batch_size = config.get("train_global_batch_size", 1) # ignored by NeMo-RL + if "train_iters" in megatron_cfg_dict: + megatron_cfg.train.train_iters = megatron_cfg_dict["train_iters"] + + # Update optimizer config - merge with existing + optimizer_overrides = megatron_cfg_dict.get("optimizer", {}) + if optimizer_overrides: + if megatron_cfg.optimizer is None: + megatron_cfg.optimizer = OptimizerConfig(**optimizer_overrides) + else: + _update_dataclass_fields(megatron_cfg.optimizer, optimizer_overrides) + + # Update DDP config - merge with existing + ddp_overrides = megatron_cfg_dict.get("distributed_data_parallel_config", {}) + if megatron_cfg.ddp is None: + megatron_cfg.ddp = DistributedDataParallelConfig() + + # Apply explicit DDP overrides from config + if ddp_overrides: + _update_dataclass_fields(megatron_cfg.ddp, ddp_overrides) + + # NeMo-RL required DDP settings (always set) + megatron_cfg.ddp.check_for_nan_in_grad = True + # Required with calculate_per_token_loss=True, otherwise mcore throws assertion error + megatron_cfg.ddp.average_in_collective = False + + # Sync use_distributed_optimizer between optimizer and ddp + if megatron_cfg.optimizer is not None: + megatron_cfg.ddp.use_distributed_optimizer = megatron_cfg.optimizer.use_distributed_optimizer + + # Update scheduler config - merge with existing + scheduler_overrides = megatron_cfg_dict.get("scheduler", {}) + if scheduler_overrides: + if megatron_cfg.scheduler is None: + megatron_cfg.scheduler = SchedulerConfig(**scheduler_overrides) + else: + _update_dataclass_fields(megatron_cfg.scheduler, scheduler_overrides) + + # NeMo-RL handles data separately + megatron_cfg.dataset = None + + # Update tokenizer config - always set for HuggingFace tokenizer + megatron_cfg.tokenizer = TokenizerConfig( + tokenizer_type="HuggingFaceTokenizer", + tokenizer_model=hf_model_name, ) @@ -679,7 +753,7 @@ def setup_model_and_optimizer( use_peft = policy_cfg["megatron_cfg"].get("peft", {}).get("enabled", False) mixed_precision_wrapper = Float16Module - if 
policy_cfg["megatron_cfg"]["freeze_moe_router"]: + if policy_cfg["megatron_cfg"].get("freeze_moe_router", False): if use_peft: raise ValueError( "Freezing the MOE router is not currently supported when using PEFT" @@ -946,10 +1020,8 @@ def finalize_megatron_setup( ) should_disable_forward_pre_hook = ( - config["megatron_cfg"]["optimizer"]["use_distributed_optimizer"] - and config["megatron_cfg"]["distributed_data_parallel_config"][ - "overlap_param_gather" - ] + megatron_cfg.optimizer.use_distributed_optimizer + and megatron_cfg.ddp.overlap_param_gather ) return megatron_tokenizer, megatron_bridge, should_disable_forward_pre_hook, dp_size diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index 363399cbca..d83a209f49 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -256,6 +256,11 @@ class PolicyConfig(TypedDict): reward_model_cfg: NotRequired[RewardModelConfig] dtensor_cfg: DTensorConfig | DTensorConfigDisabled megatron_cfg: NotRequired[MegatronConfig | MegatronConfigDisabled] + # Fully qualified Python import path to a Megatron-Bridge recipe function. + # When set, the recipe is loaded at runtime to provide the base model configuration. + # When null/unset, configuration is loaded from the checkpoint's run_config.yaml. + # Example: "megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config" + megatron_recipe: NotRequired[str | None] hf_config_overrides: NotRequired[dict[str, Any]] dynamic_batching: DynamicBatchingConfig | DynamicBatchingConfigDisabled sequence_packing: NotRequired[SequencePackingConfig | SequencePackingConfigDisabled] diff --git a/nemo_rl/models/policy/workers/megatron_policy_worker.py b/nemo_rl/models/policy/workers/megatron_policy_worker.py index 48ba0623e2..ef23ff556e 100644 --- a/nemo_rl/models/policy/workers/megatron_policy_worker.py +++ b/nemo_rl/models/policy/workers/megatron_policy_worker.py @@ -279,6 +279,15 @@ def __init__( self.optimizer, ) + # Dump ConfigContainer to YAML for inspection (only on rank 0) + if self.rank == 0: + config_dump_path = "/lustre/fsw/portfolios/coreai/users/sfawzy/final_megatron_config.yaml" + try: + self.megatron_cfg.to_yaml(config_dump_path) + print(f"[DEBUG] Saved final ConfigContainer to: {config_dump_path}") + except Exception as e: + print(f"[WARNING] Failed to save ConfigContainer to YAML: {e}") + # vars used for refit ## will be initialized in prepare_refit_info # refit_param_info_mcore combines the conversion tasks with the param memory @@ -372,6 +381,7 @@ def train( self.cfg, mbs, straggler_timer=self.mcore_state.straggler_timer, + model_cfg=self.megatron_cfg.model, ) # Track total microbatches for MoE aux-loss averaging total_num_microbatches += int(num_microbatches) @@ -556,6 +566,7 @@ def get_logprobs( self.cfg, logprob_batch_size, straggler_timer=self.mcore_state.straggler_timer, + model_cfg=self.megatron_cfg.model, ) def forward_step_fn( @@ -763,6 +774,7 @@ def get_topk_logits( self.cfg, logprob_batch_size, straggler_timer=self.mcore_state.straggler_timer, + model_cfg=self.megatron_cfg.model, ) def forward_step_fn(
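
Reference note on the new megatron_recipe mechanism: setup_model_config calls load_recipe(megatron_recipe), whose implementation is not part of this diff. The sketch below is only an assumed minimal illustration of how such a dotted import path (e.g. the qwen3 recipe paths added to the YAML configs above) could be resolved into a recipe function; resolve_recipe is a hypothetical name, not the actual NeMo-RL helper.

import importlib
from typing import Any, Callable


def resolve_recipe(path: str) -> Callable[..., Any]:
    # Split "pkg.module.attr" into module path and attribute name,
    # import the module, and return the attribute (the recipe function).
    module_name, _, attr_name = path.rpartition(".")
    module = importlib.import_module(module_name)
    return getattr(module, attr_name)


# Usage (assumes megatron-bridge is installed and exposes this recipe):
# recipe_fn = resolve_recipe("megatron.bridge.recipes.qwen.qwen3.qwen3_1p7b_pretrain_config")
# megatron_cfg = recipe_fn()  # per the diff, the result exposes .model with the base model config

This keeps model-architecture defaults in the Megatron-Bridge recipe, while _update_megatron_config layers only the explicitly provided NeMo-RL overrides (optimizer, ddp, scheduler, checkpoint, tokenizer) on top of it.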