71 changes: 0 additions & 71 deletions examples/configs/distillation_math.yaml
@@ -84,77 +84,6 @@ policy: &POLICY_BASE
foreach: False
fused: False

megatron_cfg: &MEGATRON_BASE
enabled: false
empty_unused_memory_level: 0
activation_checkpointing: false
converter_type: "Qwen3ForCausalLM"
tensor_model_parallel_size: 2
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 2
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
context_parallel_size: 2
pipeline_dtype: ${policy.precision}
sequence_parallel: false
freeze_moe_router: true
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
bias_activation_fusion: True
defer_fp32_logits: False
moe_per_layer_logging: False
moe_enable_deepep: false
moe_token_dispatcher_type: "allgather"
moe_shared_expert_overlap: false

optimizer:
optimizer: "adam"
lr: 2.00001e-5
min_lr: 2.0e-5
weight_decay: 0.01
bf16: true
fp16: false
params_dtype: "float32"

#adam
adam_beta1: 0.9
adam_beta2: 0.999
adam_eps: 1e-8

#sgd
sgd_momentum: 0.9

#distributed optimizer
use_distributed_optimizer: true
use_precision_aware_optimizer: true

# optimizer cpu offload
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0

clip_grad: ${policy.max_grad_norm}

scheduler:
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: 1000
lr_warmup_iters: 10
lr_warmup_init: 2.0e-6

distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
use_custom_fsdp: false
data_parallel_sharding_strategy: "optim_grads_params"

scheduler:
- name: "torch.optim.lr_scheduler.LinearLR"
kwargs:
3 changes: 3 additions & 0 deletions examples/configs/distillation_math_megatron.yaml
@@ -35,6 +35,8 @@ policy: &POLICY_BASE

make_sequence_length_divisible_by: ${mul:${mul:${.megatron_cfg.tensor_model_parallel_size}, ${.megatron_cfg.context_parallel_size}}, 2}

megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_1p7b_pretrain_config

megatron_cfg: &MEGATRON_BASE
enabled: true
empty_unused_memory_level: 0
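
For reference, the make_sequence_length_divisible_by expression a few lines above multiplies the student's tensor parallel size by its context parallel size and then by 2. A minimal worked example, using illustrative values of 2 for both parallel sizes (the same values used in the Megatron block deleted from distillation_math.yaml above):

```yaml
# Illustrative resolution of the expression above, assuming
# tensor_model_parallel_size: 2 and context_parallel_size: 2:
#   ${mul:${mul:2, 2}, 2}  ->  8
# i.e. training sequences are padded to a multiple of 8.
make_sequence_length_divisible_by: 8
```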
@@ -140,6 +142,7 @@ policy: &POLICY_BASE
teacher:
<<: *POLICY_BASE
model_name: "Qwen/Qwen3-4B"
megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_4b_pretrain_config
megatron_cfg:
<<: *MEGATRON_BASE
context_parallel_size: 2
71 changes: 0 additions & 71 deletions examples/configs/dpo.yaml
@@ -106,78 +106,7 @@ policy:
factor: 1.0
total_iters: 10000000000
- milestones: [20]

## ignored since enabled=false, but needed for testing purposes
megatron_cfg:
enabled: false
empty_unused_memory_level: 1
activation_checkpointing: false
tensor_model_parallel_size: 2
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
context_parallel_size: 1
pipeline_dtype: ${policy.precision}
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
sequence_parallel: true
freeze_moe_router: false
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "aux_loss"
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
bias_activation_fusion: True
defer_fp32_logits: False
moe_per_layer_logging: False
moe_enable_deepep: false
moe_token_dispatcher_type: "allgather"
moe_shared_expert_overlap: false

optimizer:
optimizer: "adam"
lr: 5.0e-6 #4.0e-5
min_lr: 5.0e-6 #4.0e-5
weight_decay: 0.1
bf16: true
fp16: false
params_dtype: "float32"

#adam
adam_beta1: 0.9
adam_beta2: 0.98
adam_eps: 1e-8

#sgd
sgd_momentum: 0.9

#distributed optimizer
use_distributed_optimizer: true
use_precision_aware_optimizer: true

clip_grad: ${policy.max_grad_norm}

# optimizer cpu offload
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0

scheduler:
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_warmup_iters: 1
lr_warmup_init: 0.00000001

distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
data_parallel_sharding_strategy: "optim_grads_params"
use_custom_fsdp: false

data:
max_input_seq_length: ${policy.max_total_sequence_length}
shuffle: true
75 changes: 0 additions & 75 deletions examples/configs/grpo_math_1B.yaml
@@ -105,81 +105,6 @@ policy:
lora_A_init: "xavier" # Initialization method for LoRA A matrix: "xavier" or "uniform"
use_triton: true # Use Triton-optimized kernels for LoRA (faster but requires flash-attn). Disable when tensor_parallel_size > 1

megatron_cfg:
enabled: false
empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
activation_checkpointing: false
converter_type: "Qwen2ForCausalLM"
tensor_model_parallel_size: 1
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
context_parallel_size: 1
pipeline_dtype: ${policy.precision}
sequence_parallel: false
freeze_moe_router: true
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
bias_activation_fusion: True
defer_fp32_logits: False
moe_per_layer_logging: False
moe_enable_deepep: false
moe_token_dispatcher_type: "allgather"
moe_shared_expert_overlap: false

optimizer:
optimizer: "adam"
lr: 5.0e-6
min_lr: 5.0e-7
weight_decay: 0.01
bf16: true
fp16: false
params_dtype: "float32"

#adam
adam_beta1: 0.9
adam_beta2: 0.999
adam_eps: 1e-8

#sgd
sgd_momentum: 0.9

#distributed optimizer
use_distributed_optimizer: true
use_precision_aware_optimizer: true

clip_grad: ${policy.max_grad_norm}

# optimizer cpu offload
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0

scheduler:
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: 1000
lr_warmup_iters: 13
lr_warmup_init: 5.0e-7

distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
use_custom_fsdp: false
data_parallel_sharding_strategy: "optim_grads_params"

fp8_cfg: null

env_vars: null

# See docs/design-docs/sequence-packing-and-dynamic-batching.md
# for more details on dynamic batching and sequence packing.
68 changes: 1 addition & 67 deletions examples/configs/grpo_math_1B_megatron.yaml
@@ -70,75 +70,9 @@ policy:
sequence_length_round: 64

max_grad_norm: 1.0
# makes the training sequence length divisible by the tensor parallel size
# this is useful for sequence parallel training
make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}

optimizer: null # remove default FSDP optimizer

megatron_cfg:
enabled: true
empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
activation_checkpointing: false
converter_type: "Qwen2ForCausalLM"
tensor_model_parallel_size: 1
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
context_parallel_size: 1
pipeline_dtype: ${policy.precision}
sequence_parallel: false
freeze_moe_router: true
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
moe_enable_deepep: false
moe_token_dispatcher_type: "allgather"
moe_shared_expert_overlap: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

optimizer:
optimizer: "adam"
lr: 5.0e-6
min_lr: 5.0e-7
weight_decay: 0.01
bf16: true
fp16: false
params_dtype: "float32"

#adam
adam_beta1: 0.9
adam_beta2: 0.999
adam_eps: 1e-8

#sgd
sgd_momentum: 0.9

#distributed optimizer
use_distributed_optimizer: true
use_precision_aware_optimizer: true

clip_grad: ${policy.max_grad_norm}

scheduler:
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: 1000
lr_warmup_iters: 13
lr_warmup_init: 5.0e-7

distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
use_custom_fsdp: false
data_parallel_sharding_strategy: "optim_grads_params"
optimizer: null # remove default FSDP optimizer

generation:
backend: "vllm"
2 changes: 2 additions & 0 deletions examples/configs/grpo_math_70B_megatron.yaml
@@ -22,6 +22,8 @@ policy:

scheduler: null # remove default FSDP scheduler

megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config

Comment on lines +25 to +26
Contributor

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Search for any llama3.3 or llama33 references in the codebase
echo "=== Searching for llama3.3 or llama33 references ==="
rg -i 'llama3[._]?3' --type yaml --type py -g '!*.pyc' 2>/dev/null | head -50

echo ""
echo "=== Searching for all llama pretrain_config references ==="
rg 'llama.*pretrain_config|llama.*config' --type py --type yaml 2>/dev/null | head -50

echo ""
echo "=== Searching for megatron.bridge.recipes.llama directory/references ==="
find . -path '*/megatron*' -name '*llama*' 2>/dev/null | grep -E '\.(py|yaml)$' | head -30

Repository: NVIDIA-NeMo/RL

Length of output: 3275


🏁 Script executed:

#!/bin/bash
# Read the full file to see if there are comments explaining the recipe choice
echo "=== Full content of grpo_math_70B_megatron.yaml ==="
cat examples/configs/grpo_math_70B_megatron.yaml

echo ""
echo "=== Search for TODOs or FIXMEs related to llama or recipe ==="
rg -n 'TODO|FIXME' examples/configs/grpo_math_70B_megatron.yaml

echo ""
echo "=== Check megatron.bridge documentation or references ==="
rg -n 'megatron.*bridge.*llama' --type py --type md 2>/dev/null | head -20

echo ""
echo "=== Search for Llama 3.3 in model name references ==="
rg -n 'Llama-3.3|Llama 3.3' --type yaml 2>/dev/null

Repository: NVIDIA-NeMo/RL

Length of output: 2726


Update model to Llama 3.1 or verify recipe compatibility with Llama 3.3.

Line 9 specifies meta-llama/Llama-3.3-70B-Instruct, but line 25 references llama31_70b_pretrain_config (Llama 3.1). No llama33_70b_pretrain_config exists in the codebase. Either change the model to meta-llama/Llama-3.1-70B-Instruct to match the recipe, or provide an alternative Llama 3.3–compatible recipe reference. All other similar configurations (e.g., grpo_math_8B_megatron.yaml, sft_openmathinstruct2_megatron.yaml) use matching model and recipe versions.

🤖 Prompt for AI Agents
In `@examples/configs/grpo_math_70B_megatron.yaml` around lines 25-26: the config mixes a Llama 3.3 model with a Llama 3.1 recipe. Either change the model from "meta-llama/Llama-3.3-70B-Instruct" to "meta-llama/Llama-3.1-70B-Instruct" so it matches the existing megatron_recipe value "llama31_70b_pretrain_config", or replace "megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config" with a Llama 3.3-compatible recipe if one exists; keep this consistent with similar configs such as grpo_math_8B_megatron.yaml and sft_openmathinstruct2_megatron.yaml.
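
A minimal sketch of the first option, assuming the goal is to keep the existing Llama 3.1 recipe and align the model string with it (the model value below is the reviewer's suggested replacement, not the file's current content):

```yaml
# grpo_math_70B_megatron.yaml (sketch): model and recipe now refer to the same
# Llama 3.1 70B architecture.
policy:
  model_name: "meta-llama/Llama-3.1-70B-Instruct"  # was meta-llama/Llama-3.3-70B-Instruct
  megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config
```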

megatron_cfg:
enabled: true
empty_unused_memory_level: 1
2 changes: 2 additions & 0 deletions examples/configs/grpo_math_8B_megatron.yaml
@@ -28,6 +28,8 @@ policy:

scheduler: null # remove default FSDP scheduler

megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config

megatron_cfg:
enabled: true
empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
2 changes: 1 addition & 1 deletion examples/configs/grpo_math_8B_megatron_fp8.yaml
@@ -19,4 +19,4 @@ policy:
optimizer:
use_precision_aware_optimizer: false
env_vars:
NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
2 changes: 2 additions & 0 deletions examples/configs/grpo_math_qwen30ba3b_megatron.yaml
@@ -26,6 +26,8 @@ policy:

scheduler: null # remove default FSDP scheduler

megatron_recipe: megatron.bridge.recipes.qwen.qwen3_30b_a3b_finetune_config

megatron_cfg:
enabled: true
empty_unused_memory_level: 1
1 change: 1 addition & 0 deletions examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml
@@ -34,6 +34,7 @@ policy:
dtensor_cfg:
_v2: false
context_parallel_size: 4
megatron_recipe: megatron.bridge.recipes.qwen.qwen25_7b_finetune_config
megatron_cfg:
tensor_model_parallel_size: 4
pipeline_model_parallel_size: 2
examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml
@@ -22,6 +22,7 @@ policy:
${.megatron_cfg.context_parallel_size}}, 2}
megatron_cfg:
enabled: true
megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config
Contributor

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Find all Qwen3 related recipe files and configs
echo "=== Searching for Qwen3 recipe definitions ==="
find . -type f \( -name "*.py" -o -name "*.yaml" \) | xargs grep -l "qwen3.*recipe\|qwen3.*config" | head -20

echo -e "\n=== Looking for available Qwen3 megatron recipes ==="
rg "qwen3.*pretrain_config|def.*qwen3.*config" --type py -A 2 | head -50

echo -e "\n=== Checking for 1.7b Qwen3 recipe variants ==="
rg "qwen3.*1[._]7" --type py --type yaml

echo -e "\n=== Checking the specific recipe module path ==="
find . -path "*megatron/bridge/recipes/qwen/qwen3*" -type f | head -10

Repository: NVIDIA-NeMo/RL

Length of output: 3262


🏁 Script executed:

#!/bin/bash
# Check the file in question and nearby distillation configs
echo "=== Current file content ==="
head -40 examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml

echo -e "\n=== Check other distillation YAML files for megatron_recipe usage ==="
find examples/configs/recipes/llm -name "*distillation*.yaml" -type f | xargs grep -l "megatron_recipe"

echo -e "\n=== Show megatron_recipe usage in distillation configs ==="
find examples/configs/recipes/llm -name "*distillation*.yaml" -type f -exec grep -H -B 2 -A 2 "megatron_recipe" {} \;

Repository: NVIDIA-NeMo/RL

Length of output: 2029


🏁 Script executed:

#!/bin/bash
# Search for how megatron_recipe is used in the codebase (in Python code)
echo "=== How megatron_recipe is referenced in code ==="
rg "megatron_recipe" --type py -B 3 -A 3 | head -100

echo -e "\n=== Check if there's a mapping or factory for Qwen3 recipes ==="
rg "qwen3_.*pretrain_config|qwen3_.*model_config" --type py

Repository: NVIDIA-NeMo/RL

Length of output: 2617


🏁 Script executed:

#!/bin/bash
# Search for megatron.bridge recipe implementations
echo "=== Searching for megatron bridge recipes ==="
find . -path "*megatron*bridge*recipe*" -o -path "*megatron*recipe*" | grep -E "(\.py|\.yaml)$" | head -20

echo -e "\n=== Check if there's a separate install for megatron-bridge ==="
ls -la requirements*.txt pyproject.toml setup.py 2>/dev/null | head -10

echo -e "\n=== Search for any import of megatron.bridge ==="
rg "from megatron|import megatron" --type py -B 1 -A 1 | head -40

echo -e "\n=== Check the distillation_math.yaml base config ==="
cat examples/configs/distillation_math.yaml | head -100

Repository: NVIDIA-NeMo/RL

Length of output: 6630


🏁 Script executed:

#!/bin/bash
# Check if there are test files or other distillation configs that might hint at expected models
echo "=== Check other distillation YAML configs to understand model/recipe patterns ==="
find examples/configs/recipes/llm -name "*distillation*.yaml" -type f -exec basename {} \; | sort

echo -e "\n=== Look at the FSDP variant for comparison ==="
head -40 examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-fsdp2tp1.v1.yaml

Repository: NVIDIA-NeMo/RL

Length of output: 1702


🏁 Script executed:

#!/bin/bash
# Check if the policy model_name is specified elsewhere or if it's inferred
echo "=== Search for how policy model_name is determined ==="
rg "model_name|student|policy" nemo_rl/models/ --type py -B 2 -A 2 | grep -A 5 -B 5 "1.7\|1_7\|student" | head -60

echo -e "\n=== Check if megatron_recipe can override model size or just provide base config ==="
rg "megatron_recipe|load_recipe" nemo_rl/models/ --type py -B 3 -A 3 | head -80

Repository: NVIDIA-NeMo/RL

Length of output: 4475


🏁 Script executed:

#!/bin/bash
# Search for any Qwen3 recipe definitions in megatron-bridge or available recipes
echo "=== Search for all available Qwen3 recipes ==="
rg "qwen3.*pretrain_config|def.*qwen3_.*_config" -g "*.py" 

echo -e "\n=== Check pyproject.toml for megatron-bridge dependency and version ==="
grep -A 10 -B 2 "megatron" pyproject.toml

echo -e "\n=== Check if there's a mapping of model sizes to recipes ==="
rg "1.7|1_7|1\.7|4b|8b|32b" nemo_rl/models/ --type py | grep -i recipe

Repository: NVIDIA-NeMo/RL

Length of output: 2883


🏁 Script executed:

#!/bin/bash
# Check the git history or comments around this change
echo "=== Check if there are any recent changes or TODOs about this recipe ==="
rg "TODO|FIXME|XXX|qwen3_32b_pretrain_config" nemo_rl/ examples/ --type py --type yaml -B 2 -A 2

echo -e "\n=== Compare the megatron recipe variant with FSDP variant more carefully ==="
echo "=== FSDP (without megatron_recipe) ==="
grep -A 30 "^policy:" examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-fsdp2tp1.v1.yaml | head -40

echo -e "\n=== Megatron (with megatron_recipe) ==="
grep -A 30 "^policy:" examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml | head -40

Repository: NVIDIA-NeMo/RL

Length of output: 24191


Policy uses 32B megatron_recipe but should use 1.7B recipe for the student model.

The policy (student model) is configured with megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config, which provides the 32B model architecture. However, the policy model name inherited from base config is Qwen/Qwen3-1.7B-Base (1.7B). This mismatch will cause the student to be loaded with the wrong model architecture. Replace with a 1.7B recipe, or if no 1.7B recipe exists, remove the recipe and use the HuggingFace model directly like the FSDP variant does.

🤖 Prompt for AI Agents
In `@examples/configs/recipes/llm/distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml` at line 25: the policy's megatron_recipe points to the 32B config (megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config) while the policy model name is Qwen/Qwen3-1.7B-Base. Change megatron_recipe to the matching 1.7B recipe (if one exists), or remove the megatron_recipe entry so the policy uses the HuggingFace model directly (like the FSDP variant), ensuring the student model architecture matches Qwen/Qwen3-1.7B-Base.
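
A minimal sketch of the second option, assuming no 1.7B recipe is available: the student policy drops the recipe entry and loads Qwen/Qwen3-1.7B-Base directly from the HuggingFace checkpoint, while the teacher keeps the 32B recipe that matches Qwen/Qwen3-32B. Key placement follows the other configs in this PR, where megatron_recipe sits alongside model_name; treat the exact nesting as an assumption.

```yaml
# Sketch only: student without a recipe, teacher with the matching 32B recipe.
policy:
  # no megatron_recipe here; the student architecture comes from the
  # Qwen/Qwen3-1.7B-Base HuggingFace checkpoint inherited from the base config
  megatron_cfg:
    enabled: true

teacher:
  model_name: Qwen/Qwen3-32B
  megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config
  megatron_cfg:
    enabled: true
```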

teacher:
model_name: Qwen/Qwen3-32B
dtensor_cfg:
@@ -30,6 +31,7 @@ teacher:
enabled: false
sequence_packing:
enabled: true
megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config
megatron_cfg:
enabled: true
tensor_model_parallel_size: 4
@@ -18,6 +18,7 @@ policy:
enabled: false
make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
optimizer: null
megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
megatron_cfg:
enabled: true
tensor_model_parallel_size: 4