
Mdp #1849

.gitmodules (4 changes: 2 additions & 2 deletions)
@@ -1,7 +1,7 @@
 [submodule "3rdparty/Megatron-LM"]
 	path = 3rdparty/Megatron-LM-workspace/Megatron-LM
-	url = https://github.com/terrykong/Megatron-LM.git
-	branch = yuya/nemo-rl-use-dev
+	url = https://github.com/yaoyu-33/Megatron-LM.git
+	branch = main
 	shallow = true
[submodule "3rdparty/Megatron-Bridge"]
 	path = 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
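Note for anyone updating an existing checkout: repointing a submodule's URL and branch does not take effect until the config is re-synced. A minimal sketch of one way to do that (standard git subcommands, wrapped in Python only for illustration):

    import subprocess

    # Re-read .gitmodules so the new fork URL and branch take effect,
    # then update; --depth 1 matches the `shallow = true` setting above.
    for cmd in (
        ["git", "submodule", "sync", "--recursive"],
        ["git", "submodule", "update", "--init", "--recursive", "--depth", "1"],
    ):
        subprocess.run(cmd, check=True)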
3rdparty/Megatron-Bridge-workspace/Megatron-Bridge (2 changes: 1 addition & 1 deletion)
Submodule Megatron-Bridge updated 324 files
3rdparty/Megatron-Bridge-workspace/setup.py (5 changes: 3 additions & 2 deletions)
@@ -26,7 +26,8 @@
 bridge_package_name = "megatron.bridge"

 CACHED_DEPENDENCIES = [
-    "transformers>=4.57.1",
+    "accelerate",
+    "transformers==4.57.1",
     "datasets",
     "omegaconf>=2.3.0",
     "tensorboard>=2.19.0",
@@ -40,7 +41,7 @@
     "hydra-core>1.3,<=1.3.2",
     "megatron-core[dev,mlm]>=0.15.0a0,<0.17.0",
     "qwen-vl-utils",
-    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
+    "transformer-engine[pytorch]>=2.10.0a0,<2.12.0",
     "mamba-ssm",
     "nvidia-resiliency-ext",
     "causal-conv1d",
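This hunk tightens transformers from `>=4.57.1` to exactly `4.57.1`, adds `accelerate`, and moves transformer-engine to the 2.10-2.11 window. A quick environment check using `packaging` (specifier strings copied from the diff; the snippet itself is an illustrative sketch, not part of the PR):

    from importlib.metadata import version
    from packaging.specifiers import SpecifierSet

    # Specifiers copied from the updated CACHED_DEPENDENCIES above.
    pins = {
        "transformers": SpecifierSet("==4.57.1"),
        "transformer-engine": SpecifierSet(">=2.10.0a0,<2.12.0"),
    }

    for name, spec in pins.items():
        installed = version(name)
        # prereleases=True because the lower bound is an alpha (2.10.0a0).
        status = "ok" if spec.contains(installed, prereleases=True) else f"violates {spec}"
        print(f"{name} {installed}: {status}")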
3rdparty/Megatron-LM-workspace/Megatron-LM (2 changes: 1 addition & 1 deletion)
Submodule Megatron-LM updated 627 files
3rdparty/Megatron-LM-workspace/setup.py (16 changes: 8 additions & 8 deletions)
@@ -44,30 +44,30 @@
 CACHED_DEPENDENCIES = [
     # Default dependencies from pyproject.toml
     "torch",
-    "numpy<2.0.0",
+    "numpy",
     "packaging>=24.2",
     # Dev dependencies from pyproject.toml
-    "nvidia-modelopt[torch]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'",
-    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
-    "nvidia-resiliency-ext>=0.4.0a0,<0.5.0",
+    "nvidia-modelopt[torch]; sys_platform != 'darwin'",
+    "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.12.0",
+    "nvidia-resiliency-ext",
     "tqdm",
     "einops~=0.8",
     "tensorstore~=0.1,!=0.1.46,!=0.1.72",
     "nvtx~=0.2",
     "multi-storage-client~=0.27",
     "opentelemetry-api~=1.33.1",
     "setuptools<80.0.0",
     "mamba-ssm~=2.2",
     "causal-conv1d~=1.5",
     "nv-grouped-gemm~=1.1",
     "megatron-energon[av_decode]~=6.0",
-    "av<16.0.0",
-    "flashinfer-python",
+    "av",
+    "flashinfer-python~=0.5.0",
     "wget",
     "onnxscript",
     "flash-linear-attention~=0.3.2",
     # VCS dependency - must match pyproject.toml [tool.uv.sources]
     "emerging_optimizers @ git+https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git@v0.1.0",
+    "datasets",
+    "fastapi~=0.50",
 ]


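Several floating or hard-capped pins above move to compatible-release (`~=`) specifiers, and these are more asymmetric than they look: `~=0.5.0` admits only patch releases, while `~=0.50` admits any later 0.x. A small demonstration with `packaging` (illustrative only):

    from packaging.specifiers import SpecifierSet

    # "~=0.5.0" expands to ">=0.5.0, ==0.5.*": patch releases only.
    print(SpecifierSet("~=0.5.0").contains("0.5.3"))   # True
    print(SpecifierSet("~=0.5.0").contains("0.6.0"))   # False

    # "~=0.50" expands to ">=0.50, ==0.*": any later 0.x release qualifies.
    print(SpecifierSet("~=0.50").contains("0.118.0"))  # True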
examples/configs/grpo_math_1B_megatron.yaml (15 changes: 9 additions & 6 deletions)
@@ -143,14 +143,17 @@ policy:
     top_p: 1.0
     top_k: null
     mcore_generation_config:
-      buffer_size_gb: 20 # Total GPU memory (in GB) allocated for KV cache buffers
+      buffer_size_gb: 25 # Total GPU memory (in GB) allocated for KV cache buffers
       buffer_guaranteed_fraction: 0.1 # Fraction of buffer reserved for guaranteed active requests
-      num_cuda_graphs: 16 # Number of CUDA graphs to pre-compile for different batch sizes
+      num_cuda_graphs: 6 # Number of CUDA graphs to pre-compile for different batch sizes
       block_size_tokens: 256 # Size of each KV cache block in tokens (affects memory granularity)
-      use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
-      enable_chunked_prefill: true # Split long prefills into chunks for better memory management
-      unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
+      unified_memory_level: 1 # Unified memory usage level (0=disabled, 1+=enables unified memory with static tensor addresses)
+      max_tokens: 16384 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
+      reset_cuda_graphs: true
+      offload_kv_cache_during_training: true # Move KV cache to CPU during training
+      enable_cuda_graph: true
+      enable_chunked_prefill: false

     vllm_cfg:
       tensor_parallel_size: 1
@@ -178,8 +181,8 @@ logger:
   swanlab_enabled: false # Disable SwanLab logging
   monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard
   wandb:
-    project: "grpo-dev"
-    name: "sj_megatron_1B"
+    project: "qwen_30b_final"
+    name: "none"
   swanlab:
     project: "grpo-dev"
     name: "sj_megatron_1B"
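For a sense of scale on the `buffer_size_gb: 25` / `block_size_tokens: 256` pair, a back-of-envelope sketch follows. The model shape below (layer count, KV heads, head dim) is a hypothetical placeholder, not read from this config, so the outputs are illustrative only:

    # Hypothetical model shape; the real values come from the checkpoint.
    num_layers, num_kv_heads, head_dim = 28, 4, 128
    bytes_per_el = 2  # bfloat16

    # K and V caches per token, summed over all layers.
    kv_bytes_per_token = 2 * num_layers * num_kv_heads * head_dim * bytes_per_el

    buffer_bytes = 25 * 1024**3              # buffer_size_gb: 25
    block_bytes = 256 * kv_bytes_per_token   # block_size_tokens: 256

    print(kv_bytes_per_token)                  # 57344 bytes (~56 KiB) per token
    print(buffer_bytes // kv_bytes_per_token)  # ~468k tokens of KV cache total
    print(buffer_bytes // block_bytes)         # ~1828 blocks of 256 tokens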
examples/configs/grpo_math_qwen30ba3b_megatron.yaml (10 changes: 5 additions & 5 deletions)
@@ -2,18 +2,18 @@
 defaults: "grpo_math_1B_megatron.yaml"

 grpo:
-  num_prompts_per_step: 64
-  num_generations_per_prompt: 32
+  num_prompts_per_step: 16
+  num_generations_per_prompt: 8

 policy:
   model_name: "Qwen/Qwen3-30B-A3B"
   tokenizer:
     name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
-  train_global_batch_size: 512
+  train_global_batch_size: 64
   train_micro_batch_size: 1
   generation_batch_size: 32 # Only used when generating using HF backend
   logprob_batch_size: 4
-  max_total_sequence_length: 4096
+  max_total_sequence_length: 1024
   precision: "bfloat16"

   dtensor_cfg:
@@ -68,7 +68,7 @@ policy:
     stop_token_ids: null
     stop_strings: null
     vllm_cfg:
-      tensor_parallel_size: 4
+      tensor_parallel_size: 16
       gpu_memory_utilization: 0.7
       enforce_eager: false
       max_model_len: ${policy.max_total_sequence_length}
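The generation and training batch settings in this file shrink together: rollouts per step drop from 64 × 32 = 2048 to 16 × 8 = 128, alongside the `train_global_batch_size` cut from 512 to 64. A sanity-check sketch of that arithmetic (the divisibility invariant is an assumption about how the trainer consumes the rollout pool, not something this diff asserts):

    cfg = {
        "num_prompts_per_step": 16,
        "num_generations_per_prompt": 8,
        "train_global_batch_size": 64,
    }

    rollouts = cfg["num_prompts_per_step"] * cfg["num_generations_per_prompt"]
    print(rollouts)  # 128 samples per GRPO step (previously 64 * 32 = 2048)

    # Assumed invariant: the rollout pool splits evenly into global training batches.
    assert rollouts % cfg["train_global_batch_size"] == 0
    print(rollouts // cfg["train_global_batch_size"])  # 2 optimizer batches per step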