
Mdp #1849

.gitmodules (4 changes: 2 additions & 2 deletions)
@@ -1,7 +1,7 @@
 [submodule "3rdparty/Megatron-LM"]
 	path = 3rdparty/Megatron-LM-workspace/Megatron-LM
-	url = https://github.com/terrykong/Megatron-LM.git
-	branch = yuya/nemo-rl-use-dev
+	url = https://github.com/yaoyu-33/Megatron-LM.git
+	branch = main
 	shallow = true
[submodule "3rdparty/Megatron-Bridge"]
 	path = 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
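Note for anyone updating an existing checkout: repointing a submodule's URL and branch does not take effect until the config is re-synced. A minimal sketch of one way to do that (standard git subcommands, wrapped in Python only for illustration):

    import subprocess

    # Re-read .gitmodules so the new fork URL and branch take effect,
    # then update; --depth 1 matches the `shallow = true` setting above.
    for cmd in (
        ["git", "submodule", "sync", "--recursive"],
        ["git", "submodule", "update", "--init", "--recursive", "--depth", "1"],
    ):
        subprocess.run(cmd, check=True)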
3rdparty/Megatron-Bridge-workspace/Megatron-Bridge (2 changes: 1 addition & 1 deletion)
Submodule Megatron-Bridge updated 324 files
3rdparty/Megatron-Bridge-workspace/setup.py (5 changes: 3 additions & 2 deletions)
@@ -26,7 +26,8 @@
 bridge_package_name = "megatron.bridge"

 CACHED_DEPENDENCIES = [
-    "transformers>=4.57.1",
+    "accelerate",
+    "transformers==4.57.1",
     "datasets",
     "omegaconf>=2.3.0",
     "tensorboard>=2.19.0",
@@ -40,7 +41,7 @@
     "hydra-core>1.3,<=1.3.2",
     "megatron-core[dev,mlm]>=0.15.0a0,<0.17.0",
     "qwen-vl-utils",
-    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
+    "transformer-engine[pytorch]>=2.10.0a0,<2.12.0",
     "mamba-ssm",
     "nvidia-resiliency-ext",
     "causal-conv1d",
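This hunk tightens transformers from `>=4.57.1` to exactly `4.57.1`, adds `accelerate`, and moves transformer-engine to the 2.10-2.11 window. A quick environment check using `packaging` (specifier strings copied from the diff; the snippet itself is an illustrative sketch, not part of the PR):

    from importlib.metadata import version
    from packaging.specifiers import SpecifierSet

    # Specifiers copied from the updated CACHED_DEPENDENCIES above.
    pins = {
        "transformers": SpecifierSet("==4.57.1"),
        "transformer-engine": SpecifierSet(">=2.10.0a0,<2.12.0"),
    }

    for name, spec in pins.items():
        installed = version(name)
        # prereleases=True because the lower bound is an alpha (2.10.0a0).
        status = "ok" if spec.contains(installed, prereleases=True) else f"violates {spec}"
        print(f"{name} {installed}: {status}")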
3rdparty/Megatron-LM-workspace/Megatron-LM (2 changes: 1 addition & 1 deletion)
Submodule Megatron-LM updated 627 files
3rdparty/Megatron-LM-workspace/setup.py (16 changes: 8 additions & 8 deletions)
@@ -44,30 +44,30 @@
 CACHED_DEPENDENCIES = [
     # Default dependencies from pyproject.toml
     "torch",
-    "numpy<2.0.0",
+    "numpy",
     "packaging>=24.2",
     # Dev dependencies from pyproject.toml
-    "nvidia-modelopt[torch]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'",
-    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
-    "nvidia-resiliency-ext>=0.4.0a0,<0.5.0",
+    "nvidia-modelopt[torch]; sys_platform != 'darwin'",
+    "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.12.0",
+    "nvidia-resiliency-ext",
     "tqdm",
     "einops~=0.8",
     "tensorstore~=0.1,!=0.1.46,!=0.1.72",
     "nvtx~=0.2",
     "multi-storage-client~=0.27",
     "opentelemetry-api~=1.33.1",
     "setuptools<80.0.0",
     "mamba-ssm~=2.2",
     "causal-conv1d~=1.5",
     "nv-grouped-gemm~=1.1",
     "megatron-energon[av_decode]~=6.0",
-    "av<16.0.0",
-    "flashinfer-python",
+    "av",
+    "flashinfer-python~=0.5.0",
     "wget",
     "onnxscript",
     "flash-linear-attention~=0.3.2",
     # VCS dependency - must match pyproject.toml [tool.uv.sources]
     "emerging_optimizers @ git+https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git@v0.1.0",
+    "datasets",
+    "fastapi~=0.50",
 ]


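Several floating or hard-capped pins above move to compatible-release (`~=`) specifiers, and these are more asymmetric than they look: `~=0.5.0` admits only patch releases, while `~=0.50` admits any later 0.x. A small demonstration with `packaging` (illustrative only):

    from packaging.specifiers import SpecifierSet

    # "~=0.5.0" expands to ">=0.5.0, ==0.5.*": patch releases only.
    print(SpecifierSet("~=0.5.0").contains("0.5.3"))   # True
    print(SpecifierSet("~=0.5.0").contains("0.6.0"))   # False

    # "~=0.50" expands to ">=0.50, ==0.*": any later 0.x release qualifies.
    print(SpecifierSet("~=0.50").contains("0.118.0"))  # True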
examples/configs/grpo_math_1B_megatron.yaml (15 changes: 9 additions & 6 deletions)
@@ -143,14 +143,17 @@ policy:
     top_p: 1.0
     top_k: null
     mcore_generation_config:
-      buffer_size_gb: 20 # Total GPU memory (in GB) allocated for KV cache buffers
+      buffer_size_gb: 25 # Total GPU memory (in GB) allocated for KV cache buffers
       buffer_guaranteed_fraction: 0.1 # Fraction of buffer reserved for guaranteed active requests
-      num_cuda_graphs: 16 # Number of CUDA graphs to pre-compile for different batch sizes
+      num_cuda_graphs: 6 # Number of CUDA graphs to pre-compile for different batch sizes
       block_size_tokens: 256 # Size of each KV cache block in tokens (affects memory granularity)
-      use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
-      enable_chunked_prefill: true # Split long prefills into chunks for better memory management
-      unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
+      unified_memory_level: 1 # Unified memory usage level (0=disabled, 1+=enables unified memory with static tensor addresses)
+      max_tokens: 16384 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
+      reset_cuda_graphs: true
+      offload_kv_cache_during_training: true # Move KV cache to CPU during training
+      enable_cuda_graph: true
+      enable_chunked_prefill: false

     vllm_cfg:
       tensor_parallel_size: 1
@@ -178,8 +181,8 @@ logger:
   swanlab_enabled: false # Disable SwanLab logging
   monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard
   wandb:
-    project: "grpo-dev"
-    name: "sj_megatron_1B"
+    project: "qwen_30b_final"
+    name: "none"
   swanlab:
     project: "grpo-dev"
     name: "sj_megatron_1B"
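For a sense of scale on the `buffer_size_gb: 25` / `block_size_tokens: 256` pair, a back-of-envelope sketch follows. The model shape below (layer count, KV heads, head dim) is a hypothetical placeholder, not read from this config, so the outputs are illustrative only:

    # Hypothetical model shape; the real values come from the checkpoint.
    num_layers, num_kv_heads, head_dim = 28, 4, 128
    bytes_per_el = 2  # bfloat16

    # K and V caches per token, summed over all layers.
    kv_bytes_per_token = 2 * num_layers * num_kv_heads * head_dim * bytes_per_el

    buffer_bytes = 25 * 1024**3              # buffer_size_gb: 25
    block_bytes = 256 * kv_bytes_per_token   # block_size_tokens: 256

    print(kv_bytes_per_token)                  # 57344 bytes (~56 KiB) per token
    print(buffer_bytes // kv_bytes_per_token)  # ~468k tokens of KV cache total
    print(buffer_bytes // block_bytes)         # ~1828 blocks of 256 tokens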
examples/configs/grpo_math_qwen30ba3b_megatron.yaml (10 changes: 5 additions & 5 deletions)
@@ -2,18 +2,18 @@
 defaults: "grpo_math_1B_megatron.yaml"

 grpo:
-  num_prompts_per_step: 64
-  num_generations_per_prompt: 32
+  num_prompts_per_step: 16
+  num_generations_per_prompt: 8

 policy:
   model_name: "Qwen/Qwen3-30B-A3B"
   tokenizer:
     name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
-  train_global_batch_size: 512
+  train_global_batch_size: 64
   train_micro_batch_size: 1
   generation_batch_size: 32 # Only used when generating using HF backend
   logprob_batch_size: 4
-  max_total_sequence_length: 4096
+  max_total_sequence_length: 1024
   precision: "bfloat16"

   dtensor_cfg:
@@ -68,7 +68,7 @@ policy:
     stop_token_ids: null
     stop_strings: null
     vllm_cfg:
-      tensor_parallel_size: 4
+      tensor_parallel_size: 16
       gpu_memory_utilization: 0.7
       enforce_eager: false
       max_model_len: ${policy.max_total_sequence_length}
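The generation and training batch settings in this file shrink together: rollouts per step drop from 64 × 32 = 2048 to 16 × 8 = 128, alongside the `train_global_batch_size` cut from 512 to 64. A sanity-check sketch of that arithmetic (the divisibility invariant is an assumption about how the trainer consumes the rollout pool, not something this diff asserts):

    cfg = {
        "num_prompts_per_step": 16,
        "num_generations_per_prompt": 8,
        "train_global_batch_size": 64,
    }

    rollouts = cfg["num_prompts_per_step"] * cfg["num_generations_per_prompt"]
    print(rollouts)  # 128 samples per GRPO step (previously 64 * 32 = 2048)

    # Assumed invariant: the rollout pool splits evenly into global training batches.
    assert rollouts % cfg["train_global_batch_size"] == 0
    print(rollouts // cfg["train_global_batch_size"])  # 2 optimizer batches per step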