
Commit 7d37192

squash MCore bump + related updates
Signed-off-by: Yi-Fu Wu <yifu.wu@gmail.com>
1 parent e48d2d2 commit 7d37192

Showing 8 changed files with 43 additions and 27 deletions.

File tree

- .gitmodules
- 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge (submodule)
- 3rdparty/Megatron-Bridge-workspace/setup.py
- 3rdparty/Megatron-LM-workspace/Megatron-LM (submodule)
- 3rdparty/Megatron-LM-workspace/setup.py
- nemo_rl/models/megatron/setup.py
- nemo_rl/models/policy/workers/megatron_policy_worker.py
- uv.lock

.gitmodules

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,7 +1,7 @@
 [submodule "3rdparty/Megatron-LM"]
 	path = 3rdparty/Megatron-LM-workspace/Megatron-LM
-	url = https://github.com/terrykong/Megatron-LM.git
-	branch = yuya/nemo-rl-use-dev
+	url = https://github.com/yaoyu-33/Megatron-LM.git
+	branch = main
 	shallow = true
 [submodule "3rdparty/Megatron-Bridge"]
 	path = 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
```
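Note that since the submodule URL and branch change here (and both submodule pointers move in this commit), existing checkouts need a `git submodule sync --recursive` followed by `git submodule update --init --recursive` to pick up the new remote and revisions.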
Submodule Megatron-Bridge updated 384 files

3rdparty/Megatron-Bridge-workspace/setup.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -26,8 +26,9 @@
 bridge_package_name = "megatron.bridge"

 CACHED_DEPENDENCIES = [
-    "transformers>=4.57.1",
+    "transformers<5.0.0",
     "datasets",
+    "accelerate",
     "omegaconf>=2.3.0",
     "tensorboard>=2.19.0",
     "typing-extensions",
@@ -40,7 +41,7 @@
     "hydra-core>1.3,<=1.3.2",
     "megatron-core[dev,mlm]>=0.15.0a0,<0.17.0",
     "qwen-vl-utils",
-    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
+    "transformer-engine[pytorch]>=2.10.0a0,<2.12.0",
     "mamba-ssm",
     "nvidia-resiliency-ext",
     "causal-conv1d",
```
Submodule Megatron-LM updated 966 files

3rdparty/Megatron-LM-workspace/setup.py

Lines changed: 9 additions & 8 deletions
```diff
@@ -44,30 +44,31 @@
 CACHED_DEPENDENCIES = [
     # Default dependencies from pyproject.toml
     "torch",
-    "numpy<2.0.0",
+    "numpy",
     "packaging>=24.2",
     # Dev dependencies from pyproject.toml
-    "nvidia-modelopt[torch]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'",
-    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
-    "nvidia-resiliency-ext>=0.4.0a0,<0.5.0",
+    "nvidia-modelopt[torch]; sys_platform != 'darwin'",
+    "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.12.0",
+    "nvidia-resiliency-ext",
     "tqdm",
     "einops~=0.8",
     "tensorstore~=0.1,!=0.1.46,!=0.1.72",
     "nvtx~=0.2",
     "multi-storage-client~=0.27",
     "opentelemetry-api~=1.33.1",
-    "setuptools<80.0.0",
     "mamba-ssm~=2.2",
     "causal-conv1d~=1.5",
+    "flash-linear-attention~=0.3.2",
     "nv-grouped-gemm~=1.1",
     "megatron-energon[av_decode]~=6.0",
-    "av<16.0.0",
-    "flashinfer-python",
+    "av",
+    "flashinfer-python~=0.5.0",
     "wget",
     "onnxscript",
-    "flash-linear-attention~=0.3.2",
     # VCS dependency - must match pyproject.toml [tool.uv.sources]
     "emerging_optimizers @ git+https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git@v0.1.0",
+    "datasets",
+    "fastapi~=0.50",
 ]
```
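The relaxed `nvidia-modelopt` line keeps its `sys_platform != 'darwin'` PEP 508 environment marker. As a quick illustration (assumed, not from this commit), the `packaging` library shows how such a requirement string is parsed and how the marker evaluates:

```python
# Sketch: parse the requirement string used above and evaluate its marker.
from packaging.requirements import Requirement

req = Requirement("nvidia-modelopt[torch]; sys_platform != 'darwin'")
print(req.name, req.extras, req.marker)  # nvidia-modelopt {'torch'} sys_platform != "darwin"
print(req.marker.evaluate())             # True on Linux, False on macOS
```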

nemo_rl/models/megatron/setup.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -51,6 +51,7 @@
 from megatron.bridge.utils.instantiate_utils import InstantiationMode
 from megatron.bridge.utils.vocab_utils import calculate_padded_vocab_size
 from megatron.core import parallel_state
+from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer import MegatronModule
 from megatron.core.transformer.module import Float16Module
 from megatron.core.transformer.transformer_config import TransformerConfig
@@ -731,6 +732,8 @@ def composed_peft_hook(model: list[MegatronModule]) -> list[MegatronModule]:
         pre_wrap_hook.extend([composed_peft_hook])

     # Model, optimizer, and learning rate.
+    pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+    setattr(megatron_cfg.model, "_pg_collection", pg_collection)
     model = get_model(
         megatron_cfg.model,
         megatron_cfg.ddp,
@@ -739,6 +742,7 @@ def composed_peft_hook(model: list[MegatronModule]) -> list[MegatronModule]:
         data_parallel_random_init=megatron_cfg.rng.data_parallel_random_init,
         pre_wrap_hook=pre_wrap_hook,
         mixed_precision_wrapper=mixed_precision_wrapper,
+        pg_collection=pg_collection,
     )
     if load_optimizer:
         optimizer, scheduler = setup_optimizer(
@@ -872,6 +876,7 @@ def setup_reference_model_state(
         overlap_param_gather_with_optimizer_step=megatron_cfg.optimizer.overlap_param_gather_with_optimizer_step,
         pre_wrap_hook=megatron_cfg.rng.data_parallel_random_init,
         mixed_precision_wrapper=ref_mixed_precision_wrapper,
+        pg_collection=ProcessGroupCollection.use_mpu_process_groups(),
     )

     print("Loading the Reference Model")
@@ -925,6 +930,7 @@ def finalize_megatron_setup(
         megatron_cfg.ddp,
         optimizer,
         align_grad_reduce=megatron_cfg.dist.align_grad_reduce,
+        pg_collection=ProcessGroupCollection.use_mpu_process_groups(),
     )

     tokenizer_config = TokenizerConfig(
```
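`ProcessGroupCollection.use_mpu_process_groups()` and the `pg_collection=` keyword come straight from the diff; the sketch below is only a rough mental model of what the collection bundles, not the actual Megatron-Core implementation.

```python
# Rough mental model only -- NOT the real Megatron-Core class. The idea: bundle
# the process groups that parallel_state tracks globally into one object that
# get_model() and downstream reductions can receive explicitly.
from dataclasses import dataclass

import torch.distributed as dist


@dataclass
class ProcessGroupCollectionSketch:
    tp: dist.ProcessGroup  # tensor-parallel group
    pp: dist.ProcessGroup  # pipeline-parallel group
    dp: dist.ProcessGroup  # data-parallel group
    mp: dist.ProcessGroup  # combined model-parallel group (used for mp reductions)

    @classmethod
    def use_mpu_process_groups(cls) -> "ProcessGroupCollectionSketch":
        # The accessors below do exist in megatron.core.parallel_state, but
        # whether the real class stores exactly these fields is an assumption.
        from megatron.core import parallel_state as ps

        return cls(
            tp=ps.get_tensor_model_parallel_group(),
            pp=ps.get_pipeline_model_parallel_group(),
            dp=ps.get_data_parallel_group(),
            mp=ps.get_model_parallel_group(),
        )
```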

nemo_rl/models/policy/workers/megatron_policy_worker.py

Lines changed: 7 additions & 3 deletions
```diff
@@ -27,6 +27,7 @@
     maybe_finalize_async_save,
     save_checkpoint,
 )
+from megatron.bridge.training.utils.pg_utils import get_pg_collection
 from megatron.bridge.training.utils.train_utils import (
     logical_and_across_model_parallel_group,
     reduce_max_stat_across_model_parallel_group,
@@ -55,6 +56,7 @@
     is_pipeline_last_stage,
 )
 from megatron.core.pipeline_parallel import get_forward_backward_func
+from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.rerun_state_machine import get_rerun_state_machine
 from transformers import PreTrainedTokenizerBase

@@ -415,18 +417,20 @@ def train(
         else:
             update_successful, grad_norm, num_zeros_in_grad = (True, 0.0, 0.0)

+        pg_collection = get_pg_collection(self.model)
+
         # when freezing sub-models we may have a mixture of successful and unsucessful ranks,
         # so we must gather across mp ranks
         update_successful = logical_and_across_model_parallel_group(
-            update_successful
+            update_successful, mp_group=pg_collection.mp
         )
         # grad_norm and num_zeros_in_grad will be None on ranks without trainable params,
         # so we must gather across mp ranks
         grad_norm: float = reduce_max_stat_across_model_parallel_group(
-            grad_norm
+            grad_norm, mp_group=pg_collection.mp
         )
         num_zeros_in_grad: float = reduce_max_stat_across_model_parallel_group(
-            num_zeros_in_grad
+            num_zeros_in_grad, mp_group=pg_collection.mp
         )

         if update_successful:
```
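With `mp_group` now passed explicitly, these reductions no longer read the group from global parallel state. A minimal sketch (an assumption about behavior, not the Megatron-Bridge source) of what a helper like `logical_and_across_model_parallel_group` has to do with an explicit group:

```python
# Sketch, assuming NCCL-style collectives; illustrates the pattern, not the
# actual Megatron-Bridge helper.
import torch
import torch.distributed as dist


def logical_and_across_group(value: bool, mp_group: dist.ProcessGroup) -> bool:
    """AND a per-rank success flag across the given model-parallel group."""
    # NCCL requires CUDA tensors; encode the flag as 0/1 and reduce with MIN,
    # since min over {0, 1} is exactly logical AND.
    flag = torch.tensor(1 if value else 0, dtype=torch.int32, device="cuda")
    dist.all_reduce(flag, op=dist.ReduceOp.MIN, group=mp_group)
    return bool(flag.item())
```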

uv.lock

Lines changed: 14 additions & 10 deletions
Some generated files are not rendered by default.
