
Commit 7d37192

squash MCore bump + related updates
Signed-off-by: Yi-Fu Wu <yifu.wu@gmail.com>
1 parent e48d2d2 commit 7d37192

Showing 8 changed files with 43 additions and 27 deletions.

File tree

- .gitmodules
- 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge (submodule)
- 3rdparty/Megatron-Bridge-workspace/setup.py
- 3rdparty/Megatron-LM-workspace/Megatron-LM (submodule)
- 3rdparty/Megatron-LM-workspace/setup.py
- nemo_rl/models/megatron/setup.py
- nemo_rl/models/policy/workers/megatron_policy_worker.py
- uv.lock

.gitmodules

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,7 +1,7 @@
 [submodule "3rdparty/Megatron-LM"]
 	path = 3rdparty/Megatron-LM-workspace/Megatron-LM
-	url = https://github.com/terrykong/Megatron-LM.git
-	branch = yuya/nemo-rl-use-dev
+	url = https://github.com/yaoyu-33/Megatron-LM.git
+	branch = main
 	shallow = true
 [submodule "3rdparty/Megatron-Bridge"]
 	path = 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
```
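Note that since the submodule URL and branch change here (and both submodule pointers move in this commit), existing checkouts need a `git submodule sync --recursive` followed by `git submodule update --init --recursive` to pick up the new remote and revisions.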
Submodule Megatron-Bridge updated 384 files

3rdparty/Megatron-Bridge-workspace/setup.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -26,8 +26,9 @@
 bridge_package_name = "megatron.bridge"

 CACHED_DEPENDENCIES = [
-    "transformers>=4.57.1",
+    "transformers<5.0.0",
     "datasets",
+    "accelerate",
     "omegaconf>=2.3.0",
     "tensorboard>=2.19.0",
     "typing-extensions",
@@ -40,7 +41,7 @@
     "hydra-core>1.3,<=1.3.2",
     "megatron-core[dev,mlm]>=0.15.0a0,<0.17.0",
     "qwen-vl-utils",
-    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
+    "transformer-engine[pytorch]>=2.10.0a0,<2.12.0",
     "mamba-ssm",
     "nvidia-resiliency-ext",
     "causal-conv1d",
```
Submodule Megatron-LM updated 966 files

3rdparty/Megatron-LM-workspace/setup.py

Lines changed: 9 additions & 8 deletions
```diff
@@ -44,30 +44,31 @@
 CACHED_DEPENDENCIES = [
     # Default dependencies from pyproject.toml
     "torch",
-    "numpy<2.0.0",
+    "numpy",
     "packaging>=24.2",
     # Dev dependencies from pyproject.toml
-    "nvidia-modelopt[torch]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'",
-    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
-    "nvidia-resiliency-ext>=0.4.0a0,<0.5.0",
+    "nvidia-modelopt[torch]; sys_platform != 'darwin'",
+    "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.12.0",
+    "nvidia-resiliency-ext",
     "tqdm",
     "einops~=0.8",
     "tensorstore~=0.1,!=0.1.46,!=0.1.72",
     "nvtx~=0.2",
     "multi-storage-client~=0.27",
     "opentelemetry-api~=1.33.1",
-    "setuptools<80.0.0",
     "mamba-ssm~=2.2",
     "causal-conv1d~=1.5",
+    "flash-linear-attention~=0.3.2",
     "nv-grouped-gemm~=1.1",
     "megatron-energon[av_decode]~=6.0",
-    "av<16.0.0",
-    "flashinfer-python",
+    "av",
+    "flashinfer-python~=0.5.0",
     "wget",
     "onnxscript",
-    "flash-linear-attention~=0.3.2",
     # VCS dependency - must match pyproject.toml [tool.uv.sources]
     "emerging_optimizers @ git+https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git@v0.1.0",
+    "datasets",
+    "fastapi~=0.50",
 ]
```
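The relaxed `nvidia-modelopt` line keeps its `sys_platform != 'darwin'` PEP 508 environment marker. As a quick illustration (assumed, not from this commit), the `packaging` library shows how such a requirement string is parsed and how the marker evaluates:

```python
# Sketch: parse the requirement string used above and evaluate its marker.
from packaging.requirements import Requirement

req = Requirement("nvidia-modelopt[torch]; sys_platform != 'darwin'")
print(req.name, req.extras, req.marker)  # nvidia-modelopt {'torch'} sys_platform != "darwin"
print(req.marker.evaluate())             # True on Linux, False on macOS
```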

nemo_rl/models/megatron/setup.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -51,6 +51,7 @@
 from megatron.bridge.utils.instantiate_utils import InstantiationMode
 from megatron.bridge.utils.vocab_utils import calculate_padded_vocab_size
 from megatron.core import parallel_state
+from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer import MegatronModule
 from megatron.core.transformer.module import Float16Module
 from megatron.core.transformer.transformer_config import TransformerConfig
@@ -731,6 +732,8 @@ def composed_peft_hook(model: list[MegatronModule]) -> list[MegatronModule]:
         pre_wrap_hook.extend([composed_peft_hook])

     # Model, optimizer, and learning rate.
+    pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+    setattr(megatron_cfg.model, "_pg_collection", pg_collection)
     model = get_model(
         megatron_cfg.model,
         megatron_cfg.ddp,
@@ -739,6 +742,7 @@ def composed_peft_hook(model: list[MegatronModule]) -> list[MegatronModule]:
         data_parallel_random_init=megatron_cfg.rng.data_parallel_random_init,
         pre_wrap_hook=pre_wrap_hook,
         mixed_precision_wrapper=mixed_precision_wrapper,
+        pg_collection=pg_collection,
     )
     if load_optimizer:
         optimizer, scheduler = setup_optimizer(
@@ -872,6 +876,7 @@ def setup_reference_model_state(
         overlap_param_gather_with_optimizer_step=megatron_cfg.optimizer.overlap_param_gather_with_optimizer_step,
         pre_wrap_hook=megatron_cfg.rng.data_parallel_random_init,
         mixed_precision_wrapper=ref_mixed_precision_wrapper,
+        pg_collection=ProcessGroupCollection.use_mpu_process_groups(),
     )

     print("Loading the Reference Model")
@@ -925,6 +930,7 @@ def finalize_megatron_setup(
         megatron_cfg.ddp,
         optimizer,
         align_grad_reduce=megatron_cfg.dist.align_grad_reduce,
+        pg_collection=ProcessGroupCollection.use_mpu_process_groups(),
     )

     tokenizer_config = TokenizerConfig(
```
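`ProcessGroupCollection.use_mpu_process_groups()` and the `pg_collection=` keyword come straight from the diff; the sketch below is only a rough mental model of what the collection bundles, not the actual Megatron-Core implementation.

```python
# Rough mental model only -- NOT the real Megatron-Core class. The idea: bundle
# the process groups that parallel_state tracks globally into one object that
# get_model() and downstream reductions can receive explicitly.
from dataclasses import dataclass

import torch.distributed as dist


@dataclass
class ProcessGroupCollectionSketch:
    tp: dist.ProcessGroup  # tensor-parallel group
    pp: dist.ProcessGroup  # pipeline-parallel group
    dp: dist.ProcessGroup  # data-parallel group
    mp: dist.ProcessGroup  # combined model-parallel group (used for mp reductions)

    @classmethod
    def use_mpu_process_groups(cls) -> "ProcessGroupCollectionSketch":
        # The accessors below do exist in megatron.core.parallel_state, but
        # whether the real class stores exactly these fields is an assumption.
        from megatron.core import parallel_state as ps

        return cls(
            tp=ps.get_tensor_model_parallel_group(),
            pp=ps.get_pipeline_model_parallel_group(),
            dp=ps.get_data_parallel_group(),
            mp=ps.get_model_parallel_group(),
        )
```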

nemo_rl/models/policy/workers/megatron_policy_worker.py

Lines changed: 7 additions & 3 deletions
```diff
@@ -27,6 +27,7 @@
     maybe_finalize_async_save,
     save_checkpoint,
 )
+from megatron.bridge.training.utils.pg_utils import get_pg_collection
 from megatron.bridge.training.utils.train_utils import (
     logical_and_across_model_parallel_group,
     reduce_max_stat_across_model_parallel_group,
@@ -55,6 +56,7 @@
     is_pipeline_last_stage,
 )
 from megatron.core.pipeline_parallel import get_forward_backward_func
+from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.rerun_state_machine import get_rerun_state_machine
 from transformers import PreTrainedTokenizerBase

@@ -415,18 +417,20 @@ def train(
         else:
             update_successful, grad_norm, num_zeros_in_grad = (True, 0.0, 0.0)

+        pg_collection = get_pg_collection(self.model)
+
         # when freezing sub-models we may have a mixture of successful and unsucessful ranks,
         # so we must gather across mp ranks
         update_successful = logical_and_across_model_parallel_group(
-            update_successful
+            update_successful, mp_group=pg_collection.mp
         )
         # grad_norm and num_zeros_in_grad will be None on ranks without trainable params,
         # so we must gather across mp ranks
         grad_norm: float = reduce_max_stat_across_model_parallel_group(
-            grad_norm
+            grad_norm, mp_group=pg_collection.mp
         )
         num_zeros_in_grad: float = reduce_max_stat_across_model_parallel_group(
-            num_zeros_in_grad
+            num_zeros_in_grad, mp_group=pg_collection.mp
         )

         if update_successful:
```
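With `mp_group` now passed explicitly, these reductions no longer read the group from global parallel state. A minimal sketch (an assumption about behavior, not the Megatron-Bridge source) of what a helper like `logical_and_across_model_parallel_group` has to do with an explicit group:

```python
# Sketch, assuming NCCL-style collectives; illustrates the pattern, not the
# actual Megatron-Bridge helper.
import torch
import torch.distributed as dist


def logical_and_across_group(value: bool, mp_group: dist.ProcessGroup) -> bool:
    """AND a per-rank success flag across the given model-parallel group."""
    # NCCL requires CUDA tensors; encode the flag as 0/1 and reduce with MIN,
    # since min over {0, 1} is exactly logical AND.
    flag = torch.tensor(1 if value else 0, dtype=torch.int32, device="cuda")
    dist.all_reduce(flag, op=dist.ReduceOp.MIN, group=mp_group)
    return bool(flag.item())
```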

uv.lock

Lines changed: 14 additions & 10 deletions
Some generated files are not rendered by default.
