Commit 74f6303

Authored by nv-mollys, scsudhakaran, rhmukundan, malay-nagda, and mollys
Llmb nemo r2.4.0 (#14634)
This squashed commit combines the following changes:

* Set attention backend to "auto" for Nemotron-H (#14042) — Sanju C Sudhakaran
* Add TFLOPS-per-GPU support for finetuning (#14048) — Raghav Hrishikeshan Mukundan
* Enable optimizations for Nemotron-H (#13915) — Sanju C Sudhakaran
* Disable checkpointing for Nemotron-H (#14001) — Sanju C Sudhakaran
* Cherry-pick ea4b47f (#13896): perf script updates (#13456) — GB200 recommended-configs CSV fixes, 495B H100 fix, GB200 79B bf16 20-layer recompute, 70B/340B without FSDP, DeepSeek-V3 perf mode and callbacks, CUDA graphs; import missing callbacks in the DeepSeek recipe — Malay Nagda
* Onboard Llama 4 Maverick finetuning (SFT) with SQuAD dataset download fix (#13926): separate flags and exp_name formats for the three SLURM jobs (checkpoint download, dataset download, finetune), TokenDropCallback and tp_comm_overlap enabled, peft_scheme handling fixed, NullTokenizer removed for compatibility, file renamed from finetune_ to sft_ — Raghav Hrishikeshan Mukundan
* Add profiling changes (#13484) — Aishwarya Bhandare
* Port Nemotron 25.04 patch to r3.2.0-based llmb-nemo; update experiment-name template (#13533) — Barys Dubauski
* Port run-ai patch to the llmb-nemo branch (#13573) — Barys Dubauski
* Add Grok recipe (#13586) — mollys
* Set transformers_offline=0 and profiling changes for Llama 3.1 405B; add NCCL (#13655) — Sebastian Alberdi
* Add perf recipe script for Nemotron-H-56B (#13691) — Sanju C Sudhakaran
* DeepSeek pretraining changes for LLMB: overridable profiling steps, NCCL trace support (#13752) — Aishwarya Bhandare
* Add FP8 default configs for Llama 4 Maverick (#13698) — Raghav Hrishikeshan Mukundan
* Change the tokenizer from Scout to Maverick in the Llama 4 pretrain recipe (#13664) — Raghav Hrishikeshan Mukundan
* Adapt the Llama 4 Maverick pretrain file to the user-configs parameter format (#13690) — Raghav Hrishikeshan Mukundan
* Grok NVBug 5311566 (#13765): remove unnecessary NeMo root check and unused packages — mollys
* Grok NCCL trace fix (#13769): transformers online, fix env vars — mollys
* Fix config params in pretrain llama4_e128 (#13764) — Raghav Hrishikeshan Mukundan
* Nsys tweaks to Llama 4 pretrain (#13778): remove hardcoded nsys profiling ranges, add NCCL trace support for the pretrain recipe — Raghav Hrishikeshan Mukundan
* Disable checkpointing for Nemotron-H; add NCCL trace support (#13786) — Sanju C Sudhakaran
* Llmb nemo r2.3.0 (#13806, #13807): set NCCL_NET_GDR_LEVEL=PHB for deepseekv3, grok1_314b, llama31_405b, llama4_e128, nemotron4_15b/340b, and nemotronh_56b; standardize exp_name for relevant workloads — Sebastian Alberdi
* Add all environment variables to the container environment (#13808) — mollys
* Fix numactl (#13809) — mollys
* Fix QA checkpoint bug for nemotron4 (#13843) — Sharada Shiddibhavi
* Add GPU metrics option (#13882) — Aishwarya Bhandare
* Llama 4 Maverick SFT recipe + SQuAD dataset download error fix — Raghav Hrishikeshan Mukundan; this was reverted (commit 755fd36) and then reapplied manually across recipes/__init__.py, deepseek_v3.py, llama4_e128.py, finetune_llama4_e128.py, pretrain_grok1_314b.py, pretrain_nemotron4_340b.py, helpers.py, and executors.py
* Fix in Nemotron-H script and perf script (#14251) — Sanju C Sudhakaran
* Update with double_buffer changes from NeMo main (#14305) — Sebastian Alberdi
* Remove cuDNN lines because of a regression with the cuDNN normalization kernel (#14360)
* Add conditional cuDNN to align with NeMo main; fix num-optimizer-instances bug (#14324) — Sebastian Alberdi
* Add pyxis container writable and no-mount-home flags (#14386) — Alex Filby
* Update DeepSeek-V3 perf scripts (#14377), including callback fixes (#14350) — Guyue Huang
* Fix Grok error "TypeError: '>' not supported between instances of 'str' and 'int'"; make the values configurable via CLI instead of hard-coded defaults (#14326) — rsalagame-nvidia
* Make VBoost activation conditional (#14453): refactor performance scripts to use a build_perf_env_plugin helper in helpers.py and control VBoost enablement via CLI — Barys Dubauski
* Turn off TP comm overlap for >128 GPUs on GB200 so jobs are functional (#14460) — Sebastian Alberdi
* Remove NCCL tracing option and clean up imports in performance scripts (#14467) — Barys Dubauski
* Disable tp_comm_overlap for 512 GPUs on GB200 to fix a functionality issue (#14474) — Sanju C Sudhakaran
* Workaround for an MXFP8 functionality issue (#14426) — Sanju C Sudhakaran
* Fix the previous (buggy) commit (#14477) — Sebastian Alberdi
* Checkpoint save/load functionality with HF token (#14538) — rsalagame-nvidia
* Add HF import for 15B/340B pretrain (#14565)
* Llmb nemo r2.4.0 (#14607): update mixed_precision.py and fix reuse_grad_buf_for_mxfp8_param_ag for MXFP8 — Guyue Huang
1 parent 6489229 commit 74f6303

40 files changed

Lines changed: 1672 additions & 337 deletions

nemo/collections/llm/recipes/llama4_e128.py

Lines changed: 3 additions & 8 deletions
@@ -319,15 +319,9 @@ def finetune_recipe(
         packed_sequence,
     )
     if peft_scheme is None or peft_scheme.lower() == 'none':
-        recipe.trainer.strategy.tensor_model_parallel_size = 4
-        recipe.trainer.strategy.expert_tensor_model_parallel_size = 4
-        recipe.trainer.strategy.expert_model_parallel_size = 32
+        recipe.trainer.strategy.tensor_model_parallel_size = 2
         recipe.optim.config.lr = 5e-6
     elif peft_scheme.lower() in ['lora', 'dora']:
-        recipe.trainer.strategy.sequence_parallel = True
-        recipe.trainer.strategy.tensor_model_parallel_size = 8
-        recipe.trainer.strategy.expert_tensor_model_parallel_size = 8
-        recipe.trainer.strategy.pipeline_model_parallel_size = 4
         recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()])
         recipe.peft.dim = 8
         recipe.peft.alpha = 16
@@ -397,9 +391,10 @@ def finetune_performance_optimizations(
     recipe.trainer.callbacks.append(
         run.Config(
             MegatronCommOverlapCallback,
-            tp_comm_overlap=False,
+            tp_comm_overlap=True,
         )
     )
+    recipe.trainer.callbacks.append(run.Config(MegatronTokenDropCallback))
     recipe.trainer.callbacks.append(run.Config(TimingCallback))
     recipe.trainer.callbacks.append(
         run.Config(

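The first hunk above branches on the PEFT scheme: full SFT now uses tensor parallelism of 2 (instead of 4 with expert parallelism), while LoRA/DoRA keeps the strategy defaults instead of forcing TP=8 with pipeline parallelism. A minimal runnable sketch of that selection logic, using a hypothetical `Strategy` stand-in rather than the real NeMo-Run `run.Config` objects:

```python
from dataclasses import dataclass
from typing import Optional


# Hypothetical stand-in: the real recipe mutates fields on a
# run.Config-wrapped MegatronStrategy, not a plain dataclass.
@dataclass
class Strategy:
    tensor_model_parallel_size: int = 1
    expert_model_parallel_size: int = 1
    pipeline_model_parallel_size: int = 1


def apply_peft_scheme(strategy: Strategy, peft_scheme: Optional[str]) -> Strategy:
    """Mirror the diff's branching on peft_scheme."""
    if peft_scheme is None or peft_scheme.lower() == "none":
        # Full SFT path: TP reduced to 2 per the patch.
        strategy.tensor_model_parallel_size = 2
    elif peft_scheme.lower() in ("lora", "dora"):
        # PEFT path: parallelism defaults retained; the PEFT config
        # itself (dim=8, alpha=16) is set elsewhere in the recipe.
        pass
    else:
        raise ValueError(f"Unrecognized peft_scheme: {peft_scheme}")
    return strategy


sft = apply_peft_scheme(Strategy(), None)
lora = apply_peft_scheme(Strategy(), "lora")
```

`Strategy` and `apply_peft_scheme` are illustrative names only; the recipe inlines this logic inside `finetune_recipe`.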
nemo/collections/llm/recipes/nemotronh_56b.py

Lines changed: 2 additions & 12 deletions
@@ -30,7 +30,6 @@
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
 from nemo.collections.llm.recipes.precision.mixed_precision import nemotron_h_bf16_with_fp8_current_scaling_mixed
 from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
-from nemo.lightning.pytorch.callbacks import ModelCheckpoint
 from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
 from nemo.utils.exp_manager import TimingCallback

@@ -143,22 +142,13 @@ def trainer(
             DistributedDataParallelConfig,
             check_for_nan_in_grad=True,
             overlap_grad_reduce=True,
-            overlap_param_gather=False,  # Verify that this works
+            overlap_param_gather=True,  # Verify that this works
             grad_reduce_in_fp32=True,
         ),
     )

     callbacks = [
         run.Config(TimingCallback),
-        run.Config(
-            ModelCheckpoint,
-            every_n_train_steps=val_check_interval,
-            dirpath=dir,
-            save_top_k=save_top_k,
-            always_save_context=True,
-            save_optim_on_train_end=True,
-            save_context_on_train_end=True,
-        ),
     ]
     trainer = run.Config(
         nl.Trainer,
@@ -175,7 +165,7 @@ def trainer(
         use_distributed_sampler=False,
         plugins=[nemotron_h_bf16_with_fp8_current_scaling_mixed()],
         val_check_interval=val_check_interval,
-        enable_checkpointing=True,
+        enable_checkpointing=False,  # Fix this: disable checkpointing for now
     )
     return trainer

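The Nemotron-H change drops the `ModelCheckpoint` callback and sets `enable_checkpointing=False`, so perf runs skip checkpoint I/O entirely. A small sketch of the resulting callback construction, with plain-Python stand-ins (the real code builds `run.Config` wrappers around NeMo callback classes and passes them to `nl.Trainer`):

```python
from dataclasses import dataclass, field


# Hypothetical stand-in for the trainer config assembled by trainer().
@dataclass
class TrainerCfg:
    enable_checkpointing: bool
    callbacks: list = field(default_factory=list)


def build_trainer(enable_checkpointing: bool = False) -> TrainerCfg:
    callbacks = ["TimingCallback"]
    if enable_checkpointing:
        # The patch removed this path for the perf recipe: no ModelCheckpoint
        # means benchmark runs measure training throughput without save overhead.
        callbacks.append("ModelCheckpoint")
    return TrainerCfg(enable_checkpointing=enable_checkpointing, callbacks=callbacks)


perf_cfg = build_trainer()
```

`build_trainer` is an illustrative name; the recipe hard-codes the disabled path rather than exposing a flag.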
nemo/collections/llm/recipes/precision/mixed_precision.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -86,6 +86,7 @@ def fp16_with_fp8_mixed() -> run.Config[MegatronMixedPrecision]:
     cfg.fp8_amax_history_len = 1024
     cfg.fp8_amax_compute_algo = "max"
     cfg.fp8_param_gather = True
+    cfg.reuse_grad_buf_for_mxfp8_param_ag = True
     return cfg


@@ -99,6 +100,7 @@ def bf16_with_mxfp8_mixed() -> run.Config[MegatronMixedPrecision]:
     cfg.fp8 = 'hybrid'
     cfg.fp8_recipe = "mxfp8"
     cfg.fp8_param_gather = True
+    cfg.reuse_grad_buf_for_mxfp8_param_ag = True
     return cfg


@@ -112,6 +114,7 @@ def fp16_with_mxfp8_mixed() -> run.Config[MegatronMixedPrecision]:
     cfg.fp8 = 'hybrid'
     cfg.fp8_recipe = "mxfp8"
     cfg.fp8_param_gather = True
+    cfg.reuse_grad_buf_for_mxfp8_param_ag = True
     return cfg
```
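Each recipe helper above follows the same pattern: take a base precision config, flip the FP8-related knobs, and return it. A self-contained sketch of that pattern, with `PrecisionCfg` as a hypothetical stand-in for the `MegatronMixedPrecision` config:

```python
# PrecisionCfg is a simplified, hypothetical stand-in for the real
# MegatronMixedPrecision config object; only the fields touched here exist.
from dataclasses import dataclass
from typing import Optional


@dataclass
class PrecisionCfg:
    fp8: Optional[str] = None
    fp8_recipe: Optional[str] = None
    fp8_param_gather: bool = False
    reuse_grad_buf_for_mxfp8_param_ag: bool = False


def bf16_with_mxfp8_mixed(cfg: PrecisionCfg) -> PrecisionCfg:
    """Mutate a base config into the mxfp8 mixed-precision recipe shape."""
    cfg.fp8 = "hybrid"
    cfg.fp8_recipe = "mxfp8"
    cfg.fp8_param_gather = True
    cfg.reuse_grad_buf_for_mxfp8_param_ag = True  # flag added by this PR
    return cfg


cfg = bf16_with_mxfp8_mixed(PrecisionCfg())
print(cfg.reuse_grad_buf_for_mxfp8_param_ag)  # → True
```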

nemo/lightning/fabric/plugins.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -60,6 +60,7 @@ def __init__(
         first_last_layers_bf16: bool = False,
         num_layers_at_start_in_bf16: int = 0,
         num_layers_at_end_in_bf16: int = 0,
+        reuse_grad_buf_for_mxfp8_param_ag: bool = False,
         fp8_margin: int = 0,
         fp8_amax_history_len: int = 1,
         fp8_amax_compute_algo: str = "most_recent",
@@ -104,6 +105,7 @@ def __init__(
             first_last_layers_bf16=first_last_layers_bf16,
             num_layers_at_start_in_bf16=num_layers_at_start_in_bf16,
             num_layers_at_end_in_bf16=num_layers_at_end_in_bf16,
+            reuse_grad_buf_for_mxfp8_param_ag=reuse_grad_buf_for_mxfp8_param_ag,
             fp8_margin=fp8_margin,
             fp8_amax_history_len=fp8_amax_history_len,
             fp8_amax_compute_algo=fp8_amax_compute_algo,
```

nemo/lightning/pytorch/plugins/mixed_precision.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -86,6 +86,7 @@ class DtypeConfig:
     hysteresis: float = (None,)
     num_layers_at_start_in_bf16: int = 0
     num_layers_at_end_in_bf16: int = 0
+    reuse_grad_buf_for_mxfp8_param_ag: bool = False


 class MegatronMixedPrecision(Precision):
@@ -122,6 +123,7 @@ def __init__(
         fp16_hysteresis: int = 2,
         num_layers_at_start_in_bf16: int = 0,
         num_layers_at_end_in_bf16: int = 0,
+        reuse_grad_buf_for_mxfp8_param_ag: bool = False,
     ) -> None:
         if fp8_params is not None:
             logging.warning(
@@ -161,6 +163,7 @@ def __init__(
             fp8_param_gather=fp8_param_gather,
             num_layers_at_start_in_bf16=num_layers_at_start_in_bf16,
             num_layers_at_end_in_bf16=num_layers_at_end_in_bf16,
+            reuse_grad_buf_for_mxfp8_param_ag=reuse_grad_buf_for_mxfp8_param_ag,
             # fp16 loss scale
             loss_scale=fp16_loss_scale,
             initial_loss_scale=fp16_initial_loss_scale,
```
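Note the plumbing pattern across the last three files: the new flag must be accepted as a constructor kwarg and forwarded into the inner dtype config at every layer, or it is silently dropped with its default. A minimal sketch with simplified, hypothetical stand-ins for the real classes:

```python
# Simplified stand-ins, not the NeMo classes: this just shows the
# kwarg-forwarding pattern the diff applies at each layer.
from dataclasses import dataclass


@dataclass
class DtypeConfig:
    num_layers_at_end_in_bf16: int = 0
    reuse_grad_buf_for_mxfp8_param_ag: bool = False


class MixedPrecisionSketch:
    def __init__(
        self,
        num_layers_at_end_in_bf16: int = 0,
        reuse_grad_buf_for_mxfp8_param_ag: bool = False,
    ):
        # Forward every kwarg explicitly; a kwarg accepted here but not
        # passed through would silently fall back to DtypeConfig's default.
        self.dtype_config = DtypeConfig(
            num_layers_at_end_in_bf16=num_layers_at_end_in_bf16,
            reuse_grad_buf_for_mxfp8_param_ag=reuse_grad_buf_for_mxfp8_param_ag,
        )


plugin = MixedPrecisionSketch(reuse_grad_buf_for_mxfp8_param_ag=True)
print(plugin.dtype_config.reuse_grad_buf_for_mxfp8_param_ag)  # → True
```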

nemo/lightning/run/plugins.py

Lines changed: 17 additions & 0 deletions

```diff
@@ -158,7 +158,9 @@ class NsysPlugin(run.Plugin):
     end_step: int
     ranks: Optional[list[int]] = None
     nsys_trace: Optional[list[str]] = None
+    nsys_extra_args: Optional[list[str]] = None
     gen_shape: bool = False
+    nsys_gpu_metrics: bool = False

     def setup(self, task: run.Partial | run.Script, executor: run.Executor):
         """Set up the nsys profiling plugin."""
@@ -179,6 +181,21 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor):
         if isinstance(executor, run.SlurmExecutor):
             # NOTE: DO NOT change to f-string, `%q{}` is Slurm placeholder
             launcher.nsys_filename = "profile_%p_%q{SLURM_JOB_ID}_node%q{SLURM_NODEID}_rank%q{SLURM_PROCID}"
+            launcher.nsys_extra_args = self.nsys_extra_args or [
+                "--force-overwrite=true",
+                "--capture-range=cudaProfilerApi",
+                "--capture-range-end=stop",
+                "--cuda-graph-trace=node",
+                "--cuda-event-trace=false",
+                "--nvtx-domain-include=NCCL",
+            ]
+            if self.nsys_gpu_metrics:
+                if hasattr(launcher, "nsys_gpu_metrics"):
+                    launcher.nsys_gpu_metrics = self.nsys_gpu_metrics
+                else:
+                    logging.warning(
+                        "Unable to enable nsys gpu metrics collection. Please upgrade Nemo-Run to include commit 70a0df4."
+                    )


 @dataclass(kw_only=True)
```
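Two details of the `NsysPlugin` hunk are worth noting: user-supplied `nsys_extra_args` take precedence over the defaults via `or`, and the gpu-metrics flag is applied only when the launcher actually exposes it (a `hasattr` probe guards against an older Nemo-Run). A self-contained sketch of both, with hypothetical launcher classes:

```python
# Hypothetical launcher classes; only the fallback logic mirrors the diff.
DEFAULT_NSYS_ARGS = [
    "--force-overwrite=true",
    "--capture-range=cudaProfilerApi",
    "--capture-range-end=stop",
    "--cuda-graph-trace=node",
]


class OldLauncher:
    """Stand-in for a launcher built before gpu-metrics support existed."""


class NewLauncher:
    """Stand-in for a launcher that exposes the nsys_gpu_metrics attribute."""

    nsys_gpu_metrics = False


def configure_nsys(launcher, extra_args=None, gpu_metrics=False):
    # `or` falls back to the defaults when extra_args is None or empty.
    launcher.nsys_extra_args = extra_args or DEFAULT_NSYS_ARGS
    if gpu_metrics:
        if hasattr(launcher, "nsys_gpu_metrics"):
            launcher.nsys_gpu_metrics = True
        else:
            # Mirrors the logging.warning branch in the diff.
            print("warning: launcher lacks nsys_gpu_metrics; upgrade required")


launcher = OldLauncher()
configure_nsys(launcher, extra_args=["--capture-range=none"])
print(launcher.nsys_extra_args)  # → ['--capture-range=none']
```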

scripts/performance/argument_parser.py

Lines changed: 141 additions & 7 deletions

```diff
@@ -26,14 +26,19 @@ def parse_cli_args():
     """
     parser = argparse.ArgumentParser(description="NeMo2.0 Performance Pretraining and Fine-Tuning")

-    parser.add_argument(
+    subparsers = parser.add_subparsers(dest="cluster_type", help='Type of cluster: slurm or runai')
+
+    slurm_parser = subparsers.add_parser('slurm', help="define variables for slurm launcher")
+    runai_parser = subparsers.add_parser('runai', help="define variables for runai launcher")
+
+    slurm_parser.add_argument(
         "-a",
         "--account",
         type=str,
         help="Slurm account to use for experiment",
         required=True,
     )
-    parser.add_argument(
+    slurm_parser.add_argument(
         "-p",
         "--partition",
         type=str,
@@ -48,22 +53,58 @@ def parse_cli_args():
         help="Target gpu type.",
         required=True,
     )
-    parser.add_argument(
+    slurm_parser.add_argument(
         "-l",
         "--log_dir",
         type=str,
         help=f"Directory for logging experiment results. Defaults to {get_nemorun_home()}",
         required=False,
         default=get_nemorun_home(),
     )
-    parser.add_argument(
+    slurm_parser.add_argument(
         "-t",
         "--time_limit",
         type=str,
         help="Maximum time limit to run experiment for. Defaults to 30 minutes (format- 'HH:MM:SS')",
         required=False,
         default="00:30:00",
     )
+    runai_parser.add_argument(
+        "-b",
+        "--base_url",
+        help="NVIDIA Run:ai API url to use for experiment. Should look like https://<base-url>/api/v1",
+        type=str,
+        required=True,
+    )
+    runai_parser.add_argument(
+        "-id",
+        "--app_id",
+        help="Name of NVIDIA Run:ai Application",
+        type=str,
+        required=True,
+    )
+    runai_parser.add_argument(
+        "-s",
+        "--app_secret",
+        help="NVIDIA Run:ai Application secret",
+        type=str,
+        required=True,
+    )
+    runai_parser.add_argument(
+        "-p",
+        "--project_name",
+        help="NVIDIA Run:ai Project to run the experiment in",
+        type=str,
+        required=True,
+    )
+    runai_parser.add_argument(
+        "-pd",
+        "--pvc_nemo_run_dir",
+        help="Directory path of your nemo-run home in Run:ai PVC",
+        type=str,
+        required=True,
+    )
     container_img_msg = [
         "NeMo container to use for experiment. Defaults to latest dev container- 'nvcr.io/nvidia/nemo:dev'",
         "Make sure your NGC credentials are accessible in your environment.",
@@ -101,7 +142,7 @@ def parse_cli_args():
     parser.add_argument(
         "-en",
         "--enable_nsys",
-        help="Enable Nsys profiling. Diabled by default",
+        help="Enable Nsys profiling. Disabled by default",
         action="store_true",
     )
     parser.add_argument(
@@ -274,7 +315,7 @@ def parse_cli_args():
         type=int,
         help="Number of train steps. Defaults to 100",
         required=False,
-        default=100,
+        default=50,
     )

     def bool_arg(arg):
@@ -349,6 +390,52 @@ def bool_arg(arg):
         required=False,
         default=None,
     )
+    parser.add_argument(
+        "-nlay",
+        "--num_layers",
+        type=int,
+        help="Sets number of model layers.",
+        required=False,
+        default=None,
+    )
+    parser.add_argument(
+        "-hs",
+        "--hidden_size",
+        type=int,
+        help="Sets hidden model size",
+        required=False,
+        default=None,
+    )
+    parser.add_argument(
+        "-pss", "--profiling_start_step", type=int, help="Defines start step for profiling", required=False, default=46
+    )
+    parser.add_argument(
+        "-pso", "--profiling_stop_step", type=int, help="Defines stop step for profiling", required=False, default=50
+    )
+
+    parser.add_argument(
+        "-pgm",
+        "--profiling_gpu_metrics",
+        help="Enable nsys gpu metrics. Disabled by default.",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "-cps",
+        "--checkpoint_save",
+        type=bool_arg,
+        help="When enabled will trigger checkpoint save operation at the end of training",
+        required=False,
+        default=None,
+    )
+    parser.add_argument(
+        "-cpl",
+        "--checkpoint_load_path",
+        type=str,
+        help="Path to checkpoint to load prior to training start",
+        required=False,
+        default=None,
+    )

     def list_of_strings(arg):
         return arg.split(',')
@@ -368,7 +455,7 @@ def list_of_strings(arg):
         "-cm",
         "--custom_mounts",
         type=list_of_strings,
-        help="Comma separated string of mounts",
+        help="Comma separated string of mounts. For Run:ai, each mount must be in name:path:k8s-claimName format",
         required=False,
         default=[],
     )
@@ -386,4 +473,51 @@ def list_of_strings(arg):
         default=None,
     )

+    parser.add_argument(
+        "--skip_import_checkpoint",
+        help="Skips checkpoint import, finetuning job and only downloads the dataset.",
+        action="store_true",
+        required=False,
+    )
+
+    parser.add_argument(
+        "--skip_dataset_download",
+        help="Skips dataset download, finetuning job and only downloads the checkpoint.",
+        action="store_true",
+        required=False,
+    )
+
+    parser.add_argument(
+        "--skip_finetuning",
+        help="Skips finetuning and only downloads the checkpoint and dataset.",
+        action="store_true",
+        required=False,
+    )
+
+    parser.add_argument(
+        "-ev",
+        "--custom_env_vars",
+        type=str,
+        required=False,
+        default={},
+    )
+
+    parser.add_argument(
+        "-cpin",
+        "--cpu_pinning",
+        type=int,
+        help="Enable CPU pinning to improve performance on some clusters by setting numbers of CPUs per task. Disabled by default",
+        required=False,
+        default=0,
+    )
+
+    parser.add_argument(
+        "-vb",
+        "--enable_vboost",
+        help="Enable VBoost which steers more power towards tensor cores. Disabled by default",
+        type=bool_arg,
+        required=False,
+        default=None,
+    )
+
     return parser
```
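The structural change above is the move from one flat parser to `add_subparsers`, so slurm-specific and Run:ai-specific flags live under separate subcommands (which also lets `-p` mean `--partition` under `slurm` but `--project_name` under `runai`). A trimmed-down, runnable sketch of that layout, keeping only a few representative arguments:

```python
# Trimmed sketch of the new subcommand layout; only a subset of the real
# arguments is reproduced here.
import argparse


def parse_cli_args():
    parser = argparse.ArgumentParser(description="NeMo2.0 performance launcher (sketch)")
    subparsers = parser.add_subparsers(
        dest="cluster_type", required=True, help="Type of cluster: slurm or runai"
    )

    slurm_parser = subparsers.add_parser("slurm", help="define variables for slurm launcher")
    slurm_parser.add_argument("-a", "--account", type=str, required=True)
    slurm_parser.add_argument("-t", "--time_limit", type=str, default="00:30:00")

    runai_parser = subparsers.add_parser("runai", help="define variables for runai launcher")
    # "-p" is free to mean something different under each subcommand.
    runai_parser.add_argument("-p", "--project_name", type=str, required=True)

    return parser


args = parse_cli_args().parse_args(["slurm", "-a", "my_account"])
print(args.cluster_type, args.account, args.time_limit)  # → slurm my_account 00:30:00
```

`dest="cluster_type"` records which subcommand was chosen, so downstream code can branch on `args.cluster_type` to pick the executor.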
