diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index e4bd38d65a32..065186ff83de 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -160,6 +160,7 @@ class NsysPlugin(run.Plugin): nsys_trace: Optional[list[str]] = None nsys_extra_args: Optional[list[str]] = None gen_shape: bool = False + nsys_gpu_metrics: bool = False def setup(self, task: run.Partial | run.Script, executor: run.Executor): """Set up the nsys profiling plugin.""" @@ -186,6 +187,13 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor): "--cuda-event-trace=false", "--nvtx-domain-include=NCCL", ] + if self.nsys_gpu_metrics: + if hasattr(launcher, "nsys_gpu_metrics"): + launcher.nsys_gpu_metrics = self.nsys_gpu_metrics + else: + logging.warning( + "Unable to enable nsys gpu metrics collection. Please upgrade Nemo-Run to include commit 70a0df4." + ) @dataclass(kw_only=True) diff --git a/scripts/performance/argument_parser.py b/scripts/performance/argument_parser.py index 892144702b7e..e6a6075afe30 100644 --- a/scripts/performance/argument_parser.py +++ b/scripts/performance/argument_parser.py @@ -380,6 +380,13 @@ def bool_arg(arg): "-pso", "--profiling_stop_step", type=int, help="Defines start step for profiling", required=False, default=50 ) + parser.add_argument( + "-pgm", + "--profiling_gpu_metrics", + help="Enable nsys gpu metrics. Disabled by default.", + action="store_true", + ) + parser.add_argument( "-cps", "--checkpoint_save", diff --git a/scripts/performance/llm/pretrain_deepseek_v3.py b/scripts/performance/llm/pretrain_deepseek_v3.py index bb1657c56478..156f4393407f 100644 --- a/scripts/performance/llm/pretrain_deepseek_v3.py +++ b/scripts/performance/llm/pretrain_deepseek_v3.py @@ -200,6 +200,7 @@ def override_recipe_configs( start_step=args.profiling_start_step, end_step=args.profiling_stop_step, ranks=list(range(num_nodes * args.gpus_per_node)), + nsys_gpu_metrics=args.profiling_gpu_metrics, ) ) # nsys takes precedent over ncclttrace diff --git a/scripts/performance/llm/pretrain_grok1_314b.py b/scripts/performance/llm/pretrain_grok1_314b.py index 86c56e130c33..6476a0af795f 100644 --- a/scripts/performance/llm/pretrain_grok1_314b.py +++ b/scripts/performance/llm/pretrain_grok1_314b.py @@ -423,6 +423,7 @@ def override_recipe_configs( start_step=args.profiling_start_step, end_step=args.profiling_stop_step, ranks=list(range(num_nodes * args.gpus_per_node)), + nsys_gpu_metrics=args.profiling_gpu_metrics, ) ) # nsys takes precedent over ncclttrace diff --git a/scripts/performance/llm/pretrain_llama31_405b.py b/scripts/performance/llm/pretrain_llama31_405b.py index 497746b30007..65902607d59c 100644 --- a/scripts/performance/llm/pretrain_llama31_405b.py +++ b/scripts/performance/llm/pretrain_llama31_405b.py @@ -188,6 +188,7 @@ def override_recipe_configs( start_step=args.profiling_start_step, end_step=args.profiling_stop_step, ranks=list(range(num_nodes * args.gpus_per_node)), + nsys_gpu_metrics=args.profiling_gpu_metrics, ) ) # nsys takes precedent over ncclttrace diff --git a/scripts/performance/llm/pretrain_llama4_e128.py b/scripts/performance/llm/pretrain_llama4_e128.py index d81ff19b0f94..374c0696e459 100644 --- a/scripts/performance/llm/pretrain_llama4_e128.py +++ b/scripts/performance/llm/pretrain_llama4_e128.py @@ -161,6 +161,7 @@ def override_recipe_configs( start_step=args.profiling_start_step, end_step=args.profiling_stop_step, ranks=list(range(num_nodes * args.gpus_per_node)), + nsys_gpu_metrics=args.profiling_gpu_metrics, ) ) # nsys takes precedent over ncclttrace diff --git a/scripts/performance/llm/pretrain_nemotron4_15b.py b/scripts/performance/llm/pretrain_nemotron4_15b.py index 81201d91f65e..f19eb6185663 100644 --- a/scripts/performance/llm/pretrain_nemotron4_15b.py +++ b/scripts/performance/llm/pretrain_nemotron4_15b.py @@ -202,6 +202,7 @@ def override_recipe_configs( start_step=args.profiling_start_step, end_step=args.profiling_stop_step, ranks=list(range(num_nodes * args.gpus_per_node)), + nsys_gpu_metrics=args.profiling_gpu_metrics, ) ) diff --git a/scripts/performance/llm/pretrain_nemotron4_340b.py b/scripts/performance/llm/pretrain_nemotron4_340b.py index 0face12828a1..3a96ae6ca60f 100644 --- a/scripts/performance/llm/pretrain_nemotron4_340b.py +++ b/scripts/performance/llm/pretrain_nemotron4_340b.py @@ -193,6 +193,7 @@ def override_recipe_configs( start_step=args.profiling_start_step, end_step=args.profiling_stop_step, ranks=list(range(num_nodes * args.gpus_per_node)), + nsys_gpu_metrics=args.profiling_gpu_metrics, ) ) diff --git a/scripts/performance/llm/pretrain_nemotronh_56b.py b/scripts/performance/llm/pretrain_nemotronh_56b.py index 719b6abb914d..2377c352a7eb 100644 --- a/scripts/performance/llm/pretrain_nemotronh_56b.py +++ b/scripts/performance/llm/pretrain_nemotronh_56b.py @@ -143,6 +143,7 @@ def override_recipe_configs( start_step=args.profiling_start_step, end_step=args.profiling_stop_step, ranks=list(range(num_nodes * args.gpus_per_node)), + nsys_gpu_metrics=args.profiling_gpu_metrics, ) ) elif args.enable_nccltrace: # nsys takes precedent over nccltrace