From e2fb0754e33732714f40b5278adde50bc7dd1448 Mon Sep 17 00:00:00 2001
From: Aishwarya Bhandare
Date: Wed, 4 Jun 2025 16:32:21 -0700
Subject: [PATCH 1/2] GPU metrics only on rank 0

Add an `nsys_gpu_metrics` option to `Launcher`. When it is enabled on a
Slurm executor, `$GPU_METRICS_FLAG` is appended to the nsys prefix and
nsys is started through `bash -c`, which sets that variable to
`--gpu-metrics-devices=all` only when `SLURM_PROCID` is 0.

Executors expose the `(entrypoint, postfix)` pair through
`get_nsys_entrypoint()`. The postfix is appended to the role arguments
only when it is non-empty, so the default `("nsys", "")` pair does not
add an empty argument to the command.

Signed-off-by: ashbhandare
---
 nemo_run/core/execution/base.py          |  3 +++
 nemo_run/core/execution/launcher.py      |  1 +
 nemo_run/core/execution/slurm.py         | 13 ++++++++++++-
 nemo_run/run/torchx_backend/packaging.py |  4 ++--
 4 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/nemo_run/core/execution/base.py b/nemo_run/core/execution/base.py
index 66c2ec8e..863193e5 100644
--- a/nemo_run/core/execution/base.py
+++ b/nemo_run/core/execution/base.py
@@ -165,6 +165,9 @@ def get_launcher_prefix(self) -> Optional[list[str]]:
             os.makedirs(os.path.join(self.job_dir, launcher.nsys_folder), exist_ok=True)
             return launcher.get_nsys_prefix(profile_dir=self.job_dir)
 
+    def get_nsys_entrypoint(self) -> tuple[str, str]:
+        return ("nsys","")
+
     def supports_launcher_transform(self) -> bool:
         return False
 
diff --git a/nemo_run/core/execution/launcher.py b/nemo_run/core/execution/launcher.py
index f24a1d2b..7e6c14a8 100644
--- a/nemo_run/core/execution/launcher.py
+++ b/nemo_run/core/execution/launcher.py
@@ -24,6 +24,7 @@ class Launcher(ConfigurableMixin):
             "--cuda-event-trace=false",
         ]
     )
+    nsys_gpu_metrics: bool = False
 
     def get_nsys_prefix(self, profile_dir: str) -> Optional[list[str]]:
         """Make a command prefix for nsys profiling"""
diff --git a/nemo_run/core/execution/slurm.py b/nemo_run/core/execution/slurm.py
index 3f662723..3ee31325 100644
--- a/nemo_run/core/execution/slurm.py
+++ b/nemo_run/core/execution/slurm.py
@@ -547,7 +547,18 @@ def assign(
     def get_launcher_prefix(self) -> Optional[list[str]]:
         launcher = self.get_launcher()
         if launcher.nsys_profile:
-            return launcher.get_nsys_prefix(profile_dir=f"/{RUNDIR_NAME}")
+            nsys_prefix = launcher.get_nsys_prefix(profile_dir=f"/{RUNDIR_NAME}")
+            if launcher.nsys_gpu_metrics:
+                nsys_prefix += ["$GPU_METRICS_FLAG"]
+            return nsys_prefix
+    
+    def get_nsys_entrypoint(self) -> tuple[str, str]:
+        launcher = self.get_launcher()
+        entrypoint, postfix = "nsys", ""
+        if launcher.nsys_gpu_metrics:
+            entrypoint="bash -c 'GPU_METRICS_FLAG=\"\"; if [ \"$SLURM_PROCID\" -eq 0 ]; then GPU_METRICS_FLAG=\"--gpu-metrics-devices=all\"; fi; nsys"
+            postfix="'"
+        return (entrypoint, postfix)
 
     def supports_launcher_transform(self) -> bool:
         return True if isinstance(self.get_launcher(), SlurmTemplate) else False
diff --git a/nemo_run/run/torchx_backend/packaging.py b/nemo_run/run/torchx_backend/packaging.py
index e915e9b0..6c70e8bf 100644
--- a/nemo_run/run/torchx_backend/packaging.py
+++ b/nemo_run/run/torchx_backend/packaging.py
@@ -225,8 +225,8 @@ def _get_details_from_script(fn_or_script: Script, serialize_configs: bool):
     nsys_prefix = executor.get_launcher_prefix()
     if nsys_prefix:
         role.args = [role.entrypoint] + role.args
-        role.entrypoint = "nsys"
-        role.args = nsys_prefix + role.args
+        role.entrypoint, nsys_postfix = executor.get_nsys_entrypoint()
+        role.args = nsys_prefix + role.args + ([nsys_postfix] if nsys_postfix else [])
 
     if metadata:
         if USE_WITH_RAY_CLUSTER_KEY in metadata:

From 51b8479a775925431c5a01942a05f761a6ac309b Mon Sep 17 00:00:00 2001
From: ashbhandare
Date: Mon, 9 Jun 2025 17:33:50 -0700
Subject: [PATCH 2/2] Add unit tests

Cover `get_nsys_entrypoint()` on the base and Slurm executors and the
`$GPU_METRICS_FLAG` suffix that the Slurm executor adds to the nsys
prefix. Also apply formatting fixes to the code added in the previous
patch.

Signed-off-by: ashbhandare
---
 nemo_run/core/execution/base.py   |  2 +-
 nemo_run/core/execution/slurm.py  |  6 +++---
 test/core/execution/test_base.py  |  5 +++++
 test/core/execution/test_slurm.py | 28 ++++++++++++++++++++++++++++
 4 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/nemo_run/core/execution/base.py b/nemo_run/core/execution/base.py
index 863193e5..870ea8bb 100644
--- a/nemo_run/core/execution/base.py
+++ b/nemo_run/core/execution/base.py
@@ -166,7 +166,7 @@ def get_launcher_prefix(self) -> Optional[list[str]]:
             return launcher.get_nsys_prefix(profile_dir=self.job_dir)
 
     def get_nsys_entrypoint(self) -> tuple[str, str]:
-        return ("nsys","")
+        return ("nsys", "")
 
     def supports_launcher_transform(self) -> bool:
         return False
diff --git a/nemo_run/core/execution/slurm.py b/nemo_run/core/execution/slurm.py
index 3ee31325..b7a49419 100644
--- a/nemo_run/core/execution/slurm.py
+++ b/nemo_run/core/execution/slurm.py
@@ -551,13 +551,13 @@ def get_launcher_prefix(self) -> Optional[list[str]]:
             if launcher.nsys_gpu_metrics:
                 nsys_prefix += ["$GPU_METRICS_FLAG"]
             return nsys_prefix
-    
+
     def get_nsys_entrypoint(self) -> tuple[str, str]:
         launcher = self.get_launcher()
         entrypoint, postfix = "nsys", ""
         if launcher.nsys_gpu_metrics:
-            entrypoint="bash -c 'GPU_METRICS_FLAG=\"\"; if [ \"$SLURM_PROCID\" -eq 0 ]; then GPU_METRICS_FLAG=\"--gpu-metrics-devices=all\"; fi; nsys"
-            postfix="'"
+            entrypoint = 'bash -c \'GPU_METRICS_FLAG=""; if [ "$SLURM_PROCID" -eq 0 ]; then GPU_METRICS_FLAG="--gpu-metrics-devices=all"; fi; nsys'
+            postfix = "'"
         return (entrypoint, postfix)
 
     def supports_launcher_transform(self) -> bool:
diff --git a/test/core/execution/test_base.py b/test/core/execution/test_base.py
index 8b6d69d6..c122b519 100644
--- a/test/core/execution/test_base.py
+++ b/test/core/execution/test_base.py
@@ -126,6 +126,11 @@ def test_get_launcher_str(self):
         executor = Executor(launcher="torchrun")
         assert isinstance(executor.get_launcher(), Torchrun)
 
+    def test_get_nsys_entrypoint(self):
+        mock_launcher = Launcher()
+        executor = Executor(launcher=mock_launcher)
+        assert executor.get_nsys_entrypoint() == ("nsys", "")
+
     def test_cleanup(self):
         executor = Executor()
         assert executor.cleanup("handle") is None
diff --git a/test/core/execution/test_slurm.py b/test/core/execution/test_slurm.py
index 0a2063bb..4c681111 100644
--- a/test/core/execution/test_slurm.py
+++ b/test/core/execution/test_slurm.py
@@ -170,10 +170,38 @@ def test_get_launcher_prefix(self):
         launcher_mock = MagicMock()
         launcher_mock.nsys_profile = True
         launcher_mock.get_nsys_prefix.return_value = ["nsys", "profile"]
+        launcher_mock.nsys_gpu_metrics = False
 
         with patch.object(executor, "get_launcher", return_value=launcher_mock):
             assert executor.get_launcher_prefix() == ["nsys", "profile"]
 
+    def test_get_launcher_prefix_with_gpu_metrics(self):
+        """Test the get_launcher_prefix method with nsys_profile when gpu metrics is enabled."""
+        executor = SlurmExecutor(account="test")
+
+        # Test with launcher that has nsys_profile
+        launcher_mock = MagicMock()
+        launcher_mock.nsys_profile = True
+        launcher_mock.get_nsys_prefix.return_value = ["nsys", "profile"]
+        launcher_mock.nsys_gpu_metrics = True
+
+        with patch.object(executor, "get_launcher", return_value=launcher_mock):
+            assert executor.get_launcher_prefix() == ["nsys", "profile", "$GPU_METRICS_FLAG"]
+
+    def test_get_nsys_entrypoint(self):
+        """Test the get_nsys_entrypoint method when gpu metrics is enabled."""
+        executor = SlurmExecutor(account="test")
+
+        # Test with launcher that has nsys_gpu_metrics enabled
+        launcher_mock = MagicMock()
+        launcher_mock.nsys_gpu_metrics = True
+
+        with patch.object(executor, "get_launcher", return_value=launcher_mock):
+            assert executor.get_nsys_entrypoint() == (
+                'bash -c \'GPU_METRICS_FLAG=""; if [ "$SLURM_PROCID" -eq 0 ]; then GPU_METRICS_FLAG="--gpu-metrics-devices=all"; fi; nsys',
+                "'",
+            )
+
     def test_supports_launcher_transform(self):
         """Test the supports_launcher_transform method."""
         executor = SlurmExecutor(account="test")