Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions nemo_run/core/execution/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,9 @@ def get_launcher_prefix(self) -> Optional[list[str]]:
os.makedirs(os.path.join(self.job_dir, launcher.nsys_folder), exist_ok=True)
return launcher.get_nsys_prefix(profile_dir=self.job_dir)

def get_nsys_entrypoint(self) -> tuple[str, str]:
    """Return the entrypoint and trailing argument used to wrap a job with nsys.

    Returns:
        A ``(entrypoint, postfix)`` pair. This implementation always returns
        plain ``"nsys"`` with an empty postfix.
    """
    return ("nsys", "")

def supports_launcher_transform(self) -> bool:
    """Report whether launcher transforms are supported; always ``False`` here."""
    can_transform = False
    return can_transform

Expand Down
1 change: 1 addition & 0 deletions nemo_run/core/execution/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class Launcher(ConfigurableMixin):
"--cuda-event-trace=false",
]
)
nsys_gpu_metrics: bool = False

def get_nsys_prefix(self, profile_dir: str) -> Optional[list[str]]:
"""Make a command prefix for nsys profiling"""
Expand Down
13 changes: 12 additions & 1 deletion nemo_run/core/execution/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,18 @@ def assign(
def get_launcher_prefix(self) -> Optional[list[str]]:
    """Build the nsys command prefix for this executor's launcher.

    Returns:
        ``None`` when the launcher has nsys profiling disabled, or when the
        launcher itself yields no prefix. Otherwise the launcher's nsys
        prefix, with the ``$GPU_METRICS_FLAG`` shell placeholder appended
        when ``nsys_gpu_metrics`` is enabled.
    """
    launcher = self.get_launcher()
    if not launcher.nsys_profile:
        return None

    nsys_prefix = launcher.get_nsys_prefix(profile_dir=f"/{RUNDIR_NAME}")
    if nsys_prefix is not None and launcher.nsys_gpu_metrics:
        # Build a new list rather than using `+=`, so the list handed back by
        # the launcher is never mutated in place. The placeholder is left
        # unexpanded here; it is a shell variable resolved at run time.
        nsys_prefix = [*nsys_prefix, "$GPU_METRICS_FLAG"]
    return nsys_prefix

def get_nsys_entrypoint(self) -> tuple[str, str]:
    """Return the entrypoint and closing postfix used to launch nsys.

    When the launcher has ``nsys_gpu_metrics`` enabled, nsys is wrapped in a
    ``bash -c '...'`` command that sets ``GPU_METRICS_FLAG`` to
    ``--gpu-metrics-devices=all`` only when ``SLURM_PROCID`` equals 0 and
    leaves it empty otherwise.

    Returns:
        A ``(entrypoint, postfix)`` pair. ``postfix`` is the single quote that
        closes the ``bash -c`` string, or ``""`` when no wrapper is used.
    """
    launcher = self.get_launcher()
    if not launcher.nsys_gpu_metrics:
        return ("nsys", "")

    # The opening quote after `bash -c` is intentionally left unbalanced here;
    # the returned postfix closes it once the caller has appended its own args.
    entrypoint = 'bash -c \'GPU_METRICS_FLAG=""; if [ "$SLURM_PROCID" -eq 0 ]; then GPU_METRICS_FLAG="--gpu-metrics-devices=all"; fi; nsys'
    return (entrypoint, "'")

def supports_launcher_transform(self) -> bool:
    """Return whether the configured launcher is a ``SlurmTemplate`` instance."""
    # isinstance already yields a bool, so no conditional expression is needed.
    return isinstance(self.get_launcher(), SlurmTemplate)
Expand Down
4 changes: 2 additions & 2 deletions nemo_run/run/torchx_backend/packaging.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,8 +225,8 @@ def _get_details_from_script(fn_or_script: Script, serialize_configs: bool):
nsys_prefix = executor.get_launcher_prefix()
if nsys_prefix:
role.args = [role.entrypoint] + role.args
role.entrypoint = "nsys"
role.args = nsys_prefix + role.args
role.entrypoint, nsys_postfix = executor.get_nsys_entrypoint()
role.args = nsys_prefix + role.args + [nsys_postfix]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you wrap this in an if condition, i.e. only add the postfix if it's not empty?


if metadata:
if USE_WITH_RAY_CLUSTER_KEY in metadata:
Expand Down
5 changes: 5 additions & 0 deletions test/core/execution/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ def test_get_launcher_str(self):
executor = Executor(launcher="torchrun")
assert isinstance(executor.get_launcher(), Torchrun)

def test_get_nsys_entrypoint(self):
    """The base executor reports plain ``nsys`` with an empty postfix."""
    expected = ("nsys", "")
    executor = Executor(launcher=Launcher())
    assert executor.get_nsys_entrypoint() == expected

def test_cleanup(self):
executor = Executor()
assert executor.cleanup("handle") is None
28 changes: 28 additions & 0 deletions test/core/execution/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,38 @@ def test_get_launcher_prefix(self):
launcher_mock = MagicMock()
launcher_mock.nsys_profile = True
launcher_mock.get_nsys_prefix.return_value = ["nsys", "profile"]
launcher_mock.nsys_gpu_metrics = False

with patch.object(executor, "get_launcher", return_value=launcher_mock):
assert executor.get_launcher_prefix() == ["nsys", "profile"]

def test_get_launcher_prefix_with_gpu_metrics(self):
    """get_launcher_prefix appends the GPU metrics placeholder when it is enabled."""
    executor = SlurmExecutor(account="test")

    # Launcher stand-in with both nsys profiling and GPU metrics switched on.
    fake_launcher = MagicMock()
    fake_launcher.nsys_profile = True
    fake_launcher.get_nsys_prefix.return_value = ["nsys", "profile"]
    fake_launcher.nsys_gpu_metrics = True

    expected_prefix = ["nsys", "profile", "$GPU_METRICS_FLAG"]
    with patch.object(executor, "get_launcher", return_value=fake_launcher):
        actual_prefix = executor.get_launcher_prefix()
    assert actual_prefix == expected_prefix

def test_get_nsys_entrypoint(self):
    """get_nsys_entrypoint returns the bash wrapper when GPU metrics are enabled."""
    executor = SlurmExecutor(account="test")

    # Launcher stand-in with GPU metrics collection switched on.
    fake_launcher = MagicMock()
    fake_launcher.nsys_gpu_metrics = True

    expected_entrypoint = 'bash -c \'GPU_METRICS_FLAG=""; if [ "$SLURM_PROCID" -eq 0 ]; then GPU_METRICS_FLAG="--gpu-metrics-devices=all"; fi; nsys'
    expected_postfix = "'"

    with patch.object(executor, "get_launcher", return_value=fake_launcher):
        actual = executor.get_nsys_entrypoint()
    assert actual == (expected_entrypoint, expected_postfix)

def test_supports_launcher_transform(self):
"""Test the supports_launcher_transform method."""
executor = SlurmExecutor(account="test")
Expand Down
Loading