From d5ab0fa2af53fccc4ab1cc6f8d42d192bc145298 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 4 Dec 2025 12:50:16 +0000 Subject: [PATCH 01/12] fix: DGXC logstreaming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 8eb0edff..4fca2f51 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -29,7 +29,7 @@ import requests from invoke.context import Context -from nemo_run.config import get_nemorun_home +from nemo_run.config import RUNDIR_NAME, get_nemorun_home from nemo_run.core.execution.base import Executor, ExecutorMacros from nemo_run.core.packaging.base import Packager from nemo_run.core.packaging.git import GitArchivePackager @@ -323,7 +323,7 @@ def launch(self, name: str, cmd: list[str]) -> tuple[str, str]: ln -s {self.pvc_job_dir}/ /nemo_run cd /nemo_run/code mkdir -p {self.pvc_job_dir}/logs -{" ".join(cmd)} 2>&1 | tee -a {self.pvc_job_dir}/logs/output-$HOSTNAME.log +{" ".join(cmd)} 2>&1 | tee -a /{RUNDIR_NAME}/log_{name}.out """ with open(os.path.join(self.job_dir, "launch_script.sh"), "w+") as f: f.write(launch_script) @@ -393,9 +393,9 @@ def fetch_logs( self.pvc_job_dir = os.path.join(self.pvc_nemo_run_dir, job_subdir) files = [] - while len(files) < self.nodes: - files = list(glob.glob(f"{self.pvc_job_dir}/logs/output-*.log")) - logger.info(f"Waiting for {self.nodes - len(files)} log files to be created...") + while len(files) < 1: + files = list(glob.glob(f"/{RUNDIR_NAME}/log_*.out")) + logger.info(f"Waiting for {1 - len(files)} log files to be created...") time.sleep(3) cmd.extend(files) From 60dd00f9aa7501bf15f7317c76c57ff7b510ac7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 4 Dec 2025 12:58:17 +0000 Subject: [PATCH 02/12] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 4fca2f51..0202ee4c 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -394,7 +394,7 @@ def fetch_logs( files = [] while len(files) < 1: - files = list(glob.glob(f"/{RUNDIR_NAME}/log_*.out")) + files = list(glob.glob(f"{self.pvc_job_dir}/log_*.out")) logger.info(f"Waiting for {1 - len(files)} log files to be created...") time.sleep(3) From c4f08e99339da03c366b6f189b55d7a4a0a6b416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 4 Dec 2025 13:59:01 +0000 Subject: [PATCH 03/12] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 0202ee4c..16cb19ab 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -323,7 +323,7 @@ def launch(self, name: str, cmd: list[str]) -> tuple[str, str]: ln -s {self.pvc_job_dir}/ /nemo_run cd /nemo_run/code mkdir -p {self.pvc_job_dir}/logs -{" ".join(cmd)} 2>&1 | tee -a /{RUNDIR_NAME}/log_{name}.out +{" ".join(cmd)} 2>&1 | tee -a {self.pvc_job_dir}/output-$HOSTNAME.log """ with open(os.path.join(self.job_dir, "launch_script.sh"), "w+") as f: f.write(launch_script) From 023feac388f1e644a54454bbaee7b8ad3fb38171 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 4 Dec 2025 14:06:10 +0000 Subject: [PATCH 04/12] self.nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 16cb19ab..0252f2ad 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -393,9 +393,9 @@ def fetch_logs( self.pvc_job_dir = os.path.join(self.pvc_nemo_run_dir, job_subdir) files = [] - while len(files) < 1: + while len(files) < self.nodes: files = list(glob.glob(f"{self.pvc_job_dir}/log_*.out")) - logger.info(f"Waiting for {1 - len(files)} log files to be created...") + logger.info(f"Waiting for {self.nodes - len(files)} log files to be created...") time.sleep(3) cmd.extend(files) From e5f596ff64127abe3c44a35b57e066f570e1e010 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 4 Dec 2025 14:14:05 +0000 Subject: [PATCH 05/12] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 0252f2ad..982430f3 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -323,7 +323,7 @@ def launch(self, name: str, cmd: list[str]) -> tuple[str, str]: ln -s {self.pvc_job_dir}/ /nemo_run cd /nemo_run/code mkdir -p {self.pvc_job_dir}/logs -{" ".join(cmd)} 2>&1 | tee -a {self.pvc_job_dir}/output-$HOSTNAME.log +{" ".join(cmd)} 2>&1 | tee -a {self.pvc_job_dir}/log_$HOSTNAME.log """ with open(os.path.join(self.job_dir, "launch_script.sh"), "w+") as f: f.write(launch_script) From 0ef32b2bb9a42e743e947cad750a9424465a9bab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 4 Dec 2025 14:29:18 +0000 Subject: [PATCH 06/12] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 982430f3..325e0694 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -394,7 +394,7 @@ def fetch_logs( files = [] while len(files) < self.nodes: - files = list(glob.glob(f"{self.pvc_job_dir}/log_*.out")) + files = list(glob.glob(f"{self.pvc_job_dir}/log_*.log")) logger.info(f"Waiting for {self.nodes - len(files)} log files to be created...") time.sleep(3) From b4862b63d88af493e217bbe268c11d6f6888f70b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 4 Dec 2025 14:50:57 +0000 Subject: [PATCH 07/12] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 325e0694..484d5299 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -323,7 +323,7 @@ def launch(self, name: str, cmd: list[str]) -> tuple[str, str]: ln -s {self.pvc_job_dir}/ /nemo_run cd /nemo_run/code mkdir -p {self.pvc_job_dir}/logs -{" ".join(cmd)} 2>&1 | tee -a {self.pvc_job_dir}/log_$HOSTNAME.log +{" ".join(cmd)} 2>&1 | tee -a {self.pvc_job_dir}/log_$HOSTNAME.out """ with open(os.path.join(self.job_dir, "launch_script.sh"), "w+") as f: f.write(launch_script) @@ -394,7 +394,7 @@ def fetch_logs( files = [] while len(files) < self.nodes: - files = list(glob.glob(f"{self.pvc_job_dir}/log_*.log")) + files = list(glob.glob(f"{self.pvc_job_dir}/log_*.out")) logger.info(f"Waiting for {self.nodes - len(files)} log files to be created...") time.sleep(3) From 19244d41ba58fe4895858f4003e762cfcda73982 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 4 Dec 2025 14:58:30 +0000 Subject: [PATCH 08/12] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 484d5299..15319a67 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -410,7 +410,7 @@ def fetch_logs( for line in iter(proc.stdout.readline, ""): if ( line - and not line.rstrip("\n").endswith(".log <==") + and not line.rstrip("\n").endswith(".out <==") and line.rstrip("\n") != "" ): yield f"{line}" From 95ea8fea3865cce6d0ee07aa9b7aa22ce5a3693b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 4 Dec 2025 15:06:05 +0000 Subject: [PATCH 09/12] revert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 15319a67..3fb7ffbe 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -29,7 +29,7 @@ import requests from invoke.context import Context -from nemo_run.config import RUNDIR_NAME, get_nemorun_home +from nemo_run.config import get_nemorun_home from nemo_run.core.execution.base import Executor, ExecutorMacros from nemo_run.core.packaging.base import Packager from nemo_run.core.packaging.git import GitArchivePackager From 355f28dd539afb79b3b30ab10ff178cdf3a078e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 4 Dec 2025 15:25:31 +0000 Subject: [PATCH 10/12] log_all_ranks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 3fb7ffbe..f0a8b838 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -323,7 +323,7 @@ def launch(self, name: str, cmd: list[str]) -> tuple[str, str]: ln -s {self.pvc_job_dir}/ /nemo_run cd /nemo_run/code mkdir -p {self.pvc_job_dir}/logs -{" ".join(cmd)} 2>&1 | tee -a {self.pvc_job_dir}/log_$HOSTNAME.out +{" ".join(cmd)} 2>&1 | tee -a {self.pvc_job_dir}/log_$HOSTNAME.out {self.pvc_job_dir}/log_all_ranks.out """ with open(os.path.join(self.job_dir, "launch_script.sh"), "w+") as f: f.write(launch_script) @@ -393,9 +393,9 @@ def fetch_logs( self.pvc_job_dir = os.path.join(self.pvc_nemo_run_dir, job_subdir) files = [] - while len(files) < self.nodes: + while len(files) < (self.nodes + 1): files = list(glob.glob(f"{self.pvc_job_dir}/log_*.out")) - logger.info(f"Waiting for {self.nodes - len(files)} log files to be created...") + logger.info(f"Waiting for {(self.nodes + 1) - len(files)} log files to be created...") time.sleep(3) cmd.extend(files) From c14a45bfc844d2d96060959aef006efc2058fb58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 4 Dec 2025 16:02:47 +0000 Subject: [PATCH 11/12] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 7 +++++-- test/core/execution/test_dgxcloud.py | 14 +++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index f0a8b838..d871ce96 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -393,9 +393,12 @@ def fetch_logs( self.pvc_job_dir = os.path.join(self.pvc_nemo_run_dir, job_subdir) files = [] - while len(files) < (self.nodes + 1): + while len(files) < self.nodes: files = list(glob.glob(f"{self.pvc_job_dir}/log_*.out")) - logger.info(f"Waiting for {(self.nodes + 1) - len(files)} log files to be created...") + files = [f for f in files if "log_all_ranks" not in f] + logger.info( + f"Waiting for {self.nodes + 1 - len(files)} log files to be created in {self.pvc_job_dir}..." + ) time.sleep(3) cmd.extend(files) diff --git a/test/core/execution/test_dgxcloud.py b/test/core/execution/test_dgxcloud.py index e8042f68..9098b5c5 100644 --- a/test/core/execution/test_dgxcloud.py +++ b/test/core/execution/test_dgxcloud.py @@ -110,8 +110,8 @@ def test_fetch_logs_streaming(self, mock_sleep, mock_popen, mock_glob): # Mock log files mock_glob.return_value = [ - "/workspace/nemo_run/experiments/exp1/task1/logs/output-worker-0.log", - "/workspace/nemo_run/experiments/exp1/task1/logs/output-worker-1.log", + "/workspace/nemo_run/experiments/exp1/task1/log_worker-0.out", + "/workspace/nemo_run/experiments/exp1/task1/log_worker-1.out", ] # Mock process that yields log lines @@ -162,7 +162,7 @@ def test_fetch_logs_non_streaming(self, mock_sleep, mock_popen, mock_glob): # Mock log files mock_glob.return_value = [ - "/workspace/nemo_run/experiments/exp1/task1/logs/output-worker-0.log", + "/workspace/nemo_run/experiments/exp1/task1/log_worker-0.out", ] # Mock process that yields log lines @@ -232,7 +232,7 @@ def test_fetch_logs_waits_for_running_status(self, mock_glob, mock_sleep): with patch.object(executor, "status", side_effect=status_values): # Mock glob to prevent it from blocking - mock_glob.return_value = ["/workspace/nemo_run/logs/output.log"] + mock_glob.return_value = ["/workspace/nemo_run/logs/outputlog_"] with patch("subprocess.Popen") as mock_popen: mock_process = MagicMock() @@ -257,10 +257,10 @@ def test_fetch_logs_waits_for_log_files(self, mock_popen, mock_glob, mock_sleep) # Mock glob to return incomplete files first, then all files mock_glob.side_effect = [ [], # No files yet - ["/workspace/nemo_run/experiments/exp1/task1/logs/output-worker-0.log"], # 1 of 2 + ["/workspace/nemo_run/experiments/exp1/task1/log_worker-0.out"], # 1 of 2 [ # All 2 files - "/workspace/nemo_run/experiments/exp1/task1/logs/output-worker-0.log", - "/workspace/nemo_run/experiments/exp1/task1/logs/output-worker-1.log", + "/workspace/nemo_run/experiments/exp1/task1/log_worker-0.out", + "/workspace/nemo_run/experiments/exp1/task1/log_worker-1.out", ], ] From ab33f7827a5bf98bb7ad4c8ef8dc27d59052e106 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 5 Dec 2025 13:08:11 +0000 Subject: [PATCH 12/12] log-allranks_0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index d871ce96..13596ebf 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -323,7 +323,7 @@ def launch(self, name: str, cmd: list[str]) -> tuple[str, str]: ln -s {self.pvc_job_dir}/ /nemo_run cd /nemo_run/code mkdir -p {self.pvc_job_dir}/logs -{" ".join(cmd)} 2>&1 | tee -a {self.pvc_job_dir}/log_$HOSTNAME.out {self.pvc_job_dir}/log_all_ranks.out +{" ".join(cmd)} 2>&1 | tee -a {self.pvc_job_dir}/log_$HOSTNAME.out {self.pvc_job_dir}/log-allranks_0.out """ with open(os.path.join(self.job_dir, "launch_script.sh"), "w+") as f: f.write(launch_script) @@ -395,7 +395,7 @@ def fetch_logs( files = [] while len(files) < self.nodes: files = list(glob.glob(f"{self.pvc_job_dir}/log_*.out")) - files = [f for f in files if "log_all_ranks" not in f] + files = [f for f in files if "log-allranks_0" not in f] logger.info( f"Waiting for {self.nodes + 1 - len(files)} log files to be created in {self.pvc_job_dir}..." )