From 7492db2d829fb592326583b51071f94b672e088a Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 4 Mar 2025 15:59:35 -0800 Subject: [PATCH 1/5] Add an option to use local tunnel Signed-off-by: Igor Gitman --- nemo_skills/pipeline/utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nemo_skills/pipeline/utils.py b/nemo_skills/pipeline/utils.py index 4bd8d1980c..706d17c646 100644 --- a/nemo_skills/pipeline/utils.py +++ b/nemo_skills/pipeline/utils.py @@ -498,6 +498,15 @@ def tunnel_hash(tunnel): def get_tunnel(cluster_config): + if "ssh_tunnel" not in cluster_config: + if "job_dir" not in cluster_config: + raise ValueError("job_dir must be provided in the cluster config if ssh_tunnel is not provided.") + if os.getenv('NEMORUN_HOME', '/home').startswith('/home'): + LOG.warning( + "If your /home folder is limited in space it's recommended to " + "explicitly set NEMORUN_HOME to point to a larger disk." + ) + return run.LocalTunnel(cluster_config["job_dir"]) return _get_tunnel_cached(**cluster_config["ssh_tunnel"]) From bd14c857c74fec0bdcabc04be85a87751b5a2a5b Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 4 Mar 2025 17:04:56 -0800 Subject: [PATCH 2/5] Some more changes for local tunnel Signed-off-by: Igor Gitman --- nemo_skills/pipeline/utils.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/nemo_skills/pipeline/utils.py b/nemo_skills/pipeline/utils.py index 706d17c646..ee55f9231b 100644 --- a/nemo_skills/pipeline/utils.py +++ b/nemo_skills/pipeline/utils.py @@ -506,7 +506,7 @@ def get_tunnel(cluster_config): "If your /home folder is limited in space it's recommended to " "explicitly set NEMORUN_HOME to point to a larger disk." 
) - return run.LocalTunnel(cluster_config["job_dir"]) + return run.LocalTunnel(job_dir=cluster_config["job_dir"]) return _get_tunnel_cached(**cluster_config["ssh_tunnel"]) @@ -1155,7 +1155,7 @@ def add_task( if cluster_config["executor"] != "local": tunnel = get_tunnel(cluster_config) - if reuse_code: + if isinstance(tunnel, run.SSHTunnel) and reuse_code: reuse_code_exp = reuse_code_exp or REUSE_CODE_EXP.get(tunnel_hash(tunnel)) if reuse_code_exp is not None: if isinstance(reuse_code_exp, run.Experiment): @@ -1167,7 +1167,8 @@ def add_task( reuse_dir = reuse_exp.tunnels[tunnel.key].packaging_jobs['nemo-run'].dst_path for executor in executors: executor.packager.symlink_from_remote_dir = reuse_dir - else: # if current is not reused, we are refreshing the cache as there is a reason to believe it's outdated + # if current is not reused, we are refreshing the cache as there is a reason to believe it's outdated + elif isinstance(tunnel, run.SSHTunnel): REUSE_CODE_EXP.pop(tunnel_hash(tunnel), None) if len(commands) == 1: @@ -1200,6 +1201,8 @@ def run_exp(exp, cluster_config, sequential=None): exp.run(detach=True, sequential=False if sequential is None else sequential) # caching the experiment code for reuse - ssh_hash = tunnel_hash(get_tunnel(cluster_config)) - if ssh_hash not in REUSE_CODE_EXP: - REUSE_CODE_EXP[ssh_hash] = exp + tunnel = get_tunnel(cluster_config) + if isinstance(tunnel, run.SSHTunnel): + ssh_hash = tunnel_hash(tunnel) + if ssh_hash not in REUSE_CODE_EXP: + REUSE_CODE_EXP[ssh_hash] = exp From f5edb00367a791a4ea1aebe9556d5838a5b4791d Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Wed, 5 Mar 2025 14:29:27 -0800 Subject: [PATCH 3/5] Properly set nemorun home Signed-off-by: Igor Gitman --- nemo_skills/pipeline/utils.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/nemo_skills/pipeline/utils.py b/nemo_skills/pipeline/utils.py index ee55f9231b..61669064a0 100644 --- a/nemo_skills/pipeline/utils.py +++ 
b/nemo_skills/pipeline/utils.py @@ -29,6 +29,7 @@ import yaml from huggingface_hub import get_token from invoke import StreamWatcher +from nemo_run.config import set_nemorun_home from nemo_run.core.execution.docker import DockerExecutor from nemo_run.core.execution.slurm import SlurmJobDetails from nemo_run.core.tunnel import SSHTunnel @@ -501,12 +502,8 @@ def get_tunnel(cluster_config): if "ssh_tunnel" not in cluster_config: if "job_dir" not in cluster_config: raise ValueError("job_dir must be provided in the cluster config if ssh_tunnel is not provided.") - if os.getenv('NEMORUN_HOME', '/home').startswith('/home'): - LOG.warning( - "If your /home folder is limited in space it's recommended to " - "explicitly set NEMORUN_HOME to point to a larger disk." - ) - return run.LocalTunnel(job_dir=cluster_config["job_dir"]) + set_nemorun_home(cluster_config["job_dir"]) + return run.LocalTunnel(job_dir="") return _get_tunnel_cached(**cluster_config["ssh_tunnel"]) From aae63e0bcb95b2d79b16011f387e482ffe17a9ab Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Wed, 5 Mar 2025 14:39:52 -0800 Subject: [PATCH 4/5] Set nemorunhome when reading config Signed-off-by: Igor Gitman --- nemo_skills/pipeline/utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/nemo_skills/pipeline/utils.py b/nemo_skills/pipeline/utils.py index 61669064a0..64a6085d50 100644 --- a/nemo_skills/pipeline/utils.py +++ b/nemo_skills/pipeline/utils.py @@ -472,7 +472,14 @@ def get_cluster_config(cluster=None, config_dir=None): if not Path(config_file).exists(): raise ValueError(f"Cluster config {config_file} not found.") - return read_config(config_file) + cluster_config = read_config(config_file) + + if cluster_config['executor'] == 'slurm' and "ssh_tunnel" not in cluster_config: + if "job_dir" not in cluster_config: + raise ValueError("job_dir must be provided in the cluster config if ssh_tunnel is not provided.") + set_nemorun_home(cluster_config["job_dir"]) + + return 
cluster_config @lru_cache @@ -500,9 +507,7 @@ def tunnel_hash(tunnel): def get_tunnel(cluster_config): if "ssh_tunnel" not in cluster_config: - if "job_dir" not in cluster_config: - raise ValueError("job_dir must be provided in the cluster config if ssh_tunnel is not provided.") - set_nemorun_home(cluster_config["job_dir"]) + LOG.info("No ssh_tunnel configuration found, assuming we are running from the cluster already.") return run.LocalTunnel(job_dir="") return _get_tunnel_cached(**cluster_config["ssh_tunnel"]) From e6be4ae99a768aad5069377455bc79284d21b7dc Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Wed, 5 Mar 2025 14:45:53 -0800 Subject: [PATCH 5/5] Update docs Signed-off-by: Igor Gitman --- cluster_configs/example-slurm.yaml | 6 ++++++ docs/basics/prerequisites.md | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/cluster_configs/example-slurm.yaml b/cluster_configs/example-slurm.yaml index ed5973f5c9..99fcb6789d 100644 --- a/cluster_configs/example-slurm.yaml +++ b/cluster_configs/example-slurm.yaml @@ -33,6 +33,10 @@ job_name_prefix: "nemo_skills:" # identity: +# if you're running directly from cluster, you only need to define job_dir and shouldn't use ssh_tunnel +# job_dir: + + # define your account/partition here # account: # partition: @@ -47,6 +51,8 @@ job_name_prefix: "nemo_skills:" # e.g. # - /trt_models:/trt_models # - /data:/data +# you can also override container libraries by directly mounting over them. E.g. 
to override NeMo-Aligner do +# - /NeMo-Aligner:/opt/NeMo-Aligner # can use this section to set timeouts for different partitions diff --git a/docs/basics/prerequisites.md b/docs/basics/prerequisites.md index 3d45835405..bf1579a3ec 100644 --- a/docs/basics/prerequisites.md +++ b/docs/basics/prerequisites.md @@ -58,6 +58,10 @@ You can also use `NEMO_SKILLS_CONFIG` env variable instead of the `--cluster` pa The cluster config defines an executor (local or slurm), mounts for data/model access and (slurm-only) various parameters such as account, partition, ssh-tunnel arguments and so on. +The recommended way to launch jobs on slurm is by running all commands locally and specifying the `ssh_tunnel` section of the cluster config +to let NeMo-Run know how to connect there. But if you prefer to run from the cluster directly, you can install NeMo-Skills there +and then only specify the `job_dir` parameter without using the `ssh_tunnel` section in the config. + ### Environment variables You can define environment variables in the cluster config file, which will be set inside the container.