diff --git a/cluster_configs/example-slurm.yaml b/cluster_configs/example-slurm.yaml index ed5973f5c9..99fcb6789d 100644 --- a/cluster_configs/example-slurm.yaml +++ b/cluster_configs/example-slurm.yaml @@ -33,6 +33,10 @@ job_name_prefix: "nemo_skills:" # identity: +# if you're running directly from the cluster, you only need to define job_dir and shouldn't use ssh_tunnel +# job_dir: + + # define your account/partition here # account: # partition: @@ -47,6 +51,8 @@ job_name_prefix: "nemo_skills:" # e.g. # - /trt_models:/trt_models # - /data:/data +# you can also override container libraries by directly mounting over them. E.g. to override NeMo-Aligner do +# - /NeMo-Aligner:/opt/NeMo-Aligner # can use this section to set timeouts for different partitions diff --git a/docs/basics/prerequisites.md b/docs/basics/prerequisites.md index 3d45835405..bf1579a3ec 100644 --- a/docs/basics/prerequisites.md +++ b/docs/basics/prerequisites.md @@ -58,6 +58,10 @@ You can also use `NEMO_SKILLS_CONFIG` env variable instead of the `--cluster` pa The cluster config defines an executor (local or slurm), mounts for data/model access and (slurm-only) various parameters such as account, partition, ssh-tunnel arguments and so on. +The recommended way to launch jobs on slurm is by running all commands locally and specifying the `ssh_tunnel` portion in the cluster config +to let NeMo-Run know how to connect there. But if you prefer to run from the cluster directly, you can install NeMo-Skills there +and then only specify the `job_dir` parameter without using the `ssh_tunnel` section in the config. + ### Environment variables You can define environment variables in the cluster config file, which will be set inside the container. 
diff --git a/nemo_skills/pipeline/utils.py b/nemo_skills/pipeline/utils.py index 4bd8d1980c..64a6085d50 100644 --- a/nemo_skills/pipeline/utils.py +++ b/nemo_skills/pipeline/utils.py @@ -29,6 +29,7 @@ import yaml from huggingface_hub import get_token from invoke import StreamWatcher +from nemo_run.config import set_nemorun_home from nemo_run.core.execution.docker import DockerExecutor from nemo_run.core.execution.slurm import SlurmJobDetails from nemo_run.core.tunnel import SSHTunnel @@ -471,7 +472,14 @@ def get_cluster_config(cluster=None, config_dir=None): if not Path(config_file).exists(): raise ValueError(f"Cluster config {config_file} not found.") - return read_config(config_file) + cluster_config = read_config(config_file) + + if cluster_config['executor'] == 'slurm' and "ssh_tunnel" not in cluster_config: + if "job_dir" not in cluster_config: + raise ValueError("job_dir must be provided in the cluster config if ssh_tunnel is not provided.") + set_nemorun_home(cluster_config["job_dir"]) + + return cluster_config @lru_cache @@ -498,6 +506,9 @@ def tunnel_hash(tunnel): def get_tunnel(cluster_config): + if "ssh_tunnel" not in cluster_config: + LOG.info("No ssh_tunnel configuration found, assuming we are running from the cluster already.") + return run.LocalTunnel(job_dir="") return _get_tunnel_cached(**cluster_config["ssh_tunnel"]) @@ -1146,7 +1157,7 @@ def add_task( if cluster_config["executor"] != "local": tunnel = get_tunnel(cluster_config) - if reuse_code: + if isinstance(tunnel, run.SSHTunnel) and reuse_code: reuse_code_exp = reuse_code_exp or REUSE_CODE_EXP.get(tunnel_hash(tunnel)) if reuse_code_exp is not None: if isinstance(reuse_code_exp, run.Experiment): @@ -1158,7 +1169,8 @@ def add_task( reuse_dir = reuse_exp.tunnels[tunnel.key].packaging_jobs['nemo-run'].dst_path for executor in executors: executor.packager.symlink_from_remote_dir = reuse_dir - else: # if current is not reused, we are refreshing the cache as there is a reason to believe it's 
outdated + # if current is not reused, we are refreshing the cache as there is a reason to believe it's outdated + elif isinstance(tunnel, run.SSHTunnel): REUSE_CODE_EXP.pop(tunnel_hash(tunnel), None) if len(commands) == 1: @@ -1191,6 +1203,8 @@ def run_exp(exp, cluster_config, sequential=None): exp.run(detach=True, sequential=False if sequential is None else sequential) # caching the experiment code for reuse - ssh_hash = tunnel_hash(get_tunnel(cluster_config)) - if ssh_hash not in REUSE_CODE_EXP: - REUSE_CODE_EXP[ssh_hash] = exp + tunnel = get_tunnel(cluster_config) + if isinstance(tunnel, run.SSHTunnel): + ssh_hash = tunnel_hash(tunnel) + if ssh_hash not in REUSE_CODE_EXP: + REUSE_CODE_EXP[ssh_hash] = exp