Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cluster_configs/example-slurm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ job_name_prefix: "nemo_skills:"
# identity: <can specify ssh key to avoid entering password>


# if you're running directly from the cluster, you only need to define job_dir and shouldn't use ssh_tunnel
# job_dir: <some location on slurm cluster to keep job metadata, uploaded code and generated sbatch files>


# define your account/partition here
# account: <slurm account>
# partition: <slurm partition>
Expand All @@ -47,6 +51,8 @@ job_name_prefix: "nemo_skills:"
# e.g.
# - <path on slurm>/trt_models:/trt_models
# - <path on slurm>/data:/data
# you can also override container libraries by directly mounting over them. E.g. to override NeMo-Aligner do
# - <path on slurm>/NeMo-Aligner:/opt/NeMo-Aligner


# can use this section to set timeouts for different partitions
Expand Down
4 changes: 4 additions & 0 deletions docs/basics/prerequisites.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ You can also use `NEMO_SKILLS_CONFIG` env variable instead of the `--cluster` pa
The cluster config defines an executor (local or slurm), mounts for data/model access and (slurm-only) various parameters
such as account, partition, ssh-tunnel arguments and so on.

The recommended way to launch jobs on slurm is to run all commands locally and specify the `ssh_tunnel` section in the cluster config
to let NeMo-Run know how to connect to the cluster. But if you prefer to run from the cluster directly, you can install NeMo-Skills there
and then only specify the `job_dir` parameter, without using the `ssh_tunnel` section in the config.

### Environment variables

You can define environment variables in the cluster config file, which will be set inside the container.
Expand Down
26 changes: 20 additions & 6 deletions nemo_skills/pipeline/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import yaml
from huggingface_hub import get_token
from invoke import StreamWatcher
from nemo_run.config import set_nemorun_home
from nemo_run.core.execution.docker import DockerExecutor
from nemo_run.core.execution.slurm import SlurmJobDetails
from nemo_run.core.tunnel import SSHTunnel
Expand Down Expand Up @@ -471,7 +472,14 @@ def get_cluster_config(cluster=None, config_dir=None):
if not Path(config_file).exists():
raise ValueError(f"Cluster config {config_file} not found.")

return read_config(config_file)
cluster_config = read_config(config_file)

if cluster_config['executor'] == 'slurm' and "ssh_tunnel" not in cluster_config:
if "job_dir" not in cluster_config:
raise ValueError("job_dir must be provided in the cluster config if ssh_tunnel is not provided.")
set_nemorun_home(cluster_config["job_dir"])

return cluster_config


@lru_cache
Expand All @@ -498,6 +506,9 @@ def tunnel_hash(tunnel):


def get_tunnel(cluster_config):
    """Return the tunnel object to use for the given cluster config.

    When the config has an ``ssh_tunnel`` entry, its items are forwarded as
    keyword arguments to ``_get_tunnel_cached``. Otherwise an info message is
    logged and a ``run.LocalTunnel`` with an empty ``job_dir`` is returned.
    """
    if "ssh_tunnel" in cluster_config:
        ssh_kwargs = cluster_config["ssh_tunnel"]
        return _get_tunnel_cached(**ssh_kwargs)

    # No SSH settings were given, so fall back to a local tunnel.
    LOG.info("No ssh_tunnel configuration found, assuming we are running from the cluster already.")
    return run.LocalTunnel(job_dir="")


Expand Down Expand Up @@ -1146,7 +1157,7 @@ def add_task(

if cluster_config["executor"] != "local":
tunnel = get_tunnel(cluster_config)
if reuse_code:
if isinstance(tunnel, run.SSHTunnel) and reuse_code:
reuse_code_exp = reuse_code_exp or REUSE_CODE_EXP.get(tunnel_hash(tunnel))
if reuse_code_exp is not None:
if isinstance(reuse_code_exp, run.Experiment):
Expand All @@ -1158,7 +1169,8 @@ def add_task(
reuse_dir = reuse_exp.tunnels[tunnel.key].packaging_jobs['nemo-run'].dst_path
for executor in executors:
executor.packager.symlink_from_remote_dir = reuse_dir
else: # if current is not reused, we are refreshing the cache as there is a reason to believe it's outdated
# if current is not reused, we are refreshing the cache as there is a reason to believe it's outdated
elif isinstance(tunnel, run.SSHTunnel):
REUSE_CODE_EXP.pop(tunnel_hash(tunnel), None)

if len(commands) == 1:
Expand Down Expand Up @@ -1191,6 +1203,8 @@ def run_exp(exp, cluster_config, sequential=None):
exp.run(detach=True, sequential=False if sequential is None else sequential)

# caching the experiment code for reuse
ssh_hash = tunnel_hash(get_tunnel(cluster_config))
if ssh_hash not in REUSE_CODE_EXP:
REUSE_CODE_EXP[ssh_hash] = exp
tunnel = get_tunnel(cluster_config)
if isinstance(tunnel, run.SSHTunnel):
ssh_hash = tunnel_hash(tunnel)
if ssh_hash not in REUSE_CODE_EXP:
REUSE_CODE_EXP[ssh_hash] = exp