Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions nemo_skills/pipeline/utils/exp.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,21 @@ def get_exp_handles(expname: str, ignore_finished=True, ignore_exp_not_exists=Tr
is called, but finish before nemo-run submits a new job (which might take minutes)
"""

def _get_handles(exp):
def _get_handles(exp: run.Experiment):
handles = []
for job in exp.jobs:
status_dict = exp.status(return_dict=True)
assert status_dict, f"No status found for experiment {exp._id}"
for _, status_info in status_dict.items():
if not ignore_finished or (
job.status(exp._runner) in [AppState.RUNNING, AppState.PENDING, AppState.SUBMITTED, AppState.UNKNOWN]
status_info['status']
in [
AppState.RUNNING,
AppState.PENDING,
AppState.SUBMITTED,
AppState.UNKNOWN,
]
):
handles.append(job.handle)
handles.append(status_info['handle'])
continue
return handles

Expand Down Expand Up @@ -611,7 +619,12 @@ def get_exp(expname, cluster_config, _reuse_exp=None):
# nemo-run redefines the handlers, so removing ours to avoid duplicate logs
remove_handlers()
if cluster_config['executor'] == 'slurm':
return run.Experiment(expname)
return run.Experiment(
expname,
skip_status_at_exit=True,
serialize_metadata_for_scripts=False,
threadpool_workers=cluster_config.get('num_workers', 4),
)
# hiding all nemo-run logs otherwise as they are not useful locally
if cluster_config['executor'] == 'local':
return run.Experiment(expname, clean_mode=True)
Expand Down