diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index cf3ec55085..2381738490 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -57,13 +57,21 @@ def get_exp_handles(expname: str, ignore_finished=True, ignore_exp_not_exists=Tr is called, but finish before nemo-run submits a new job (which might take minutes) """ - def _get_handles(exp): + def _get_handles(exp: run.Experiment): handles = [] - for job in exp.jobs: + status_dict = exp.status(return_dict=True) + assert status_dict, f"No status found for experiment {exp._id}" + for _, status_info in status_dict.items(): if not ignore_finished or ( - job.status(exp._runner) in [AppState.RUNNING, AppState.PENDING, AppState.SUBMITTED, AppState.UNKNOWN] + status_info['status'] + in [ + AppState.RUNNING, + AppState.PENDING, + AppState.SUBMITTED, + AppState.UNKNOWN, + ] ): - handles.append(job.handle) + handles.append(status_info['handle']) continue return handles @@ -611,7 +619,12 @@ def get_exp(expname, cluster_config, _reuse_exp=None): # nemo-run redefines the handlers, so removing ours to avoid duplicate logs remove_handlers() if cluster_config['executor'] == 'slurm': - return run.Experiment(expname) + return run.Experiment( + expname, + skip_status_at_exit=True, + serialize_metadata_for_scripts=False, + threadpool_workers=cluster_config.get('num_workers', 4), + ) # hiding all nemo-run logs otherwise as they are not useful locally if cluster_config['executor'] == 'local': return run.Experiment(expname, clean_mode=True)