Skip to content

Commit c961477

Browse files
hemildesai authored and jubick1337 committed
Use parallel experiments with configurable worker (#596)
Signed-off-by: jubick1337 <mattyson.so@gmail.com>
1 parent b84e79b commit c961477

1 file changed

Lines changed: 18 additions & 5 deletions

File tree

  • nemo_skills/pipeline/utils

nemo_skills/pipeline/utils/exp.py

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -57,13 +57,21 @@ def get_exp_handles(expname: str, ignore_finished=True, ignore_exp_not_exists=Tr
5757
is called, but finish before nemo-run submits a new job (which might take minutes)
5858
"""
5959

60-
def _get_handles(exp):
60+
def _get_handles(exp: run.Experiment):
6161
handles = []
62-
for job in exp.jobs:
62+
status_dict = exp.status(return_dict=True)
63+
assert status_dict, f"No status found for experiment {exp._id}"
64+
for _, status_info in status_dict.items():
6365
if not ignore_finished or (
64-
job.status(exp._runner) in [AppState.RUNNING, AppState.PENDING, AppState.SUBMITTED, AppState.UNKNOWN]
66+
status_info['status']
67+
in [
68+
AppState.RUNNING,
69+
AppState.PENDING,
70+
AppState.SUBMITTED,
71+
AppState.UNKNOWN,
72+
]
6573
):
66-
handles.append(job.handle)
74+
handles.append(status_info['handle'])
6775
continue
6876
return handles
6977

@@ -611,7 +619,12 @@ def get_exp(expname, cluster_config, _reuse_exp=None):
611619
# nemo-run redefines the handlers, so removing ours to avoid duplicate logs
612620
remove_handlers()
613621
if cluster_config['executor'] == 'slurm':
614-
return run.Experiment(expname)
622+
return run.Experiment(
623+
expname,
624+
skip_status_at_exit=True,
625+
serialize_metadata_for_scripts=False,
626+
threadpool_workers=cluster_config.get('num_workers', 4),
627+
)
615628
# hiding all nemo-run logs otherwise as they are not useful locally
616629
if cluster_config['executor'] == 'local':
617630
return run.Experiment(expname, clean_mode=True)

0 commit comments

Comments
 (0)