Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 23 additions & 9 deletions nemo_run/run/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,8 @@
from rich.console import Group
from rich.live import Live
from rich.panel import Panel
from rich.progress import BarColumn, Progress, SpinnerColumn
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskID, TimeElapsedColumn
from rich.progress import Task as RichTask
from rich.progress import TaskID, TimeElapsedColumn
from rich.syntax import Syntax
from torchx.specs.api import AppState

Expand Down Expand Up @@ -225,11 +224,14 @@ class Experiment(ConfigurableMixin):
def catalog(
cls: Type["Experiment"],
title: str = "",
exp_dir_infix: str | None = None,
) -> list[str]:
"""
List all experiments inside get_nemorun_home(), optionally with the provided title.
"""
parent_dir = os.path.join(get_nemorun_home(), "experiments", title)
if exp_dir_infix is None:
exp_dir_infix = os.path.join("experiments", title)
parent_dir = os.path.join(get_nemorun_home(), exp_dir_infix)
return _get_sorted_dirs(parent_dir)

@classmethod
Expand Down Expand Up @@ -263,12 +265,14 @@ def _from_config(cls: Type["Experiment"], exp_dir: str) -> "Experiment":
def from_id(
cls: Type["Experiment"],
id: str,
exp_dir_infix: str | None = None,
) -> "Experiment":
"""
Reconstruct an experiment with the specified id.
"""
title, _, _ = id.rpartition("_")
parent_dir = os.path.join(get_nemorun_home(), "experiments", title)
if exp_dir_infix is None:
exp_dir_infix = os.path.join("experiments", id.rpartition("_")[0])
parent_dir = os.path.join(get_nemorun_home(), exp_dir_infix)
exp_dir = os.path.join(parent_dir, id)

assert os.path.isdir(exp_dir), f"Experiment {id} not found."
Expand All @@ -280,11 +284,14 @@ def from_id(
def from_title(
cls: Type["Experiment"],
title: str,
exp_dir_infix: str | None = None,
) -> "Experiment":
"""
Reconstruct an experiment with the specified title.
"""
parent_dir = os.path.join(get_nemorun_home(), "experiments", title)
if exp_dir_infix is None:
exp_dir_infix = os.path.join("experiments", title)
parent_dir = os.path.join(get_nemorun_home(), exp_dir_infix)
exp_dir = _get_latest_dir(parent_dir)

assert os.path.isdir(exp_dir), f"Experiment {id} not found."
Expand All @@ -303,6 +310,7 @@ def __init__(
base_dir: str | None = None,
clean_mode: bool = False,
enable_goodbye_message: bool = True,
exp_dir_infix: str | None = None,
) -> None:
"""
Initializes an experiment run by creating its metadata directory and saving the experiment config.
Expand Down Expand Up @@ -330,7 +338,12 @@ def __init__(
self._enable_goodbye_message = enable_goodbye_message

base_dir = str(base_dir or get_nemorun_home())
self._exp_dir = os.path.join(base_dir, "experiments", title, self._id)
if exp_dir_infix is None:
self._exp_dir_infix = os.path.join("experiments", title)
else:
self._exp_dir_infix = exp_dir_infix

self._exp_dir = os.path.join(base_dir, self._exp_dir_infix, self._id)

self.log_level = log_level
self._runner = get_runner(component_defaults=None, experiment=self)
Expand Down Expand Up @@ -359,6 +372,7 @@ def to_config(self) -> Config:
executor=self.executor.to_config(),
log_level=self.log_level,
clean_mode=self.clean_mode,
exp_dir_infix=self._exp_dir_infix,
)

def _save_experiment(self, exist_ok: bool = False):
Expand Down Expand Up @@ -997,7 +1011,7 @@ def reset(self) -> "Experiment":

old_id, old_exp_dir, old_launched = self._id, self._exp_dir, self._launched
self._id = f"{self._title}_{int(time.time())}"
self._exp_dir = os.path.join(get_nemorun_home(), "experiments", self._title, self._id)
self._exp_dir = os.path.join(get_nemorun_home(), self._exp_dir_infix, self._id)
self._launched = False
self._live_progress = None

Expand Down Expand Up @@ -1047,7 +1061,7 @@ def reset(self) -> "Experiment":
f"[bold magenta]Failed resetting Experiment {self._id} due to error: {e}"
)
# Double check exp dir is unchanged
new_path = os.path.join(get_nemorun_home(), "experiments", self._title, self._id)
new_path = os.path.join(get_nemorun_home(), self._exp_dir_infix, self._id)
if self._exp_dir == new_path and new_path != old_exp_dir:
shutil.rmtree(self._exp_dir)

Expand Down
Loading
Loading