diff --git a/nemo_run/cli/api.py b/nemo_run/cli/api.py index 06c55213..70181ee0 100644 --- a/nemo_run/cli/api.py +++ b/nemo_run/cli/api.py @@ -56,7 +56,13 @@ from nemo_run.cli import devspace as devspace_cli from nemo_run.cli import experiment as experiment_cli from nemo_run.cli.cli_parser import parse_cli_args, parse_factory -from nemo_run.config import NEMORUN_HOME, Config, Partial, get_type_namespace, get_underlying_types +from nemo_run.config import ( + Config, + Partial, + get_nemorun_home, + get_type_namespace, + get_underlying_types, +) from nemo_run.core.execution import LocalExecutor, SkypilotExecutor, SlurmExecutor from nemo_run.core.execution.base import Executor from nemo_run.core.frontend.console.styles import BOX_STYLE, TABLE_STYLES @@ -758,7 +764,7 @@ def _search_workspace_file() -> str | None: file_names = [ "workspace_private.py", "workspace.py", - os.path.join(NEMORUN_HOME, "workspace.py"), + os.path.join(get_nemorun_home(), "workspace.py"), ] while True: diff --git a/nemo_run/config.py b/nemo_run/config.py index e46dd9c4..ca538b45 100644 --- a/nemo_run/config.py +++ b/nemo_run/config.py @@ -44,12 +44,33 @@ _BuildableT = TypeVar("_BuildableT", bound=fdl.Buildable) RECURSIVE_TYPES = (typing.Union, typing.Optional) -NEMORUN_HOME = os.environ.get("NEMORUN_HOME", os.path.expanduser("~/.nemo_run")) +_NEMORUN_HOME = os.environ.get("NEMORUN_HOME", os.path.expanduser("~/.nemo_run")) RUNDIR_NAME = "nemo_run" RUNDIR_SPECIAL_NAME = "/$nemo_run" SCRIPTS_DIR = "scripts" +def get_nemorun_home() -> str: + """ + Get the current NEMORUN_HOME directory path. + + Returns: + The path to the NEMORUN_HOME directory. + """ + return _NEMORUN_HOME + + +def set_nemorun_home(path: str) -> None: + """ + Set the NEMORUN_HOME directory path. + + Args: + path: The new path for NEMORUN_HOME. + """ + global _NEMORUN_HOME + _NEMORUN_HOME = os.path.expanduser(path) + + def get_type_namespace(typ: Type | Callable) -> str: """ Get the namespace of a type or callable. 
diff --git a/nemo_run/core/execution/base.py b/nemo_run/core/execution/base.py index 6e853d6c..dfe0515e 100644 --- a/nemo_run/core/execution/base.py +++ b/nemo_run/core/execution/base.py @@ -24,7 +24,7 @@ from torchx.specs import Role from typing_extensions import Self -from nemo_run.config import NEMORUN_HOME, ConfigurableMixin +from nemo_run.config import ConfigurableMixin, get_nemorun_home from nemo_run.core.execution.launcher import LAUNCHER_MAP, Launcher from nemo_run.core.packaging.base import Packager @@ -201,17 +201,17 @@ def import_executor( Example: executor = import_executor("local", file_path="path/to/executors.py") - executor = import_executor("gpu") # Uses the default location of os.path.join(NEMORUN_HOME, "executors.py") + executor = import_executor("gpu") # Uses the default location of os.path.join(get_nemorun_home(), "executors.py") Args: name (str): The name of the executor to retrieve. file_path (Optional[str]): The path to the Python file containing the executor definitions. - Defaults to None, in which case the default location of os.path.join(NEMORUN_HOME, "executors.py") is used. + Defaults to None, in which case the default location of os.path.join(get_nemorun_home(), "executors.py") is used. The file_path is expected to be a string representing a file path with the following structure: - It should be a path to a Python file (with a .py extension). - The file should contain a dictionary named `EXECUTOR_MAP` that maps executor names to their corresponding instances. - - The file can be located anywhere in the file system, but if not provided, it defaults to `NEMORUN_HOME/executors.py`. + - The file can be located anywhere in the file system, but if not provided, it defaults to `executors.py` inside the directory returned by `get_nemorun_home()`. call (bool): If True, the value from the module is called with the rest of the given kwargs. 
Returns: @@ -219,7 +219,7 @@ def import_executor( """ if not file_path: - file_path = os.path.join(NEMORUN_HOME, "executors.py") + file_path = os.path.join(get_nemorun_home(), "executors.py") spec = importlib.util.spec_from_file_location("executors", file_path) assert spec diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 2c19f3f7..cbb5d0f8 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -234,7 +234,7 @@ def assign( == os.path.abspath(x["path"]), self.pvcs, ) - ), f"Need to specify atleast one PVC containing {self.job_dir}.\nTo update job dir to a PVC path, you can set the NEMORUN_HOME env var." + ), f"Need to specify atleast one PVC containing {self.job_dir}.\nTo update job dir to a PVC path, you can use set_nemorun_home() or the NEMORUN_HOME env var." def package(self, packager: Packager, job_name: str): assert self.experiment_id, "Executor not assigned to an experiment." diff --git a/nemo_run/core/execution/docker.py b/nemo_run/core/execution/docker.py index fcf15ab7..4cae70ff 100644 --- a/nemo_run/core/execution/docker.py +++ b/nemo_run/core/execution/docker.py @@ -31,7 +31,7 @@ parse_app_handle, ) -from nemo_run.config import NEMORUN_HOME, RUNDIR_NAME +from nemo_run.config import RUNDIR_NAME, get_nemorun_home from nemo_run.core.execution.base import Executor from nemo_run.core.packaging.base import Packager from nemo_run.core.packaging.git import GitArchivePackager @@ -50,7 +50,7 @@ fcntl = None FCNTL_AVAILABLE = False -DOCKER_JOB_DIRS = os.path.join(NEMORUN_HOME, ".docker_jobs.json") +DOCKER_JOB_DIRS = os.path.join(get_nemorun_home(), ".docker_jobs.json") NETWORK = "nemo_run" LABEL_EXPERIMENT_ID: str = "nemo-run/experiment-id" diff --git a/nemo_run/core/tunnel/client.py b/nemo_run/core/tunnel/client.py index 7abb31c1..316dbc78 100644 --- a/nemo_run/core/tunnel/client.py +++ b/nemo_run/core/tunnel/client.py @@ -32,12 +32,12 @@ from invoke.context import Context from 
invoke.runners import Result as RunResult -from nemo_run.config import NEMORUN_HOME, ConfigurableMixin +from nemo_run.config import ConfigurableMixin, get_nemorun_home from nemo_run.core.frontend.console.api import CONSOLE logger: logging.Logger = logging.getLogger(__name__) TUNNEL_DIR = ".tunnels" -TUNNEL_FILE_SUBPATH = os.path.join(NEMORUN_HOME, TUNNEL_DIR) +TUNNEL_FILE_SUBPATH = os.path.join(get_nemorun_home(), TUNNEL_DIR) def delete_tunnel_dir(file_path): @@ -136,7 +136,7 @@ def __post_init__(self): def _set_job_dir(self, experiment_id: str): experiment_title, _, _ = experiment_id.rpartition("_") - base_job_dir = self.job_dir or os.path.join(NEMORUN_HOME, "experiments") + base_job_dir = self.job_dir or os.path.join(get_nemorun_home(), "experiments") job_dir = os.path.join(base_job_dir, experiment_title, experiment_id) self.job_dir = job_dir diff --git a/nemo_run/run/experiment.py b/nemo_run/run/experiment.py index fe59e29f..34bb4961 100644 --- a/nemo_run/run/experiment.py +++ b/nemo_run/run/experiment.py @@ -41,11 +41,11 @@ import nemo_run as run from nemo_run.config import ( - NEMORUN_HOME, Config, ConfigurableMixin, Partial, Script, + get_nemorun_home, get_type_namespace, ) from nemo_run.core.execution.base import Executor @@ -101,7 +101,7 @@ class Experiment(ConfigurableMixin): The design is heavily inspired from `XManager `_. Under the hood, the Experiment metadata is stored in the local filesystem - inside a user specified directory controlled by NEMORUN_HOME env var. + inside a user specified directory given by get_nemorun_home() (NEMORUN_HOME env var or set_nemorun_home()). We will explore making the metadata more persistent in the future. .. note:: @@ -206,9 +206,9 @@ def catalog( title: str = "", ) -> list[str]: """ - List all experiments inside NEMORUN_HOME, optionally with the provided title. + List all experiments inside the directory returned by get_nemorun_home(), optionally with the provided title. 
""" - parent_dir = os.path.join(NEMORUN_HOME, "experiments", title) + parent_dir = os.path.join(get_nemorun_home(), "experiments", title) return _get_sorted_dirs(parent_dir) @classmethod @@ -247,7 +247,7 @@ def from_id( Reconstruct an experiment with the specified id. """ title, _, _ = id.rpartition("_") - parent_dir = os.path.join(NEMORUN_HOME, "experiments", title) + parent_dir = os.path.join(get_nemorun_home(), "experiments", title) exp_dir = os.path.join(parent_dir, id) assert os.path.isdir(exp_dir), f"Experiment {id} not found." @@ -263,7 +263,7 @@ def from_title( """ Reconstruct an experiment with the specified title. """ - parent_dir = os.path.join(NEMORUN_HOME, "experiments", title) + parent_dir = os.path.join(get_nemorun_home(), "experiments", title) exp_dir = _get_latest_dir(parent_dir) assert os.path.isdir(exp_dir), f"Experiment {id} not found." @@ -303,7 +303,7 @@ def __init__( self._title = title self._id = id or f"{title}_{int(time.time())}" - base_dir = base_dir or NEMORUN_HOME + base_dir = base_dir or get_nemorun_home() self._exp_dir = os.path.join(base_dir, "experiments", title, self._id) self.log_level = log_level @@ -967,7 +967,7 @@ def reset(self) -> "Experiment": old_id, old_exp_dir, old_launched = self._id, self._exp_dir, self._launched self._id = f"{self._title}_{int(time.time())}" - self._exp_dir = os.path.join(NEMORUN_HOME, "experiments", self._title, self._id) + self._exp_dir = os.path.join(get_nemorun_home(), "experiments", self._title, self._id) self._launched = False self._live_progress = None @@ -1017,7 +1017,7 @@ def reset(self) -> "Experiment": f"[bold magenta]Failed resetting Experiment {self._id} due to error: {e}" ) # Double check exp dir is unchanged - new_path = os.path.join(NEMORUN_HOME, "experiments", self._title, self._id) + new_path = os.path.join(get_nemorun_home(), "experiments", self._title, self._id) if self._exp_dir == new_path and new_path != old_exp_dir: shutil.rmtree(self._exp_dir) diff --git 
a/nemo_run/run/torchx_backend/schedulers/dgxcloud.py b/nemo_run/run/torchx_backend/schedulers/dgxcloud.py index 954d8ab8..4aca4a03 100644 --- a/nemo_run/run/torchx_backend/schedulers/dgxcloud.py +++ b/nemo_run/run/torchx_backend/schedulers/dgxcloud.py @@ -7,6 +7,8 @@ from pathlib import Path from typing import Any, Optional +import fiddle as fdl +import fiddle._src.experimental.dataclasses as fdl_dc from torchx.schedulers.api import ( AppDryRunInfo, DescribeAppResponse, @@ -22,16 +24,14 @@ runopts, ) -import fiddle as fdl -import fiddle._src.experimental.dataclasses as fdl_dc -from nemo_run.config import NEMORUN_HOME +from nemo_run.config import get_nemorun_home from nemo_run.core.execution.base import Executor from nemo_run.core.execution.dgxcloud import DGXCloudExecutor, DGXCloudState from nemo_run.core.serialization.zlib_json import ZlibJSONSerializer from nemo_run.run.torchx_backend.schedulers.api import SchedulerMixin # Local placeholder for storing DGX job states -DGX_JOB_DIRS = os.path.join(NEMORUN_HOME, ".dgx_jobs.json") +DGX_JOB_DIRS = os.path.join(get_nemorun_home(), ".dgx_jobs.json") # Example mapping from some DGX statuses to the TorchX AppState DGX_STATES: dict[DGXCloudState, AppState] = { diff --git a/nemo_run/run/torchx_backend/schedulers/local.py b/nemo_run/run/torchx_backend/schedulers/local.py index 8fe9404d..8d45839f 100644 --- a/nemo_run/run/torchx_backend/schedulers/local.py +++ b/nemo_run/run/torchx_backend/schedulers/local.py @@ -46,7 +46,7 @@ ) from torchx.specs.api import AppDef, AppState, Role -from nemo_run.config import NEMORUN_HOME +from nemo_run.config import get_nemorun_home from nemo_run.core.execution.base import Executor from nemo_run.core.execution.local import LocalExecutor from nemo_run.run.torchx_backend.schedulers.api import SchedulerMixin @@ -59,7 +59,7 @@ fcntl = None FCNTL_AVAILABLE = False -LOCAL_JOB_DIRS = os.path.join(NEMORUN_HOME, ".local_jobs.json") +LOCAL_JOB_DIRS = os.path.join(get_nemorun_home(), ".local_jobs.json") 
class PersistentLocalScheduler(SchedulerMixin, LocalScheduler): # type: ignore diff --git a/nemo_run/run/torchx_backend/schedulers/skypilot.py b/nemo_run/run/torchx_backend/schedulers/skypilot.py index f3c0f3ca..bd2ac9ed 100644 --- a/nemo_run/run/torchx_backend/schedulers/skypilot.py +++ b/nemo_run/run/torchx_backend/schedulers/skypilot.py @@ -42,7 +42,7 @@ runopts, ) -from nemo_run.config import NEMORUN_HOME +from nemo_run.config import get_nemorun_home from nemo_run.core.execution.base import Executor from nemo_run.core.execution.skypilot import _SKYPILOT_AVAILABLE, SkypilotExecutor from nemo_run.run.torchx_backend.schedulers.api import SchedulerMixin @@ -74,7 +74,7 @@ ... log: logging.Logger = logging.getLogger(__name__) -SKYPILOT_JOB_DIRS = os.path.join(NEMORUN_HOME, ".skypilot_jobs.json") +SKYPILOT_JOB_DIRS = os.path.join(get_nemorun_home(), ".skypilot_jobs.json") @dataclass diff --git a/nemo_run/run/torchx_backend/schedulers/slurm.py b/nemo_run/run/torchx_backend/schedulers/slurm.py index c0eafb51..686622de 100644 --- a/nemo_run/run/torchx_backend/schedulers/slurm.py +++ b/nemo_run/run/torchx_backend/schedulers/slurm.py @@ -50,7 +50,7 @@ ) from torchx.specs.api import is_terminal -from nemo_run.config import NEMORUN_HOME, from_dict +from nemo_run.config import from_dict, get_nemorun_home from nemo_run.core.execution.base import Executor from nemo_run.core.execution.slurm import SlurmBatchRequest, SlurmExecutor, SlurmJobDetails from nemo_run.core.tunnel.client import LocalTunnel, SSHTunnel, Tunnel @@ -58,7 +58,7 @@ from nemo_run.run.torchx_backend.schedulers.api import SchedulerMixin log: logging.Logger = logging.getLogger(__name__) -SLURM_JOB_DIRS = os.path.join(NEMORUN_HOME, ".slurm_jobs") +SLURM_JOB_DIRS = os.path.join(get_nemorun_home(), ".slurm_jobs") class SlurmTunnelScheduler(SchedulerMixin, SlurmScheduler): # type: ignore diff --git a/test/run/torchx_backend/test_api.py b/test/run/torchx_backend/test_api.py index b0aa7893..167d1a02 100644 --- 
a/test/run/torchx_backend/test_api.py +++ b/test/run/torchx_backend/test_api.py @@ -14,13 +14,14 @@ # limitations under the License. import time -from test.conftest import MockContext from unittest.mock import ANY, patch import pytest -from nemo_run.config import Partial, Script + +from nemo_run.config import Partial, Script, set_nemorun_home from nemo_run.core.execution.local import LocalExecutor from nemo_run.run.api import run +from test.conftest import MockContext class MockExecutor: @@ -67,8 +68,8 @@ def test_run_dryrun(dummy_partial: Partial, capsys): assert "Dry run for task torchx_backend.test_api:dummy_add" in stdout -def test_run_dryrun_with_executor(mocker, dummy_partial: Partial, capsys, tmpdir): - mocker.patch("nemo_run.run.experiment.NEMORUN_HOME", str(tmpdir)) +def test_run_dryrun_with_executor(dummy_partial: Partial, capsys, tmpdir): + set_nemorun_home(str(tmpdir)) run(dummy_partial, executor=LocalExecutor(), dryrun=True) stdout = capsys.readouterr().out assert "Entering Experiment torchx_backend.test_api.dummy_add with id" in stdout @@ -87,10 +88,9 @@ def test_run_script(capsys): @patch("builtins.print") def test_run_with_executor( mocked_print, - mocker, tmpdir, ): - mocker.patch("nemo_run.run.experiment.NEMORUN_HOME", str(tmpdir)) + set_nemorun_home(str(tmpdir)) script = Script(inline="echo 'Hello World Mock Test'") run(script, executor=LocalExecutor(), detach=False, tail_logs=True)