Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions nemo_run/cli/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,13 @@
from nemo_run.cli import devspace as devspace_cli
from nemo_run.cli import experiment as experiment_cli
from nemo_run.cli.cli_parser import parse_cli_args, parse_factory
from nemo_run.config import NEMORUN_HOME, Config, Partial, get_type_namespace, get_underlying_types
from nemo_run.config import (
Config,
Partial,
get_nemorun_home,
get_type_namespace,
get_underlying_types,
)
from nemo_run.core.execution import LocalExecutor, SkypilotExecutor, SlurmExecutor
from nemo_run.core.execution.base import Executor
from nemo_run.core.frontend.console.styles import BOX_STYLE, TABLE_STYLES
Expand Down Expand Up @@ -758,7 +764,7 @@ def _search_workspace_file() -> str | None:
file_names = [
"workspace_private.py",
"workspace.py",
os.path.join(NEMORUN_HOME, "workspace.py"),
os.path.join(get_nemorun_home(), "workspace.py"),
]

while True:
Expand Down
23 changes: 22 additions & 1 deletion nemo_run/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,33 @@
_BuildableT = TypeVar("_BuildableT", bound=fdl.Buildable)

RECURSIVE_TYPES = (typing.Union, typing.Optional)
NEMORUN_HOME = os.environ.get("NEMORUN_HOME", os.path.expanduser("~/.nemo_run"))
_NEMORUN_HOME = os.environ.get("NEMORUN_HOME", os.path.expanduser("~/.nemo_run"))
RUNDIR_NAME = "nemo_run"
RUNDIR_SPECIAL_NAME = "/$nemo_run"
SCRIPTS_DIR = "scripts"


def get_nemorun_home() -> str:
    """Return the directory currently configured as the NeMo-Run home.

    The value is read from the module-level ``_NEMORUN_HOME`` variable each
    time this function is called, so a prior ``set_nemorun_home()`` call is
    reflected in the result.

    Returns:
        The NEMORUN_HOME directory path as a string.
    """
    current_home = _NEMORUN_HOME
    return current_home


def set_nemorun_home(path: str) -> None:
    """Override the NEMORUN_HOME directory for the current process.

    A leading ``~`` in ``path`` is expanded with ``os.path.expanduser``
    before the value is stored in the module-level ``_NEMORUN_HOME``
    variable.

    Args:
        path: The new NEMORUN_HOME location.
    """
    global _NEMORUN_HOME
    expanded_home = os.path.expanduser(path)
    _NEMORUN_HOME = expanded_home


def get_type_namespace(typ: Type | Callable) -> str:
"""
Get the namespace of a type or callable.
Expand Down
10 changes: 5 additions & 5 deletions nemo_run/core/execution/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from torchx.specs import Role
from typing_extensions import Self

from nemo_run.config import NEMORUN_HOME, ConfigurableMixin
from nemo_run.config import ConfigurableMixin, get_nemorun_home
from nemo_run.core.execution.launcher import LAUNCHER_MAP, Launcher
from nemo_run.core.packaging.base import Packager

Expand Down Expand Up @@ -201,25 +201,25 @@ def import_executor(

Example:
executor = import_executor("local", file_path="path/to/executors.py")
executor = import_executor("gpu") # Uses the default location of os.path.join(NEMORUN_HOME, "executors.py")
executor = import_executor("gpu") # Uses the default location of os.path.join(get_nemorun_home(), "executors.py")

Args:
name (str): The name of the executor to retrieve.
file_path (Optional[str]): The path to the Python file containing the executor definitions.
Defaults to None, in which case the default location of os.path.join(NEMORUN_HOME, "executors.py") is used.
Defaults to None, in which case the default location of os.path.join(get_nemorun_home(), "executors.py") is used.

The file_path is expected to be a string representing a file path with the following structure:
- It should be a path to a Python file (with a .py extension).
- The file should contain a dictionary named `EXECUTOR_MAP` that maps executor names to their corresponding instances.
- The file can be located anywhere in the file system, but if not provided, it defaults to `NEMORUN_HOME/executors.py`.
- The file can be located anywhere in the file system, but if not provided, it defaults to `os.path.join(get_nemorun_home(), "executors.py")`.
call (bool): If True, the value from the module is called with the rest of the given kwargs.

Returns:
Executor: The executor instance corresponding to the given name.
"""

if not file_path:
file_path = os.path.join(NEMORUN_HOME, "executors.py")
file_path = os.path.join(get_nemorun_home(), "executors.py")

spec = importlib.util.spec_from_file_location("executors", file_path)
assert spec
Expand Down
2 changes: 1 addition & 1 deletion nemo_run/core/execution/dgxcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def assign(
== os.path.abspath(x["path"]),
self.pvcs,
)
), f"Need to specify atleast one PVC containing {self.job_dir}.\nTo update job dir to a PVC path, you can set the NEMORUN_HOME env var."
), f"Need to specify atleast one PVC containing {self.job_dir}.\nTo update job dir to a PVC path, you can use set_nemorun_home() or the NEMORUN_HOME env var."

def package(self, packager: Packager, job_name: str):
assert self.experiment_id, "Executor not assigned to an experiment."
Expand Down
4 changes: 2 additions & 2 deletions nemo_run/core/execution/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
parse_app_handle,
)

from nemo_run.config import NEMORUN_HOME, RUNDIR_NAME
from nemo_run.config import RUNDIR_NAME, get_nemorun_home
from nemo_run.core.execution.base import Executor
from nemo_run.core.packaging.base import Packager
from nemo_run.core.packaging.git import GitArchivePackager
Expand All @@ -50,7 +50,7 @@
fcntl = None
FCNTL_AVAILABLE = False

DOCKER_JOB_DIRS = os.path.join(NEMORUN_HOME, ".docker_jobs.json")
DOCKER_JOB_DIRS = os.path.join(get_nemorun_home(), ".docker_jobs.json")
NETWORK = "nemo_run"

LABEL_EXPERIMENT_ID: str = "nemo-run/experiment-id"
Expand Down
6 changes: 3 additions & 3 deletions nemo_run/core/tunnel/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@
from invoke.context import Context
from invoke.runners import Result as RunResult

from nemo_run.config import NEMORUN_HOME, ConfigurableMixin
from nemo_run.config import ConfigurableMixin, get_nemorun_home
from nemo_run.core.frontend.console.api import CONSOLE

logger: logging.Logger = logging.getLogger(__name__)
TUNNEL_DIR = ".tunnels"
TUNNEL_FILE_SUBPATH = os.path.join(NEMORUN_HOME, TUNNEL_DIR)
TUNNEL_FILE_SUBPATH = os.path.join(get_nemorun_home(), TUNNEL_DIR)


def delete_tunnel_dir(file_path):
Expand Down Expand Up @@ -136,7 +136,7 @@ def __post_init__(self):

def _set_job_dir(self, experiment_id: str):
experiment_title, _, _ = experiment_id.rpartition("_")
base_job_dir = self.job_dir or os.path.join(NEMORUN_HOME, "experiments")
base_job_dir = self.job_dir or os.path.join(get_nemorun_home(), "experiments")
job_dir = os.path.join(base_job_dir, experiment_title, experiment_id)
self.job_dir = job_dir

Expand Down
18 changes: 9 additions & 9 deletions nemo_run/run/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@

import nemo_run as run
from nemo_run.config import (
NEMORUN_HOME,
Config,
ConfigurableMixin,
Partial,
Script,
get_nemorun_home,
get_type_namespace,
)
from nemo_run.core.execution.base import Executor
Expand Down Expand Up @@ -101,7 +101,7 @@ class Experiment(ConfigurableMixin):
The design is heavily inspired from `XManager <https://github.com/google-deepmind/xmanager/blob/main/docs/xm_launch_api_principles.md>`_.

Under the hood, the Experiment metadata is stored in the local filesystem
inside a user specified directory controlled by NEMORUN_HOME env var.
inside a user specified directory returned by get_nemorun_home(), which is configured via the NEMORUN_HOME env var or set_nemorun_home().
We will explore making the metadata more persistent in the future.

.. note::
Expand Down Expand Up @@ -206,9 +206,9 @@ def catalog(
title: str = "",
) -> list[str]:
"""
List all experiments inside NEMORUN_HOME, optionally with the provided title.
List all experiments inside get_nemorun_home(), optionally with the provided title.
"""
parent_dir = os.path.join(NEMORUN_HOME, "experiments", title)
parent_dir = os.path.join(get_nemorun_home(), "experiments", title)
return _get_sorted_dirs(parent_dir)

@classmethod
Expand Down Expand Up @@ -247,7 +247,7 @@ def from_id(
Reconstruct an experiment with the specified id.
"""
title, _, _ = id.rpartition("_")
parent_dir = os.path.join(NEMORUN_HOME, "experiments", title)
parent_dir = os.path.join(get_nemorun_home(), "experiments", title)
exp_dir = os.path.join(parent_dir, id)

assert os.path.isdir(exp_dir), f"Experiment {id} not found."
Expand All @@ -263,7 +263,7 @@ def from_title(
"""
Reconstruct an experiment with the specified title.
"""
parent_dir = os.path.join(NEMORUN_HOME, "experiments", title)
parent_dir = os.path.join(get_nemorun_home(), "experiments", title)
exp_dir = _get_latest_dir(parent_dir)

assert os.path.isdir(exp_dir), f"Experiment {id} not found."
Expand Down Expand Up @@ -303,7 +303,7 @@ def __init__(
self._title = title
self._id = id or f"{title}_{int(time.time())}"

base_dir = base_dir or NEMORUN_HOME
base_dir = base_dir or get_nemorun_home()
self._exp_dir = os.path.join(base_dir, "experiments", title, self._id)

self.log_level = log_level
Expand Down Expand Up @@ -967,7 +967,7 @@ def reset(self) -> "Experiment":

old_id, old_exp_dir, old_launched = self._id, self._exp_dir, self._launched
self._id = f"{self._title}_{int(time.time())}"
self._exp_dir = os.path.join(NEMORUN_HOME, "experiments", self._title, self._id)
self._exp_dir = os.path.join(get_nemorun_home(), "experiments", self._title, self._id)
self._launched = False
self._live_progress = None

Expand Down Expand Up @@ -1017,7 +1017,7 @@ def reset(self) -> "Experiment":
f"[bold magenta]Failed resetting Experiment {self._id} due to error: {e}"
)
# Double check exp dir is unchanged
new_path = os.path.join(NEMORUN_HOME, "experiments", self._title, self._id)
new_path = os.path.join(get_nemorun_home(), "experiments", self._title, self._id)
if self._exp_dir == new_path and new_path != old_exp_dir:
shutil.rmtree(self._exp_dir)

Expand Down
8 changes: 4 additions & 4 deletions nemo_run/run/torchx_backend/schedulers/dgxcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from pathlib import Path
from typing import Any, Optional

import fiddle as fdl
import fiddle._src.experimental.dataclasses as fdl_dc
from torchx.schedulers.api import (
AppDryRunInfo,
DescribeAppResponse,
Expand All @@ -22,16 +24,14 @@
runopts,
)

import fiddle as fdl
import fiddle._src.experimental.dataclasses as fdl_dc
from nemo_run.config import NEMORUN_HOME
from nemo_run.config import get_nemorun_home
from nemo_run.core.execution.base import Executor
from nemo_run.core.execution.dgxcloud import DGXCloudExecutor, DGXCloudState
from nemo_run.core.serialization.zlib_json import ZlibJSONSerializer
from nemo_run.run.torchx_backend.schedulers.api import SchedulerMixin

# Local placeholder for storing DGX job states
DGX_JOB_DIRS = os.path.join(NEMORUN_HOME, ".dgx_jobs.json")
DGX_JOB_DIRS = os.path.join(get_nemorun_home(), ".dgx_jobs.json")

# Example mapping from some DGX statuses to the TorchX AppState
DGX_STATES: dict[DGXCloudState, AppState] = {
Expand Down
4 changes: 2 additions & 2 deletions nemo_run/run/torchx_backend/schedulers/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
)
from torchx.specs.api import AppDef, AppState, Role

from nemo_run.config import NEMORUN_HOME
from nemo_run.config import get_nemorun_home
from nemo_run.core.execution.base import Executor
from nemo_run.core.execution.local import LocalExecutor
from nemo_run.run.torchx_backend.schedulers.api import SchedulerMixin
Expand All @@ -59,7 +59,7 @@
fcntl = None
FCNTL_AVAILABLE = False

LOCAL_JOB_DIRS = os.path.join(NEMORUN_HOME, ".local_jobs.json")
LOCAL_JOB_DIRS = os.path.join(get_nemorun_home(), ".local_jobs.json")


class PersistentLocalScheduler(SchedulerMixin, LocalScheduler): # type: ignore
Expand Down
4 changes: 2 additions & 2 deletions nemo_run/run/torchx_backend/schedulers/skypilot.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
runopts,
)

from nemo_run.config import NEMORUN_HOME
from nemo_run.config import get_nemorun_home
from nemo_run.core.execution.base import Executor
from nemo_run.core.execution.skypilot import _SKYPILOT_AVAILABLE, SkypilotExecutor
from nemo_run.run.torchx_backend.schedulers.api import SchedulerMixin
Expand Down Expand Up @@ -74,7 +74,7 @@
...

log: logging.Logger = logging.getLogger(__name__)
SKYPILOT_JOB_DIRS = os.path.join(NEMORUN_HOME, ".skypilot_jobs.json")
SKYPILOT_JOB_DIRS = os.path.join(get_nemorun_home(), ".skypilot_jobs.json")


@dataclass
Expand Down
4 changes: 2 additions & 2 deletions nemo_run/run/torchx_backend/schedulers/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,15 @@
)
from torchx.specs.api import is_terminal

from nemo_run.config import NEMORUN_HOME, from_dict
from nemo_run.config import from_dict, get_nemorun_home
from nemo_run.core.execution.base import Executor
from nemo_run.core.execution.slurm import SlurmBatchRequest, SlurmExecutor, SlurmJobDetails
from nemo_run.core.tunnel.client import LocalTunnel, SSHTunnel, Tunnel
from nemo_run.run import experiment as run_experiment
from nemo_run.run.torchx_backend.schedulers.api import SchedulerMixin

log: logging.Logger = logging.getLogger(__name__)
SLURM_JOB_DIRS = os.path.join(NEMORUN_HOME, ".slurm_jobs")
SLURM_JOB_DIRS = os.path.join(get_nemorun_home(), ".slurm_jobs")


class SlurmTunnelScheduler(SchedulerMixin, SlurmScheduler): # type: ignore
Expand Down
12 changes: 6 additions & 6 deletions test/run/torchx_backend/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,14 @@
# limitations under the License.

import time
from test.conftest import MockContext
from unittest.mock import ANY, patch

import pytest
from nemo_run.config import Partial, Script

from nemo_run.config import Partial, Script, set_nemorun_home
from nemo_run.core.execution.local import LocalExecutor
from nemo_run.run.api import run
from test.conftest import MockContext


class MockExecutor:
Expand Down Expand Up @@ -67,8 +68,8 @@ def test_run_dryrun(dummy_partial: Partial, capsys):
assert "Dry run for task torchx_backend.test_api:dummy_add" in stdout


def test_run_dryrun_with_executor(mocker, dummy_partial: Partial, capsys, tmpdir):
mocker.patch("nemo_run.run.experiment.NEMORUN_HOME", str(tmpdir))
def test_run_dryrun_with_executor(dummy_partial: Partial, capsys, tmpdir):
set_nemorun_home(str(tmpdir))
run(dummy_partial, executor=LocalExecutor(), dryrun=True)
stdout = capsys.readouterr().out
assert "Entering Experiment torchx_backend.test_api.dummy_add with id" in stdout
Expand All @@ -87,10 +88,9 @@ def test_run_script(capsys):
@patch("builtins.print")
def test_run_with_executor(
mocked_print,
mocker,
tmpdir,
):
mocker.patch("nemo_run.run.experiment.NEMORUN_HOME", str(tmpdir))
set_nemorun_home(str(tmpdir))
script = Script(inline="echo 'Hello World Mock Test'")

run(script, executor=LocalExecutor(), detach=False, tail_logs=True)
Expand Down
Loading