Renamed WorkerKilledError and AgentKilledError to WorkerFailedError and AgentFailedError, and moved both into torchrunx/errors.py

apoorvkh · apoorvkh · commit 10fa1a00d56e · 2024-10-27T14:41:13.000-04:00
diff --git a/src/torchrunx/__init__.py b/src/torchrunx/__init__.py
@@ -1,8 +1,10 @@
-from .launcher import AgentKilledError, Launcher, LaunchResult, launch
+from .errors import AgentFailedError, WorkerFailedError
+from .launcher import Launcher, LaunchResult, launch
 from .logging_utils import add_filter_to_handler, file_handler, stream_handler
 
 __all__ = [
-    "AgentKilledError",
+    "AgentFailedError",
+    "WorkerFailedError",
     "Launcher",
     "launch",
     "LaunchResult",
diff --git a/src/torchrunx/errors.py b/src/torchrunx/errors.py
@@ -0,0 +1,5 @@
+class AgentFailedError(Exception):
+    pass
+
+class WorkerFailedError(Exception):
+    pass
diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py
@@ -21,21 +21,17 @@
 import torch.distributed as dist
 
 from .environment import auto_hosts, auto_workers, slurm_hosts, slurm_workers
+from .errors import AgentFailedError, WorkerFailedError
 from .logging_utils import LogRecordSocketReceiver, default_handlers
 from .utils import (
     AgentStatus,
     ExceptionFromWorker,
     LauncherAgentGroup,
     LauncherPayload,
-    WorkerKilledError,
     get_open_port,
 )
 
 
-class AgentKilledError(Exception):
-    pass
-
-
 @dataclass
 class Launcher:
     hostnames: list[str] | Literal["auto", "slurm"] = "auto"
@@ -152,14 +148,14 @@ def run(  # noqa: C901, PLR0912
                     agent_statuses = launcher_agent_group.sync_agent_statuses(status=None)
                 except RuntimeError as e:
                     # occurs if any agent dies and communication times out
-                    raise AgentKilledError from e
+                    raise AgentFailedError from e
 
                 # raises specific exception if any agent fails
                 for s in agent_statuses:
                     for value in s.return_values:
                         if isinstance(value, ExceptionFromWorker):
                             raise value.exception
-                        if isinstance(value, WorkerKilledError):
+                        if isinstance(value, WorkerFailedError):
                             raise value
 
                 if all(s.state == "done" for s in agent_statuses):
diff --git a/src/torchrunx/utils.py b/src/torchrunx/utils.py
@@ -10,6 +10,8 @@
 import torch.distributed as dist
 from typing_extensions import Self
 
+from .errors import WorkerFailedError
+
 if TYPE_CHECKING:
     from torch.distributed.elastic.multiprocessing.api import RunProcsResult
 
@@ -94,11 +96,6 @@ class ExceptionFromWorker:
     exception: Exception
 
 
-@dataclass
-class WorkerKilledError(Exception):
-    failure: str
-
-
 @dataclass
 class AgentStatus:
     state: Literal["running", "failed", "done"]
@@ -111,7 +108,7 @@ def from_result(cls, result: RunProcsResult | None) -> Self:
         if result is None:
             return cls(state="running")
         for local_rank, failure in result.failures.items():
-            result.return_values[local_rank] = WorkerKilledError(failure.message)
+            result.return_values[local_rank] = WorkerFailedError(failure.message)
         return_values = list(result.return_values.values())
         failed = any(isinstance(v, ExceptionFromWorker) for v in return_values)
         state = "failed" if failed else "done"

-Original file line number
+Diff line change
@@ -0,0 +1,5 @@
+class AgentFailedError(Exception):
+    pass
+
+class WorkerFailedError(Exception):
+    pass