
Commit c0baede

Moved failed errors to utils; raising AgentFailedError in _all_gather
1 parent: 10fa1a0

File tree

    src/torchrunx/__init__.py
    src/torchrunx/errors.py
    src/torchrunx/launcher.py
    src/torchrunx/utils.py

4 files changed: +20 -19 lines


src/torchrunx/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
-from .errors import AgentFailedError, WorkerFailedError
 from .launcher import Launcher, LaunchResult, launch
 from .logging_utils import add_filter_to_handler, file_handler, stream_handler
+from .utils import AgentFailedError, WorkerFailedError
 
 __all__ = [
     "AgentFailedError",

src/torchrunx/errors.py

Lines changed: 0 additions & 5 deletions
This file was deleted; per the commit message, the AgentFailedError and WorkerFailedError definitions it held now live in src/torchrunx/utils.py.

src/torchrunx/launcher.py

Lines changed: 3 additions & 6 deletions

@@ -21,13 +21,13 @@
 import torch.distributed as dist
 
 from .environment import auto_hosts, auto_workers, slurm_hosts, slurm_workers
-from .errors import AgentFailedError, WorkerFailedError
 from .logging_utils import LogRecordSocketReceiver, default_handlers
 from .utils import (
     AgentStatus,
     ExceptionFromWorker,
     LauncherAgentGroup,
     LauncherPayload,
+    WorkerFailedError,
     get_open_port,
 )
 
@@ -144,11 +144,8 @@ def run( # noqa: C901, PLR0912
         # loop to monitor agent statuses (until failed or done)
 
         while True:
-            try:
-                agent_statuses = launcher_agent_group.sync_agent_statuses(status=None)
-            except RuntimeError as e:
-                # occurs if any agent dies and communication times out
-                raise AgentFailedError from e
+            # could raise AgentFailedError
+            agent_statuses = launcher_agent_group.sync_agent_statuses(status=None)
 
             # raises specific exception if any agent fails
             for s in agent_statuses:
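
With the try/except hoisted out of the monitoring loop, sync_agent_statuses now propagates AgentFailedError directly from _all_gather (see utils.py below), and the loop that follows can stay focused on per-agent worker failures. A caller-side sketch of telling the two error types apart; the launch(...) arguments shown are illustrative assumptions, not something this diff defines:

    # Hypothetical caller-side handling of the two error types.
    import torchrunx
    from torchrunx import AgentFailedError, WorkerFailedError

    def train() -> None:
        """Placeholder entry point to run on every worker."""

    try:
        # launch() drives the monitoring loop shown above.
        torchrunx.launch(func=train, hostnames=["node1", "node2"], workers_per_host=2)
    except AgentFailedError:
        print("an agent process died and communication timed out")
    except WorkerFailedError:
        print("a worker process failed")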

src/torchrunx/utils.py

Lines changed: 16 additions & 7 deletions

@@ -10,8 +10,6 @@
 import torch.distributed as dist
 from typing_extensions import Self
 
-from .errors import WorkerFailedError
-
 if TYPE_CHECKING:
     from torch.distributed.elastic.multiprocessing.api import RunProcsResult
 
@@ -22,6 +20,13 @@ def get_open_port() -> int:
         return s.getsockname()[1]
 
 
+class AgentFailedError(Exception):
+    pass
+
+class WorkerFailedError(Exception):
+    pass
+
+
 @dataclass
 class LauncherAgentGroup:
     launcher_hostname: str
 
@@ -52,11 +57,15 @@ def _deserialize(self, serialized: bytes) -> Any:
 
     def _all_gather(self, obj: Any) -> list:
         """gather object from every rank to list on every rank"""
-        object_bytes = self._serialize(obj)
-        object_list = [b""] * self.world_size
-        # raises RuntimeError if timeout
-        dist.all_gather_object(object_list=object_list, obj=object_bytes, group=self.group)
-        return [self._deserialize(o) for o in object_list]
+        try:
+            object_bytes = self._serialize(obj)
+            object_list = [b""] * self.world_size
+            # raises RuntimeError if timeout
+            dist.all_gather_object(object_list=object_list, obj=object_bytes, group=self.group)
+            return [self._deserialize(o) for o in object_list]
+        except RuntimeError as e:
+            # occurs if launcher or any agent dies and communication times out
+            raise AgentFailedError from e
 
     def sync_payloads(
         self,
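
The raise AgentFailedError from e pattern preserves the low-level RuntimeError as __cause__, so the original collective-timeout traceback still prints beneath the domain-specific error. A self-contained sketch of the chaining pattern (plain Python, no torch dependency; names are illustrative):

    class AgentFailedError(Exception):
        """Raised when an agent dies and collective communication times out."""

    def all_gather_stub() -> None:
        # Stand-in for dist.all_gather_object() hitting its timeout.
        raise RuntimeError("Socket Timeout")

    try:
        try:
            all_gather_stub()
        except RuntimeError as e:
            # Chain the transport-level error onto the domain-specific one.
            raise AgentFailedError from e
    except AgentFailedError as err:
        assert isinstance(err.__cause__, RuntimeError)  # original cause preserved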
