small adjustments to logging messages

apoorvkh · apoorvkh · commit 41d97597fc88 · 2025-03-02T01:49:48.000-05:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -54,6 +54,7 @@ ignore = [
   "S607",    # bandit: subprocess
   "COM812",
   "ISC001",  # conflict with formatter
+  "G004"  # f-string in logging
 ]
 [tool.ruff.lint.per-file-ignores]
 "tests/**/*.py" = [
diff --git a/src/torchrunx/agent.py b/src/torchrunx/agent.py
@@ -47,7 +47,8 @@ def main(
         logger_port: Port for the logging server.
         hostname: Hostname of this agent.
     """
-    # Stream logs to logging server
+    # Setup logging & stream logs to server
+
     logger = logging.getLogger(f"{__package__}.{hostname}")
 
     log_records_to_socket(
@@ -56,10 +57,6 @@ def main(
 
     redirect_stdio_to_logger(logger)
 
-    logger.debug("Agent logging setup.")
-
-    # Set up launcher-agent group
-
     logger.debug("Initializing launcher-agent group.")
 
     launcher_agent_group = LauncherAgentGroup(
@@ -71,9 +68,7 @@ def main(
 
     agent_rank = launcher_agent_group.rank - 1
 
-    # Communicate initial payloads between launcher/agents
-
-    logger.debug("Sending agent details to launcher.")
+    logger.debug("Synchronizing launcher and agents.")
 
     payload = AgentPayload(
         hostname=socket.getfqdn(),
@@ -88,9 +83,7 @@ def main(
     worker_global_ranks = launcher_payload.worker_global_ranks[agent_rank]
     num_workers = len(worker_global_ranks)
 
-    # Spawn worker processes
-
-    logger.debug("Launching worker processes.")
+    logger.info(f"Starting {num_workers} worker processes.")
 
     ctx = dist_mp.start_processes(
         name=f"{hostname}_",
@@ -128,6 +121,8 @@ def main(
     # Monitor and communicate agent statuses
     # Terminate gracefully upon failure
 
+    logger.debug("Entering worker monitoring and agent communication loop.")
+
     try:
         status = None
         while True:
@@ -141,12 +136,12 @@ def main(
             all_done = all(s.state == "done" for s in agent_statuses)
             any_failed = any(s.state == "failed" for s in agent_statuses)
             if all_done or any_failed:
-                logger.debug("Workers exiting %s.", "cleanly" if not any_failed else "with errors")
+                logger.info(f"Workers exited {'with' if any_failed else 'without'} errors.")
                 break
     finally:
         ctx.close()
         sys.stdout.flush()
         sys.stderr.flush()
         launcher_agent_group.shutdown()
 
-    logger.debug("Agent exiting.")
+    logger.debug("Terminating agent process.")
diff --git a/src/torchrunx/launcher.py b/src/torchrunx/launcher.py
@@ -108,8 +108,9 @@ def run(  # noqa: C901, PLR0912, PLR0915
             msg = "The torch.distributed package is not available."
             raise RuntimeError(msg)
 
+        logger.debug("Preparing launch environment.")
+
         ###
-        logger.debug("Resolving environment.")
 
         hostnames, workers_per_host = resolve_environment(
             self.hostnames, self.workers_per_host, ssh_config_file=self.ssh_config_file
@@ -183,11 +184,11 @@ def handler_factory() -> list[logging.Handler]:
 
             log_process.start()
 
-            logger.debug("Launching agents.")
-
             # Start agents on each node
 
             for i, hostname in enumerate(hostnames):
+                logger.info(f'Launching "{func.__name__}" on {hostname}.')
+
                 execute_command(
                     command=build_launch_command(
                         launcher_hostname=launcher_hostname,
@@ -215,16 +216,15 @@ def handler_factory() -> list[logging.Handler]:
                 rank=0,
             )
 
-            logger.debug("Receiving agent details.")
-
             # Sync initial payloads between launcher and agents
 
+            logger.debug("Synchronizing launcher and agents.")
             launcher_payload, agent_payloads = launcher_agent_group.sync_payloads(payload=payload)
 
-            logger.debug("Entering agent monitoring loop.")
-
             # Monitor agent statuses (until failed or done)
 
+            logger.debug("Entering agent monitoring loop.")
+
             while True:
                 # could raise AgentFailedError
                 agent_statuses = launcher_agent_group.sync_agent_statuses(status=None)
@@ -238,17 +238,10 @@ def handler_factory() -> list[logging.Handler]:
                             raise v
 
                 if all(s.state == "done" for s in agent_statuses):
-                    logger.debug("All workers exited cleanly.")
+                    logger.info("All workers completed successfully.")
                     return_values: list[list[FunctionR]] = [s.return_values for s in agent_statuses]  # pyright: ignore [reportAssignmentType]
                     return LaunchResult.from_returns(hostnames, return_values)
         finally:
-            logger.debug("Stopping logging server.")
-
-            if stop_logging_event is not None:
-                stop_logging_event.set()
-            if log_process is not None:
-                log_process.kill()
-
             # cleanup: SIGTERM all agents
             if agent_payloads is not None:
                 for agent_payload, agent_hostname in zip(agent_payloads, hostnames):
@@ -264,6 +257,13 @@ def handler_factory() -> list[logging.Handler]:
                 logger.debug("Killing launcher-agent group.")
                 launcher_agent_group.shutdown()
 
+            logger.debug("Stopping logging server.")
+
+            if stop_logging_event is not None:
+                stop_logging_event.set()
+            if log_process is not None:
+                log_process.kill()
+
 
 @dataclass
 class LaunchResult(Generic[FunctionR]):

Original file line number	Diff line number	Diff line change
`@@ -54,6 +54,7 @@ ignore = [`
`54`	`54`	`"S607", # bandit: subprocess`
`55`	`55`	`"COM812",`
`56`	`56`	`"ISC001", # conflict with formatter`
	`57`	`+ "G004" # f-string in logging`
`57`	`58`	`]`
`58`	`59`	`[tool.ruff.lint.per-file-ignores]`
`59`	`60`	`"tests/**/*.py" = [`