Skip to content

Commit 0f34257

Browse files
authored
Allow configuring the flight recorder trace file prefix (#1748)
Allow configuring the flight recorder trace file prefix
1 parent 78841bf commit 0f34257

File tree

2 files changed

+6
-2
lines changed

2 files changed

+6
-2
lines changed

torchtitan/config/job_config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,9 @@ class Comm:
645645
save_traces_folder: str = "comm_traces"
646646
"""Flight recorder trace files location"""
647647

648+
save_traces_file_prefix: str = "rank_"
649+
"""Flight recorder trace files prefix"""
650+
648651

649652
@dataclass
650653
class MemoryEstimation:

torchtitan/distributed/utils.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ def _get_distributed_backend(enable_cpu_backend):
258258
return backend
259259

260260
TRACE_BUFFER_SIZE = "TORCH_FR_BUFFER_SIZE"
261-
TRACE_FILE = "TORCH_NCCL_DEBUG_INFO_TEMP_FILE"
261+
TRACE_FILE = "TORCH_FR_DUMP_TEMP_FILE"
262262
DUMP_ON_TIMEOUT = "TORCH_NCCL_DUMP_ON_TIMEOUT"
263263
ASYNC_ERROR_HANDLING = "TORCH_NCCL_ASYNC_ERROR_HANDLING"
264264
SKIP_CLEANUP = "3"
@@ -275,8 +275,9 @@ def _get_distributed_backend(enable_cpu_backend):
275275
# dump on timeout by default if trace buffer is enabled
276276
_warn_overwrite_env(DUMP_ON_TIMEOUT, "1")
277277
dump_dir = os.path.join(base_folder, comm_config.save_traces_folder)
278+
prefix = comm_config.save_traces_file_prefix
278279
os.makedirs(dump_dir, exist_ok=True)
279-
_warn_overwrite_env(TRACE_FILE, f"{dump_dir}/rank_")
280+
_warn_overwrite_env(TRACE_FILE, f"{dump_dir}/{prefix}")
280281

281282
torch.distributed.init_process_group(
282283
backend=_get_distributed_backend(enable_cpu_backend),

0 commit comments

Comments
 (0)