2 files changed: +6 -2 lines changed
Columns: original file line number | diff line number | diff line change
@@ -645,6 +645,9 @@ class Comm:
645
645
save_traces_folder : str = "comm_traces"
646
646
"""Flight recorder trace files location"""
647
647
648
+ save_traces_file_prefix : str = "rank_"
649
+ """Flight recorder trace files prefix"""
650
+
648
651
649
652
@dataclass
650
653
class MemoryEstimation :
Columns: original file line number | diff line number | diff line change
@@ -258,7 +258,7 @@ def _get_distributed_backend(enable_cpu_backend):
258
258
return backend
259
259
260
260
TRACE_BUFFER_SIZE = "TORCH_FR_BUFFER_SIZE"
261
- TRACE_FILE = "TORCH_NCCL_DEBUG_INFO_TEMP_FILE "
261
+ TRACE_FILE = "TORCH_FR_DUMP_TEMP_FILE "
262
262
DUMP_ON_TIMEOUT = "TORCH_NCCL_DUMP_ON_TIMEOUT"
263
263
ASYNC_ERROR_HANDLING = "TORCH_NCCL_ASYNC_ERROR_HANDLING"
264
264
SKIP_CLEANUP = "3"
@@ -275,8 +275,9 @@ def _get_distributed_backend(enable_cpu_backend):
275
275
# dump on timeout by default if trace buffer is enabled
276
276
_warn_overwrite_env (DUMP_ON_TIMEOUT , "1" )
277
277
dump_dir = os .path .join (base_folder , comm_config .save_traces_folder )
278
+ prefix = comm_config .save_traces_file_prefix
278
279
os .makedirs (dump_dir , exist_ok = True )
279
- _warn_overwrite_env (TRACE_FILE , f"{ dump_dir } /rank_ " )
280
+ _warn_overwrite_env (TRACE_FILE , f"{ dump_dir } /{ prefix } " )
280
281
281
282
torch .distributed .init_process_group (
282
283
backend = _get_distributed_backend (enable_cpu_backend ),
You can’t perform that action at this time.
0 commit comments