File tree Expand file tree Collapse file tree 1 file changed +24
-0
lines changed Expand file tree Collapse file tree 1 file changed +24
-0
#!/bin/bash
#
# Launches one fault-tolerant training replica through ./run_train.sh using
# the DiLoCo semi-sync method, with verbose NCCL / PyTorch logging and
# profiling enabled.
#
# Environment:
#   FT_REPLICA_ID  Passed to --fault_tolerance.replica_id and also used as
#                  CUDA_VISIBLE_DEVICES for the launched process (default: 0).
#   FT_GROUP_SIZE  Passed to --fault_tolerance.group_size (default: 1).
#
# Usage (run from the directory that contains run_train.sh):
#   FT_REPLICA_ID=1 FT_GROUP_SIZE=2 ./<this script>

# Substitute defaults without any surrounding whitespace so the values reach
# CUDA_VISIBLE_DEVICES and the CLI flags verbatim (e.g. "0", not " 0 ").
FT_REPLICA_ID="${FT_REPLICA_ID:-0}"
FT_GROUP_SIZE="${FT_GROUP_SIZE:-1}"

# Both values are forwarded as numeric settings (and FT_REPLICA_ID doubles as
# a GPU index), so reject anything that is not a plain non-negative integer
# before launching.
for ft_var in FT_REPLICA_ID FT_GROUP_SIZE; do
  if [[ ! "${!ft_var}" =~ ^[0-9]+$ ]]; then
    printf 'error: %s must be a non-negative integer, got "%s"\n' \
      "${ft_var}" "${!ft_var}" >&2
    exit 2
  fi
done
unset ft_var

# The variables in front of ./run_train.sh are set only for that process.
# CUDA_LAUNCH_BLOCKING=1 together with the INFO-level NCCL / Torch logging
# makes this a debugging-oriented launch.
# NOTE(review): --comm.train_timeout_seconds=1 and
# --fault_tolerance.process_group_timeout_ms=1 are extremely short; presumably
# deliberate to exercise timeout / recovery paths -- confirm before reuse.
TORCH_SHARE_RDZV_TCP_STORE=1 \
LOGLEVEL=INFO \
NCCL_DEBUG_SUBSYS=ALL \
NCCL_DEBUG=INFO \
TORCH_CPP_LOG_LEVEL=INFO \
CUDA_LAUNCH_BLOCKING=1 \
CUDA_VISIBLE_DEVICES="${FT_REPLICA_ID}" \
NGPU=1 \
./run_train.sh \
  --fault_tolerance.enable \
  --fault_tolerance.group_size="${FT_GROUP_SIZE}" \
  --fault_tolerance.replica_id="${FT_REPLICA_ID}" \
  --training.local_batch_size=2 \
  --fault_tolerance.sync_steps=10 \
  --fault_tolerance.semi_sync_method=diloco \
  --parallelism.data_parallel_shard_degree=1 \
  --fault_tolerance.num_fragments=2 \
  --experimental.custom_args_module=torchtitan.components.ft.config \
  --profiling.enable_profiling \
  --profiling.profile_freq=5 \
  --profiling.profiler_active=5 \
  --profiling.profiler_warmup=0 \
  --training.steps=1000 \
  --comm.train_timeout_seconds=1 \
  --fault_tolerance.process_group=nccl \
  --checkpoint.no_enable_ft_checkpointing \
  --fault_tolerance.process_group_timeout_ms=1
You can’t perform that action at this time.
0 commit comments