
 echo "$BUILDKITE_PARALLEL_JOB"
 echo "$BUILDKITE_PARALLEL_JOB_COUNT"
+echo "$BUILDKITE_BUILD_ID"
+echo "${MASTER_ADDR}:${MASTER_PORT}"

 set -euox pipefail

 # 0. install bagua
 cp -a /upstream /workdir
-export HOME=/workdir && cd $HOME && bash .buildkite/scripts/install_bagua.sh || exit 1
+export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1

 # 1. test communication_primitives api
 echo "begin to test [communication_primitives]"
-COMMUNICATION_SCRIPT="/workdir/examples/communication_primitives/main.py"
-NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.launch \
+COMMUNICATION_SCRIPT="${WORKDIR}/examples/communication_primitives/main.py"
+NCCL_SOCKET_IFNAME=^docker,lo,veth torchrun \
     --nnodes=2 \
     --nproc_per_node 4 \
-    --node_rank=0 \
-    --master_addr="10.158.66.134" \
-    --master_port=1234 \
+    --rdzv_id=${BUILDKITE_BUILD_ID} \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
     ${COMMUNICATION_SCRIPT}

 # 2. benchmark test with all communication algorithms
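Note on the launcher change above: with torchrun's c10d rendezvous backend, ranks are assigned when the nodes meet at --rdzv_endpoint, so both nodes run the identical command and the per-node --node_rank plus the hard-coded --master_addr/--master_port are dropped. This script only consumes MASTER_ADDR and MASTER_PORT (echoed at the top of the file); exporting them is assumed to happen in the Buildkite pipeline, outside this diff. A minimal sketch under that assumption:

# Sketch only, not part of the diff: values assumed to be injected by the pipeline.
export MASTER_ADDR=10.158.66.134   # reachable address of the node hosting the c10d store
export MASTER_PORT=1234            # the same free TCP port on every node
# Each node then runs the same torchrun command shown above; world size and ranks
# are negotiated at ${MASTER_ADDR}:${MASTER_PORT} under the shared --rdzv_id.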
@@ -77,20 +79,20 @@ function check_benchmark_log_approximation {
 }

 CHECK_RESULT=()
-SYNTHETIC_SCRIPT="/workdir/examples/benchmark/synthetic_benchmark.py"
+SYNTHETIC_SCRIPT="${WORKDIR}/examples/benchmark/synthetic_benchmark.py"
 algorithms=(gradient_allreduce bytegrad decentralized low_precision_decentralized async qadam)
 speeds=(185.0 180.0 150.0 115.0 190 165)
 losses=(0.001763 0.001694 0.002583 0.001821 0.004000 0.000102)
 length=${#algorithms[@]}
 for (( i = 0; i < $length; i++ )); do
     echo "begin to test [${algorithms[$i]}]"
     logfile=$(mktemp /tmp/bagua_benchmark_${algorithms[$i]}.XXXXXX.log)
-    NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 python -m bagua.distributed.launch \
+    NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 torchrun \
         --nnodes=2 \
         --nproc_per_node 4 \
-        --node_rank=0 \
-        --master_addr="10.158.66.134" \
-        --master_port=1234 \
+        --rdzv_id=${BUILDKITE_BUILD_ID} \
+        --rdzv_backend=c10d \
+        --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
         ${SYNTHETIC_SCRIPT} \
         --num-iters 100 \
         --algorithm ${algorithms[$i]} \
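Each iteration above writes the benchmark output to a per-algorithm log that is later compared against the expected speeds/losses arrays by check_benchmark_log_approximation (defined earlier in the file and untouched by this diff, so its body is not shown here). Purely as a hypothetical illustration of that kind of check, assuming the log contains a line of the form "loss: <value>":

# Hypothetical sketch only -- not the real helper from this script.
check_loss_within_tolerance() {
    local logfile=$1 expected=$2 tolerance=$3
    local actual
    # Take the last reported loss from the log (format is an assumption).
    actual=$(grep -oE 'loss: [0-9.]+' "${logfile}" | tail -n 1 | awk '{print $2}')
    # Succeed iff |actual - expected| <= tolerance.
    awk -v a="${actual}" -v e="${expected}" -v t="${tolerance}" \
        'BEGIN { d = a - e; if (d < 0) d = -d; exit (d <= t) ? 0 : 1 }'
}
# Usage sketch: check_loss_within_tolerance "${logfile}" "${losses[$i]}" 0.0005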
@@ -126,14 +128,14 @@ function check_moe_log {
     fi
 }

-MOE_SCRIPT="/workdir/examples/moe/mnist_main.py"
+MOE_SCRIPT="${WORKDIR}/examples/moe/mnist_main.py"
 logfile=$(mktemp /tmp/bagua_moe_gradient_allreduce.XXXXXX.log)
-NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.distributed.launch \
+NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 torchrun \
     --nnodes=2 \
     --nproc_per_node 2 \
-    --node_rank=0 \
-    --master_addr="10.158.66.134" \
-    --master_port=1234 \
+    --rdzv_id=${BUILDKITE_BUILD_ID} \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
     ${MOE_SCRIPT} \
     --algorithm gradient_allreduce \
     --epochs 5 \
@@ -144,7 +146,9 @@ check_moe_log ${logfile} 0.000071

 # 4. test moe checkpoint
 logfile=$(mktemp /tmp/bagua_moe_checkpoint.XXXXXX.log)
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m bagua.distributed.launch \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
+    --standalone \
+    --nnodes=1 \
     --nproc_per_node 4 \
     ${MOE_SCRIPT} \
     --algorithm gradient_allreduce \
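Note on the last hunk: the checkpoint test stays on a single node, so it uses torchrun's --standalone mode, which starts a throwaway rendezvous on localhost and needs no MASTER_ADDR/MASTER_PORT; --nnodes=1 is made explicit. A minimal sketch of the same launch via the module form of the launcher, assuming a PyTorch build that ships torch.distributed.run but predates the torchrun entry point:

# Sketch only: the module launcher accepts the same arguments as torchrun.
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
    --standalone \
    --nnodes=1 \
    --nproc_per_node 4 \
    ${MOE_SCRIPT} \
    --algorithm gradient_allreduce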