Skip to content

Commit e6cc32d

Browse files
committed
upgrade gpu test for torch 1.13
1 parent a71c904 commit e6cc32d

6 files changed

Lines changed: 55 additions & 43 deletions

File tree

.buildkite/pipeline.yml

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@ steps:
22
- label: "benchmark_master"
33
parallelism: 1
44
command: bash .buildkite/scripts/benchmark_master.sh
5+
env:
6+
MASTER_ADDR: "10.158.66.134"
7+
MASTER_PORT: "1234"
58
plugins:
69
- docker#v3.8.0:
7-
image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8"
10+
image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
811
workdir: /upstream
912
user: root
1013
propagate-environment: true
@@ -18,9 +21,12 @@ steps:
1821
- label: "benchmark_worker"
1922
parallelism: 1
2023
command: bash .buildkite/scripts/benchmark_worker.sh
24+
env:
25+
MASTER_ADDR: "10.158.66.134"
26+
MASTER_PORT: "1234"
2127
plugins:
2228
- docker#v3.8.0:
23-
image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8"
29+
image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
2430
workdir: /upstream
2531
user: root
2632
propagate-environment: true
@@ -34,7 +40,7 @@ steps:
3440
command: bash .buildkite/scripts/benchmark.sh
3541
plugins:
3642
- docker#v3.8.0:
37-
image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8"
43+
image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
3844
workdir: /upstream
3945
user: root
4046
propagate-environment: true
@@ -48,7 +54,7 @@ steps:
4854
command: bash .buildkite/scripts/run_pytest.sh
4955
plugins:
5056
- docker#v3.8.0:
51-
image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8"
57+
image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
5258
workdir: /upstream
5359
user: root
5460
propagate-environment: true

.buildkite/scripts/benchmark.sh

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,9 @@ echo "$BUILDKITE_PARALLEL_JOB_COUNT"
66
set -euox pipefail
77

88
cp -a /upstream /workdir
9+
export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1
910

10-
export HOME=/workdir && cd $HOME && bash .buildkite/scripts/install_bagua.sh || exit 1
11-
12-
SYNTHETIC_SCRIPT="examples/benchmark/synthetic_benchmark.py"
11+
SYNTHETIC_SCRIPT="$WORKDIR/examples/benchmark/synthetic_benchmark.py"
1312

1413
function check_benchmark_log {
1514
logfile=$1
@@ -21,7 +20,7 @@ function check_benchmark_log {
2120
}
2221

2322
logfile=$(mktemp /tmp/bagua_benchmark.XXXXXX.log)
24-
python -m bagua.distributed.run \
23+
torchrun \
2524
--standalone \
2625
--nnodes=1 \
2726
--nproc_per_node 4 \

.buildkite/scripts/benchmark_master.sh

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,24 @@
22

33
echo "$BUILDKITE_PARALLEL_JOB"
44
echo "$BUILDKITE_PARALLEL_JOB_COUNT"
5+
echo "$BUILDKITE_BUILD_ID"
6+
echo "${MASTER_ADDR}:${MASTER_PORT}"
57

68
set -euox pipefail
79

810
# 0. install bagua
911
cp -a /upstream /workdir
10-
export HOME=/workdir && cd $HOME && bash .buildkite/scripts/install_bagua.sh || exit 1
12+
export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1
1113

1214
# 1. test communication_primitives api
1315
echo "begin to test [communication_primitives]"
14-
COMMUNICATION_SCRIPT="/workdir/examples/communication_primitives/main.py"
15-
NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.launch \
16+
COMMUNICATION_SCRIPT="${WORKDIR}/examples/communication_primitives/main.py"
17+
NCCL_SOCKET_IFNAME=^docker,lo,veth torchrun \
1618
--nnodes=2 \
1719
--nproc_per_node 4 \
18-
--node_rank=0 \
19-
--master_addr="10.158.66.134" \
20-
--master_port=1234 \
20+
--rdzv_id=${BUILDKITE_BUILD_ID} \
21+
--rdzv_backend=c10d \
22+
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
2123
${COMMUNICATION_SCRIPT}
2224

2325
# 2. benchmark test with all communication algorithms
@@ -77,20 +79,20 @@ function check_benchmark_log_approximation {
7779
}
7880

7981
CHECK_RESULT=()
80-
SYNTHETIC_SCRIPT="/workdir/examples/benchmark/synthetic_benchmark.py"
82+
SYNTHETIC_SCRIPT="${WORKDIR}/examples/benchmark/synthetic_benchmark.py"
8183
algorithms=(gradient_allreduce bytegrad decentralized low_precision_decentralized async qadam)
8284
speeds=(185.0 180.0 150.0 115.0 190 165)
8385
losses=(0.001763 0.001694 0.002583 0.001821 0.004000 0.000102)
8486
length=${#algorithms[@]}
8587
for ((i = 0; i < $length; i++)); do
8688
echo "begin to test ["${algorithms[$i]}]
8789
logfile=$(mktemp /tmp/bagua_benchmark_${algorithms[$i]}.XXXXXX.log)
88-
NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 python -m bagua.distributed.launch \
90+
NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 torchrun \
8991
--nnodes=2 \
9092
--nproc_per_node 4 \
91-
--node_rank=0 \
92-
--master_addr="10.158.66.134" \
93-
--master_port=1234 \
93+
--rdzv_id=${BUILDKITE_BUILD_ID} \
94+
--rdzv_backend=c10d \
95+
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
9496
${SYNTHETIC_SCRIPT} \
9597
--num-iters 100 \
9698
--algorithm ${algorithms[$i]} \
@@ -126,14 +128,14 @@ function check_moe_log {
126128
fi
127129
}
128130

129-
MOE_SCRIPT="/workdir/examples/moe/mnist_main.py"
131+
MOE_SCRIPT="${WORKDIR}/examples/moe/mnist_main.py"
130132
logfile=$(mktemp /tmp/bagua_moe_gradient_allreduce.XXXXXX.log)
131-
NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.distributed.launch \
133+
NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 torchrun \
132134
--nnodes=2 \
133135
--nproc_per_node 2 \
134-
--node_rank=0 \
135-
--master_addr="10.158.66.134" \
136-
--master_port=1234 \
136+
--rdzv_id=${BUILDKITE_BUILD_ID} \
137+
--rdzv_backend=c10d \
138+
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
137139
${MOE_SCRIPT} \
138140
--algorithm gradient_allreduce \
139141
--epochs 5 \
@@ -144,7 +146,9 @@ check_moe_log ${logfile} 0.000071
144146

145147
# 4. test moe checkpoint
146148
logfile=$(mktemp /tmp/bagua_moe_checkpoint.XXXXXX.log)
147-
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m bagua.distributed.launch \
149+
CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
150+
--standalone \
151+
--nnodes=1 \
148152
--nproc_per_node 4 \
149153
${MOE_SCRIPT} \
150154
--algorithm gradient_allreduce \

.buildkite/scripts/benchmark_worker.sh

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,37 +2,39 @@
22

33
echo "$BUILDKITE_PARALLEL_JOB"
44
echo "$BUILDKITE_PARALLEL_JOB_COUNT"
5+
echo "$BUILDKITE_BUILD_ID"
6+
echo "${MASTER_ADDR}:${MASTER_PORT}"
57

68
set -euox pipefail
79

810
# 0. install bagua
911
cp -a /upstream /workdir
10-
export HOME=/workdir && cd $HOME && bash .buildkite/scripts/install_bagua.sh || exit 1
12+
export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1
1113

1214
# 1. test communication_primitives api
1315
echo "begin to test [communication_primitives]"
14-
COMMUNICATION_SCRIPT="/workdir/examples/communication_primitives/main.py"
15-
NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.launch \
16+
COMMUNICATION_SCRIPT="${WORKDIR}/examples/communication_primitives/main.py"
17+
NCCL_SOCKET_IFNAME=^docker,lo,veth torchrun \
1618
--nnodes=2 \
1719
--nproc_per_node 4 \
18-
--node_rank=1 \
19-
--master_addr="10.158.66.134" \
20-
--master_port=1234 \
20+
--rdzv_id=${BUILDKITE_BUILD_ID} \
21+
--rdzv_backend=c10d \
22+
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
2123
${COMMUNICATION_SCRIPT}
2224

2325
# 2. benchmark test with all communication algorithms
24-
SYNTHETIC_SCRIPT="/workdir/examples/benchmark/synthetic_benchmark.py"
26+
SYNTHETIC_SCRIPT="${WORKDIR}/examples/benchmark/synthetic_benchmark.py"
2527
algorithms=(gradient_allreduce bytegrad decentralized low_precision_decentralized async qadam)
2628
length=${#algorithms[@]}
2729
for ((i = 0; i < $length; i++)); do
2830
echo "begin to test ["${algorithms[$i]}]
2931
logfile=$(mktemp /tmp/bagua_benchmark_${algorithms[$i]}.XXXXXX.log)
30-
NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 python -m bagua.distributed.launch \
32+
NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 torchrun \
3133
--nnodes=2 \
3234
--nproc_per_node 4 \
33-
--node_rank=1 \
34-
--master_addr="10.158.66.134" \
35-
--master_port=1234 \
35+
--rdzv_id=${BUILDKITE_BUILD_ID} \
36+
--rdzv_backend=c10d \
37+
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
3638
${SYNTHETIC_SCRIPT} \
3739
--num-iters 100 \
3840
--algorithm ${algorithms[$i]} \
@@ -43,14 +45,14 @@ for ((i = 0; i < $length; i++)); do
4345
done
4446

4547
# 3. test moe
46-
MOE_SCRIPT="/workdir/examples/moe/mnist_main.py"
48+
MOE_SCRIPT="${WORKDIR}/examples/moe/mnist_main.py"
4749
logfile=$(mktemp /tmp/bagua_moe_gradient_allreduce.XXXXXX.log)
48-
NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.distributed.launch \
50+
NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 torchrun \
4951
--nnodes=2 \
5052
--nproc_per_node 2 \
51-
--node_rank=1 \
52-
--master_addr="10.158.66.134" \
53-
--master_port=1234 \
53+
--rdzv_id=${BUILDKITE_BUILD_ID} \
54+
--rdzv_backend=c10d \
55+
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
5456
${MOE_SCRIPT} \
5557
--algorithm gradient_allreduce \
5658
--epochs 5 \

.buildkite/scripts/install_bagua.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@ echo "$BUILDKITE_PARALLEL_JOB_COUNT"
66
set -euox pipefail
77

88
pip uninstall -y bagua bagua-core
9-
export HOME=/workdir && cd $HOME
9+
#export HOME=/workdir && cd $HOME
1010
curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain stable -y
1111
source $HOME/.cargo/env
1212
# cd /workdir && python3 -m pip install --force-reinstall --no-cache-dir . || exit 1
13+
git config --global --add safe.directory /workdir/rust/bagua-core/bagua-core-internal/third_party/Aluminum
1314
cd /workdir && python3 setup.py install -f || exit 1
1415
rm -rf bagua bagua_core

.buildkite/scripts/run_pytest.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ echo "$BUILDKITE_PARALLEL_JOB_COUNT"
55

66
set -euo pipefail
77
cp -a /upstream /workdir
8-
export HOME=/workdir && cd $HOME && bash .buildkite/scripts/install_bagua.sh || exit 1
8+
export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1
99
pip install pytest-timeout
1010
pip install git+https://github.com/PyTorchLightning/pytorch-lightning.git
1111
pytest --timeout=300 -s -o "testpaths=tests"

0 commit comments

Comments (0)