BaguaSys · wangraying · Dec 6, 2022 · Dec 8, 2022 · Dec 10, 2022 · Dec 10, 2022
@@ -2,9 +2,12 @@ steps:
   - label: "benchmark_master"
     parallelism: 1
     command: bash .buildkite/scripts/benchmark_master.sh
+    env:  # rendezvous settings read by the benchmark launch commands
+      MASTER_ADDR: "10.158.66.134"  # host half of --rdzv_endpoint
+      MASTER_PORT: "29500"  # port half of --rdzv_endpoint; published to the host by this step's docker plugin
     plugins:
-      - docker#v3.8.0:
-          image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8"
+      - docker#v5.3.0:
+          image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
           workdir: /upstream
           user: root
           propagate-environment: true
@@ -13,14 +16,18 @@ steps:
           ipc: host
           shm-size: 100gb
           always-pull: true
+          publish: [ "8001:8001", "29500:29500" ]  # 8001: HTTP listener used by the nc connectivity probe; 29500: MASTER_PORT
     agents:
       queue: "master"
   - label: "benchmark_worker"
     parallelism: 1
     command: bash .buildkite/scripts/benchmark_worker.sh
+    env:  # rendezvous settings read by the benchmark launch commands
+      MASTER_ADDR: "10.158.66.134"  # host half of --rdzv_endpoint
+      MASTER_PORT: "29500"  # port half of --rdzv_endpoint
     plugins:
-      - docker#v3.8.0:
-          image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8"
+      - docker#v5.3.0:
+          image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
           workdir: /upstream
           user: root
           propagate-environment: true
@@ -33,8 +40,8 @@ steps:
     parallelism: 1
     command: bash .buildkite/scripts/benchmark.sh
     plugins:
-      - docker#v3.8.0:
-          image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8"
+      - docker#v5.3.0:
+          image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
           workdir: /upstream
           user: root
           propagate-environment: true
@@ -47,8 +54,8 @@ steps:
     parallelism: 1
     command: bash .buildkite/scripts/run_pytest.sh
     plugins:
-      - docker#v3.8.0:
-          image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8"
+      - docker#v5.3.0:
+          image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
           workdir: /upstream
           user: root
           propagate-environment: true

@@ -6,10 +6,9 @@ echo "$BUILDKITE_PARALLEL_JOB_COUNT"
 set -euox pipefail
 
 cp -a /upstream /workdir
+export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1
 
-export HOME=/workdir && cd $HOME && bash .buildkite/scripts/install_bagua.sh || exit 1
-
-SYNTHETIC_SCRIPT="examples/benchmark/synthetic_benchmark.py"
+SYNTHETIC_SCRIPT="$WORKDIR/examples/benchmark/synthetic_benchmark.py"
 
 function check_benchmark_log {
     logfile=$1

@@ -1,23 +1,30 @@
 #!/usr/bin/env bash
 
-echo "$BUILDKITE_PARALLEL_JOB"
-echo "$BUILDKITE_PARALLEL_JOB_COUNT"
+printenv
 
 set -euox pipefail
 
+python -m http.server 8001 &>/dev/null &  # background listener so the port-8001 probes below have a target
+apt-get update && apt-get install -y iputils-ping netcat  # installs the ping and nc tools used below
+ping -c 10 "${MASTER_ADDR}"  # options before the host operand; quoted so the value is passed as one word
+nc -zv "${MASTER_ADDR}" 8001  # probe port 8001 on the master address; under set -e a failure aborts the script
+nc -zv 127.0.0.1 8001  # probe the local listener started above
+
 # 0. install bagua
 cp -a /upstream /workdir
-export HOME=/workdir && cd $HOME && bash .buildkite/scripts/install_bagua.sh || exit 1
+export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1
+
 
 # 1. test communication_primitives api
 echo "begin to test [communication_primitives]"
-COMMUNICATION_SCRIPT="/workdir/examples/communication_primitives/main.py"
-NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.launch \
+COMMUNICATION_SCRIPT="${WORKDIR}/examples/communication_primitives/main.py"
+NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.run \
     --nnodes=2 \
     --nproc_per_node 4 \
-    --node_rank=0 \
-    --master_addr="10.158.66.134" \
-    --master_port=1234 \
+    --rdzv_id=${BUILDKITE_BUILD_ID} \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
+    --rdzv_conf read_timeout=300 \
     ${COMMUNICATION_SCRIPT}
 
 # 2. benchmark test with all communication algorithms
@@ -77,20 +84,21 @@ function check_benchmark_log_approximation {
 }
 
 CHECK_RESULT=()
-SYNTHETIC_SCRIPT="/workdir/examples/benchmark/synthetic_benchmark.py"
+SYNTHETIC_SCRIPT="${WORKDIR}/examples/benchmark/synthetic_benchmark.py"
 algorithms=(gradient_allreduce bytegrad decentralized low_precision_decentralized async qadam)
 speeds=(185.0 180.0 150.0 115.0 190 165)
 losses=(0.001763 0.001694 0.002583 0.001821 0.004000 0.000102)
 length=${#algorithms[@]}
 for ((i = 0; i < $length; i++)); do
     echo "begin to test ["${algorithms[$i]}]
     logfile=$(mktemp /tmp/bagua_benchmark_${algorithms[$i]}.XXXXXX.log)
-    NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 python -m bagua.distributed.launch \
+    NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 python -m bagua.distributed.run \
         --nnodes=2 \
         --nproc_per_node 4 \
-        --node_rank=0 \
-        --master_addr="10.158.66.134" \
-        --master_port=1234 \
+        --rdzv_id=${BUILDKITE_BUILD_ID} \
+        --rdzv_backend=c10d \
+        --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
+        --rdzv_conf read_timeout=300 \
         ${SYNTHETIC_SCRIPT} \
         --num-iters 100 \
         --algorithm ${algorithms[$i]} \
@@ -126,14 +134,15 @@ function check_moe_log {
     fi
 }
 
-MOE_SCRIPT="/workdir/examples/moe/mnist_main.py"
+MOE_SCRIPT="${WORKDIR}/examples/moe/mnist_main.py"
 logfile=$(mktemp /tmp/bagua_moe_gradient_allreduce.XXXXXX.log)
-NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.distributed.launch \
+NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.distributed.run \
     --nnodes=2 \
     --nproc_per_node 2 \
-    --node_rank=0 \
-    --master_addr="10.158.66.134" \
-    --master_port=1234 \
+    --rdzv_id=${BUILDKITE_BUILD_ID} \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
+    --rdzv_conf read_timeout=300 \
     ${MOE_SCRIPT} \
     --algorithm gradient_allreduce \
     --epochs 5 \
@@ -144,7 +153,9 @@ check_moe_log ${logfile} 0.000071
 
 # 4. test moe checkpoint
 logfile=$(mktemp /tmp/bagua_moe_checkpoint.XXXXXX.log)
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m bagua.distributed.launch \
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m bagua.distributed.run \
+    --standalone \
+    --nnodes=1 \
     --nproc_per_node 4 \
     ${MOE_SCRIPT} \
     --algorithm gradient_allreduce \

@@ -1,38 +1,45 @@
 #!/usr/bin/env bash
 
-echo "$BUILDKITE_PARALLEL_JOB"
-echo "$BUILDKITE_PARALLEL_JOB_COUNT"
+printenv
 
 set -euox pipefail
 
+python -m http.server 8001 &>/dev/null &  # background listener so the local port-8001 probe below has a target
+apt-get update && apt-get install -y iputils-ping netcat  # installs the ping and nc tools used below
+ping -c 10 "${MASTER_ADDR}"  # options before the host operand; quoted so the value is passed as one word
+nc -zv "${MASTER_ADDR}" 8001  # probe port 8001 on the master address (presumably the master job's listener - confirm)
+nc -zv 127.0.0.1 8001  # probe the local listener started above
+
 # 0. install bagua
 cp -a /upstream /workdir
-export HOME=/workdir && cd $HOME && bash .buildkite/scripts/install_bagua.sh || exit 1
+export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1
 
 # 1. test communication_primitives api
 echo "begin to test [communication_primitives]"
-COMMUNICATION_SCRIPT="/workdir/examples/communication_primitives/main.py"
-NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.launch \
+COMMUNICATION_SCRIPT="${WORKDIR}/examples/communication_primitives/main.py"
+NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.run \
     --nnodes=2 \
     --nproc_per_node 4 \
-    --node_rank=1 \
-    --master_addr="10.158.66.134" \
-    --master_port=1234 \
+    --rdzv_id=${BUILDKITE_BUILD_ID} \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
+    --rdzv_conf read_timeout=300 \
     ${COMMUNICATION_SCRIPT}
 
 # 2. benchmark test with all communication algorithms
-SYNTHETIC_SCRIPT="/workdir/examples/benchmark/synthetic_benchmark.py"
+SYNTHETIC_SCRIPT="${WORKDIR}/examples/benchmark/synthetic_benchmark.py"
 algorithms=(gradient_allreduce bytegrad decentralized low_precision_decentralized async qadam)
 length=${#algorithms[@]}
 for ((i = 0; i < $length; i++)); do
     echo "begin to test ["${algorithms[$i]}]
     logfile=$(mktemp /tmp/bagua_benchmark_${algorithms[$i]}.XXXXXX.log)
-    NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 python -m bagua.distributed.launch \
+    NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 python -m bagua.distributed.run \
         --nnodes=2 \
         --nproc_per_node 4 \
-        --node_rank=1 \
-        --master_addr="10.158.66.134" \
-        --master_port=1234 \
+        --rdzv_id=${BUILDKITE_BUILD_ID} \
+        --rdzv_backend=c10d \
+        --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
+        --rdzv_conf read_timeout=300 \
         ${SYNTHETIC_SCRIPT} \
         --num-iters 100 \
         --algorithm ${algorithms[$i]} \
@@ -43,14 +50,15 @@ for ((i = 0; i < $length; i++)); do
 done
 
 # 3. test moe
-MOE_SCRIPT="/workdir/examples/moe/mnist_main.py"
+MOE_SCRIPT="${WORKDIR}/examples/moe/mnist_main.py"
 logfile=$(mktemp /tmp/bagua_moe_gradient_allreduce.XXXXXX.log)
-NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.distributed.launch \
+NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.distributed.run \
     --nnodes=2 \
     --nproc_per_node 2 \
-    --node_rank=1 \
-    --master_addr="10.158.66.134" \
-    --master_port=1234 \
+    --rdzv_id=${BUILDKITE_BUILD_ID} \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
+    --rdzv_conf read_timeout=300 \
     ${MOE_SCRIPT} \
     --algorithm gradient_allreduce \
     --epochs 5 \

@@ -6,9 +6,8 @@ echo "$BUILDKITE_PARALLEL_JOB_COUNT"
 set -euox pipefail
 
 pip uninstall -y bagua bagua-core
-export HOME=/workdir && cd $HOME
 curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain stable -y
 source $HOME/.cargo/env
-# cd /workdir && python3 -m pip install --force-reinstall --no-cache-dir . || exit 1
+git config --global --add safe.directory /workdir/rust/bagua-core/bagua-core-internal/third_party/Aluminum
 cd /workdir && python3 setup.py install -f || exit 1
 rm -rf bagua bagua_core
@@ -5,7 +5,7 @@ echo "$BUILDKITE_PARALLEL_JOB_COUNT"
 
 set -euo pipefail
 cp -a /upstream /workdir
-export HOME=/workdir && cd $HOME && bash .buildkite/scripts/install_bagua.sh || exit 1
+export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1
 pip install pytest-timeout
 pip install git+https://github.com/PyTorchLightning/pytorch-lightning.git
 pytest --timeout=300 -s -o "testpaths=tests"
@@ -12,7 +12,7 @@ on:
 jobs:
   build:
     runs-on: ubuntu-latest
-    container: baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8
+    container: baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8
     steps:
     - uses: actions/checkout@v2
       with:

@@ -197,7 +197,7 @@ def get_args_parser() -> ArgumentParser:
         "--nproc_per_node",
         action=env,
         type=str,
-        default="auto",
+        default="1",
         help="Number of workers per node; supported values: [auto, cpu, gpu, int].",
     )
 
@@ -250,7 +250,7 @@ def get_args_parser() -> ArgumentParser:
         "--max_restarts",
         action=env,
         type=int,
-        default=3,
+        default=0,
         help="Maximum number of worker group restarts before failing.",
     )
     parser.add_argument(
@@ -492,8 +492,8 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
     nproc_per_node = determine_local_world_size(args.nproc_per_node)
     if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
         omp_num_threads = 1
-        print(
-            f"*****************************************\n"
+        log.warning(
+            f"\n*****************************************\n"
             f"Setting OMP_NUM_THREADS environment variable for each process to be "
             f"{omp_num_threads} in default, to avoid your system being overloaded, "
             f"please further tune the variable for optimal performance in "

@@ -29,32 +29,33 @@
     init_process_group,
     send,
     recv,
-    broadcast,
-    reduce,
-    reduce_inplace,
-    gather,
-    gather_inplace,
-    scatter,
-    scatter_inplace,
-    allreduce,
-    allreduce_inplace,
     allgather,
     allgather_inplace,
+    allreduce,
+    allreduce_inplace,
     alltoall,
     alltoall_inplace,
     alltoall_v,
     alltoall_v_inplace,
+    barrier,
+    broadcast,
+    gather,
+    gather_inplace,
+    reduce,
+    reduce_inplace,
     reduce_scatter,
     reduce_scatter_inplace,
+    scatter,
+    scatter_inplace,
     ReduceOp,
 )
 from .distributed import BaguaModule  # noqa: F401
 from .tensor import BaguaTensor  # noqa: F401
 from .env import (  # noqa: F401
-    get_rank,
-    get_world_size,
     get_local_rank,
     get_local_size,
+    get_rank,
+    get_world_size,
 )
 from . import contrib  # noqa: F401
 from . import communication  # noqa: F401