Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 15 additions & 8 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ steps:
- label: "benchmark_master"
parallelism: 1
command: bash .buildkite/scripts/benchmark_master.sh
env:
MASTER_ADDR: "10.158.66.134"
MASTER_PORT: "29500"
plugins:
- docker#v3.8.0:
image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8"
- docker#v5.3.0:
image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
workdir: /upstream
user: root
propagate-environment: true
Expand All @@ -13,14 +16,18 @@ steps:
ipc: host
shm-size: 100gb
always-pull: true
publish: [ "8001:8001", "29500:29500" ]
agents:
queue: "master"
- label: "benchmark_worker"
parallelism: 1
command: bash .buildkite/scripts/benchmark_worker.sh
env:
MASTER_ADDR: "10.158.66.134"
MASTER_PORT: "29500"
plugins:
- docker#v3.8.0:
image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8"
- docker#v5.3.0:
image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
workdir: /upstream
user: root
propagate-environment: true
Expand All @@ -33,8 +40,8 @@ steps:
parallelism: 1
command: bash .buildkite/scripts/benchmark.sh
plugins:
- docker#v3.8.0:
image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8"
- docker#v5.3.0:
image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
workdir: /upstream
user: root
propagate-environment: true
Expand All @@ -47,8 +54,8 @@ steps:
parallelism: 1
command: bash .buildkite/scripts/run_pytest.sh
plugins:
- docker#v3.8.0:
image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8"
- docker#v5.3.0:
image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
workdir: /upstream
user: root
propagate-environment: true
Expand Down
5 changes: 2 additions & 3 deletions .buildkite/scripts/benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@ echo "$BUILDKITE_PARALLEL_JOB_COUNT"
set -euox pipefail

cp -a /upstream /workdir
export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1

export HOME=/workdir && cd $HOME && bash .buildkite/scripts/install_bagua.sh || exit 1

SYNTHETIC_SCRIPT="examples/benchmark/synthetic_benchmark.py"
SYNTHETIC_SCRIPT="$WORKDIR/examples/benchmark/synthetic_benchmark.py"

function check_benchmark_log {
logfile=$1
Expand Down
49 changes: 30 additions & 19 deletions .buildkite/scripts/benchmark_master.sh
Original file line number Diff line number Diff line change
@@ -1,23 +1,30 @@
#!/usr/bin/env bash

echo "$BUILDKITE_PARALLEL_JOB"
echo "$BUILDKITE_PARALLEL_JOB_COUNT"
printenv

set -euox pipefail

python -m http.server 8001 &>/dev/null &
apt-get update && apt-get install -y iputils-ping netcat
ping ${MASTER_ADDR} -c 10
nc -zv $MASTER_ADDR 8001
nc -zv 127.0.0.1 8001

# 0. install bagua
cp -a /upstream /workdir
export HOME=/workdir && cd $HOME && bash .buildkite/scripts/install_bagua.sh || exit 1
export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1


# 1. test communication_primitives api
echo "begin to test [communication_primitives]"
COMMUNICATION_SCRIPT="/workdir/examples/communication_primitives/main.py"
NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.launch \
COMMUNICATION_SCRIPT="${WORKDIR}/examples/communication_primitives/main.py"
NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.run \
--nnodes=2 \
--nproc_per_node 4 \
--node_rank=0 \
--master_addr="10.158.66.134" \
--master_port=1234 \
--rdzv_id=${BUILDKITE_BUILD_ID} \
--rdzv_backend=c10d \
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
--rdzv_conf read_timeout=300 \
${COMMUNICATION_SCRIPT}

# 2. benchmark test with all communication algorithms
Expand Down Expand Up @@ -77,20 +84,21 @@ function check_benchmark_log_approximation {
}

CHECK_RESULT=()
SYNTHETIC_SCRIPT="/workdir/examples/benchmark/synthetic_benchmark.py"
SYNTHETIC_SCRIPT="${WORKDIR}/examples/benchmark/synthetic_benchmark.py"
algorithms=(gradient_allreduce bytegrad decentralized low_precision_decentralized async qadam)
speeds=(185.0 180.0 150.0 115.0 190 165)
losses=(0.001763 0.001694 0.002583 0.001821 0.004000 0.000102)
length=${#algorithms[@]}
for ((i = 0; i < $length; i++)); do
echo "begin to test ["${algorithms[$i]}]
logfile=$(mktemp /tmp/bagua_benchmark_${algorithms[$i]}.XXXXXX.log)
NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 python -m bagua.distributed.launch \
NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 python -m bagua.distributed.run \
--nnodes=2 \
--nproc_per_node 4 \
--node_rank=0 \
--master_addr="10.158.66.134" \
--master_port=1234 \
--rdzv_id=${BUILDKITE_BUILD_ID} \
--rdzv_backend=c10d \
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
--rdzv_conf read_timeout=300 \
${SYNTHETIC_SCRIPT} \
--num-iters 100 \
--algorithm ${algorithms[$i]} \
Expand Down Expand Up @@ -126,14 +134,15 @@ function check_moe_log {
fi
}

MOE_SCRIPT="/workdir/examples/moe/mnist_main.py"
MOE_SCRIPT="${WORKDIR}/examples/moe/mnist_main.py"
logfile=$(mktemp /tmp/bagua_moe_gradient_allreduce.XXXXXX.log)
NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.distributed.launch \
NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.distributed.run \
--nnodes=2 \
--nproc_per_node 2 \
--node_rank=0 \
--master_addr="10.158.66.134" \
--master_port=1234 \
--rdzv_id=${BUILDKITE_BUILD_ID} \
--rdzv_backend=c10d \
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
--rdzv_conf read_timeout=300 \
${MOE_SCRIPT} \
--algorithm gradient_allreduce \
--epochs 5 \
Expand All @@ -144,7 +153,9 @@ check_moe_log ${logfile} 0.000071

# 4. test moe checkpoint
logfile=$(mktemp /tmp/bagua_moe_checkpoint.XXXXXX.log)
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m bagua.distributed.launch \
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m bagua.distributed.run \
--standalone \
--nnodes=1 \
--nproc_per_node 4 \
${MOE_SCRIPT} \
--algorithm gradient_allreduce \
Expand Down
44 changes: 26 additions & 18 deletions .buildkite/scripts/benchmark_worker.sh
Original file line number Diff line number Diff line change
@@ -1,38 +1,45 @@
#!/usr/bin/env bash

echo "$BUILDKITE_PARALLEL_JOB"
echo "$BUILDKITE_PARALLEL_JOB_COUNT"
printenv

set -euox pipefail

python -m http.server 8001 &>/dev/null &
apt-get update && apt-get install -y iputils-ping netcat
ping ${MASTER_ADDR} -c 10
nc -zv $MASTER_ADDR 8001
nc -zv 127.0.0.1 8001

# 0. install bagua
cp -a /upstream /workdir
export HOME=/workdir && cd $HOME && bash .buildkite/scripts/install_bagua.sh || exit 1
export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1

# 1. test communication_primitives api
echo "begin to test [communication_primitives]"
COMMUNICATION_SCRIPT="/workdir/examples/communication_primitives/main.py"
NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.launch \
COMMUNICATION_SCRIPT="${WORKDIR}/examples/communication_primitives/main.py"
NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.run \
--nnodes=2 \
--nproc_per_node 4 \
--node_rank=1 \
--master_addr="10.158.66.134" \
--master_port=1234 \
--rdzv_id=${BUILDKITE_BUILD_ID} \
--rdzv_backend=c10d \
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
--rdzv_conf read_timeout=300 \
${COMMUNICATION_SCRIPT}

# 2. benchmark test with all communication algorithms
SYNTHETIC_SCRIPT="/workdir/examples/benchmark/synthetic_benchmark.py"
SYNTHETIC_SCRIPT="${WORKDIR}/examples/benchmark/synthetic_benchmark.py"
algorithms=(gradient_allreduce bytegrad decentralized low_precision_decentralized async qadam)
length=${#algorithms[@]}
for ((i = 0; i < $length; i++)); do
echo "begin to test ["${algorithms[$i]}]
logfile=$(mktemp /tmp/bagua_benchmark_${algorithms[$i]}.XXXXXX.log)
NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 python -m bagua.distributed.launch \
NCCL_SOCKET_IFNAME=^docker,lo,veth GLOO_SOCKET_IFNAME=enp96s0f0 python -m bagua.distributed.run \
--nnodes=2 \
--nproc_per_node 4 \
--node_rank=1 \
--master_addr="10.158.66.134" \
--master_port=1234 \
--rdzv_id=${BUILDKITE_BUILD_ID} \
--rdzv_backend=c10d \
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
--rdzv_conf read_timeout=300 \
${SYNTHETIC_SCRIPT} \
--num-iters 100 \
--algorithm ${algorithms[$i]} \
Expand All @@ -43,14 +50,15 @@ for ((i = 0; i < $length; i++)); do
done

# 3. test moe
MOE_SCRIPT="/workdir/examples/moe/mnist_main.py"
MOE_SCRIPT="${WORKDIR}/examples/moe/mnist_main.py"
logfile=$(mktemp /tmp/bagua_moe_gradient_allreduce.XXXXXX.log)
NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.distributed.launch \
NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.distributed.run \
--nnodes=2 \
--nproc_per_node 2 \
--node_rank=1 \
--master_addr="10.158.66.134" \
--master_port=1234 \
--rdzv_id=${BUILDKITE_BUILD_ID} \
--rdzv_backend=c10d \
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
--rdzv_conf read_timeout=300 \
${MOE_SCRIPT} \
--algorithm gradient_allreduce \
--epochs 5 \
Expand Down
3 changes: 1 addition & 2 deletions .buildkite/scripts/install_bagua.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@ echo "$BUILDKITE_PARALLEL_JOB_COUNT"
set -euox pipefail

pip uninstall -y bagua bagua-core
export HOME=/workdir && cd $HOME
curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain stable -y
source $HOME/.cargo/env
# cd /workdir && python3 -m pip install --force-reinstall --no-cache-dir . || exit 1
git config --global --add safe.directory /workdir/rust/bagua-core/bagua-core-internal/third_party/Aluminum
cd /workdir && python3 setup.py install -f || exit 1
rm -rf bagua bagua_core
2 changes: 1 addition & 1 deletion .buildkite/scripts/run_pytest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ echo "$BUILDKITE_PARALLEL_JOB_COUNT"

set -euo pipefail
cp -a /upstream /workdir
export HOME=/workdir && cd $HOME && bash .buildkite/scripts/install_bagua.sh || exit 1
export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1
pip install pytest-timeout
pip install git+https://github.com/PyTorchLightning/pytorch-lightning.git
pytest --timeout=300 -s -o "testpaths=tests"
2 changes: 1 addition & 1 deletion .github/workflows/bagua-python-package-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:
jobs:
build:
runs-on: ubuntu-latest
container: baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8
container: baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8
steps:
- uses: actions/checkout@v2
with:
Expand Down
8 changes: 4 additions & 4 deletions bagua/distributed/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def get_args_parser() -> ArgumentParser:
"--nproc_per_node",
action=env,
type=str,
default="auto",
default="1",
help="Number of workers per node; supported values: [auto, cpu, gpu, int].",
)

Expand Down Expand Up @@ -250,7 +250,7 @@ def get_args_parser() -> ArgumentParser:
"--max_restarts",
action=env,
type=int,
default=3,
default=0,
help="Maximum number of worker group restarts before failing.",
)
parser.add_argument(
Expand Down Expand Up @@ -492,8 +492,8 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
nproc_per_node = determine_local_world_size(args.nproc_per_node)
if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
omp_num_threads = 1
print(
f"*****************************************\n"
log.warning(
f"\n*****************************************\n"
f"Setting OMP_NUM_THREADS environment variable for each process to be "
f"{omp_num_threads} in default, to avoid your system being overloaded, "
f"please further tune the variable for optimal performance in "
Expand Down
23 changes: 12 additions & 11 deletions bagua/torch_api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,32 +29,33 @@
init_process_group,
send,
recv,
broadcast,
reduce,
reduce_inplace,
gather,
gather_inplace,
scatter,
scatter_inplace,
allreduce,
allreduce_inplace,
allgather,
allgather_inplace,
allreduce,
allreduce_inplace,
alltoall,
alltoall_inplace,
alltoall_v,
alltoall_v_inplace,
barrier,
broadcast,
gather,
gather_inplace,
reduce,
reduce_inplace,
reduce_scatter,
reduce_scatter_inplace,
scatter,
scatter_inplace,
ReduceOp,
)
from .distributed import BaguaModule # noqa: F401
from .tensor import BaguaTensor # noqa: F401
from .env import ( # noqa: F401
get_rank,
get_world_size,
get_local_rank,
get_local_size,
get_rank,
get_world_size,
)
from . import contrib # noqa: F401
from . import communication # noqa: F401
Expand Down
Loading