Commit 3fad054 ("tmp save")
1 parent: e76ae6a

14 files changed
Lines changed: 50 additions & 56 deletions

.buildkite/scripts/benchmark_master.sh

Lines changed: 2 additions & 2 deletions
@@ -7,8 +7,8 @@ set -euox pipefail
 python -m http.server 8001 &>/dev/null &
 apt-get update && apt-get install -y iputils-ping netcat
 ping ${MASTER_ADDR} -c 10
-nc -zv $MASTER_ADDR 8000-9000
-nc -zv 127.0.0.1 8000-9000
+nc -zv $MASTER_ADDR 8001
+nc -zv 127.0.0.1 8001

 # 0. install bagua
 cp -a /upstream /workdir

.buildkite/scripts/benchmark_worker.sh

Lines changed: 3 additions & 3 deletions
@@ -4,11 +4,11 @@ printenv

 set -euox pipefail

-python -m http.server 8008 &>/dev/null &
+python -m http.server 8001 &>/dev/null &
 apt-get update && apt-get install -y iputils-ping netcat
 ping ${MASTER_ADDR} -c 10
-nc -zv $MASTER_ADDR 8000-9000
-nc -zv 127.0.0.1 8000-9000
+nc -zv $MASTER_ADDR 8001
+nc -zv 127.0.0.1 8001

 # 0. install bagua
 cp -a /upstream /workdir
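Both scripts now serve on port 8001 and probe exactly that port rather than scanning 8000-9000, so the nc check verifies the http.server started a few lines earlier. For illustration, an equivalent single-port reachability check in Python's standard library (a hypothetical helper, not part of these scripts):

    import socket

    def port_open(host: str, port: int, timeout: float = 5.0) -> bool:
        # Rough equivalent of `nc -zv host port`: attempt a TCP connect
        # and report success without sending any payload.
        try:
            with socket.create_connection((host, port), timeout=timeout):
                return True
        except OSError:
            return False

    print(port_open("127.0.0.1", 8001))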

bagua/distributed/run.py

Lines changed: 4 additions & 4 deletions
@@ -197,7 +197,7 @@ def get_args_parser() -> ArgumentParser:
         "--nproc_per_node",
         action=env,
         type=str,
-        default="auto",
+        default="1",
         help="Number of workers per node; supported values: [auto, cpu, gpu, int].",
     )

@@ -250,7 +250,7 @@ def get_args_parser() -> ArgumentParser:
         "--max_restarts",
         action=env,
         type=int,
-        default=3,
+        default=0,
         help="Maximum number of worker group restarts before failing.",
     )
     parser.add_argument(
@@ -492,8 +492,8 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
     nproc_per_node = determine_local_world_size(args.nproc_per_node)
     if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
         omp_num_threads = 1
-        print(
-            f"*****************************************\n"
+        log.warning(
+            f"\n*****************************************\n"
             f"Setting OMP_NUM_THREADS environment variable for each process to be "
             f"{omp_num_threads} in default, to avoid your system being overloaded, "
             f"please further tune the variable for optimal performance in "

bagua/torch_api/communication.py

Lines changed: 1 addition & 2 deletions
@@ -483,7 +483,6 @@ def init_process_group(store: Optional[torch.distributed.Store] = None, rank: in
     `fatal runtime error: Rust cannot catch foreign exceptions` error.
     """

-
     global _default_pg
     global _default_store
     global _autotune_service_port
@@ -510,7 +509,7 @@ def init_process_group(store: Optional[torch.distributed.Store] = None, rank: in
     os.environ["LOCAL_WORLD_SIZE"] = str(local_world_size)

     _default_store = store
-
+
     if _autotune_service_port is None:
         if get_rank() == 0:
             _autotune_service_port = _find_free_bagua_service_port(_default_store)
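The guarded call above has rank 0 pick the autotune service port and publish it through the shared store. A minimal sketch of that pattern (assuming a torch.distributed.Store-style set/get interface; this is not Bagua's actual _find_free_bagua_service_port implementation):

    import socket

    def find_free_port() -> int:
        # Bind to port 0 so the OS assigns a free ephemeral port.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(("", 0))
            return s.getsockname()[1]

    def share_service_port(store, rank: int) -> int:
        # Rank 0 chooses the port and publishes it; other ranks block on get().
        if rank == 0:
            store.set("autotune_service_port", str(find_free_port()))
        return int(store.get("autotune_service_port"))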

tests/internal/multi_process_v2.py

Lines changed: 1 addition & 19 deletions
@@ -29,7 +29,6 @@ class TestResult(NamedTuple):


 TEST_SKIPS = {
-    "no_cuda": TestResult(74, "CUDA is not available."),
     "multi-gpu-1": TestResult(75, "Need at least 1 CUDA device"),
     "multi-gpu-2": TestResult(77, "Need at least 2 CUDA devices"),
     "multi-gpu-3": TestResult(80, "Need at least 3 CUDA devices"),
@@ -52,23 +51,6 @@ def make_error_result(msg: str):
     return TestResult(255, msg)


-def skip_if_no_gpu(func):
-    """Skips if the world size exceeds the number of GPUs, ensuring that if the
-    test is run, each rank has its own GPU via ``torch.cuda.device(rank)``."""
-
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        if not torch.cuda.is_available():
-            sys.exit(TEST_SKIPS["no_cuda"].exit_code)
-        world_size = int(os.environ["WORLD_SIZE"])
-        if torch.cuda.device_count() < world_size:
-            sys.exit(TEST_SKIPS[f"multi-gpu-{world_size}"].exit_code)
-
-        return func(*args, **kwargs)
-
-    return wrapper
-
-
 def skip_if_lt_x_gpu(x):
     def decorator(func):
         @wraps(func)
@@ -106,7 +88,7 @@ def _get_timeout(self):
         return 300

     def _init_bagua_distributed(self):
-        logger.info("rank: {}, world_size: {}".format(self.rank, self.world_size()))
+        logger.info("rank: {}, world_size: {}".format(self.rank, self.world_size))

         torch.cuda.set_device(self.rank)
         store = torch.distributed.FileStore(self.file_name, self.world_size)
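The suite keeps skip_if_lt_x_gpu, which subsumes the deleted skip_if_no_gpu. A minimal sketch of the surviving decorator, assuming the same TEST_SKIPS exit-code convention defined above:

    import sys
    from functools import wraps

    import torch

    def skip_if_lt_x_gpu(x):
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                # Run only when at least x CUDA devices are visible;
                # otherwise exit with the reserved skip code for x GPUs.
                if torch.cuda.is_available() and torch.cuda.device_count() >= x:
                    return func(*args, **kwargs)
                sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code)

            return wrapper

        return decorator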

tests/torch_api/data_parallel/test_async_model_average.py

Lines changed: 2 additions & 2 deletions
@@ -72,9 +72,9 @@ def tearDown(self):

     @property
     def world_size(self) -> int:
-        return torch.cuda.device_count()
+        return 4

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
     def test_algorithm(self):
         self._init_bagua_distributed()
         model, optimizer = create_model_and_optimizer(warmup_steps=0)
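This pins the test to a fixed 4-rank world instead of whatever torch.cuda.device_count() reports, and raises the guard to match: on hosts with fewer than 4 GPUs the test now exits with a skip code rather than running at a smaller, untested world size. test_broadcast_state.py and test_gradient_allreduce.py below apply the same change.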

tests/torch_api/data_parallel/test_bagua_ddp.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@

 import torch.nn.functional as F
 from torch import nn
-from tests.internal.torch.common_distributed import (
+from tests.internal.multi_process_v2 import (
     MultiProcessTestCase,
     skip_if_lt_x_gpu,
 )

tests/torch_api/data_parallel/test_broadcast_state.py

Lines changed: 2 additions & 2 deletions
@@ -113,9 +113,9 @@ def _check_result(self, test_id=None):

     @property
     def world_size(self) -> int:
-        return torch.cuda.device_count()
+        return 4

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
     def test_broadcast_module(self):
         # Set deterministic
         torch.backends.cudnn.benchmark = False

tests/torch_api/data_parallel/test_c10d_common.py

Lines changed: 22 additions & 9 deletions
@@ -1032,20 +1032,33 @@ def tearDown(self):
         except OSError:
             pass

-    def test_distributed_debug_mode(self):
+    def test_debug_level(self):
+        try:
+            del os.environ["TORCH_DISTRIBUTED_DEBUG"]
+        except KeyError:
+            pass
+
+        dist.set_debug_level_from_env()
         # Default should be off
-        default_debug_mode = dist._get_debug_mode()
-        self.assertEqual(default_debug_mode, dist._DistributedDebugLevel.OFF)
+        default_debug_mode = dist.get_debug_level()
+        self.assertEqual(default_debug_mode, dist.DebugLevel.OFF)
         mapping = {
-            "OFF": dist._DistributedDebugLevel.OFF,
-            "INFO": dist._DistributedDebugLevel.INFO,
-            "DETAIL": dist._DistributedDebugLevel.DETAIL,
+            "OFF": dist.DebugLevel.OFF,
+            "off": dist.DebugLevel.OFF,
+            "oFf": dist.DebugLevel.OFF,
+            "INFO": dist.DebugLevel.INFO,
+            "info": dist.DebugLevel.INFO,
+            "INfO": dist.DebugLevel.INFO,
+            "DETAIL": dist.DebugLevel.DETAIL,
+            "detail": dist.DebugLevel.DETAIL,
+            "DeTaIl": dist.DebugLevel.DETAIL,
         }
         invalid_debug_modes = ["foo", 0, 1, -1]

         for mode in mapping.keys():
             os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode)
-            set_debug_mode = dist._get_debug_mode()
+            dist.set_debug_level_from_env()
+            set_debug_mode = dist.get_debug_level()
             self.assertEqual(
                 set_debug_mode,
                 mapping[mode],
@@ -1054,8 +1067,8 @@ def test_distributed_debug_mode(self):

         for mode in invalid_debug_modes:
             os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode)
-            with self.assertRaisesRegex(RuntimeError, "to be one of"):
-                dist._get_debug_mode()
+            with self.assertRaisesRegex(RuntimeError, "The value of TORCH_DISTRIBUTED_DEBUG must"):
+                dist.set_debug_level_from_env()


 if __name__ == "__main__":
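The test now targets PyTorch's public debug-level API (get_debug_level, set_debug_level_from_env, DebugLevel) instead of the private _get_debug_mode. A minimal usage sketch, assuming a PyTorch version that exposes these names:

    import os

    import torch.distributed as dist

    # TORCH_DISTRIBUTED_DEBUG is parsed case-insensitively, as the new
    # mapping in the test exercises.
    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "detail"
    dist.set_debug_level_from_env()
    assert dist.get_debug_level() == dist.DebugLevel.DETAIL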

tests/torch_api/data_parallel/test_gradient_allreduce.py

Lines changed: 3 additions & 3 deletions
@@ -91,9 +91,9 @@ def _check_result(self, test_id=None):

     @property
     def world_size(self) -> int:
-        return torch.cuda.device_count()
+        return 4

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
     def test_algorithm(self):
         # set deterministic
         torch.backends.cudnn.benchmark = False
@@ -103,7 +103,7 @@ def test_algorithm(self):
         self._init_bagua_distributed()
         return run_model(hierarchical=False)

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
     def test_algorithm_hierarchical(self):
         # set deterministic
         torch.backends.cudnn.benchmark = False
