Commit 3fad054 ("tmp save")
1 parent: e76ae6a

14 files changed
Lines changed: 50 additions & 56 deletions

.buildkite/scripts/benchmark_master.sh

Lines changed: 2 additions & 2 deletions
@@ -7,8 +7,8 @@ set -euox pipefail
 python -m http.server 8001 &>/dev/null &
 apt-get update && apt-get install -y iputils-ping netcat
 ping ${MASTER_ADDR} -c 10
-nc -zv $MASTER_ADDR 8000-9000
-nc -zv 127.0.0.1 8000-9000
+nc -zv $MASTER_ADDR 8001
+nc -zv 127.0.0.1 8001

 # 0. install bagua
 cp -a /upstream /workdir

.buildkite/scripts/benchmark_worker.sh

Lines changed: 3 additions & 3 deletions
@@ -4,11 +4,11 @@ printenv

 set -euox pipefail

-python -m http.server 8008 &>/dev/null &
+python -m http.server 8001 &>/dev/null &
 apt-get update && apt-get install -y iputils-ping netcat
 ping ${MASTER_ADDR} -c 10
-nc -zv $MASTER_ADDR 8000-9000
-nc -zv 127.0.0.1 8000-9000
+nc -zv $MASTER_ADDR 8001
+nc -zv 127.0.0.1 8001

 # 0. install bagua
 cp -a /upstream /workdir
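Both scripts now serve on port 8001 and probe exactly that port rather than scanning 8000-9000, so the nc check verifies the http.server started a few lines earlier. For illustration, an equivalent single-port reachability check in Python's standard library (a hypothetical helper, not part of these scripts):

    import socket

    def port_open(host: str, port: int, timeout: float = 5.0) -> bool:
        # Rough equivalent of `nc -zv host port`: attempt a TCP connect
        # and report success without sending any payload.
        try:
            with socket.create_connection((host, port), timeout=timeout):
                return True
        except OSError:
            return False

    print(port_open("127.0.0.1", 8001))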

bagua/distributed/run.py

Lines changed: 4 additions & 4 deletions
@@ -197,7 +197,7 @@ def get_args_parser() -> ArgumentParser:
         "--nproc_per_node",
         action=env,
         type=str,
-        default="auto",
+        default="1",
         help="Number of workers per node; supported values: [auto, cpu, gpu, int].",
     )

@@ -250,7 +250,7 @@ def get_args_parser() -> ArgumentParser:
         "--max_restarts",
         action=env,
         type=int,
-        default=3,
+        default=0,
         help="Maximum number of worker group restarts before failing.",
     )
     parser.add_argument(
@@ -492,8 +492,8 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
     nproc_per_node = determine_local_world_size(args.nproc_per_node)
     if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
         omp_num_threads = 1
-        print(
-            f"*****************************************\n"
+        log.warning(
+            f"\n*****************************************\n"
             f"Setting OMP_NUM_THREADS environment variable for each process to be "
             f"{omp_num_threads} in default, to avoid your system being overloaded, "
             f"please further tune the variable for optimal performance in "

bagua/torch_api/communication.py

Lines changed: 1 addition & 2 deletions
@@ -483,7 +483,6 @@ def init_process_group(store: Optional[torch.distributed.Store] = None, rank: in
     `fatal runtime error: Rust cannot catch foreign exceptions` error.
     """

-
     global _default_pg
     global _default_store
     global _autotune_service_port
@@ -510,7 +509,7 @@ def init_process_group(store: Optional[torch.distributed.Store] = None, rank: in
     os.environ["LOCAL_WORLD_SIZE"] = str(local_world_size)

     _default_store = store
-
+
     if _autotune_service_port is None:
         if get_rank() == 0:
             _autotune_service_port = _find_free_bagua_service_port(_default_store)
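The guarded call above has rank 0 pick the autotune service port and publish it through the shared store. A minimal sketch of that pattern (assuming a torch.distributed.Store-style set/get interface; this is not Bagua's actual _find_free_bagua_service_port implementation):

    import socket

    def find_free_port() -> int:
        # Bind to port 0 so the OS assigns a free ephemeral port.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(("", 0))
            return s.getsockname()[1]

    def share_service_port(store, rank: int) -> int:
        # Rank 0 chooses the port and publishes it; other ranks block on get().
        if rank == 0:
            store.set("autotune_service_port", str(find_free_port()))
        return int(store.get("autotune_service_port"))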

tests/internal/multi_process_v2.py

Lines changed: 1 addition & 19 deletions
@@ -29,7 +29,6 @@ class TestResult(NamedTuple):


 TEST_SKIPS = {
-    "no_cuda": TestResult(74, "CUDA is not available."),
     "multi-gpu-1": TestResult(75, "Need at least 1 CUDA device"),
     "multi-gpu-2": TestResult(77, "Need at least 2 CUDA devices"),
     "multi-gpu-3": TestResult(80, "Need at least 3 CUDA devices"),
@@ -52,23 +51,6 @@ def make_error_result(msg: str):
     return TestResult(255, msg)


-def skip_if_no_gpu(func):
-    """Skips if the world size exceeds the number of GPUs, ensuring that if the
-    test is run, each rank has its own GPU via ``torch.cuda.device(rank)``."""
-
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        if not torch.cuda.is_available():
-            sys.exit(TEST_SKIPS["no_cuda"].exit_code)
-        world_size = int(os.environ["WORLD_SIZE"])
-        if torch.cuda.device_count() < world_size:
-            sys.exit(TEST_SKIPS[f"multi-gpu-{world_size}"].exit_code)
-
-        return func(*args, **kwargs)
-
-    return wrapper
-
-
 def skip_if_lt_x_gpu(x):
     def decorator(func):
         @wraps(func)
@@ -106,7 +88,7 @@ def _get_timeout(self):
         return 300

     def _init_bagua_distributed(self):
-        logger.info("rank: {}, world_size: {}".format(self.rank, self.world_size()))
+        logger.info("rank: {}, world_size: {}".format(self.rank, self.world_size))

         torch.cuda.set_device(self.rank)
         store = torch.distributed.FileStore(self.file_name, self.world_size)
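The suite keeps skip_if_lt_x_gpu, which subsumes the deleted skip_if_no_gpu. A minimal sketch of the surviving decorator, assuming the same TEST_SKIPS exit-code convention defined above:

    import sys
    from functools import wraps

    import torch

    def skip_if_lt_x_gpu(x):
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                # Run only when at least x CUDA devices are visible;
                # otherwise exit with the reserved skip code for x GPUs.
                if torch.cuda.is_available() and torch.cuda.device_count() >= x:
                    return func(*args, **kwargs)
                sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code)

            return wrapper

        return decorator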

tests/torch_api/data_parallel/test_async_model_average.py

Lines changed: 2 additions & 2 deletions
@@ -72,9 +72,9 @@ def tearDown(self):

     @property
     def world_size(self) -> int:
-        return torch.cuda.device_count()
+        return 4

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
     def test_algorithm(self):
         self._init_bagua_distributed()
         model, optimizer = create_model_and_optimizer(warmup_steps=0)
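This pins the test to a fixed 4-rank world instead of whatever torch.cuda.device_count() reports, and raises the guard to match: on hosts with fewer than 4 GPUs the test now exits with a skip code rather than running at a smaller, untested world size. test_broadcast_state.py and test_gradient_allreduce.py below apply the same change.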

tests/torch_api/data_parallel/test_bagua_ddp.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@

 import torch.nn.functional as F
 from torch import nn
-from tests.internal.torch.common_distributed import (
+from tests.internal.multi_process_v2 import (
     MultiProcessTestCase,
     skip_if_lt_x_gpu,
 )

tests/torch_api/data_parallel/test_broadcast_state.py

Lines changed: 2 additions & 2 deletions
@@ -113,9 +113,9 @@ def _check_result(self, test_id=None):

     @property
     def world_size(self) -> int:
-        return torch.cuda.device_count()
+        return 4

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
     def test_broadcast_module(self):
         # Set deterministic
         torch.backends.cudnn.benchmark = False

tests/torch_api/data_parallel/test_c10d_common.py

Lines changed: 22 additions & 9 deletions
@@ -1032,20 +1032,33 @@ def tearDown(self):
         except OSError:
             pass

-    def test_distributed_debug_mode(self):
+    def test_debug_level(self):
+        try:
+            del os.environ["TORCH_DISTRIBUTED_DEBUG"]
+        except KeyError:
+            pass
+
+        dist.set_debug_level_from_env()
         # Default should be off
-        default_debug_mode = dist._get_debug_mode()
-        self.assertEqual(default_debug_mode, dist._DistributedDebugLevel.OFF)
+        default_debug_mode = dist.get_debug_level()
+        self.assertEqual(default_debug_mode, dist.DebugLevel.OFF)
         mapping = {
-            "OFF": dist._DistributedDebugLevel.OFF,
-            "INFO": dist._DistributedDebugLevel.INFO,
-            "DETAIL": dist._DistributedDebugLevel.DETAIL,
+            "OFF": dist.DebugLevel.OFF,
+            "off": dist.DebugLevel.OFF,
+            "oFf": dist.DebugLevel.OFF,
+            "INFO": dist.DebugLevel.INFO,
+            "info": dist.DebugLevel.INFO,
+            "INfO": dist.DebugLevel.INFO,
+            "DETAIL": dist.DebugLevel.DETAIL,
+            "detail": dist.DebugLevel.DETAIL,
+            "DeTaIl": dist.DebugLevel.DETAIL,
         }
         invalid_debug_modes = ["foo", 0, 1, -1]

         for mode in mapping.keys():
             os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode)
-            set_debug_mode = dist._get_debug_mode()
+            dist.set_debug_level_from_env()
+            set_debug_mode = dist.get_debug_level()
             self.assertEqual(
                 set_debug_mode,
                 mapping[mode],
@@ -1054,8 +1067,8 @@ def test_distributed_debug_mode(self):

         for mode in invalid_debug_modes:
             os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode)
-            with self.assertRaisesRegex(RuntimeError, "to be one of"):
-                dist._get_debug_mode()
+            with self.assertRaisesRegex(RuntimeError, "The value of TORCH_DISTRIBUTED_DEBUG must"):
+                dist.set_debug_level_from_env()


 if __name__ == "__main__":
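The test now targets PyTorch's public debug-level API (get_debug_level, set_debug_level_from_env, DebugLevel) instead of the private _get_debug_mode. A minimal usage sketch, assuming a PyTorch version that exposes these names:

    import os

    import torch.distributed as dist

    # TORCH_DISTRIBUTED_DEBUG is parsed case-insensitively, as the new
    # mapping in the test exercises.
    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "detail"
    dist.set_debug_level_from_env()
    assert dist.get_debug_level() == dist.DebugLevel.DETAIL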

tests/torch_api/data_parallel/test_gradient_allreduce.py

Lines changed: 3 additions & 3 deletions
@@ -91,9 +91,9 @@ def _check_result(self, test_id=None):

     @property
     def world_size(self) -> int:
-        return torch.cuda.device_count()
+        return 4

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
     def test_algorithm(self):
         # set deterministic
         torch.backends.cudnn.benchmark = False
@@ -103,7 +103,7 @@ def test_algorithm(self):
         self._init_bagua_distributed()
         return run_model(hierarchical=False)

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
     def test_algorithm_hierarchical(self):
         # set deterministic
         torch.backends.cudnn.benchmark = False
