
Commit bff64e8

rohan-varma authored and facebook-github-bot committed
[DDP] Track models with sync bn (pytorch#66680)
Summary:
Pull Request resolved: pytorch#66680
Closes pytorch#66215.

Tracks models that use SyncBatchNorm so we can find the workflows that rely on them and target them for perf optimization.

ghstack-source-id: 140875182

Test Plan: CI

Reviewed By: pritamdamania87

Differential Revision: D31679477

fbshipit-source-id: 0e68cd1a7aabbc5b26227895c53d33b8e98bfb8e
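The new "has_sync_bn" field lands in the DDP logging data alongside the other construction-time fields. A minimal sketch of reading it back, assuming an already constructed DistributedDataParallel instance and using the same internal _get_ddp_logging_data() call that the new test below relies on (the helper name has_sync_bn_logged is illustrative, not part of this commit):

def has_sync_bn_logged(ddp_model):
    # Illustrative sketch, not part of this commit: ddp_model is assumed to be a
    # torch.nn.parallel.DistributedDataParallel instance built on an initialized
    # process group. _get_ddp_logging_data() is the internal accessor used by the
    # test added in this diff; "has_sync_bn" is truthy when the wrapped module
    # contains a torch.nn.SyncBatchNorm layer.
    return bool(ddp_model._get_ddp_logging_data().get("has_sync_bn", False))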
1 parent e0643fa commit bff64e8

6 files changed, 40 additions (+) and 2 deletions (−)

torch/_C/_distributed_c10d.pyi

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ class Logger:
         device_ids: List[int],
         output_device: int,
         broadcast_buffers: bool,
+        has_sync_bn: bool,
     ): ...
     ...

torch/csrc/distributed/c10d/init.cpp

Lines changed: 1 addition & 0 deletions
@@ -454,6 +454,7 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO
           py::arg("device_ids"),
           py::arg("output_device"),
           py::arg("broadcast_buffers"),
+          py::arg("has_sync_bn"),
           py::call_guard<py::gil_scoped_release>())
       .def(
           "set_runtime_stats_and_log",

torch/csrc/distributed/c10d/logger.cpp

Lines changed: 3 additions & 1 deletion
@@ -155,7 +155,8 @@ void Logger::set_construction_data_and_log(
     const std::string& module_name,
     const std::vector<int>& device_ids,
     int output_device,
-    bool broadcast_buffers) {
+    bool broadcast_buffers,
+    bool has_sync_bn) {
   // No lock is needed, as it will be called in DistributedDataParallel
   // constructor.
   ddp_logging_data_->strs_map["module_name"] = module_name;
@@ -182,6 +183,7 @@ void Logger::set_construction_data_and_log(
   ddp_logging_data_->strs_map["device_ids"] = c10::Join(", ", device_ids);
   ddp_logging_data_->ints_map["output_device"] = output_device;
   ddp_logging_data_->ints_map["broadcast_buffers"] = broadcast_buffers;
+  ddp_logging_data_->ints_map["has_sync_bn"] = has_sync_bn;
   ddp_logging_data_->ints_map["bucket_cap_bytes"] = reducer_->bucket_bytes_cap_;
   ddp_logging_data_->ints_map["find_unused_parameters"] =
       reducer_->find_unused_parameters_;

torch/csrc/distributed/c10d/logger.hpp

Lines changed: 2 additions & 1 deletion
@@ -12,7 +12,8 @@ class TORCH_API Logger {
       const std::string& module_name,
       const std::vector<int>& device_ids,
       int output_device,
-      bool broadcast_buffers);
+      bool broadcast_buffers,
+      bool has_sync_bn);
 
   void set_static_graph();
 

torch/nn/parallel/distributed.py

Lines changed: 7 additions & 0 deletions
@@ -674,12 +674,19 @@ def _ddp_init_helper(
         # logger and reducer.
         self.reducer.set_logger(self.logger)
 
+        has_sync_bn = False
+        for submodule in self.module.modules():
+            if isinstance(submodule, torch.nn.SyncBatchNorm):
+                has_sync_bn = True
+                break
+
         # Set logging data that can be got during construction time.
         self.logger.set_construction_data_and_log(
             self.module.__class__.__name__,
             [] if self.device_ids is None else self.device_ids,
             -1 if self.output_device is None else self.output_device,
             self.broadcast_buffers,
+            has_sync_bn
         )
 
         # passing a handle to torch.nn.SyncBatchNorm layer
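For reference, the detection added to _ddp_init_helper above amounts to a single walk over the wrapped module's submodules; an equivalent standalone sketch is shown here (where model stands in for any torch.nn.Module and is not a name from this diff):

# Illustrative sketch, equivalent to the loop added in _ddp_init_helper above;
# "model" is a placeholder for any torch.nn.Module.
has_sync_bn = any(
    isinstance(submodule, torch.nn.SyncBatchNorm)
    for submodule in model.modules()
)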

torch/testing/_internal/distributed/distributed_test.py

Lines changed: 26 additions & 0 deletions
@@ -8263,6 +8263,32 @@ def forward(self, x):
             for buf in bufs[1:]:
                 self.assertEqual(rank_0_buf, buf)
 
+    @skip_if_lt_x_gpu(2)
+    @sandcastle_skip_if(
+        BACKEND != "nccl" and BACKEND != "gloo",
+        "Only Nccl & Gloo backend support DistributedDataParallel",
+    )
+    def test_sync_bn_logged(self):
+        model = BN_NET
+        rank = self.rank
+        # single gpu training setup
+        model_gpu = model.cuda(rank)
+        no_sync_bn = torch.nn.parallel.DistributedDataParallel(
+            copy.deepcopy(model_gpu),
+            device_ids=[self.rank],
+        )
+        ddp_logging_data = no_sync_bn._get_ddp_logging_data()
+        sync_bn_logged = ddp_logging_data.get("has_sync_bn", True)
+        self.assertFalse(sync_bn_logged)
+        model_DDP = nn.SyncBatchNorm.convert_sync_batchnorm(model_gpu)
+        model_DDP = torch.nn.parallel.DistributedDataParallel(
+            model_DDP,
+            device_ids=[self.rank],
+        )
+        ddp_logging_data = model_DDP._get_ddp_logging_data()
+        sync_bn_logged = ddp_logging_data.get("has_sync_bn", False)
+        self.assertTrue(sync_bn_logged)
+
     @skip_if_lt_x_gpu(2)
     @sandcastle_skip_if(
         BACKEND != "nccl" and BACKEND != "gloo",
