
Commit 9af18d8

colesbury authored and facebook-github-bot committed
Fix accesses to uninitialized memory when running sum() within an OMP parallel region (pytorch#13274)
Summary:
```
The two_pass_reduction code allocates a buffer of size at::get_max_threads().
When called within a parallel region, at::parallel_for only uses one thread,
so part of this buffer is never written. This makes two changes:

1) two_pass_reduction is not called when already in a parallel region
2) two_pass_reduction fills unwritten buffer elements with the identity
   (the value in dst)
```

cc SsnL: I think this should fix the NaNs in BatchNorm when calling sum() within a parallel region.

Pull Request resolved: pytorch#13274
Differential Revision: D12840034
Pulled By: colesbury
fbshipit-source-id: d32e80909a98a0f1bb1c80689fe5089b7019ef59
1 parent f04a705 · commit 9af18d8
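To make the summary concrete: the hazard is a per-thread partial-result buffer sized for at::get_max_threads() that, inside an enclosing parallel region, gets written by only one thread, after which the second pass reads every slot. Below is a minimal standalone OpenMP sketch of that failure mode; the names (`two_pass_sum`, `partial`) are hypothetical and this is not the ATen code. It only misbehaves on machines where more than one OpenMP thread is available.

```cpp
// Standalone sketch of the bug, NOT the ATen implementation.
// Build (assumption): g++ -fopenmp sketch.cpp
#include <omp.h>
#include <cstdio>
#include <memory>
#include <vector>

double two_pass_sum(const double* data, int n) {
  int max_threads = omp_get_max_threads();
  // Like at::empty(), this buffer is NOT zero-initialized.
  std::unique_ptr<double[]> partial(new double[max_threads]);

  #pragma omp parallel
  {
    // When nested inside an enclosing parallel region, this team runs with
    // a single thread, so only partial[0] is written; the remaining slots
    // keep whatever garbage the allocation contained.
    int t = omp_get_thread_num();
    int nt = omp_get_num_threads();
    double sum = 0.0;
    for (int i = t; i < n; i += nt) sum += data[i];
    partial[t] = sum;
  }

  // Second pass reduces over ALL max_threads slots, including unwritten ones.
  double total = 0.0;
  for (int t = 0; t < max_threads; t++) total += partial[t];
  return total;
}

int main() {
  std::vector<double> data(1 << 20, 1.0);
  #pragma omp parallel num_threads(4)
  {
    #pragma omp single
    printf("nested sum = %f (expected %d)\n",
           two_pass_sum(data.data(), (int)data.size()), 1 << 20);
  }
  return 0;
}
```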

File tree

3 files changed: +40 −6 lines

- aten/src/ATen/Parallel.h
- aten/src/ATen/native/TensorIteratorReduce.cpp
- aten/src/ATen/test/test_parallel.cpp

aten/src/ATen/Parallel.h

Lines changed: 8 additions & 0 deletions

```diff
@@ -36,6 +36,14 @@ inline int get_thread_num() {
 #endif
 }
 
+inline bool in_parallel_region() {
+#ifdef _OPENMP
+  return omp_in_parallel();
+#else
+  return false;
+#endif
+}
+
 template <class F>
 inline void parallel_for(
     const int64_t begin,
```
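The new helper maps directly onto omp_in_parallel(), which reports whether the caller is enclosed by an active (multi-threaded) parallel region; without OpenMP it conservatively reports false. A small illustration of the semantics (my example, not part of the patch):

```cpp
#include <omp.h>
#include <cstdio>

int main() {
  printf("outside: %d\n", omp_in_parallel());    // 0: no enclosing region
  #pragma omp parallel num_threads(2)
  {
    #pragma omp single
    printf("inside:  %d\n", omp_in_parallel());  // 1: enclosing region is active
  }
  return 0;
}
```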

aten/src/ATen/native/TensorIteratorReduce.cpp

Lines changed: 18 additions & 4 deletions

```diff
@@ -1,5 +1,7 @@
 #include <ATen/native/TensorIterator.h>
 #include <ATen/Parallel.h>
+#include <algorithm>
+#include <memory>
 
 /// Contains the implementation of parallel reductions in TensorIterator.
 
@@ -14,7 +16,7 @@ static void parallel_dim_reduction(TensorIterator& iter, const loop2d_t& loop);
 void TensorIterator::parallel_reduce(const loop2d_t& loop) {
   AT_CHECK(ntensors() == 2, "parallel_reduce only supports one input and one output");
   int64_t numel = this->numel();
-  if (numel < at::internal::GRAIN_SIZE || at::get_max_threads() == 1) {
+  if (numel < at::internal::GRAIN_SIZE || at::get_max_threads() == 1 || at::in_parallel_region()) {
     serial_for_each(loop, {0, numel});
   } else if (use_two_pass_reduction(*this)) {
     two_pass_reduction(*this, loop);
@@ -28,21 +30,33 @@ static bool use_two_pass_reduction(TensorIterator& iter) {
 }
 
 static void two_pass_reduction(TensorIterator& iter, const loop2d_t& loop) {
-  int num_threads = at::get_max_threads();
+  int max_threads = at::get_max_threads();
 
   auto& dst = iter.tensor(0);
   auto buffer_shape = DimVector(dst.sizes());
-  buffer_shape.insert(buffer_shape.begin(), num_threads);
+  buffer_shape.insert(buffer_shape.begin(), max_threads);
   auto buffer = at::empty(buffer_shape, dst.type());
 
+  std::unique_ptr<bool[]> written(new bool[max_threads]);
+  std::fill(written.get(), written.get() + max_threads, false);
+
   at::parallel_for(0, iter.numel(), internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) {
-    auto slice = buffer[at::get_thread_num()];
+    int thread_num = at::get_thread_num();
+    written[thread_num] = true;
+    auto slice = buffer[thread_num];
     slice.copy_(dst);
 
     auto sub_iter = TensorIterator::reduce_op(slice, iter.tensor(1));
     sub_iter->serial_for_each(loop, {begin, end});
   });
 
+  // fill any unwritten slices of the buffer with the identity
+  for (int thread_num = 0; thread_num < max_threads; thread_num++) {
+    if (!written[thread_num]) {
+      buffer[thread_num].copy_(dst);
+    }
+  }
+
   auto unsqueezed = dst.unsqueeze(0);
   auto final_reduce = TensorIterator::reduce_op(unsqueezed, buffer);
   final_reduce->for_each(loop);
```
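Both mitigations in one picture: the serial fallback below mirrors the new at::in_parallel_region() check in parallel_reduce, and the written[] bookkeeping plus identity fill mirrors the two_pass_reduction change (copying dst into unwritten slices works because dst still holds the reduction's identity value at that point). This is a hedged standalone sketch of the pattern with hypothetical names, not the ATen code:

```cpp
#include <omp.h>
#include <algorithm>
#include <memory>

double two_pass_sum_fixed(const double* data, int n, double identity = 0.0) {
  // Mitigation 1: fall back to a serial loop when already in a parallel
  // region, instead of running a degenerate one-thread two-pass reduction.
  if (omp_in_parallel()) {
    double sum = identity;
    for (int i = 0; i < n; i++) sum += data[i];
    return sum;
  }

  int max_threads = omp_get_max_threads();
  std::unique_ptr<double[]> partial(new double[max_threads]);
  std::unique_ptr<bool[]> written(new bool[max_threads]);
  std::fill(written.get(), written.get() + max_threads, false);

  #pragma omp parallel
  {
    int t = omp_get_thread_num();
    int nt = omp_get_num_threads();
    written[t] = true;  // Mitigation 2: record which slots were written...
    double sum = identity;
    for (int i = t; i < n; i += nt) sum += data[i];
    partial[t] = sum;
  }

  // ...and fill the rest with the identity so the final pass never reads
  // uninitialized memory, even if fewer than max_threads threads ran.
  for (int t = 0; t < max_threads; t++) {
    if (!written[t]) partial[t] = identity;
  }

  double total = identity;
  for (int t = 0; t < max_threads; t++) total += partial[t];
  return total;
}
```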

aten/src/ATen/test/test_parallel.cpp

Lines changed: 14 additions & 2 deletions

```diff
@@ -1,7 +1,8 @@
 #include "gtest/gtest.h"
 
-#include "ATen/ATen.h"
-#include "ATen/DLConvertor.h"
+#include <ATen/ATen.h>
+#include <ATen/DLConvertor.h>
+#include <ATen/Parallel.h>
 
 #include <iostream>
 #include <string.h>
@@ -24,3 +25,14 @@ TEST(TestParallel, TestParallel) {
   as[2] = 0;
   ASSERT_TRUE(a.sum(0).equal(as));
 }
+
+TEST(TestParallel, NestedParallel) {
+  Tensor a = ones({1024, 1024});
+  auto expected = a.sum();
+  // check that calling sum() from within a parallel block computes the same result
+  at::parallel_for(0, 10, 1, [&](int64_t begin, int64_t end) {
+    if (begin == 0) {
+      ASSERT_TRUE(a.sum().equal(expected));
+    }
+  });
+}
```
