
Commit 0aaff5e

ezyang authored and facebook-github-bot committed
Replace CUDA-specific set_index(_from) method from DeviceGuard with set_device. (pytorch#13275)
Summary:
Pull Request resolved: pytorch#13275

This resulted in a bunch of knock-on changes, which I will now describe:

- s/original_index/original_device/
- s/last_index/last_device/
- A bunch of places that used set_index now use CUDAGuard (which does have set_index), because they were CUDA-specific code.

Major caveat: DeviceGuard doesn't *actually* work for non-CUDA/CPU devices. To make that happen, I plan on totally replacing the implementation of DeviceGuard; what I mostly care about here is wrangling the API into an acceptable state.

Reviewed By: gchanan

Differential Revision: D12832080

fbshipit-source-id: 7de068c7cec35663dc8a533026a626331336e61d
1 parent e5d5665 commit 0aaff5e

19 files changed (+115 -97 lines)
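
For callers migrating across this change, a minimal before/after sketch (illustrative only, not code from this commit; `some_index` is a placeholder parameter):

#include <ATen/DeviceGuard.h>

void migrate_example(int16_t some_index) {
  // Before this commit, DeviceGuard exposed a CUDA-specific index API:
  //   at::DeviceGuard guard;
  //   guard.set_index(some_index);            // bare index, implicitly CUDA
  //   int16_t prev = guard.original_index();
  //
  // After this commit, DeviceGuard takes a full at::Device; CUDA-only call
  // sites switch to at::cuda::CUDAGuard, which still has an index-based set_index.
  at::DeviceGuard guard;
  guard.set_device(at::Device(at::kCUDA, some_index));
  at::Device prev = guard.original_device();   // the device that was current before the switch
  (void)prev;
}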

aten/src/ATen/DeviceGuard.h

+37 -28

@@ -10,41 +10,38 @@
 #include <cstddef>
 
 namespace at {
-/// RAII guard that sets a certain default GPU index in its constructor, and
+/// RAII guard that sets a certain default device in its constructor, and
 /// changes it back to the device that was originally active upon destruction.
 ///
-/// The index is always reset to the one that was active at the time of
-/// construction of the guard. Even if you `set_index` after construction, the
-/// destructor will still reset the index to the one that was active at
+/// The device is always reset to the one that was active at the time of
+/// construction of the guard. Even if you `set_device` after construction, the
+/// destructor will still reset the device to the one that was active at
 /// construction time.
 struct DeviceGuard {
   /// Default constructor, does nothing.
   DeviceGuard() = default;
 
-  /// Uses the given device's `index()` if it is a CUDA device, else does
-  /// nothing.
+  /// Set the current device to the passed Device.
   explicit DeviceGuard(Device device) {
-    if (device.is_cuda()) {
-      set_index(device.index());
-    }
+    set_device(device);
   }
 
   explicit DeviceGuard(c10::optional<Device> device_opt) {
-    if (device_opt.has_value() && device_opt.value().is_cuda()) {
-      set_index(device_opt.value().index());
+    if (device_opt.has_value()) {
+      set_device(device_opt.value());
     }
   }
 
-  /// Sets the device to the index on which the given tensor is located.
+  /// Sets the current device to the device on which the given tensor is located.
   explicit DeviceGuard(const Tensor& tensor) {
-    set_index_from(tensor);
+    set_device_from(tensor);
   }
 
-  /// Sets the device to the index on which the first tensor in the list is
+  /// Sets the current device to the device on which the first tensor in the list is
   /// located. If the list is empty, does nothing.
   explicit DeviceGuard(const TensorList& tensors) {
     if (!tensors.empty()) {
-      set_index_from(tensors.front());
+      set_device_from(tensors.front());
     }
   }
 
@@ -71,7 +68,7 @@ struct DeviceGuard {
     return *this;
   }
 
-  /// Resets the device to the index that was active at construction of the
+  /// Resets the device to the device that was active at construction of the
   /// guard.
   ~DeviceGuard() {
     // It should only not have a value if an index was never actually set.
@@ -82,7 +79,12 @@
   }
 
   /// Sets the device to the given one.
-  void set_index(int16_t index) {
+  void set_device(at::Device device) {
+    if (device.type() == at::kCPU) {
+      return;
+    }
+    AT_ASSERT(device.type() == at::kCUDA);
+    auto index = device.index();
     if (index == -1) {
       return;
     }
@@ -100,28 +102,35 @@
     last_index_ = index;
   }
 
-  /// Calls `set_index` with the `Tensor`'s current device, if it is a CUDA
-  /// tensor. Does nothing if the `tensor` is not defined.
-  void set_index_from(const Tensor& tensor) {
-    if (tensor.defined() && tensor.is_cuda()) {
-      set_index(tensor.get_device());
+  /// Calls `set_device` with the `Tensor`'s current device, if it is not a
+  /// CPU tensor. Does nothing if the `tensor` is not defined.
+  void set_device_from(const Tensor& tensor) {
+    if (tensor.defined()) {
+      set_device(tensor.device());
     }
   }
 
   /// Returns the device that was set upon construction of the guard.
-  int16_t original_index() const noexcept {
-    return original_index_;
+  at::Device original_device() const noexcept {
+    return original_index_ == -1 ? at::kCPU : at::Device(at::kCUDA, original_index_);
   }
 
-  /// Returns the last device that was set via `set_index`, if any.
-  int16_t last_index() const noexcept {
-    return last_index_;
+  /// Returns the last device that was set via `set_device`, if any.
+  at::Device last_device() const noexcept {
+    return last_index_ == -1 ? at::kCPU : at::Device(at::kCUDA, last_index_);
   }
 
  private:
+  // This representation only works under the assumption that the DeviceType
+  // is only CUDA. I think a reasonable invariant to assert for DeviceGuard
+  // is that once you've "picked" a device type, you can't mix set_device
+  // with other device types.
+
   /// The original device that was active at construction of this object.
+  /// If not -1, it is a CUDA device.
   int16_t original_index_ = -1;
-  /// The last index that was set via `set_index`.
+  /// The last device that was set via `set_device`. If not -1, it is a CUDA
+  /// device.
   int16_t last_index_ = -1;
 };
 } // namespace at
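
A short usage sketch of the semantics documented above (not from the diff; assumes a machine with at least two CUDA devices, and that device 0 is current on entry):

#include <ATen/DeviceGuard.h>

void device_guard_example() {
  {
    at::DeviceGuard guard(at::Device(at::kCUDA, 1));  // switches to CUDA device 1
    // guard.original_device() == at::Device(at::kCUDA, 0)
    // guard.last_device()     == at::Device(at::kCUDA, 1)
  }  // destructor restores the device that was active at construction
  at::DeviceGuard cpu_guard(at::Device(at::kCPU));    // CPU device: set_device is a no-op
}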

aten/src/ATen/cuda/CUDAGuard.h

+8 -8

@@ -76,13 +76,13 @@ struct CUDAGuard {
 
   /// Sets the CUDA device to the given one.
   /// TODO: Deprecate this name
-  void set_device(int32_t device) {
-    device_guard_.set_index(device);
+  void set_device(int32_t device_index) {
+    set_index(device_index);
   }
 
   /// Sets the CUDA device to the given one.
-  void set_index(int32_t device) {
-    device_guard_.set_index(device);
+  void set_index(int32_t device_index) {
+    device_guard_.set_device(at::Device(at::kCUDA, device_index));
   }
 
   /// Returns the CUDA streams that were active in the first call to
@@ -93,13 +93,13 @@
   }
 
   /// Returns the device that was set upon construction of the guard.
-  int32_t original_device() const noexcept {
-    return device_guard_.original_index();
+  Device original_device() const noexcept {
+    return device_guard_.original_device();
   }
 
   /// Returns the last device that was set via `set_device`, if any.
-  int32_t last_device() const noexcept {
-    return device_guard_.last_index();
+  Device last_device() const noexcept {
+    return device_guard_.last_device();
   }
 
  private:
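
For CUDA-specific call sites (the nccl and profiler changes below), a sketch of how the index-based CUDAGuard API now funnels into DeviceGuard (device index 1 is just an example):

#include <ATen/cuda/CUDAGuard.h>

void cuda_guard_example() {
  at::cuda::CUDAGuard guard;
  guard.set_index(1);  // internally calls DeviceGuard::set_device(at::Device(at::kCUDA, 1))
  at::Device original = guard.original_device();  // now an at::Device, not an int32_t
  at::Device last = guard.last_device();          // at::Device(at::kCUDA, 1)
  (void)original;
  (void)last;
}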

aten/src/ATen/native/TensorConversions.cpp

+1 -0

@@ -10,6 +10,7 @@ namespace native {
 // comparing against the current device object in Tensor.
 // This always **copies** but this is intended because (1) we shouldn't modify
 // input argument, and (2) Device is small anyways.
+// NB: This ONLY works for CUDA device
 static inline Device ensure_has_index(const Device &device) {
   if (!device.is_cuda() || device.has_index()) {
     return device;

aten/src/ATen/templates/TypeDefault.cpp

+1 -4

@@ -28,10 +28,7 @@ Tensor & TypeDefault::copy_(Tensor & self, const Tensor & src, bool non_blocking
 }
 
 Tensor TypeDefault::copy(const Tensor & src, bool non_blocking, optional<Device> to_device) const {
-  DeviceGuard device_guard;
-  if (to_device.has_value()) {
-    device_guard.set_index(to_device.value().index());
-  }
+  DeviceGuard device_guard(to_device);
   AT_CHECK(src.defined(), "attempt to copy an undefined tensor");
   Tensor r;
   if (is_sparse()) {
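
The one-line replacement above works because the new optional<Device> constructor folds the has_value check in itself; a sketch of the equivalent caller behavior, assuming the DeviceGuard header shown earlier:

// nullopt leaves the current device untouched, a CPU device is a no-op,
// and a CUDA device switches (and is restored when the guard dies).
c10::optional<at::Device> to_device;  // may or may not hold a value
at::DeviceGuard device_guard(to_device);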

aten/src/ATen/test/stream_test.cpp

+6 -6

@@ -152,7 +152,7 @@ TEST(TestStream, CUDAGuardTest) {
   // Setting a stream changes the current device and the stream on that device
   {
     at::cuda::CUDAGuard guard(streams1[1]);
-    ASSERT_EQ_CUDA(guard.last_device(), 1);
+    ASSERT_EQ_CUDA(guard.last_device(), at::Device(at::kCUDA, 1));
     ASSERT_EQ_CUDA(at::cuda::current_device(), 1);
     ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[1]);
   }
@@ -164,7 +164,7 @@ TEST(TestStream, CUDAGuardTest) {
   // Setting only the device changes only the current device and not the stream
   {
     at::cuda::CUDAGuard guard(/*device=*/1);
-    ASSERT_EQ_CUDA(guard.last_device(), 1);
+    ASSERT_EQ_CUDA(guard.last_device(), at::Device(at::kCUDA, 1));
     ASSERT_EQ_CUDA(at::cuda::current_device(), 1);
     ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[0]);
   }
@@ -196,13 +196,13 @@ TEST(TestStream, CUDAGuardMovableTest) {
   first.set_device(1);
   at::cuda::CUDAGuard second(std::move(first));
   ASSERT_EQ_CUDA(second.original_streams().size(), device_count);
-  ASSERT_EQ_CUDA(second.original_device(), 0);
-  ASSERT_EQ_CUDA(second.last_device(), 1);
+  ASSERT_EQ_CUDA(second.original_device(), at::Device(at::kCUDA, 0));
+  ASSERT_EQ_CUDA(second.last_device(), at::Device(at::kCUDA, 1));
   at::cuda::CUDAGuard third;
   third = std::move(second);
   ASSERT_EQ_CUDA(third.original_streams().size(), device_count);
-  ASSERT_EQ_CUDA(third.original_device(), 0);
-  ASSERT_EQ_CUDA(third.last_device(), 1);
+  ASSERT_EQ_CUDA(third.original_device(), at::Device(at::kCUDA, 0));
+  ASSERT_EQ_CUDA(third.last_device(), at::Device(at::kCUDA, 1));
 }
 
 // Streampool Round Robin

test/cpp/api/tensor_options_cuda.cpp

+9 -9

@@ -13,7 +13,7 @@
 using namespace at;
 
 // TODO: This might be generally helpful aliases elsewhere.
-at::Device CPUDevice(DeviceIndex index) {
+at::Device CPUDevice() {
   return at::Device(at::kCPU);
 }
 at::Device CUDADevice(DeviceIndex index) {
@@ -128,15 +128,15 @@ TEST(OptionsGuardTest, DeviceGuardOptionsGuardInteraction_MultiCUDA) {
 
 TEST(DeviceGuardTest, IsMovable_CUDA) {
   DeviceGuard first(CUDADevice(1));
-  ASSERT_EQ(first.original_index(), 0);
-  ASSERT_EQ(first.last_index(), 1);
+  ASSERT_EQ(first.original_device(), CUDADevice(0));
+  ASSERT_EQ(first.last_device(), CUDADevice(1));
   DeviceGuard second(std::move(first));
-  ASSERT_EQ(second.original_index(), 0);
-  ASSERT_EQ(second.last_index(), 1);
-  ASSERT_EQ(first.original_index(), -1);
+  ASSERT_EQ(second.original_device(), CUDADevice(0));
+  ASSERT_EQ(second.last_device(), CUDADevice(1));
+  ASSERT_EQ(first.original_device(), CPUDevice());
   DeviceGuard third;
   third = std::move(second);
-  ASSERT_EQ(third.original_index(), 0);
-  ASSERT_EQ(third.last_index(), 1);
-  ASSERT_EQ(second.original_index(), -1);
+  ASSERT_EQ(third.original_device(), CUDADevice(0));
+  ASSERT_EQ(third.last_device(), CUDADevice(1));
+  ASSERT_EQ(second.original_device(), CPUDevice());
 }

torch/csrc/autograd/profiler.cpp

+5 -1

@@ -1,6 +1,10 @@
 #include "torch/csrc/autograd/profiler.h"
 #include "torch/csrc/autograd/function.h"
 
+#ifdef USE_CUDA
+#include "ATen/cuda/CUDAGuard.h"
+#endif
+
 #include <sstream>
 
 namespace torch { namespace autograd { namespace profiler {
@@ -122,7 +126,7 @@ RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr)
 
 #ifdef USE_CUDA
 static void onEachDevice(std::function<void(int)> op) {
-  at::DeviceGuard device_guard;
+  at::cuda::CUDAGuard device_guard;
   int count;
   TORCH_CUDA_CHECK(cudaGetDeviceCount(&count));
   for(int i = 0; i < count; i++) {

torch/csrc/autograd/python_function.cpp

+2 -4

@@ -45,15 +45,13 @@ namespace torch { namespace autograd {
 
 VariableInfo::VariableInfo(const Variable& var)
   : type(&var.type())
+  , device(var.device())
   , size(var.sizes().vec())
   , requires_grad(var.requires_grad()) {
-  if (var.type().is_cuda()) {
-    device = var.get_device();
-  }
 }
 
 Variable VariableInfo::zeros(at::DeviceGuard& device_guard) const {
-  device_guard.set_index(device);
+  device_guard.set_device(device);
   return at::zeros(size, type->options());
 }

torch/csrc/autograd/python_function.h

+1 -1

@@ -27,7 +27,7 @@ struct VariableInfo {
   Variable zeros(at::DeviceGuard& device_guard) const;
 
   at::Type* type;
-  int32_t device = -1;
+  at::Device device = at::kCPU;
   std::vector<int64_t> size;
   bool requires_grad;
 };

torch/csrc/cuda/nccl.cpp

+3 -2

@@ -4,6 +4,7 @@
 #include "torch/csrc/utils/hash.h"
 
 #include <ATen/ATen.h>
+#include <ATen/cuda/CUDAGuard.h>
 #include <c10/util/Exception.h>
 
 #include <THC/THC.h>
@@ -241,7 +242,7 @@ void broadcast(
   const auto comms = user_comms.empty() ? _get_communicators(tensors)
                                         : ArrayRef<ncclComm_t>(user_comms);
 
-  at::DeviceGuard device_guard;
+  at::cuda::CUDAGuard device_guard;
   AutoNcclGroup nccl_group_guard;
   for (size_t i = 0, num_tensors = tensors.size(); i < num_tensors; i++) {
     int device = tensors[i].get_device();
@@ -288,7 +289,7 @@ void reduce(
   auto comms_ref = user_comms.empty() ? _get_communicators(inputs)
                                       : ArrayRef<ncclComm_t>(user_comms);
 
-  at::DeviceGuard device_guard;
+  at::cuda::CUDAGuard device_guard;
   AutoNcclGroup nccl_group_guard;
   for (size_t i = 0; i < len; i++) {
     int device = inputs[i].device().index();

torch/csrc/cuda/python_nccl.cpp

+5 -3

@@ -9,6 +9,8 @@
 #include "torch/csrc/cuda/nccl.h"
 #include "torch/csrc/utils/functional.h"
 
+#include <ATen/cuda/CUDAGuard.h>
+
 #include <nccl.h>
 
 #include <sstream>
@@ -192,7 +194,7 @@ PyObject* THCPModule_nccl_all_reduce(PyObject* self, PyObject* args) {
   std::lock_guard<std::mutex> lock(*(THCCachingAllocator_getCudaFreeMutex()));
   auto comms = user_comms.empty() ? _get_communicators(inputs)
                                   : ArrayRef<ncclComm_t>(user_comms);
-  at::DeviceGuard device_guard;
+  at::cuda::CUDAGuard device_guard;
   AutoNcclGroup nccl_group_guard;
   for (size_t i = 0; i < len; i++) {
     int device = inputs[i].get_device();
@@ -272,7 +274,7 @@ PyObject* THCPModule_nccl_all_gather(PyObject* self, PyObject* args) {
   std::lock_guard<std::mutex> lock(*(THCCachingAllocator_getCudaFreeMutex()));
   auto comms = user_comms.empty() ? _get_communicators(inputs)
                                   : ArrayRef<ncclComm_t>(user_comms);
-  at::DeviceGuard device_guard;
+  at::cuda::CUDAGuard device_guard;
   AutoNcclGroup nccl_group_guard;
   for (size_t i = 0; i < len; i++) {
     int device = inputs[i].get_device();
@@ -335,7 +337,7 @@ PyObject* THCPModule_nccl_reduce_scatter(PyObject* self, PyObject* args) {
   std::lock_guard<std::mutex> lock(*(THCCachingAllocator_getCudaFreeMutex()));
   auto comms = user_comms.empty() ? _get_communicators(inputs)
                                   : ArrayRef<ncclComm_t>(user_comms);
-  at::DeviceGuard device_guard;
+  at::cuda::CUDAGuard device_guard;
   AutoNcclGroup nccl_group_guard;
   for (size_t i = 0; i < len; i++) {
     int device = inputs[i].get_device();

torch/csrc/distributed/c10d/ddp.cpp

+2 -1

@@ -183,7 +183,8 @@ void syncReduction(
 
   // Now make the BW stream wait on it
   auto bwDevice = cudaGuard.original_device();
-  auto bwStream = cudaGuard.original_streams()[bwDevice];
+  AT_ASSERT(bwDevice.type() == at::kCUDA);
+  auto bwStream = cudaGuard.original_streams()[bwDevice.index()];
 
   // Now let the BW stream wait for the worker stream
   event.block(bwStream);

torch/csrc/utils/tensor_new.cpp

+4 -1

@@ -94,7 +94,10 @@ Tensor new_with_tensor_copy(const Type& type, Tensor other, int32_t device_index
   AutoNoGIL no_gil;
   at::DeviceGuard device_guard;
   if (type.is_cuda()) {
-    device_guard.set_index(device_index);
+    // TODO: It would be better if new_with_tensor_copy took an at::Device
+    // to begin with, but then we need to fix the situation with
+    // dispatch_type_conversion bleggg
+    device_guard.set_device(at::Device(at::kCUDA, device_index));
   }
   return type.copy(other);
 }
