
Commit e5d5665

ezyang authored and facebook-github-bot committed
Delete DeviceGuard(int64_t) constructor. (pytorch#13232)
Summary: Pull Request resolved: pytorch#13232

DeviceGuard should be device agnostic, which means that it shouldn't assume that an int64_t index means "select the CUDA device."

Reviewed By: gchanan

Differential Revision: D10858024

fbshipit-source-id: b40e8337e4046906fd8f83a95e6206367fb29dbe
1 parent e93c721 commit e5d5665
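
For context, the migration this commit implies at call sites (an illustrative sketch, not part of the commit message): code that used to select a CUDA device by handing a bare integer to at::DeviceGuard now either uses at::cuda::CUDAGuard when it is explicitly CUDA code, or constructs a full at::Device when it wants to stay device agnostic.

  // Before this commit: the integer index was implicitly a CUDA device.
  //   at::DeviceGuard guard(device_index);

  // After, in CUDA-specific code (requires ATen/cuda/CUDAGuard.h):
  at::cuda::CUDAGuard cuda_guard(device_index);

  // After, in device-agnostic code: say which device type you mean.
  at::DeviceGuard device_guard(at::Device(at::kCUDA, device_index));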

File tree: 23 files changed, +102 -48 lines

aten/src/ATen/DeviceGuard.h (-5)

@@ -35,11 +35,6 @@ struct DeviceGuard {
     }
   }
 
-  /// Calls `set_index` with the given index.
-  explicit DeviceGuard(int16_t index) {
-    set_index(index);
-  }
-
   /// Sets the device to the index on which the given tensor is located.
   explicit DeviceGuard(const Tensor& tensor) {
     set_index_from(tensor);

aten/src/ATen/cuda/CUDAEvent.h (+3 -3)

@@ -1,9 +1,9 @@
 #pragma once
 
-#include "ATen/DeviceGuard.h"
 #include "ATen/cuda/ATenCUDAGeneral.h"
 #include "ATen/cuda/CUDAContext.h"
 #include "ATen/cuda/CUDAStream.h"
+#include "ATen/cuda/CUDAGuard.h"
 #include "ATen/cuda/Exceptions.h"
 #include "c10/util/Exception.h"
 
@@ -35,7 +35,7 @@ struct AT_CUDA_API CUDAEvent {
   ~CUDAEvent() {
     try {
       if (is_created_) {
-        at::DeviceGuard device_guard{static_cast<int16_t>(device_index_)};
+        at::cuda::CUDAGuard device_guard(static_cast<int16_t>(device_index_));
        cudaEventDestroy(event_);
      }
    } catch (...) { /* No throw */ }
@@ -105,7 +105,7 @@ struct AT_CUDA_API CUDAEvent {
   }
 
   void create(const int64_t device) {
-    at::DeviceGuard device_index_guard{static_cast<int16_t>(device)};
+    at::cuda::CUDAGuard device_index_guard(static_cast<int16_t>(device));
     AT_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_));
 
     is_created_ = true;

aten/src/ATen/cuda/CUDAGuard.h (+7 -1)

@@ -62,7 +62,7 @@ struct CUDAGuard {
   /// Sets the current CUDA device to the device associated with the given
   /// stream, and then sets the current stream on that device to the one given.
   void set_stream(const CUDAStream& stream) {
-    device_guard_.set_index(stream.device_index());
+    set_index(stream.device_index());
     // If we haven't stored the current stream yet, store it now.
     if (original_streams_.empty()) {
       const size_t device_count = getNumGPUs();
@@ -75,10 +75,16 @@
   }
 
   /// Sets the CUDA device to the given one.
+  /// TODO: Deprecate this name
   void set_device(int32_t device) {
     device_guard_.set_index(device);
   }
 
+  /// Sets the CUDA device to the given one.
+  void set_index(int32_t device) {
+    device_guard_.set_index(device);
+  }
+
   /// Returns the CUDA streams that were active in the first call to
   /// `set_stream`. If there was no such call, the returned container is
   /// empty.
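
The new set_index is simply a second name for set_device (the TODO above marks the old name for eventual deprecation), chosen to match DeviceGuard's method name so the two guard types read the same at call sites; set_stream now calls it internally. A minimal sketch of the equivalent calls, assuming only the headers already used in this diff:

  at::cuda::CUDAGuard guard;   // default-constructed, as in torch/csrc/cuda/comm.cpp below
  guard.set_index(0);          // new name, mirrors at::DeviceGuard::set_index
  guard.set_device(0);         // old name, same effect; slated for deprecation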

aten/src/ATen/cuda/CUDAStream.cpp (+2 -2)

@@ -1,7 +1,7 @@
 #include "ATen/cuda/CUDAStream.h"
-#include "ATen/DeviceGuard.h"
 #include "ATen/cuda/CUDAContext.h"
 #include "ATen/cuda/CUDAEvent.h"
+#include "ATen/cuda/CUDAGuard.h"
 #include "ATen/cuda/Exceptions.h"
 #include "c10/util/Exception.h"
 
@@ -185,7 +185,7 @@ static void initGlobalStreamState() {
 static void initDeviceStreamState(const int64_t device) {
   // Switches to the requested device so streams are properly associated
   // with it.
-  at::DeviceGuard device_guard{static_cast<int16_t>(device)};
+  at::cuda::CUDAGuard device_guard{static_cast<int16_t>(device)};
 
   for (auto i = decltype(kStreamsPerPool){0}; i < kStreamsPerPool; ++i) {
     auto& lowpri_stream = low_priority_streams[device][i];

aten/src/ATen/native/cuda/Resize.cuh (+4 -2)

@@ -3,6 +3,8 @@
 #include "ATen/ATen.h"
 #include "THC/THCTensor.hpp"
 
+#include "ATen/cuda/CUDAGuard.h"
+
 namespace at { namespace native {
 
 // These functions are called by native::resize_ as well as (legacy) THC resize.
@@ -33,9 +35,9 @@ inline TensorImpl* resize_impl_cuda_(
   }
 
   // NB: We don't need to hold the device guard when calling from TH
-  c10::optional<DeviceGuard> guard;
+  c10::optional<cuda::CUDAGuard> guard;
   if (device_guard) {
-    guard = DeviceGuard(self->storage().device().index());
+    guard = cuda::CUDAGuard(self->storage().device().index());
   }
 
   int64_t storage_size = 1;

aten/src/ATen/test/stream_test.cpp (+2 -2)

@@ -127,7 +127,7 @@ TEST(TestStream, CUDAGuardTest) {
 
   std::vector<at::cuda::CUDAStream> streams1;
   {
-    at::DeviceGuard device_guard(1);
+    at::cuda::CUDAGuard device_guard(1);
    streams1.push_back(at::cuda::getDefaultCUDAStream());
    streams1.push_back(at::cuda::getStreamFromPool());
  }
@@ -237,7 +237,7 @@ TEST(TestStream, MultiGPUTest) {
 
   ASSERT_EQ_CUDA(s0, at::cuda::getCurrentCUDAStream());
 
-  at::DeviceGuard device_guard{1};
+  at::cuda::CUDAGuard device_guard{1};
   ASSERT_EQ_CUDA(s1, at::cuda::getCurrentCUDAStream());
 }
 

test/cpp/api/tensor_options_cuda.cpp (+16 -5)

@@ -7,8 +7,19 @@
 #include <ATen/core/ScalarType.h>
 #include <ATen/core/TensorOptions.h>
 
+// NB: This file is compiled even in CPU build (for some reason), so
+// make sure you don't include any CUDA only headers.
+
 using namespace at;
 
+// TODO: This might be generally helpful aliases elsewhere.
+at::Device CPUDevice(DeviceIndex index) {
+  return at::Device(at::kCPU);
+}
+at::Device CUDADevice(DeviceIndex index) {
+  return at::Device(at::kCUDA, index);
+}
+
 // A macro so we don't lose location information when an assertion fails.
 #define REQUIRE_OPTIONS(device_, index_, type_, layout_) \
   ASSERT_EQ(options.device().type(), Device((device_), (index_)).type()); \
@@ -54,14 +65,14 @@ TEST(TensorOptionsTest, ConstructsWellFromCUDATensors_MultiCUDA) {
   if (at::globalContext().getNumGPUs() > 1) {
     Tensor tensor;
     {
-      DeviceGuard guard(1);
+      DeviceGuard guard(CUDADevice(1));
       tensor = empty(5, device(kCUDA));
     }
     options = tensor.options();
     REQUIRE_OPTIONS(kCUDA, 1, kFloat, kStrided);
 
     {
-      DeviceGuard guard(1);
+      DeviceGuard guard(CUDADevice(1));
       tensor = empty(5, device(kCUDA).layout(kSparse));
     }
     options = tensor.options();
@@ -94,15 +105,15 @@ TEST(OptionsGuardTest, DeviceGuardOptionsGuardInteraction_MultiCUDA) {
   Tensor tensor;
   {
     // Check that OptionsGuard respects any active device before construction.
-    DeviceGuard guard(1);
+    DeviceGuard guard(CUDADevice(1));
     {
       OptionsGuard guard(device(kCUDA));
       tensor = at::empty({10});
       REQUIRE_TENSOR_OPTIONS(kCUDA, 1, kFloat, kStrided);
       {
         // Check that OptionsGuard respects any active device after
         // construction.
-        DeviceGuard guard(0);
+        DeviceGuard guard(CUDADevice(0));
         tensor = at::empty({10});
         REQUIRE_TENSOR_OPTIONS(kCUDA, 0, kFloat, kStrided);
         {
@@ -116,7 +127,7 @@ TEST(OptionsGuardTest, DeviceGuardOptionsGuardInteraction_MultiCUDA) {
 }
 
 TEST(DeviceGuardTest, IsMovable_CUDA) {
-  DeviceGuard first(1);
+  DeviceGuard first(CUDADevice(1));
   ASSERT_EQ(first.original_index(), 0);
   ASSERT_EQ(first.last_index(), 1);
   DeviceGuard second(std::move(first));

tools/cwrap/plugins/AutoGPU.py (+1 -1)

@@ -10,5 +10,5 @@ def __init__(self, has_self=True, condition=None):
     def process_pre_arg_assign(self, template, option):
         if not option.get('device_guard', True):
             return template
-        call = 'at::DeviceGuard device_guard(get_device(args));'
+        call = 'at::cuda::CUDAGuard device_guard(get_device(args));'
         return [call] + template

tools/cwrap/plugins/NNExtension.py (+5 -1)

@@ -10,7 +10,11 @@
 #include "THP.h"
 #include "torch/csrc/nn/type_checks.h"
 
-#include <ATen/DeviceGuard.h>
+// HIPify isn't being applied to autogenerated files, so defensively
+// handle both the CUDA and ROCM cases.
+#if defined(USE_CUDA) || defined(USE_ROCM)
+#include <ATen/cuda/CUDAGuard.h>
+#endif
 
 """
 REGISTER_METHOD_TEMPLATE = Template(' {"$name", (PyCFunction)$name, METH_STATIC | METH_VARARGS, NULL},\n')

tools/jit/templates/register_aten_ops.cpp (+3 -3)

@@ -44,11 +44,11 @@ using at::DeviceGuard;
 
 namespace {
 
-inline int deviceForInputs(Stack & stack, size_t N) {
+inline at::optional<at::Device> deviceForInputs(Stack & stack, size_t N) {
   if(N == 0)
-    return -1;
+    return c10::nullopt;
   auto t = (stack.end() - N)->toTensor();
-  return t.type().is_cuda() ? (int) t.get_device() : -1;
+  return c10::make_optional(t.device());
 }
 
 template<size_t N>
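
deviceForInputs now reports "no device" as c10::nullopt instead of the sentinel -1, and hands back the full at::Device of the first input rather than a CUDA-only index. A hypothetical caller (not shown in this diff; the names here are illustrative only) would then only switch devices when the inputs actually carry one:

  // Illustrative call site: num_inputs and stack are assumed to be in scope.
  auto device = deviceForInputs(stack, num_inputs);
  if (device) {
    // DeviceGuard takes an at::Device, so a CPU input is handled
    // gracefully instead of being misread as a CUDA index.
    at::DeviceGuard device_guard(*device);
    // ... run the op with the input's device active ...
  } else {
    // No tensor inputs: no device switch needed.
  }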

torch/csrc/autograd/VariableTypeManual.cpp (+1 -3)

@@ -267,9 +267,7 @@ Tensor & VariableType::s_copy_(Tensor & self, const Tensor & src, bool non_block
     grad_fn = std::make_shared<CopyBackwards>();
     grad_fn->set_next_edges(collect_next_edges(self, src));
     grad_fn->src_type = &src.type();
-    if (src.is_cuda()) {
-      grad_fn->src_device = src.get_device();
-    }
+    grad_fn->src_device = src.device();
   }
   if (self.is_sparse() && src.is_sparse()) baseType->copy_sparse_to_sparse_(self_, src_, non_blocking);
   else if (!self.is_sparse() && !src.is_sparse()) baseType->s_copy_(self_, src_, non_blocking);

torch/csrc/autograd/engine.cpp (+6 -1)

@@ -29,6 +29,7 @@
 #ifdef USE_CUDA
 #include <cuda.h>
 #include <THC/THC.h>
+#include <ATen/cuda/CUDAGuard.h>
 #endif
 
 namespace torch { namespace autograd {
@@ -200,9 +201,13 @@ Engine::Engine() = default;
 // This Engine's ReadyQueues and their corresponding threads are leaked here
 Engine::~Engine() = default;
 
+// TODO: Engine is not written in a way that it can deal with anything that's
+// not CUDA.
 auto Engine::thread_init(int device) -> void {
   THInferNumThreads();
-  at::DeviceGuard guard(device);
+#ifdef USE_CUDA
+  at::cuda::CUDAGuard guard(device);
+#endif
   worker_device = device;
   thread_main(nullptr);
 }

torch/csrc/autograd/functions/tensor.cpp (+3 -1)

@@ -24,7 +24,9 @@ auto CopyBackwards::apply(variable_list&& grads) -> variable_list {
   }
   if (should_compute_output(1)) {
     at::DeviceGuard device_guard(src_device);
-    if (grad.is_cuda() && grad.get_device() != src_device) {
+    // TODO: What if !grad.is_cuda(), but src_device is CUDA?
+    // This code is kind of weirdly asymmetric.
+    if (grad.is_cuda() && grad.device() != src_device) {
      grad_inputs[1] = src_type->copy(grad);
    } else {
      grad_inputs[1] = grad.toType(*src_type);

torch/csrc/autograd/functions/tensor.h (+1 -1)

@@ -16,7 +16,7 @@ struct CopyBackwards : public Function {
   variable_list apply(variable_list&& grads) override;
 
   at::Type *src_type = nullptr; // initialized for safety.
-  int32_t src_device = -1;
+  at::Device src_device = at::kCPU;
 };
 
 // Performs grad[idx] = fn(grad[idx]), but out-of-place. The slicing operation

torch/csrc/cuda/comm.cpp (+2 -2)

@@ -41,7 +41,7 @@ std::vector<Tensor> broadcast(const Tensor& tensor, IntList devices) {
       "first on devices list");
   std::vector<Tensor> tensors;
   tensors.reserve(devices.size());
-  at::DeviceGuard _device_guard;
+  at::cuda::CUDAGuard _device_guard;
 #ifdef USE_NCCL
   if (nccl::is_available({tensor})) {
     tensors.push_back(tensor);
@@ -82,7 +82,7 @@ tensor_list2d broadcast_coalesced(TensorList tensors, IntList devices, size_t bu
   o.reserve(tensors.size());
 
   unique_type_checker type_checker;
-  at::DeviceGuard device_guard(devices[0]);
+  at::cuda::CUDAGuard device_guard(devices[0]);
   for (auto & chunk : utils::take_tensors(tensors, buffer_size)) {
     auto & type = chunk.type();
     type_checker.show(type);

torch/csrc/distributed/c10d/ddp.cpp (+2 -2)

@@ -123,7 +123,7 @@ std::tuple<std::shared_ptr<ProcessGroup::Work>, at::Tensor> queueReduction(
   // improve performance
   std::vector<at::cuda::CUDAStream> workerStreams;
   for (size_t devIdx = 0; devIdx < devices.size(); ++devIdx) {
-    at::DeviceGuard guard(devices[devIdx]);
+    at::cuda::CUDAGuard guard(devices[devIdx]);
     events[devIdx].record();
     workerStreams.push_back(at::cuda::getStreamFromPool(false, devices[devIdx]));
     // Let the worker stream to wait for the default stream
@@ -138,7 +138,7 @@ std::tuple<std::shared_ptr<ProcessGroup::Work>, at::Tensor> queueReduction(
 
   std::vector<at::Tensor> gradsBatchCoalesced;
   for (size_t devIdx = 0; devIdx < devices.size(); ++devIdx) {
-    at::DeviceGuard guard(devices[devIdx]);
+    at::cuda::CUDAGuard guard(devices[devIdx]);
     gradsBatchCoalesced.push_back(
         torch::utils::flatten_dense_tensors(gradsBatch[devIdx]));
   }

torch/csrc/generic/StorageSharing.cpp (+2 -1)

@@ -1,6 +1,7 @@
 #ifdef USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <ATen/cuda/CUDAGuard.h>
 #endif
 
 #include <random>
@@ -262,7 +263,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
   size_t storage_size = (size_t)THPUtils_unpackLong(_size);
 
   int64_t device = THPUtils_unpackLong(_device);
-  at::DeviceGuard device_guard(device);
+  at::cuda::CUDAGuard device_guard(device);
 
   char *buffer;
   Py_ssize_t handle_size;

torch/csrc/jit/fusers/cuda/fused_kernel.cpp (+2 -1)

@@ -3,6 +3,7 @@
 #include "torch/csrc/jit/resource_guard.h"
 
 #include "ATen/cuda/CUDAContext.h"
+#include "ATen/cuda/CUDAGuard.h"
 #include "THC/THC.h"
 #include "THC/THCGenerator.hpp"
 #include "torch/csrc/cuda/cuda_check.h"
@@ -34,7 +35,7 @@ CUDAFusedKernel::CUDAFusedKernel(
   const std::string& name
   , AnnotatedGraph& agraph)
   : FusedKernel(name, agraph) {
-  at::DeviceGuard device_guard(agraph.device);
+  at::cuda::CUDAGuard device_guard(agraph.device);
 
   TORCH_CUDA_CHECK(cudaGetDeviceProperties(&prop, agraph.device));
   checkCUDAVersion(prop);

torch/csrc/jit/passes/shape_analysis.cpp (+7 -1)

@@ -38,6 +38,12 @@ int64_t wrapDim(int64_t dim, at::IntList sizes) {
   return dim;
 }
 
+// TODO: Would be better to make JIT not assume that CUDA devices
+// are the only thing that exist.
+static at::Device jitDeviceIndexToDevice(int device) {
+  return device == -1 ? at::kCPU : at::Device(at::kCUDA, device);
+}
+
 IValue representativeValue(Value* v) {
   TypePtr type_ = v->type();
   // if the value is actually constant, just use it!
@@ -46,7 +52,7 @@ IValue representativeValue(Value* v) {
   }
   if (CompleteTensorTypePtr type = type_->cast<CompleteTensorType>()) {
     auto backend = type->device() == -1 ? at::Backend::CPU : at::Backend::CUDA;
-    at::DeviceGuard device_guard(type->device());
+    at::DeviceGuard device_guard(jitDeviceIndexToDevice(type->device()));
     auto& attype = at::getNonVariableType(backend, type->scalarType());
     auto t = at::empty_strided(type->sizes(), type->strides(), attype.options()).zero_();
     return autograd::make_variable(t, /*requires_grad=*/false);
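
To spell out the conversion (a worked example of the helper above, not additional code from the commit): the JIT's CompleteTensorType still stores a device as a plain int with -1 meaning CPU, and jitDeviceIndexToDevice translates that convention into the at::Device that the now device-agnostic DeviceGuard expects.

  // CPU case: index -1 maps to the CPU device, so the guard performs no switch.
  at::DeviceGuard cpu_guard(jitDeviceIndexToDevice(-1));
  // CUDA case: any non-negative index maps to that CUDA device.
  at::DeviceGuard cuda_guard(jitDeviceIndexToDevice(2));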
