
Commit e5d5665

ezyang authored and facebook-github-bot committed
Delete DeviceGuard(int64_t) constructor. (pytorch#13232)
Summary: Pull Request resolved: pytorch#13232

DeviceGuard should be device agnostic, which means that it shouldn't assume that an int64_t index means "select the CUDA device."

Reviewed By: gchanan

Differential Revision: D10858024

fbshipit-source-id: b40e8337e4046906fd8f83a95e6206367fb29dbe
1 parent e93c721 commit e5d5665
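
For context, the migration this commit implies at call sites (an illustrative sketch, not part of the commit message): code that used to select a CUDA device by handing a bare integer to at::DeviceGuard now either uses at::cuda::CUDAGuard when it is explicitly CUDA code, or constructs a full at::Device when it wants to stay device agnostic.

  // Before this commit: the integer index was implicitly a CUDA device.
  //   at::DeviceGuard guard(device_index);

  // After, in CUDA-specific code (requires ATen/cuda/CUDAGuard.h):
  at::cuda::CUDAGuard cuda_guard(device_index);

  // After, in device-agnostic code: say which device type you mean.
  at::DeviceGuard device_guard(at::Device(at::kCUDA, device_index));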

File tree: 23 files changed, +102 -48 lines

aten/src/ATen/DeviceGuard.h (-5)

@@ -35,11 +35,6 @@ struct DeviceGuard {
     }
   }
 
-  /// Calls `set_index` with the given index.
-  explicit DeviceGuard(int16_t index) {
-    set_index(index);
-  }
-
   /// Sets the device to the index on which the given tensor is located.
   explicit DeviceGuard(const Tensor& tensor) {
     set_index_from(tensor);

aten/src/ATen/cuda/CUDAEvent.h (+3 -3)

@@ -1,9 +1,9 @@
 #pragma once
 
-#include "ATen/DeviceGuard.h"
 #include "ATen/cuda/ATenCUDAGeneral.h"
 #include "ATen/cuda/CUDAContext.h"
 #include "ATen/cuda/CUDAStream.h"
+#include "ATen/cuda/CUDAGuard.h"
 #include "ATen/cuda/Exceptions.h"
 #include "c10/util/Exception.h"
 
@@ -35,7 +35,7 @@ struct AT_CUDA_API CUDAEvent {
   ~CUDAEvent() {
     try {
       if (is_created_) {
-        at::DeviceGuard device_guard{static_cast<int16_t>(device_index_)};
+        at::cuda::CUDAGuard device_guard(static_cast<int16_t>(device_index_));
        cudaEventDestroy(event_);
      }
    } catch (...) { /* No throw */ }
@@ -105,7 +105,7 @@ struct AT_CUDA_API CUDAEvent {
   }
 
   void create(const int64_t device) {
-    at::DeviceGuard device_index_guard{static_cast<int16_t>(device)};
+    at::cuda::CUDAGuard device_index_guard(static_cast<int16_t>(device));
     AT_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_));
 
     is_created_ = true;

aten/src/ATen/cuda/CUDAGuard.h (+7 -1)

@@ -62,7 +62,7 @@ struct CUDAGuard {
   /// Sets the current CUDA device to the device associated with the given
   /// stream, and then sets the current stream on that device to the one given.
   void set_stream(const CUDAStream& stream) {
-    device_guard_.set_index(stream.device_index());
+    set_index(stream.device_index());
     // If we haven't stored the current stream yet, store it now.
     if (original_streams_.empty()) {
       const size_t device_count = getNumGPUs();
@@ -75,10 +75,16 @@
   }
 
   /// Sets the CUDA device to the given one.
+  /// TODO: Deprecate this name
   void set_device(int32_t device) {
     device_guard_.set_index(device);
   }
 
+  /// Sets the CUDA device to the given one.
+  void set_index(int32_t device) {
+    device_guard_.set_index(device);
+  }
+
   /// Returns the CUDA streams that were active in the first call to
   /// `set_stream`. If there was no such call, the returned container is
   /// empty.
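
The new set_index is simply a second name for set_device (the TODO above marks the old name for eventual deprecation), chosen to match DeviceGuard's method name so the two guard types read the same at call sites; set_stream now calls it internally. A minimal sketch of the equivalent calls, assuming only the headers already used in this diff:

  at::cuda::CUDAGuard guard;   // default-constructed, as in torch/csrc/cuda/comm.cpp below
  guard.set_index(0);          // new name, mirrors at::DeviceGuard::set_index
  guard.set_device(0);         // old name, same effect; slated for deprecation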

aten/src/ATen/cuda/CUDAStream.cpp (+2 -2)

@@ -1,7 +1,7 @@
 #include "ATen/cuda/CUDAStream.h"
-#include "ATen/DeviceGuard.h"
 #include "ATen/cuda/CUDAContext.h"
 #include "ATen/cuda/CUDAEvent.h"
+#include "ATen/cuda/CUDAGuard.h"
 #include "ATen/cuda/Exceptions.h"
 #include "c10/util/Exception.h"
 
@@ -185,7 +185,7 @@ static void initGlobalStreamState() {
 static void initDeviceStreamState(const int64_t device) {
   // Switches to the requested device so streams are properly associated
   // with it.
-  at::DeviceGuard device_guard{static_cast<int16_t>(device)};
+  at::cuda::CUDAGuard device_guard{static_cast<int16_t>(device)};
 
   for (auto i = decltype(kStreamsPerPool){0}; i < kStreamsPerPool; ++i) {
     auto& lowpri_stream = low_priority_streams[device][i];

aten/src/ATen/native/cuda/Resize.cuh (+4 -2)

@@ -3,6 +3,8 @@
 #include "ATen/ATen.h"
 #include "THC/THCTensor.hpp"
 
+#include "ATen/cuda/CUDAGuard.h"
+
 namespace at { namespace native {
 
 // These functions are called by native::resize_ as well as (legacy) THC resize.
@@ -33,9 +35,9 @@ inline TensorImpl* resize_impl_cuda_(
   }
 
   // NB: We don't need to hold the device guard when calling from TH
-  c10::optional<DeviceGuard> guard;
+  c10::optional<cuda::CUDAGuard> guard;
   if (device_guard) {
-    guard = DeviceGuard(self->storage().device().index());
+    guard = cuda::CUDAGuard(self->storage().device().index());
   }
 
   int64_t storage_size = 1;

aten/src/ATen/test/stream_test.cpp (+2 -2)

@@ -127,7 +127,7 @@ TEST(TestStream, CUDAGuardTest) {
 
   std::vector<at::cuda::CUDAStream> streams1;
   {
-    at::DeviceGuard device_guard(1);
+    at::cuda::CUDAGuard device_guard(1);
    streams1.push_back(at::cuda::getDefaultCUDAStream());
    streams1.push_back(at::cuda::getStreamFromPool());
  }
@@ -237,7 +237,7 @@ TEST(TestStream, MultiGPUTest) {
 
   ASSERT_EQ_CUDA(s0, at::cuda::getCurrentCUDAStream());
 
-  at::DeviceGuard device_guard{1};
+  at::cuda::CUDAGuard device_guard{1};
   ASSERT_EQ_CUDA(s1, at::cuda::getCurrentCUDAStream());
 }
 

test/cpp/api/tensor_options_cuda.cpp (+16 -5)

@@ -7,8 +7,19 @@
 #include <ATen/core/ScalarType.h>
 #include <ATen/core/TensorOptions.h>
 
+// NB: This file is compiled even in CPU build (for some reason), so
+// make sure you don't include any CUDA only headers.
+
 using namespace at;
 
+// TODO: This might be generally helpful aliases elsewhere.
+at::Device CPUDevice(DeviceIndex index) {
+  return at::Device(at::kCPU);
+}
+at::Device CUDADevice(DeviceIndex index) {
+  return at::Device(at::kCUDA, index);
+}
+
 // A macro so we don't lose location information when an assertion fails.
 #define REQUIRE_OPTIONS(device_, index_, type_, layout_) \
   ASSERT_EQ(options.device().type(), Device((device_), (index_)).type()); \
@@ -54,14 +65,14 @@ TEST(TensorOptionsTest, ConstructsWellFromCUDATensors_MultiCUDA) {
   if (at::globalContext().getNumGPUs() > 1) {
     Tensor tensor;
     {
-      DeviceGuard guard(1);
+      DeviceGuard guard(CUDADevice(1));
       tensor = empty(5, device(kCUDA));
     }
     options = tensor.options();
     REQUIRE_OPTIONS(kCUDA, 1, kFloat, kStrided);
 
     {
-      DeviceGuard guard(1);
+      DeviceGuard guard(CUDADevice(1));
       tensor = empty(5, device(kCUDA).layout(kSparse));
     }
     options = tensor.options();
@@ -94,15 +105,15 @@ TEST(OptionsGuardTest, DeviceGuardOptionsGuardInteraction_MultiCUDA) {
   Tensor tensor;
   {
     // Check that OptionsGuard respects any active device before construction.
-    DeviceGuard guard(1);
+    DeviceGuard guard(CUDADevice(1));
     {
       OptionsGuard guard(device(kCUDA));
       tensor = at::empty({10});
       REQUIRE_TENSOR_OPTIONS(kCUDA, 1, kFloat, kStrided);
       {
         // Check that OptionsGuard respects any active device after
         // construction.
-        DeviceGuard guard(0);
+        DeviceGuard guard(CUDADevice(0));
         tensor = at::empty({10});
         REQUIRE_TENSOR_OPTIONS(kCUDA, 0, kFloat, kStrided);
         {
@@ -116,7 +127,7 @@ TEST(OptionsGuardTest, DeviceGuardOptionsGuardInteraction_MultiCUDA) {
 }
 
 TEST(DeviceGuardTest, IsMovable_CUDA) {
-  DeviceGuard first(1);
+  DeviceGuard first(CUDADevice(1));
   ASSERT_EQ(first.original_index(), 0);
   ASSERT_EQ(first.last_index(), 1);
   DeviceGuard second(std::move(first));

tools/cwrap/plugins/AutoGPU.py (+1 -1)

@@ -10,5 +10,5 @@ def __init__(self, has_self=True, condition=None):
     def process_pre_arg_assign(self, template, option):
         if not option.get('device_guard', True):
             return template
-        call = 'at::DeviceGuard device_guard(get_device(args));'
+        call = 'at::cuda::CUDAGuard device_guard(get_device(args));'
         return [call] + template

tools/cwrap/plugins/NNExtension.py (+5 -1)

@@ -10,7 +10,11 @@
 #include "THP.h"
 #include "torch/csrc/nn/type_checks.h"
 
-#include <ATen/DeviceGuard.h>
+// HIPify isn't being applied to autogenerated files, so defensively
+// handle both the CUDA and ROCM cases.
+#if defined(USE_CUDA) || defined(USE_ROCM)
+#include <ATen/cuda/CUDAGuard.h>
+#endif
 
 """
 REGISTER_METHOD_TEMPLATE = Template(' {"$name", (PyCFunction)$name, METH_STATIC | METH_VARARGS, NULL},\n')

tools/jit/templates/register_aten_ops.cpp (+3 -3)

@@ -44,11 +44,11 @@ using at::DeviceGuard;
 
 namespace {
 
-inline int deviceForInputs(Stack & stack, size_t N) {
+inline at::optional<at::Device> deviceForInputs(Stack & stack, size_t N) {
   if(N == 0)
-    return -1;
+    return c10::nullopt;
   auto t = (stack.end() - N)->toTensor();
-  return t.type().is_cuda() ? (int) t.get_device() : -1;
+  return c10::make_optional(t.device());
 }
 
 template<size_t N>
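
deviceForInputs now reports "no device" as c10::nullopt instead of the sentinel -1, and hands back the full at::Device of the first input rather than a CUDA-only index. A hypothetical caller (not shown in this diff; the names here are illustrative only) would then only switch devices when the inputs actually carry one:

  // Illustrative call site: num_inputs and stack are assumed to be in scope.
  auto device = deviceForInputs(stack, num_inputs);
  if (device) {
    // DeviceGuard takes an at::Device, so a CPU input is handled
    // gracefully instead of being misread as a CUDA index.
    at::DeviceGuard device_guard(*device);
    // ... run the op with the input's device active ...
  } else {
    // No tensor inputs: no device switch needed.
  }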

torch/csrc/autograd/VariableTypeManual.cpp (+1 -3)

@@ -267,9 +267,7 @@ Tensor & VariableType::s_copy_(Tensor & self, const Tensor & src, bool non_block
     grad_fn = std::make_shared<CopyBackwards>();
     grad_fn->set_next_edges(collect_next_edges(self, src));
     grad_fn->src_type = &src.type();
-    if (src.is_cuda()) {
-      grad_fn->src_device = src.get_device();
-    }
+    grad_fn->src_device = src.device();
   }
   if (self.is_sparse() && src.is_sparse()) baseType->copy_sparse_to_sparse_(self_, src_, non_blocking);
   else if (!self.is_sparse() && !src.is_sparse()) baseType->s_copy_(self_, src_, non_blocking);

torch/csrc/autograd/engine.cpp (+6 -1)

@@ -29,6 +29,7 @@
 #ifdef USE_CUDA
 #include <cuda.h>
 #include <THC/THC.h>
+#include <ATen/cuda/CUDAGuard.h>
 #endif
 
 namespace torch { namespace autograd {
@@ -200,9 +201,13 @@ Engine::Engine() = default;
 // This Engine's ReadyQueues and their corresponding threads are leaked here
 Engine::~Engine() = default;
 
+// TODO: Engine is not written in a way that it can deal with anything that's
+// not CUDA.
 auto Engine::thread_init(int device) -> void {
   THInferNumThreads();
-  at::DeviceGuard guard(device);
+#ifdef USE_CUDA
+  at::cuda::CUDAGuard guard(device);
+#endif
   worker_device = device;
   thread_main(nullptr);
 }

torch/csrc/autograd/functions/tensor.cpp (+3 -1)

@@ -24,7 +24,9 @@ auto CopyBackwards::apply(variable_list&& grads) -> variable_list {
   }
   if (should_compute_output(1)) {
     at::DeviceGuard device_guard(src_device);
-    if (grad.is_cuda() && grad.get_device() != src_device) {
+    // TODO: What if !grad.is_cuda(), but src_device is CUDA?
+    // This code is kind of weirdly asymmetric.
+    if (grad.is_cuda() && grad.device() != src_device) {
      grad_inputs[1] = src_type->copy(grad);
    } else {
      grad_inputs[1] = grad.toType(*src_type);

torch/csrc/autograd/functions/tensor.h (+1 -1)

@@ -16,7 +16,7 @@ struct CopyBackwards : public Function {
   variable_list apply(variable_list&& grads) override;
 
   at::Type *src_type = nullptr; // initialized for safety.
-  int32_t src_device = -1;
+  at::Device src_device = at::kCPU;
 };
 
 // Performs grad[idx] = fn(grad[idx]), but out-of-place. The slicing operation

torch/csrc/cuda/comm.cpp (+2 -2)

@@ -41,7 +41,7 @@ std::vector<Tensor> broadcast(const Tensor& tensor, IntList devices) {
       "first on devices list");
   std::vector<Tensor> tensors;
   tensors.reserve(devices.size());
-  at::DeviceGuard _device_guard;
+  at::cuda::CUDAGuard _device_guard;
 #ifdef USE_NCCL
   if (nccl::is_available({tensor})) {
     tensors.push_back(tensor);
@@ -82,7 +82,7 @@ tensor_list2d broadcast_coalesced(TensorList tensors, IntList devices, size_t bu
   o.reserve(tensors.size());
 
   unique_type_checker type_checker;
-  at::DeviceGuard device_guard(devices[0]);
+  at::cuda::CUDAGuard device_guard(devices[0]);
   for (auto & chunk : utils::take_tensors(tensors, buffer_size)) {
     auto & type = chunk.type();
     type_checker.show(type);

torch/csrc/distributed/c10d/ddp.cpp (+2 -2)

@@ -123,7 +123,7 @@ std::tuple<std::shared_ptr<ProcessGroup::Work>, at::Tensor> queueReduction(
   // improve performance
   std::vector<at::cuda::CUDAStream> workerStreams;
   for (size_t devIdx = 0; devIdx < devices.size(); ++devIdx) {
-    at::DeviceGuard guard(devices[devIdx]);
+    at::cuda::CUDAGuard guard(devices[devIdx]);
     events[devIdx].record();
     workerStreams.push_back(at::cuda::getStreamFromPool(false, devices[devIdx]));
     // Let the worker stream to wait for the default stream
@@ -138,7 +138,7 @@ std::tuple<std::shared_ptr<ProcessGroup::Work>, at::Tensor> queueReduction(
 
   std::vector<at::Tensor> gradsBatchCoalesced;
   for (size_t devIdx = 0; devIdx < devices.size(); ++devIdx) {
-    at::DeviceGuard guard(devices[devIdx]);
+    at::cuda::CUDAGuard guard(devices[devIdx]);
     gradsBatchCoalesced.push_back(
         torch::utils::flatten_dense_tensors(gradsBatch[devIdx]));
   }

torch/csrc/generic/StorageSharing.cpp (+2 -1)

@@ -1,6 +1,7 @@
 #ifdef USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <ATen/cuda/CUDAGuard.h>
 #endif
 
 #include <random>
@@ -262,7 +263,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
   size_t storage_size = (size_t)THPUtils_unpackLong(_size);
 
   int64_t device = THPUtils_unpackLong(_device);
-  at::DeviceGuard device_guard(device);
+  at::cuda::CUDAGuard device_guard(device);
 
   char *buffer;
   Py_ssize_t handle_size;

torch/csrc/jit/fusers/cuda/fused_kernel.cpp (+2 -1)

@@ -3,6 +3,7 @@
 #include "torch/csrc/jit/resource_guard.h"
 
 #include "ATen/cuda/CUDAContext.h"
+#include "ATen/cuda/CUDAGuard.h"
 #include "THC/THC.h"
 #include "THC/THCGenerator.hpp"
 #include "torch/csrc/cuda/cuda_check.h"
@@ -34,7 +35,7 @@ CUDAFusedKernel::CUDAFusedKernel(
   const std::string& name
   , AnnotatedGraph& agraph)
   : FusedKernel(name, agraph) {
-  at::DeviceGuard device_guard(agraph.device);
+  at::cuda::CUDAGuard device_guard(agraph.device);
 
   TORCH_CUDA_CHECK(cudaGetDeviceProperties(&prop, agraph.device));
   checkCUDAVersion(prop);

torch/csrc/jit/passes/shape_analysis.cpp (+7 -1)

@@ -38,6 +38,12 @@ int64_t wrapDim(int64_t dim, at::IntList sizes) {
   return dim;
 }
 
+// TODO: Would be better to make JIT not assume that CUDA devices
+// are the only thing that exist.
+static at::Device jitDeviceIndexToDevice(int device) {
+  return device == -1 ? at::kCPU : at::Device(at::kCUDA, device);
+}
+
 IValue representativeValue(Value* v) {
   TypePtr type_ = v->type();
   // if the value is actually constant, just use it!
@@ -46,7 +52,7 @@ IValue representativeValue(Value* v) {
   }
   if (CompleteTensorTypePtr type = type_->cast<CompleteTensorType>()) {
     auto backend = type->device() == -1 ? at::Backend::CPU : at::Backend::CUDA;
-    at::DeviceGuard device_guard(type->device());
+    at::DeviceGuard device_guard(jitDeviceIndexToDevice(type->device()));
     auto& attype = at::getNonVariableType(backend, type->scalarType());
     auto t = at::empty_strided(type->sizes(), type->strides(), attype.options()).zero_();
     return autograd::make_variable(t, /*requires_grad=*/false);
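
To spell out the conversion (a worked example of the helper above, not additional code from the commit): the JIT's CompleteTensorType still stores a device as a plain int with -1 meaning CPU, and jitDeviceIndexToDevice translates that convention into the at::Device that the now device-agnostic DeviceGuard expects.

  // CPU case: index -1 maps to the CPU device, so the guard performs no switch.
  at::DeviceGuard cpu_guard(jitDeviceIndexToDevice(-1));
  // CUDA case: any non-negative index maps to that CUDA device.
  at::DeviceGuard cuda_guard(jitDeviceIndexToDevice(2));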
