
Commit 5ad885b

Neha Shah authored and facebook-github-bot committed
[Caffe2][Pruning] Make the caffe2 Sum operator support long types (pytorch#40379)
Summary:
Pull Request resolved: pytorch#40379

The current Sum operator doesn't support Long (int64) inputs; modify the code so it does.

Test Plan: Added a test case (test_sum in caffe2/python/operator_test/utility_ops_test.py).

Reviewed By: jspark1105, yinghai

Differential Revision: D21917365

fbshipit-source-id: b37d2c100c70d17d2f89c309e40360ddfab584ee
1 parent b623bde commit 5ad885b

15 files changed (+184, -65 lines)
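
As a quick illustration of what this change enables, the sketch below feeds int64 ("Long") tensors to the Sum operator through the Caffe2 Python API. It is not part of the PR; blob names, shapes, and values are illustrative only.

# Minimal sketch (assumes a working caffe2.python install); names are illustrative.
import numpy as np
from caffe2.python import core, workspace

# Two int64 ("Long") tensors -- the dtype this change adds support for.
workspace.FeedBlob("x0", np.arange(6, dtype=np.int64).reshape(2, 3))
workspace.FeedBlob("x1", np.ones((2, 3), dtype=np.int64))

# Sum now dispatches over float, int32, and int64 inputs.
op = core.CreateOperator("Sum", ["x0", "x1"], ["y"])
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("y"))  # elementwise sum of x0 and x1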

caffe2/operators/utility_ops.cu

Lines changed: 2 additions & 10 deletions
@@ -26,16 +26,8 @@ bool WeightedSumOp<CUDAContext>::RunOnDevice() {
 
 template <>
 bool SumOp<CUDAContext>::RunOnDevice() {
-  if (Input(0).IsType<float>()) {
-    return DoRunWithType<float, float>();
-  } else if (Input(0).IsType<at::Half>()) {
-    return DoRunWithType<at::Half, at::Half>();
-  } else if (Input(0).IsType<int32_t>()) {
-    return DoRunWithType<int32_t, int32_t>();
-  } else {
-    CAFFE_THROW("Unsupported inputs");
-  }
-  return false;
+  return DispatchHelper<TensorTypes<float, int32_t, int64_t>>::call(
+      this, Input(0));
 }
 
 REGISTER_CUDA_OPERATOR(Print, PrintOp<CUDAContext>);
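
The CUDA RunOnDevice above now dispatches over float, int32_t, and int64_t via DispatchHelper. A hedged sketch of exercising that path from Python follows; it assumes a CUDA-enabled Caffe2 build with at least one GPU and is not code from this PR.

import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace

gpu = core.DeviceOption(caffe2_pb2.CUDA, 0)  # assumes a CUDA build and GPU 0
workspace.FeedBlob("x0", np.arange(4, dtype=np.int64), device_option=gpu)
workspace.FeedBlob("x1", np.ones(4, dtype=np.int64), device_option=gpu)

# With this change, int64 inputs take the DispatchHelper path instead of
# hitting CAFFE_THROW("Unsupported inputs").
op = core.CreateOperator("Sum", ["x0", "x1"], ["y"], device_option=gpu)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("y"))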

caffe2/operators/utility_ops.h

Lines changed: 27 additions & 27 deletions
@@ -239,8 +239,7 @@ class FlattenToVecOp : public Operator<Context> {
   bool RunOnDevice() override {
     auto& input = Input(0);
     auto* output = Output(0);
-    CAFFE_ENFORCE_GE(
-        input.dim(), 1, "The rank of the tensor must be >= 1.");
+    CAFFE_ENFORCE_GE(input.dim(), 1, "The rank of the tensor must be >= 1.");
     output->Resize(input.numel());
 
     context_.CopyItemsSameDevice(
@@ -280,7 +279,7 @@ class SumOp : public Operator<Context> {
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   USE_SIMPLE_CTOR_DTOR(SumOp);
 
-  template <typename T, typename M>
+  template <typename T>
   bool DoRunWithType() {
     auto& input0 = Input(0);
 
@@ -331,16 +330,8 @@ class SumOp : public Operator<Context> {
   }
 
   bool RunOnDevice() override {
-    if (Input(0).template IsType<float>()) {
-      return DoRunWithType<float, float>();
-    } else if (Input(0).template IsType<int>()) {
-      return DoRunWithType<int, int>();
-    } else {
-      CAFFE_THROW(
-          "Sum operator only supports 32-bit float and ints, but",
-          " input was of type ",
-          Input(0).dtype().name());
-    }
+    return DispatchHelper<TensorTypes<float, int32_t, int64_t>>::call(
+        this, Input(0));
   }
 };
 
@@ -369,7 +360,8 @@ class WeightedSumOp : public Operator<Context> {
   template <typename T>
   bool DoRunWithType() {
     // the code is written this way because of 10.1 + gcc 7.3.1 compiler bug
-    // as discussed at https://devtalk.nvidia.com/default/topic/1048037/linux/cuda-10-1-nvidia-you-re-now-quot-fixing-quot-gcc-bugs-that-gcc-doesn-t-even-have/
+    // as discussed at
+    // https://devtalk.nvidia.com/default/topic/1048037/linux/cuda-10-1-nvidia-you-re-now-quot-fixing-quot-gcc-bugs-that-gcc-doesn-t-even-have/
     const int input_size = (*this).InputSize();
     CAFFE_ENFORCE_EQ(input_size % 2, 0);
     const auto& X0 = Input(0);
@@ -751,14 +743,14 @@ class ScatterOp : public Operator<CPUContext> {
   template <class... Args>
   explicit ScatterOp(Args&&... args)
       : Operator<CPUContext>(std::forward<Args>(args)...),
-        OP_SINGLE_ARG(int, "axis", axis_, 1) {
-  }
+        OP_SINGLE_ARG(int, "axis", axis_, 1) {}
 
   virtual ~ScatterOp() noexcept override {}
 
   bool RunOnDevice() override {
-
-    TORCH_CHECK(Context::GetDeviceType() == kCPU, "ScatterOp currently only supports CPU.")
+    TORCH_CHECK(
+        Context::GetDeviceType() == kCPU,
+        "ScatterOp currently only supports CPU.")
 
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
         this, this->template Input<Tensor>(INDICES, CPU));
@@ -775,7 +767,8 @@ class ScatterOp : public Operator<CPUContext> {
     // ONNX allows negative axis to index from the back, valid range: [-r, r].
     axis_ = data.canonical_axis_index(axis_);
 
-    CAFFE_ENFORCE_GE(data.dim(), axis_ + 1, "DATA should be at least [axis+1]-D");
+    CAFFE_ENFORCE_GE(
+        data.dim(), axis_ + 1, "DATA should be at least [axis+1]-D");
     CAFFE_ENFORCE_GE(axis_, 0, "Axis should be non-negative");
     CAFFE_ENFORCE_LT(axis_, data.dim(), "Axis out of range");
 
@@ -818,14 +811,20 @@ class ScatterOp : public Operator<CPUContext> {
     // src offset can be computed as i * J_src * K + j * K + k.
     // dst offset can be computed as i * J_dst * K + idxs[idxs_offset] * K + K
     // Note that idxs and src should have the same rank and shape.
-    // dst should have the same rank as idxs and src, but the dimension of dim axis can be different.
-    // That is why in the above equation, there is the difference of J_src and J_dst.
-    for (int64_t outer_batch = 0; outer_batch < outer_dims_product; ++outer_batch) {
+    // dst should have the same rank as idxs and src, but the dimension of dim
+    // axis can be different. That is why in the above equation, there is the
+    // difference of J_src and J_dst.
+    for (int64_t outer_batch = 0; outer_batch < outer_dims_product;
+         ++outer_batch) {
       for (int64_t i = 0; i < N; ++i) {
-        for (int64_t inner_batch = 0; inner_batch < idxs_block_size; ++inner_batch) {
-          auto idxs_elem_idx = outer_batch * idxs_batch_size + i * idxs_block_size + inner_batch;
-          auto src_elem_idx = outer_batch * src_batch_size + i * src_block_size + inner_batch;
-          auto dst_elem_idx = outer_batch * dst_batch_size + idxs[idxs_elem_idx] * dst_block_size + inner_batch;
+        for (int64_t inner_batch = 0; inner_batch < idxs_block_size;
+             ++inner_batch) {
+          auto idxs_elem_idx =
+              outer_batch * idxs_batch_size + i * idxs_block_size + inner_batch;
+          auto src_elem_idx =
+              outer_batch * src_batch_size + i * src_block_size + inner_batch;
+          auto dst_elem_idx = outer_batch * dst_batch_size +
+              idxs[idxs_elem_idx] * dst_block_size + inner_batch;
 
           auto src = src_base + src_elem_idx * item_bytesize;
           auto dst = out + dst_elem_idx * item_bytesize;
@@ -1401,7 +1400,8 @@ class RangeOp : public Operator<Context> {
     T step = 1;
 
     for (int i = 0; i < InputSize(); ++i) {
-      CAFFE_ENFORCE_EQ(Input(i).numel(), 1, "All inputs must be scalar/1D tensor.");
+      CAFFE_ENFORCE_EQ(
+          Input(i).numel(), 1, "All inputs must be scalar/1D tensor.");
    }
 
    switch (InputSize()) {

caffe2/python/operator_test/utility_ops_test.py

Lines changed: 39 additions & 1 deletion
@@ -12,7 +12,6 @@
 import numpy as np
 import random
 import six
-import unittest
 
 
 class TestUtilityOps(serial.SerializedTestCase):
@@ -270,6 +269,45 @@ def mx_grad(a):
         )
         self.assertDeviceChecks(dc, op, inputs, [0, 1, 2])
 
+    @serial.given(
+        n=st.integers(1, 8), m=st.integers(1, 10), d=st.integers(1, 4),
+        in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]),
+        seed=st.integers(min_value=0, max_value=65535),
+        dtype=st.sampled_from([np.int32, np.int64, np.float32]),
+        **hu.gcs)
+    def test_sum(
+            self, n, m, d, in_place, engine, seed, dtype, gc, dc):
+        input_names = []
+        input_vars = []
+        np.random.seed(seed)
+        for i in range(m):
+            X_name = 'X' + str(i)
+            input_names.extend([X_name])
+            var = np.random.rand(n, d).astype(dtype)
+            vars()[X_name] = var
+            input_vars.append(var)
+
+        def sum_op_ref(*args):
+            res = np.zeros((n, d))
+            for i in range(m):
+                res = res + args[i]
+            return (res, )
+
+        op = core.CreateOperator(
+            "Sum",
+            input_names,
+            [input_names[0]] if in_place else ['Y'],
+            engine=engine,
+        )
+
+        self.assertReferenceChecks(
+            device_option=gc,
+            op=op,
+            inputs=input_vars,
+            reference=sum_op_ref,
+        )
+        self.assertDeviceChecks(dc, op, input_vars, [0])
+
     @serial.given(
         inputs=hu.lengths_tensor().flatmap(
             lambda pair: st.tuples(
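
For readers who want to poke at the behavior the new test covers without Hypothesis, the stripped-down sketch below does by hand what assertReferenceChecks does in test_sum, including the in-place case where the first input doubles as the output. Shapes, values, and blob names are illustrative, not from the PR.

import numpy as np
from caffe2.python import core, workspace

# Three random int64 inputs, summed both out-of-place and in place.
inputs = [np.random.randint(0, 10, size=(3, 4)).astype(np.int64) for _ in range(3)]
for i, x in enumerate(inputs):
    workspace.FeedBlob("X%d" % i, x)

expected = sum(inputs)  # NumPy reference, mirrors sum_op_ref

# Out-of-place: write the result to a fresh blob Y.
workspace.RunOperatorOnce(core.CreateOperator("Sum", ["X0", "X1", "X2"], ["Y"]))
np.testing.assert_array_equal(workspace.FetchBlob("Y"), expected)

# In-place (in_place=True in the test): X0 is reused as the output blob.
workspace.RunOperatorOnce(core.CreateOperator("Sum", ["X0", "X1", "X2"], ["X0"]))
np.testing.assert_array_equal(workspace.FetchBlob("X0"), expected)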
