
Commit ba3a90b

malfet authored and facebook-github-bot committed
Revert D28819780: [TensorExpr] Fix handling of 0-dim tensors.
Test Plan: revert-hammer

Differential Revision: D28819780

Original commit changeset: f3feff35a1ce

fbshipit-source-id: 1dca4ac9cea0b67e9f02800f6d5b3c7e4ae1d81a
1 parent 88fb5ee commit ba3a90b

8 files changed: +56 -129 lines

test/cpp/tensorexpr/test_kernel.cpp (+16 -13)

@@ -675,24 +675,24 @@ at::Tensor iotaTensor(IntArrayRef sizes, const at::TensorOptions& options) {
 } // namespace

 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-TEST_F(Kernel, SumAllAxes) {
+TEST_F(Kernel, DISABLED_SumAllAxes) {
+  // [zero-dim tensors]
+  // NNC does not yet handle zero-dim tensors. aten::sum with no axis
+  // input returns a zero-dim tensors, so these tests must be disabled
+  // until we add support for zero-dim tensors.
   // Test lowering of sum on all axes.
   const auto graph_template = R"IR(
       graph(%0 : Float(5, 3, strides=[3, 1], device=cpu)):
         %1 : ${dtype}
-        %2 : ${out_dtype}(requires_grad=0, device=cpu) = aten::sum(%0, %1)
+        %2 : Tensor = aten::sum(%0, %1)
         return (%2))IR";
   auto a = iotaTensor({5, 3}, TensorOptions(kCPU).dtype(at::kFloat));

   for (auto scalar_type : {ScalarType::Undefined, ScalarType::Double}) {
     KernelScope kernel_scope;
     TemplateEnv env;
     env.s("dtype", dtypeConstant(scalar_type));
-    if (scalar_type == ScalarType::Undefined) {
-      env.s("out_dtype", "Float");
-    } else {
-      env.s("out_dtype", "Double");
-    }
     const auto graph_string = format(graph_template, env);

     auto graph = std::make_shared<Graph>();

@@ -1104,16 +1104,17 @@ TEST_F(Kernel, Softmax4D) {
 }

 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-TEST_F(Kernel, InlineProducerIntoReduction) {
+TEST_F(Kernel, DISABLED_InlineProducerIntoReduction) {
+  // see : [zero-dim tensors]
   KernelScope kernel_scope;

   // Inline producer (mul) into reduction (sum).
   const auto graph_string = R"IR(
       graph(%0 : Float(5, 3, strides=[3, 1], device=cpu),
             %1 : Float(5, 3, strides=[3, 1], device=cpu)):
-        %2 : Float(5, 3, strides=[3, 1], device=cpu) = aten::mul(%0, %1)
+        %2 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %1)
         %3 : int = prim::Constant[value=7]()
-        %4 : Double(device=cpu) = aten::sum(%2, %3)
+        %4 : Float(5, 3, strides=[3, 1]) = aten::sum(%2, %3)
         return (%4))IR";
   auto graph = std::make_shared<Graph>();
   parseIR(graph_string, &*graph);

@@ -1144,7 +1145,9 @@ TEST_F(Kernel, InlineProducerIntoReduction) {
 }

 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-TEST_F(Kernel, InlineReductionIntoConsumer) {
+TEST_F(Kernel, DISABLED_InlineReductionIntoConsumer) {
+  // see : [zero-dim tensors]
+
   KernelScope kernel_scope;

   // Inline producer (mul %2) into reduction (sum %4) but DO NOT

@@ -1154,8 +1157,8 @@ TEST_F(Kernel, InlineReductionIntoConsumer) {
             %1 : Float(5, 3, strides=[3, 1], device=cpu)):
         %2 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %1)
         %3 : int = prim::Constant[value=6]()
-        %4 : Float(device=cpu) = aten::sum(%2, %3)
-        %5 : Float(5, 3, strides=[3, 1], device=cpu) = aten::mul(%2, %4)
+        %4 : Float(5, 3, strides=[3, 1]) = aten::sum(%2, %3)
+        %5 : Float(5, 3, strides=[3, 1]) = aten::mul(%2, %4)
         return (%5))IR";
   auto graph = std::make_shared<Graph>();
   parseIR(graph_string, &*graph);
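
Context for the [zero-dim tensors] note above: aten::sum with no dim argument collapses every axis, so its result is a rank-0 (scalar) tensor, which is the case NNC cannot lower after this revert. A minimal libtorch sketch of that behavior (standalone illustration, not part of test_kernel.cpp):

#include <torch/torch.h>
#include <iostream>

int main() {
  torch::Tensor a = torch::rand({5, 3});
  torch::Tensor s = a.sum();             // no dim argument: reduce all axes
  std::cout << s.dim() << "\n";          // prints 0 -> rank-0 (scalar) tensor
  std::cout << s.item<float>() << "\n";  // the scalar value is still accessible
  torch::Tensor k = a.sum({0, 1}, /*keepdim=*/true);
  std::cout << k.dim() << "\n";          // prints 2: keepdim avoids the 0-dim case
  return 0;
}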

test/cpp/tensorexpr/test_reductions.cpp (-48)

@@ -23,54 +23,6 @@ namespace jit {

 using namespace torch::jit::tensorexpr;

-TEST(Reductions, ReduceSum0D_1) {
-  KernelScope kernel_scope;
-  const int M = 10;
-
-  Placeholder b(BufHandle("b", {M}, kFloat));
-  std::vector<float> in(M);
-  for (int j = 0; j < M; ++j) {
-    in[j] = j;
-  }
-
-  std::vector<float> out(M, -1.f);
-
-  Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {});
-  LoopNest loop({c});
-  loop.prepareForCodegen();
-  Stmt* s = loop.root_stmt();
-  s = IRSimplifier::simplify(s);
-
-  SimpleIREvaluator cg(s, {b, c});
-
-  cg.call({in, out});
-  for (int i = 0; i < M; ++i) {
-    ASSERT_EQ(out[i], in[i]);
-  }
-}
-
-TEST(Reductions, ReduceSum0D_2) {
-  KernelScope kernel_scope;
-  const int M = 10;
-
-  Placeholder b(BufHandle("b", {}, kFloat));
-  std::vector<float> in(1);
-  in[0] = 77.7;
-
-  std::vector<float> out(1, -1.f);
-
-  Tensor* c = Reduce("sum", {}, Sum(), b, {});
-  LoopNest loop({c});
-  loop.prepareForCodegen();
-  Stmt* s = loop.root_stmt();
-  s = IRSimplifier::simplify(s);
-
-  SimpleIREvaluator cg(s, {b, c});
-
-  cg.call({in, out});
-  ASSERT_EQ(out[0], in[0]);
-}
-
 // Sum an array to a single value.
 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
 TEST(Reductions, ReduceSum1D) {

test/cpp/tensorexpr/test_te_fuser_pass.cpp (+6 -7)

@@ -114,22 +114,21 @@ TEST(TEFuserPass, FuserPass_3) {

 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
 TEST(TEFuserPass, FuserPass_0DimInput) {
-  WithCPUFuser cf;
   const auto graph_string = R"IR(
-    graph(%x : Float(device=cpu),
-          %y : Float(device=cpu)):
+    graph(%x : Float(device=cuda),
+          %y : Float(device=cuda)):
       %one : int = prim::Constant[value=1]()
-      %a : Float(device=cpu) = aten::mul(%x, %y)
-      %b : Float(device=cpu) = aten::add(%x, %a, %one)
+      %a : Float(device=cuda) = aten::mul(%x, %y)
+      %b : Float(device=cuda) = aten::add(%x, %a, %one)
       return (%b))IR";
   auto g = std::make_shared<Graph>();
   torch::jit::parseIR(graph_string, g.get());

   g->lint();
   FuseTensorExprs(g);

-  // We should fuse 0-dim tensors too
-  testing::FileCheck().check("prim::TensorExprGroup")->run(*g);
+  // We should not fuse 0-dim tensors
+  testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
 }

 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)

test/test_jit_fuser_te.py (+13)

@@ -999,6 +999,7 @@ def fn_test_diamond(x, y):
         assert cx.elapsed_value() == 1
         self.assertEqual(out, x + y)

+    @unittest.skip("Reenable when TE will add support for 0-dim tensors")
     def test_scalar(self):
         def fn(x, y):
             return 2 * x + y

@@ -1972,6 +1973,7 @@ def te_compile(self, device, dtype, op):
         if op.name in skip_ops:
             return
         sample_inputs_itr = op.sample_inputs(device, dtype, requires_grad=False)
+        is_compiling = False
         for sample_input in sample_inputs_itr:
             arg_values = [sample_input.input] + list(sample_input.args)
             kwarg_values = sample_input.kwargs

@@ -2003,12 +2005,23 @@ def f({', '.join(param_names)}):
             f.__module__ = 'test'
             out = f(*param_values)

+            # NNC currently oftens segfault when asked to lower ops with 0-dim tensor outputs
+            if isinstance(out, torch.Tensor) and out.dim() == 0:
+                continue
+            else:
+                is_compiling = True
+
             ts_g = torch.jit.trace(f, param_values)
             kernel = torch._C._te.TensorExprKernel(ts_g.graph)
             correct_val = f(*param_values)
             self.assertEqual(kernel.run(tuple(param_values)), correct_val)
             self.assertEqual(kernel.fallback(tuple(param_values)), correct_val)

+        # If all sample inputs have scalar output, we won't have tested it and
+        # we consider the op to be not working
+        if not is_compiling:
+            raise RuntimeError("Skipped all inputs")
+
     @onlyCPU
     @unittest.skipIf(not LLVM_ENABLED, "Compiles with TensorExprKernel")
     @ops([op for op in op_db if get_name(op) in works_list], allowed_dtypes=(torch.float,))

test/test_tensorexpr.py (+5 -2)

@@ -529,6 +529,7 @@ def test(x, y):
         )
         self.assertLastGraphAllFused()

+    @unittest.skip("temporarily disable")
     def test_min_max_reduction(self):
         def test(x):
             return torch.min(x) + torch.max(x)

@@ -538,6 +539,7 @@ def test(x):
         np.testing.assert_allclose(warmup_and_run_forward(traced, a), np.amin(a.numpy()) + np.amax(a.numpy()))
         self.assertLastGraphAllFused()

+    @unittest.skip("temporarily disable")
     def test_min_max_reduction2(self):
         def test(x):
             return x.min() + x.max()

@@ -557,13 +559,14 @@ def test(x):
                 a.numpy(), axis=1) + np.amax(a.numpy(), axis=1))
         self.assertLastGraphAllFused()

+    @unittest.skip("temporarily disable")
     def test_min_max_reduction_dim1_2(self):
         def test(x):
-            return torch.min(x * x, 1)
+            return torch.min(x, 1)

         traced = torch.jit.trace(test, (torch.zeros(16, 16)))
         a = 8.0 * torch.rand(16, 16)
-        np.testing.assert_allclose(warmup_and_run_forward(traced, a)[0], np.amin((a * a).numpy(), axis=1))
+        np.testing.assert_allclose(warmup_and_run_forward(traced, a)[0], np.amin(a.numpy(), axis=1))
         self.assertLastGraphAllFused()

     def test_clamp(self):

torch/csrc/jit/passes/tensorexpr_fuser.cpp (+3)

@@ -847,6 +847,9 @@ class TensorExprFuser {
       if (!v->isCompleteTensor()) {
         return false;
       }
+      if (*v->type()->castRaw<TensorType>()->dim() == 0) {
+        return false;
+      }
     }
     return true;
   }
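
The guard re-added above makes the fuser reject any value whose tensor type has rank zero, so graphs touching 0-dim tensors fall back to the regular interpreter instead of NNC. A rough standalone sketch of the same predicate using plain shapes instead of the JIT TensorType API (the names here are illustrative, not the fuser's real helpers):

#include <cassert>
#include <optional>
#include <vector>

using Shape = std::vector<int64_t>;

// A value is only fusible if its shape is fully known (std::nullopt models an
// incomplete TensorType) and its rank is not zero.
bool canHandleInput(const std::optional<Shape>& shape) {
  if (!shape.has_value()) {
    return false;  // incomplete tensor: cannot fuse
  }
  if (shape->empty()) {
    return false;  // rank-0 (0-dim) tensor: rejected again after this revert
  }
  return true;
}

int main() {
  assert(canHandleInput(Shape{5, 3}));    // complete 2-D shape: fusible
  assert(!canHandleInput(Shape{}));       // 0-dim tensor: not fusible
  assert(!canHandleInput(std::nullopt));  // unknown shape: not fusible
  return 0;
}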

torch/csrc/jit/tensorexpr/kernel.cpp (+13 -49)

@@ -1407,39 +1407,30 @@ Tensor* computeSum(
   // aten::sum takes the input tensor named self.
   auto sizes = valueShape(inputs[0]);

-  size_t rank = sizes.size();
+  int rank = sizes.size();
   if (inputs.size() > 2) {
-    if (auto emptyAxes = c10::get_if<BufList>(&inputs[1])) {
-      // If dim-array is an empty list, it will appear as BufList instead of
-      // IntList, and hence we need a special handling for it.
-      // In that case, we need to sum over all axes.
-      TORCH_INTERNAL_ASSERT(emptyAxes->empty());
-      axes.resize(rank);
-      std::iota(axes.begin(), axes.end(), 0);
-    } else if (rank > 0) {
-      auto nodeAxes = c10::get<IntList>(inputs[1]);
-      // Canonicalize axes: wrap around, sort and make unique.
-      for (auto axis : nodeAxes) {
-        axes.push_back(at::maybe_wrap_dim(axis, rank));
-      }
-      std::sort(axes.begin(), axes.end());
-      axes.erase(std::unique(axes.begin(), axes.end()), axes.end());
+    auto nodeAxes = c10::get<IntList>(inputs[1]);
+    // Canonicalize axes: wrap around, sort and make unique.
+    for (auto axis : nodeAxes) {
+      axes.push_back(at::maybe_wrap_dim(axis, rank));
     }
+    std::sort(axes.begin(), axes.end());
+    axes.erase(std::unique(axes.begin(), axes.end()), axes.end());
     keepdim = c10::get<bool>(inputs[2]);
   } else {
-    axes.resize(rank);
+    axes.resize(sizes.size());
     std::iota(axes.begin(), axes.end(), 0);
   }
   // Axes go into reduction dimensions.
   std::vector<DimArg> reductionDims;
-  reductionDims.reserve(rank);
+  reductionDims.reserve(sizes.size());
   for (size_t axis : axes) {
     reductionDims.emplace_back(sizes[axis]);
   }
   std::vector<DimArg> outputDims;
   // Output dimensions are the complement of axes. When keepdim is set, a
   // one-sized dimension is inserted for each axis.
-  for (size_t dim = 0; dim < rank; ++dim) {
+  for (size_t dim = 0; dim < sizes.size(); ++dim) {
     if (!std::count(axes.begin(), axes.end(), dim)) {
       outputDims.emplace_back(sizes[dim]);
     } else if (keepdim) {

@@ -2519,6 +2510,9 @@ Tensor* tensorexpr::computeOperandValue(
     }
     case aten::t: {
      auto shape = valueShape(inputs[0]);
+      if (shape.size() == 1) {
+        return new Tensor(c10::get<BufHandle>(inputs[0]).node(), nullptr);
+      }
      return computeOperandValue(
          aten::transpose,
          {inputs[0], (int64_t)1, (int64_t)0},

@@ -2527,17 +2521,6 @@ Tensor* tensorexpr::computeOperandValue(
    }
    case aten::transpose: {
      auto A = c10::get<BufHandle>(inputs[0]);
-      // Trivial case of 0-dim and 1-dim tensors: transpose is just a copy
-      if (A.ndim() < 1) {
-        return Compute(
-            "aten_transpose",
-            c10::fmap<DimArg>(outputShape),
-            [&](std::vector<VarHandle> axes) {
-              TORCH_INTERNAL_ASSERT(axes.size() <= 1);
-              return A.load(axes);
-            });
-      }
-      // Usual case where transpose actually swaps dimensions
      auto start_dim =
          at::maybe_wrap_dim(c10::get<int64_t>(inputs[1]), A.ndim());
      auto to_dim = at::maybe_wrap_dim(c10::get<int64_t>(inputs[2]), A.ndim());

@@ -2551,16 +2534,6 @@ Tensor* tensorexpr::computeOperandValue(
    }
    case aten::permute: {
      auto A = c10::get<BufHandle>(inputs[0]);
-      // Trivial case of 0-dim tensors: just a copy of the input
-      if (A.ndim() == 0) {
-        return Compute(
-            "aten_permute",
-            c10::fmap<DimArg>(outputShape),
-            [&](const std::vector<VarHandle>& axes) {
-              std::vector<ExprHandle> empty_indices;
-              return A.load(empty_indices);
-            });
-      }
      auto permute_dims = c10::get<IntList>(inputs[1]);
      return Compute(
          "aten_permute",

@@ -2590,15 +2563,6 @@ Tensor* tensorexpr::computeOperandValue(
    case aten::reshape:
    case aten::view: {
      auto A = c10::get<BufHandle>(inputs[0]);
-      if (A.ndim() == 0) {
-        return Compute(
-            "aten_view",
-            c10::fmap<DimArg>(outputShape),
-            [&](const std::vector<VarHandle>& axes) {
-              std::vector<ExprHandle> empty_indices;
-              return A.load(empty_indices);
-            });
-      }
      auto view_dims = c10::get<IntList>(inputs[1]);
      return Compute(
          "aten_reshape",

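The computeSum hunk above restores the pre-revert axis handling: when an explicit dim list is given it is wrapped, sorted, and de-duplicated, and when no dims are given every axis is reduced. A self-contained sketch of that canonicalization step, with a hand-rolled stand-in for at::maybe_wrap_dim (helper names are illustrative only, not the kernel's real code):

#include <algorithm>
#include <cassert>
#include <numeric>
#include <stdexcept>
#include <vector>

// Stand-in for at::maybe_wrap_dim: map a possibly negative axis into [0, rank).
int64_t wrapDim(int64_t axis, int64_t rank) {
  if (axis < -rank || axis >= rank) {
    throw std::out_of_range("axis out of range");
  }
  return axis < 0 ? axis + rank : axis;
}

// Canonicalize a dim list the way the restored computeSum does: wrap negative
// axes, sort, and drop duplicates. An empty dim list means "reduce all axes".
std::vector<size_t> canonicalizeAxes(const std::vector<int64_t>& dims, int64_t rank) {
  std::vector<size_t> axes;
  if (dims.empty()) {
    axes.resize(rank);
    std::iota(axes.begin(), axes.end(), 0);  // sum over every axis
    return axes;
  }
  for (int64_t d : dims) {
    axes.push_back(static_cast<size_t>(wrapDim(d, rank)));
  }
  std::sort(axes.begin(), axes.end());
  axes.erase(std::unique(axes.begin(), axes.end()), axes.end());
  return axes;
}

int main() {
  // aten::sum(x, dim=[-1, 0, 0]) on a rank-2 input reduces over axes {0, 1}.
  assert((canonicalizeAxes({-1, 0, 0}, 2) == std::vector<size_t>{0, 1}));
  // No dim argument: reduce every axis, which is what produces a 0-dim output.
  assert((canonicalizeAxes({}, 2) == std::vector<size_t>{0, 1}));
  return 0;
}
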
torch/csrc/jit/tensorexpr/tensor.h (-10)

@@ -177,16 +177,6 @@ Tensor* Reduce(
   std::vector<const Var*> reduce_vars;
   unpack_dim_args(reduce_args, &reduce_dims, &reduce_vars);

-  // If reduce_vars is empty, then it's not a reduction, but rather a simple
-  // copy
-  if (reduce_vars.empty()) {
-    const Expr* body =
-        Reducer::getReduceBody(body_func, VarVectorToVarHandleVector(vars))
-            .node();
-    Buf* func_result = new Buf(func_name, dims, body->dtype());
-    return new Tensor(func_result, vars, body);
-  }
-
   std::vector<const Var*> all_vars;
   all_vars.insert(all_vars.end(), vars.begin(), vars.end());
   all_vars.insert(all_vars.end(), reduce_vars.begin(), reduce_vars.end());
