
Commit dd25111

peterbell10 authored and pytorchmergebot committed

[caffe2] Remove OperatorBase::newstyle_outputs_ (pytorch#67093)

`OperatorBase` maintains both `output_tensors_` and `newstyle_outputs_`, which hold the same list of tensors, except that one is a `vector<caffe2::Tensor>` and the other is a `List<at::Tensor>`. This change keeps only `output_tensors_` and handles the conversions inside export_caffe2_op_to_c10.

Differential Revision: [D32289811](https://our.internmc.facebook.com/intern/diff/D32289811)
Pull Request resolved: pytorch#67093
Approved by: https://github.com/dagitses, https://github.com/malfet

1 parent: e137dcc
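
For illustration, here is a minimal sketch (not part of the commit) of the two conversions that export_caffe2_op_to_c10.h now performs at the c10 boundary instead of `OperatorBase` keeping a second `List<at::Tensor>`. The helper names `to_caffe2` and `to_aten` are invented for this example; both directions only re-wrap the same underlying `TensorImpl`, so no tensor data is copied.

```cpp
// Hedged sketch of the boundary conversion, assuming a caffe2 build where
// these headers are available. to_caffe2/to_aten are illustrative names only.
#include <vector>

#include <ATen/core/List.h>
#include <ATen/core/Tensor.h>
#include <c10/util/irange.h>
#include <caffe2/core/tensor.h>

// Unwrap the preallocated c10::List<at::Tensor> into the
// std::vector<caffe2::Tensor> form that OperatorBase now stores in
// output_tensors_.
std::vector<caffe2::Tensor> to_caffe2(c10::List<at::Tensor>&& outputs) {
  std::vector<caffe2::Tensor> outputs_c2(outputs.size());
  for (const auto i : c10::irange(outputs.size())) {
    // caffe2::Tensor wraps the same TensorImpl; no data copy happens here.
    outputs_c2[i] = caffe2::Tensor(outputs.extract(i));
  }
  return outputs_c2;
}

// Re-wrap the results as at::Tensor before they go back onto the JIT stack,
// mirroring the loops added to _call_caffe2_op_from_c10 below.
c10::List<at::Tensor> to_aten(std::vector<caffe2::Tensor>&& outputs_c2) {
  c10::List<at::Tensor> outputs;
  outputs.reserve(outputs_c2.size());
  for (auto& t : outputs_c2) {
    outputs.push_back(at::Tensor(std::move(t)));
  }
  return outputs;
}
```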

File tree (4 files changed: +39 −45 lines)

caffe2/contrib/aten/aten_op_template.h
caffe2/core/export_caffe2_op_to_c10.h
caffe2/core/operator.cc
caffe2/core/operator.h


caffe2/contrib/aten/aten_op_template.h

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 #pragma once
 #include <unordered_map>
 #include <string>
-#include <ATen/ATen.h>
+#include <ATen/Functions.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/irange.h>
 #include <caffe2/core/context.h>

caffe2/core/export_caffe2_op_to_c10.h

Lines changed: 23 additions & 13 deletions
@@ -12,6 +12,7 @@
 #include <c10/util/irange.h>
 #include <torch/csrc/jit/frontend/function_schema_parser.h>
 #include <torch/library.h>
+#include <caffe2/core/tensor.h>
 #include <vector>
 
 namespace caffe2 {
@@ -20,19 +21,19 @@ namespace detail {
 constexpr const char* PREALLOCATED_OUTPUT_ARGNAME =
     "_caffe2_preallocated_outputs";
 
-using _CallCaffe2OpFunc = c10::List<at::Tensor>(
+using _CallCaffe2OpFunc = std::vector<caffe2::Tensor>(
     const c10::FunctionSchema& schema,
-    std::vector<c10::IValue>&& inputs,
-    c10::List<at::Tensor>&& outputs);
+    std::vector<c10::IValue> &&inputs,
+    std::vector<caffe2::Tensor> &&outputs);
 
 template <class Caffe2Operator>
-inline c10::List<at::Tensor> _call_caffe2_op(
+inline std::vector<caffe2::Tensor> _call_caffe2_op(
     const c10::FunctionSchema& schema,
-    std::vector<c10::IValue>&& inputs,
-    c10::List<at::Tensor>&& outputs) {
+    std::vector<c10::IValue> &&inputs,
+    std::vector<caffe2::Tensor> &&outputs) {
   Caffe2Operator op(schema, std::move(inputs), std::move(outputs), -1);
   op.Run(-1);
-  return std::move(op).move_newstyle_outputs();
+  return std::move(op).move_output_tensors();
 }
 
 // This function is inline in the hope that compilers optimizing for speed will
@@ -62,7 +63,6 @@ inline void _call_caffe2_op_from_c10(
           *OptionalType::create(ListType::ofTensors())));
   IValue preallocated_outputs = torch::jit::pop(*stack);
 
-  const size_t num_outputs = schema.returns().size();
   const size_t num_inputs = schema.arguments().size() -
       1; // -1 because the last argument is the list of preallocated tensors
 
@@ -71,7 +71,7 @@
     // either the schema doesn't support preallocated outputs or it does but
     // they haven't been passed in. Pass a list of uninitialized tensors to
    // the caffe2 operator as preallocated outputs.
-    outputs.resize(num_outputs);
+    outputs.resize(schema.returns().size());
   } else {
     AT_ASSERT(preallocated_outputs.isTensorList());
     outputs = std::move(preallocated_outputs).toTensorList();
@@ -81,7 +81,15 @@
   // instances in the cache.
   std::vector<IValue> inputs = torch::jit::pop(*stack, num_inputs);
 
-  outputs = (*call_op)(schema, std::move(inputs), std::move(outputs));
+  // Convert outputs to caffe2::Tensor
+  const size_t num_outputs = outputs.size();
+  std::vector<caffe2::Tensor> outputs_c2(num_outputs);
+  for (auto i : c10::irange(num_outputs)) {
+    outputs_c2[i] = caffe2::Tensor(outputs.extract(i));
+  }
+
+  outputs_c2 = (*call_op)(schema, std::move(inputs), std::move(outputs_c2));
+  TORCH_INTERNAL_ASSERT(num_outputs == outputs_c2.size());
 
   bool return_tensor_list = false;
   if (schema.returns().size() == 1) {
@@ -93,11 +101,13 @@
     }
   }
   if (return_tensor_list) {
-    // We should not unwrap the list if we expect tensor list in the schema.
+    for (const auto i : c10::irange(num_outputs)) {
+      outputs.set(i, at::Tensor(std::move(outputs_c2[i])));
+    }
     torch::jit::push(*stack, outputs);
   } else {
-    for (const auto i : c10::irange(outputs.size())) {
-      torch::jit::push(*stack, outputs.extract(i));
+    for (const auto i : c10::irange(num_outputs)) {
+      torch::jit::push(*stack, at::Tensor(std::move(outputs_c2[i])));
     }
   }
 
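
Taken together, the changes in this file reduce the non-legacy dispatch path to the calling convention sketched below. This is a hedged illustration rather than code from the commit; `MyCpuOp` stands in for any concrete `Caffe2Operator` type and is hypothetical.

```cpp
// Hedged usage sketch of the non-legacy calling convention implemented by
// _call_caffe2_op above. MyCpuOp is a hypothetical operator type.
#include <vector>

#include <ATen/core/function_schema.h>
#include <ATen/core/ivalue.h>
#include <caffe2/core/tensor.h>

template <class MyCpuOp>
std::vector<caffe2::Tensor> run_once(
    const c10::FunctionSchema& schema,
    std::vector<c10::IValue> inputs,
    std::vector<caffe2::Tensor> preallocated_outputs) {
  // Outputs travel as std::vector<caffe2::Tensor> end to end and are stored
  // directly in output_tensors_; there is no c10::List<at::Tensor> mirror.
  MyCpuOp op(schema, std::move(inputs), std::move(preallocated_outputs), -1);
  op.Run(-1);
  // move_output_tensors() is the renamed accessor that replaces
  // move_newstyle_outputs(); it moves output_tensors_ out of the operator.
  return std::move(op).move_output_tensors();
}
```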

caffe2/core/operator.cc

Lines changed: 2 additions & 7 deletions
@@ -59,10 +59,6 @@ OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws)
       device_option_(
           operator_def.has_device_option() ? operator_def.device_option()
                                            : DeviceOption()),
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-      newstyle_outputs_(),
-#endif
       input_size_(operator_def.input_size()),
       event_(std::make_unique<Event>(device_option_)) {
   static GlobalInitIsCalledGuard guard;
@@ -124,14 +120,13 @@ compute_input_size_(const std::vector<c10::IValue>& inputs) {
 OperatorBase::OperatorBase(
     const c10::FunctionSchema& fn_schema,
     std::vector<c10::IValue> inputs,
-    c10::List<at::Tensor> outputs)
+    std::vector<caffe2::Tensor> outputs)
     // NOLINTNEXTLINE(performance-move-const-arg)
     : fn_schema_(make_unique<c10::FunctionSchema>(std::move(fn_schema))),
       newstyle_inputs_(std::move(inputs)),
-      newstyle_outputs_(std::move(outputs)),
+      output_tensors_(std::move(outputs)),
       input_size_(compute_input_size_(newstyle_inputs_)) {
   input_tensors_.resize(input_size_);
-  output_tensors_.resize(newstyle_outputs_.size());
 }
 #endif
 

caffe2/core/operator.h

Lines changed: 13 additions & 24 deletions
@@ -74,7 +74,7 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
   explicit OperatorBase(
       const c10::FunctionSchema& schema,
       std::vector<c10::IValue> inputs,
-      c10::List<at::Tensor> outputs);
+      std::vector<caffe2::Tensor> outputs);
 #endif
 
   virtual ~OperatorBase() noexcept;
@@ -250,15 +250,12 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     }
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    at::Tensor output = newstyle_outputs_[idx];
-    if (!output.defined() || caffe2::Tensor(output).GetDeviceType() != type) {
+    auto &output = output_tensors_[idx];
+    if (!output.defined() || output.GetDeviceType() != type) {
       // Fix tensor type
-      Tensor tensor = Tensor(type);
-      output = at::Tensor(std::move(tensor.getIntrusivePtr()));
+      output = Tensor(type);
     }
-    output_tensors_[idx] = caffe2::Tensor(output);
-    newstyle_outputs_[idx] = std::move(output);
-    return &output_tensors_[idx];
+    return &output;
 #else
     CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
 #endif
@@ -280,9 +277,6 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     if (!isLegacyOperator()) {
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-      newstyle_outputs_[idx] = at::Tensor(tensor);
-
-      // also update the tensor in the hack
       output_tensors_[idx] = std::move(tensor);
 #else
       CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
@@ -310,16 +304,12 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     }
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    at::Tensor output = newstyle_outputs_[idx];
-    Tensor tensor = output.defined()
-        ? GetSizedTensorWithOptions(caffe2::Tensor(output), dims, options)
+    auto &output = output_tensors_[idx];
+    output = output.defined()
+        ? GetSizedTensorWithOptions(std::move(output), dims, options)
         : caffe2::empty(dims, options);
-    // assign it back in case it changed
-    output = at::Tensor(std::move(tensor.getIntrusivePtr()));
 
-    output_tensors_[idx] = caffe2::Tensor(output);
-    newstyle_outputs_[idx] = std::move(output);
-    return &output_tensors_[idx];
+    return &output;
 #else
     CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
 #endif
@@ -434,7 +424,7 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
   }
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  return newstyle_outputs_.size();
+  return output_tensors_.size();
 #else
   CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
 #endif
@@ -599,8 +589,8 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
 
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  c10::List<at::Tensor> move_newstyle_outputs() && {
-    return std::move(newstyle_outputs_);
+  std::vector<caffe2::Tensor> move_output_tensors() && {
+    return std::move(output_tensors_);
   }
 #endif
 
@@ -620,7 +610,6 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   std::unique_ptr<const c10::FunctionSchema> fn_schema_;
   vector<c10::IValue> newstyle_inputs_;
-  c10::List<at::Tensor> newstyle_outputs_;
 #endif
   // HACK
   // We preserve the fact that Output() returns Tensor*
@@ -819,7 +808,7 @@ class Operator : public OperatorBase {
   explicit Operator(
       const c10::FunctionSchema& fn_schema,
       std::vector<c10::IValue> inputs,
-      c10::List<at::Tensor> outputs,
+      std::vector<caffe2::Tensor> outputs,
       StreamId stream = 0)
       : OperatorBase(fn_schema, std::move(inputs), std::move(outputs)) {
     // In the constructor, we switch to the device so that the child class
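
The accessor hunks above collapse the old double bookkeeping (update `newstyle_outputs_[idx]`, then copy into `output_tensors_[idx]`) into a single in-place fix-up of the stored `caffe2::Tensor`. A hedged sketch of that pattern, with an invented helper name:

```cpp
// Hedged sketch of the simplified output-slot fix-up used by the updated
// accessors in operator.h. materialize_output_slot is an invented name; the
// real code operates on output_tensors_[idx] in place.
#include <caffe2/core/tensor.h>

caffe2::Tensor& materialize_output_slot(
    caffe2::Tensor& slot, caffe2::DeviceType type) {
  // Replace the slot only if it is undefined or lives on the wrong device
  // type; otherwise return the existing tensor unchanged.
  if (!slot.defined() || slot.GetDeviceType() != type) {
    slot = caffe2::Tensor(type);
  }
  return slot;
}
```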
