
Commit df1cc0e

salilsdesai authored and pytorchmergebot committed
[Vulkan] Add Vulkan Rewrite to Transfer Inputs and Outputs to Vulkan and CPU Backends Respectively (pytorch#87432)
With this change, we no longer have to manually transfer input and output backends when running Vulkan models.

Graph rewrite code based off of:
- pytorch@32efff4#diff-a473bddb458dc24225866a45092d6eca064eddd256245d93020e48e216eee4d5R160-R179

Differential Revision: [D39519168](https://our.internmc.facebook.com/intern/diff/D39519168/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments; please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D39519168/)!

Pull Request resolved: pytorch#87432
Approved by: https://github.com/mcr229, https://github.com/digantdesai
1 parent bc68625 commit df1cc0e
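
For context, a minimal sketch of what the rewrite changes for callers. This is not code from the commit; the model path and input shape are placeholders, and it assumes a model already run through `optimize_for_mobile` with `backend="vulkan"`:

```python
import torch

model = torch.jit.load("model_vulkan.pt")  # placeholder path to an optimized model
x = torch.rand(1, 3, 224, 224)

# Previously: callers moved inputs to the Vulkan backend and outputs back to CPU by hand.
y = model(x.vulkan()).cpu()

# With this rewrite: the graph itself inserts the transfers, so plain CPU tensors work.
y = model(x)
```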

File tree

9 files changed: +83, -10 lines changed


android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp

Lines changed: 8 additions & 4 deletions
@@ -195,14 +195,16 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
       std::vector<at::IValue> inputs{};
       size_t n = jinputs->size();
       inputs.reserve(n);
+      const bool requires_backend_transfers =
+          module_.attr("requires_backend_transfers", at::IValue(true)).toBool();
       for (size_t i = 0; i < n; i++) {
         at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
-        if (at::kVulkan == deviceType_) {
+        if (at::kVulkan == deviceType_ && requires_backend_transfers) {
           inputs.push_back(
               atIValue.isTensor() ? at::IValue{atIValue.toTensor().vulkan()}
                                   : std::move(atIValue));
         } else {
-          TORCH_CHECK(at::kCPU == deviceType_);
+          TORCH_CHECK(at::kCPU == deviceType_ || !requires_backend_transfers);
           inputs.push_back(std::move(atIValue));
         }
       }
@@ -223,14 +225,16 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
       std::vector<at::IValue> inputs{};
       size_t n = jinputs->size();
       inputs.reserve(n);
+      const bool requires_backend_transfers =
+          module_.attr("requires_backend_transfers", at::IValue(true)).toBool();
       for (size_t i = 0; i < n; i++) {
         at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
-        if (at::kVulkan == deviceType_) {
+        if (at::kVulkan == deviceType_ && requires_backend_transfers) {
           inputs.push_back(
               atIValue.isTensor() ? at::IValue{atIValue.toTensor().vulkan()}
                                   : std::move(atIValue));
         } else {
-          TORCH_CHECK(at::kCPU == deviceType_);
+          TORCH_CHECK(at::kCPU == deviceType_ || !requires_backend_transfers);
           inputs.push_back(std::move(atIValue));
         }
       }

android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp

Lines changed: 8 additions & 4 deletions
@@ -158,14 +158,16 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
       std::vector<at::IValue> inputs{};
       size_t n = jinputs->size();
       inputs.reserve(n);
+      const bool requires_backend_transfers =
+          module_.attr("requires_backend_transfers", at::IValue(true)).toBool();
       for (const auto i : c10::irange(n)) {
         at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
-        if (at::kVulkan == deviceType_) {
+        if (at::kVulkan == deviceType_ && requires_backend_transfers) {
           inputs.push_back(
               atIValue.isTensor() ? at::IValue{atIValue.toTensor().vulkan()}
                                   : std::move(atIValue));
         } else {
-          TORCH_CHECK(at::kCPU == deviceType_);
+          TORCH_CHECK(at::kCPU == deviceType_ || !requires_backend_transfers);
           inputs.push_back(std::move(atIValue));
         }
       }
@@ -187,14 +189,16 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
       std::vector<at::IValue> inputs{};
       size_t n = jinputs->size();
       inputs.reserve(n);
+      const bool requires_backend_transfers =
+          module_.attr("requires_backend_transfers", at::IValue(true)).toBool();
       for (const auto i : c10::irange(n)) {
         at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
-        if (at::kVulkan == deviceType_) {
+        if (at::kVulkan == deviceType_ && requires_backend_transfers) {
           inputs.push_back(
               atIValue.isTensor() ? at::IValue{atIValue.toTensor().vulkan()}
                                   : std::move(atIValue));
         } else {
-          TORCH_CHECK(at::kCPU == deviceType_);
+          TORCH_CHECK(at::kCPU == deviceType_ || !requires_backend_transfers);
           inputs.push_back(std::move(atIValue));
         }
       }
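
Both JNI wrappers above apply the same guard. A hedged Python rendering of that dispatch logic (`run_on_vulkan` is a hypothetical helper, not part of this PR; the `True` default mirrors `module_.attr("requires_backend_transfers", at::IValue(true))`):

```python
import torch

def run_on_vulkan(module, *inputs):
    # Modules rewritten by the new pass carry requires_backend_transfers == False.
    if not getattr(module, "requires_backend_transfers", True):
        return module(*inputs)  # graph already contains the aten::to transfers
    # Legacy path: transfer tensor inputs to Vulkan, and the result back to CPU.
    moved = [t.vulkan() if isinstance(t, torch.Tensor) else t for t in inputs]
    out = module(*moved)
    return out.cpu() if isinstance(out, torch.Tensor) else out
```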

binaries/speed_benchmark_torch.cc

Lines changed: 4 additions & 0 deletions
@@ -180,6 +180,10 @@ class vkRunner final : public Runner<T> {
   virtual c10::IValue run(
       T& module,
       const std::vector<c10::IValue>& inputs) override {
+    if (!module.attr("requires_backend_transfers", at::IValue(true)).toBool()) {
+      // No need to transfer input/output backends
+      return module.forward(inputs);
+    }

     if (inputs_.size() == 0) {
       // Upload the input tensor(s) to GPU memory.

docs/source/mobile_optimizer.rst

Lines changed: 4 additions & 1 deletion
@@ -7,13 +7,16 @@ torch.utils.mobile_optimizer
 Torch mobile supports ``torch.mobile_optimizer.optimize_for_mobile`` utility to run a list of optimization pass with modules in eval mode.
 The method takes the following parameters: a torch.jit.ScriptModule object, a blocklisting optimization set and a preserved method list

-By default, if optimization blocklist is None or empty, ``optimize_for_mobile`` will run the following optimizations:
+For CPU Backend, by default, if optimization blocklist is None or empty, ``optimize_for_mobile`` will run the following optimizations:
 - **Conv2D + BatchNorm fusion** (blocklisting option `MobileOptimizerType::CONV_BN_FUSION`): This optimization pass folds ``Conv2d-BatchNorm2d`` into ``Conv2d`` in ``forward`` method of this module and all its submodules. The weight and bias of the ``Conv2d`` are correspondingly updated.
 - **Insert and Fold prepacked ops** (blocklisting option `MobileOptimizerType::INSERT_FOLD_PREPACK_OPS`): This optimization pass rewrites the graph to replace 2D convolutions and linear ops with their prepacked counterparts. Prepacked ops are stateful ops in that, they require some state to be created, such as weight prepacking and use this state, i.e. prepacked weights, during op execution. XNNPACK is one such backend that provides prepacked ops, with kernels optimized for mobile platforms (such as ARM CPUs). Prepacking of weight enables efficient memory access and thus faster kernel execution. At the moment ``optimize_for_mobile`` pass rewrites the graph to replace ``Conv2D/Linear`` with 1) op that pre-packs weight for XNNPACK conv2d/linear ops and 2) op that takes pre-packed weight and activation as input and generates output activations. Since 1 needs to be done only once, we fold the weight pre-packing such that it is done only once at model load time. This pass of the ``optimize_for_mobile`` does 1 and 2 and then folds, i.e. removes, weight pre-packing ops.
 - **ReLU/Hardtanh fusion**: XNNPACK ops support fusion of clamping. That is clamping of output activation is done as part of the kernel, including for 2D convolution and linear op kernels. Thus clamping effectively comes for free. Thus any op that can be expressed as clamping op, such as ``ReLU`` or ``hardtanh``, can be fused with previous ``Conv2D`` or ``linear`` op in XNNPACK. This pass rewrites graph by finding ``ReLU/hardtanh`` ops that follow XNNPACK ``Conv2D/linear`` ops, written by the previous pass, and fuses them together.
 - **Dropout removal** (blocklisting option `MobileOptimizerType::REMOVE_DROPOUT`): This optimization pass removes ``dropout`` and ``dropout_`` nodes from this module when training is false.
 - **Conv packed params hoisting** (blocklisting option `MobileOptimizerType::HOIST_CONV_PACKED_PARAMS`): This optimization pass moves convolution packed params to the root module, so that the convolution structs can be deleted. This decreases model size without impacting numerics.

+For Vulkan Backend, by default, if optimization blocklist is None or empty, ``optimize_for_mobile`` will run the following optimization:
+- **Automatic GPU Transfer** (blocklisting option `MobileOptimizerType::VULKAN_AUTOMATIC_GPU_TRANSFER`): This optimization pass rewrites the graph such that inputs are transferred to the Vulkan backend and outputs are transferred to the CPU backend.
+
 ``optimize_for_mobile`` will also invoke freeze_module pass which only preserves ``forward`` method. If you have other method to that needed to be preserved, add them into the preserved method list and pass into the method.
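
A short usage sketch of the documented Vulkan behavior (`MyModel` is a placeholder module; the `MobileOptimizerType` import matches the binding added in this PR):

```python
import torch
from torch._C import MobileOptimizerType
from torch.utils.mobile_optimizer import optimize_for_mobile

scripted = torch.jit.script(MyModel().eval())  # MyModel is a placeholder

# Default: the Automatic GPU Transfer pass runs, so callers pass CPU tensors directly.
vk_module = optimize_for_mobile(scripted, backend="vulkan")

# Opt out: blocklist the pass and keep transferring backends manually.
vk_manual = optimize_for_mobile(
    scripted,
    optimization_blocklist={MobileOptimizerType.VULKAN_AUTOMATIC_GPU_TRANSFER},
    backend="vulkan",
)
```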

test/test_public_bindings.py

Lines changed: 1 addition & 1 deletion
@@ -261,7 +261,7 @@ def test_no_new_bindings(self):
             "set_num_threads",
             "unify_type_list",
             "vitals_enabled",
-
+            "VULKAN_AUTOMATIC_GPU_TRANSFER",
             "wait",
             "Tag",
         }

torch/_C/__init__.pyi.in

Lines changed: 1 addition & 0 deletions
@@ -178,6 +178,7 @@ INSERT_FOLD_PREPACK_OPS: MobileOptimizerType
 REMOVE_DROPOUT: MobileOptimizerType
 FUSE_ADD_RELU: MobileOptimizerType
 HOIST_CONV_PACKED_PARAMS: MobileOptimizerType
+VULKAN_AUTOMATIC_GPU_TRANSFER: MobileOptimizerType

 def fork(*args: Any, **kwargs: Any) -> Future: ...
 def wait(fut: Future) -> Any: ...

torch/csrc/jit/passes/mobile_optimizer_type.h

Lines changed: 1 addition & 0 deletions
@@ -9,4 +9,5 @@ enum class MobileOptimizerType : int8_t {
   FUSE_ADD_RELU,
   HOIST_CONV_PACKED_PARAMS,
   CONV_1D_TO_2D,
+  VULKAN_AUTOMATIC_GPU_TRANSFER,
 };

torch/csrc/jit/passes/vulkan_rewrite.cpp

Lines changed: 53 additions & 0 deletions
@@ -2,6 +2,7 @@
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/ir/subgraph_matcher.h>
 #include <torch/csrc/jit/passes/constant_pooling.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
 #include <torch/csrc/jit/passes/fold_conv_bn.h>
 #include <torch/csrc/jit/passes/freeze_module.h>
 #include <torch/csrc/jit/passes/fuse_linear.h>
@@ -82,6 +83,51 @@ void insertPrePackedConv2dOp(std::shared_ptr<Graph>& graph) {
   transpose_rewriter.runOnGraph(graph);
 }

+void transferInputOutputBackends(std::shared_ptr<Graph>& graph) {
+  // Move inputs to Vulkan backend
+  for (Value* input : graph->inputs()) {
+    NamedValue named_input = NamedValue("", input);
+    if (named_input.type()->kind() == TypeKind::TensorType) {
+      // find the insertion point
+      WithInsertPoint ip(input->uses()[0].user->prev());
+      Value* replaced_input = graph->insert(
+          Symbol::fromQualString("aten::to"), {named_input, "vulkan"});
+      // replace the input
+      input->replaceAllUsesAfterNodeWith(
+          replaced_input->node(), replaced_input);
+    }
+  }
+
+  // Move outputs to CPU backend
+  at::ArrayRef<Value*>&& outputs = graph->outputs();
+  for (size_t i = 0; i < outputs.size(); i++) {
+    Value* output = outputs[i];
+    NamedValue named_output = NamedValue("", output);
+    if (named_output.type()->kind() == TypeKind::TensorType) {
+      // find the insertion point
+      WithInsertPoint ip(output->node()->next());
+      Value* replaced_output = graph->insert(
+          Symbol::fromQualString("aten::to"), {named_output, "cpu"});
+      // replace the output
+      graph->block()->replaceOutput(i, replaced_output);
+    }
+  }
+
+  SubgraphRewriter rewriter;
+  rewriter.runOnGraph(graph);
+}
+
+void transferInputOutputBackends(script::Module& module) {
+  std::shared_ptr<Graph> graph = module.get_methods()[0].graph();
+  transferInputOutputBackends(graph);
+}
+
+void eliminateDeadCode(script::Module& module) {
+  for (auto& method : module.get_methods()) {
+    EliminateDeadCode(method.graph());
+  }
+}
+
 void insertPrePackedGruOp(std::shared_ptr<Graph>& graph) {
   std::string gru_pattern = R"(
       graph(%input.1, %hx.1, %params_cpu:Tensor[], %has_biases:bool, %num_layers:int, %dropout:float, %train:bool, %bidirectional:bool, %batch_first:bool):
@@ -276,12 +322,19 @@ script::Module vulkanOptimizeForMobile(
   cloned_module = FoldConvBatchNorm(cloned_module);
   vulkanInsertPrePackedOps(cloned_module);
   cloned_module = freeze_module(cloned_module, preserved_methods);
+  if (!optimization_blocklist.count(
+          MobileOptimizerType::VULKAN_AUTOMATIC_GPU_TRANSFER)) {
+    transferInputOutputBackends(cloned_module);
+    cloned_module.register_attribute(
+        "requires_backend_transfers", BoolType::get(), false);
+  }
   vulkanFusePrePackedConvWithClamp(cloned_module);
   vulkanFoldPrePackingOps(cloned_module);
   removeDropout(cloned_module);
   vulkanRemoveMutation(cloned_module);
   // remove duplicated constants
   vulkanRunCanonicalOptimizations(cloned_module);
+  eliminateDeadCode(cloned_module);

   cloned_module.register_attribute(
       "optimized_for_vulkan", BoolType::get(), true);

torch/csrc/jit/python/init.cpp

Lines changed: 3 additions & 0 deletions
@@ -1297,6 +1297,9 @@ void initJITBindings(PyObject* module) {
       .value(
           "HOIST_CONV_PACKED_PARAMS",
           MobileOptimizerType::HOIST_CONV_PACKED_PARAMS)
+      .value(
+          "VULKAN_AUTOMATIC_GPU_TRANSFER",
+          MobileOptimizerType::VULKAN_AUTOMATIC_GPU_TRANSFER)
       .export_values();

   // This allows PyTorchStreamReader to read from a Python buffer. It requires
