Skip to content

Commit 6cfe555

Browse files
Thiago Crepaldi and pytorchmergebot
Thiago Crepaldi
authored and pytorchmergebot committed
[ONNX] Apply Common Subexpression Elimination pass to ONNX export (pytorch#85665)
## Summary Exporting graphs with Autocast may fail due to a limitation on JIT tracer. By disabling Autocast cache, tracer works, but there can be performance hit when there is reuse of weights in convolution, for example By applying CSE, such performance loss can be reverted. ps: As a comment at pytorch#84092 mentioned, disabling Autocast cache is an acceptable workaround and used throughout PyTorch code. Fixes pytorch#84092 ## Examples of before and after CSE being applied: ### Example: eliminating `%17` and reusing `%16` instead ```python # BEFORE graph(%0 : Float(requires_grad=0, device=cpu)): %3 : Scalar = aten::ScalarImplicit(%0), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: %13 : int = prim::Constant[value=3](), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: # /home/thiagofc/dev/github/pytorch/test/onnx/test_onnx_opset.py:347:0 %14 : int = prim::Constant[value=4](), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: # /home/thiagofc/dev/github/pytorch/test/onnx/test_onnx_opset.py:347:0 %15 : int[] = prim::ListConstruct(%13, %14), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: %16 : NoneType = prim::Constant(), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: %17 : NoneType = prim::Constant(), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: %18 : Device = prim::Constant[value="cpu"](), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: # /home/thiagofc/dev/github/pytorch/test/onnx/test_onnx_opset.py:347:0 %19 : bool = prim::Constant[value=0](), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: # /home/thiagofc/dev/github/pytorch/test/onnx/test_onnx_opset.py:347:0 %20 : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu) = aten::full(%15, %3, %16, %17, %18, %19), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: # /home/thiagofc/dev/github/pytorch/test/onnx/test_onnx_opset.py:347:0 return (%20) 
AFTER graph(%0 : Float(requires_grad=0, device=cpu)): %3 : Scalar = aten::ScalarImplicit(%0), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: %13 : int = prim::Constant[value=3](), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: # /home/thiagofc/dev/github/pytorch/test/onnx/test_onnx_opset.py:347:0 %14 : int = prim::Constant[value=4](), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: # /home/thiagofc/dev/github/pytorch/test/onnx/test_onnx_opset.py:347:0 %15 : int[] = prim::ListConstruct(%13, %14), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: %16 : NoneType = prim::Constant(), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: %18 : Device = prim::Constant[value="cpu"](), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: # /home/thiagofc/dev/github/pytorch/test/onnx/test_onnx_opset.py:347:0 %19 : bool = prim::Constant[value=0](), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: # /home/thiagofc/dev/github/pytorch/test/onnx/test_onnx_opset.py:347:0 %20 : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu) = aten::full(%15, %3, %16, %16, %18, %19), scope: test_onnx_opset.TestONNXOpset.test_full.<locals>.MyModule:: # /home/thiagofc/dev/github/pytorch/test/onnx/test_onnx_opset.py:347:0 return (%20) ``` Pull Request resolved: pytorch#85665 Approved by: https://github.com/ngimel, https://github.com/AllenTiTaiWang, https://github.com/BowenBao
1 parent c719ec9 commit 6cfe555

6 files changed

+51
-44
lines changed

test/onnx/expect/TestOperators.test_baddbmm.expect

+12-12
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@ graph {
55
node {
66
input: "onnx::MatMul_1"
77
input: "onnx::MatMul_2"
8-
output: "onnx::Mul_5"
8+
output: "onnx::Mul_4"
99
name: "MatMul_0"
1010
op_type: "MatMul"
1111
}
1212
node {
13-
output: "onnx::Mul_11"
13+
output: "onnx::Mul_10"
1414
name: "Constant_1"
1515
op_type: "Constant"
1616
attribute {
@@ -23,14 +23,14 @@ graph {
2323
}
2424
}
2525
node {
26-
input: "onnx::Mul_5"
27-
input: "onnx::Mul_11"
28-
output: "onnx::Add_7"
26+
input: "onnx::Mul_4"
27+
input: "onnx::Mul_10"
28+
output: "onnx::Add_6"
2929
name: "Mul_2"
3030
op_type: "Mul"
3131
}
3232
node {
33-
output: "onnx::Mul_12"
33+
output: "onnx::Mul_11"
3434
name: "Constant_3"
3535
op_type: "Constant"
3636
attribute {
@@ -44,15 +44,15 @@ graph {
4444
}
4545
node {
4646
input: "onnx::Mul_0"
47-
input: "onnx::Mul_12"
48-
output: "onnx::Add_9"
47+
input: "onnx::Mul_11"
48+
output: "onnx::Add_8"
4949
name: "Mul_4"
5050
op_type: "Mul"
5151
}
5252
node {
53-
input: "onnx::Add_7"
54-
input: "onnx::Add_9"
55-
output: "10"
53+
input: "onnx::Add_6"
54+
input: "onnx::Add_8"
55+
output: "9"
5656
name: "Add_5"
5757
op_type: "Add"
5858
}
@@ -115,7 +115,7 @@ graph {
115115
}
116116
}
117117
output {
118-
name: "10"
118+
name: "9"
119119
type {
120120
tensor_type {
121121
elem_type: 1

test/onnx/expect/TestOperators.test_narrow.expect

+6-6
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ producer_name: "pytorch"
33
producer_version: "CURRENT_VERSION"
44
graph {
55
node {
6-
output: "onnx::Slice_14"
6+
output: "onnx::Slice_13"
77
name: "Constant_0"
88
op_type: "Constant"
99
attribute {
@@ -17,7 +17,7 @@ graph {
1717
}
1818
}
1919
node {
20-
output: "onnx::Slice_15"
20+
output: "onnx::Slice_14"
2121
name: "Constant_1"
2222
op_type: "Constant"
2323
attribute {
@@ -31,7 +31,7 @@ graph {
3131
}
3232
}
3333
node {
34-
output: "onnx::Slice_16"
34+
output: "onnx::Slice_15"
3535
name: "Constant_2"
3636
op_type: "Constant"
3737
attribute {
@@ -46,10 +46,10 @@ graph {
4646
}
4747
node {
4848
input: "onnx::Slice_0"
49+
input: "onnx::Slice_13"
4950
input: "onnx::Slice_14"
5051
input: "onnx::Slice_15"
51-
input: "onnx::Slice_16"
52-
output: "12"
52+
output: "11"
5353
name: "Slice_3"
5454
op_type: "Slice"
5555
}
@@ -71,7 +71,7 @@ graph {
7171
}
7272
}
7373
output {
74-
name: "12"
74+
name: "11"
7575
type {
7676
tensor_type {
7777
elem_type: 1

test/onnx/expect/TestOperators.test_shape_value_map.expect

+14-14
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ graph {
5555
op_type: "Unsqueeze"
5656
}
5757
node {
58-
output: "onnx::Concat_26"
58+
output: "onnx::Concat_25"
5959
name: "Constant_5"
6060
op_type: "Constant"
6161
attribute {
@@ -69,7 +69,7 @@ graph {
6969
}
7070
}
7171
node {
72-
output: "onnx::Concat_27"
72+
output: "onnx::Concat_26"
7373
name: "Constant_6"
7474
op_type: "Constant"
7575
attribute {
@@ -83,7 +83,7 @@ graph {
8383
}
8484
}
8585
node {
86-
output: "onnx::Concat_28"
86+
output: "onnx::Concat_27"
8787
name: "Constant_7"
8888
op_type: "Constant"
8989
attribute {
@@ -98,9 +98,9 @@ graph {
9898
}
9999
node {
100100
input: "onnx::Concat_8"
101+
input: "onnx::Concat_25"
101102
input: "onnx::Concat_26"
102103
input: "onnx::Concat_27"
103-
input: "onnx::Concat_28"
104104
output: "onnx::Reshape_15"
105105
name: "Concat_8"
106106
op_type: "Concat"
@@ -148,7 +148,7 @@ graph {
148148
}
149149
}
150150
node {
151-
output: "onnx::Unsqueeze_20"
151+
output: "onnx::Unsqueeze_19"
152152
name: "Constant_12"
153153
op_type: "Constant"
154154
attribute {
@@ -163,13 +163,13 @@ graph {
163163
}
164164
node {
165165
input: "onnx::Unsqueeze_3"
166-
input: "onnx::Unsqueeze_20"
167-
output: "onnx::Concat_21"
166+
input: "onnx::Unsqueeze_19"
167+
output: "onnx::Concat_20"
168168
name: "Unsqueeze_13"
169169
op_type: "Unsqueeze"
170170
}
171171
node {
172-
output: "onnx::Concat_29"
172+
output: "onnx::Concat_28"
173173
name: "Constant_14"
174174
op_type: "Constant"
175175
attribute {
@@ -183,9 +183,9 @@ graph {
183183
}
184184
}
185185
node {
186-
input: "onnx::Concat_21"
187-
input: "onnx::Concat_29"
188-
output: "onnx::Reshape_24"
186+
input: "onnx::Concat_20"
187+
input: "onnx::Concat_28"
188+
output: "onnx::Reshape_23"
189189
name: "Concat_15"
190190
op_type: "Concat"
191191
attribute {
@@ -196,8 +196,8 @@ graph {
196196
}
197197
node {
198198
input: "onnx::Reshape_18"
199-
input: "onnx::Reshape_24"
200-
output: "25"
199+
input: "onnx::Reshape_23"
200+
output: "24"
201201
name: "Reshape_16"
202202
op_type: "Reshape"
203203
attribute {
@@ -230,7 +230,7 @@ graph {
230230
}
231231
}
232232
output {
233-
name: "25"
233+
name: "24"
234234
type {
235235
tensor_type {
236236
elem_type: 1

test/onnx/test_operators.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Owner(s): ["module: onnx"]
22

3+
"""
4+
Usage: python test/onnx/test_operators.py [--no-onnx] [--produce-onnx-test-data]
5+
--no-onnx: no onnx python dependency
6+
--produce-onnx-test-data: generate onnx test data
7+
--accept: accept onnx updates and overwrite models
8+
"""
39
import glob
410
import inspect
511
import io
@@ -8,6 +14,9 @@
814
import shutil
915
import tempfile
1016

17+
# Full diff for expect files
18+
import unittest
19+
1120
import torch
1221
import torch.nn as nn
1322
import torch.nn.functional as F
@@ -30,15 +39,6 @@
3039
from torch.testing._internal import common_utils
3140
from torch.testing._internal.common_utils import skipIfCaffe2, skipIfNoLapack
3241

33-
"""Usage: python test/onnx/test_operators.py [--no-onnx] [--produce-onnx-test-data]
34-
--no-onnx: no onnx python dependence
35-
--produce-onnx-test-data: generate onnx test data
36-
--accept: accept onnx updates and overwrite models
37-
"""
38-
39-
# Full diff for expect files
40-
import unittest
41-
4242
unittest.TestCase.maxDiff = None
4343

4444
_onnx_test = False # flag to produce onnx test cases.

torch/_C/__init__.pyi.in

+1
Original file line numberDiff line numberDiff line change
@@ -989,6 +989,7 @@ def _jit_set_inline_everything_mode(enabled: _bool) -> None: ...
989989
def _jit_get_logging_option() -> str: ...
990990
def _jit_set_logging_option(option: str) -> None: ...
991991
def _jit_set_logging_stream(stream_name: str) -> None: ...
992+
def _jit_pass_cse(Graph) -> _bool: ...
992993
def _jit_pass_dce(Graph) -> None: ...
993994
def _jit_pass_lint(Graph) -> None: ...
994995

torch/onnx/utils.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,12 @@ def _optimize_graph(
573573
_C._jit_pass_dce(graph)
574574
_C._jit_pass_lint(graph)
575575

576+
# CSE should improve perf when Autocast is used with disabled cache
577+
# Autocast is disabled due to a limitation on tracer as described at https://github.com/pytorch/pytorch/issues/84092
578+
# Must run before _C._jit_pass_erase_number_types to prevent type substitution
579+
if _C._jit_pass_cse(graph):
580+
_C._jit_pass_onnx_lint(graph)
581+
576582
_C._jit_pass_canonicalize_graph_fuser_ops(graph)
577583
_C._jit_pass_lint(graph)
578584
_C._jit_pass_peephole(graph, True)
@@ -632,6 +638,7 @@ def _optimize_graph(
632638
dynamic_axes = {} if dynamic_axes is None else dynamic_axes
633639
_C._jit_pass_onnx_set_dynamic_input_shape(graph, dynamic_axes, input_names)
634640
_C._jit_pass_onnx_lint(graph)
641+
635642
graph = _C._jit_pass_onnx(graph, operator_export_type)
636643
_C._jit_pass_onnx_lint(graph)
637644
_C._jit_pass_lint(graph)
@@ -851,11 +858,10 @@ def _trace_and_get_graph_from_model(model, args):
851858
orig_state_dict_keys = torch.jit._unique_state_dict(model).keys()
852859

853860
# Disable Autocast cache because it replaces kernel's weight and bias
854-
# to be replaced by (undesired) constants
861+
# by (undesired) constants.
862+
# No perf impact for when there are reused weights since https://github.com/pytorch/pytorch/pull/85665
855863
# TODO: https://github.com/pytorch/pytorch/issues/84092
856864
prev_autocast_cache_enabled = torch.is_autocast_cache_enabled()
857-
# When weights are not reused, there is no perf impact
858-
# ONNX runtimes can also apply CSE optimization to compensate the lack of cache here
859865
torch.set_autocast_cache_enabled(False)
860866
trace_graph, torch_out, inputs_states = torch.jit._get_trace_graph(
861867
model, args, strict=False, _force_outplace=False, _return_inputs_states=True

0 commit comments

Comments
 (0)