Commit 6a4bf3b

jon-chuang authored and pytorchmergebot committed
feat(fx): make_fx should be aware of functions wrapped with @fx.wrap (pytorch#93273)
Fixes pytorch#89421

The strategy is to patch the given function wrapped with `@torch.fx.wrap` so that, if a tensor tracer is active, we `proxy_call` the function. `proxy_call` also skips certain checks when the function being proxy-called is not a torch op (checked with `isinstance(..., OpOverload)`).

@IvanYashchuk @ezyang @Chillee

Pull Request resolved: pytorch#93273
Approved by: https://github.com/ezyang
1 parent dd8662d, commit 6a4bf3b

File tree

4 files changed (+100, -27 lines)
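
Before the file-by-file diff, a minimal sketch of the behavior this change enables (assuming a PyTorch build that includes this commit; `a_lifted_leaf` is defined inline here for illustration, whereas the test below reuses a module-level helper that already exists in test_fx.py):

import torch
import torch.fx
from torch.fx.experimental.proxy_tensor import make_fx

@torch.fx.wrap
def a_lifted_leaf(pair, const):
    # Opaque to make_fx after this change: recorded as a single call_function
    # node instead of being decomposed into aten.add.Tensor calls.
    return pair[0] + pair[1] + const

def to_trace(y):
    return a_lifted_leaf((4, y), 3) * 2

m = make_fx(to_trace, tracing_mode="real")(torch.tensor([10]))
print(m.code)  # shows a call to a_lifted_leaf and no aten.add.Tensor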

test/test_fx.py (+40)

@@ -31,6 +31,7 @@
 from torch.fx.passes import shape_prop
 from torch.fx.immutable_collections import immutable_dict, immutable_list
 from torch.fx.experimental.rewriter import RewritingTracer
+from torch.fx.experimental.proxy_tensor import make_fx
 from torch.fx.operator_schemas import get_signature_for_torch_op
 from copy import deepcopy
 from collections import namedtuple
@@ -477,6 +478,45 @@ def to_trace(y):
         self.assertIn('wrapped_decorated_fn', m.code)
         self.assertEqual(m(1), 1)
 
+    @unittest.skipIf(sys.version_info >= (3, 11, 0), "FX currently does not have 3.11 support")
+    def test_wrap_with_make_fx(self):
+        def to_trace(y):
+            return a_lifted_leaf((4, y), 3) * a_lifted_leaf((3, 4), 5) * a_lifted_leaf((y, y), y)
+
+        expected_code = """def forward(self, y_1):
+    a_lifted_leaf = __main___a_lifted_leaf((4, y_1), 3)
+    a_lifted_leaf_1 = __main___a_lifted_leaf((3, 4), 5)
+    mul = torch.ops.aten.mul.Tensor(a_lifted_leaf, 12); a_lifted_leaf = None
+    a_lifted_leaf_2 = __main___a_lifted_leaf((y_1, y_1), y_1); y_1 = None
+    mul_1 = torch.ops.aten.mul.Tensor(mul, a_lifted_leaf_2); mul = a_lifted_leaf_2 = None
+    return mul_1"""
+
+        m = make_fx(to_trace, tracing_mode="real")(torch.tensor([10]))
+        self.assertIn('a_lifted_leaf', m.code)
+        # aten.add.Tensor should be internal to `a_lifted_leaf` when some of the parameters are tensors.
+        # However, it should not be traced, as the function is marked as opaque.
+        self.assertNotIn('aten.add.Tensor', m.code)
+        self.assertExpectedInline(
+            m.code.strip(),
+            expected_code
+        )
+
+        m = make_fx(to_trace, tracing_mode="fake")(torch.tensor([10]))
+        self.assertIn('a_lifted_leaf', m.code)
+        self.assertNotIn('aten.add.Tensor', m.code)
+        self.assertExpectedInline(
+            m.code.strip(),
+            expected_code
+        )
+
+        m = make_fx(to_trace, tracing_mode="symbolic")(torch.tensor([10]))
+        self.assertIn('a_lifted_leaf', m.code)
+        self.assertNotIn('aten.add.Tensor', m.code)
+        self.assertExpectedInline(
+            m.code.strip(),
+            expected_code
+        )
+
     def test_graph_edit_with_proxy(self):
         class M(torch.nn.Module):
             def forward(self, a, b):
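
To run just the new test from a pytorch source checkout at this commit, something like the following should work (pytest collects the unittest-style case; the exact invocation is an assumption, not part of the commit):

pytest test/test_fx.py -k test_wrap_with_make_fx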

torch/fx/_symbolic_trace.py (+28, -2)

@@ -849,6 +849,18 @@ def wrapped(*args, **kwargs):
             )
             return_proxy.node.meta["is_wrapped"] = True
             return return_proxy
+
+        # import here to avoid circular imports
+        from .experimental.proxy_tensor import get_innermost_proxy_mode, proxy_call, disable_proxy_modes_tracing
+
+        # If there is no input with a proxy, see if we are tracing with proxy tensors
+        proxy_mode = get_innermost_proxy_mode()
+        if proxy_mode is not None:
+            # Disable tracing of the interior of the wrapped fn while evaluating
+            with disable_proxy_modes_tracing():
+                out = proxy_call(proxy_mode, orig_fn, args, kwargs)
+            return out
+
         return orig_fn(*args, **kwargs)
 
     return wrapped
@@ -868,6 +880,18 @@ def wrapped(*args, **kwargs):
         proxy = _find_proxy(args, kwargs)
         if proxy is not None:
             return proxy.tracer.create_proxy("call_method", name, args, kwargs)
+
+        # import here to avoid circular imports
+        from .experimental.proxy_tensor import get_innermost_proxy_mode, proxy_call, disable_proxy_modes_tracing
+
+        # If there is no input with a proxy, see if we are tracing with proxy tensors
+        proxy_mode = get_innermost_proxy_mode()
+        if proxy_mode is not None:
+            # Disable tracing of the interior of the wrapped method while evaluating
+            with disable_proxy_modes_tracing():
+                out = proxy_call(proxy_mode, orig_fn, args, kwargs)
+            return out
+
         return orig_fn(*args, **kwargs)
 
     return wrapped
@@ -913,7 +937,7 @@ def patch(
         """
         Replace frame_dict[name] with new_fn until we exit the context manager.
         """
-        new_fn.__fx_already_patched = deduplicate  # type: ignore[attr-defined]
+        setattr(new_fn, "__fx_already_patched", deduplicate)  # noqa: B010
         if name not in frame_dict and hasattr(builtins, name):
             self.patches_made.append(_PatchedFnDel(frame_dict, name, None))
         elif getattr(frame_dict[name], "__fx_already_patched", False):
@@ -923,19 +947,21 @@ def patch(
                 _PatchedFnSetItem(frame_dict, name, frame_dict[name])
             )
         frame_dict[name] = new_fn
+        assert(getattr(frame_dict[name], "__fx_already_patched", False) == deduplicate)
 
     def patch_method(
         self, cls: type, name: str, new_fn: Callable, deduplicate: bool = True
     ):
         """
         Replace object_or_dict.name with new_fn until we exit the context manager.
        """
-        new_fn.__fx_already_patched = deduplicate  # type: ignore[attr-defined]
+        setattr(new_fn, "__fx_already_patched", deduplicate)  # noqa: B010
         orig_fn = getattr(cls, name)
         if getattr(orig_fn, "__fx_already_patched", False):
             return  # already patched, no need to do it again
         self.patches_made.append(_PatchedFnSetAttr(cls, name, orig_fn))
         setattr(cls, name, new_fn)
+        assert(getattr(getattr(cls, name), "__fx_already_patched", False) == deduplicate)
 
     def visit_once(self, thing: Any):
         """Return True on the first call to with thing, otherwise false"""

torch/fx/experimental/proxy_tensor.py (+31, -24)

@@ -235,6 +235,11 @@ def fetch_tensor_proxy(tracer):
 HANDLED_TYPES = (torch.Tensor, torch.nn.Parameter)
 
 def proxy_call(proxy_mode, func, args, kwargs):
+    # `__torch_dispatch__` is only called on torch ops, which must subclass `OpOverload`.
+    # We treat all other functions as an `external_call`, for instance, a function decorated
+    # with `@torch.fx.wrap`
+    external_call = not isinstance(func, torch._ops.OpOverload)
+
     def can_handle_tensor(x):
         return type(x) in HANDLED_TYPES or has_proxy_slot(x, proxy_mode.tracer)
 
@@ -243,17 +248,17 @@ def can_handle_tensor(x):
     if not pytree.tree_all_only(torch.Tensor, can_handle_tensor, (args, kwargs)):
         return NotImplemented
 
-    if func in CURRENT_DECOMPOSITION_TABLE:
+    if not external_call:
+        if func in CURRENT_DECOMPOSITION_TABLE:
+            with proxy_mode:
+                r = CURRENT_DECOMPOSITION_TABLE[func](*args, **kwargs)
+                if r is not NotImplemented:
+                    return r
         with proxy_mode:
-            r = CURRENT_DECOMPOSITION_TABLE[func](*args, **kwargs)
+            r = func.decompose(*args, **kwargs)
             if r is not NotImplemented:
                 return r
 
-    with proxy_mode:
-        r = func.decompose(*args, **kwargs)
-        if r is not NotImplemented:
-            return r
-
     tracer = proxy_mode.tracer
     f_args, f_kwargs = pytree.tree_map_only(torch.Tensor, fetch_tensor_proxy(tracer), (args, kwargs))
 
@@ -266,8 +271,7 @@ def can_handle_tensor(x):
         # this can happen
         and pytree.tree_all_only((SymInt, SymFloat, SymBool), lambda _: False, (args, kwargs))
     )
-
-    if torch.Tag.data_dependent_output in func.tags:  # type: ignore[attr-defined]
+    if not external_call and torch.Tag.data_dependent_output in func.tags:  # type: ignore[attr-defined]
         # Check if all of the Tensor inputs are constants
         if all_constant:
             const_args, const_kwargs = pytree.tree_map_only(
@@ -327,20 +331,23 @@ def can_handle_tensor(x):
     if func is torch.ops.aten.lift_fresh.default:
         func = torch.ops.aten.lift_fresh_copy.default
 
-    proxy_out = proxy_mode.tracer.create_proxy('call_function', func, proxy_args, proxy_kwargs,
-                                               name=proxy_mode.tracer.graph._target_to_str(func.overloadpacket.__name__))
-
-    # This makes DCE marginally less likely to DCE inplace operations.
-    # It is not strictly necessary
-    # Kind of a hacky way to test if an op is in-place or not
-    if func.overloadpacket.__name__[-1] == "_" and func.overloadpacket.__name__[0] != "_":
-        if isinstance(args[0], List):
-            # e.g., c10d::allreduce_ returns a list of tensors as the first element
-            # in the output.
-            for i, a in enumerate(args[0]):
-                a.proxy = proxy_out[0][i]
-        else:
-            args[0].proxy = proxy_out
+    if external_call:
+        proxy_out = proxy_mode.tracer.create_proxy('call_function', func, proxy_args, proxy_kwargs, name=func.__name__)
+    else:
+        proxy_out = proxy_mode.tracer.create_proxy('call_function', func, proxy_args, proxy_kwargs,
+                                                   name=proxy_mode.tracer.graph._target_to_str(func.overloadpacket.__name__))
+
+        # This makes DCE marginally less likely to DCE inplace operations.
+        # It is not strictly necessary
+        # Kind of a hacky way to test if an op is in-place or not
+        if func.overloadpacket.__name__[-1] == "_" and func.overloadpacket.__name__[0] != "_":
+            if isinstance(args[0], List):
+                # e.g., c10d::allreduce_ returns a list of tensors as the first element
+                # in the output.
+                for i, a in enumerate(args[0]):
+                    a.proxy = proxy_out[0][i]
+            else:
+                args[0].proxy = proxy_out
 
     out = func(*args, **kwargs)
 
@@ -376,7 +383,7 @@ def can_handle_tensor(x):
         with maybe_disable_fake_tensor_mode():
             constant = args[0].clone()
     elif (
-        torch.Tag.nondeterministic_seeded not in func.tags  # type: ignore[attr-defined]
+        (external_call or torch.Tag.nondeterministic_seeded not in func.tags)  # type: ignore[attr-defined]
         and all_constant
         and any_constant
         and pytree.tree_all_only(torch.Tensor, lambda t: t.numel() <= CONSTANT_NUMEL_LIMIT, out)
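
As a quick standalone illustration of the `external_call` test introduced above (`is_external_call` is a hypothetical helper name; the real code inlines the isinstance check):

import torch

def is_external_call(func):
    # Torch ops that reach __torch_dispatch__ are OpOverload instances;
    # anything else (e.g. a plain function lifted with @torch.fx.wrap) is
    # treated as an external call and traced as a single opaque node.
    return not isinstance(func, torch._ops.OpOverload)

def a_lifted_leaf(pair, const):
    return pair[0] + pair[1] + const

print(is_external_call(torch.ops.aten.mul.Tensor))  # False: a real torch op
print(is_external_call(a_lifted_leaf))              # True: opaque to proxy_call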
