
Commit 388b245

soulitzer authored and pytorchmergebot committed
Expose autograd.graph.Node as an abstract base class (pytorch#91475)
This PR:

- registers all of the codegened Nodes to the torch._C._functions module; this is where special nodes like AccumulateGrad are already registered
- creates an autograd.graph.Node abstract base class that all of the newly registered nodes subclass from. We make the subclassing happen by implementing the ``__subclasshook__`` method
- enables static type checking to work and also enables Sphinx to generate documentation for Node and its methods
- handles both the custom Function and codegened cases

Pull Request resolved: pytorch#91475
Approved by: https://github.com/albanD
1 parent 0157e2e commit 388b245
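
A quick illustration of the behavior this enables (a minimal sketch, assuming a PyTorch build that includes this change; tensor values are arbitrary): both codegen'd grad_fn nodes and custom autograd.Function nodes now pass isinstance/issubclass checks against torch.autograd.graph.Node, even though Node never appears in their MRO.

    import torch
    from torch.autograd.graph import Node

    a = torch.tensor([1.0, 2.0], requires_grad=True)
    b = a.exp()

    # Codegen'd node (ExpBackward0) is recognized as a virtual subclass of Node.
    print(isinstance(b.grad_fn, Node))    # True
    print(Node in type(b.grad_fn).mro())  # False: subclassing comes from __subclasshook__

    class Double(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            return x * 2

        @staticmethod
        def backward(ctx, grad_out):
            return grad_out * 2

    # Custom Function backward nodes (BackwardCFunction) are treated as Nodes too.
    out = Double.apply(a)
    print(isinstance(out.grad_fn, Node))  # True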

9 files changed, +198 -12 lines changed


docs/source/autograd.rst

+31 -3

@@ -260,12 +260,40 @@ Anomaly detection
 .. autoclass:: set_detect_anomaly
 
 
-Saved tensors default hooks
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Autograd graph
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Autograd exposes methods that allow one to inspect the graph and interpose behavior during
+the backward pass.
+
+The ``grad_fn`` attribute of a :class:`torch.Tensor` holds a :class:`torch.autograd.graph.Node`
+if the tensor is the output of a operation that was recorded by autograd (i.e., grad_mode is
+enabled and at least one of the inputs required gradients), or ``None`` otherwise.
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+
+    graph.Node.name
+    graph.Node.metadata
+    graph.Node.next_functions
+    graph.Node.register_hook
+    graph.Node.register_prehook
 
 Some operations need intermediary results to be saved during the forward pass
 in order to execute the backward pass.
-You can define how these saved tensors should be packed / unpacked using hooks.
+These intermediary results are saved as attributes on the ``grad_fn`` and can be accessed.
+For example::
+
+    >>> a = torch.tensor([0., 0., 0.], requires_grad=True)
+    >>> b = a.exp()
+    >>> print(isinstance(b.grad_fn, torch.autograd.graph.Node))
+    True
+    >>> print(dir(b.grad_fn))
+    ['__call__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_raw_saved_result', '_register_hook_dict', '_saved_result', 'metadata', 'name', 'next_functions', 'register_hook', 'register_prehook', 'requires_grad']
+    >>> print(torch.allclose(b.grad_fn._saved_result, b))
+    True
+
+You can also define how these saved tensors should be packed / unpacked using hooks.
 A common application is to trade compute for memory by saving those intermediary results
 to disk or to CPU instead of leaving them on the GPU. This is especially useful if you
 notice your model fits on GPU during evaluation, but not training.
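
Beyond the doctest above, the new ``next_functions`` and ``name`` APIs make it straightforward to walk the recorded graph; a minimal sketch (the ``walk_graph`` helper is illustrative, not part of the API):

    import torch
    from torch.autograd.graph import Node

    def walk_graph(node, depth=0):
        # Recursively print the backward graph reachable from `node`.
        if node is None:
            return
        print("  " * depth + node.name())
        for next_node, _input_nr in node.next_functions:
            walk_graph(next_node, depth + 1)

    a = torch.rand(3, requires_grad=True)
    b = (a * 2).sum()
    assert isinstance(b.grad_fn, Node)
    walk_graph(b.grad_fn)
    # Prints something like:
    #   SumBackward0
    #     MulBackward0
    #       AccumulateGrad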

test/test_autograd.py

+45

@@ -6051,6 +6051,51 @@ def backward(ctx, g):
         self.assertEqual(y.grad_fn.saved_tensors, ())
         self.assertEqual(y.grad_fn._raw_saved_tensors, ())
 
+    def test_autograd_node_isinstance(self):
+        # Node is a "virtual" base class of codegen'd nodes. This means that
+        # isinstance and issubclass are overridden, but mro is unchanged
+        Node = torch.autograd.graph.Node
+
+        a = torch.rand(3, 3, requires_grad=True)
+        b = a.exp()
+
+        # Some nodes have codegened registrations to the torch._C._function module
+        self.assertIsInstance(b.grad_fn, Node)
+        self.assertTrue(issubclass(type(b.grad_fn), Node))
+        self.assertTrue(Node not in type(b.grad_fn).mro())
+
+        # Other nodes have manual registrations to the torch._C._function module
+        self.assertNotIsInstance(torch._C._functions.AccumulateGrad, Node)
+        self.assertTrue(issubclass(torch._C._functions.AccumulateGrad, Node))
+        self.assertIsInstance(b.grad_fn.next_functions[0][0], Node)
+        self.assertTrue(issubclass(torch._C._functions.DelayedError, Node))
+
+        # Special cases
+        self.assertNotIsInstance(None, Node)
+        self.assertNotIsInstance(1, Node)
+        self.assertNotIsInstance(Node, Node)
+        self.assertTrue(issubclass(Node, Node))
+
+        # Custom function case
+        self.assertTrue(issubclass(torch.autograd.function.BackwardCFunction, Node))
+
+        class Func(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, x):
+                self.assertIsInstance(ctx, Node)
+                return x
+
+            @staticmethod
+            def backward(ctx, x):
+                self.assertIsInstance(ctx, Node)
+                return x
+
+        out = Func.apply(a)
+        self.assertIsInstance(out.grad_fn, Node)
+        self.assertTrue(issubclass(type(out.grad_fn), Node))
+        self.assertTrue(Node not in type(out.grad_fn).mro())
+        out.sum().backward()
+
     def test_autograd_views_codegen(self):
         # This is not necessarily the absolute correct behavior, but this is the current
         # one. This test is here to make sure that any change to this behavior is detected
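
The test leans on Python's standard virtual-subclass machinery; here is a minimal, PyTorch-free sketch of the same pattern (the registry and class names are made up for illustration), showing why isinstance/issubclass succeed while the MRO stays untouched:

    import abc

    # Toy registry standing in for torch._C._functions.
    _REGISTRY = {}

    class VirtualBase(abc.ABC):
        @classmethod
        def __subclasshook__(cls, C):
            # Any class registered under its own name counts as a subclass.
            if cls is VirtualBase and _REGISTRY.get(C.__name__) is C:
                return True
            return NotImplemented

    class Registered:
        pass

    _REGISTRY["Registered"] = Registered

    print(issubclass(Registered, VirtualBase))    # True, via __subclasshook__
    print(isinstance(Registered(), VirtualBase))  # True
    print(VirtualBase in Registered.__mro__)      # False: the MRO is unchanged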

tools/autograd/gen_autograd_functions.py

+4 -3

@@ -128,7 +128,7 @@
 PY_FUNCTION_DEFINITION = CodeTemplate(
     """\
 static PyTypeObject ${op}Class;
-addClass<${op}>(${op}Class, "${op}", ${op}_properties);
+addClass<${op}>(module, ${op}Class, "${op}", ${op}_properties);
 """
 )
 
@@ -432,11 +432,12 @@ def gen_autograd_functions_python(
         "generated_comment": "@"
         + f"generated from {fm.template_dir_for_comments()}/python_functions.h",
         "shard_forward_declare": [
-            f"void initialize_autogenerated_functions_{i}();"
+            f"void initialize_autogenerated_functions_{i}(PyObject* module);"
             for i in range(num_shards)
         ],
         "shard_call": [
-            f"initialize_autogenerated_functions_{i}();" for i in range(num_shards)
+            f"initialize_autogenerated_functions_{i}(module);"
+            for i in range(num_shards)
         ],
     },
 )
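
The effect of threading ``module`` through the codegen (together with the ``PyModule_AddObject`` call added to ``addClass`` in the template below) is that each generated node class is exposed on ``torch._C._functions`` under its own name, which is exactly the lookup that ``Node.__subclasshook__`` performs. A minimal check, assuming a build with this change:

    import torch

    b = torch.rand(2, requires_grad=True).exp()
    node_type = type(b.grad_fn)  # e.g. ExpBackward0

    # The codegen'd class should now be reachable from torch._C._functions.
    exposed = getattr(torch._C._functions, node_type.__name__, None)
    print(node_type.__name__, exposed is node_type)  # e.g. ExpBackward0 True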

tools/autograd/templates/python_functions.cpp

+3 -2

@@ -19,17 +19,18 @@
 namespace torch { namespace autograd { namespace generated {
 
 template<typename C>
-static void addClass(PyTypeObject& type, const char* name,
+static void addClass(PyObject* module, PyTypeObject& type, const char* name,
   PyGetSetDef* function_properties=NULL, PyMethodDef* function_methods=NULL)
 {
   _initFunctionPyTypeObject(type, name, function_properties, function_methods);
   Py_INCREF(&type);
+  PyModule_AddObject(module, name, (PyObject*)&type);
   registerCppFunction(typeid(C), &type);
 }
 
 ${py_function_props_and_getters}
 
-void initialize_autogenerated_functions${shard_id}() {
+void initialize_autogenerated_functions${shard_id}(PyObject* module) {
   ${py_function_initializers}
 }
 
tools/autograd/templates/python_functions.h

+3 -1

@@ -1,5 +1,7 @@
 #pragma once
 
+#include <Python.h>
+
 // ${generated_comment}
 
 // Python bindings for automatically generated autograd functions
@@ -8,7 +10,7 @@ namespace torch { namespace autograd { namespace generated {
 
 ${shard_forward_declare}
 
-inline void initialize_autogenerated_functions() {
+inline void initialize_autogenerated_functions(PyObject* module) {
 ${shard_call}
 }
 

torch/_C/__init__.pyi.in

+3 -2

@@ -3,6 +3,7 @@
 import torch
 from torch.package import PackageExporter
 from torch import Tensor
+from torch.autograd.graph import Node as _Node
 from enum import Enum
 from pathlib import Path
 from typing import (
@@ -1178,7 +1179,7 @@ class _TensorBase(metaclass=_TensorMeta):
     _version: _int
     _base: Optional[Tensor]
     _cdata: _int
-    grad_fn: Any
+    grad_fn: _Node
     _grad_fn: Any
     _grad: Optional[Tensor]
     grad: Optional[Tensor]
@@ -1542,7 +1543,7 @@ def _activate_cuda_trace() -> None: ...
 
 # Defined in torch/csrc/Module.cpp
 def _current_graph_task_id() -> _int: ...
-def _current_autograd_node() -> Any: ...
+def _current_autograd_node() -> _Node: ...
 
 class _OutOfMemoryError:
     pass
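
With ``grad_fn`` typed as a ``Node`` in the stubs, code that inspects it can be checked statically; a small sketch (the ``describe`` helper is hypothetical, and ``Optional`` is used because ``grad_fn`` is ``None`` at runtime for leaf tensors):

    from typing import Optional

    import torch
    from torch.autograd.graph import Node

    def describe(node: Optional[Node]) -> str:
        # Node's methods (name, next_functions, ...) are now visible to type checkers.
        if node is None:
            return "<leaf or non-differentiable>"
        return f"{node.name()} with {len(node.next_functions)} inputs"

    t = torch.rand(2, requires_grad=True)
    print(describe((t * 3).grad_fn))  # e.g. MulBackward0 with 2 inputs
    print(describe(t.grad_fn))        # <leaf or non-differentiable>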

torch/autograd/graph.py

+107

@@ -5,15 +5,122 @@
 from torch.utils._python_dispatch import TorchDispatchMode
 from collections import defaultdict
 import weakref
+import abc
 
 __all__ = [
     "saved_tensors_hooks",
     "save_on_cpu",
     "disable_saved_tensors_hooks",
     "register_multi_grad_hook",
     "allow_mutation_on_saved_tensors",
+    "Node",
 ]
 
+class Node(abc.ABC):
+    @abc.abstractmethod
+    def name(self) -> str:
+        r"""Returns the name.
+
+        Example::
+
+            >>> import torch
+            >>> a = torch.tensor([0., 0., 0.], requires_grad=True)
+            >>> b = a.clone()
+            >>> assert isinstance(b.grad_fn, torch.autograd.graph.Node)
+            >>> print(b.grad_fn.name())
+            CloneBackward0
+        """
+        ...
+
+    @property
+    @abc.abstractmethod
+    def next_functions(self) -> Tuple[Tuple[Optional['Node'], int], ...]:
+        ...
+
+    @abc.abstractmethod
+    def metadata(self) -> dict:
+        r"""Returns the metadata."""
+        ...
+
+    @abc.abstractmethod
+    def _register_hook_dict(self, tensor: torch.Tensor) -> None:
+        ...
+
+    @abc.abstractmethod
+    def register_hook(self, fn: Callable[..., Any]) -> RemovableHandle:
+        r"""Registers a backward hook.
+
+        The hook will be called every time a gradient with respect to the
+        Node is computed. The hook should have the following signature::
+
+            hook(grad_inputs: Tuple[Tensor], grad_outputs: Tuple[Tensor]) -> Tuple[Tensor] or None
+
+
+        The hook should not modify its argument, but it can optionally return
+        a new gradient which will be used in place of :attr:`grad_outputs`.
+
+        This function returns a handle with a method ``handle.remove()``
+        that removes the hook from the module.
+
+        Example::
+
+            >>> import torch
+            >>> a = torch.tensor([0., 0., 0.], requires_grad=True)
+            >>> b = a.clone()
+            >>> assert isinstance(b.grad_fn, torch.autograd.graph.Node)
+            >>> handle = b.grad_fn.register_hook(lambda gI, gO: (gO[0] * 2,))
+            >>> b.sum().backward(retain_graph=True)
+            >>> print(a.grad)
+            tensor([2., 2., 2.])
+            >>> handle.remove() # Removes the hook
+            >>> a.grad = None
+            >>> b.sum().backward(retain_graph=True)
+            >>> print(a.grad)
+            tensor([1., 1., 1.])
+        """
+        ...
+
+    @abc.abstractmethod
+    def register_prehook(self, fn: Callable[..., Any]) -> RemovableHandle:
+        r"""Registers a backward pre-hook.
+
+        The hook will be called every time a gradient with respect to the
+        Node is computed. The hook should have the following signature::
+
+            hook(grad_outputs: Tuple[Tensor]) -> Tuple[Tensor] or None
+
+
+        The hook should not modify its argument, but it can optionally return
+        a new gradient which will be used in place of :attr:`grad_outputs`.
+
+        This function returns a handle with a method ``handle.remove()``
+        that removes the hook from the module.
+
+        Example::
+
+            >>> a = torch.tensor([0., 0., 0.], requires_grad=True)
+            >>> b = a.clone()
+            >>> assert isinstance(b.grad_fn, torch.autograd.graph.Node)
+            >>> handle = b.grad_fn.register_prehook(lambda gI: (gI[0] * 2,))
+            >>> b.sum().backward(retain_graph=True)
+            >>> print(a.grad)
+            tensor([2., 2., 2.])
+            >>> handle.remove()
+            >>> a.grad = None
+            >>> b.sum().backward(retain_graph=True)
+            >>> print(a.grad)
+            tensor([1., 1., 1.])
+        """
+        ...
+
+    @classmethod
+    def __subclasshook__(cls, C):
+        if cls is Node:
+            if ((C is not None and C is getattr(torch._C._functions, C.__name__, None))
+                    or issubclass(C, torch.autograd.function.BackwardCFunction)):
+                return True
+        return NotImplemented
+
 class saved_tensors_hooks():
     """Context-manager that sets a pair of pack / unpack hooks for saved tensors.
 
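
The two hook methods documented above compose; a minimal sketch (assuming a build with this change) where a pre-hook rescales the incoming gradient before the node runs and a post-hook merely observes:

    import torch

    a = torch.ones(3, requires_grad=True)
    b = a * 3  # grad_fn is MulBackward0

    # Pre-hook: sees (and here replaces) grad_outputs before the node executes.
    pre = b.grad_fn.register_prehook(lambda grad_outputs: (grad_outputs[0] * 2,))
    # Post-hook: sees grad_inputs and grad_outputs after the node executes.
    post = b.grad_fn.register_hook(
        lambda grad_inputs, grad_outputs: print("observed grad_outputs:", grad_outputs)
    )

    b.sum().backward()
    print(a.grad)  # tensor([6., 6., 6.]): 3 from the mul, doubled by the pre-hook
    pre.remove()
    post.remove()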

torch/csrc/autograd/functions/init.cpp

+1 -1

@@ -156,7 +156,7 @@ void THPAutograd_initFunctions() {
   static PyTypeObject CopySlicesClass;
   addClass<CopySlices, NoCtor>(module, CopySlicesClass, "CopySlices");
 
-  generated::initialize_autogenerated_functions();
+  generated::initialize_autogenerated_functions(module);
 
   auto c_module = THPObjectPtr(PyImport_ImportModule("torch._C"));
   if (!c_module)

torch/distributed/fsdp/_runtime_utils.py

+1

@@ -1224,6 +1224,7 @@ def _register_post_backward_hooks(
             "register the post-backward hook",
         )
         acc_grad = temp_flat_param.grad_fn.next_functions[0][0]
+        assert acc_grad is not None
        hook_handle = acc_grad.register_hook(
             functools.partial(_post_backward_hook, state, handle)
         )
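
For context, ``next_functions`` entries are typed ``Optional[Node]``, hence the ``assert`` to narrow the type before ``register_hook``. The same pattern works outside FSDP for getting at a leaf tensor's ``AccumulateGrad`` node; a minimal sketch (the lambda is illustrative):

    import torch

    param = torch.rand(3, requires_grad=True)
    tmp = param.expand_as(param)  # cheap op whose grad_fn points at AccumulateGrad

    acc_grad = tmp.grad_fn.next_functions[0][0]  # AccumulateGrad for `param`
    assert acc_grad is not None  # next_functions entries are Optional[Node]

    handle = acc_grad.register_hook(
        lambda grad_inputs, grad_outputs: print("about to accumulate into param.grad")
    )
    (param * 2).sum().backward()  # hook fires, then param.grad is populated
    handle.remove()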

0 commit comments
