@@ -1412,18 +1412,14 @@ def _register_buffer_comm_hook(
         Args:
             state (Any): Optional state that is passed to the hook.
             hook (Callable): Callable with the following signature:
-                ``hook(state: object, buffers: Dict[str, torch.Tensor])
-                -> Optional[List[torch.futures.Future[torch.Tensor]]]``
+                ``hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]``
             comm_hook_location (_BufferCommHookLocation): Enum value indicating
                 where to run the hook.
                 _BufferCommHookLocation.PRE_FORWARD means that the
                 hook will run _before_ the forward pass, and
                 _BufferCommHookLocation.POST_FORWARD means that the
                 hook will run _after_ the forward pass.

-        hook (Callable): Callable with the following signature:
-            ``hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]``:
-
         NOTE: To maximize performance, users can return a
             List[torch.futures.Future] from their hook, and DDP will
             install and await these hooks appropriately at the end of
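The added line in this hunk documents the standard DDP communication-hook signature. As a rough illustration (not part of this diff), a hook with that signature might look like the sketch below; it mirrors the allreduce-averaging behavior of DDP's default gradient hook, and treating ``state`` as an optional process group is an assumption made only for this example.

import torch
import torch.distributed as dist


def allreduce_hook(
    state: object, bucket: dist.GradBucket
) -> torch.futures.Future[torch.Tensor]:
    # Interpreting ``state`` as an optional process group is an assumption
    # made only for this sketch.
    group = state if state is not None else dist.group.WORLD
    tensor = bucket.buffer()
    # Pre-divide so the completed allreduce yields the average across ranks.
    tensor.div_(dist.get_world_size(group))
    fut = dist.all_reduce(tensor, group=group, async_op=True).get_future()
    # The collective's future resolves to a list of tensors; return the first.
    return fut.then(lambda f: f.value()[0])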
@@ -1558,26 +1554,26 @@ def _register_fused_optim(
         self, optim: Type, *args, optim_params=None, **kwargs
     ):
         r"""
-        Registers an optimizer with DDP such that the optimization for a
-        parameter will run immediately when that parameter's gradient is
-        finished with reduction, instead of waiting for all parameters'
-        gradients to finish reduction. This can result in a training speedup
-        depending on your workload since the optimizer can run while gradient
-        reduction for other parameters are still ongoing. In addition, this has
-        the potential to reduce peak memory consumption during training, as it
-        only needs to load the per-parameter optimizer states of a single
-        parameter at a time, instead of loading all per-parameter optimizer
-        states at once.
-
-        Args:
-            optim_cls (Type): a ``torch.optim.Optimizer`` class to be registered
-            as a fused optimizer.
-            *args (Sequence[Any]): Arguments to forward to `optim_cls`.
-            optim_params (Optional[Iterable[torch.Tensor]]): Set of parameters
-            to optimize, similar to `params` argument of traditional `torch.optim`
-            Optimizers. If this is omitted, all DDP model parameters will be
-            optimized.
-            **kwargs: (Dict[str, Any]): Keyword arguments to forward to `optim_cls`.
+        Registers an optimizer with DDP such that the optimization for a
+        parameter will run immediately when that parameter's gradient is
+        finished with reduction, instead of waiting for all parameters'
+        gradients to finish reduction. This can result in a training speedup
+        depending on your workload since the optimizer can run while gradient
+        reduction for other parameters are still ongoing. In addition, this has
+        the potential to reduce peak memory consumption during training, as it
+        only needs to load the per-parameter optimizer states of a single
+        parameter at a time, instead of loading all per-parameter optimizer
+        states at once.
+
+        Args:
+            optim (Type): a ``torch.optim.Optimizer`` class to be registered
+            as a fused optimizer.
+            *args (Sequence[Any]): Arguments to forward to `optim`.
+            optim_params (Optional[Iterable[torch.Tensor]]): Set of parameters
+            to optimize, similar to `params` argument of traditional `torch.optim`
+            Optimizers. If this is omitted, all DDP model parameters will be
+            optimized.
+            **kwargs: (Dict[str, Any]): Keyword arguments to forward to `optim`.

         .. warning ::
             _register_fused_optim should only be called once on a DDP instance,
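For context, a hedged usage sketch of the private API documented by this hunk is shown below; the toy module, device placement, process-group setup, and hyperparameters are illustrative assumptions rather than part of the change.

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# Assumes the default process group was already initialized elsewhere,
# e.g. dist.init_process_group("nccl") under torchrun, with one GPU per process.
model = torch.nn.Linear(16, 4).to("cuda")
ddp_model = DDP(model)

# Positional args (here the learning rate) and keyword args are forwarded to
# the optimizer constructor; omitting ``optim_params`` optimizes every DDP
# parameter as soon as its gradient finishes reduction.
ddp_model._register_fused_optim(torch.optim.SGD, 0.01, momentum=0.9)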