Commit 5ccf28d

soulitzer authored and facebook-github-bot committed
Do not use ZeroTensor for inplace ops (pytorch#69998)
Summary: Pull Request resolved: pytorch#69998. Fixes: pytorch#69855.

The check for undefined grads under forward AD was not being run, because `check_undefined_grads` was only passed as True by OpInfo for backward AD. This PR updates gradcheck to interpret `check_undefined_grads` as applying to forward AD, backward AD, or both, depending on which modes are being checked.

This PR also updates codegen to 1) not use ZeroTensor for `self` when the op is in-place, and 2) only create zeros (either through ZeroTensor or at::zeros) when the input tensor itself is defined. Previously we would error in that case when calling `.options()` on the undefined tensor.

~TODO: undo the skips that are due to the original issue~

Test Plan: Imported from OSS

Reviewed By: bdhirsh

Differential Revision: D33235973

Pulled By: soulitzer

fbshipit-source-id: 5769b6d6ca123b2bed31dc2bc6bc8e4701581891
1 parent 3116d87 commit 5ccf28d
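For context, a minimal sketch (not part of this commit; the function and inputs are assumed for illustration) of the scenario the gradcheck change enables: running the undefined-tangent check with forward-mode AD only, which previously tripped the assertion that check_undefined_grad=True requires check_backward_ad=True.

import torch
from torch.autograd import gradcheck

def fn(x, mask):
    # masked_select is one of the ops whose pytorch#69855 skips are removed below
    return torch.masked_select(x, mask)

x = torch.randn(3, 3, dtype=torch.double, requires_grad=True)
mask = torch.tensor([[True, False, True]] * 3)

# After this change, check_undefined_grad is honored for whichever AD modes are
# being checked, so a forward-AD-only gradcheck can exercise it as well.
gradcheck(fn, (x, mask), check_forward_ad=True, check_backward_ad=False,
          check_undefined_grad=True, check_batched_grad=False)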

4 files changed: +23 -65 lines changed


test/test_ops.py

Lines changed: 5 additions & 7 deletions
@@ -707,7 +707,7 @@ def _fn(t, *args, **kwargs):
         return _fn
 
     def _check_helper(self, device, dtype, op, variant, check, *, check_forward_ad=False, check_backward_ad=True,
-                      check_undefined_grad=True, check_batched_grad=None, check_batched_forward_grad=False):
+                      check_batched_grad=None, check_batched_forward_grad=False):
         assert check in ('gradcheck', 'bwgrad_bwgrad', 'fwgrad_bwgrad')
         # NB: check_backward_ad does not affect gradgradcheck (always True)
         if variant is None:
@@ -756,7 +756,7 @@ def fn(*inputs):
                                           fast_mode=op.gradcheck_fast_mode,
                                           check_forward_ad=check_forward_ad,
                                           check_backward_ad=check_backward_ad,
-                                          check_undefined_grad=check_undefined_grad,
+                                          check_undefined_grad=True,
                                           check_batched_forward_grad=check_batched_forward_grad))
         elif check in ('bwgrad_bwgrad', 'fwgrad_bwgrad'):  # gradgrad check
             self.assertFalse(check_forward_ad, msg="Cannot run forward AD check for gradgradcheck")
@@ -779,10 +779,9 @@ def fn(*inputs):
             self.assertTrue(False, msg="Unknown check requested!")
 
     def _grad_test_helper(self, device, dtype, op, variant, *, check_forward_ad=False, check_backward_ad=True,
-                          check_undefined_grad=True, check_batched_grad=None, check_batched_forward_grad=False):
+                          check_batched_grad=None, check_batched_forward_grad=False):
         return self._check_helper(device, dtype, op, variant, 'gradcheck', check_forward_ad=check_forward_ad,
-                                  check_backward_ad=check_backward_ad, check_undefined_grad=check_undefined_grad,
-                                  check_batched_grad=check_batched_grad,
+                                  check_backward_ad=check_backward_ad, check_batched_grad=check_batched_grad,
                                   check_batched_forward_grad=check_batched_forward_grad)
 
     def _skip_helper(self, op, device, dtype):
@@ -864,8 +863,7 @@ def call_grad_test_helper():
             check_batched_forward_grad = ((op.check_batched_forward_grad and not is_inplace) or
                                           (op.check_inplace_batched_forward_grad and is_inplace))
             self._grad_test_helper(device, dtype, op, variant, check_forward_ad=True, check_backward_ad=False,
-                                   check_undefined_grad=False, check_batched_grad=False,
-                                   check_batched_forward_grad=check_batched_forward_grad)
+                                   check_batched_grad=False, check_batched_forward_grad=check_batched_forward_grad)
         if op.supports_forward_ad:
             call_grad_test_helper()
         else:

tools/autograd/gen_variable_type.py

Lines changed: 5 additions & 3 deletions
@@ -322,7 +322,8 @@
 FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE = CodeTemplate("""\
 auto ${inp}_t_raw = toNonOptFwGrad(${inp});
 auto ${inp}_tensor = toNonOptTensor(${inp});
-auto ${inp}_t = ${inp}_t_raw.defined() ? ${inp}_t_raw : at::_efficientzerotensor(${inp}_tensor.sizes(), ${inp}_tensor.options());
+auto ${inp}_t = (${inp}_t_raw.defined() || !${inp}_tensor.defined())
+  ? ${inp}_t_raw : at::${zeros_fn}(${inp}_tensor.sizes(), ${inp}_tensor.options());
 """)
 
 FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE = CodeTemplate("""\
@@ -908,12 +909,13 @@ def emit_fw_derivatives() -> List[str]:
 
             unpacked_arguments = ""
             for inp in differentiable_inputs:
+                zeros_fn = "zeros" if inplace and inp.name == "self" else "_efficientzerotensor"
                 if inp.name in derivative.required_inputs_fw_grad:
-                    unpacked_arguments += FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute(inp=inp.name)
+                    unpacked_arguments += FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute(inp=inp.name, zeros_fn=zeros_fn)
                 if inp.name in (derivative.required_inputs_primal or []):
                     unpacked_arguments += FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE.substitute(inp=inp.name)
             if derivative.required_original_self_value:
-                unpacked_arguments += FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute(inp="original_self")
+                unpacked_arguments += FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute(inp="original_self", zeros_fn=zeros_fn)
                 unpacked_arguments += FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE.substitute(inp="original_self")
             elif inplace and derivative.is_reusing_outplace_formula:
                 # The gradient wasn't already cloned, do it if grad mode is enabled

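In short, the template change above makes the fallback tangent for an in-place op's `self` a writable at::zeros rather than an immutable ZeroTensor, and no zeros are created at all when the input tensor is undefined (so `.sizes()`/`.options()` are never called on an undefined tensor). A small illustrative sketch of the selection logic and the expansion it produces (approximate, not the actual generated file):

# Mirrors the zeros_fn choice added to emit_fw_derivatives in this commit.
def pick_zeros_fn(inplace: bool, inp_name: str) -> str:
    return "zeros" if inplace and inp_name == "self" else "_efficientzerotensor"

assert pick_zeros_fn(True, "self") == "zeros"                  # in-place self: materialized, writable zeros
assert pick_zeros_fn(False, "self") == "_efficientzerotensor"  # out-of-place: ZeroTensor fallback is fine
assert pick_zeros_fn(True, "other") == "_efficientzerotensor"  # non-self inputs keep the ZeroTensor fallback

# For inplace=True and inp="self", FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE roughly expands to:
#   auto self_t_raw = toNonOptFwGrad(self);
#   auto self_tensor = toNonOptTensor(self);
#   auto self_t = (self_t_raw.defined() || !self_tensor.defined())
#       ? self_t_raw : at::zeros(self_tensor.sizes(), self_tensor.options());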
torch/autograd/gradcheck.py

Lines changed: 6 additions & 4 deletions
@@ -882,6 +882,7 @@ def _test_undefined_forward_mode(func, outputs, inputs):
     with fwAD.dual_level():
         fw_grads = []
         dual_inputs = []
+        tensor_indices = set()
         for i, inp in enumerate(inputs):
             if is_tensor_like(inp) and inp.requires_grad:
                 if inp.layout == torch._mkldnn:  # type: ignore[attr-defined]
@@ -891,12 +892,15 @@ def _test_undefined_forward_mode(func, outputs, inputs):
                 # If inp is a differentiable view, the dual might not be the tangent given to
                 # make_dual, so read it explicitly from the dual tensor
                 fw_grads.append(fwAD.unpack_dual(inp)[1])
+                tensor_indices.add(i)
             dual_inputs.append(inp)
 
         for i, (fw_grad, u) in enumerate(zip(fw_grads, all_u)):
             fw_grad.copy_(u.view_as(fw_grad))
 
-        for idx, inp in enumerate(tensor_inputs):
+        for idx, inp in enumerate(inputs):
+            if idx not in tensor_indices:
+                continue
             dual_inp_obj = dual_inputs[idx]
 
             # case 1 (Materialized Zero Tensor Tangent)
@@ -1381,8 +1385,6 @@ def gradcheck(
     """
     assert check_forward_ad or check_backward_ad, \
         "Expected at least one of check_forward_ad or check_backward_ad to be True"
-    assert not (check_undefined_grad and not check_backward_ad), \
-        "Setting check_undefined_grad=True requires check_backward_ad to be True"
     assert not (check_batched_grad and not check_backward_ad), (
         "Setting check_batched_grad=True requires check_backward_ad to be True")
     assert not (check_batched_forward_grad and not check_forward_ad), (
@@ -1427,7 +1429,7 @@ def _gradcheck_helper(func, inputs, eps, atol, rtol, check_sparse_nnz, nondet_to
 
     _test_backward_mul_by_grad_output(outputs, tupled_inputs, check_sparse_nnz)
 
-    if check_undefined_grad:
+    if check_undefined_grad and check_backward_ad:
         _test_undefined_backward_mode(func, outputs, tupled_inputs)
     return True

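The tensor_indices bookkeeping in _test_undefined_forward_mode above keeps the loop index aligned with dual_inputs, which is appended to for every input, tensor or not, whereas indexing by enumerate(tensor_inputs) can drift once non-tensor inputs are present. A standalone illustration of the pattern (not gradcheck itself; the inputs are made up):

import torch

inputs = (torch.randn(2, requires_grad=True), 3.5, torch.randn(2, requires_grad=True))

dual_inputs = []
tensor_indices = set()
for i, inp in enumerate(inputs):
    if isinstance(inp, torch.Tensor) and inp.requires_grad:
        tensor_indices.add(i)   # remember which positions get a tangent
    dual_inputs.append(inp)     # every input is mirrored, tensor or not

for idx, inp in enumerate(inputs):
    if idx not in tensor_indices:
        continue                # skip the non-tensor argument at position 1
    assert dual_inputs[idx] is inputs[idx]   # indices stay aligned with dual_inputs
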
torch/testing/_internal/common_methods_invocations.py

Lines changed: 7 additions & 51 deletions
@@ -10203,11 +10203,7 @@ def ref_pairwise_distance(input1, input2):
            dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
-           sample_inputs_func=sample_inputs_masked_select,
-           skips=(
-               # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...)
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'),
-           )),
+           sample_inputs_func=sample_inputs_masked_select),
    OpInfo('matrix_exp',
           dtypes=floating_and_complex_types_and(torch.bfloat16),
           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []),
@@ -10257,44 +10253,23 @@ def ref_pairwise_distance(input1, input2):
           supports_out=False,
           supports_forward_ad=True,
           supports_fwgrad_bwgrad=True,
-          sample_inputs_func=sample_inputs_max_min_reduction_no_dim,
-          skips=(
-              # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...)
-              DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'),
-              # (ROCm) unexpected success
-              DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad',
-                           device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM),
-          )),
+          sample_inputs_func=sample_inputs_max_min_reduction_no_dim),
    OpInfo('median',
           dtypes=all_types_and(torch.bfloat16),
           dtypesIfCUDA=all_types_and(torch.float16),
           # TODO: some signatures of median do support out
           supports_out=False,
           supports_forward_ad=True,
           supports_fwgrad_bwgrad=True,
-          sample_inputs_func=partial(sample_inputs_reduction, supports_multiple_dims=False),
-          skips=(
-              # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...)
-              DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'),
-              # (ROCm) unexpected success
-              DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad',
-                           device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM),
-          )),
+          sample_inputs_func=partial(sample_inputs_reduction, supports_multiple_dims=False)),
    OpInfo('nanmedian',
           dtypes=all_types_and(torch.bfloat16),
           dtypesIfCUDA=all_types_and(torch.float16),
           # TODO: some signatures of nanmedian do support out
           supports_out=False,
           supports_forward_ad=True,
           supports_fwgrad_bwgrad=True,
-          sample_inputs_func=partial(sample_inputs_reduction, supports_multiple_dims=False),
-          skips=(
-              # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...)
-              DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'),
-              # (ROCm) unexpected success
-              DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad',
-                           device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM),
-          )),
+          sample_inputs_func=partial(sample_inputs_reduction, supports_multiple_dims=False)),
    OpInfo('var_mean',
           dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16),
           dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16),
@@ -10394,14 +10369,7 @@ def ref_pairwise_distance(input1, input2):
           supports_out=False,
           supports_forward_ad=True,
           supports_fwgrad_bwgrad=True,
-          sample_inputs_func=sample_inputs_max_min_reduction_no_dim,
-          skips=(
-              # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...)
-              DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'),
-              # (ROCm) unexpected success
-              DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad',
-                           device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM),
-          )),
+          sample_inputs_func=sample_inputs_max_min_reduction_no_dim),
    OpInfo('quantile',
           dtypes=floating_types(),
           sample_inputs_func=sample_inputs_reduction_quantile,
@@ -10709,8 +10677,6 @@ def ref_pairwise_distance(input1, input2):
           check_inplace_batched_forward_grad=False,
           sample_inputs_func=sample_inputs_as_strided,
           skips=(
-              # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...)
-              DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'),
              # AssertionError: False is not true : Tensors failed to compare as equal!
              DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples'),
              # AssertionError: False is not true : Scalars failed to compare as equal!
@@ -13080,11 +13046,7 @@ def ref_pairwise_distance(input1, input2):
           supports_forward_ad=True,
           supports_fwgrad_bwgrad=True,
           assert_jit_shape_analysis=True,
-          gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
-          skips=(
-              # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...)
-              DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'),
-          )),
+          gradcheck_nondet_tol=GRADCHECK_NONDET_TOL),
    OpInfo('index_add',
           dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
           # An `out=` variant exists but is not exposed to the Python API
@@ -13990,11 +13952,7 @@ def ref_pairwise_distance(input1, input2):
           supports_out=False,
           supports_forward_ad=True,
           supports_fwgrad_bwgrad=True,
-          sample_inputs_func=sample_inputs_trace,
-          skips=(
-              # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...)
-              DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'),
-          )),
+          sample_inputs_func=sample_inputs_trace),
    OpInfo('transpose',
           aliases=('swapdims', 'swapaxes'),
           assert_jit_shape_analysis=True,
@@ -15244,8 +15202,6 @@ def ref_pairwise_distance(input1, input2):
           supports_forward_ad=True,
           supports_fwgrad_bwgrad=True,
           skips=(
-              # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...)
-              DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'),
              DecorateInfo(
                  unittest.skip("Skipped!"),
                  "TestJit",
