
Commit 8b617f8

eqy authored and pytorchmergebot committed
[cuBLAS] Add an option to disable reduced precision reductions for BF16 GEMM (pytorch#89172)
Essentially the same change as pytorch#67946, except that the default is to disallow reduced precision reductions in `BFloat16` GEMMs (for now). If performance is severely regressed, we can change the default, but this option appears to be necessary to pass some `addmm` `BFloat16` tests on H100.

CC @ptrblck @ngimel

Pull Request resolved: pytorch#89172
Approved by: https://github.com/ngimel
1 parent 1c7e815 commit 8b617f8
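
The new flag mirrors the existing FP16 control and is exposed through `torch.backends.cuda.matmul` (see the docs, stub, and test changes below). A minimal usage sketch, not part of the commit, assuming a CUDA-enabled PyTorch build that includes this change and an Ampere-or-newer GPU for the BF16 GEMM:

    import torch

    # Query the new control (mirrors allow_fp16_reduced_precision_reduction).
    print(torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction)

    # Request full-precision (FP32) accumulation for BF16 GEMMs.
    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False

    if torch.cuda.is_available():
        a = torch.randn(256, 256, device="cuda", dtype=torch.bfloat16)
        b = torch.randn(256, 256, device="cuda", dtype=torch.bfloat16)
        c = a @ b  # BF16 GEMM dispatched through cuBLAS with the flag applied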

File tree

10 files changed, +95 -4 lines changed

aten/src/ATen/Context.cpp

+9
@@ -250,6 +250,15 @@ void Context::setAllowFP16ReductionCuBLAS(bool b) {
   allow_fp16_reduction_cublas = b;
 }
 
+bool Context::allowBF16ReductionCuBLAS() const {
+  return allow_bf16_reduction_cublas;
+}
+
+void Context::setAllowBF16ReductionCuBLAS(bool b) {
+  allow_bf16_reduction_cublas = b;
+}
+
+
 bool Context::hasMKL() {
 #if AT_MKL_ENABLED()
   return true;

aten/src/ATen/Context.h

+3
@@ -241,6 +241,8 @@ class TORCH_API Context {
   void setFloat32MatmulPrecision(Float32MatmulPrecision p);
   bool allowFP16ReductionCuBLAS() const;
   void setAllowFP16ReductionCuBLAS(bool);
+  bool allowBF16ReductionCuBLAS() const;
+  void setAllowBF16ReductionCuBLAS(bool);
   at::QEngine qEngine() const;
   void setQEngine(at::QEngine e);
   static const std::vector<at::QEngine>& supportedQEngines();
@@ -288,6 +290,7 @@ class TORCH_API Context {
   int benchmark_limit_cudnn = 10;
   bool allow_tf32_cudnn = true;
   bool allow_fp16_reduction_cublas = true;
+  bool allow_bf16_reduction_cublas = true;
   bool enabled_mkldnn = true;
   at::LinalgBackend linalg_preferred_backend = at::LinalgBackend::Default;
 #ifdef C10_MOBILE

aten/src/ATen/cuda/CUDABlas.cpp

+6
@@ -538,6 +538,11 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
   float fbeta = beta;
   _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc);
   GEMM_CHECK_ARGVALUES(at::BFloat16);
+  cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
+  if (!at::globalContext().allowBF16ReductionCuBLAS()) {
+    cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+  }
+  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags));
   TORCH_CUDABLAS_CHECK(cublasGemmEx(
       handle,
       opa,
@@ -558,6 +563,7 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
       ldc,
       CUDA_R_32F,
       CUBLAS_GEMM_DFALT_TENSOR_OP));
+  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
 }
 #endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000

docs/source/backends.rst

+4
@@ -34,6 +34,10 @@ torch.backends.cuda
 
     A :class:`bool` that controls whether reduced precision reductions (e.g., with fp16 accumulation type) are allowed with fp16 GEMMs.
 
+.. attribute:: torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction
+
+    A :class:`bool` that controls whether reduced precision reductions are allowed with bf16 GEMMs.
+
 .. attribute:: torch.backends.cuda.cufft_plan_cache
 
     ``cufft_plan_cache`` caches the cuFFT plans

docs/source/notes/cuda.rst

+23 -1
@@ -169,12 +169,34 @@ If full precision reductions are needed, users can disable reduced precision red
 
     torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
 
-To toggle the reduced precision reduction flags in C++, you can do
+To toggle the reduced precision reduction flags in C++, one can do
 
 .. code:: C++
 
   at::globalContext().setAllowFP16ReductionCuBLAS(false);
 
+.. _bf16reducedprecision:
+
+Reduced Precision Reduction in BF16 GEMMs
+-----------------------------------------
+
+A similar flag (as above) exists for BFloat16 GEMMs. Note that this switch is
+set to `False` by default for BF16 as we have observed numerical instability in
+PyTorch CI tests (e.g., test/test_matmul_cuda.py).
+
+If reduced precision reductions are not desired, users can disable reduced
+precision reductions in bf16 GEMMs with:
+
+.. code:: python
+
+    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
+
+To toggle the reduced precision reduction flags in C++, one can do
+
+.. code:: C++
+
+  at::globalContext().setAllowBF16ReductionCuBLAS(true);
+
 Asynchronous execution
 ----------------------
 
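Beyond the one-shot toggles shown in the doc above, a common pattern is to scope the flag to a particular computation and restore the previous value afterwards. A sketch, not part of the diff (the helper name is hypothetical), assuming a CUDA-enabled build containing this commit:

    import torch

    def matmul_full_precision_reductions(a, b):
        # Hypothetical helper: run one BF16 GEMM with reduced precision
        # reductions disabled, then restore the prior setting.
        matmul_cfg = torch.backends.cuda.matmul
        prev = matmul_cfg.allow_bf16_reduced_precision_reduction
        matmul_cfg.allow_bf16_reduced_precision_reduction = False
        try:
            return a @ b
        finally:
            matmul_cfg.allow_bf16_reduced_precision_reduction = prev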

docs/source/notes/numerical_accuracy.rst

+7 -3
@@ -98,13 +98,17 @@ If your network needs full float32 precision for both matrix multiplications and
 
 For more information see :ref:`TensorFloat32<tf32_on_ampere>`.
 
-Reduced Precision Reduction for FP16 GEMMs
-------------------------------------------
+Reduced Precision Reduction for FP16 and BF16 GEMMs
+----------------------------------------------------
 Half-precision GEMM operations are typically done with intermediate accumulations (reduction) in single-precision for numerical accuracy and improved resilience to overflow. For performance, certain GPU architectures, especially more recent ones, allow a few truncations of the intermediate accumulation results to the reduced precision (e.g., half-precision). This change is often benign from the perspective of model convergence, though it may lead to unexpected results (e.g., ``inf`` values when the final result should be representable in half-precision).
 If reduced-precision reductions are problematic, they can be turned off with
 ``torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False``
 
-For more information see :ref:`allow_fp16_reduced_precision_reduction<fp16reducedprecision>`
+A similar flag exists for BF16 GEMM operations and is turned off by default. If BF16
+reduced-precision reductions are problematic, they can be turned off with
+``torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False``
+
+For more information see :ref:`allow_fp16_reduced_precision_reduction<fp16reducedprecision>` and :ref:`allow_bf16_reduced_precision_reduction<bf16reducedprecision>`
 
 .. _fp16_on_mi200:

test/test_cuda.py

+8
@@ -636,6 +636,14 @@ def test_cublas_allow_fp16_reduced_precision_reduction_get_set(self):
         self.assertEqual(torch._C._get_cublas_allow_fp16_reduced_precision_reduction(), not orig)
         torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = orig
 
+    def test_cublas_allow_bf16_reduced_precision_reduction_get_set(self):
+        orig = torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction
+        self.assertEqual(torch._C._get_cublas_allow_bf16_reduced_precision_reduction(), orig)
+        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = not orig
+        self.assertEqual(torch._C._get_cublas_allow_bf16_reduced_precision_reduction(), not orig)
+        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = orig
+
+
     def test_cudnn_allow_tf32_get_set(self):
         with torch.backends.cudnn.flags(enabled=None, benchmark=None, deterministic=None, allow_tf32=False):
             self.assertFalse(torch.backends.cudnn.allow_tf32)

torch/_C/__init__.pyi.in

+2
@@ -845,6 +845,8 @@ def _get_float32_matmul_precision() -> str: ... #THPModule_float32MatmulPrecisio
 def _set_float32_matmul_precision(arg: str) -> None: ... #THPModule_setFloat32MatmulPrecision
 def _get_cublas_allow_fp16_reduced_precision_reduction() -> _bool: ... #THPModule_allowFP16ReductionCuBLAS
 def _set_cublas_allow_fp16_reduced_precision_reduction(arg: _bool) -> None: ... #THPModule_setAllowFP16ReductionCuBLAS
+def _get_cublas_allow_bf16_reduced_precision_reduction() -> _bool: ... #THPModule_allowBF16ReductionCuBLAS
+def _set_cublas_allow_bf16_reduced_precision_reduction(arg: _bool) -> None: ... #THPModule_setAllowBF16ReductionCuBLAS
 def _set_conj(x: Tensor, conj: _bool) -> None: ...
 def _set_neg(x: Tensor, neg: _bool) -> None: ...
 def _set_meta_in_tls_dispatch_include(meta_in_tls: _bool) -> None: ...

torch/backends/cuda/__init__.py

+4
@@ -97,13 +97,17 @@ def __getattr__(self, name):
             return torch._C._get_cublas_allow_tf32()
         elif name == "allow_fp16_reduced_precision_reduction":
             return torch._C._get_cublas_allow_fp16_reduced_precision_reduction()
+        elif name == "allow_bf16_reduced_precision_reduction":
+            return torch._C._get_cublas_allow_bf16_reduced_precision_reduction()
         raise AssertionError("Unknown attribute " + name)
 
     def __setattr__(self, name, value):
         if name == "allow_tf32":
             return torch._C._set_cublas_allow_tf32(value)
         elif name == "allow_fp16_reduced_precision_reduction":
             return torch._C._set_cublas_allow_fp16_reduced_precision_reduction(value)
+        elif name == "allow_bf16_reduced_precision_reduction":
+            return torch._C._set_cublas_allow_bf16_reduced_precision_reduction(value)
         raise AssertionError("Unknown attribute " + name)
 
 _LinalgBackends = {

torch/csrc/Module.cpp

+29
@@ -743,6 +743,27 @@ PyObject* THPModule_allowFP16ReductionCuBLAS(
   Py_RETURN_FALSE;
 }
 
+PyObject* THPModule_setAllowBF16ReductionCuBLAS(
+    PyObject* _unused,
+    PyObject* arg) {
+  THPUtils_assert(
+      PyBool_Check(arg),
+      "set_allow_bf16_reduction_cublas expects a bool, "
+      "but got %s",
+      THPUtils_typename(arg));
+  at::globalContext().setAllowBF16ReductionCuBLAS(arg == Py_True);
+  Py_RETURN_NONE;
+}
+
+PyObject* THPModule_allowBF16ReductionCuBLAS(
+    PyObject* _unused,
+    PyObject* noargs) {
+  if (at::globalContext().allowBF16ReductionCuBLAS()) {
+    Py_RETURN_TRUE;
+  }
+  Py_RETURN_FALSE;
+}
+
 PyObject* THPModule_setFlushDenormal(PyObject* _unused, PyObject* arg) {
   THPUtils_assert(
       PyBool_Check(arg),
@@ -1063,6 +1084,14 @@ static PyMethodDef TorchMethods[] = {
      THPModule_setAllowFP16ReductionCuBLAS,
      METH_O,
      nullptr},
+    {"_get_cublas_allow_bf16_reduced_precision_reduction",
+     THPModule_allowBF16ReductionCuBLAS,
+     METH_NOARGS,
+     nullptr},
+    {"_set_cublas_allow_bf16_reduced_precision_reduction",
+     THPModule_setAllowBF16ReductionCuBLAS,
+     METH_O,
+     nullptr},
    {"_vmapmode_increment_nesting",
      THPModule_vmapmode_increment_nesting,
      METH_NOARGS,
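
These method-table entries back the type stubs added to torch/_C/__init__.pyi.in, so the private bindings can also be exercised directly from Python, which is essentially what the new test in test/test_cuda.py does. A minimal round-trip sketch, not part of the commit, assuming a build that contains this change:

    import torch

    orig = torch._C._get_cublas_allow_bf16_reduced_precision_reduction()
    torch._C._set_cublas_allow_bf16_reduced_precision_reduction(not orig)
    assert torch._C._get_cublas_allow_bf16_reduced_precision_reduction() == (not orig)
    torch._C._set_cublas_allow_bf16_reduced_precision_reduction(orig)  # restore the original setting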
