
Commit 1af40d5

eqy authored and pytorchmergebot committed
[cublas][cublasLt] Fall back to unfused addmm for 2-byte-aligned inputs (pytorch#92201)
Fix for this issue surfaced from the discuss forum: https://discuss.pytorch.org/t/cuda-error-cublas-status-not-supported-when-calling-cublasltmatmul-from-torch-nn-functional-linear/170214

Note that PyTorch builds before pytorch#71200 should not be affected, as there was no `cublasLt` dispatch path. Additionally, the provided repro has the quirk of using a 3D input, which means it will not dispatch to `cublasLt`-backed `addmm` until builds that include pytorch#72728. Changing the input to 2D by trivially removing the size-`1` dimension surfaces the failure on builds after pytorch#71200.

Interestingly, the use case where _all_ inputs are 2-byte aligned is supported (it runs without crashing), but the case where some inputs are more than 2-byte aligned and others are exactly 2-byte aligned is not. This behavior suggests that the `cuBlasLt` heuristics are incorrect, as the heuristic function has visibility of the raw pointer values via the descriptors when it is called. We will follow up with `cuBlasLt`, but this fix is needed to prevent unnecessary crashes for now.

CC @ptrblck @ngimel

Pull Request resolved: pytorch#92201
Approved by: https://github.com/ngimel
1 parent a74c8df commit 1af40d5
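
For context, here is a minimal sketch (not part of the commit) of the mixed-alignment scenario described in the message above. It assumes a CUDA build; the shapes mirror the forum repro, and the one-element offset used to produce a 2-byte-aligned weight is illustrative:

import torch

# Sketch of the failure mode: a half-precision weight that is only 2-byte aligned
# (because it starts one element into a larger buffer) combined with well-aligned
# input and bias tensors.
if torch.cuda.is_available():
    dtype, device = torch.half, "cuda"
    buf = torch.rand(5120 * 2560 + 1, dtype=dtype, device=device)
    weight = buf[1:].reshape(5120, 2560)                   # data pointer offset by 2 bytes
    x = torch.rand(26, 2560, dtype=dtype, device=device)   # 2D input -> cublasLt addmm path
    bias = torch.rand(5120, dtype=dtype, device=device)
    print(weight.data_ptr() % 4, x.data_ptr() % 4, bias.data_ptr() % 4)  # typically 2 0 0
    # On builds after pytorch#71200 and before this fix, this call could fail with
    # CUDA error: CUBLAS_STATUS_NOT_SUPPORTED inside cublasLtMatmul; with the fix,
    # the mixed-alignment case falls back to the unfused addmm path and succeeds.
    out = torch.nn.functional.linear(x, weight, bias)

The same idea is exercised by the new test added below (test_cublas_addmm_alignment).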

File tree

2 files changed (+37, -0 lines)


aten/src/ATen/native/cuda/Blas.cpp

+25
@@ -146,6 +146,18 @@ static bool getDisableAddmmCudaLt() {
   return false;
 }
 
+uint8_t getAlignment(const Tensor &t) {
+  // alignment are in bytes
+  uint8_t alignment = 1;
+  uintptr_t address = reinterpret_cast<uintptr_t>(t.data_ptr());
+  for (; alignment < 4; alignment *= 2) {
+    if (address % (alignment * 2)) {
+      return alignment;
+    }
+  }
+  return alignment;
+}
+
 Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, Activation activation=Activation::None) {
   // Make sure to keep addmm_cuda below in sync with this code; it
   // preflights a check to try to avoid actually needing to call
@@ -173,13 +185,26 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
   // leading dim >> rows when they are sliced from a large tensor
   // see fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul
   if (!disable_addmm_cuda_lt) {
+    auto self_alignment = getAlignment(self);
+    auto mat1_alignment = getAlignment(mat1);
+    auto mat2_alignment = getAlignment(mat2);
+    // due to a heuristic bug, cuBlasLt requires all alignments > 2 or the same ( == 2)
+    // should we err on the side of caution and remove the second dispatch path?
+    bool alignment_ok = (self_alignment > 2 &&
+                         mat1_alignment > 2 &&
+                         mat2_alignment > 2) ||
+                        (self_alignment == 2 &&
+                         mat1_alignment == 2 &&
+                         mat2_alignment == 2);
+
     useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 &&
         result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] &&
         self.is_contiguous() &&
         (scalar_type == at::ScalarType::Double ||
          scalar_type == at::ScalarType::Float ||
          scalar_type == at::ScalarType::Half ||
          scalar_type == at::ScalarType::BFloat16) &&
+        alignment_ok &&
         mat2_sizes[0] > 1 && mat2_sizes[1] > 1 &&
         mat2_sizes[0] < 65535 * 32 && mat2_sizes[1] < 65535 * 32 &&
         mat1_sizes[0] < 65535 * 32 && mat1_sizes[1] < 65535 * 32 &&
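
To make the gating logic above easier to follow, here is a small Python mirror of `getAlignment` and the `alignment_ok` predicate (illustrative only, not part of the commit; the real check runs in C++ on the raw CUDA pointers inside `addmm_out_cuda_impl`, and the integer addresses below are made up):

# Illustrative Python mirror of the C++ gating logic above.
def get_alignment(address: int) -> int:
    # mirrors getAlignment(): returns 1, 2, or 4 bytes, capped at 4
    alignment = 1
    while alignment < 4:
        if address % (alignment * 2):
            return alignment
        alignment *= 2
    return alignment

def alignment_ok(a: int, b: int, c: int) -> bool:
    # cuBlasLt path is used only if all operands are more than 2-byte aligned,
    # or all are exactly 2-byte aligned (the mixed case triggers the bug)
    als = [get_alignment(p) for p in (a, b, c)]
    return all(x > 2 for x in als) or all(x == 2 for x in als)

assert get_alignment(0x7f0000000002) == 2   # pointer offset by one fp16 element
assert get_alignment(0x7f0000000200) == 4   # well-aligned allocation (capped at 4)
assert not alignment_ok(0x7f0000000002, 0x7f0000000200, 0x7f0000000400)  # mixed -> fall back
assert alignment_ok(0x7f0000000002, 0x7f0000000102, 0x7f0000000202)      # all 2-byte -> Lt ok

Capping the reported alignment at 4 bytes is enough here, because the predicate only needs to distinguish "exactly 2-byte aligned" from "more than 2-byte aligned".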

test/test_matmul_cuda.py

+12
@@ -100,6 +100,18 @@ def test_cublas_addmm(self, size: int, dtype: torch.dtype):
         self.assertEqual(res_cpu, res_cuda)
         torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = orig
 
+    @onlyCUDA
+    def test_cublas_addmm_alignment(self):
+        dtype = torch.half
+        device = 'cuda'
+        A = torch.rand((5120 * 2560 + 1), requires_grad=True, dtype=dtype, device=device)
+        A = A[1:].reshape(5120, 2560)
+        # check that heuristic does not fail on 2-byte alignment
+        X = torch.rand((26, 1, 2560), requires_grad=True, dtype=dtype, device=device)
+        B = torch.rand((5120), requires_grad=True, dtype=dtype, device=device)
+        out = torch.nn.functional.linear(X, A, B)
+        self.assertEqual(out, torch.matmul(X, A.transpose(1, 0)) + B)
+
     @onlyCUDA
     @unittest.skipIf(not CUDA11OrLater, "Only CUDA 11+ is supported")
     @toleranceOverride({torch.float32: xtol(atol=1e-5, rtol=1e-5)})
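
Why this test produces the problematic alignment mix (an illustrative note, not part of the diff): the fresh fp16 allocation is strongly aligned by the CUDA caching allocator, so slicing off its first element shifts the data pointer by exactly 2 bytes, while `X` and `B` keep their original alignment. A quick check, assuming a CUDA device:

import torch

# Illustrative: reproduce the alignment the test sets up for the weight matrix A.
A = torch.rand(5120 * 2560 + 1, dtype=torch.half, device="cuda")
assert A.data_ptr() % 4 == 0        # fresh allocation: at least 4-byte aligned
assert A[1:].data_ptr() % 4 == 2    # dropping one 2-byte element leaves a 2-byte-aligned view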
