
Commit ae6dd20

eqy authored and pytorchmergebot committed
[cuDNN V8 API] (reopen 2) Allow the number of kernels profiled under torch.backends.cudnn.benchmark = True to be limited (pytorch#78299)
Reopen of pytorch#77002 to address comments by @malfet
CC @ngimel @ptrblck
Pull Request resolved: pytorch#78299
Approved by: https://github.com/ngimel
1 parent 7fd0cf5 commit ae6dd20
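
For orientation, a minimal sketch of the user-facing knob this commit adds, assuming a PyTorch build with the experimental cuDNN v8 API enabled (tensor shapes are arbitrary):

```python
import torch

# Let cuDNN benchmark competing convolution plans, but cap how many
# plans are profiled per unique convolution configuration.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.benchmark_limit = 4  # 0 means "profile every plan"

conv = torch.nn.Conv2d(3, 64, kernel_size=3, padding=1).cuda()
x = torch.randn(8, 3, 224, 224, device="cuda")
y = conv(x)  # the first call triggers the (now bounded) benchmark sweep
```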

File tree

8 files changed: +80 −6 lines

aten/src/ATen/Context.cpp

+8
@@ -144,6 +144,14 @@ void Context::setBenchmarkCuDNN(bool b) {
   benchmark_cudnn = b;
 }
 
+int Context::benchmarkLimitCuDNN() const {
+  return benchmark_limit_cudnn;
+}
+
+void Context::setBenchmarkLimitCuDNN(int b) {
+  benchmark_limit_cudnn = b;
+}
+
 bool Context::allowTF32CuBLAS() const {
   static bool allow_tf32_cublas_override = c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true;
   return allow_tf32_cublas_override || float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;

aten/src/ATen/Context.h

+3
@@ -121,6 +121,8 @@ class TORCH_API Context {
   void setUserEnabledMkldnn(bool e);
   bool benchmarkCuDNN() const;
   void setBenchmarkCuDNN(bool);
+  int benchmarkLimitCuDNN() const;
+  void setBenchmarkLimitCuDNN(int);
   bool deterministicCuDNN() const;
   void setDeterministicCuDNN(bool);
 
@@ -254,6 +256,7 @@ class TORCH_API Context {
   bool benchmark_cudnn = false;
   Float32MatmulPrecision float32_matmul_precision =
       at::Float32MatmulPrecision::HIGHEST;
+  int benchmark_limit_cudnn = 10;
   bool allow_tf32_cudnn = true;
   bool allow_fp16_reduction_cublas = true;
   bool enabled_mkldnn = true;
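
The new `benchmark_limit_cudnn` member above defaults to 10, so the Python-facing attribute should report that value out of the box; a quick sanity check, assuming a build where the attribute is exposed:

```python
import torch

# Mirrors Context::benchmark_limit_cudnn, initialized to 10 in Context.h.
print(torch.backends.cudnn.benchmark_limit)  # expected: 10
```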

aten/src/ATen/native/cudnn/Conv_v8.cpp

+5-3
@@ -344,7 +344,7 @@ void generate_and_filter_plans(const cudnnHandle_t handle, cudnn_frontend::Opera
       remove_invalid = true;
     }
   }
-  if (remove_invalid) {
+  if (remove_invalid || max_plans) {
     cudnn_frontend::executionPlans_t new_valid_plans;
     unsigned int plan_count = 0;
     for (auto &plan : valid_plans) {
@@ -370,7 +370,8 @@ auto get_plans_from_find(const cudnnHandle_t handle, const cudnnBackendDescripto
   cudnn_frontend::executionPlans_t valid_plans;
   c10::DeviceGuard g(x.options().device());
   at::DataPtr workspace_ptr;
-  generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr);
+  auto benchmark_limit = at::globalContext().benchmarkLimitCuDNN();
+  generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr, benchmark_limit);
   auto variantPack = cudnn_frontend::VariantPackBuilder()
       .setDataPointers(3, data_ptrs)
       .setUids(3, uids)
@@ -400,7 +401,8 @@ auto get_plans_from_find_fused(const cudnnHandle_t handle,
   cudnn_frontend::executionPlans_t valid_plans;
   c10::DeviceGuard g(x.options().device());
   at::DataPtr workspace_ptr;
-  generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr);
+  auto benchmark_limit = at::globalContext().benchmarkLimitCuDNN();
+  generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr, benchmark_limit);
   auto variantPack = cudnn_frontend::VariantPackBuilder()
       .setDataPointers(5, data_ptrs)
       .setUids(5, uids)
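
The `max_plans` argument now passed into `generate_and_filter_plans` caps how many valid execution plans survive filtering, with 0 meaning "no cap". A rough Python sketch of that truncation logic, with hypothetical stand-in names since the real code operates on cudnn_frontend plan objects:

```python
def filter_plans(plans, max_plans=0, is_valid=lambda plan: True):
    """Keep plans that pass the validity check, stopping once
    max_plans entries are kept; max_plans == 0 disables the cap."""
    kept = []
    for plan in plans:
        if not is_valid(plan):
            continue  # e.g. workspace allocation failed for this plan
        kept.append(plan)
        if max_plans and len(kept) >= max_plans:
            break  # benchmark_limit reached
    return kept

# Keep at most 2 of 5 candidate plans:
print(filter_plans(["p0", "p1", "p2", "p3", "p4"], max_plans=2))  # ['p0', 'p1']
```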

docs/source/backends.rst

+8
@@ -78,6 +78,14 @@ torch.backends.cudnn
     A :class:`bool` that, if True, causes cuDNN to benchmark multiple convolution algorithms
     and select the fastest.
 
+.. attribute::  torch.backends.cudnn.benchmark_limit
+
+    An :class:`int` that specifies the maximum number of cuDNN convolution algorithms to try when
+    `torch.backends.cudnn.benchmark` is True. Set `benchmark_limit` to zero to try every
+    available algorithm. Note that this setting only affects convolutions dispatched via the
+    cuDNN v8 API.
+
+
 torch.backends.mps
 ^^^^^^^^^^^^^^^^^^
 .. automodule:: torch.backends.mps
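
As the new documentation notes, zero disables the cap entirely; in sketch form:

```python
import torch

torch.backends.cudnn.benchmark = True
torch.backends.cudnn.benchmark_limit = 0   # exhaustive: profile every plan
torch.backends.cudnn.benchmark_limit = 10  # default: profile at most 10 plans
```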

torch/CMakeLists.txt

+4
@@ -142,6 +142,10 @@ if(USE_ROCM)
   list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES ${roctracer_INCLUDE_DIRS})
 endif()
 
+if(USE_EXPERIMENTAL_CUDNN_V8_API)
+  list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_EXPERIMENTAL_CUDNN_V8_API)
+endif()
+
 if(USE_CUDNN OR USE_ROCM)
   list(APPEND TORCH_PYTHON_SRCS
     ${TORCH_SRC_DIR}/csrc/cuda/shared/cudnn.cpp

torch/_C/__init__.pyi.in

+2
@@ -1053,6 +1053,8 @@ def _cuda_jiterator_compile_and_launch_kernel(code_string: str,
                                               num_outputs: _int,
                                               tensors: Tuple,
                                               kwargs: Dict[str, Union[_int, _float, _bool]]) -> Tensor: ...
+def _cuda_get_cudnn_benchmark_limit() -> _int: ...
+def _cuda_set_cudnn_benchmark_limit(arg: _int) -> None: ...
 def _nccl_version() -> _int: ...
 def _nccl_unique_id() -> bytes: ...
 def _nccl_init_rank(nranks: _int, comm_id: bytes, rank: _int) -> object: ...

torch/backends/cudnn/__init__.py

+10-3
@@ -102,15 +102,18 @@ def is_acceptable(tensor):
     return True
 
 
-def set_flags(_enabled=None, _benchmark=None, _deterministic=None, _allow_tf32=None):
+def set_flags(_enabled=None, _benchmark=None, _benchmark_limit=None, _deterministic=None, _allow_tf32=None):
     orig_flags = (torch._C._get_cudnn_enabled(),
                   torch._C._get_cudnn_benchmark(),
+                  None if not is_available() else torch._C._cuda_get_cudnn_benchmark_limit(),
                   torch._C._get_cudnn_deterministic(),
                   torch._C._get_cudnn_allow_tf32())
     if _enabled is not None:
         torch._C._set_cudnn_enabled(_enabled)
     if _benchmark is not None:
         torch._C._set_cudnn_benchmark(_benchmark)
+    if _benchmark_limit is not None and is_available():
+        torch._C._cuda_set_cudnn_benchmark_limit(_benchmark_limit)
     if _deterministic is not None:
         torch._C._set_cudnn_deterministic(_deterministic)
     if _allow_tf32 is not None:
@@ -119,9 +122,9 @@ def set_flags(_enabled=None, _benchmark=None, _deterministic=None, _allow_tf32=N
 
 
 @contextmanager
-def flags(enabled=False, benchmark=False, deterministic=False, allow_tf32=True):
+def flags(enabled=False, benchmark=False, benchmark_limit=10, deterministic=False, allow_tf32=True):
     with __allow_nonbracketed_mutation():
-        orig_flags = set_flags(enabled, benchmark, deterministic, allow_tf32)
+        orig_flags = set_flags(enabled, benchmark, benchmark_limit, deterministic, allow_tf32)
     try:
         yield
     finally:
@@ -141,6 +144,9 @@ def __init__(self, m, name):
     enabled = ContextProp(torch._C._get_cudnn_enabled, torch._C._set_cudnn_enabled)
     deterministic = ContextProp(torch._C._get_cudnn_deterministic, torch._C._set_cudnn_deterministic)
     benchmark = ContextProp(torch._C._get_cudnn_benchmark, torch._C._set_cudnn_benchmark)
+    benchmark_limit = None
+    if is_available():
+        benchmark_limit = ContextProp(torch._C._cuda_get_cudnn_benchmark_limit, torch._C._cuda_set_cudnn_benchmark_limit)
     allow_tf32 = ContextProp(torch._C._get_cudnn_allow_tf32, torch._C._set_cudnn_allow_tf32)
 
     # This is the sys.modules replacement trick, see
@@ -152,3 +158,4 @@ def __init__(self, m, name):
 deterministic: bool
 benchmark: bool
 allow_tf32: bool
+benchmark_limit: int
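
With `benchmark_limit` threaded through `set_flags` and the `flags` context manager, the setting can be scoped temporarily; a usage sketch, assuming a CUDA build where `is_available()` returns True:

```python
import torch

# Enable benchmarking with a tight plan cap only inside this block;
# the previous flags (including benchmark_limit) are restored on exit.
with torch.backends.cudnn.flags(enabled=True, benchmark=True, benchmark_limit=2):
    conv = torch.nn.Conv2d(3, 8, kernel_size=3).cuda()
    out = conv(torch.randn(1, 3, 32, 32, device="cuda"))
```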

torch/csrc/cuda/Module.cpp

+40
@@ -1,4 +1,10 @@
 #include <ATen/ATen.h>
+#include <ATen/cuda/CUDAConfig.h>
+#if AT_CUDNN_ENABLED()
+
+#include <ATen/native/cudnn/Macros.h>
+
+#endif
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <ATen/cuda/CachingHostAllocator.h>
@@ -727,6 +733,32 @@ static PyObject* THCPModule_isCurrentStreamCapturing_wrap(
   END_HANDLE_TH_ERRORS
 }
 
+PyObject* THCPModule_setBenchmarkLimitCuDNN(PyObject* _unused, PyObject* arg) {
+  THPUtils_assert(
+      THPUtils_checkLong(arg),
+      "set_benchmark_limit_cudnn expects an int, "
+      "but got %s",
+      THPUtils_typename(arg));
+  auto benchmark_limit = static_cast<int>(THPUtils_unpackLong(arg));
+#if defined(USE_ROCM)
+  TORCH_WARN_ONCE(
+      "cuDNN Benchmark limit is not supported in MIOpen and will have no effect.");
+#endif
+#if AT_CUDNN_ENABLED()
+#if HAS_CUDNN_V8()
+  at::globalContext().setBenchmarkLimitCuDNN(benchmark_limit);
+#else
+  TORCH_WARN_ONCE(
+      "cuDNN Benchmark limit is not supported with cuDNN v7 API and will have no effect.");
+#endif
+#endif
+  Py_RETURN_NONE;
+}
+
+PyObject* THCPModule_benchmarkLimitCuDNN(PyObject* _unused, PyObject* noargs) {
+  return THPUtils_packInt32(at::globalContext().benchmarkLimitCuDNN());
+}
+
 // NOLINTNEXTLINE(modernize-avoid-c-arrays,
 // cppcoreguidelines-avoid-non-const-global-variables,
 // cppcoreguidelines-avoid-c-arrays)
@@ -814,6 +846,14 @@ static struct PyMethodDef _THCPModule_methods[] = {
      THCPModule_cudaJiteratorCompileAndLaunchKernel,
      METH_VARARGS,
      nullptr},
+    {"_cuda_get_cudnn_benchmark_limit",
+     THCPModule_benchmarkLimitCuDNN,
+     METH_NOARGS,
+     nullptr},
+    {"_cuda_set_cudnn_benchmark_limit",
+     THCPModule_setBenchmarkLimitCuDNN,
+     METH_O,
+     nullptr},
 #ifdef USE_NCCL
     {"_nccl_version", THCPModule_nccl_version, METH_NOARGS, nullptr},
     {"_nccl_unique_id", THCPModule_nccl_unique_id, METH_NOARGS, nullptr},
