ananthsub
diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
Lines changed: 30 additions & 0 deletions
diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
Lines changed: 1 addition & 0 deletions
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
Lines changed: 3 additions & 0 deletions
@@ -188,6 +188,36 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f,
             sharedMemBytes, hStream, kernelParams, extra);
 }
 
+// Irregularly shaped functions
+// Forwarding stub for cuLaunchCooperativeKernel: looks the symbol up by this
+// function's own name via getCUDALibrary().sym(__func__), records the pointer
+// in lazyNVRTC, then forwards every argument unchanged and returns the result.
+// Throws std::runtime_error when the looked-up pointer is null.
+CUresult CUDAAPI cuLaunchCooperativeKernel(
+    CUfunction f,
+    unsigned int gridDimX,
+    unsigned int gridDimY,
+    unsigned int gridDimZ,
+    unsigned int blockDimX,
+    unsigned int blockDimY,
+    unsigned int blockDimZ,
+    unsigned int sharedMemBytes,
+    CUstream hStream,
+    void** kernelParams) {
+  using DriverEntry = decltype(&cuLaunchCooperativeKernel);
+  const auto resolved =
+      reinterpret_cast<DriverEntry>(getCUDALibrary().sym(__func__));
+  if (resolved == nullptr) {
+    throw std::runtime_error("Can't get cuLaunchCooperativeKernel");
+  }
+  // Record the resolved pointer in lazyNVRTC before forwarding the call.
+  lazyNVRTC.cuLaunchCooperativeKernel = resolved;
+  return resolved(
+      f, gridDimX, gridDimY, gridDimZ,
+      blockDimX, blockDimY, blockDimZ,
+      sharedMemBytes, hStream, kernelParams);
+}
+
 CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module,
                                     const void *image,
                                     unsigned int numOptions,
 
@@ -49,6 +49,7 @@ namespace at { namespace cuda {
   _(cuOccupancyMaxActiveBlocksPerMultiprocessor) \
   _(cuGetErrorString)                            \
   _(cuLaunchKernel)                              \
+  _(cuLaunchCooperativeKernel)                   \
   _(cuCtxGetCurrent)                             \
   _(cuModuleUnload)                              \
   _(cuDevicePrimaryCtxGetState)                  \
 
@@ -962,8 +962,11 @@ if(USE_CUDA OR USE_ROCM)
     ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/broadcast.cu
     ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fp16_support.cu
     ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/bf16_support.cu
+    ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu
     ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_reduction.cu
+    ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_sync.cu
     ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/helpers.cu
+    ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/index_utils.cu
     ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/random_numbers.cu
     ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tensor.cu
     ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/welford.cu