Skip to content

Commit 76d282d

Browse files
jjsjann123 authored and facebook-github-bot committed
Nvfuser code bump 12 5 (pytorch#69964)
Summary: Pull Request resolved: pytorch#69964 Things added in this PR that require review: 1. cuLaunchCooperativeKernel driver API added in aten/src/ATen/cuda/detail/LazyNVRTC.cpp and aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h nvfuser code update: 1. perf tuning on codegen scheduler that improves performance. 2. permutation support has been extended beyond contiguous/channels-last. (The improvements could be observed on PW benchmark) Things reverted from local changes: 1. aten::gelu with approximation 2. local changes that are upstreamed in PR pytorch#68804 Pull Request resolved: pytorch#69428 Reviewed By: ngimel Differential Revision: D33073817 Pulled By: wconstab fbshipit-source-id: e77d32e81d037d7370822b040456fd4c3bd68edb
1 parent a6a1c70 commit 76d282d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+5362
-1549
lines changed

aten/src/ATen/cuda/detail/LazyNVRTC.cpp

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -188,6 +188,36 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f,
188188
sharedMemBytes, hStream, kernelParams, extra);
189189
}
190190

191+
// Irregularly shaped functions
192+
// Stub for cuLaunchCooperativeKernel that binds to the real symbol when called:
// it looks the symbol up by this function's own name (`__func__`) in the
// library returned by getCUDALibrary(), records the resolved pointer in
// lazyNVRTC, and forwards every argument unchanged.
// Throws std::runtime_error if the lookup yields a null pointer.
// Returns whatever the resolved function returns.
CUresult CUDAAPI cuLaunchCooperativeKernel(
193+
CUfunction f,
194+
unsigned int gridDimX,
195+
unsigned int gridDimY,
196+
unsigned int gridDimZ,
197+
unsigned int blockDimX,
198+
unsigned int blockDimY,
199+
unsigned int blockDimZ,
200+
unsigned int sharedMemBytes,
201+
CUstream hStream,
202+
void** kernelParams) {
203+
// The cast target is this stub's own pointer type, so the resolved symbol is
// invoked with exactly the signature declared above.
auto fn = reinterpret_cast<decltype(&cuLaunchCooperativeKernel)>(
204+
getCUDALibrary().sym(__func__));
205+
if (!fn)
206+
throw std::runtime_error("Can't get cuLaunchCooperativeKernel");
207+
// NOTE(review): presumably this overwrites the table entry so later calls made
// through lazyNVRTC skip this stub — confirm against how lazyNVRTC is set up.
lazyNVRTC.cuLaunchCooperativeKernel = fn;
208+
return fn(
209+
f,
210+
gridDimX,
211+
gridDimY,
212+
gridDimZ,
213+
blockDimX,
214+
blockDimY,
215+
blockDimZ,
216+
sharedMemBytes,
217+
hStream,
218+
kernelParams);
219+
}
220+
191221
CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module,
192222
const void *image,
193223
unsigned int numOptions,

aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,7 @@ namespace at { namespace cuda {
4949
_(cuOccupancyMaxActiveBlocksPerMultiprocessor) \
5050
_(cuGetErrorString) \
5151
_(cuLaunchKernel) \
52+
_(cuLaunchCooperativeKernel) \
5253
_(cuCtxGetCurrent) \
5354
_(cuModuleUnload) \
5455
_(cuDevicePrimaryCtxGetState) \

caffe2/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -962,8 +962,11 @@ if(USE_CUDA OR USE_ROCM)
962962
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/broadcast.cu
963963
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fp16_support.cu
964964
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/bf16_support.cu
965+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu
965966
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_reduction.cu
967+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_sync.cu
966968
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/helpers.cu
969+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/index_utils.cu
967970
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/random_numbers.cu
968971
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tensor.cu
969972
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/welford.cu

0 commit comments

Comments
 (0)