diff --git a/src/cuda/helpers.h b/src/cuda/helpers.h index b176805c2..e6829b589 100644 --- a/src/cuda/helpers.h +++ b/src/cuda/helpers.h @@ -6,6 +6,7 @@ #ifdef CT2_USE_HIP #include #include +#include #define __nv_bfloat16 __hip_bfloat16 __device__ inline void __syncwarp(uint32_t mask){} //TODO: 6.1 should have this but it doesn't? @@ -414,7 +415,7 @@ namespace ctranslate2 { // They help define row-wise reduction where each block handles a single row. #ifdef CT2_USE_HIP - #define C10_WARP_SIZE 64 //TODO: detect arch to set 32 for rdna + #define C10_WARP_SIZE warpSize #else #define C10_WARP_SIZE 32 #endif diff --git a/src/ops/conv1d_gpu.cu b/src/ops/conv1d_gpu.cu index e7006e42f..1503f01c6 100644 --- a/src/ops/conv1d_gpu.cu +++ b/src/ops/conv1d_gpu.cu @@ -67,8 +67,8 @@ namespace ctranslate2 { size_t workspace_size = 0; void* workspace = nullptr; CUDNN_CHECK(miopenConvolutionForwardGetWorkSpaceSize(handle, - input_desc, weight_desc, + input_desc, conv_desc, output_desc, &workspace_size));