From 138bb4b152269dcb0ee790b0e666b47d71120334 Mon Sep 17 00:00:00 2001 From: troelsy Date: Thu, 22 May 2025 11:31:45 +0000 Subject: [PATCH 1/6] Initial implementation of Otsu in CUDA --- .../cudaarithm/include/opencv2/cudaarithm.hpp | 10 +- modules/cudaarithm/src/cuda/threshold.cu | 220 +++++++++++++++++- .../test/test_element_operations.cpp | 51 +++- .../opencv2/cudev/util/saturate_cast.hpp | 2 + .../include/opencv2/cudev/warp/shuffle.hpp | 10 + 5 files changed, 288 insertions(+), 5 deletions(-) diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp index e5ab2cf2575..80b4dd8fac7 100644 --- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp +++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp @@ -546,12 +546,16 @@ static inline void scaleAdd(InputArray src1, double alpha, InputArray src2, Outp /** @brief Applies a fixed-level threshold to each array element. +The special value cv::THRESH_OTSU may be combined with one of the other types. In this case, the function determines the +optimal threshold value using the Otsu's and uses it instead of the specified threshold. The function returns the +computed threshold value in addititon to the thresholded matrix. +The Otsu's method is implemented only for 8-bit matrices. + @param src Source array (single-channel). -@param dst Destination array with the same size and type as src . +@param dst Destination array with the same size and type as src. @param thresh Threshold value. @param maxval Maximum value to use with THRESH_BINARY and THRESH_BINARY_INV threshold types. -@param type Threshold type. For details, see threshold . The THRESH_OTSU and THRESH_TRIANGLE -threshold types are not supported. +@param type Threshold type. For details, see threshold. The THRESH_TRIANGLE threshold type is not supported. @param stream Stream for the asynchronous version. @sa threshold diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu index cfa88dfae7c..8a04fc790b7 100644 --- a/modules/cudaarithm/src/cuda/threshold.cu +++ b/modules/cudaarithm/src/cuda/threshold.cu @@ -95,12 +95,230 @@ namespace } } -double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& stream) +template +__global__ void otsu_mean(uint *histogram, uint *threshold_sums, unsigned long long *sums) +{ + extern __shared__ unsigned long long shared_memory_u64[]; + + uint bin_idx = blockIdx.x * blockDim.x + threadIdx.x; + uint threshold = blockIdx.y * blockDim.y + threadIdx.y; + + uint threshold_sum_above = 0; + unsigned long long sum_above = 0; + + if (bin_idx >= threshold) + { + uint value = histogram[bin_idx]; + threshold_sum_above = value; + sum_above = value * bin_idx; + } + + blockReduce((uint *)shared_memory_u64, threshold_sum_above, bin_idx, plus()); + blockReduce((unsigned long long *)shared_memory_u64, sum_above, bin_idx, plus()); + + if (bin_idx == 0) + { + threshold_sums[threshold] = threshold_sum_above; + sums[threshold] = sum_above; + } +} + +template +__global__ void +otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums) +{ + extern __shared__ signed long long shared_memory_i64[]; + + uint bin_idx = blockIdx.x * blockDim.x + threadIdx.x; + uint threshold = blockIdx.y * blockDim.y + threadIdx.y; + + uint n_samples = threshold_sums[0]; + uint n_samples_above = threshold_sums[threshold]; + uint n_samples_below = n_samples - n_samples_above; + + unsigned long long total_sum = sums[0]; + unsigned long long sum_above = sums[threshold]; + unsigned long long sum_below = total_sum - sum_above; + + int threshold_variance_above = 0; + int threshold_variance_below = 0; + if (bin_idx >= threshold) + { + uint mean = sum_above / n_samples_above; + int sigma = bin_idx - mean; + threshold_variance_above = sigma * sigma; + } + else + { + uint mean = sum_below / n_samples_below; + int sigma = bin_idx - mean; + threshold_variance_below = sigma * sigma; + } + + uint bin_count = histogram[bin_idx]; + signed long long threshold_variance_above64 = threshold_variance_above * bin_count; + signed long long threshold_variance_below64 = threshold_variance_below * bin_count; + blockReduce((signed long long *)shared_memory_i64, threshold_variance_above64, bin_idx, plus()); + blockReduce((signed long long *)shared_memory_i64, threshold_variance_below64, bin_idx, plus()); + + if (bin_idx == 0) + { + variance[threshold] = make_longlong2(threshold_variance_above64, threshold_variance_below64); + } +} + +template +__global__ void +otsu_score(uint *otsu_threshold, uint *threshold_sums, longlong2 *variance) +{ + extern __shared__ float shared_memory_f32[]; + + uint threshold = blockIdx.x * blockDim.x + threadIdx.x; + + uint n_samples = threshold_sums[0]; + uint n_samples_above = threshold_sums[threshold]; + uint n_samples_below = n_samples - n_samples_above; + + float threshold_mean_above = (float)n_samples_above / n_samples; + float threshold_mean_below = (float)n_samples_below / n_samples; + + longlong2 variances = variance[threshold]; + float variance_above = (float)variances.x / n_samples_above; + float variance_below = (float)variances.y / n_samples_below; + + float above = threshold_mean_above * variance_above; + float below = threshold_mean_below * variance_below; + float score = above + below; + + float original_score = score; + + blockReduce((float *)shared_memory_f32, score, threshold, minimum()); + + if (threshold == 0) + { + shared_memory_f32[0] = score; + } + __syncthreads(); + + score = shared_memory_f32[0]; + + // We found the minimum score, but we need to find the threshold. If we find the thread with the minimum score, we + // know which threshold it is + if (original_score == score) + { + *otsu_threshold = threshold - 1; + } +} + +template +void compute_otsu_async(uint *histogram, uint *otsu_threshold, Stream &stream) +{ + CV_Assert(n_bins == 256); + CV_Assert(n_thresholds == 256); + + cudaStream_t cuda_stream = StreamAccessor::getStream(stream); + + dim3 block_all = dim3(n_bins, 1, 1); + dim3 grid_all = dim3(divUp((int)n_bins, block_all.x), divUp((int)n_thresholds, block_all.y), 1); + dim3 block_score = dim3(n_thresholds, 1, 1); + dim3 grid_score = dim3(divUp((int)n_thresholds, block_score.x), 1, 1); + + GpuMat gpu_threshold_sums = GpuMat(1, n_bins, CV_32SC1); + GpuMat gpu_sums = GpuMat(1, n_bins, CV_64FC1); + GpuMat gpu_variances = GpuMat(1, n_bins, CV_32SC4); + + uint shared_memory; + shared_memory = n_bins * sizeof(unsigned long long); + otsu_mean<256><<>>(histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); + otsu_variance<256><<>>(gpu_variances.ptr(), histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); + + shared_memory = n_bins * sizeof(float); + otsu_score<256><<>>( + otsu_threshold, gpu_threshold_sums.ptr(), gpu_variances.ptr()); +} + +// TODO: Replace this is cv::cuda::calcHist +template +__global__ void histogram_kernel( + uint *histogram, const uint8_t *image, uint width, + uint height, uint pitch) +{ + __shared__ uint local_histogram[n_bins]; + + uint x = blockIdx.x * blockDim.x + threadIdx.x; + uint y = blockIdx.y * blockDim.y + threadIdx.y; + uint tid = threadIdx.y * blockDim.x + threadIdx.x; + + if (tid < n_bins) + { + local_histogram[tid] = 0; + } + + __syncthreads(); + + if (x < width && y < height) + { + uint8_t value = image[y * pitch + x]; + atomicInc(&local_histogram[value], 0xFFFFFFFF); + } + + __syncthreads(); + + if (tid < n_bins) + { + cv::cudev::atomicAdd(&histogram[tid], local_histogram[tid]); + } +} + +// TODO: Replace this with cv::cuda::calcHist +void calcHist( + const GpuMat src, GpuMat histogram, Stream stream) +{ + const uint n_bins = 256; + + cudaStream_t cuda_stream = StreamAccessor::getStream(stream); + + dim3 block(128, 4, 1); + dim3 grid = dim3(divUp(src.cols, block.x), divUp(src.rows, block.y), 1); + CV_CUDEV_SAFE_CALL(cudaMemsetAsync(histogram.ptr(), 0, n_bins * sizeof(uint), cuda_stream)); + histogram_kernel + <<>>( + histogram.ptr(), src.ptr(), (uint) src.cols, (uint) src.rows, (uint) src.step); +} + +double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream &stream) { GpuMat src = getInputMat(_src, stream); const int depth = src.depth(); + // TODO: How can we access precomp.hpp to avoid defining THRESH_OTSU? + const int THRESH_OTSU = 8; + if ((type & THRESH_OTSU) == THRESH_OTSU) + { + CV_Assert(depth == CV_8U); + CV_Assert(src.channels() == 1); + CV_Assert(maxVal == 255.0); + + const uint n_bins = 256; + const uint n_thresholds = 256; + + // Find the threshold using Otsu and then run the normal thresholding algorithm + GpuMat gpu_histogram = GpuMat(n_bins, 1, CV_32SC1); + calcHist(src, gpu_histogram, stream); + + GpuMat gpu_otsu_threshold(1, 1, CV_32SC1); + compute_otsu_async(gpu_histogram.ptr(), gpu_otsu_threshold.ptr(), stream); + + cv::Mat mat_otsu_threshold; + gpu_otsu_threshold.download(mat_otsu_threshold, stream); + stream.waitForCompletion(); + + // Overwrite the threshold value with the Otsu value and remove the Otsu flag from the type + type = type & ~THRESH_OTSU; + thresh = (double) mat_otsu_threshold.at(0); + } + CV_Assert( depth <= CV_64F ); CV_Assert( type <= 4 /*THRESH_TOZERO_INV*/ ); diff --git a/modules/cudaarithm/test/test_element_operations.cpp b/modules/cudaarithm/test/test_element_operations.cpp index a15ad7b3ee5..c91fa91fd2f 100644 --- a/modules/cudaarithm/test/test_element_operations.cpp +++ b/modules/cudaarithm/test/test_element_operations.cpp @@ -2529,7 +2529,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, AddWeighted, testing::Combine( /////////////////////////////////////////////////////////////////////////////////////////////////////// // Threshold -CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV) +CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV, cv::THRESH_OTSU) #define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV)) PARAM_TEST_CASE(Threshold, cv::cuda::DeviceInfo, cv::Size, MatType, Channels, ThreshOp, UseRoi) @@ -2577,6 +2577,55 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Threshold, testing::Combine( ALL_THRESH_OPS, WHOLE_SUBMAT)); +/////////////////////////////////////////////////////////////////////////////////////////////////////// +// ThresholdOtsu + +PARAM_TEST_CASE(ThresholdOtsu, cv::cuda::DeviceInfo, cv::Size, MatType, Channels, ThreshOp, UseRoi) +{ + cv::cuda::DeviceInfo devInfo; + cv::Size size; + int type; + int channel; + int threshOp; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + type = GET_PARAM(2) | cv::THRESH_OTSU; + channel = GET_PARAM(3); + threshOp = GET_PARAM(4); + useRoi = GET_PARAM(5); + + cv::cuda::setDevice(devInfo.deviceID()); + } +}; + +CUDA_TEST_P(ThresholdOtsu, Accuracy) +{ + cv::Mat src = randomMat(size, CV_MAKE_TYPE(type, channel)); + double maxVal = randomDouble(20.0, 127.0); + double thresh = randomDouble(0.0, maxVal); + + cv::cuda::GpuMat dst = createMat(src.size(), src.type(), useRoi); + cv::cuda::threshold(loadMat(src, useRoi), dst, thresh, maxVal, threshOp); + + cv::Mat dst_gold; + cv::threshold(src, dst_gold, thresh, maxVal, threshOp); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(CUDA_Arithm, ThresholdOtsu, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values(MatDepth(CV_8U)), + testing::Values(Channels(1)), + ALL_THRESH_OPS, + WHOLE_SUBMAT)); + + //////////////////////////////////////////////////////////////////////////////// // InRange diff --git a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp index 59fd8da45ac..c256e7d908c 100644 --- a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp +++ b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp @@ -62,6 +62,8 @@ template __device__ __forceinline__ T saturate_cast(ushort v) { ret template __device__ __forceinline__ T saturate_cast(short v) { return T(v); } template __device__ __forceinline__ T saturate_cast(uint v) { return T(v); } template __device__ __forceinline__ T saturate_cast(int v) { return T(v); } +template __device__ __forceinline__ T saturate_cast(signed long long v) { return T(v); } +template __device__ __forceinline__ T saturate_cast(unsigned long long v) { return T(v); } template __device__ __forceinline__ T saturate_cast(float v) { return T(v); } template __device__ __forceinline__ T saturate_cast(double v) { return T(v); } diff --git a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp index f00d1f850d0..0de5351fff3 100644 --- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp +++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp @@ -332,6 +332,16 @@ __device__ __forceinline__ uint shfl_down(uint val, uint delta, int width = warp return (uint) __shfl_down((int) val, delta, width); } +__device__ __forceinline__ signed long long shfl_down(signed long long val, uint delta, int width = warpSize) +{ + return __shfl_down(val, delta, width); +} + +__device__ __forceinline__ unsigned long long shfl_down(unsigned long long val, uint delta, int width = warpSize) +{ + return (unsigned long long) __shfl_down(val, delta, width); +} + __device__ __forceinline__ float shfl_down(float val, uint delta, int width = warpSize) { return __shfl_down(val, delta, width); From 6d6218e3dd2f4b145b7dbf620695398b8771769b Mon Sep 17 00:00:00 2001 From: troelsy Date: Thu, 22 May 2025 11:46:01 +0000 Subject: [PATCH 2/6] Avoid potential race condition of shared memory --- modules/cudaarithm/src/cuda/threshold.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu index 8a04fc790b7..5ad6f213816 100644 --- a/modules/cudaarithm/src/cuda/threshold.cu +++ b/modules/cudaarithm/src/cuda/threshold.cu @@ -114,6 +114,7 @@ __global__ void otsu_mean(uint *histogram, uint *threshold_sums, unsigned long l } blockReduce((uint *)shared_memory_u64, threshold_sum_above, bin_idx, plus()); + __syncthreads(); blockReduce((unsigned long long *)shared_memory_u64, sum_above, bin_idx, plus()); if (bin_idx == 0) @@ -159,6 +160,7 @@ otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsign signed long long threshold_variance_above64 = threshold_variance_above * bin_count; signed long long threshold_variance_below64 = threshold_variance_below * bin_count; blockReduce((signed long long *)shared_memory_i64, threshold_variance_above64, bin_idx, plus()); + __syncthreads(); blockReduce((signed long long *)shared_memory_i64, threshold_variance_below64, bin_idx, plus()); if (bin_idx == 0) From 7d4fbfe8aa5ee057271a1f4c29e0388b53b8020d Mon Sep 17 00:00:00 2001 From: troelsy Date: Tue, 27 May 2025 14:24:06 +0000 Subject: [PATCH 3/6] Fixed test and solved numerical issue computing the variance for Otsu --- modules/cudaarithm/src/cuda/threshold.cu | 26 +++++++++---------- .../test/test_element_operations.cpp | 11 ++++---- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu index 5ad6f213816..4d194737e1c 100644 --- a/modules/cudaarithm/src/cuda/threshold.cu +++ b/modules/cudaarithm/src/cuda/threshold.cu @@ -141,31 +141,31 @@ otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsign unsigned long long sum_above = sums[threshold]; unsigned long long sum_below = total_sum - sum_above; - int threshold_variance_above = 0; - int threshold_variance_below = 0; + float threshold_variance_above_f32 = 0; + float threshold_variance_below_f32 = 0; if (bin_idx >= threshold) { - uint mean = sum_above / n_samples_above; - int sigma = bin_idx - mean; - threshold_variance_above = sigma * sigma; + float mean = (float) sum_above / n_samples_above; + float sigma = bin_idx - mean; + threshold_variance_above_f32 = sigma * sigma; } else { - uint mean = sum_below / n_samples_below; - int sigma = bin_idx - mean; - threshold_variance_below = sigma * sigma; + float mean = (float) sum_below / n_samples_below; + float sigma = bin_idx - mean; + threshold_variance_below_f32 = sigma * sigma; } uint bin_count = histogram[bin_idx]; - signed long long threshold_variance_above64 = threshold_variance_above * bin_count; - signed long long threshold_variance_below64 = threshold_variance_below * bin_count; - blockReduce((signed long long *)shared_memory_i64, threshold_variance_above64, bin_idx, plus()); + signed long long threshold_variance_above_i64 = (signed long long)(threshold_variance_above_f32 * bin_count); + signed long long threshold_variance_below_i64 = (signed long long)(threshold_variance_below_f32 * bin_count); + blockReduce((signed long long *)shared_memory_i64, threshold_variance_above_i64, bin_idx, plus()); __syncthreads(); - blockReduce((signed long long *)shared_memory_i64, threshold_variance_below64, bin_idx, plus()); + blockReduce((signed long long *)shared_memory_i64, threshold_variance_below_i64, bin_idx, plus()); if (bin_idx == 0) { - variance[threshold] = make_longlong2(threshold_variance_above64, threshold_variance_below64); + variance[threshold] = make_longlong2(threshold_variance_above_i64, threshold_variance_below_i64); } } diff --git a/modules/cudaarithm/test/test_element_operations.cpp b/modules/cudaarithm/test/test_element_operations.cpp index c91fa91fd2f..baf5e49e3c4 100644 --- a/modules/cudaarithm/test/test_element_operations.cpp +++ b/modules/cudaarithm/test/test_element_operations.cpp @@ -2593,9 +2593,9 @@ PARAM_TEST_CASE(ThresholdOtsu, cv::cuda::DeviceInfo, cv::Size, MatType, Channels { devInfo = GET_PARAM(0); size = GET_PARAM(1); - type = GET_PARAM(2) | cv::THRESH_OTSU; + type = GET_PARAM(2); channel = GET_PARAM(3); - threshOp = GET_PARAM(4); + threshOp = GET_PARAM(4) | cv::THRESH_OTSU; useRoi = GET_PARAM(5); cv::cuda::setDevice(devInfo.deviceID()); @@ -2605,15 +2605,14 @@ PARAM_TEST_CASE(ThresholdOtsu, cv::cuda::DeviceInfo, cv::Size, MatType, Channels CUDA_TEST_P(ThresholdOtsu, Accuracy) { cv::Mat src = randomMat(size, CV_MAKE_TYPE(type, channel)); - double maxVal = randomDouble(20.0, 127.0); - double thresh = randomDouble(0.0, maxVal); cv::cuda::GpuMat dst = createMat(src.size(), src.type(), useRoi); - cv::cuda::threshold(loadMat(src, useRoi), dst, thresh, maxVal, threshOp); + double otsu_gpu = cv::cuda::threshold(loadMat(src, useRoi), dst, 0, 255, threshOp); cv::Mat dst_gold; - cv::threshold(src, dst_gold, thresh, maxVal, threshOp); + double otsu_cpu = cv::threshold(src, dst_gold, 0, 255, threshOp); + EXPECT_NEAR(otsu_gpu, otsu_cpu, 1e-5); EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } From bdeeab74f9c86d8b80420a978a79d284f9bee406 Mon Sep 17 00:00:00 2001 From: troelsy Date: Fri, 30 May 2025 07:39:49 +0000 Subject: [PATCH 4/6] Review changes --- modules/cudaarithm/src/cuda/threshold.cu | 76 +++++++++---------- .../test/test_element_operations.cpp | 2 +- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu index 4d194737e1c..e771df20320 100644 --- a/modules/cudaarithm/src/cuda/threshold.cu +++ b/modules/cudaarithm/src/cuda/threshold.cu @@ -95,13 +95,16 @@ namespace } } -template -__global__ void otsu_mean(uint *histogram, uint *threshold_sums, unsigned long long *sums) + +__global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long long *sums) { - extern __shared__ unsigned long long shared_memory_u64[]; + const uint32_t n_bins = 256; + + __shared__ uint shared_memory_ts[n_bins / WARP_SIZE]; + __shared__ unsigned long long shared_memory_s[n_bins / WARP_SIZE]; - uint bin_idx = blockIdx.x * blockDim.x + threadIdx.x; - uint threshold = blockIdx.y * blockDim.y + threadIdx.y; + int bin_idx = threadIdx.x; + int threshold = blockIdx.x; uint threshold_sum_above = 0; unsigned long long sum_above = 0; @@ -113,9 +116,8 @@ __global__ void otsu_mean(uint *histogram, uint *threshold_sums, unsigned long l sum_above = value * bin_idx; } - blockReduce((uint *)shared_memory_u64, threshold_sum_above, bin_idx, plus()); - __syncthreads(); - blockReduce((unsigned long long *)shared_memory_u64, sum_above, bin_idx, plus()); + blockReduce(shared_memory_ts, threshold_sum_above, bin_idx, plus()); + blockReduce(shared_memory_s, sum_above, bin_idx, plus()); if (bin_idx == 0) { @@ -124,14 +126,16 @@ __global__ void otsu_mean(uint *histogram, uint *threshold_sums, unsigned long l } } -template __global__ void otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums) { - extern __shared__ signed long long shared_memory_i64[]; + const uint32_t n_bins = 256; + + __shared__ signed long long shared_memory_a[n_bins / WARP_SIZE]; + __shared__ signed long long shared_memory_b[n_bins / WARP_SIZE]; - uint bin_idx = blockIdx.x * blockDim.x + threadIdx.x; - uint threshold = blockIdx.y * blockDim.y + threadIdx.y; + int bin_idx = threadIdx.x; + int threshold = blockIdx.x; uint n_samples = threshold_sums[0]; uint n_samples_above = threshold_sums[threshold]; @@ -159,9 +163,8 @@ otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsign uint bin_count = histogram[bin_idx]; signed long long threshold_variance_above_i64 = (signed long long)(threshold_variance_above_f32 * bin_count); signed long long threshold_variance_below_i64 = (signed long long)(threshold_variance_below_f32 * bin_count); - blockReduce((signed long long *)shared_memory_i64, threshold_variance_above_i64, bin_idx, plus()); - __syncthreads(); - blockReduce((signed long long *)shared_memory_i64, threshold_variance_below_i64, bin_idx, plus()); + blockReduce(shared_memory_a, threshold_variance_above_i64, bin_idx, plus()); + blockReduce(shared_memory_b, threshold_variance_below_i64, bin_idx, plus()); if (bin_idx == 0) { @@ -169,13 +172,15 @@ otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsign } } -template + __global__ void otsu_score(uint *otsu_threshold, uint *threshold_sums, longlong2 *variance) { - extern __shared__ float shared_memory_f32[]; + const uint32_t n_thresholds = 256; + + __shared__ float shared_memory[n_thresholds / WARP_SIZE]; - uint threshold = blockIdx.x * blockDim.x + threadIdx.x; + int threshold = threadIdx.x; uint n_samples = threshold_sums[0]; uint n_samples_above = threshold_sums[threshold]; @@ -194,15 +199,15 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, longlong2 *variance) float original_score = score; - blockReduce((float *)shared_memory_f32, score, threshold, minimum()); + blockReduce(shared_memory, score, threshold, minimum()); if (threshold == 0) { - shared_memory_f32[0] = score; + shared_memory[0] = score; } __syncthreads(); - score = shared_memory_f32[0]; + score = shared_memory[0]; // We found the minimum score, but we need to find the threshold. If we find the thread with the minimum score, we // know which threshold it is @@ -212,18 +217,17 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, longlong2 *variance) } } -template -void compute_otsu_async(uint *histogram, uint *otsu_threshold, Stream &stream) +void compute_otsu(uint *histogram, uint *otsu_threshold, Stream &stream) { - CV_Assert(n_bins == 256); - CV_Assert(n_thresholds == 256); + const uint n_bins = 256; + const uint n_thresholds = 256; cudaStream_t cuda_stream = StreamAccessor::getStream(stream); - dim3 block_all = dim3(n_bins, 1, 1); - dim3 grid_all = dim3(divUp((int)n_bins, block_all.x), divUp((int)n_thresholds, block_all.y), 1); - dim3 block_score = dim3(n_thresholds, 1, 1); - dim3 grid_score = dim3(divUp((int)n_thresholds, block_score.x), 1, 1); + dim3 block_all(n_bins); + dim3 grid_all(n_thresholds); + dim3 block_score(n_thresholds); + dim3 grid_score(1); GpuMat gpu_threshold_sums = GpuMat(1, n_bins, CV_32SC1); GpuMat gpu_sums = GpuMat(1, n_bins, CV_64FC1); @@ -231,11 +235,11 @@ void compute_otsu_async(uint *histogram, uint *otsu_threshold, Stream &stream) uint shared_memory; shared_memory = n_bins * sizeof(unsigned long long); - otsu_mean<256><<>>(histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); - otsu_variance<256><<>>(gpu_variances.ptr(), histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); + otsu_sums<<>>(histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); + otsu_variance<<>>(gpu_variances.ptr(), histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); shared_memory = n_bins * sizeof(float); - otsu_score<256><<>>( + otsu_score<<>>( otsu_threshold, gpu_threshold_sums.ptr(), gpu_variances.ptr()); } @@ -300,17 +304,13 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou { CV_Assert(depth == CV_8U); CV_Assert(src.channels() == 1); - CV_Assert(maxVal == 255.0); - - const uint n_bins = 256; - const uint n_thresholds = 256; // Find the threshold using Otsu and then run the normal thresholding algorithm - GpuMat gpu_histogram = GpuMat(n_bins, 1, CV_32SC1); + GpuMat gpu_histogram = GpuMat(256, 1, CV_32SC1); calcHist(src, gpu_histogram, stream); GpuMat gpu_otsu_threshold(1, 1, CV_32SC1); - compute_otsu_async(gpu_histogram.ptr(), gpu_otsu_threshold.ptr(), stream); + compute_otsu(gpu_histogram.ptr(), gpu_otsu_threshold.ptr(), stream); cv::Mat mat_otsu_threshold; gpu_otsu_threshold.download(mat_otsu_threshold, stream); diff --git a/modules/cudaarithm/test/test_element_operations.cpp b/modules/cudaarithm/test/test_element_operations.cpp index baf5e49e3c4..f3ce3b52020 100644 --- a/modules/cudaarithm/test/test_element_operations.cpp +++ b/modules/cudaarithm/test/test_element_operations.cpp @@ -2612,7 +2612,7 @@ CUDA_TEST_P(ThresholdOtsu, Accuracy) cv::Mat dst_gold; double otsu_cpu = cv::threshold(src, dst_gold, 0, 255, threshOp); - EXPECT_NEAR(otsu_gpu, otsu_cpu, 1e-5); + ASSERT_DOUBLE_EQ(otsu_gpu, otsu_cpu); EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } From c3cae2e1276e4867d068f90c9054fe63e94ce318 Mon Sep 17 00:00:00 2001 From: troelsy Date: Fri, 30 May 2025 10:17:31 +0000 Subject: [PATCH 5/6] Implemented BufferPool --- modules/cudaarithm/src/cuda/threshold.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu index e771df20320..52a88680c3b 100644 --- a/modules/cudaarithm/src/cuda/threshold.cu +++ b/modules/cudaarithm/src/cuda/threshold.cu @@ -229,9 +229,10 @@ void compute_otsu(uint *histogram, uint *otsu_threshold, Stream &stream) dim3 block_score(n_thresholds); dim3 grid_score(1); - GpuMat gpu_threshold_sums = GpuMat(1, n_bins, CV_32SC1); - GpuMat gpu_sums = GpuMat(1, n_bins, CV_64FC1); - GpuMat gpu_variances = GpuMat(1, n_bins, CV_32SC4); + BufferPool pool(stream); + GpuMat gpu_threshold_sums(1, n_bins, CV_32SC1, pool.getAllocator()); + GpuMat gpu_sums(1, n_bins, CV_64FC1, pool.getAllocator()); + GpuMat gpu_variances(1, n_bins, CV_32SC4, pool.getAllocator()); uint shared_memory; shared_memory = n_bins * sizeof(unsigned long long); @@ -305,11 +306,13 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou CV_Assert(depth == CV_8U); CV_Assert(src.channels() == 1); + BufferPool pool(stream); + // Find the threshold using Otsu and then run the normal thresholding algorithm - GpuMat gpu_histogram = GpuMat(256, 1, CV_32SC1); + GpuMat gpu_histogram(256, 1, CV_32SC1, pool.getAllocator()); calcHist(src, gpu_histogram, stream); - GpuMat gpu_otsu_threshold(1, 1, CV_32SC1); + GpuMat gpu_otsu_threshold(1, 1, CV_32SC1, pool.getAllocator()); compute_otsu(gpu_histogram.ptr(), gpu_otsu_threshold.ptr(), stream); cv::Mat mat_otsu_threshold; From 13f0fdcc382d59695ccf5a416ed394ed895fec20 Mon Sep 17 00:00:00 2001 From: troelsy Date: Wed, 11 Jun 2025 13:21:20 +0000 Subject: [PATCH 6/6] Review changes --- modules/cudaarithm/src/cuda/threshold.cu | 39 +++++++++++------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu index 52a88680c3b..c5586e61ab7 100644 --- a/modules/cudaarithm/src/cuda/threshold.cu +++ b/modules/cudaarithm/src/cuda/threshold.cu @@ -100,8 +100,8 @@ __global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long l { const uint32_t n_bins = 256; - __shared__ uint shared_memory_ts[n_bins / WARP_SIZE]; - __shared__ unsigned long long shared_memory_s[n_bins / WARP_SIZE]; + __shared__ uint shared_memory_ts[n_bins]; + __shared__ unsigned long long shared_memory_s[n_bins]; int bin_idx = threadIdx.x; int threshold = blockIdx.x; @@ -127,12 +127,12 @@ __global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long l } __global__ void -otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums) +otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums) { const uint32_t n_bins = 256; - __shared__ signed long long shared_memory_a[n_bins / WARP_SIZE]; - __shared__ signed long long shared_memory_b[n_bins / WARP_SIZE]; + __shared__ signed long long shared_memory_a[n_bins]; + __shared__ signed long long shared_memory_b[n_bins]; int bin_idx = threadIdx.x; int threshold = blockIdx.x; @@ -168,13 +168,13 @@ otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsign if (bin_idx == 0) { - variance[threshold] = make_longlong2(threshold_variance_above_i64, threshold_variance_below_i64); + variance[threshold] = make_float2(threshold_variance_above_i64, threshold_variance_below_i64); } } __global__ void -otsu_score(uint *otsu_threshold, uint *threshold_sums, longlong2 *variance) +otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance) { const uint32_t n_thresholds = 256; @@ -189,9 +189,9 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, longlong2 *variance) float threshold_mean_above = (float)n_samples_above / n_samples; float threshold_mean_below = (float)n_samples_below / n_samples; - longlong2 variances = variance[threshold]; - float variance_above = (float)variances.x / n_samples_above; - float variance_below = (float)variances.y / n_samples_below; + float2 variances = variance[threshold]; + float variance_above = variances.x / n_samples_above; + float variance_below = variances.y / n_samples_below; float above = threshold_mean_above * variance_above; float below = threshold_mean_below * variance_below; @@ -232,16 +232,14 @@ void compute_otsu(uint *histogram, uint *otsu_threshold, Stream &stream) BufferPool pool(stream); GpuMat gpu_threshold_sums(1, n_bins, CV_32SC1, pool.getAllocator()); GpuMat gpu_sums(1, n_bins, CV_64FC1, pool.getAllocator()); - GpuMat gpu_variances(1, n_bins, CV_32SC4, pool.getAllocator()); - - uint shared_memory; - shared_memory = n_bins * sizeof(unsigned long long); - otsu_sums<<>>(histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); - otsu_variance<<>>(gpu_variances.ptr(), histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); - - shared_memory = n_bins * sizeof(float); - otsu_score<<>>( - otsu_threshold, gpu_threshold_sums.ptr(), gpu_variances.ptr()); + GpuMat gpu_variances(1, n_bins, CV_32FC2, pool.getAllocator()); + + otsu_sums<<>>( + histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); + otsu_variance<<>>( + gpu_variances.ptr(), histogram, gpu_threshold_sums.ptr(), gpu_sums.ptr()); + otsu_score<<>>( + otsu_threshold, gpu_threshold_sums.ptr(), gpu_variances.ptr()); } // TODO: Replace this is cv::cuda::calcHist @@ -299,7 +297,6 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou const int depth = src.depth(); - // TODO: How can we access precomp.hpp to avoid defining THRESH_OTSU? const int THRESH_OTSU = 8; if ((type & THRESH_OTSU) == THRESH_OTSU) {