From 138bb4b152269dcb0ee790b0e666b47d71120334 Mon Sep 17 00:00:00 2001
From: troelsy <troels@ynddal.dk>
Date: Thu, 22 May 2025 11:31:45 +0000
Subject: [PATCH 1/6] Initial implementation of Otsu in CUDA

---
 .../cudaarithm/include/opencv2/cudaarithm.hpp |  10 +-
 modules/cudaarithm/src/cuda/threshold.cu      | 220 +++++++++++++++++-
 .../test/test_element_operations.cpp          |  51 +++-
 .../opencv2/cudev/util/saturate_cast.hpp      |   2 +
 .../include/opencv2/cudev/warp/shuffle.hpp    |  10 +
 5 files changed, 288 insertions(+), 5 deletions(-)
diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
index e5ab2cf2575..80b4dd8fac7 100644
--- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp
+++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
@@ -546,12 +546,16 @@ static inline void scaleAdd(InputArray src1, double alpha, InputArray src2, Outp
 
 /** @brief Applies a fixed-level threshold to each array element.
 
+The special value cv::THRESH_OTSU may be combined with one of the other types. In this case, the function determines the
+optimal threshold value using the Otsu's and uses it instead of the specified threshold. The function returns the
+computed threshold value in addititon to the thresholded matrix.
+The Otsu's method is implemented only for 8-bit matrices.
+
 @param src Source array (single-channel).
-@param dst Destination array with the same size and type as src .
+@param dst Destination array with the same size and type as src.
 @param thresh Threshold value.
 @param maxval Maximum value to use with THRESH_BINARY and THRESH_BINARY_INV threshold types.
-@param type Threshold type. For details, see threshold . The THRESH_OTSU and THRESH_TRIANGLE
-threshold types are not supported.
+@param type Threshold type. For details, see threshold. The THRESH_TRIANGLE threshold type is not supported.
 @param stream Stream for the asynchronous version.
 
 @sa threshold
diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu
index cfa88dfae7c..8a04fc790b7 100644
--- a/modules/cudaarithm/src/cuda/threshold.cu
+++ b/modules/cudaarithm/src/cuda/threshold.cu
@@ -95,12 +95,230 @@ namespace
     }
 }
 
-double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& stream)
+template <uint n_bins>
+__global__ void otsu_mean(uint *histogram, uint *threshold_sums, unsigned long long *sums)
+{
+    extern __shared__ unsigned long long shared_memory_u64[];
+
+    uint bin_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    uint threshold = blockIdx.y * blockDim.y + threadIdx.y;
+
+    uint threshold_sum_above = 0;
+    unsigned long long sum_above = 0;
+
+    if (bin_idx >= threshold)
+    {
+        uint value = histogram[bin_idx];
+        threshold_sum_above = value;
+        sum_above = value * bin_idx;
+    }
+
+    blockReduce<n_bins>((uint *)shared_memory_u64, threshold_sum_above, bin_idx, plus<uint>());
+    blockReduce<n_bins>((unsigned long long *)shared_memory_u64, sum_above, bin_idx, plus<unsigned long long>());
+
+    if (bin_idx == 0)
+    {
+        threshold_sums[threshold] = threshold_sum_above;
+        sums[threshold] = sum_above;
+    }
+}
+
+template <uint n_bins>
+__global__ void
+otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums)
+{
+    extern __shared__ signed long long shared_memory_i64[];
+
+    uint bin_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    uint threshold = blockIdx.y * blockDim.y + threadIdx.y;
+
+    uint n_samples = threshold_sums[0];
+    uint n_samples_above = threshold_sums[threshold];
+    uint n_samples_below = n_samples - n_samples_above;
+
+    unsigned long long total_sum = sums[0];
+    unsigned long long sum_above = sums[threshold];
+    unsigned long long sum_below = total_sum - sum_above;
+
+    int threshold_variance_above = 0;
+    int threshold_variance_below = 0;
+    if (bin_idx >= threshold)
+    {
+        uint mean = sum_above / n_samples_above;
+        int sigma = bin_idx - mean;
+        threshold_variance_above = sigma * sigma;
+    }
+    else
+    {
+        uint mean = sum_below / n_samples_below;
+        int sigma = bin_idx - mean;
+        threshold_variance_below = sigma * sigma;
+    }
+
+    uint bin_count = histogram[bin_idx];
+    signed long long threshold_variance_above64 = threshold_variance_above * bin_count;
+    signed long long threshold_variance_below64 = threshold_variance_below * bin_count;
+    blockReduce<n_bins>((signed long long *)shared_memory_i64, threshold_variance_above64, bin_idx, plus<signed long long>());
+    blockReduce<n_bins>((signed long long *)shared_memory_i64, threshold_variance_below64, bin_idx, plus<signed long long>());
+
+    if (bin_idx == 0)
+    {
+        variance[threshold] = make_longlong2(threshold_variance_above64, threshold_variance_below64);
+    }
+}
+
+template <uint n_thresholds>
+__global__ void
+otsu_score(uint *otsu_threshold, uint *threshold_sums, longlong2 *variance)
+{
+    extern __shared__ float shared_memory_f32[];
+
+    uint threshold = blockIdx.x * blockDim.x + threadIdx.x;
+
+    uint n_samples = threshold_sums[0];
+    uint n_samples_above = threshold_sums[threshold];
+    uint n_samples_below = n_samples - n_samples_above;
+
+    float threshold_mean_above = (float)n_samples_above / n_samples;
+    float threshold_mean_below = (float)n_samples_below / n_samples;
+
+    longlong2 variances = variance[threshold];
+    float variance_above = (float)variances.x / n_samples_above;
+    float variance_below = (float)variances.y / n_samples_below;
+
+    float above = threshold_mean_above * variance_above;
+    float below = threshold_mean_below * variance_below;
+    float score = above + below;
+
+    float original_score = score;
+
+    blockReduce<n_thresholds>((float *)shared_memory_f32, score, threshold, minimum<float>());
+
+    if (threshold == 0)
+    {
+        shared_memory_f32[0] = score;
+    }
+    __syncthreads();
+
+    score = shared_memory_f32[0];
+
+    // We found the minimum score, but we need to find the threshold. If we find the thread with the minimum score, we
+    // know which threshold it is
+    if (original_score == score)
+    {
+        *otsu_threshold = threshold - 1;
+    }
+}
+
+template <uint n_bins, uint n_thresholds>
+void compute_otsu_async(uint *histogram, uint *otsu_threshold, Stream &stream)
+{
+    CV_Assert(n_bins == 256);
+    CV_Assert(n_thresholds == 256);
+
+    cudaStream_t cuda_stream = StreamAccessor::getStream(stream);
+
+    dim3 block_all = dim3(n_bins, 1, 1);
+    dim3 grid_all = dim3(divUp((int)n_bins, block_all.x), divUp((int)n_thresholds, block_all.y), 1);
+    dim3 block_score = dim3(n_thresholds, 1, 1);
+    dim3 grid_score = dim3(divUp((int)n_thresholds, block_score.x), 1, 1);
+
+    GpuMat gpu_threshold_sums = GpuMat(1, n_bins, CV_32SC1);
+    GpuMat gpu_sums = GpuMat(1, n_bins, CV_64FC1);
+    GpuMat gpu_variances = GpuMat(1, n_bins, CV_32SC4);
+
+    uint shared_memory;
+    shared_memory = n_bins * sizeof(unsigned long long);
+    otsu_mean<256><<<grid_all, block_all, shared_memory, cuda_stream>>>(histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
+    otsu_variance<256><<<grid_all, block_all, shared_memory, cuda_stream>>>(gpu_variances.ptr<longlong2>(), histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
+
+    shared_memory = n_bins * sizeof(float);
+    otsu_score<256><<<grid_score, block_score, shared_memory, cuda_stream>>>(
+        otsu_threshold, gpu_threshold_sums.ptr<uint>(), gpu_variances.ptr<longlong2>());
+}
+
+// TODO: Replace this is cv::cuda::calcHist
+template <uint n_bins>
+__global__ void histogram_kernel(
+    uint *histogram, const uint8_t *image, uint width,
+    uint height, uint pitch)
+{
+    __shared__ uint local_histogram[n_bins];
+
+    uint x = blockIdx.x * blockDim.x + threadIdx.x;
+    uint y = blockIdx.y * blockDim.y + threadIdx.y;
+    uint tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+    if (tid < n_bins)
+    {
+        local_histogram[tid] = 0;
+    }
+
+    __syncthreads();
+
+    if (x < width && y < height)
+    {
+        uint8_t value = image[y * pitch + x];
+        atomicInc(&local_histogram[value], 0xFFFFFFFF);
+    }
+
+    __syncthreads();
+
+    if (tid < n_bins)
+    {
+        cv::cudev::atomicAdd(&histogram[tid], local_histogram[tid]);
+    }
+}
+
+// TODO: Replace this with cv::cuda::calcHist
+void calcHist(
+    const GpuMat src, GpuMat histogram, Stream stream)
+{
+    const uint n_bins = 256;
+
+    cudaStream_t cuda_stream = StreamAccessor::getStream(stream);
+
+    dim3 block(128, 4, 1);
+    dim3 grid = dim3(divUp(src.cols, block.x), divUp(src.rows, block.y), 1);
+    CV_CUDEV_SAFE_CALL(cudaMemsetAsync(histogram.ptr<uint>(), 0, n_bins * sizeof(uint), cuda_stream));
+    histogram_kernel<n_bins>
+        <<<grid, block, 0, cuda_stream>>>(
+            histogram.ptr<uint>(), src.ptr<uint8_t>(), (uint) src.cols, (uint) src.rows, (uint) src.step);
+}
+
+double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream &stream)
 {
     GpuMat src = getInputMat(_src, stream);
 
     const int depth = src.depth();
 
+    // TODO: How can we access precomp.hpp to avoid defining THRESH_OTSU?
+    const int THRESH_OTSU = 8;
+    if ((type & THRESH_OTSU) == THRESH_OTSU)
+    {
+        CV_Assert(depth == CV_8U);
+        CV_Assert(src.channels() == 1);
+        CV_Assert(maxVal == 255.0);
+
+        const uint n_bins = 256;
+        const uint n_thresholds = 256;
+
+        // Find the threshold using Otsu and then run the normal thresholding algorithm
+        GpuMat gpu_histogram = GpuMat(n_bins, 1, CV_32SC1);
+        calcHist(src, gpu_histogram, stream);
+
+        GpuMat gpu_otsu_threshold(1, 1, CV_32SC1);
+        compute_otsu_async<n_bins, n_thresholds>(gpu_histogram.ptr<uint>(), gpu_otsu_threshold.ptr<uint>(), stream);
+
+        cv::Mat mat_otsu_threshold;
+        gpu_otsu_threshold.download(mat_otsu_threshold, stream);
+        stream.waitForCompletion();
+
+        // Overwrite the threshold value with the Otsu value and remove the Otsu flag from the type
+        type = type & ~THRESH_OTSU;
+        thresh = (double) mat_otsu_threshold.at<int>(0);
+    }
+
     CV_Assert( depth <= CV_64F );
     CV_Assert( type <= 4 /*THRESH_TOZERO_INV*/ );
 
diff --git a/modules/cudaarithm/test/test_element_operations.cpp b/modules/cudaarithm/test/test_element_operations.cpp
index a15ad7b3ee5..c91fa91fd2f 100644
--- a/modules/cudaarithm/test/test_element_operations.cpp
+++ b/modules/cudaarithm/test/test_element_operations.cpp
@@ -2529,7 +2529,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, AddWeighted, testing::Combine(
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Threshold
 
-CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
+CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV, cv::THRESH_OTSU)
 #define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))
 
 PARAM_TEST_CASE(Threshold, cv::cuda::DeviceInfo, cv::Size, MatType, Channels, ThreshOp, UseRoi)
@@ -2577,6 +2577,55 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Threshold, testing::Combine(
     ALL_THRESH_OPS,
     WHOLE_SUBMAT));
 
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// ThresholdOtsu
+
+PARAM_TEST_CASE(ThresholdOtsu, cv::cuda::DeviceInfo, cv::Size, MatType, Channels, ThreshOp, UseRoi)
+{
+    cv::cuda::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    int channel;
+    int threshOp;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2) | cv::THRESH_OTSU;
+        channel = GET_PARAM(3);
+        threshOp = GET_PARAM(4);
+        useRoi = GET_PARAM(5);
+
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+};
+
+CUDA_TEST_P(ThresholdOtsu, Accuracy)
+{
+    cv::Mat src = randomMat(size, CV_MAKE_TYPE(type, channel));
+    double maxVal = randomDouble(20.0, 127.0);
+    double thresh = randomDouble(0.0, maxVal);
+
+    cv::cuda::GpuMat dst = createMat(src.size(), src.type(), useRoi);
+    cv::cuda::threshold(loadMat(src, useRoi), dst, thresh, maxVal, threshOp);
+
+    cv::Mat dst_gold;
+    cv::threshold(src, dst_gold, thresh, maxVal, threshOp);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Arithm, ThresholdOtsu, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U)),
+    testing::Values(Channels(1)),
+    ALL_THRESH_OPS,
+    WHOLE_SUBMAT));
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // InRange
 
diff --git a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp
index 59fd8da45ac..c256e7d908c 100644
--- a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp
@@ -62,6 +62,8 @@ template <typename T> __device__ __forceinline__ T saturate_cast(ushort v) { ret
 template <typename T> __device__ __forceinline__ T saturate_cast(short v) { return T(v); }
 template <typename T> __device__ __forceinline__ T saturate_cast(uint v) { return T(v); }
 template <typename T> __device__ __forceinline__ T saturate_cast(int v) { return T(v); }
+template <typename T> __device__ __forceinline__ T saturate_cast(signed long long v) { return T(v); }
+template <typename T> __device__ __forceinline__ T saturate_cast(unsigned long long v) { return T(v); }
 template <typename T> __device__ __forceinline__ T saturate_cast(float v) { return T(v); }
 template <typename T> __device__ __forceinline__ T saturate_cast(double v) { return T(v); }
 
diff --git a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
index f00d1f850d0..0de5351fff3 100644
--- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
@@ -332,6 +332,16 @@ __device__ __forceinline__ uint shfl_down(uint val, uint delta, int width = warp
     return (uint) __shfl_down((int) val, delta, width);
 }
 
+__device__ __forceinline__ signed long long shfl_down(signed long long val, uint delta, int width = warpSize)
+{
+    return __shfl_down(val, delta, width);
+}
+
+__device__ __forceinline__ unsigned long long shfl_down(unsigned long long val, uint delta, int width = warpSize)
+{
+    return (unsigned long long) __shfl_down(val, delta, width);
+}
+
 __device__ __forceinline__ float shfl_down(float val, uint delta, int width = warpSize)
 {
     return __shfl_down(val, delta, width);

From 6d6218e3dd2f4b145b7dbf620695398b8771769b Mon Sep 17 00:00:00 2001
From: troelsy <troels@ynddal.dk>
Date: Thu, 22 May 2025 11:46:01 +0000
Subject: [PATCH 2/6] Avoid potential race condition of shared memory

---
 modules/cudaarithm/src/cuda/threshold.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu
index 8a04fc790b7..5ad6f213816 100644
--- a/modules/cudaarithm/src/cuda/threshold.cu
+++ b/modules/cudaarithm/src/cuda/threshold.cu
@@ -114,6 +114,7 @@ __global__ void otsu_mean(uint *histogram, uint *threshold_sums, unsigned long l
     }
 
     blockReduce<n_bins>((uint *)shared_memory_u64, threshold_sum_above, bin_idx, plus<uint>());
+    __syncthreads();
     blockReduce<n_bins>((unsigned long long *)shared_memory_u64, sum_above, bin_idx, plus<unsigned long long>());
 
     if (bin_idx == 0)
@@ -159,6 +160,7 @@ otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsign
     signed long long threshold_variance_above64 = threshold_variance_above * bin_count;
     signed long long threshold_variance_below64 = threshold_variance_below * bin_count;
     blockReduce<n_bins>((signed long long *)shared_memory_i64, threshold_variance_above64, bin_idx, plus<signed long long>());
+    __syncthreads();
     blockReduce<n_bins>((signed long long *)shared_memory_i64, threshold_variance_below64, bin_idx, plus<signed long long>());
 
     if (bin_idx == 0)

From 7d4fbfe8aa5ee057271a1f4c29e0388b53b8020d Mon Sep 17 00:00:00 2001
From: troelsy <troels@ynddal.dk>
Date: Tue, 27 May 2025 14:24:06 +0000
Subject: [PATCH 3/6] Fixed test and solved numerical issue computing the
 variance for Otsu

---
 modules/cudaarithm/src/cuda/threshold.cu      | 26 +++++++++----------
 .../test/test_element_operations.cpp          | 11 ++++----
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu
index 5ad6f213816..4d194737e1c 100644
--- a/modules/cudaarithm/src/cuda/threshold.cu
+++ b/modules/cudaarithm/src/cuda/threshold.cu
@@ -141,31 +141,31 @@ otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsign
     unsigned long long sum_above = sums[threshold];
     unsigned long long sum_below = total_sum - sum_above;
 
-    int threshold_variance_above = 0;
-    int threshold_variance_below = 0;
+    float threshold_variance_above_f32 = 0;
+    float threshold_variance_below_f32 = 0;
     if (bin_idx >= threshold)
     {
-        uint mean = sum_above / n_samples_above;
-        int sigma = bin_idx - mean;
-        threshold_variance_above = sigma * sigma;
+        float mean = (float) sum_above / n_samples_above;
+        float sigma = bin_idx - mean;
+        threshold_variance_above_f32 = sigma * sigma;
     }
     else
     {
-        uint mean = sum_below / n_samples_below;
-        int sigma = bin_idx - mean;
-        threshold_variance_below = sigma * sigma;
+        float mean = (float) sum_below / n_samples_below;
+        float sigma = bin_idx - mean;
+        threshold_variance_below_f32 = sigma * sigma;
     }
 
     uint bin_count = histogram[bin_idx];
-    signed long long threshold_variance_above64 = threshold_variance_above * bin_count;
-    signed long long threshold_variance_below64 = threshold_variance_below * bin_count;
-    blockReduce<n_bins>((signed long long *)shared_memory_i64, threshold_variance_above64, bin_idx, plus<signed long long>());
+    signed long long threshold_variance_above_i64 = (signed long long)(threshold_variance_above_f32 * bin_count);
+    signed long long threshold_variance_below_i64 = (signed long long)(threshold_variance_below_f32 * bin_count);
+    blockReduce<n_bins>((signed long long *)shared_memory_i64, threshold_variance_above_i64, bin_idx, plus<signed long long>());
     __syncthreads();
-    blockReduce<n_bins>((signed long long *)shared_memory_i64, threshold_variance_below64, bin_idx, plus<signed long long>());
+    blockReduce<n_bins>((signed long long *)shared_memory_i64, threshold_variance_below_i64, bin_idx, plus<signed long long>());
 
     if (bin_idx == 0)
     {
-        variance[threshold] = make_longlong2(threshold_variance_above64, threshold_variance_below64);
+        variance[threshold] = make_longlong2(threshold_variance_above_i64, threshold_variance_below_i64);
     }
 }
 
diff --git a/modules/cudaarithm/test/test_element_operations.cpp b/modules/cudaarithm/test/test_element_operations.cpp
index c91fa91fd2f..baf5e49e3c4 100644
--- a/modules/cudaarithm/test/test_element_operations.cpp
+++ b/modules/cudaarithm/test/test_element_operations.cpp
@@ -2593,9 +2593,9 @@ PARAM_TEST_CASE(ThresholdOtsu, cv::cuda::DeviceInfo, cv::Size, MatType, Channels
     {
         devInfo = GET_PARAM(0);
         size = GET_PARAM(1);
-        type = GET_PARAM(2) | cv::THRESH_OTSU;
+        type = GET_PARAM(2);
         channel = GET_PARAM(3);
-        threshOp = GET_PARAM(4);
+        threshOp = GET_PARAM(4) | cv::THRESH_OTSU;
         useRoi = GET_PARAM(5);
 
         cv::cuda::setDevice(devInfo.deviceID());
@@ -2605,15 +2605,14 @@ PARAM_TEST_CASE(ThresholdOtsu, cv::cuda::DeviceInfo, cv::Size, MatType, Channels
 CUDA_TEST_P(ThresholdOtsu, Accuracy)
 {
     cv::Mat src = randomMat(size, CV_MAKE_TYPE(type, channel));
-    double maxVal = randomDouble(20.0, 127.0);
-    double thresh = randomDouble(0.0, maxVal);
 
     cv::cuda::GpuMat dst = createMat(src.size(), src.type(), useRoi);
-    cv::cuda::threshold(loadMat(src, useRoi), dst, thresh, maxVal, threshOp);
+    double otsu_gpu = cv::cuda::threshold(loadMat(src, useRoi), dst, 0, 255, threshOp);
 
     cv::Mat dst_gold;
-    cv::threshold(src, dst_gold, thresh, maxVal, threshOp);
+    double otsu_cpu = cv::threshold(src, dst_gold, 0, 255, threshOp);
 
+    EXPECT_NEAR(otsu_gpu, otsu_cpu, 1e-5);
     EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 

From bdeeab74f9c86d8b80420a978a79d284f9bee406 Mon Sep 17 00:00:00 2001
From: troelsy <troels@ynddal.dk>
Date: Fri, 30 May 2025 07:39:49 +0000
Subject: [PATCH 4/6] Review changes

---
 modules/cudaarithm/src/cuda/threshold.cu      | 76 +++++++++----------
 .../test/test_element_operations.cpp          |  2 +-
 2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu
index 4d194737e1c..e771df20320 100644
--- a/modules/cudaarithm/src/cuda/threshold.cu
+++ b/modules/cudaarithm/src/cuda/threshold.cu
@@ -95,13 +95,16 @@ namespace
     }
 }
 
-template <uint n_bins>
-__global__ void otsu_mean(uint *histogram, uint *threshold_sums, unsigned long long *sums)
+
+__global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long long *sums)
 {
-    extern __shared__ unsigned long long shared_memory_u64[];
+    const uint32_t n_bins = 256;
+
+    __shared__ uint shared_memory_ts[n_bins / WARP_SIZE];
+    __shared__ unsigned long long shared_memory_s[n_bins / WARP_SIZE];
 
-    uint bin_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    uint threshold = blockIdx.y * blockDim.y + threadIdx.y;
+    int bin_idx = threadIdx.x;
+    int threshold = blockIdx.x;
 
     uint threshold_sum_above = 0;
     unsigned long long sum_above = 0;
@@ -113,9 +116,8 @@ __global__ void otsu_mean(uint *histogram, uint *threshold_sums, unsigned long l
         sum_above = value * bin_idx;
     }
 
-    blockReduce<n_bins>((uint *)shared_memory_u64, threshold_sum_above, bin_idx, plus<uint>());
-    __syncthreads();
-    blockReduce<n_bins>((unsigned long long *)shared_memory_u64, sum_above, bin_idx, plus<unsigned long long>());
+    blockReduce<n_bins>(shared_memory_ts, threshold_sum_above, bin_idx, plus<uint>());
+    blockReduce<n_bins>(shared_memory_s, sum_above, bin_idx, plus<unsigned long long>());
 
     if (bin_idx == 0)
     {
@@ -124,14 +126,16 @@ __global__ void otsu_mean(uint *histogram, uint *threshold_sums, unsigned long l
     }
 }
 
-template <uint n_bins>
 __global__ void
 otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums)
 {
-    extern __shared__ signed long long shared_memory_i64[];
+    const uint32_t n_bins = 256;
+
+    __shared__ signed long long shared_memory_a[n_bins / WARP_SIZE];
+    __shared__ signed long long shared_memory_b[n_bins / WARP_SIZE];
 
-    uint bin_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    uint threshold = blockIdx.y * blockDim.y + threadIdx.y;
+    int bin_idx = threadIdx.x;
+    int threshold = blockIdx.x;
 
     uint n_samples = threshold_sums[0];
     uint n_samples_above = threshold_sums[threshold];
@@ -159,9 +163,8 @@ otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsign
     uint bin_count = histogram[bin_idx];
     signed long long threshold_variance_above_i64 = (signed long long)(threshold_variance_above_f32 * bin_count);
     signed long long threshold_variance_below_i64 = (signed long long)(threshold_variance_below_f32 * bin_count);
-    blockReduce<n_bins>((signed long long *)shared_memory_i64, threshold_variance_above_i64, bin_idx, plus<signed long long>());
-    __syncthreads();
-    blockReduce<n_bins>((signed long long *)shared_memory_i64, threshold_variance_below_i64, bin_idx, plus<signed long long>());
+    blockReduce<n_bins>(shared_memory_a, threshold_variance_above_i64, bin_idx, plus<signed long long>());
+    blockReduce<n_bins>(shared_memory_b, threshold_variance_below_i64, bin_idx, plus<signed long long>());
 
     if (bin_idx == 0)
     {
@@ -169,13 +172,15 @@ otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsign
     }
 }
 
-template <uint n_thresholds>
+
 __global__ void
 otsu_score(uint *otsu_threshold, uint *threshold_sums, longlong2 *variance)
 {
-    extern __shared__ float shared_memory_f32[];
+    const uint32_t n_thresholds = 256;
+
+    __shared__ float shared_memory[n_thresholds / WARP_SIZE];
 
-    uint threshold = blockIdx.x * blockDim.x + threadIdx.x;
+    int threshold = threadIdx.x;
 
     uint n_samples = threshold_sums[0];
     uint n_samples_above = threshold_sums[threshold];
@@ -194,15 +199,15 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, longlong2 *variance)
 
     float original_score = score;
 
-    blockReduce<n_thresholds>((float *)shared_memory_f32, score, threshold, minimum<float>());
+    blockReduce<n_thresholds>(shared_memory, score, threshold, minimum<float>());
 
     if (threshold == 0)
     {
-        shared_memory_f32[0] = score;
+        shared_memory[0] = score;
     }
     __syncthreads();
 
-    score = shared_memory_f32[0];
+    score = shared_memory[0];
 
     // We found the minimum score, but we need to find the threshold. If we find the thread with the minimum score, we
     // know which threshold it is
@@ -212,18 +217,17 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, longlong2 *variance)
     }
 }
 
-template <uint n_bins, uint n_thresholds>
-void compute_otsu_async(uint *histogram, uint *otsu_threshold, Stream &stream)
+void compute_otsu(uint *histogram, uint *otsu_threshold, Stream &stream)
 {
-    CV_Assert(n_bins == 256);
-    CV_Assert(n_thresholds == 256);
+    const uint n_bins = 256;
+    const uint n_thresholds = 256;
 
     cudaStream_t cuda_stream = StreamAccessor::getStream(stream);
 
-    dim3 block_all = dim3(n_bins, 1, 1);
-    dim3 grid_all = dim3(divUp((int)n_bins, block_all.x), divUp((int)n_thresholds, block_all.y), 1);
-    dim3 block_score = dim3(n_thresholds, 1, 1);
-    dim3 grid_score = dim3(divUp((int)n_thresholds, block_score.x), 1, 1);
+    dim3 block_all(n_bins);
+    dim3 grid_all(n_thresholds);
+    dim3 block_score(n_thresholds);
+    dim3 grid_score(1);
 
     GpuMat gpu_threshold_sums = GpuMat(1, n_bins, CV_32SC1);
     GpuMat gpu_sums = GpuMat(1, n_bins, CV_64FC1);
@@ -231,11 +235,11 @@ void compute_otsu_async(uint *histogram, uint *otsu_threshold, Stream &stream)
 
     uint shared_memory;
     shared_memory = n_bins * sizeof(unsigned long long);
-    otsu_mean<256><<<grid_all, block_all, shared_memory, cuda_stream>>>(histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
-    otsu_variance<256><<<grid_all, block_all, shared_memory, cuda_stream>>>(gpu_variances.ptr<longlong2>(), histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
+    otsu_sums<<<grid_all, block_all, shared_memory, cuda_stream>>>(histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
+    otsu_variance<<<grid_all, block_all, shared_memory, cuda_stream>>>(gpu_variances.ptr<longlong2>(), histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
 
     shared_memory = n_bins * sizeof(float);
-    otsu_score<256><<<grid_score, block_score, shared_memory, cuda_stream>>>(
+    otsu_score<<<grid_score, block_score, shared_memory, cuda_stream>>>(
         otsu_threshold, gpu_threshold_sums.ptr<uint>(), gpu_variances.ptr<longlong2>());
 }
 
@@ -300,17 +304,13 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou
     {
         CV_Assert(depth == CV_8U);
         CV_Assert(src.channels() == 1);
-        CV_Assert(maxVal == 255.0);
-
-        const uint n_bins = 256;
-        const uint n_thresholds = 256;
 
         // Find the threshold using Otsu and then run the normal thresholding algorithm
-        GpuMat gpu_histogram = GpuMat(n_bins, 1, CV_32SC1);
+        GpuMat gpu_histogram = GpuMat(256, 1, CV_32SC1);
         calcHist(src, gpu_histogram, stream);
 
         GpuMat gpu_otsu_threshold(1, 1, CV_32SC1);
-        compute_otsu_async<n_bins, n_thresholds>(gpu_histogram.ptr<uint>(), gpu_otsu_threshold.ptr<uint>(), stream);
+        compute_otsu(gpu_histogram.ptr<uint>(), gpu_otsu_threshold.ptr<uint>(), stream);
 
         cv::Mat mat_otsu_threshold;
         gpu_otsu_threshold.download(mat_otsu_threshold, stream);
diff --git a/modules/cudaarithm/test/test_element_operations.cpp b/modules/cudaarithm/test/test_element_operations.cpp
index baf5e49e3c4..f3ce3b52020 100644
--- a/modules/cudaarithm/test/test_element_operations.cpp
+++ b/modules/cudaarithm/test/test_element_operations.cpp
@@ -2612,7 +2612,7 @@ CUDA_TEST_P(ThresholdOtsu, Accuracy)
     cv::Mat dst_gold;
     double otsu_cpu = cv::threshold(src, dst_gold, 0, 255, threshOp);
 
-    EXPECT_NEAR(otsu_gpu, otsu_cpu, 1e-5);
+    ASSERT_DOUBLE_EQ(otsu_gpu, otsu_cpu);
     EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 

From c3cae2e1276e4867d068f90c9054fe63e94ce318 Mon Sep 17 00:00:00 2001
From: troelsy <troels@ynddal.dk>
Date: Fri, 30 May 2025 10:17:31 +0000
Subject: [PATCH 5/6] Implemented BufferPool

---
 modules/cudaarithm/src/cuda/threshold.cu | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu
index e771df20320..52a88680c3b 100644
--- a/modules/cudaarithm/src/cuda/threshold.cu
+++ b/modules/cudaarithm/src/cuda/threshold.cu
@@ -229,9 +229,10 @@ void compute_otsu(uint *histogram, uint *otsu_threshold, Stream &stream)
     dim3 block_score(n_thresholds);
     dim3 grid_score(1);
 
-    GpuMat gpu_threshold_sums = GpuMat(1, n_bins, CV_32SC1);
-    GpuMat gpu_sums = GpuMat(1, n_bins, CV_64FC1);
-    GpuMat gpu_variances = GpuMat(1, n_bins, CV_32SC4);
+    BufferPool pool(stream);
+    GpuMat gpu_threshold_sums(1, n_bins, CV_32SC1, pool.getAllocator());
+    GpuMat gpu_sums(1, n_bins, CV_64FC1, pool.getAllocator());
+    GpuMat gpu_variances(1, n_bins, CV_32SC4, pool.getAllocator());
 
     uint shared_memory;
     shared_memory = n_bins * sizeof(unsigned long long);
@@ -305,11 +306,13 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou
         CV_Assert(depth == CV_8U);
         CV_Assert(src.channels() == 1);
 
+        BufferPool pool(stream);
+
         // Find the threshold using Otsu and then run the normal thresholding algorithm
-        GpuMat gpu_histogram = GpuMat(256, 1, CV_32SC1);
+        GpuMat gpu_histogram(256, 1, CV_32SC1, pool.getAllocator());
         calcHist(src, gpu_histogram, stream);
 
-        GpuMat gpu_otsu_threshold(1, 1, CV_32SC1);
+        GpuMat gpu_otsu_threshold(1, 1, CV_32SC1, pool.getAllocator());
         compute_otsu(gpu_histogram.ptr<uint>(), gpu_otsu_threshold.ptr<uint>(), stream);
 
         cv::Mat mat_otsu_threshold;

From 13f0fdcc382d59695ccf5a416ed394ed895fec20 Mon Sep 17 00:00:00 2001
From: troelsy <troels@ynddal.dk>
Date: Wed, 11 Jun 2025 13:21:20 +0000
Subject: [PATCH 6/6] Review changes

---
 modules/cudaarithm/src/cuda/threshold.cu | 39 +++++++++++-------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu
index 52a88680c3b..c5586e61ab7 100644
--- a/modules/cudaarithm/src/cuda/threshold.cu
+++ b/modules/cudaarithm/src/cuda/threshold.cu
@@ -100,8 +100,8 @@ __global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long l
 {
     const uint32_t n_bins = 256;
 
-    __shared__ uint shared_memory_ts[n_bins / WARP_SIZE];
-    __shared__ unsigned long long shared_memory_s[n_bins / WARP_SIZE];
+    __shared__ uint shared_memory_ts[n_bins];
+    __shared__ unsigned long long shared_memory_s[n_bins];
 
     int bin_idx = threadIdx.x;
     int threshold = blockIdx.x;
@@ -127,12 +127,12 @@ __global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long l
 }
 
 __global__ void
-otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums)
+otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums)
 {
     const uint32_t n_bins = 256;
 
-    __shared__ signed long long shared_memory_a[n_bins / WARP_SIZE];
-    __shared__ signed long long shared_memory_b[n_bins / WARP_SIZE];
+    __shared__ signed long long shared_memory_a[n_bins];
+    __shared__ signed long long shared_memory_b[n_bins];
 
     int bin_idx = threadIdx.x;
     int threshold = blockIdx.x;
@@ -168,13 +168,13 @@ otsu_variance(longlong2 *variance, uint *histogram, uint *threshold_sums, unsign
 
     if (bin_idx == 0)
     {
-        variance[threshold] = make_longlong2(threshold_variance_above_i64, threshold_variance_below_i64);
+        variance[threshold] = make_float2(threshold_variance_above_i64, threshold_variance_below_i64);
     }
 }
 
 
 __global__ void
-otsu_score(uint *otsu_threshold, uint *threshold_sums, longlong2 *variance)
+otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance)
 {
     const uint32_t n_thresholds = 256;
 
@@ -189,9 +189,9 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, longlong2 *variance)
     float threshold_mean_above = (float)n_samples_above / n_samples;
     float threshold_mean_below = (float)n_samples_below / n_samples;
 
-    longlong2 variances = variance[threshold];
-    float variance_above = (float)variances.x / n_samples_above;
-    float variance_below = (float)variances.y / n_samples_below;
+    float2 variances = variance[threshold];
+    float variance_above = variances.x / n_samples_above;
+    float variance_below = variances.y / n_samples_below;
 
     float above = threshold_mean_above * variance_above;
     float below = threshold_mean_below * variance_below;
@@ -232,16 +232,14 @@ void compute_otsu(uint *histogram, uint *otsu_threshold, Stream &stream)
     BufferPool pool(stream);
     GpuMat gpu_threshold_sums(1, n_bins, CV_32SC1, pool.getAllocator());
     GpuMat gpu_sums(1, n_bins, CV_64FC1, pool.getAllocator());
-    GpuMat gpu_variances(1, n_bins, CV_32SC4, pool.getAllocator());
-
-    uint shared_memory;
-    shared_memory = n_bins * sizeof(unsigned long long);
-    otsu_sums<<<grid_all, block_all, shared_memory, cuda_stream>>>(histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
-    otsu_variance<<<grid_all, block_all, shared_memory, cuda_stream>>>(gpu_variances.ptr<longlong2>(), histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
-
-    shared_memory = n_bins * sizeof(float);
-    otsu_score<<<grid_score, block_score, shared_memory, cuda_stream>>>(
-        otsu_threshold, gpu_threshold_sums.ptr<uint>(), gpu_variances.ptr<longlong2>());
+    GpuMat gpu_variances(1, n_bins, CV_32FC2, pool.getAllocator());
+
+    otsu_sums<<<grid_all, block_all, 0, cuda_stream>>>(
+        histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
+    otsu_variance<<<grid_all, block_all, 0, cuda_stream>>>(
+        gpu_variances.ptr<float2>(), histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
+    otsu_score<<<grid_score, block_score, 0, cuda_stream>>>(
+        otsu_threshold, gpu_threshold_sums.ptr<uint>(), gpu_variances.ptr<float2>());
 }
 
 // TODO: Replace this is cv::cuda::calcHist
@@ -299,7 +297,6 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou
 
     const int depth = src.depth();
 
-    // TODO: How can we access precomp.hpp to avoid defining THRESH_OTSU?
     const int THRESH_OTSU = 8;
     if ((type & THRESH_OTSU) == THRESH_OTSU)
     {