Skip to content

Commit 6329974

Browse files
authored
Merge pull request #3943 from troelsy:4.x
Add Otsu's method to cv::cuda::threshold #3943 I implemented Otsu's method in CUDA for a separate project and want to add it to cv::cuda::threshold I have made an effort to use existing OpenCV functions in my code, but I had some trouble with `ThresholdTypes` and `cv::cuda::calcHist`. I couldn't figure out how to include `precomp.hpp` to get the definition of `ThresholdTypes`. For `cv::cuda::calcHist` I tried adding `opencv_cudaimgproc`, but it creates a circular dependency on `cudaarithm`. I have include a simple implementation of `calcHist` so the code runs, but I would like input on how to use `cv::cuda::calcHist` instead. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
1 parent edce692 commit 6329974

File tree

5 files changed

+289
-5
lines changed

5 files changed

+289
-5
lines changed

modules/cudaarithm/include/opencv2/cudaarithm.hpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -546,12 +546,16 @@ static inline void scaleAdd(InputArray src1, double alpha, InputArray src2, Outp
546546

547547
/** @brief Applies a fixed-level threshold to each array element.
548548
549+
The special value cv::THRESH_OTSU may be combined with one of the other types. In this case, the function determines the
550+
optimal threshold value using the Otsu's and uses it instead of the specified threshold. The function returns the
551+
computed threshold value in addititon to the thresholded matrix.
552+
The Otsu's method is implemented only for 8-bit matrices.
553+
549554
@param src Source array (single-channel).
550-
@param dst Destination array with the same size and type as src .
555+
@param dst Destination array with the same size and type as src.
551556
@param thresh Threshold value.
552557
@param maxval Maximum value to use with THRESH_BINARY and THRESH_BINARY_INV threshold types.
553-
@param type Threshold type. For details, see threshold . The THRESH_OTSU and THRESH_TRIANGLE
554-
threshold types are not supported.
558+
@param type Threshold type. For details, see threshold. The THRESH_TRIANGLE threshold type is not supported.
555559
@param stream Stream for the asynchronous version.
556560
557561
@sa threshold

modules/cudaarithm/src/cuda/threshold.cu

Lines changed: 221 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,12 +95,232 @@ namespace
9595
}
9696
}
9797

98-
double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& stream)
98+
99+
__global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long long *sums)
100+
{
101+
const uint32_t n_bins = 256;
102+
103+
__shared__ uint shared_memory_ts[n_bins];
104+
__shared__ unsigned long long shared_memory_s[n_bins];
105+
106+
int bin_idx = threadIdx.x;
107+
int threshold = blockIdx.x;
108+
109+
uint threshold_sum_above = 0;
110+
unsigned long long sum_above = 0;
111+
112+
if (bin_idx >= threshold)
113+
{
114+
uint value = histogram[bin_idx];
115+
threshold_sum_above = value;
116+
sum_above = value * bin_idx;
117+
}
118+
119+
blockReduce<n_bins>(shared_memory_ts, threshold_sum_above, bin_idx, plus<uint>());
120+
blockReduce<n_bins>(shared_memory_s, sum_above, bin_idx, plus<unsigned long long>());
121+
122+
if (bin_idx == 0)
123+
{
124+
threshold_sums[threshold] = threshold_sum_above;
125+
sums[threshold] = sum_above;
126+
}
127+
}
128+
129+
__global__ void
130+
otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums)
131+
{
132+
const uint32_t n_bins = 256;
133+
134+
__shared__ signed long long shared_memory_a[n_bins];
135+
__shared__ signed long long shared_memory_b[n_bins];
136+
137+
int bin_idx = threadIdx.x;
138+
int threshold = blockIdx.x;
139+
140+
uint n_samples = threshold_sums[0];
141+
uint n_samples_above = threshold_sums[threshold];
142+
uint n_samples_below = n_samples - n_samples_above;
143+
144+
unsigned long long total_sum = sums[0];
145+
unsigned long long sum_above = sums[threshold];
146+
unsigned long long sum_below = total_sum - sum_above;
147+
148+
float threshold_variance_above_f32 = 0;
149+
float threshold_variance_below_f32 = 0;
150+
if (bin_idx >= threshold)
151+
{
152+
float mean = (float) sum_above / n_samples_above;
153+
float sigma = bin_idx - mean;
154+
threshold_variance_above_f32 = sigma * sigma;
155+
}
156+
else
157+
{
158+
float mean = (float) sum_below / n_samples_below;
159+
float sigma = bin_idx - mean;
160+
threshold_variance_below_f32 = sigma * sigma;
161+
}
162+
163+
uint bin_count = histogram[bin_idx];
164+
signed long long threshold_variance_above_i64 = (signed long long)(threshold_variance_above_f32 * bin_count);
165+
signed long long threshold_variance_below_i64 = (signed long long)(threshold_variance_below_f32 * bin_count);
166+
blockReduce<n_bins>(shared_memory_a, threshold_variance_above_i64, bin_idx, plus<signed long long>());
167+
blockReduce<n_bins>(shared_memory_b, threshold_variance_below_i64, bin_idx, plus<signed long long>());
168+
169+
if (bin_idx == 0)
170+
{
171+
variance[threshold] = make_float2(threshold_variance_above_i64, threshold_variance_below_i64);
172+
}
173+
}
174+
175+
176+
__global__ void
177+
otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance)
178+
{
179+
const uint32_t n_thresholds = 256;
180+
181+
__shared__ float shared_memory[n_thresholds / WARP_SIZE];
182+
183+
int threshold = threadIdx.x;
184+
185+
uint n_samples = threshold_sums[0];
186+
uint n_samples_above = threshold_sums[threshold];
187+
uint n_samples_below = n_samples - n_samples_above;
188+
189+
float threshold_mean_above = (float)n_samples_above / n_samples;
190+
float threshold_mean_below = (float)n_samples_below / n_samples;
191+
192+
float2 variances = variance[threshold];
193+
float variance_above = variances.x / n_samples_above;
194+
float variance_below = variances.y / n_samples_below;
195+
196+
float above = threshold_mean_above * variance_above;
197+
float below = threshold_mean_below * variance_below;
198+
float score = above + below;
199+
200+
float original_score = score;
201+
202+
blockReduce<n_thresholds>(shared_memory, score, threshold, minimum<float>());
203+
204+
if (threshold == 0)
205+
{
206+
shared_memory[0] = score;
207+
}
208+
__syncthreads();
209+
210+
score = shared_memory[0];
211+
212+
// We found the minimum score, but we need to find the threshold. If we find the thread with the minimum score, we
213+
// know which threshold it is
214+
if (original_score == score)
215+
{
216+
*otsu_threshold = threshold - 1;
217+
}
218+
}
219+
220+
void compute_otsu(uint *histogram, uint *otsu_threshold, Stream &stream)
221+
{
222+
const uint n_bins = 256;
223+
const uint n_thresholds = 256;
224+
225+
cudaStream_t cuda_stream = StreamAccessor::getStream(stream);
226+
227+
dim3 block_all(n_bins);
228+
dim3 grid_all(n_thresholds);
229+
dim3 block_score(n_thresholds);
230+
dim3 grid_score(1);
231+
232+
BufferPool pool(stream);
233+
GpuMat gpu_threshold_sums(1, n_bins, CV_32SC1, pool.getAllocator());
234+
GpuMat gpu_sums(1, n_bins, CV_64FC1, pool.getAllocator());
235+
GpuMat gpu_variances(1, n_bins, CV_32FC2, pool.getAllocator());
236+
237+
otsu_sums<<<grid_all, block_all, 0, cuda_stream>>>(
238+
histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
239+
otsu_variance<<<grid_all, block_all, 0, cuda_stream>>>(
240+
gpu_variances.ptr<float2>(), histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
241+
otsu_score<<<grid_score, block_score, 0, cuda_stream>>>(
242+
otsu_threshold, gpu_threshold_sums.ptr<uint>(), gpu_variances.ptr<float2>());
243+
}
244+
245+
// TODO: Replace this is cv::cuda::calcHist
246+
template <uint n_bins>
247+
__global__ void histogram_kernel(
248+
uint *histogram, const uint8_t *image, uint width,
249+
uint height, uint pitch)
250+
{
251+
__shared__ uint local_histogram[n_bins];
252+
253+
uint x = blockIdx.x * blockDim.x + threadIdx.x;
254+
uint y = blockIdx.y * blockDim.y + threadIdx.y;
255+
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
256+
257+
if (tid < n_bins)
258+
{
259+
local_histogram[tid] = 0;
260+
}
261+
262+
__syncthreads();
263+
264+
if (x < width && y < height)
265+
{
266+
uint8_t value = image[y * pitch + x];
267+
atomicInc(&local_histogram[value], 0xFFFFFFFF);
268+
}
269+
270+
__syncthreads();
271+
272+
if (tid < n_bins)
273+
{
274+
cv::cudev::atomicAdd(&histogram[tid], local_histogram[tid]);
275+
}
276+
}
277+
278+
// TODO: Replace this with cv::cuda::calcHist
279+
void calcHist(
280+
const GpuMat src, GpuMat histogram, Stream stream)
281+
{
282+
const uint n_bins = 256;
283+
284+
cudaStream_t cuda_stream = StreamAccessor::getStream(stream);
285+
286+
dim3 block(128, 4, 1);
287+
dim3 grid = dim3(divUp(src.cols, block.x), divUp(src.rows, block.y), 1);
288+
CV_CUDEV_SAFE_CALL(cudaMemsetAsync(histogram.ptr<uint>(), 0, n_bins * sizeof(uint), cuda_stream));
289+
histogram_kernel<n_bins>
290+
<<<grid, block, 0, cuda_stream>>>(
291+
histogram.ptr<uint>(), src.ptr<uint8_t>(), (uint) src.cols, (uint) src.rows, (uint) src.step);
292+
}
293+
294+
double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream &stream)
99295
{
100296
GpuMat src = getInputMat(_src, stream);
101297

102298
const int depth = src.depth();
103299

300+
const int THRESH_OTSU = 8;
301+
if ((type & THRESH_OTSU) == THRESH_OTSU)
302+
{
303+
CV_Assert(depth == CV_8U);
304+
CV_Assert(src.channels() == 1);
305+
306+
BufferPool pool(stream);
307+
308+
// Find the threshold using Otsu and then run the normal thresholding algorithm
309+
GpuMat gpu_histogram(256, 1, CV_32SC1, pool.getAllocator());
310+
calcHist(src, gpu_histogram, stream);
311+
312+
GpuMat gpu_otsu_threshold(1, 1, CV_32SC1, pool.getAllocator());
313+
compute_otsu(gpu_histogram.ptr<uint>(), gpu_otsu_threshold.ptr<uint>(), stream);
314+
315+
cv::Mat mat_otsu_threshold;
316+
gpu_otsu_threshold.download(mat_otsu_threshold, stream);
317+
stream.waitForCompletion();
318+
319+
// Overwrite the threshold value with the Otsu value and remove the Otsu flag from the type
320+
type = type & ~THRESH_OTSU;
321+
thresh = (double) mat_otsu_threshold.at<int>(0);
322+
}
323+
104324
CV_Assert( depth <= CV_64F );
105325
CV_Assert( type <= 4 /*THRESH_TOZERO_INV*/ );
106326

modules/cudaarithm/test/test_element_operations.cpp

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2529,7 +2529,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, AddWeighted, testing::Combine(
25292529
///////////////////////////////////////////////////////////////////////////////////////////////////////
25302530
// Threshold
25312531

2532-
CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
2532+
CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV, cv::THRESH_OTSU)
25332533
#define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))
25342534

25352535
PARAM_TEST_CASE(Threshold, cv::cuda::DeviceInfo, cv::Size, MatType, Channels, ThreshOp, UseRoi)
@@ -2577,6 +2577,54 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Threshold, testing::Combine(
25772577
ALL_THRESH_OPS,
25782578
WHOLE_SUBMAT));
25792579

2580+
///////////////////////////////////////////////////////////////////////////////////////////////////////
2581+
// ThresholdOtsu
2582+
2583+
PARAM_TEST_CASE(ThresholdOtsu, cv::cuda::DeviceInfo, cv::Size, MatType, Channels, ThreshOp, UseRoi)
2584+
{
2585+
cv::cuda::DeviceInfo devInfo;
2586+
cv::Size size;
2587+
int type;
2588+
int channel;
2589+
int threshOp;
2590+
bool useRoi;
2591+
2592+
virtual void SetUp()
2593+
{
2594+
devInfo = GET_PARAM(0);
2595+
size = GET_PARAM(1);
2596+
type = GET_PARAM(2);
2597+
channel = GET_PARAM(3);
2598+
threshOp = GET_PARAM(4) | cv::THRESH_OTSU;
2599+
useRoi = GET_PARAM(5);
2600+
2601+
cv::cuda::setDevice(devInfo.deviceID());
2602+
}
2603+
};
2604+
2605+
CUDA_TEST_P(ThresholdOtsu, Accuracy)
2606+
{
2607+
cv::Mat src = randomMat(size, CV_MAKE_TYPE(type, channel));
2608+
2609+
cv::cuda::GpuMat dst = createMat(src.size(), src.type(), useRoi);
2610+
double otsu_gpu = cv::cuda::threshold(loadMat(src, useRoi), dst, 0, 255, threshOp);
2611+
2612+
cv::Mat dst_gold;
2613+
double otsu_cpu = cv::threshold(src, dst_gold, 0, 255, threshOp);
2614+
2615+
ASSERT_DOUBLE_EQ(otsu_gpu, otsu_cpu);
2616+
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
2617+
}
2618+
2619+
INSTANTIATE_TEST_CASE_P(CUDA_Arithm, ThresholdOtsu, testing::Combine(
2620+
ALL_DEVICES,
2621+
DIFFERENT_SIZES,
2622+
testing::Values(MatDepth(CV_8U)),
2623+
testing::Values(Channels(1)),
2624+
ALL_THRESH_OPS,
2625+
WHOLE_SUBMAT));
2626+
2627+
25802628
////////////////////////////////////////////////////////////////////////////////
25812629
// InRange
25822630

modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ template <typename T> __device__ __forceinline__ T saturate_cast(ushort v) { ret
6262
template <typename T> __device__ __forceinline__ T saturate_cast(short v) { return T(v); }
6363
template <typename T> __device__ __forceinline__ T saturate_cast(uint v) { return T(v); }
6464
template <typename T> __device__ __forceinline__ T saturate_cast(int v) { return T(v); }
65+
template <typename T> __device__ __forceinline__ T saturate_cast(signed long long v) { return T(v); }
66+
template <typename T> __device__ __forceinline__ T saturate_cast(unsigned long long v) { return T(v); }
6567
template <typename T> __device__ __forceinline__ T saturate_cast(float v) { return T(v); }
6668
template <typename T> __device__ __forceinline__ T saturate_cast(double v) { return T(v); }
6769

modules/cudev/include/opencv2/cudev/warp/shuffle.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,16 @@ __device__ __forceinline__ uint shfl_down(uint val, uint delta, int width = warp
332332
return (uint) __shfl_down((int) val, delta, width);
333333
}
334334

335+
__device__ __forceinline__ signed long long shfl_down(signed long long val, uint delta, int width = warpSize)
336+
{
337+
return __shfl_down(val, delta, width);
338+
}
339+
340+
__device__ __forceinline__ unsigned long long shfl_down(unsigned long long val, uint delta, int width = warpSize)
341+
{
342+
return (unsigned long long) __shfl_down(val, delta, width);
343+
}
344+
335345
__device__ __forceinline__ float shfl_down(float val, uint delta, int width = warpSize)
336346
{
337347
return __shfl_down(val, delta, width);

0 commit comments

Comments
 (0)