@@ -95,12 +95,232 @@ namespace
 }
 }

-double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& stream)
+
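+// One block per candidate threshold, one thread per histogram bin: block-reduces the
+// pixel count and the intensity-weighted sum of all bins at or above the threshold.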
+__global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long long *sums)
+{
+    const uint32_t n_bins = 256;
+
+    __shared__ uint shared_memory_ts[n_bins];
+    __shared__ unsigned long long shared_memory_s[n_bins];
+
+    int bin_idx = threadIdx.x;
+    int threshold = blockIdx.x;
+
+    uint threshold_sum_above = 0;
+    unsigned long long sum_above = 0;
+
+    if (bin_idx >= threshold)
+    {
+        uint value = histogram[bin_idx];
+        threshold_sum_above = value;
+        sum_above = value * bin_idx;
+    }
+
+    blockReduce<n_bins>(shared_memory_ts, threshold_sum_above, bin_idx, plus<uint>());
+    blockReduce<n_bins>(shared_memory_s, sum_above, bin_idx, plus<unsigned long long>());
+
+    if (bin_idx == 0)
+    {
+        threshold_sums[threshold] = threshold_sum_above;
+        sums[threshold] = sum_above;
+    }
+}
+
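+// For each candidate threshold, accumulates the squared deviation of every bin from its
+// class mean (above or below the threshold), weighted by the bin count; thread 0 stores
+// the two per-class sums for that threshold as a float2.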
+__global__ void
+otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums)
+{
+    const uint32_t n_bins = 256;
+
+    __shared__ signed long long shared_memory_a[n_bins];
+    __shared__ signed long long shared_memory_b[n_bins];
+
+    int bin_idx = threadIdx.x;
+    int threshold = blockIdx.x;
+
+    uint n_samples = threshold_sums[0];
+    uint n_samples_above = threshold_sums[threshold];
+    uint n_samples_below = n_samples - n_samples_above;
+
+    unsigned long long total_sum = sums[0];
+    unsigned long long sum_above = sums[threshold];
+    unsigned long long sum_below = total_sum - sum_above;
+
+    float threshold_variance_above_f32 = 0;
+    float threshold_variance_below_f32 = 0;
+    if (bin_idx >= threshold)
+    {
+        float mean = (float) sum_above / n_samples_above;
+        float sigma = bin_idx - mean;
+        threshold_variance_above_f32 = sigma * sigma;
+    }
+    else
+    {
+        float mean = (float) sum_below / n_samples_below;
+        float sigma = bin_idx - mean;
+        threshold_variance_below_f32 = sigma * sigma;
+    }
+
+    uint bin_count = histogram[bin_idx];
+    signed long long threshold_variance_above_i64 = (signed long long)(threshold_variance_above_f32 * bin_count);
+    signed long long threshold_variance_below_i64 = (signed long long)(threshold_variance_below_f32 * bin_count);
+    blockReduce<n_bins>(shared_memory_a, threshold_variance_above_i64, bin_idx, plus<signed long long>());
+    blockReduce<n_bins>(shared_memory_b, threshold_variance_below_i64, bin_idx, plus<signed long long>());
+
+    if (bin_idx == 0)
+    {
+        variance[threshold] = make_float2(threshold_variance_above_i64, threshold_variance_below_i64);
+    }
+}
+
+
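+// Single block, one thread per candidate threshold: each thread forms the weighted
+// within-class variance for its threshold and a block-wide minimum reduction picks
+// the threshold with the lowest score.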
+__global__ void
+otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance)
+{
+    const uint32_t n_thresholds = 256;
+
+    __shared__ float shared_memory[n_thresholds / WARP_SIZE];
+
+    int threshold = threadIdx.x;
+
+    uint n_samples = threshold_sums[0];
+    uint n_samples_above = threshold_sums[threshold];
+    uint n_samples_below = n_samples - n_samples_above;
+
+    float threshold_mean_above = (float)n_samples_above / n_samples;
+    float threshold_mean_below = (float)n_samples_below / n_samples;
+
+    float2 variances = variance[threshold];
+    float variance_above = variances.x / n_samples_above;
+    float variance_below = variances.y / n_samples_below;
+
+    float above = threshold_mean_above * variance_above;
+    float below = threshold_mean_below * variance_below;
+    float score = above + below;
+
+    float original_score = score;
+
+    blockReduce<n_thresholds>(shared_memory, score, threshold, minimum<float>());
+
+    if (threshold == 0)
+    {
+        shared_memory[0] = score;
+    }
+    __syncthreads();
+
+    score = shared_memory[0];
+
+    // We have the minimum score, but we still need the threshold that produced it. The thread
+    // whose original score matches the minimum knows which threshold that is.
+    if (original_score == score)
+    {
+        *otsu_threshold = threshold - 1;
+    }
+}
+
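+// Host-side driver: launches the three kernels above on the given stream, using
+// BufferPool-backed temporaries for the per-threshold sums and variances.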
+void compute_otsu(uint *histogram, uint *otsu_threshold, Stream &stream)
+{
+    const uint n_bins = 256;
+    const uint n_thresholds = 256;
+
+    cudaStream_t cuda_stream = StreamAccessor::getStream(stream);
+
+    dim3 block_all(n_bins);
+    dim3 grid_all(n_thresholds);
+    dim3 block_score(n_thresholds);
+    dim3 grid_score(1);
+
+    BufferPool pool(stream);
+    GpuMat gpu_threshold_sums(1, n_bins, CV_32SC1, pool.getAllocator());
+    GpuMat gpu_sums(1, n_bins, CV_64FC1, pool.getAllocator());
+    GpuMat gpu_variances(1, n_bins, CV_32FC2, pool.getAllocator());
+
+    otsu_sums<<<grid_all, block_all, 0, cuda_stream>>>(
+        histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
+    otsu_variance<<<grid_all, block_all, 0, cuda_stream>>>(
+        gpu_variances.ptr<float2>(), histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
+    otsu_score<<<grid_score, block_score, 0, cuda_stream>>>(
+        otsu_threshold, gpu_threshold_sums.ptr<uint>(), gpu_variances.ptr<float2>());
+}
+
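+// Per-block shared-memory histogram: each block accumulates a private 256-bin histogram
+// and then merges it into the global histogram with atomics.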
+// TODO: Replace this with cv::cuda::calcHist
+template <uint n_bins>
+__global__ void histogram_kernel(
+    uint *histogram, const uint8_t *image, uint width,
+    uint height, uint pitch)
+{
+    __shared__ uint local_histogram[n_bins];
+
+    uint x = blockIdx.x * blockDim.x + threadIdx.x;
+    uint y = blockIdx.y * blockDim.y + threadIdx.y;
+    uint tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+    if (tid < n_bins)
+    {
+        local_histogram[tid] = 0;
+    }
+
+    __syncthreads();
+
+    if (x < width && y < height)
+    {
+        uint8_t value = image[y * pitch + x];
+        atomicInc(&local_histogram[value], 0xFFFFFFFF);
+    }
+
+    __syncthreads();
+
+    if (tid < n_bins)
+    {
+        cv::cudev::atomicAdd(&histogram[tid], local_histogram[tid]);
+    }
+}
+
+// TODO: Replace this with cv::cuda::calcHist
+void calcHist(
+    const GpuMat src, GpuMat histogram, Stream stream)
+{
+    const uint n_bins = 256;
+
+    cudaStream_t cuda_stream = StreamAccessor::getStream(stream);
+
+    dim3 block(128, 4, 1);
+    dim3 grid = dim3(divUp(src.cols, block.x), divUp(src.rows, block.y), 1);
+    CV_CUDEV_SAFE_CALL(cudaMemsetAsync(histogram.ptr<uint>(), 0, n_bins * sizeof(uint), cuda_stream));
+    histogram_kernel<n_bins>
+        <<<grid, block, 0, cuda_stream>>>(
+            histogram.ptr<uint>(), src.ptr<uint8_t>(), (uint) src.cols, (uint) src.rows, (uint) src.step);
+}
+
+double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream &stream)
 {
     GpuMat src = getInputMat(_src, stream);

     const int depth = src.depth();

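+    // 8 matches the value of cv::THRESH_OTSU in the imgproc thresholding flags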
+    const int THRESH_OTSU = 8;
+    if ((type & THRESH_OTSU) == THRESH_OTSU)
+    {
+        CV_Assert(depth == CV_8U);
+        CV_Assert(src.channels() == 1);
+
+        BufferPool pool(stream);
+
+        // Find the threshold using Otsu and then run the normal thresholding algorithm
+        GpuMat gpu_histogram(256, 1, CV_32SC1, pool.getAllocator());
+        calcHist(src, gpu_histogram, stream);
+
+        GpuMat gpu_otsu_threshold(1, 1, CV_32SC1, pool.getAllocator());
+        compute_otsu(gpu_histogram.ptr<uint>(), gpu_otsu_threshold.ptr<uint>(), stream);
+
+        cv::Mat mat_otsu_threshold;
+        gpu_otsu_threshold.download(mat_otsu_threshold, stream);
+        stream.waitForCompletion();
+
+        // Overwrite the threshold value with the Otsu value and remove the Otsu flag from the type
+        type = type & ~THRESH_OTSU;
+        thresh = (double) mat_otsu_threshold.at<int>(0);
+    }
+
     CV_Assert( depth <= CV_64F );
     CV_Assert( type <= 4 /* THRESH_TOZERO_INV*/ );

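A minimal host-side sketch of how the new flag would be exercised, assuming a CUDA-capable build of the cudaarithm module; the image path is a placeholder, and the return value is expected to be the computed Otsu threshold, mirroring the CPU cv::threshold:

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>      // THRESH_BINARY, THRESH_OTSU
#include <opencv2/cudaarithm.hpp>   // cv::cuda::threshold

int main()
{
    cv::Mat img = cv::imread("input.png", cv::IMREAD_GRAYSCALE);  // placeholder path
    if (img.empty()) return 1;

    cv::cuda::GpuMat d_src, d_dst;
    d_src.upload(img);

    // With THRESH_OTSU set, the thresh argument (0 here) is ignored and replaced by the
    // value computed on the GPU; the returned double should be that threshold.
    double otsu = cv::cuda::threshold(d_src, d_dst, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU);

    cv::Mat result;
    d_dst.download(result);
    return 0;
}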