/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"

using namespace cv::cuda;
using namespace cv::cuda::device;

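// Gradient norm functors: L1 computes |dx| + |dy|, L2 computes
// sqrt(dx^2 + dy^2). They are passed both to the magnitude kernel below and
// to the transform() framework, which is why they get the functor traits
// specializations that follow.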
namespace canny
{
    struct L1 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::abs(x) + ::abs(y);
        }

        __host__ __device__ __forceinline__ L1() {}
        __host__ __device__ __forceinline__ L1(const L1&) {}
    };
    struct L2 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::sqrtf(x * x + y * y);
        }

        __host__ __device__ __forceinline__ L2() {}
        __host__ __device__ __forceinline__ L2(const L2&) {}
    };
}

namespace cv { namespace cuda { namespace device
{
    template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
    {
        enum { smart_shift = 4 };
    };
    template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
    {
        enum { smart_shift = 4 };
    };
}}}

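// The source image is fetched through a 2D texture with point filtering and
// clamped addressing, so reads past the image border return the nearest edge
// pixel. SrcTex shifts coordinates by the (xoff, yoff) offset of the ROI
// inside the whole image.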
namespace canny
{
    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
    struct SrcTex
    {
        int xoff;
        int yoff;
        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}

        __device__ __forceinline__ int operator ()(int y, int x) const
        {
            return tex2D(tex_src, x + xoff, y + yoff);
        }
    };

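    // One thread per pixel: apply the 3x3 Sobel operator through the texture,
    // store the raw derivatives in dx/dy, and write the gradient magnitude
    // computed by the chosen Norm functor.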
    template <class Norm> __global__
    void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (y >= mag.rows || x >= mag.cols)
            return;

        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));

        dx(y, x) = dxVal;
        dy(y, x) = dyVal;

        mag(y, x) = norm(dxVal, dyVal);
    }

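    // Host wrapper: binds the source texture and launches the kernel with the
    // requested norm on a grid of 16x16 blocks covering the magnitude image.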
    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));

        bindTexture(&tex_src, srcWhole);
        SrcTex src(xoff, yoff);

        if (L2Grad)
        {
            L2 norm;
            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
        }
        else
        {
            L1 norm;
            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
        }

        cudaSafeCall( cudaGetLastError() );

        if (stream == NULL)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

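    // Overload for precomputed derivatives: dx/dy already exist, so the
    // magnitude reduces to an element-wise transform.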
    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
    {
        if (L2Grad)
        {
            L2 norm;
            transform(dx, dy, mag, norm, WithOutMask(), stream);
        }
        else
        {
            L1 norm;
            transform(dx, dy, mag, norm, WithOutMask(), stream);
        }
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);

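    // Non-maximum suppression. The gradient direction is quantized into
    // horizontal, vertical, or diagonal sectors by comparing |dy|/|dx|
    // against tan(22.5 deg) and tan(67.5 deg) in 15-bit fixed point
    // (CANNY_SHIFT); a pixel survives only if its magnitude is a local
    // maximum along that direction. Survivors above low_thresh are marked 1
    // (weak candidate), and those also above high_thresh are marked 2 (strong).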
    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
    {
        const int CANNY_SHIFT = 15;
        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
            return;

        int dxVal = dx(y, x);
        int dyVal = dy(y, x);

        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
        const float m = tex2D(tex_mag, x, y);

        dxVal = ::abs(dxVal);
        dyVal = ::abs(dyVal);

        // 0 - the pixel cannot belong to an edge
        // 1 - the pixel might belong to an edge
        // 2 - the pixel does belong to an edge
        int edge_type = 0;

        if (m > low_thresh)
        {
            const int tg22x = dxVal * TG22;
            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);

            dyVal <<= CANNY_SHIFT;

            if (dyVal < tg22x)
            {
                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else if (dyVal > tg67x)
            {
                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else
            {
                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
        }

        map(y, x) = edge_type;
    }

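    // Host wrapper: the magnitude image is bound to tex_mag so the kernel's
    // neighbor fetches need no explicit bounds checks.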
    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));

        bindTexture(&tex_mag, mag);

        calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
        cudaSafeCall( cudaGetLastError() );

        if (stream == NULL)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
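    // Device-global count of candidate pixels pushed onto the work stacks;
    // the host zeroes it before each hysteresis pass and reads it back to
    // decide whether another pass is needed.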
    __device__ int counter = 0;

    __device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols)
    {
        return (y >= 0) && (y < rows) && (x >= 0) && (x < cols);
    }

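    // First hysteresis pass, confined to each block: a 16x16 block stages an
    // 18x18 tile of the map (one-pixel apron) in shared memory and, for up to
    // 16 iterations, promotes weak pixels (1) that touch a strong pixel (2).
    // Strong pixels that still border weak ones at the end are recorded in st
    // as seeds for the cross-block pass.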
    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st)
    {
        __shared__ volatile int smem[18][18];

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        smem[threadIdx.y + 1][threadIdx.x + 1] = checkIdx(y, x, map.rows, map.cols) ? map(y, x) : 0;
        if (threadIdx.y == 0)
            smem[0][threadIdx.x + 1] = checkIdx(y - 1, x, map.rows, map.cols) ? map(y - 1, x) : 0;
        if (threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][threadIdx.x + 1] = checkIdx(y + 1, x, map.rows, map.cols) ? map(y + 1, x) : 0;
        if (threadIdx.x == 0)
            smem[threadIdx.y + 1][0] = checkIdx(y, x - 1, map.rows, map.cols) ? map(y, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1)
            smem[threadIdx.y + 1][blockDim.x + 1] = checkIdx(y, x + 1, map.rows, map.cols) ? map(y, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == 0)
            smem[0][0] = checkIdx(y - 1, x - 1, map.rows, map.cols) ? map(y - 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
            smem[0][blockDim.x + 1] = checkIdx(y - 1, x + 1, map.rows, map.cols) ? map(y - 1, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][0] = checkIdx(y + 1, x - 1, map.rows, map.cols) ? map(y + 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][blockDim.x + 1] = checkIdx(y + 1, x + 1, map.rows, map.cols) ? map(y + 1, x + 1) : 0;

        __syncthreads();

        if (x >= map.cols || y >= map.rows)
            return;

        int n;

        #pragma unroll
        for (int k = 0; k < 16; ++k)
        {
            n = 0;

            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
            {
                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
            }

            __syncthreads();

            if (n > 0)
                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;

            __syncthreads();
        }

        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];

        map(y, x) = e;

        n = 0;

        if (e == 2)
        {
            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
        }

        if (n > 0)
        {
            const int ind = ::atomicAdd(&counter, 1);
            st[ind] = make_short2(x, y);
        }
    }

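    // Host wrapper: resets the seed counter, then runs the block-local pass.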
    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );

        cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );

        const dim3 block(16, 16);
        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));

        edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1);
        cudaSafeCall( cudaGetLastError() );

        if (stream == NULL)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};

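    // Second hysteresis pass, a breadth-first expansion across block borders:
    // each block takes one seed from st1, groups of 8 threads walk its
    // c_dx/c_dy neighborhood, and newly promoted pixels are pushed onto a
    // shared-memory stack. If the stack would overflow on the next step, the
    // remaining work is flushed to st2 for the next kernel launch.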
    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, const int count)
    {
        const int stack_size = 512;

        __shared__ int s_counter;
        __shared__ int s_ind;
        __shared__ short2 s_st[stack_size];

        if (threadIdx.x == 0)
            s_counter = 0;

        __syncthreads();

        int ind = blockIdx.y * gridDim.x + blockIdx.x;

        if (ind >= count)
            return;

        short2 pos = st1[ind];

        if (threadIdx.x < 8)
        {
            pos.x += c_dx[threadIdx.x];
            pos.y += c_dy[threadIdx.x];

            if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
            {
                map(pos.y, pos.x) = 2;

                ind = Emulation::smem::atomicAdd(&s_counter, 1);

                s_st[ind] = pos;
            }
        }

        __syncthreads();

        while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
        {
            const int subTaskIdx = threadIdx.x >> 3;
            const int portion = ::min(s_counter, blockDim.x >> 3);

            if (subTaskIdx < portion)
                pos = s_st[s_counter - 1 - subTaskIdx];

            __syncthreads();

            if (threadIdx.x == 0)
                s_counter -= portion;

            __syncthreads();

            if (subTaskIdx < portion)
            {
                pos.x += c_dx[threadIdx.x & 7];
                pos.y += c_dy[threadIdx.x & 7];

                if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
                {
                    map(pos.y, pos.x) = 2;

                    ind = Emulation::smem::atomicAdd(&s_counter, 1);

                    s_st[ind] = pos;
                }
            }

            __syncthreads();
        }

        if (s_counter > 0)
        {
            if (threadIdx.x == 0)
            {
                s_ind = ::atomicAdd(&counter, s_counter);

                if (s_ind + s_counter > map.cols * map.rows)
                    s_counter = 0;
            }

            __syncthreads();

            ind = s_ind;

            for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
                st2[ind + i] = s_st[i];
        }
    }

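    // Host driver: relaunches the kernel, ping-ponging the work stacks st1
    // and st2, until no candidate pixels remain. The count read back is
    // clamped to the image area, matching the overflow guard in the kernel.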
    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );

        int count;
        cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
        cudaSafeCall( cudaStreamSynchronize(stream) );

        while (count > 0)
        {
            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );

            const dim3 block(128);
            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);

            edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, count);
            cudaSafeCall( cudaGetLastError() );

            if (stream == NULL)
                cudaSafeCall( cudaDeviceSynchronize() );

            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
            cudaSafeCall( cudaStreamSynchronize(stream) );

            count = min(count, map.cols * map.rows);

            //std::swap(st1, st2);
            short2* tmp = st1;
            st1 = st2;
            st2 = tmp;
        }
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
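    // Final map -> 8-bit edges: for e in {0, 1, 2}, e >> 1 is 1 only for
    // strong edges, and -(1) cast to uchar is 255, so strong edges become 255
    // and everything else 0.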
    struct GetEdges : unary_function<int, uchar>
    {
        __device__ __forceinline__ uchar operator ()(int e) const
        {
            return (uchar)(-(e >> 1));
        }

        __host__ __device__ __forceinline__ GetEdges() {}
        __host__ __device__ __forceinline__ GetEdges(const GetEdges&) {}
    };
}

namespace cv { namespace cuda { namespace device
{
    template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace canny
{
    void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream)
    {
        transform(map, dst, GetEdges(), WithOutMask(), stream);
    }
}

#endif /* CUDA_DISABLER */