• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #include "opencv2/core/cuda/common.hpp"
44 #include "opencv2/core/cuda/saturate_cast.hpp"
45 #include "opencv2/core/cuda/vec_math.hpp"
46 #include "opencv2/core/cuda/border_interpolate.hpp"
47 
48 using namespace cv::cuda;
49 using namespace cv::cuda::device;
50 
51 namespace column_filter
52 {
53     #define MAX_KERNEL_SIZE 32
54 
55     __constant__ float c_kernel[MAX_KERNEL_SIZE];
56 
57     template <int KSIZE, typename T, typename D, typename B>
linearColumnFilter(const PtrStepSz<T> src,PtrStep<D> dst,const int anchor,const B brd)58     __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
59     {
60         #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
61             const int BLOCK_DIM_X = 16;
62             const int BLOCK_DIM_Y = 16;
63             const int PATCH_PER_BLOCK = 4;
64             const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
65         #else
66             const int BLOCK_DIM_X = 16;
67             const int BLOCK_DIM_Y = 8;
68             const int PATCH_PER_BLOCK = 2;
69             const int HALO_SIZE = 2;
70         #endif
71 
72         typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
73 
74         __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];
75 
76         const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
77 
78         if (x >= src.cols)
79             return;
80 
81         const T* src_col = src.ptr() + x;
82 
83         const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;
84 
85         if (blockIdx.y > 0)
86         {
87             //Upper halo
88             #pragma unroll
89             for (int j = 0; j < HALO_SIZE; ++j)
90                 smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));
91         }
92         else
93         {
94             //Upper halo
95             #pragma unroll
96             for (int j = 0; j < HALO_SIZE; ++j)
97                 smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));
98         }
99 
100         if (blockIdx.y + 2 < gridDim.y)
101         {
102             //Main data
103             #pragma unroll
104             for (int j = 0; j < PATCH_PER_BLOCK; ++j)
105                 smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));
106 
107             //Lower halo
108             #pragma unroll
109             for (int j = 0; j < HALO_SIZE; ++j)
110                 smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));
111         }
112         else
113         {
114             //Main data
115             #pragma unroll
116             for (int j = 0; j < PATCH_PER_BLOCK; ++j)
117                 smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));
118 
119             //Lower halo
120             #pragma unroll
121             for (int j = 0; j < HALO_SIZE; ++j)
122                 smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));
123         }
124 
125         __syncthreads();
126 
127         #pragma unroll
128         for (int j = 0; j < PATCH_PER_BLOCK; ++j)
129         {
130             const int y = yStart + j * BLOCK_DIM_Y;
131 
132             if (y < src.rows)
133             {
134                 sum_t sum = VecTraits<sum_t>::all(0);
135 
136                 #pragma unroll
137                 for (int k = 0; k < KSIZE; ++k)
138                     sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
139 
140                 dst(y, x) = saturate_cast<D>(sum);
141             }
142         }
143     }
144 
145     template <int KSIZE, typename T, typename D, template<typename> class B>
caller(PtrStepSz<T> src,PtrStepSz<D> dst,int anchor,int cc,cudaStream_t stream)146     void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
147     {
148         int BLOCK_DIM_X;
149         int BLOCK_DIM_Y;
150         int PATCH_PER_BLOCK;
151 
152         if (cc >= 20)
153         {
154             BLOCK_DIM_X = 16;
155             BLOCK_DIM_Y = 16;
156             PATCH_PER_BLOCK = 4;
157         }
158         else
159         {
160             BLOCK_DIM_X = 16;
161             BLOCK_DIM_Y = 8;
162             PATCH_PER_BLOCK = 2;
163         }
164 
165         const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
166         const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));
167 
168         B<T> brd(src.rows);
169 
170         linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
171 
172         cudaSafeCall( cudaGetLastError() );
173 
174         if (stream == 0)
175             cudaSafeCall( cudaDeviceSynchronize() );
176     }
177 }
178 
179 namespace filter
180 {
181     template <typename T, typename D>
linearColumn(PtrStepSzb src,PtrStepSzb dst,const float * kernel,int ksize,int anchor,int brd_type,int cc,cudaStream_t stream)182     void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
183     {
184         typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
185 
186         static const caller_t callers[5][33] =
187         {
188             {
189                 0,
190                 column_filter::caller< 1, T, D, BrdColConstant>,
191                 column_filter::caller< 2, T, D, BrdColConstant>,
192                 column_filter::caller< 3, T, D, BrdColConstant>,
193                 column_filter::caller< 4, T, D, BrdColConstant>,
194                 column_filter::caller< 5, T, D, BrdColConstant>,
195                 column_filter::caller< 6, T, D, BrdColConstant>,
196                 column_filter::caller< 7, T, D, BrdColConstant>,
197                 column_filter::caller< 8, T, D, BrdColConstant>,
198                 column_filter::caller< 9, T, D, BrdColConstant>,
199                 column_filter::caller<10, T, D, BrdColConstant>,
200                 column_filter::caller<11, T, D, BrdColConstant>,
201                 column_filter::caller<12, T, D, BrdColConstant>,
202                 column_filter::caller<13, T, D, BrdColConstant>,
203                 column_filter::caller<14, T, D, BrdColConstant>,
204                 column_filter::caller<15, T, D, BrdColConstant>,
205                 column_filter::caller<16, T, D, BrdColConstant>,
206                 column_filter::caller<17, T, D, BrdColConstant>,
207                 column_filter::caller<18, T, D, BrdColConstant>,
208                 column_filter::caller<19, T, D, BrdColConstant>,
209                 column_filter::caller<20, T, D, BrdColConstant>,
210                 column_filter::caller<21, T, D, BrdColConstant>,
211                 column_filter::caller<22, T, D, BrdColConstant>,
212                 column_filter::caller<23, T, D, BrdColConstant>,
213                 column_filter::caller<24, T, D, BrdColConstant>,
214                 column_filter::caller<25, T, D, BrdColConstant>,
215                 column_filter::caller<26, T, D, BrdColConstant>,
216                 column_filter::caller<27, T, D, BrdColConstant>,
217                 column_filter::caller<28, T, D, BrdColConstant>,
218                 column_filter::caller<29, T, D, BrdColConstant>,
219                 column_filter::caller<30, T, D, BrdColConstant>,
220                 column_filter::caller<31, T, D, BrdColConstant>,
221                 column_filter::caller<32, T, D, BrdColConstant>
222             },
223             {
224                 0,
225                 column_filter::caller< 1, T, D, BrdColReplicate>,
226                 column_filter::caller< 2, T, D, BrdColReplicate>,
227                 column_filter::caller< 3, T, D, BrdColReplicate>,
228                 column_filter::caller< 4, T, D, BrdColReplicate>,
229                 column_filter::caller< 5, T, D, BrdColReplicate>,
230                 column_filter::caller< 6, T, D, BrdColReplicate>,
231                 column_filter::caller< 7, T, D, BrdColReplicate>,
232                 column_filter::caller< 8, T, D, BrdColReplicate>,
233                 column_filter::caller< 9, T, D, BrdColReplicate>,
234                 column_filter::caller<10, T, D, BrdColReplicate>,
235                 column_filter::caller<11, T, D, BrdColReplicate>,
236                 column_filter::caller<12, T, D, BrdColReplicate>,
237                 column_filter::caller<13, T, D, BrdColReplicate>,
238                 column_filter::caller<14, T, D, BrdColReplicate>,
239                 column_filter::caller<15, T, D, BrdColReplicate>,
240                 column_filter::caller<16, T, D, BrdColReplicate>,
241                 column_filter::caller<17, T, D, BrdColReplicate>,
242                 column_filter::caller<18, T, D, BrdColReplicate>,
243                 column_filter::caller<19, T, D, BrdColReplicate>,
244                 column_filter::caller<20, T, D, BrdColReplicate>,
245                 column_filter::caller<21, T, D, BrdColReplicate>,
246                 column_filter::caller<22, T, D, BrdColReplicate>,
247                 column_filter::caller<23, T, D, BrdColReplicate>,
248                 column_filter::caller<24, T, D, BrdColReplicate>,
249                 column_filter::caller<25, T, D, BrdColReplicate>,
250                 column_filter::caller<26, T, D, BrdColReplicate>,
251                 column_filter::caller<27, T, D, BrdColReplicate>,
252                 column_filter::caller<28, T, D, BrdColReplicate>,
253                 column_filter::caller<29, T, D, BrdColReplicate>,
254                 column_filter::caller<30, T, D, BrdColReplicate>,
255                 column_filter::caller<31, T, D, BrdColReplicate>,
256                 column_filter::caller<32, T, D, BrdColReplicate>
257             },
258             {
259                 0,
260                 column_filter::caller< 1, T, D, BrdColReflect>,
261                 column_filter::caller< 2, T, D, BrdColReflect>,
262                 column_filter::caller< 3, T, D, BrdColReflect>,
263                 column_filter::caller< 4, T, D, BrdColReflect>,
264                 column_filter::caller< 5, T, D, BrdColReflect>,
265                 column_filter::caller< 6, T, D, BrdColReflect>,
266                 column_filter::caller< 7, T, D, BrdColReflect>,
267                 column_filter::caller< 8, T, D, BrdColReflect>,
268                 column_filter::caller< 9, T, D, BrdColReflect>,
269                 column_filter::caller<10, T, D, BrdColReflect>,
270                 column_filter::caller<11, T, D, BrdColReflect>,
271                 column_filter::caller<12, T, D, BrdColReflect>,
272                 column_filter::caller<13, T, D, BrdColReflect>,
273                 column_filter::caller<14, T, D, BrdColReflect>,
274                 column_filter::caller<15, T, D, BrdColReflect>,
275                 column_filter::caller<16, T, D, BrdColReflect>,
276                 column_filter::caller<17, T, D, BrdColReflect>,
277                 column_filter::caller<18, T, D, BrdColReflect>,
278                 column_filter::caller<19, T, D, BrdColReflect>,
279                 column_filter::caller<20, T, D, BrdColReflect>,
280                 column_filter::caller<21, T, D, BrdColReflect>,
281                 column_filter::caller<22, T, D, BrdColReflect>,
282                 column_filter::caller<23, T, D, BrdColReflect>,
283                 column_filter::caller<24, T, D, BrdColReflect>,
284                 column_filter::caller<25, T, D, BrdColReflect>,
285                 column_filter::caller<26, T, D, BrdColReflect>,
286                 column_filter::caller<27, T, D, BrdColReflect>,
287                 column_filter::caller<28, T, D, BrdColReflect>,
288                 column_filter::caller<29, T, D, BrdColReflect>,
289                 column_filter::caller<30, T, D, BrdColReflect>,
290                 column_filter::caller<31, T, D, BrdColReflect>,
291                 column_filter::caller<32, T, D, BrdColReflect>
292             },
293             {
294                 0,
295                 column_filter::caller< 1, T, D, BrdColWrap>,
296                 column_filter::caller< 2, T, D, BrdColWrap>,
297                 column_filter::caller< 3, T, D, BrdColWrap>,
298                 column_filter::caller< 4, T, D, BrdColWrap>,
299                 column_filter::caller< 5, T, D, BrdColWrap>,
300                 column_filter::caller< 6, T, D, BrdColWrap>,
301                 column_filter::caller< 7, T, D, BrdColWrap>,
302                 column_filter::caller< 8, T, D, BrdColWrap>,
303                 column_filter::caller< 9, T, D, BrdColWrap>,
304                 column_filter::caller<10, T, D, BrdColWrap>,
305                 column_filter::caller<11, T, D, BrdColWrap>,
306                 column_filter::caller<12, T, D, BrdColWrap>,
307                 column_filter::caller<13, T, D, BrdColWrap>,
308                 column_filter::caller<14, T, D, BrdColWrap>,
309                 column_filter::caller<15, T, D, BrdColWrap>,
310                 column_filter::caller<16, T, D, BrdColWrap>,
311                 column_filter::caller<17, T, D, BrdColWrap>,
312                 column_filter::caller<18, T, D, BrdColWrap>,
313                 column_filter::caller<19, T, D, BrdColWrap>,
314                 column_filter::caller<20, T, D, BrdColWrap>,
315                 column_filter::caller<21, T, D, BrdColWrap>,
316                 column_filter::caller<22, T, D, BrdColWrap>,
317                 column_filter::caller<23, T, D, BrdColWrap>,
318                 column_filter::caller<24, T, D, BrdColWrap>,
319                 column_filter::caller<25, T, D, BrdColWrap>,
320                 column_filter::caller<26, T, D, BrdColWrap>,
321                 column_filter::caller<27, T, D, BrdColWrap>,
322                 column_filter::caller<28, T, D, BrdColWrap>,
323                 column_filter::caller<29, T, D, BrdColWrap>,
324                 column_filter::caller<30, T, D, BrdColWrap>,
325                 column_filter::caller<31, T, D, BrdColWrap>,
326                 column_filter::caller<32, T, D, BrdColWrap>
327             },
328             {
329                 0,
330                 column_filter::caller< 1, T, D, BrdColReflect101>,
331                 column_filter::caller< 2, T, D, BrdColReflect101>,
332                 column_filter::caller< 3, T, D, BrdColReflect101>,
333                 column_filter::caller< 4, T, D, BrdColReflect101>,
334                 column_filter::caller< 5, T, D, BrdColReflect101>,
335                 column_filter::caller< 6, T, D, BrdColReflect101>,
336                 column_filter::caller< 7, T, D, BrdColReflect101>,
337                 column_filter::caller< 8, T, D, BrdColReflect101>,
338                 column_filter::caller< 9, T, D, BrdColReflect101>,
339                 column_filter::caller<10, T, D, BrdColReflect101>,
340                 column_filter::caller<11, T, D, BrdColReflect101>,
341                 column_filter::caller<12, T, D, BrdColReflect101>,
342                 column_filter::caller<13, T, D, BrdColReflect101>,
343                 column_filter::caller<14, T, D, BrdColReflect101>,
344                 column_filter::caller<15, T, D, BrdColReflect101>,
345                 column_filter::caller<16, T, D, BrdColReflect101>,
346                 column_filter::caller<17, T, D, BrdColReflect101>,
347                 column_filter::caller<18, T, D, BrdColReflect101>,
348                 column_filter::caller<19, T, D, BrdColReflect101>,
349                 column_filter::caller<20, T, D, BrdColReflect101>,
350                 column_filter::caller<21, T, D, BrdColReflect101>,
351                 column_filter::caller<22, T, D, BrdColReflect101>,
352                 column_filter::caller<23, T, D, BrdColReflect101>,
353                 column_filter::caller<24, T, D, BrdColReflect101>,
354                 column_filter::caller<25, T, D, BrdColReflect101>,
355                 column_filter::caller<26, T, D, BrdColReflect101>,
356                 column_filter::caller<27, T, D, BrdColReflect101>,
357                 column_filter::caller<28, T, D, BrdColReflect101>,
358                 column_filter::caller<29, T, D, BrdColReflect101>,
359                 column_filter::caller<30, T, D, BrdColReflect101>,
360                 column_filter::caller<31, T, D, BrdColReflect101>,
361                 column_filter::caller<32, T, D, BrdColReflect101>
362             }
363         };
364 
365         if (stream == 0)
366             cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
367         else
368             cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
369 
370         callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
371     }
372 }
373