• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #include "opencv2/core/cuda/common.hpp"
44 #include "opencv2/core/cuda/saturate_cast.hpp"
45 #include "opencv2/core/cuda/vec_math.hpp"
46 #include "opencv2/core/cuda/border_interpolate.hpp"
47 
48 using namespace cv::cuda;
49 using namespace cv::cuda::device;
50 
51 namespace row_filter
52 {
53     #define MAX_KERNEL_SIZE 32
54 
55     __constant__ float c_kernel[MAX_KERNEL_SIZE];
56 
57     template <int KSIZE, typename T, typename D, typename B>
linearRowFilter(const PtrStepSz<T> src,PtrStep<D> dst,const int anchor,const B brd)58     __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
59     {
60         #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
61             const int BLOCK_DIM_X = 32;
62             const int BLOCK_DIM_Y = 8;
63             const int PATCH_PER_BLOCK = 4;
64             const int HALO_SIZE = 1;
65         #else
66             const int BLOCK_DIM_X = 32;
67             const int BLOCK_DIM_Y = 4;
68             const int PATCH_PER_BLOCK = 4;
69             const int HALO_SIZE = 1;
70         #endif
71 
72         typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
73 
74         __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];
75 
76         const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
77 
78         if (y >= src.rows)
79             return;
80 
81         const T* src_row = src.ptr(y);
82 
83         const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;
84 
85         if (blockIdx.x > 0)
86         {
87             //Load left halo
88             #pragma unroll
89             for (int j = 0; j < HALO_SIZE; ++j)
90                 smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);
91         }
92         else
93         {
94             //Load left halo
95             #pragma unroll
96             for (int j = 0; j < HALO_SIZE; ++j)
97                 smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));
98         }
99 
100         if (blockIdx.x + 2 < gridDim.x)
101         {
102             //Load main data
103             #pragma unroll
104             for (int j = 0; j < PATCH_PER_BLOCK; ++j)
105                 smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);
106 
107             //Load right halo
108             #pragma unroll
109             for (int j = 0; j < HALO_SIZE; ++j)
110                 smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);
111         }
112         else
113         {
114             //Load main data
115             #pragma unroll
116             for (int j = 0; j < PATCH_PER_BLOCK; ++j)
117                 smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));
118 
119             //Load right halo
120             #pragma unroll
121             for (int j = 0; j < HALO_SIZE; ++j)
122                 smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));
123         }
124 
125         __syncthreads();
126 
127         #pragma unroll
128         for (int j = 0; j < PATCH_PER_BLOCK; ++j)
129         {
130             const int x = xStart + j * BLOCK_DIM_X;
131 
132             if (x < src.cols)
133             {
134                 sum_t sum = VecTraits<sum_t>::all(0);
135 
136                 #pragma unroll
137                 for (int k = 0; k < KSIZE; ++k)
138                     sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
139 
140                 dst(y, x) = saturate_cast<D>(sum);
141             }
142         }
143     }
144 
145     template <int KSIZE, typename T, typename D, template<typename> class B>
caller(PtrStepSz<T> src,PtrStepSz<D> dst,int anchor,int cc,cudaStream_t stream)146     void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
147     {
148         int BLOCK_DIM_X;
149         int BLOCK_DIM_Y;
150         int PATCH_PER_BLOCK;
151 
152         if (cc >= 20)
153         {
154             BLOCK_DIM_X = 32;
155             BLOCK_DIM_Y = 8;
156             PATCH_PER_BLOCK = 4;
157         }
158         else
159         {
160             BLOCK_DIM_X = 32;
161             BLOCK_DIM_Y = 4;
162             PATCH_PER_BLOCK = 4;
163         }
164 
165         const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
166         const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));
167 
168         B<T> brd(src.cols);
169 
170         linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
171         cudaSafeCall( cudaGetLastError() );
172 
173         if (stream == 0)
174             cudaSafeCall( cudaDeviceSynchronize() );
175     }
176 }
177 
178 namespace filter
179 {
180     template <typename T, typename D>
linearRow(PtrStepSzb src,PtrStepSzb dst,const float * kernel,int ksize,int anchor,int brd_type,int cc,cudaStream_t stream)181     void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
182     {
183         typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
184 
185         static const caller_t callers[5][33] =
186         {
187             {
188                 0,
189                 row_filter::caller< 1, T, D, BrdRowConstant>,
190                 row_filter::caller< 2, T, D, BrdRowConstant>,
191                 row_filter::caller< 3, T, D, BrdRowConstant>,
192                 row_filter::caller< 4, T, D, BrdRowConstant>,
193                 row_filter::caller< 5, T, D, BrdRowConstant>,
194                 row_filter::caller< 6, T, D, BrdRowConstant>,
195                 row_filter::caller< 7, T, D, BrdRowConstant>,
196                 row_filter::caller< 8, T, D, BrdRowConstant>,
197                 row_filter::caller< 9, T, D, BrdRowConstant>,
198                 row_filter::caller<10, T, D, BrdRowConstant>,
199                 row_filter::caller<11, T, D, BrdRowConstant>,
200                 row_filter::caller<12, T, D, BrdRowConstant>,
201                 row_filter::caller<13, T, D, BrdRowConstant>,
202                 row_filter::caller<14, T, D, BrdRowConstant>,
203                 row_filter::caller<15, T, D, BrdRowConstant>,
204                 row_filter::caller<16, T, D, BrdRowConstant>,
205                 row_filter::caller<17, T, D, BrdRowConstant>,
206                 row_filter::caller<18, T, D, BrdRowConstant>,
207                 row_filter::caller<19, T, D, BrdRowConstant>,
208                 row_filter::caller<20, T, D, BrdRowConstant>,
209                 row_filter::caller<21, T, D, BrdRowConstant>,
210                 row_filter::caller<22, T, D, BrdRowConstant>,
211                 row_filter::caller<23, T, D, BrdRowConstant>,
212                 row_filter::caller<24, T, D, BrdRowConstant>,
213                 row_filter::caller<25, T, D, BrdRowConstant>,
214                 row_filter::caller<26, T, D, BrdRowConstant>,
215                 row_filter::caller<27, T, D, BrdRowConstant>,
216                 row_filter::caller<28, T, D, BrdRowConstant>,
217                 row_filter::caller<29, T, D, BrdRowConstant>,
218                 row_filter::caller<30, T, D, BrdRowConstant>,
219                 row_filter::caller<31, T, D, BrdRowConstant>,
220                 row_filter::caller<32, T, D, BrdRowConstant>
221             },
222             {
223                 0,
224                 row_filter::caller< 1, T, D, BrdRowReplicate>,
225                 row_filter::caller< 2, T, D, BrdRowReplicate>,
226                 row_filter::caller< 3, T, D, BrdRowReplicate>,
227                 row_filter::caller< 4, T, D, BrdRowReplicate>,
228                 row_filter::caller< 5, T, D, BrdRowReplicate>,
229                 row_filter::caller< 6, T, D, BrdRowReplicate>,
230                 row_filter::caller< 7, T, D, BrdRowReplicate>,
231                 row_filter::caller< 8, T, D, BrdRowReplicate>,
232                 row_filter::caller< 9, T, D, BrdRowReplicate>,
233                 row_filter::caller<10, T, D, BrdRowReplicate>,
234                 row_filter::caller<11, T, D, BrdRowReplicate>,
235                 row_filter::caller<12, T, D, BrdRowReplicate>,
236                 row_filter::caller<13, T, D, BrdRowReplicate>,
237                 row_filter::caller<14, T, D, BrdRowReplicate>,
238                 row_filter::caller<15, T, D, BrdRowReplicate>,
239                 row_filter::caller<16, T, D, BrdRowReplicate>,
240                 row_filter::caller<17, T, D, BrdRowReplicate>,
241                 row_filter::caller<18, T, D, BrdRowReplicate>,
242                 row_filter::caller<19, T, D, BrdRowReplicate>,
243                 row_filter::caller<20, T, D, BrdRowReplicate>,
244                 row_filter::caller<21, T, D, BrdRowReplicate>,
245                 row_filter::caller<22, T, D, BrdRowReplicate>,
246                 row_filter::caller<23, T, D, BrdRowReplicate>,
247                 row_filter::caller<24, T, D, BrdRowReplicate>,
248                 row_filter::caller<25, T, D, BrdRowReplicate>,
249                 row_filter::caller<26, T, D, BrdRowReplicate>,
250                 row_filter::caller<27, T, D, BrdRowReplicate>,
251                 row_filter::caller<28, T, D, BrdRowReplicate>,
252                 row_filter::caller<29, T, D, BrdRowReplicate>,
253                 row_filter::caller<30, T, D, BrdRowReplicate>,
254                 row_filter::caller<31, T, D, BrdRowReplicate>,
255                 row_filter::caller<32, T, D, BrdRowReplicate>
256             },
257             {
258                 0,
259                 row_filter::caller< 1, T, D, BrdRowReflect>,
260                 row_filter::caller< 2, T, D, BrdRowReflect>,
261                 row_filter::caller< 3, T, D, BrdRowReflect>,
262                 row_filter::caller< 4, T, D, BrdRowReflect>,
263                 row_filter::caller< 5, T, D, BrdRowReflect>,
264                 row_filter::caller< 6, T, D, BrdRowReflect>,
265                 row_filter::caller< 7, T, D, BrdRowReflect>,
266                 row_filter::caller< 8, T, D, BrdRowReflect>,
267                 row_filter::caller< 9, T, D, BrdRowReflect>,
268                 row_filter::caller<10, T, D, BrdRowReflect>,
269                 row_filter::caller<11, T, D, BrdRowReflect>,
270                 row_filter::caller<12, T, D, BrdRowReflect>,
271                 row_filter::caller<13, T, D, BrdRowReflect>,
272                 row_filter::caller<14, T, D, BrdRowReflect>,
273                 row_filter::caller<15, T, D, BrdRowReflect>,
274                 row_filter::caller<16, T, D, BrdRowReflect>,
275                 row_filter::caller<17, T, D, BrdRowReflect>,
276                 row_filter::caller<18, T, D, BrdRowReflect>,
277                 row_filter::caller<19, T, D, BrdRowReflect>,
278                 row_filter::caller<20, T, D, BrdRowReflect>,
279                 row_filter::caller<21, T, D, BrdRowReflect>,
280                 row_filter::caller<22, T, D, BrdRowReflect>,
281                 row_filter::caller<23, T, D, BrdRowReflect>,
282                 row_filter::caller<24, T, D, BrdRowReflect>,
283                 row_filter::caller<25, T, D, BrdRowReflect>,
284                 row_filter::caller<26, T, D, BrdRowReflect>,
285                 row_filter::caller<27, T, D, BrdRowReflect>,
286                 row_filter::caller<28, T, D, BrdRowReflect>,
287                 row_filter::caller<29, T, D, BrdRowReflect>,
288                 row_filter::caller<30, T, D, BrdRowReflect>,
289                 row_filter::caller<31, T, D, BrdRowReflect>,
290                 row_filter::caller<32, T, D, BrdRowReflect>
291             },
292             {
293                 0,
294                 row_filter::caller< 1, T, D, BrdRowWrap>,
295                 row_filter::caller< 2, T, D, BrdRowWrap>,
296                 row_filter::caller< 3, T, D, BrdRowWrap>,
297                 row_filter::caller< 4, T, D, BrdRowWrap>,
298                 row_filter::caller< 5, T, D, BrdRowWrap>,
299                 row_filter::caller< 6, T, D, BrdRowWrap>,
300                 row_filter::caller< 7, T, D, BrdRowWrap>,
301                 row_filter::caller< 8, T, D, BrdRowWrap>,
302                 row_filter::caller< 9, T, D, BrdRowWrap>,
303                 row_filter::caller<10, T, D, BrdRowWrap>,
304                 row_filter::caller<11, T, D, BrdRowWrap>,
305                 row_filter::caller<12, T, D, BrdRowWrap>,
306                 row_filter::caller<13, T, D, BrdRowWrap>,
307                 row_filter::caller<14, T, D, BrdRowWrap>,
308                 row_filter::caller<15, T, D, BrdRowWrap>,
309                 row_filter::caller<16, T, D, BrdRowWrap>,
310                 row_filter::caller<17, T, D, BrdRowWrap>,
311                 row_filter::caller<18, T, D, BrdRowWrap>,
312                 row_filter::caller<19, T, D, BrdRowWrap>,
313                 row_filter::caller<20, T, D, BrdRowWrap>,
314                 row_filter::caller<21, T, D, BrdRowWrap>,
315                 row_filter::caller<22, T, D, BrdRowWrap>,
316                 row_filter::caller<23, T, D, BrdRowWrap>,
317                 row_filter::caller<24, T, D, BrdRowWrap>,
318                 row_filter::caller<25, T, D, BrdRowWrap>,
319                 row_filter::caller<26, T, D, BrdRowWrap>,
320                 row_filter::caller<27, T, D, BrdRowWrap>,
321                 row_filter::caller<28, T, D, BrdRowWrap>,
322                 row_filter::caller<29, T, D, BrdRowWrap>,
323                 row_filter::caller<30, T, D, BrdRowWrap>,
324                 row_filter::caller<31, T, D, BrdRowWrap>,
325                 row_filter::caller<32, T, D, BrdRowWrap>
326             },
327             {
328                 0,
329                 row_filter::caller< 1, T, D, BrdRowReflect101>,
330                 row_filter::caller< 2, T, D, BrdRowReflect101>,
331                 row_filter::caller< 3, T, D, BrdRowReflect101>,
332                 row_filter::caller< 4, T, D, BrdRowReflect101>,
333                 row_filter::caller< 5, T, D, BrdRowReflect101>,
334                 row_filter::caller< 6, T, D, BrdRowReflect101>,
335                 row_filter::caller< 7, T, D, BrdRowReflect101>,
336                 row_filter::caller< 8, T, D, BrdRowReflect101>,
337                 row_filter::caller< 9, T, D, BrdRowReflect101>,
338                 row_filter::caller<10, T, D, BrdRowReflect101>,
339                 row_filter::caller<11, T, D, BrdRowReflect101>,
340                 row_filter::caller<12, T, D, BrdRowReflect101>,
341                 row_filter::caller<13, T, D, BrdRowReflect101>,
342                 row_filter::caller<14, T, D, BrdRowReflect101>,
343                 row_filter::caller<15, T, D, BrdRowReflect101>,
344                 row_filter::caller<16, T, D, BrdRowReflect101>,
345                 row_filter::caller<17, T, D, BrdRowReflect101>,
346                 row_filter::caller<18, T, D, BrdRowReflect101>,
347                 row_filter::caller<19, T, D, BrdRowReflect101>,
348                 row_filter::caller<20, T, D, BrdRowReflect101>,
349                 row_filter::caller<21, T, D, BrdRowReflect101>,
350                 row_filter::caller<22, T, D, BrdRowReflect101>,
351                 row_filter::caller<23, T, D, BrdRowReflect101>,
352                 row_filter::caller<24, T, D, BrdRowReflect101>,
353                 row_filter::caller<25, T, D, BrdRowReflect101>,
354                 row_filter::caller<26, T, D, BrdRowReflect101>,
355                 row_filter::caller<27, T, D, BrdRowReflect101>,
356                 row_filter::caller<28, T, D, BrdRowReflect101>,
357                 row_filter::caller<29, T, D, BrdRowReflect101>,
358                 row_filter::caller<30, T, D, BrdRowReflect101>,
359                 row_filter::caller<31, T, D, BrdRowReflect101>,
360                 row_filter::caller<32, T, D, BrdRowReflect101>
361             }
362         };
363 
364         if (stream == 0)
365             cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
366         else
367             cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
368 
369         callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
370     }
371 }
372