1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15 // Third party copyrights are property of their respective owners. 16 // 17 // Redistribution and use in source and binary forms, with or without modification, 18 // are permitted provided that the following conditions are met: 19 // 20 // * Redistribution's of source code must retain the above copyright notice, 21 // this list of conditions and the following disclaimer. 22 // 23 // * Redistribution's in binary form must reproduce the above copyright notice, 24 // this list of conditions and the following disclaimer in the documentation 25 // and/or other materials provided with the distribution. 26 // 27 // * The name of the copyright holders may not be used to endorse or promote products 28 // derived from this software without specific prior written permission. 29 // 30 // This software is provided by the copyright holders and contributors "as is" and 31 // any express or implied warranties, including, but not limited to, the implied 32 // warranties of merchantability and fitness for a particular purpose are disclaimed. 33 // In no event shall the Intel Corporation or contributors be liable for any direct, 34 // indirect, incidental, special, exemplary, or consequential damages 35 // (including, but not limited to, procurement of substitute goods or services; 36 // loss of use, data, or profits; or business interruption) however caused 37 // and on any theory of liability, whether in contract, strict liability, 38 // or tort (including negligence or otherwise) arising in any way out of 39 // the use of this software, even if advised of the possibility of such damage. 40 // 41 //M*/ 42 43 #include "opencv2/core/cuda/common.hpp" 44 #include "opencv2/core/cuda/saturate_cast.hpp" 45 #include "opencv2/core/cuda/vec_math.hpp" 46 #include "opencv2/core/cuda/border_interpolate.hpp" 47 48 using namespace cv::cuda; 49 using namespace cv::cuda::device; 50 51 namespace row_filter 52 { 53 #define MAX_KERNEL_SIZE 32 54 55 __constant__ float c_kernel[MAX_KERNEL_SIZE]; 56 57 template <int KSIZE, typename T, typename D, typename B> linearRowFilter(const PtrStepSz<T> src,PtrStep<D> dst,const int anchor,const B brd)58 __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd) 59 { 60 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) 61 const int BLOCK_DIM_X = 32; 62 const int BLOCK_DIM_Y = 8; 63 const int PATCH_PER_BLOCK = 4; 64 const int HALO_SIZE = 1; 65 #else 66 const int BLOCK_DIM_X = 32; 67 const int BLOCK_DIM_Y = 4; 68 const int PATCH_PER_BLOCK = 4; 69 const int HALO_SIZE = 1; 70 #endif 71 72 typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; 73 74 __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X]; 75 76 const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; 77 78 if (y >= src.rows) 79 return; 80 81 const T* src_row = src.ptr(y); 82 83 const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x; 84 85 if (blockIdx.x > 0) 86 { 87 //Load left halo 88 #pragma unroll 89 for (int j = 0; j < HALO_SIZE; ++j) 90 smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]); 91 } 92 else 93 { 94 //Load left halo 95 #pragma unroll 96 for (int j = 0; j < HALO_SIZE; ++j) 97 smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row)); 98 } 99 100 if (blockIdx.x + 2 < gridDim.x) 101 { 102 //Load main data 103 #pragma unroll 104 for (int j = 0; j < PATCH_PER_BLOCK; ++j) 105 smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]); 106 107 //Load right halo 108 #pragma unroll 109 for (int j = 0; j < HALO_SIZE; ++j) 110 smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]); 111 } 112 else 113 { 114 //Load main data 115 #pragma unroll 116 for (int j = 0; j < PATCH_PER_BLOCK; ++j) 117 smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row)); 118 119 //Load right halo 120 #pragma unroll 121 for (int j = 0; j < HALO_SIZE; ++j) 122 smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row)); 123 } 124 125 __syncthreads(); 126 127 #pragma unroll 128 for (int j = 0; j < PATCH_PER_BLOCK; ++j) 129 { 130 const int x = xStart + j * BLOCK_DIM_X; 131 132 if (x < src.cols) 133 { 134 sum_t sum = VecTraits<sum_t>::all(0); 135 136 #pragma unroll 137 for (int k = 0; k < KSIZE; ++k) 138 sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k]; 139 140 dst(y, x) = saturate_cast<D>(sum); 141 } 142 } 143 } 144 145 template <int KSIZE, typename T, typename D, template<typename> class B> caller(PtrStepSz<T> src,PtrStepSz<D> dst,int anchor,int cc,cudaStream_t stream)146 void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream) 147 { 148 int BLOCK_DIM_X; 149 int BLOCK_DIM_Y; 150 int PATCH_PER_BLOCK; 151 152 if (cc >= 20) 153 { 154 BLOCK_DIM_X = 32; 155 BLOCK_DIM_Y = 8; 156 PATCH_PER_BLOCK = 4; 157 } 158 else 159 { 160 BLOCK_DIM_X = 32; 161 BLOCK_DIM_Y = 4; 162 PATCH_PER_BLOCK = 4; 163 } 164 165 const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); 166 const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y)); 167 168 B<T> brd(src.cols); 169 170 linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd); 171 cudaSafeCall( cudaGetLastError() ); 172 173 if (stream == 0) 174 cudaSafeCall( cudaDeviceSynchronize() ); 175 } 176 } 177 178 namespace filter 179 { 180 template <typename T, typename D> linearRow(PtrStepSzb src,PtrStepSzb dst,const float * kernel,int ksize,int anchor,int brd_type,int cc,cudaStream_t stream)181 void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) 182 { 183 typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream); 184 185 static const caller_t callers[5][33] = 186 { 187 { 188 0, 189 row_filter::caller< 1, T, D, BrdRowConstant>, 190 row_filter::caller< 2, T, D, BrdRowConstant>, 191 row_filter::caller< 3, T, D, BrdRowConstant>, 192 row_filter::caller< 4, T, D, BrdRowConstant>, 193 row_filter::caller< 5, T, D, BrdRowConstant>, 194 row_filter::caller< 6, T, D, BrdRowConstant>, 195 row_filter::caller< 7, T, D, BrdRowConstant>, 196 row_filter::caller< 8, T, D, BrdRowConstant>, 197 row_filter::caller< 9, T, D, BrdRowConstant>, 198 row_filter::caller<10, T, D, BrdRowConstant>, 199 row_filter::caller<11, T, D, BrdRowConstant>, 200 row_filter::caller<12, T, D, BrdRowConstant>, 201 row_filter::caller<13, T, D, BrdRowConstant>, 202 row_filter::caller<14, T, D, BrdRowConstant>, 203 row_filter::caller<15, T, D, BrdRowConstant>, 204 row_filter::caller<16, T, D, BrdRowConstant>, 205 row_filter::caller<17, T, D, BrdRowConstant>, 206 row_filter::caller<18, T, D, BrdRowConstant>, 207 row_filter::caller<19, T, D, BrdRowConstant>, 208 row_filter::caller<20, T, D, BrdRowConstant>, 209 row_filter::caller<21, T, D, BrdRowConstant>, 210 row_filter::caller<22, T, D, BrdRowConstant>, 211 row_filter::caller<23, T, D, BrdRowConstant>, 212 row_filter::caller<24, T, D, BrdRowConstant>, 213 row_filter::caller<25, T, D, BrdRowConstant>, 214 row_filter::caller<26, T, D, BrdRowConstant>, 215 row_filter::caller<27, T, D, BrdRowConstant>, 216 row_filter::caller<28, T, D, BrdRowConstant>, 217 row_filter::caller<29, T, D, BrdRowConstant>, 218 row_filter::caller<30, T, D, BrdRowConstant>, 219 row_filter::caller<31, T, D, BrdRowConstant>, 220 row_filter::caller<32, T, D, BrdRowConstant> 221 }, 222 { 223 0, 224 row_filter::caller< 1, T, D, BrdRowReplicate>, 225 row_filter::caller< 2, T, D, BrdRowReplicate>, 226 row_filter::caller< 3, T, D, BrdRowReplicate>, 227 row_filter::caller< 4, T, D, BrdRowReplicate>, 228 row_filter::caller< 5, T, D, BrdRowReplicate>, 229 row_filter::caller< 6, T, D, BrdRowReplicate>, 230 row_filter::caller< 7, T, D, BrdRowReplicate>, 231 row_filter::caller< 8, T, D, BrdRowReplicate>, 232 row_filter::caller< 9, T, D, BrdRowReplicate>, 233 row_filter::caller<10, T, D, BrdRowReplicate>, 234 row_filter::caller<11, T, D, BrdRowReplicate>, 235 row_filter::caller<12, T, D, BrdRowReplicate>, 236 row_filter::caller<13, T, D, BrdRowReplicate>, 237 row_filter::caller<14, T, D, BrdRowReplicate>, 238 row_filter::caller<15, T, D, BrdRowReplicate>, 239 row_filter::caller<16, T, D, BrdRowReplicate>, 240 row_filter::caller<17, T, D, BrdRowReplicate>, 241 row_filter::caller<18, T, D, BrdRowReplicate>, 242 row_filter::caller<19, T, D, BrdRowReplicate>, 243 row_filter::caller<20, T, D, BrdRowReplicate>, 244 row_filter::caller<21, T, D, BrdRowReplicate>, 245 row_filter::caller<22, T, D, BrdRowReplicate>, 246 row_filter::caller<23, T, D, BrdRowReplicate>, 247 row_filter::caller<24, T, D, BrdRowReplicate>, 248 row_filter::caller<25, T, D, BrdRowReplicate>, 249 row_filter::caller<26, T, D, BrdRowReplicate>, 250 row_filter::caller<27, T, D, BrdRowReplicate>, 251 row_filter::caller<28, T, D, BrdRowReplicate>, 252 row_filter::caller<29, T, D, BrdRowReplicate>, 253 row_filter::caller<30, T, D, BrdRowReplicate>, 254 row_filter::caller<31, T, D, BrdRowReplicate>, 255 row_filter::caller<32, T, D, BrdRowReplicate> 256 }, 257 { 258 0, 259 row_filter::caller< 1, T, D, BrdRowReflect>, 260 row_filter::caller< 2, T, D, BrdRowReflect>, 261 row_filter::caller< 3, T, D, BrdRowReflect>, 262 row_filter::caller< 4, T, D, BrdRowReflect>, 263 row_filter::caller< 5, T, D, BrdRowReflect>, 264 row_filter::caller< 6, T, D, BrdRowReflect>, 265 row_filter::caller< 7, T, D, BrdRowReflect>, 266 row_filter::caller< 8, T, D, BrdRowReflect>, 267 row_filter::caller< 9, T, D, BrdRowReflect>, 268 row_filter::caller<10, T, D, BrdRowReflect>, 269 row_filter::caller<11, T, D, BrdRowReflect>, 270 row_filter::caller<12, T, D, BrdRowReflect>, 271 row_filter::caller<13, T, D, BrdRowReflect>, 272 row_filter::caller<14, T, D, BrdRowReflect>, 273 row_filter::caller<15, T, D, BrdRowReflect>, 274 row_filter::caller<16, T, D, BrdRowReflect>, 275 row_filter::caller<17, T, D, BrdRowReflect>, 276 row_filter::caller<18, T, D, BrdRowReflect>, 277 row_filter::caller<19, T, D, BrdRowReflect>, 278 row_filter::caller<20, T, D, BrdRowReflect>, 279 row_filter::caller<21, T, D, BrdRowReflect>, 280 row_filter::caller<22, T, D, BrdRowReflect>, 281 row_filter::caller<23, T, D, BrdRowReflect>, 282 row_filter::caller<24, T, D, BrdRowReflect>, 283 row_filter::caller<25, T, D, BrdRowReflect>, 284 row_filter::caller<26, T, D, BrdRowReflect>, 285 row_filter::caller<27, T, D, BrdRowReflect>, 286 row_filter::caller<28, T, D, BrdRowReflect>, 287 row_filter::caller<29, T, D, BrdRowReflect>, 288 row_filter::caller<30, T, D, BrdRowReflect>, 289 row_filter::caller<31, T, D, BrdRowReflect>, 290 row_filter::caller<32, T, D, BrdRowReflect> 291 }, 292 { 293 0, 294 row_filter::caller< 1, T, D, BrdRowWrap>, 295 row_filter::caller< 2, T, D, BrdRowWrap>, 296 row_filter::caller< 3, T, D, BrdRowWrap>, 297 row_filter::caller< 4, T, D, BrdRowWrap>, 298 row_filter::caller< 5, T, D, BrdRowWrap>, 299 row_filter::caller< 6, T, D, BrdRowWrap>, 300 row_filter::caller< 7, T, D, BrdRowWrap>, 301 row_filter::caller< 8, T, D, BrdRowWrap>, 302 row_filter::caller< 9, T, D, BrdRowWrap>, 303 row_filter::caller<10, T, D, BrdRowWrap>, 304 row_filter::caller<11, T, D, BrdRowWrap>, 305 row_filter::caller<12, T, D, BrdRowWrap>, 306 row_filter::caller<13, T, D, BrdRowWrap>, 307 row_filter::caller<14, T, D, BrdRowWrap>, 308 row_filter::caller<15, T, D, BrdRowWrap>, 309 row_filter::caller<16, T, D, BrdRowWrap>, 310 row_filter::caller<17, T, D, BrdRowWrap>, 311 row_filter::caller<18, T, D, BrdRowWrap>, 312 row_filter::caller<19, T, D, BrdRowWrap>, 313 row_filter::caller<20, T, D, BrdRowWrap>, 314 row_filter::caller<21, T, D, BrdRowWrap>, 315 row_filter::caller<22, T, D, BrdRowWrap>, 316 row_filter::caller<23, T, D, BrdRowWrap>, 317 row_filter::caller<24, T, D, BrdRowWrap>, 318 row_filter::caller<25, T, D, BrdRowWrap>, 319 row_filter::caller<26, T, D, BrdRowWrap>, 320 row_filter::caller<27, T, D, BrdRowWrap>, 321 row_filter::caller<28, T, D, BrdRowWrap>, 322 row_filter::caller<29, T, D, BrdRowWrap>, 323 row_filter::caller<30, T, D, BrdRowWrap>, 324 row_filter::caller<31, T, D, BrdRowWrap>, 325 row_filter::caller<32, T, D, BrdRowWrap> 326 }, 327 { 328 0, 329 row_filter::caller< 1, T, D, BrdRowReflect101>, 330 row_filter::caller< 2, T, D, BrdRowReflect101>, 331 row_filter::caller< 3, T, D, BrdRowReflect101>, 332 row_filter::caller< 4, T, D, BrdRowReflect101>, 333 row_filter::caller< 5, T, D, BrdRowReflect101>, 334 row_filter::caller< 6, T, D, BrdRowReflect101>, 335 row_filter::caller< 7, T, D, BrdRowReflect101>, 336 row_filter::caller< 8, T, D, BrdRowReflect101>, 337 row_filter::caller< 9, T, D, BrdRowReflect101>, 338 row_filter::caller<10, T, D, BrdRowReflect101>, 339 row_filter::caller<11, T, D, BrdRowReflect101>, 340 row_filter::caller<12, T, D, BrdRowReflect101>, 341 row_filter::caller<13, T, D, BrdRowReflect101>, 342 row_filter::caller<14, T, D, BrdRowReflect101>, 343 row_filter::caller<15, T, D, BrdRowReflect101>, 344 row_filter::caller<16, T, D, BrdRowReflect101>, 345 row_filter::caller<17, T, D, BrdRowReflect101>, 346 row_filter::caller<18, T, D, BrdRowReflect101>, 347 row_filter::caller<19, T, D, BrdRowReflect101>, 348 row_filter::caller<20, T, D, BrdRowReflect101>, 349 row_filter::caller<21, T, D, BrdRowReflect101>, 350 row_filter::caller<22, T, D, BrdRowReflect101>, 351 row_filter::caller<23, T, D, BrdRowReflect101>, 352 row_filter::caller<24, T, D, BrdRowReflect101>, 353 row_filter::caller<25, T, D, BrdRowReflect101>, 354 row_filter::caller<26, T, D, BrdRowReflect101>, 355 row_filter::caller<27, T, D, BrdRowReflect101>, 356 row_filter::caller<28, T, D, BrdRowReflect101>, 357 row_filter::caller<29, T, D, BrdRowReflect101>, 358 row_filter::caller<30, T, D, BrdRowReflect101>, 359 row_filter::caller<31, T, D, BrdRowReflect101>, 360 row_filter::caller<32, T, D, BrdRowReflect101> 361 } 362 }; 363 364 if (stream == 0) 365 cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); 366 else 367 cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); 368 369 callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream); 370 } 371 } 372