/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "lbp.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace lbp
    {
        struct LBP
        {
            __host__ __device__ __forceinline__ LBP() {}

            // Evaluates an LBP feature on the integral image. ty is the linear offset of the
            // top-left sample, fw the cell width and fh the cell height pre-multiplied by the
            // row pitch. The 3x3 cell sums s0..s8 are compared against the central cell s4:
            // the three top-row comparison bits are returned, the remaining five go to shift.
            __device__ __forceinline__ int operator() (const int* integral, int ty, int fh, int fw, int& shift) const
            {
                int anchors[9];

                anchors[0]  = integral[ty];
                anchors[1]  = integral[ty + fw];
                anchors[0] -= anchors[1];
                anchors[2]  = integral[ty + fw * 2];
                anchors[1] -= anchors[2];
                anchors[2] -= integral[ty + fw * 3];

                ty += fh;
                anchors[3]  = integral[ty];
                anchors[4]  = integral[ty + fw];
                anchors[3] -= anchors[4];
                anchors[5]  = integral[ty + fw * 2];
                anchors[4] -= anchors[5];
                anchors[5] -= integral[ty + fw * 3];

                anchors[0] -= anchors[3];
                anchors[1] -= anchors[4];
                anchors[2] -= anchors[5];
                // 0 - 2 contain s0 - s2

                ty += fh;
                anchors[6]  = integral[ty];
                anchors[7]  = integral[ty + fw];
                anchors[6] -= anchors[7];
                anchors[8]  = integral[ty + fw * 2];
                anchors[7] -= anchors[8];
                anchors[8] -= integral[ty + fw * 3];

                anchors[3] -= anchors[6];
                anchors[4] -= anchors[7];
                anchors[5] -= anchors[8];
                // 3 - 5 contain s3 - s5

                anchors[0] -= anchors[4];
                anchors[1] -= anchors[4];
                anchors[2] -= anchors[4];
                anchors[3] -= anchors[4];
                anchors[5] -= anchors[4];

                int response = (~(anchors[0] >> 31)) & 4;
                response |= (~(anchors[1] >> 31)) & 2;
                response |= (~(anchors[2] >> 31)) & 1;

                shift  = (~(anchors[5] >> 31)) & 16;
                shift |= (~(anchors[3] >> 31)) & 1;

                ty += fh;
                anchors[0]  = integral[ty];
                anchors[1]  = integral[ty + fw];
                anchors[0] -= anchors[1];
                anchors[2]  = integral[ty + fw * 2];
                anchors[1] -= anchors[2];
                anchors[2] -= integral[ty + fw * 3];

                anchors[6] -= anchors[0];
                anchors[7] -= anchors[1];
                anchors[8] -= anchors[2];
                // 6 - 8 contain s6 - s8

                anchors[6] -= anchors[4];
                anchors[7] -= anchors[4];
                anchors[8] -= anchors[4];

                shift |= (~(anchors[6] >> 31)) & 2;
                shift |= (~(anchors[7] >> 31)) & 4;
                shift |= (~(anchors[8] >> 31)) & 8;
                return response;
            }
        };

        // Groups candidate rectangles: rectangles falling into the same equivalence class
        // (as defined by the grouping predicate) are averaged, and classes with at least
        // groupThreshold members are emitted. Launched as a single block, one thread per candidate.
        template<typename Pr>
        __global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
        {
            unsigned int tid = threadIdx.x;
            extern __shared__ int sbuff[];

            int* labels = sbuff;
            int* rrects = sbuff + n;

            Pr predicate(grouping_eps);
            partition(candidates, n, labels, predicate);

            rrects[tid * 4 + 0] = 0;
            rrects[tid * 4 + 1] = 0;
            rrects[tid * 4 + 2] = 0;
            rrects[tid * 4 + 3] = 0;
            __syncthreads();

            int cls = labels[tid];
            Emulation::smem::atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);

            __syncthreads();
            labels[tid] = 0;

            __syncthreads();
            Emulation::smem::atomicInc((unsigned int*)labels + cls, n);

            __syncthreads();
            *nclasses = 0;

            int active = labels[tid];
            if (active)
            {
                int* r1 = rrects + tid * 4;
                float s = 1.f / active;
                r1[0] = saturate_cast<int>(r1[0] * s);
                r1[1] = saturate_cast<int>(r1[1] * s);
                r1[2] = saturate_cast<int>(r1[2] * s);
                r1[3] = saturate_cast<int>(r1[3] * s);
            }
            __syncthreads();

            if (active && active >= groupThreshold)
            {
                int* r1 = rrects + tid * 4;
                int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);

                int aidx = Emulation::smem::atomicInc(nclasses, n);
                objects[aidx] = r_out;
            }
        }

        void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
        {
            if (!ncandidates) return;
            int block = ncandidates;
            int smem  = block * ( sizeof(int) + sizeof(int4) );
            disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
            cudaSafeCall( cudaGetLastError() );
        }

        struct Cascade
        {
            __host__ __device__ __forceinline__ Cascade(const Stage* _stages, int _nstages, const ClNode* _nodes, const float* _leaves,
                                                        const int* _subsets, const uchar4* _features, int _subsetSize)

            : stages(_stages), nstages(_nstages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), subsetSize(_subsetSize){}

            // Runs the full stage cascade at window position (x, y); returns true only if
            // every stage accepts the window.
            __device__ __forceinline__ bool operator() (int y, int x, int* integral, const int pitch) const
            {
                int current_node = 0;
                int current_leave = 0;

                for (int s = 0; s < nstages; ++s)
                {
                    float sum = 0;
                    Stage stage = stages[s];
                    for (int t = 0; t < stage.ntrees; t++)
                    {
                        ClNode node = nodes[current_node];
                        uchar4 feature = features[node.featureIdx];

                        int shift;
                        int c = evaluator(integral, (y + feature.y) * pitch + x + feature.x, feature.w * pitch, feature.z, shift);
                        // The response c selects one 32-bit word of the node's subset table and
                        // shift selects the bit inside it that decides which of the two leaves is taken.
                        int idx = (subsets[current_node * subsetSize + c] & (1 << shift)) ? current_leave : current_leave + 1;
                        sum += leaves[idx];

                        current_node += 1;
                        current_leave += 2;
                    }

                    if (sum < stage.threshold)
                        return false;
                }

                return true;
            }

            const Stage* stages;
            const int nstages;

            const ClNode* nodes;
            const float* leaves;
            const int* subsets;
            const uchar4* features;

            const int subsetSize;
            const LBP evaluator;
        };

        // stepShift, scale, width_k, sum_prev => y = sum_prev + tid_k / width_k, x = tid_k - tid_k / width_k
        __global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor,
                                    const int total, int* integral, const int pitch, PtrStepSz<int4> objects, unsigned int* classified)
        {
            int ftid = blockIdx.x * blockDim.x + threadIdx.x;
            if (ftid >= total) return;

            int step = (scale <= 2.f);

            int windowsForLine = (__float2int_rn( __fdividef(frameW, scale)) - windowW) >> step;
            int stotal = windowsForLine * ( (__float2int_rn( __fdividef(frameH, scale)) - windowH) >> step);
            int wshift = 0;

            int scaleTid = ftid;

            // Walk down the scale pyramid until the flat thread id falls inside the current
            // layer's window grid; wshift accumulates the horizontal offset of that layer
            // inside the packed multi-scale integral image.
            while (scaleTid >= stotal)
            {
                scaleTid -= stotal;
                wshift += __float2int_rn(__fdividef(frameW, scale)) + 1;
                scale *= factor;
                step = (scale <= 2.f);
                windowsForLine = (__float2int_rn(__fdividef(frameW, scale)) - windowW) >> step;
                stotal = windowsForLine * ( (__float2int_rn(__fdividef(frameH, scale)) - windowH) >> step);
            }

            int y = __fdividef(scaleTid, windowsForLine);
            int x = scaleTid - y * windowsForLine;

            x <<= step;
            y <<= step;

            if (cascade(y, x + wshift, integral, pitch))
            {
                if (x >= __float2int_rn(__fdividef(frameW, scale)) - windowW) return;

                int4 rect;
                rect.x = __float2int_rn(x * scale);
                rect.y = __float2int_rn(y * scale);
                rect.z = __float2int_rn(windowW * scale);
                rect.w = __float2int_rn(windowH * scale);

                int res = atomicInc(classified, (unsigned int)objects.cols);
                objects(0, res) = rect;
            }
        }

        // Launches one thread per sliding window over all pyramid layers (workAmount windows total).
        void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount,
                             const PtrStepSzb& mstages, const int nstages, const PtrStepSzi& mnodes, const PtrStepSzf& mleaves, const PtrStepSzi& msubsets, const PtrStepSzb& mfeatures,
                             const int subsetSize, PtrStepSz<int4> objects, unsigned int* classified, PtrStepSzi integral)
        {
            const int block = 128;
            int grid = divUp(workAmount, block);
            cudaFuncSetCacheConfig(lbp_cascade, cudaFuncCachePreferL1);
            Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
            lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), (int)integral.step / sizeof(int), objects, classified);
        }
    }
}}}

#endif /* CUDA_DISABLER */
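
// Usage sketch (illustrative only, not part of the original sources): the host side is
// expected to upload the cascade tables and the packed multi-scale integral image, then
// chain the two entry points defined above. Buffer names such as d_candidates, d_objects
// and the counter buffers are assumptions made for this example, not actual OpenCV host code.
//
//     // 1) scan every window of every pyramid layer, appending hits to d_candidates
//     classifyPyramid(frameW, frameH, windowW, windowH, initialScale, factor, workAmount,
//                     mstages, nstages, mnodes, mleaves, msubsets, mfeatures, subsetSize,
//                     d_candidates, d_classified, d_integral);
//
//     // 2) read the candidate count back from d_classified, then group overlapping
//     //    rectangles into final detections (at least groupThreshold members per class)
//     connectedConmonents(d_candidates, ncandidates, d_objects,
//                         groupThreshold, grouping_eps, d_nclasses);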