1 // This file is auto-generated. Do not edit! 2 3 #include "precomp.hpp" 4 #include "opencl_kernels_objdetect.hpp" 5 6 namespace cv 7 { 8 namespace ocl 9 { 10 namespace objdetect 11 { 12 13 const struct ProgramEntry cascadedetect={"cascadedetect", 14 "#ifdef HAAR\n" 15 "typedef struct __attribute__((aligned(4))) OptHaarFeature\n" 16 "{\n" 17 "int4 ofs[3] __attribute__((aligned (4)));\n" 18 "float4 weight __attribute__((aligned (4)));\n" 19 "}\n" 20 "OptHaarFeature;\n" 21 "#endif\n" 22 "#ifdef LBP\n" 23 "typedef struct __attribute__((aligned(4))) OptLBPFeature\n" 24 "{\n" 25 "int16 ofs __attribute__((aligned (4)));\n" 26 "}\n" 27 "OptLBPFeature;\n" 28 "#endif\n" 29 "typedef struct __attribute__((aligned(4))) Stump\n" 30 "{\n" 31 "float4 st __attribute__((aligned (4)));\n" 32 "}\n" 33 "Stump;\n" 34 "typedef struct __attribute__((aligned(4))) Node\n" 35 "{\n" 36 "int4 n __attribute__((aligned (4)));\n" 37 "}\n" 38 "Node;\n" 39 "typedef struct __attribute__((aligned (4))) Stage\n" 40 "{\n" 41 "int first __attribute__((aligned (4)));\n" 42 "int ntrees __attribute__((aligned (4)));\n" 43 "float threshold __attribute__((aligned (4)));\n" 44 "}\n" 45 "Stage;\n" 46 "typedef struct __attribute__((aligned (4))) ScaleData\n" 47 "{\n" 48 "float scale __attribute__((aligned (4)));\n" 49 "int szi_width __attribute__((aligned (4)));\n" 50 "int szi_height __attribute__((aligned (4)));\n" 51 "int layer_ofs __attribute__((aligned (4)));\n" 52 "int ystep __attribute__((aligned (4)));\n" 53 "}\n" 54 "ScaleData;\n" 55 "#ifndef SUM_BUF_SIZE\n" 56 "#define SUM_BUF_SIZE 0\n" 57 "#endif\n" 58 "#ifndef NODE_COUNT\n" 59 "#define NODE_COUNT 1\n" 60 "#endif\n" 61 "#ifdef HAAR\n" 62 "__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))\n" 63 "void runHaarClassifier(\n" 64 "int nscales, __global const ScaleData* scaleData,\n" 65 "__global const int* sum,\n" 66 "int _sumstep, int sumoffset,\n" 67 "__global const OptHaarFeature* optfeatures,\n" 68 "__global const Stage* stages,\n" 69 "__global const Node* nodes,\n" 70 "__global const float* leaves0,\n" 71 "volatile __global int* facepos,\n" 72 "int4 normrect, int sqofs, int2 windowsize)\n" 73 "{\n" 74 "int lx = get_local_id(0);\n" 75 "int ly = get_local_id(1);\n" 76 "int groupIdx = get_group_id(0);\n" 77 "int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;\n" 78 "int scaleIdx, tileIdx, stageIdx;\n" 79 "int sumstep = (int)(_sumstep/sizeof(int));\n" 80 "int4 nofs0 = (int4)(mad24(normrect.y, sumstep, normrect.x),\n" 81 "mad24(normrect.y, sumstep, normrect.x + normrect.z),\n" 82 "mad24(normrect.y + normrect.w, sumstep, normrect.x),\n" 83 "mad24(normrect.y + normrect.w, sumstep, normrect.x + normrect.z));\n" 84 "int normarea = normrect.z * normrect.w;\n" 85 "float invarea = 1.f/normarea;\n" 86 "int lidx = ly*LOCAL_SIZE_X + lx;\n" 87 "#if SUM_BUF_SIZE > 0\n" 88 "int4 nofs = (int4)(mad24(normrect.y, SUM_BUF_STEP, normrect.x),\n" 89 "mad24(normrect.y, SUM_BUF_STEP, normrect.x + normrect.z),\n" 90 "mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x),\n" 91 "mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x + normrect.z));\n" 92 "#else\n" 93 "int4 nofs = nofs0;\n" 94 "#endif\n" 95 "#define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)\n" 96 "__local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*5/2+1];\n" 97 "#if SUM_BUF_SIZE > 0\n" 98 "__local int* ibuf = lstore;\n" 99 "__local int* lcount = ibuf + SUM_BUF_SIZE;\n" 100 "#else\n" 101 "__local int* lcount = lstore;\n" 102 "#endif\n" 103 "__local float* lnf = (__local float*)(lcount + 1);\n" 104 "__local float* lpartsum = lnf + LOCAL_SIZE;\n" 105 "__local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);\n" 106 "for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )\n" 107 "{\n" 108 "__global const ScaleData* s = scaleData + scaleIdx;\n" 109 "int ystep = s->ystep;\n" 110 "int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));\n" 111 "int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,\n" 112 "(worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);\n" 113 "int totalTiles = ntiles.x*ntiles.y;\n" 114 "for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )\n" 115 "{\n" 116 "int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;\n" 117 "int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;\n" 118 "int ix = lx, iy = ly;\n" 119 "__global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;\n" 120 "__global const int* psum1 = psum0 + mad24(iy, sumstep, ix);\n" 121 "if( ix0 >= worksize.x || iy0 >= worksize.y )\n" 122 "continue;\n" 123 "#if SUM_BUF_SIZE > 0\n" 124 "for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )\n" 125 "{\n" 126 "int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;\n" 127 "vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);\n" 128 "}\n" 129 "#endif\n" 130 "if( lidx == 0 )\n" 131 "lcount[0] = 0;\n" 132 "barrier(CLK_LOCAL_MEM_FENCE);\n" 133 "if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )\n" 134 "{\n" 135 "#if NODE_COUNT==1\n" 136 "__global const Stump* stump = (__global const Stump*)nodes;\n" 137 "#else\n" 138 "__global const Node* node = nodes;\n" 139 "__global const float* leaves = leaves0;\n" 140 "#endif\n" 141 "#if SUM_BUF_SIZE > 0\n" 142 "__local const int* psum = ibuf + mad24(iy, SUM_BUF_STEP, ix);\n" 143 "#else\n" 144 "__global const int* psum = psum1;\n" 145 "#endif\n" 146 "__global const int* psqsum = (__global const int*)(psum1 + sqofs);\n" 147 "float sval = (psum[nofs.x] - psum[nofs.y] - psum[nofs.z] + psum[nofs.w])*invarea;\n" 148 "float sqval = (psqsum[nofs0.x] - psqsum[nofs0.y] - psqsum[nofs0.z] + psqsum[nofs0.w])*invarea;\n" 149 "float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));\n" 150 "nf = nf > 0 ? nf : 1.f;\n" 151 "for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )\n" 152 "{\n" 153 "int ntrees = stages[stageIdx].ntrees;\n" 154 "float s = 0.f;\n" 155 "#if NODE_COUNT==1\n" 156 "for( i = 0; i < ntrees; i++ )\n" 157 "{\n" 158 "float4 st = stump[i].st;\n" 159 "__global const OptHaarFeature* f = optfeatures + as_int(st.x);\n" 160 "float4 weight = f->weight;\n" 161 "int4 ofs = f->ofs[0];\n" 162 "sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;\n" 163 "ofs = f->ofs[1];\n" 164 "sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);\n" 165 "if( weight.z > 0 )\n" 166 "{\n" 167 "ofs = f->ofs[2];\n" 168 "sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);\n" 169 "}\n" 170 "s += (sval < st.y*nf) ? st.z : st.w;\n" 171 "}\n" 172 "stump += ntrees;\n" 173 "#else\n" 174 "for( i = 0; i < ntrees; i++, node += NODE_COUNT, leaves += NODE_COUNT+1 )\n" 175 "{\n" 176 "int idx = 0;\n" 177 "do\n" 178 "{\n" 179 "int4 n = node[idx].n;\n" 180 "__global const OptHaarFeature* f = optfeatures + n.x;\n" 181 "float4 weight = f->weight;\n" 182 "int4 ofs = f->ofs[0];\n" 183 "sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;\n" 184 "ofs = f->ofs[1];\n" 185 "sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);\n" 186 "if( weight.z > 0 )\n" 187 "{\n" 188 "ofs = f->ofs[2];\n" 189 "sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);\n" 190 "}\n" 191 "idx = (sval < as_float(n.y)*nf) ? n.z : n.w;\n" 192 "}\n" 193 "while(idx > 0);\n" 194 "s += leaves[-idx];\n" 195 "}\n" 196 "#endif\n" 197 "if( s < stages[stageIdx].threshold )\n" 198 "break;\n" 199 "}\n" 200 "if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )\n" 201 "{\n" 202 "int count = atomic_inc(lcount);\n" 203 "lbuf[count] = (int)(ix | (iy << 8));\n" 204 "lnf[count] = nf;\n" 205 "}\n" 206 "}\n" 207 "for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )\n" 208 "{\n" 209 "barrier(CLK_LOCAL_MEM_FENCE);\n" 210 "int nrects = lcount[0];\n" 211 "if( nrects == 0 )\n" 212 "break;\n" 213 "barrier(CLK_LOCAL_MEM_FENCE);\n" 214 "if( lidx == 0 )\n" 215 "lcount[0] = 0;\n" 216 "{\n" 217 "#if NODE_COUNT == 1\n" 218 "__global const Stump* stump = (__global const Stump*)nodes + stages[stageIdx].first;\n" 219 "#else\n" 220 "__global const Node* node = nodes + stages[stageIdx].first*NODE_COUNT;\n" 221 "__global const float* leaves = leaves0 + stages[stageIdx].first*(NODE_COUNT+1);\n" 222 "#endif\n" 223 "int nparts = LOCAL_SIZE / nrects;\n" 224 "int ntrees = stages[stageIdx].ntrees;\n" 225 "int ntrees_p = (ntrees + nparts - 1)/nparts;\n" 226 "int nr = lidx / nparts;\n" 227 "int partidx = -1, idxval = 0;\n" 228 "float partsum = 0.f, nf = 0.f;\n" 229 "if( nr < nrects )\n" 230 "{\n" 231 "partidx = lidx % nparts;\n" 232 "idxval = lbuf[nr];\n" 233 "nf = lnf[nr];\n" 234 "{\n" 235 "int ntrees0 = ntrees_p*partidx;\n" 236 "int ntrees1 = min(ntrees0 + ntrees_p, ntrees);\n" 237 "int ix1 = idxval & 255, iy1 = idxval >> 8;\n" 238 "#if SUM_BUF_SIZE > 0\n" 239 "__local const int* psum = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);\n" 240 "#else\n" 241 "__global const int* psum = psum0 + mad24(iy1, sumstep, ix1);\n" 242 "#endif\n" 243 "#if NODE_COUNT == 1\n" 244 "for( i = ntrees0; i < ntrees1; i++ )\n" 245 "{\n" 246 "float4 st = stump[i].st;\n" 247 "__global const OptHaarFeature* f = optfeatures + as_int(st.x);\n" 248 "float4 weight = f->weight;\n" 249 "int4 ofs = f->ofs[0];\n" 250 "float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;\n" 251 "ofs = f->ofs[1];\n" 252 "sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);\n" 253 "if( fabs(weight.z) > 0 )\n" 254 "{\n" 255 "ofs = f->ofs[2];\n" 256 "sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);\n" 257 "}\n" 258 "partsum += (sval < st.y*nf) ? st.z : st.w;\n" 259 "}\n" 260 "#else\n" 261 "for( i = ntrees0; i < ntrees1; i++ )\n" 262 "{\n" 263 "int idx = 0;\n" 264 "do\n" 265 "{\n" 266 "int4 n = node[i*2 + idx].n;\n" 267 "__global const OptHaarFeature* f = optfeatures + n.x;\n" 268 "float4 weight = f->weight;\n" 269 "int4 ofs = f->ofs[0];\n" 270 "float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;\n" 271 "ofs = f->ofs[1];\n" 272 "sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);\n" 273 "if( weight.z > 0 )\n" 274 "{\n" 275 "ofs = f->ofs[2];\n" 276 "sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);\n" 277 "}\n" 278 "idx = (sval < as_float(n.y)*nf) ? n.z : n.w;\n" 279 "}\n" 280 "while(idx > 0);\n" 281 "partsum += leaves[i*3-idx];\n" 282 "}\n" 283 "#endif\n" 284 "}\n" 285 "}\n" 286 "lpartsum[lidx] = partsum;\n" 287 "barrier(CLK_LOCAL_MEM_FENCE);\n" 288 "if( partidx == 0 )\n" 289 "{\n" 290 "float s = lpartsum[nr*nparts];\n" 291 "for( i = 1; i < nparts; i++ )\n" 292 "s += lpartsum[i + nr*nparts];\n" 293 "if( s >= stages[stageIdx].threshold )\n" 294 "{\n" 295 "int count = atomic_inc(lcount);\n" 296 "lbuf[count] = idxval;\n" 297 "lnf[count] = nf;\n" 298 "}\n" 299 "}\n" 300 "}\n" 301 "}\n" 302 "barrier(CLK_LOCAL_MEM_FENCE);\n" 303 "if( stageIdx == N_STAGES )\n" 304 "{\n" 305 "int nrects = lcount[0];\n" 306 "if( lidx < nrects )\n" 307 "{\n" 308 "int nfaces = atomic_inc(facepos);\n" 309 "if( nfaces < MAX_FACES )\n" 310 "{\n" 311 "volatile __global int* face = facepos + 1 + nfaces*3;\n" 312 "int val = lbuf[lidx];\n" 313 "face[0] = scaleIdx;\n" 314 "face[1] = ix0 + (val & 255);\n" 315 "face[2] = iy0 + (val >> 8);\n" 316 "}\n" 317 "}\n" 318 "}\n" 319 "}\n" 320 "}\n" 321 "}\n" 322 "#endif\n" 323 "#ifdef LBP\n" 324 "#undef CALC_SUM_OFS_\n" 325 "#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \\\n" 326 "((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])\n" 327 "__kernel void runLBPClassifierStumpSimple(\n" 328 "int nscales, __global const ScaleData* scaleData,\n" 329 "__global const int* sum,\n" 330 "int _sumstep, int sumoffset,\n" 331 "__global const OptLBPFeature* optfeatures,\n" 332 "__global const Stage* stages,\n" 333 "__global const Stump* stumps,\n" 334 "__global const int* bitsets,\n" 335 "int bitsetSize,\n" 336 "volatile __global int* facepos,\n" 337 "int2 windowsize)\n" 338 "{\n" 339 "int lx = get_local_id(0);\n" 340 "int ly = get_local_id(1);\n" 341 "int local_size_x = get_local_size(0);\n" 342 "int local_size_y = get_local_size(1);\n" 343 "int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n" 344 "int ngroups = get_num_groups(0)*get_num_groups(1);\n" 345 "int scaleIdx, tileIdx, stageIdx;\n" 346 "int sumstep = (int)(_sumstep/sizeof(int));\n" 347 "for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )\n" 348 "{\n" 349 "__global const ScaleData* s = scaleData + scaleIdx;\n" 350 "int ystep = s->ystep;\n" 351 "int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));\n" 352 "int2 ntiles = (int2)((worksize.x/ystep + local_size_x-1)/local_size_x,\n" 353 "(worksize.y/ystep + local_size_y-1)/local_size_y);\n" 354 "int totalTiles = ntiles.x*ntiles.y;\n" 355 "for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )\n" 356 "{\n" 357 "int iy = mad24((tileIdx / ntiles.x), local_size_y, ly) * ystep;\n" 358 "int ix = mad24((tileIdx % ntiles.x), local_size_x, lx) * ystep;\n" 359 "if( ix < worksize.x && iy < worksize.y )\n" 360 "{\n" 361 "__global const int* p = sum + mad24(iy, sumstep, ix) + s->layer_ofs;\n" 362 "__global const Stump* stump = stumps;\n" 363 "__global const int* bitset = bitsets;\n" 364 "for( stageIdx = 0; stageIdx < N_STAGES; stageIdx++ )\n" 365 "{\n" 366 "int i, ntrees = stages[stageIdx].ntrees;\n" 367 "float s = 0.f;\n" 368 "for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )\n" 369 "{\n" 370 "float4 st = stump->st;\n" 371 "__global const OptLBPFeature* f = optfeatures + as_int(st.x);\n" 372 "int16 ofs = f->ofs;\n" 373 "int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );\n" 374 "int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0);\n" 375 "idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0);\n" 376 "idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0);\n" 377 "mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0);\n" 378 "mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);\n" 379 "mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);\n" 380 "mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);\n" 381 "mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);\n" 382 "s += (bitset[idx] & (1 << mask)) ? st.z : st.w;\n" 383 "}\n" 384 "if( s < stages[stageIdx].threshold )\n" 385 "break;\n" 386 "}\n" 387 "if( stageIdx == N_STAGES )\n" 388 "{\n" 389 "int nfaces = atomic_inc(facepos);\n" 390 "if( nfaces < MAX_FACES )\n" 391 "{\n" 392 "volatile __global int* face = facepos + 1 + nfaces*3;\n" 393 "face[0] = scaleIdx;\n" 394 "face[1] = ix;\n" 395 "face[2] = iy;\n" 396 "}\n" 397 "}\n" 398 "}\n" 399 "}\n" 400 "}\n" 401 "}\n" 402 "__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))\n" 403 "void runLBPClassifierStump(\n" 404 "int nscales, __global const ScaleData* scaleData,\n" 405 "__global const int* sum,\n" 406 "int _sumstep, int sumoffset,\n" 407 "__global const OptLBPFeature* optfeatures,\n" 408 "__global const Stage* stages,\n" 409 "__global const Stump* stumps,\n" 410 "__global const int* bitsets,\n" 411 "int bitsetSize,\n" 412 "volatile __global int* facepos,\n" 413 "int2 windowsize)\n" 414 "{\n" 415 "int lx = get_local_id(0);\n" 416 "int ly = get_local_id(1);\n" 417 "int groupIdx = get_group_id(0);\n" 418 "int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;\n" 419 "int scaleIdx, tileIdx, stageIdx;\n" 420 "int sumstep = (int)(_sumstep/sizeof(int));\n" 421 "int lidx = ly*LOCAL_SIZE_X + lx;\n" 422 "#define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)\n" 423 "__local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*3/2+1];\n" 424 "#if SUM_BUF_SIZE > 0\n" 425 "__local int* ibuf = lstore;\n" 426 "__local int* lcount = ibuf + SUM_BUF_SIZE;\n" 427 "#else\n" 428 "__local int* lcount = lstore;\n" 429 "#endif\n" 430 "__local float* lpartsum = (__local float*)(lcount + 1);\n" 431 "__local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);\n" 432 "for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )\n" 433 "{\n" 434 "__global const ScaleData* s = scaleData + scaleIdx;\n" 435 "int ystep = s->ystep;\n" 436 "int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));\n" 437 "int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,\n" 438 "(worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);\n" 439 "int totalTiles = ntiles.x*ntiles.y;\n" 440 "for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )\n" 441 "{\n" 442 "int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;\n" 443 "int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;\n" 444 "int ix = lx, iy = ly;\n" 445 "__global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;\n" 446 "if( ix0 >= worksize.x || iy0 >= worksize.y )\n" 447 "continue;\n" 448 "#if SUM_BUF_SIZE > 0\n" 449 "for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )\n" 450 "{\n" 451 "int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;\n" 452 "vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);\n" 453 "}\n" 454 "barrier(CLK_LOCAL_MEM_FENCE);\n" 455 "#endif\n" 456 "if( lidx == 0 )\n" 457 "lcount[0] = 0;\n" 458 "barrier(CLK_LOCAL_MEM_FENCE);\n" 459 "if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )\n" 460 "{\n" 461 "__global const Stump* stump = stumps;\n" 462 "__global const int* bitset = bitsets;\n" 463 "#if SUM_BUF_SIZE > 0\n" 464 "__local const int* p = ibuf + mad24(iy, SUM_BUF_STEP, ix);\n" 465 "#else\n" 466 "__global const int* p = psum0 + mad24(iy, sumstep, ix);\n" 467 "#endif\n" 468 "for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )\n" 469 "{\n" 470 "int ntrees = stages[stageIdx].ntrees;\n" 471 "float s = 0.f;\n" 472 "for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )\n" 473 "{\n" 474 "float4 st = stump->st;\n" 475 "__global const OptLBPFeature* f = optfeatures + as_int(st.x);\n" 476 "int16 ofs = f->ofs;\n" 477 "int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );\n" 478 "int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0);\n" 479 "idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0);\n" 480 "idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0);\n" 481 "mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0);\n" 482 "mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);\n" 483 "mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);\n" 484 "mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);\n" 485 "mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);\n" 486 "s += (bitset[idx] & (1 << mask)) ? st.z : st.w;\n" 487 "}\n" 488 "if( s < stages[stageIdx].threshold )\n" 489 "break;\n" 490 "}\n" 491 "if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )\n" 492 "{\n" 493 "int count = atomic_inc(lcount);\n" 494 "lbuf[count] = (int)(ix | (iy << 8));\n" 495 "}\n" 496 "}\n" 497 "for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )\n" 498 "{\n" 499 "int nrects = lcount[0];\n" 500 "barrier(CLK_LOCAL_MEM_FENCE);\n" 501 "if( nrects == 0 )\n" 502 "break;\n" 503 "if( lidx == 0 )\n" 504 "lcount[0] = 0;\n" 505 "{\n" 506 "__global const Stump* stump = stumps + stages[stageIdx].first;\n" 507 "__global const int* bitset = bitsets + stages[stageIdx].first*bitsetSize;\n" 508 "int nparts = LOCAL_SIZE / nrects;\n" 509 "int ntrees = stages[stageIdx].ntrees;\n" 510 "int ntrees_p = (ntrees + nparts - 1)/nparts;\n" 511 "int nr = lidx / nparts;\n" 512 "int partidx = -1, idxval = 0;\n" 513 "float partsum = 0.f, nf = 0.f;\n" 514 "if( nr < nrects )\n" 515 "{\n" 516 "partidx = lidx % nparts;\n" 517 "idxval = lbuf[nr];\n" 518 "{\n" 519 "int ntrees0 = ntrees_p*partidx;\n" 520 "int ntrees1 = min(ntrees0 + ntrees_p, ntrees);\n" 521 "int ix1 = idxval & 255, iy1 = idxval >> 8;\n" 522 "#if SUM_BUF_SIZE > 0\n" 523 "__local const int* p = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);\n" 524 "#else\n" 525 "__global const int* p = psum0 + mad24(iy1, sumstep, ix1);\n" 526 "#endif\n" 527 "for( i = ntrees0; i < ntrees1; i++ )\n" 528 "{\n" 529 "float4 st = stump[i].st;\n" 530 "__global const OptLBPFeature* f = optfeatures + as_int(st.x);\n" 531 "int16 ofs = f->ofs;\n" 532 "#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \\\n" 533 "((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])\n" 534 "int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );\n" 535 "int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0);\n" 536 "idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0);\n" 537 "idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0);\n" 538 "mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0);\n" 539 "mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);\n" 540 "mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);\n" 541 "mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);\n" 542 "mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);\n" 543 "partsum += (bitset[i*bitsetSize + idx] & (1 << mask)) ? st.z : st.w;\n" 544 "}\n" 545 "}\n" 546 "}\n" 547 "lpartsum[lidx] = partsum;\n" 548 "barrier(CLK_LOCAL_MEM_FENCE);\n" 549 "if( partidx == 0 )\n" 550 "{\n" 551 "float s = lpartsum[nr*nparts];\n" 552 "for( i = 1; i < nparts; i++ )\n" 553 "s += lpartsum[i + nr*nparts];\n" 554 "if( s >= stages[stageIdx].threshold )\n" 555 "{\n" 556 "int count = atomic_inc(lcount);\n" 557 "lbuf[count] = idxval;\n" 558 "}\n" 559 "}\n" 560 "}\n" 561 "}\n" 562 "barrier(CLK_LOCAL_MEM_FENCE);\n" 563 "if( stageIdx == N_STAGES )\n" 564 "{\n" 565 "int nrects = lcount[0];\n" 566 "if( lidx < nrects )\n" 567 "{\n" 568 "int nfaces = atomic_inc(facepos);\n" 569 "if( nfaces < MAX_FACES )\n" 570 "{\n" 571 "volatile __global int* face = facepos + 1 + nfaces*3;\n" 572 "int val = lbuf[lidx];\n" 573 "face[0] = scaleIdx;\n" 574 "face[1] = ix0 + (val & 255);\n" 575 "face[2] = iy0 + (val >> 8);\n" 576 "}\n" 577 "}\n" 578 "}\n" 579 "}\n" 580 "}\n" 581 "}\n" 582 "#endif\n" 583 , "06c037755dc15a7796c0f3bc7ff05233"}; 584 ProgramSource cascadedetect_oclsrc(cascadedetect.programStr); 585 const struct ProgramEntry objdetect_hog={"objdetect_hog", 586 "#define CELL_WIDTH 8\n" 587 "#define CELL_HEIGHT 8\n" 588 "#define CELLS_PER_BLOCK_X 2\n" 589 "#define CELLS_PER_BLOCK_Y 2\n" 590 "#define NTHREADS 256\n" 591 "#define CV_PI_F M_PI_F\n" 592 "#ifdef INTEL_DEVICE\n" 593 "#define QANGLE_TYPE int\n" 594 "#define QANGLE_TYPE2 int2\n" 595 "#else\n" 596 "#define QANGLE_TYPE uchar\n" 597 "#define QANGLE_TYPE2 uchar2\n" 598 "#endif\n" 599 "__kernel void compute_hists_lut_kernel(\n" 600 "const int cblock_stride_x, const int cblock_stride_y,\n" 601 "const int cnbins, const int cblock_hist_size, const int img_block_width,\n" 602 "const int blocks_in_group, const int blocks_total,\n" 603 "const int grad_quadstep, const int qangle_step,\n" 604 "__global const float* grad, __global const QANGLE_TYPE* qangle,\n" 605 "__global const float* gauss_w_lut,\n" 606 "__global float* block_hists, __local float* smem)\n" 607 "{\n" 608 "const int lx = get_local_id(0);\n" 609 "const int lp = lx / 24; \n" 610 "const int gid = get_group_id(0) * blocks_in_group + lp;\n" 611 "const int gidY = gid / img_block_width;\n" 612 "const int gidX = gid - gidY * img_block_width;\n" 613 "const int lidX = lx - lp * 24;\n" 614 "const int lidY = get_local_id(1);\n" 615 "const int cell_x = lidX / 12;\n" 616 "const int cell_y = lidY;\n" 617 "const int cell_thread_x = lidX - cell_x * 12;\n" 618 "__local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *\n" 619 "CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);\n" 620 "__local float* final_hist = hists + cnbins *\n" 621 "(CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);\n" 622 "const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;\n" 623 "const int offset_y = gidY * cblock_stride_y + (cell_y << 2);\n" 624 "__global const float* grad_ptr = (gid < blocks_total) ?\n" 625 "grad + offset_y * grad_quadstep + (offset_x << 1) : grad;\n" 626 "__global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ?\n" 627 "qangle + offset_y * qangle_step + (offset_x << 1) : qangle;\n" 628 "__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +\n" 629 "cell_thread_x;\n" 630 "for (int bin_id = 0; bin_id < cnbins; ++bin_id)\n" 631 "hist[bin_id * 48] = 0.f;\n" 632 "const int dist_x = -4 + cell_thread_x - 4 * cell_x;\n" 633 "const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);\n" 634 "const int dist_y_begin = -4 - 4 * lidY;\n" 635 "for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)\n" 636 "{\n" 637 "float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);\n" 638 "QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]);\n" 639 "grad_ptr += grad_quadstep;\n" 640 "qangle_ptr += qangle_step;\n" 641 "int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);\n" 642 "int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8);\n" 643 "float gaussian = gauss_w_lut[idx];\n" 644 "idx = (dist_y + 8) * 16 + (dist_x + 8);\n" 645 "float interp_weight = gauss_w_lut[256+idx];\n" 646 "hist[bin.x * 48] += gaussian * interp_weight * vote.x;\n" 647 "hist[bin.y * 48] += gaussian * interp_weight * vote.y;\n" 648 "}\n" 649 "barrier(CLK_LOCAL_MEM_FENCE);\n" 650 "volatile __local float* hist_ = hist;\n" 651 "for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)\n" 652 "{\n" 653 "if (cell_thread_x < 6)\n" 654 "hist_[0] += hist_[6];\n" 655 "barrier(CLK_LOCAL_MEM_FENCE);\n" 656 "if (cell_thread_x < 3)\n" 657 "hist_[0] += hist_[3];\n" 658 "#ifdef CPU\n" 659 "barrier(CLK_LOCAL_MEM_FENCE);\n" 660 "#endif\n" 661 "if (cell_thread_x == 0)\n" 662 "final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =\n" 663 "hist_[0] + hist_[1] + hist_[2];\n" 664 "}\n" 665 "barrier(CLK_LOCAL_MEM_FENCE);\n" 666 "int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;\n" 667 "if ((tid < cblock_hist_size) && (gid < blocks_total))\n" 668 "{\n" 669 "__global float* block_hist = block_hists +\n" 670 "(gidY * img_block_width + gidX) * cblock_hist_size;\n" 671 "block_hist[tid] = final_hist[tid];\n" 672 "}\n" 673 "}\n" 674 "__kernel void normalize_hists_36_kernel(__global float* block_hists,\n" 675 "const float threshold, __local float *squares)\n" 676 "{\n" 677 "const int tid = get_local_id(0);\n" 678 "const int gid = get_global_id(0);\n" 679 "const int bid = tid / 36; \n" 680 "const int boffset = bid * 36; \n" 681 "const int hid = tid - boffset; \n" 682 "float elem = block_hists[gid];\n" 683 "squares[tid] = elem * elem;\n" 684 "barrier(CLK_LOCAL_MEM_FENCE);\n" 685 "__local float* smem = squares + boffset;\n" 686 "float sum = smem[hid];\n" 687 "if (hid < 18)\n" 688 "smem[hid] = sum = sum + smem[hid + 18];\n" 689 "barrier(CLK_LOCAL_MEM_FENCE);\n" 690 "if (hid < 9)\n" 691 "smem[hid] = sum = sum + smem[hid + 9];\n" 692 "barrier(CLK_LOCAL_MEM_FENCE);\n" 693 "if (hid < 4)\n" 694 "smem[hid] = sum + smem[hid + 4];\n" 695 "barrier(CLK_LOCAL_MEM_FENCE);\n" 696 "sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];\n" 697 "elem = elem / (sqrt(sum) + 3.6f);\n" 698 "elem = min(elem, threshold);\n" 699 "barrier(CLK_LOCAL_MEM_FENCE);\n" 700 "squares[tid] = elem * elem;\n" 701 "barrier(CLK_LOCAL_MEM_FENCE);\n" 702 "sum = smem[hid];\n" 703 "if (hid < 18)\n" 704 "smem[hid] = sum = sum + smem[hid + 18];\n" 705 "barrier(CLK_LOCAL_MEM_FENCE);\n" 706 "if (hid < 9)\n" 707 "smem[hid] = sum = sum + smem[hid + 9];\n" 708 "barrier(CLK_LOCAL_MEM_FENCE);\n" 709 "if (hid < 4)\n" 710 "smem[hid] = sum + smem[hid + 4];\n" 711 "barrier(CLK_LOCAL_MEM_FENCE);\n" 712 "sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];\n" 713 "block_hists[gid] = elem / (sqrt(sum) + 1e-3f);\n" 714 "}\n" 715 "inline float reduce_smem(volatile __local float* smem, int size)\n" 716 "{\n" 717 "unsigned int tid = get_local_id(0);\n" 718 "float sum = smem[tid];\n" 719 "if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];\n" 720 "barrier(CLK_LOCAL_MEM_FENCE); }\n" 721 "if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];\n" 722 "barrier(CLK_LOCAL_MEM_FENCE); }\n" 723 "if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];\n" 724 "barrier(CLK_LOCAL_MEM_FENCE); }\n" 725 "#ifdef CPU\n" 726 "if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];\n" 727 "barrier(CLK_LOCAL_MEM_FENCE); }\n" 728 "if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];\n" 729 "barrier(CLK_LOCAL_MEM_FENCE); }\n" 730 "if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8];\n" 731 "barrier(CLK_LOCAL_MEM_FENCE); }\n" 732 "if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4];\n" 733 "barrier(CLK_LOCAL_MEM_FENCE); }\n" 734 "if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2];\n" 735 "barrier(CLK_LOCAL_MEM_FENCE); }\n" 736 "if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];\n" 737 "barrier(CLK_LOCAL_MEM_FENCE); }\n" 738 "#else\n" 739 "if (tid < 32)\n" 740 "{\n" 741 "if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];\n" 742 "#if WAVE_SIZE < 32\n" 743 "} barrier(CLK_LOCAL_MEM_FENCE);\n" 744 "if (tid < 16) {\n" 745 "#endif\n" 746 "if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];\n" 747 "if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];\n" 748 "if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];\n" 749 "if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];\n" 750 "if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];\n" 751 "}\n" 752 "#endif\n" 753 "return sum;\n" 754 "}\n" 755 "__kernel void normalize_hists_kernel(\n" 756 "const int nthreads, const int block_hist_size, const int img_block_width,\n" 757 "__global float* block_hists, const float threshold, __local float *squares)\n" 758 "{\n" 759 "const int tid = get_local_id(0);\n" 760 "const int gidX = get_group_id(0);\n" 761 "const int gidY = get_group_id(1);\n" 762 "__global float* hist = block_hists + (gidY * img_block_width + gidX) *\n" 763 "block_hist_size + tid;\n" 764 "float elem = 0.f;\n" 765 "if (tid < block_hist_size)\n" 766 "elem = hist[0];\n" 767 "squares[tid] = elem * elem;\n" 768 "barrier(CLK_LOCAL_MEM_FENCE);\n" 769 "float sum = reduce_smem(squares, nthreads);\n" 770 "float scale = 1.0f / (sqrt(sum) + 0.1f * block_hist_size);\n" 771 "elem = min(elem * scale, threshold);\n" 772 "barrier(CLK_LOCAL_MEM_FENCE);\n" 773 "squares[tid] = elem * elem;\n" 774 "barrier(CLK_LOCAL_MEM_FENCE);\n" 775 "sum = reduce_smem(squares, nthreads);\n" 776 "scale = 1.0f / (sqrt(sum) + 1e-3f);\n" 777 "if (tid < block_hist_size)\n" 778 "hist[0] = elem * scale;\n" 779 "}\n" 780 "__kernel void classify_hists_180_kernel(\n" 781 "const int cdescr_width, const int cdescr_height, const int cblock_hist_size,\n" 782 "const int img_win_width, const int img_block_width,\n" 783 "const int win_block_stride_x, const int win_block_stride_y,\n" 784 "__global const float * block_hists, __global const float* coefs,\n" 785 "float free_coef, float threshold, __global uchar* labels)\n" 786 "{\n" 787 "const int tid = get_local_id(0);\n" 788 "const int gidX = get_group_id(0);\n" 789 "const int gidY = get_group_id(1);\n" 790 "__global const float* hist = block_hists + (gidY * win_block_stride_y *\n" 791 "img_block_width + gidX * win_block_stride_x) * cblock_hist_size;\n" 792 "float product = 0.f;\n" 793 "for (int i = 0; i < cdescr_height; i++)\n" 794 "{\n" 795 "product += coefs[i * cdescr_width + tid] *\n" 796 "hist[i * img_block_width * cblock_hist_size + tid];\n" 797 "}\n" 798 "__local float products[180];\n" 799 "products[tid] = product;\n" 800 "barrier(CLK_LOCAL_MEM_FENCE);\n" 801 "if (tid < 90) products[tid] = product = product + products[tid + 90];\n" 802 "barrier(CLK_LOCAL_MEM_FENCE);\n" 803 "if (tid < 45) products[tid] = product = product + products[tid + 45];\n" 804 "barrier(CLK_LOCAL_MEM_FENCE);\n" 805 "volatile __local float* smem = products;\n" 806 "#ifdef CPU\n" 807 "if (tid < 13) smem[tid] = product = product + smem[tid + 32];\n" 808 "barrier(CLK_LOCAL_MEM_FENCE);\n" 809 "if (tid < 16) smem[tid] = product = product + smem[tid + 16];\n" 810 "barrier(CLK_LOCAL_MEM_FENCE);\n" 811 "if(tid<8) smem[tid] = product = product + smem[tid + 8];\n" 812 "barrier(CLK_LOCAL_MEM_FENCE);\n" 813 "if(tid<4) smem[tid] = product = product + smem[tid + 4];\n" 814 "barrier(CLK_LOCAL_MEM_FENCE);\n" 815 "if(tid<2) smem[tid] = product = product + smem[tid + 2];\n" 816 "barrier(CLK_LOCAL_MEM_FENCE);\n" 817 "#else\n" 818 "if (tid < 13)\n" 819 "{\n" 820 "smem[tid] = product = product + smem[tid + 32];\n" 821 "}\n" 822 "#if WAVE_SIZE < 32\n" 823 "barrier(CLK_LOCAL_MEM_FENCE);\n" 824 "#endif\n" 825 "if (tid < 16)\n" 826 "{\n" 827 "smem[tid] = product = product + smem[tid + 16];\n" 828 "smem[tid] = product = product + smem[tid + 8];\n" 829 "smem[tid] = product = product + smem[tid + 4];\n" 830 "smem[tid] = product = product + smem[tid + 2];\n" 831 "}\n" 832 "#endif\n" 833 "if (tid == 0){\n" 834 "product = product + smem[tid + 1];\n" 835 "labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);\n" 836 "}\n" 837 "}\n" 838 "__kernel void classify_hists_252_kernel(\n" 839 "const int cdescr_width, const int cdescr_height, const int cblock_hist_size,\n" 840 "const int img_win_width, const int img_block_width,\n" 841 "const int win_block_stride_x, const int win_block_stride_y,\n" 842 "__global const float * block_hists, __global const float* coefs,\n" 843 "float free_coef, float threshold, __global uchar* labels)\n" 844 "{\n" 845 "const int tid = get_local_id(0);\n" 846 "const int gidX = get_group_id(0);\n" 847 "const int gidY = get_group_id(1);\n" 848 "__global const float* hist = block_hists + (gidY * win_block_stride_y *\n" 849 "img_block_width + gidX * win_block_stride_x) * cblock_hist_size;\n" 850 "float product = 0.f;\n" 851 "if (tid < cdescr_width)\n" 852 "{\n" 853 "for (int i = 0; i < cdescr_height; i++)\n" 854 "product += coefs[i * cdescr_width + tid] *\n" 855 "hist[i * img_block_width * cblock_hist_size + tid];\n" 856 "}\n" 857 "__local float products[NTHREADS];\n" 858 "products[tid] = product;\n" 859 "barrier(CLK_LOCAL_MEM_FENCE);\n" 860 "if (tid < 128) products[tid] = product = product + products[tid + 128];\n" 861 "barrier(CLK_LOCAL_MEM_FENCE);\n" 862 "if (tid < 64) products[tid] = product = product + products[tid + 64];\n" 863 "barrier(CLK_LOCAL_MEM_FENCE);\n" 864 "volatile __local float* smem = products;\n" 865 "#ifdef CPU\n" 866 "if(tid<32) smem[tid] = product = product + smem[tid + 32];\n" 867 "barrier(CLK_LOCAL_MEM_FENCE);\n" 868 "if(tid<16) smem[tid] = product = product + smem[tid + 16];\n" 869 "barrier(CLK_LOCAL_MEM_FENCE);\n" 870 "if(tid<8) smem[tid] = product = product + smem[tid + 8];\n" 871 "barrier(CLK_LOCAL_MEM_FENCE);\n" 872 "if(tid<4) smem[tid] = product = product + smem[tid + 4];\n" 873 "barrier(CLK_LOCAL_MEM_FENCE);\n" 874 "if(tid<2) smem[tid] = product = product + smem[tid + 2];\n" 875 "barrier(CLK_LOCAL_MEM_FENCE);\n" 876 "#else\n" 877 "if (tid < 32)\n" 878 "{\n" 879 "smem[tid] = product = product + smem[tid + 32];\n" 880 "#if WAVE_SIZE < 32\n" 881 "} barrier(CLK_LOCAL_MEM_FENCE);\n" 882 "if (tid < 16) {\n" 883 "#endif\n" 884 "smem[tid] = product = product + smem[tid + 16];\n" 885 "smem[tid] = product = product + smem[tid + 8];\n" 886 "smem[tid] = product = product + smem[tid + 4];\n" 887 "smem[tid] = product = product + smem[tid + 2];\n" 888 "}\n" 889 "#endif\n" 890 "if (tid == 0){\n" 891 "product = product + smem[tid + 1];\n" 892 "labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);\n" 893 "}\n" 894 "}\n" 895 "__kernel void classify_hists_kernel(\n" 896 "const int cdescr_size, const int cdescr_width, const int cblock_hist_size,\n" 897 "const int img_win_width, const int img_block_width,\n" 898 "const int win_block_stride_x, const int win_block_stride_y,\n" 899 "__global const float * block_hists, __global const float* coefs,\n" 900 "float free_coef, float threshold, __global uchar* labels)\n" 901 "{\n" 902 "const int tid = get_local_id(0);\n" 903 "const int gidX = get_group_id(0);\n" 904 "const int gidY = get_group_id(1);\n" 905 "__global const float* hist = block_hists + (gidY * win_block_stride_y *\n" 906 "img_block_width + gidX * win_block_stride_x) * cblock_hist_size;\n" 907 "float product = 0.f;\n" 908 "for (int i = tid; i < cdescr_size; i += NTHREADS)\n" 909 "{\n" 910 "int offset_y = i / cdescr_width;\n" 911 "int offset_x = i - offset_y * cdescr_width;\n" 912 "product += coefs[i] *\n" 913 "hist[offset_y * img_block_width * cblock_hist_size + offset_x];\n" 914 "}\n" 915 "__local float products[NTHREADS];\n" 916 "products[tid] = product;\n" 917 "barrier(CLK_LOCAL_MEM_FENCE);\n" 918 "if (tid < 128) products[tid] = product = product + products[tid + 128];\n" 919 "barrier(CLK_LOCAL_MEM_FENCE);\n" 920 "if (tid < 64) products[tid] = product = product + products[tid + 64];\n" 921 "barrier(CLK_LOCAL_MEM_FENCE);\n" 922 "volatile __local float* smem = products;\n" 923 "#ifdef CPU\n" 924 "if(tid<32) smem[tid] = product = product + smem[tid + 32];\n" 925 "barrier(CLK_LOCAL_MEM_FENCE);\n" 926 "if(tid<16) smem[tid] = product = product + smem[tid + 16];\n" 927 "barrier(CLK_LOCAL_MEM_FENCE);\n" 928 "if(tid<8) smem[tid] = product = product + smem[tid + 8];\n" 929 "barrier(CLK_LOCAL_MEM_FENCE);\n" 930 "if(tid<4) smem[tid] = product = product + smem[tid + 4];\n" 931 "barrier(CLK_LOCAL_MEM_FENCE);\n" 932 "if(tid<2) smem[tid] = product = product + smem[tid + 2];\n" 933 "barrier(CLK_LOCAL_MEM_FENCE);\n" 934 "#else\n" 935 "if (tid < 32)\n" 936 "{\n" 937 "smem[tid] = product = product + smem[tid + 32];\n" 938 "#if WAVE_SIZE < 32\n" 939 "} barrier(CLK_LOCAL_MEM_FENCE);\n" 940 "if (tid < 16) {\n" 941 "#endif\n" 942 "smem[tid] = product = product + smem[tid + 16];\n" 943 "smem[tid] = product = product + smem[tid + 8];\n" 944 "smem[tid] = product = product + smem[tid + 4];\n" 945 "smem[tid] = product = product + smem[tid + 2];\n" 946 "}\n" 947 "#endif\n" 948 "if (tid == 0){\n" 949 "smem[tid] = product = product + smem[tid + 1];\n" 950 "labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);\n" 951 "}\n" 952 "}\n" 953 "__kernel void extract_descrs_by_rows_kernel(\n" 954 "const int cblock_hist_size, const int descriptors_quadstep,\n" 955 "const int cdescr_size, const int cdescr_width, const int img_block_width,\n" 956 "const int win_block_stride_x, const int win_block_stride_y,\n" 957 "__global const float* block_hists, __global float* descriptors)\n" 958 "{\n" 959 "int tid = get_local_id(0);\n" 960 "int gidX = get_group_id(0);\n" 961 "int gidY = get_group_id(1);\n" 962 "__global const float* hist = block_hists + (gidY * win_block_stride_y *\n" 963 "img_block_width + gidX * win_block_stride_x) * cblock_hist_size;\n" 964 "__global float* descriptor = descriptors +\n" 965 "(gidY * get_num_groups(0) + gidX) * descriptors_quadstep;\n" 966 "for (int i = tid; i < cdescr_size; i += NTHREADS)\n" 967 "{\n" 968 "int offset_y = i / cdescr_width;\n" 969 "int offset_x = i - offset_y * cdescr_width;\n" 970 "descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];\n" 971 "}\n" 972 "}\n" 973 "__kernel void extract_descrs_by_cols_kernel(\n" 974 "const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,\n" 975 "const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width,\n" 976 "const int win_block_stride_x, const int win_block_stride_y,\n" 977 "__global const float* block_hists, __global float* descriptors)\n" 978 "{\n" 979 "int tid = get_local_id(0);\n" 980 "int gidX = get_group_id(0);\n" 981 "int gidY = get_group_id(1);\n" 982 "__global const float* hist = block_hists + (gidY * win_block_stride_y *\n" 983 "img_block_width + gidX * win_block_stride_x) * cblock_hist_size;\n" 984 "__global float* descriptor = descriptors +\n" 985 "(gidY * get_num_groups(0) + gidX) * descriptors_quadstep;\n" 986 "for (int i = tid; i < cdescr_size; i += NTHREADS)\n" 987 "{\n" 988 "int block_idx = i / cblock_hist_size;\n" 989 "int idx_in_block = i - block_idx * cblock_hist_size;\n" 990 "int y = block_idx / cnblocks_win_x;\n" 991 "int x = block_idx - y * cnblocks_win_x;\n" 992 "descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] =\n" 993 "hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];\n" 994 "}\n" 995 "}\n" 996 "__kernel void compute_gradients_8UC4_kernel(\n" 997 "const int height, const int width,\n" 998 "const int img_step, const int grad_quadstep, const int qangle_step,\n" 999 "const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle,\n" 1000 "const float angle_scale, const char correct_gamma, const int cnbins)\n" 1001 "{\n" 1002 "const int x = get_global_id(0);\n" 1003 "const int tid = get_local_id(0);\n" 1004 "const int gSizeX = get_local_size(0);\n" 1005 "const int gidY = get_group_id(1);\n" 1006 "__global const uchar4* row = img + gidY * img_step;\n" 1007 "__local float sh_row[(NTHREADS + 2) * 3];\n" 1008 "uchar4 val;\n" 1009 "if (x < width)\n" 1010 "val = row[x];\n" 1011 "else\n" 1012 "val = row[width - 2];\n" 1013 "sh_row[tid + 1] = val.x;\n" 1014 "sh_row[tid + 1 + (NTHREADS + 2)] = val.y;\n" 1015 "sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z;\n" 1016 "if (tid == 0)\n" 1017 "{\n" 1018 "val = row[max(x - 1, 1)];\n" 1019 "sh_row[0] = val.x;\n" 1020 "sh_row[(NTHREADS + 2)] = val.y;\n" 1021 "sh_row[2 * (NTHREADS + 2)] = val.z;\n" 1022 "}\n" 1023 "if (tid == gSizeX - 1)\n" 1024 "{\n" 1025 "val = row[min(x + 1, width - 2)];\n" 1026 "sh_row[gSizeX + 1] = val.x;\n" 1027 "sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y;\n" 1028 "sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z;\n" 1029 "}\n" 1030 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1031 "if (x < width)\n" 1032 "{\n" 1033 "float4 a = (float4) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],\n" 1034 "sh_row[tid + 2 * (NTHREADS + 2)], 0);\n" 1035 "float4 b = (float4) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],\n" 1036 "sh_row[tid + 2 + 2 * (NTHREADS + 2)], 0);\n" 1037 "float4 dx;\n" 1038 "if (correct_gamma == 1)\n" 1039 "dx = sqrt(b) - sqrt(a);\n" 1040 "else\n" 1041 "dx = b - a;\n" 1042 "float4 dy = (float4) 0.f;\n" 1043 "if (gidY > 0 && gidY < height - 1)\n" 1044 "{\n" 1045 "a = convert_float4(img[(gidY - 1) * img_step + x].xyzw);\n" 1046 "b = convert_float4(img[(gidY + 1) * img_step + x].xyzw);\n" 1047 "if (correct_gamma == 1)\n" 1048 "dy = sqrt(b) - sqrt(a);\n" 1049 "else\n" 1050 "dy = b - a;\n" 1051 "}\n" 1052 "float4 mag = hypot(dx, dy);\n" 1053 "float best_dx = dx.x;\n" 1054 "float best_dy = dy.x;\n" 1055 "float mag0 = mag.x;\n" 1056 "if (mag0 < mag.y)\n" 1057 "{\n" 1058 "best_dx = dx.y;\n" 1059 "best_dy = dy.y;\n" 1060 "mag0 = mag.y;\n" 1061 "}\n" 1062 "if (mag0 < mag.z)\n" 1063 "{\n" 1064 "best_dx = dx.z;\n" 1065 "best_dy = dy.z;\n" 1066 "mag0 = mag.z;\n" 1067 "}\n" 1068 "float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;\n" 1069 "int hidx = (int)floor(ang);\n" 1070 "ang -= hidx;\n" 1071 "hidx = (hidx + cnbins) % cnbins;\n" 1072 "qangle[(gidY * qangle_step + x) << 1] = hidx;\n" 1073 "qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins;\n" 1074 "grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang);\n" 1075 "grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang;\n" 1076 "}\n" 1077 "}\n" 1078 "__kernel void compute_gradients_8UC1_kernel(\n" 1079 "const int height, const int width,\n" 1080 "const int img_step, const int grad_quadstep, const int qangle_step,\n" 1081 "__global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,\n" 1082 "const float angle_scale, const char correct_gamma, const int cnbins)\n" 1083 "{\n" 1084 "const int x = get_global_id(0);\n" 1085 "const int tid = get_local_id(0);\n" 1086 "const int gSizeX = get_local_size(0);\n" 1087 "const int gidY = get_group_id(1);\n" 1088 "__global const uchar* row = img + gidY * img_step;\n" 1089 "__local float sh_row[NTHREADS + 2];\n" 1090 "if (x < width)\n" 1091 "sh_row[tid + 1] = row[x];\n" 1092 "else\n" 1093 "sh_row[tid + 1] = row[width - 2];\n" 1094 "if (tid == 0)\n" 1095 "sh_row[0] = row[max(x - 1, 1)];\n" 1096 "if (tid == gSizeX - 1)\n" 1097 "sh_row[gSizeX + 1] = row[min(x + 1, width - 2)];\n" 1098 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1099 "if (x < width)\n" 1100 "{\n" 1101 "float dx;\n" 1102 "if (correct_gamma == 1)\n" 1103 "dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]);\n" 1104 "else\n" 1105 "dx = sh_row[tid + 2] - sh_row[tid];\n" 1106 "float dy = 0.f;\n" 1107 "if (gidY > 0 && gidY < height - 1)\n" 1108 "{\n" 1109 "float a = (float) img[ (gidY + 1) * img_step + x ];\n" 1110 "float b = (float) img[ (gidY - 1) * img_step + x ];\n" 1111 "if (correct_gamma == 1)\n" 1112 "dy = sqrt(a) - sqrt(b);\n" 1113 "else\n" 1114 "dy = a - b;\n" 1115 "}\n" 1116 "float mag = hypot(dx, dy);\n" 1117 "float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;\n" 1118 "int hidx = (int)floor(ang);\n" 1119 "ang -= hidx;\n" 1120 "hidx = (hidx + cnbins) % cnbins;\n" 1121 "qangle[ (gidY * qangle_step + x) << 1 ] = hidx;\n" 1122 "qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins;\n" 1123 "grad[ (gidY * grad_quadstep + x) << 1 ] = mag * (1.f - ang);\n" 1124 "grad[ ((gidY * grad_quadstep + x) << 1) + 1 ] = mag * ang;\n" 1125 "}\n" 1126 "}\n" 1127 , "bc57f4f75fb81bae73bfe73cc4ca15e4"}; 1128 ProgramSource objdetect_hog_oclsrc(objdetect_hog.programStr); 1129 } 1130 }} 1131