///////////////////////////// OpenCL kernels for face detection ////////////////////////////// ////////////////////////////// see the opencv/doc/license.txt /////////////////////////////// // // the code has been derived from the OpenCL Haar cascade kernel by // // Niko Li, newlife20080214@gmail.com // Wang Weiyan, wangweiyanster@gmail.com // Jia Haipeng, jiahaipeng95@gmail.com // Nathan, liujun@multicorewareinc.com // Peng Xiao, pengxiao@outlook.com // Erping Pang, erping@multicorewareinc.com // #ifdef HAAR typedef struct __attribute__((aligned(4))) OptHaarFeature { int4 ofs[3] __attribute__((aligned (4))); float4 weight __attribute__((aligned (4))); } OptHaarFeature; #endif #ifdef LBP typedef struct __attribute__((aligned(4))) OptLBPFeature { int16 ofs __attribute__((aligned (4))); } OptLBPFeature; #endif typedef struct __attribute__((aligned(4))) Stump { float4 st __attribute__((aligned (4))); } Stump; typedef struct __attribute__((aligned(4))) Node { int4 n __attribute__((aligned (4))); } Node; typedef struct __attribute__((aligned (4))) Stage { int first __attribute__((aligned (4))); int ntrees __attribute__((aligned (4))); float threshold __attribute__((aligned (4))); } Stage; typedef struct __attribute__((aligned (4))) ScaleData { float scale __attribute__((aligned (4))); int szi_width __attribute__((aligned (4))); int szi_height __attribute__((aligned (4))); int layer_ofs __attribute__((aligned (4))); int ystep __attribute__((aligned (4))); } ScaleData; #ifndef SUM_BUF_SIZE #define SUM_BUF_SIZE 0 #endif #ifndef NODE_COUNT #define NODE_COUNT 1 #endif #ifdef HAAR __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1))) void runHaarClassifier( int nscales, __global const ScaleData* scaleData, __global const int* sum, int _sumstep, int sumoffset, __global const OptHaarFeature* optfeatures, __global const Stage* stages, __global const Node* nodes, __global const float* leaves0, volatile __global int* facepos, int4 normrect, int sqofs, int2 windowsize) { int lx = get_local_id(0); int ly = get_local_id(1); int groupIdx = get_group_id(0); int i, ngroups = get_global_size(0)/LOCAL_SIZE_X; int scaleIdx, tileIdx, stageIdx; int sumstep = (int)(_sumstep/sizeof(int)); int4 nofs0 = (int4)(mad24(normrect.y, sumstep, normrect.x), mad24(normrect.y, sumstep, normrect.x + normrect.z), mad24(normrect.y + normrect.w, sumstep, normrect.x), mad24(normrect.y + normrect.w, sumstep, normrect.x + normrect.z)); int normarea = normrect.z * normrect.w; float invarea = 1.f/normarea; int lidx = ly*LOCAL_SIZE_X + lx; #if SUM_BUF_SIZE > 0 int4 nofs = (int4)(mad24(normrect.y, SUM_BUF_STEP, normrect.x), mad24(normrect.y, SUM_BUF_STEP, normrect.x + normrect.z), mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x), mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x + normrect.z)); #else int4 nofs = nofs0; #endif #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y) __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*5/2+1]; #if SUM_BUF_SIZE > 0 __local int* ibuf = lstore; __local int* lcount = ibuf + SUM_BUF_SIZE; #else __local int* lcount = lstore; #endif __local float* lnf = (__local float*)(lcount + 1); __local float* lpartsum = lnf + LOCAL_SIZE; __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE); for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- ) { __global const ScaleData* s = scaleData + scaleIdx; int ystep = s->ystep; int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0)); int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X, (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y); int totalTiles = ntiles.x*ntiles.y; for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups ) { int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X; int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y; int ix = lx, iy = ly; __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs; __global const int* psum1 = psum0 + mad24(iy, sumstep, ix); if( ix0 >= worksize.x || iy0 >= worksize.y ) continue; #if SUM_BUF_SIZE > 0 for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 ) { int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP; vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i); } #endif if( lidx == 0 ) lcount[0] = 0; barrier(CLK_LOCAL_MEM_FENCE); if( ix0 + ix < worksize.x && iy0 + iy < worksize.y ) { #if NODE_COUNT==1 __global const Stump* stump = (__global const Stump*)nodes; #else __global const Node* node = nodes; __global const float* leaves = leaves0; #endif #if SUM_BUF_SIZE > 0 __local const int* psum = ibuf + mad24(iy, SUM_BUF_STEP, ix); #else __global const int* psum = psum1; #endif __global const int* psqsum = (__global const int*)(psum1 + sqofs); float sval = (psum[nofs.x] - psum[nofs.y] - psum[nofs.z] + psum[nofs.w])*invarea; float sqval = (psqsum[nofs0.x] - psqsum[nofs0.y] - psqsum[nofs0.z] + psqsum[nofs0.w])*invarea; float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f)); nf = nf > 0 ? nf : 1.f; for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ ) { int ntrees = stages[stageIdx].ntrees; float s = 0.f; #if NODE_COUNT==1 for( i = 0; i < ntrees; i++ ) { float4 st = stump[i].st; __global const OptHaarFeature* f = optfeatures + as_int(st.x); float4 weight = f->weight; int4 ofs = f->ofs[0]; sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x; ofs = f->ofs[1]; sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval); if( weight.z > 0 ) { ofs = f->ofs[2]; sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval); } s += (sval < st.y*nf) ? st.z : st.w; } stump += ntrees; #else for( i = 0; i < ntrees; i++, node += NODE_COUNT, leaves += NODE_COUNT+1 ) { int idx = 0; do { int4 n = node[idx].n; __global const OptHaarFeature* f = optfeatures + n.x; float4 weight = f->weight; int4 ofs = f->ofs[0]; sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x; ofs = f->ofs[1]; sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval); if( weight.z > 0 ) { ofs = f->ofs[2]; sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval); } idx = (sval < as_float(n.y)*nf) ? n.z : n.w; } while(idx > 0); s += leaves[-idx]; } #endif if( s < stages[stageIdx].threshold ) break; } if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) ) { int count = atomic_inc(lcount); lbuf[count] = (int)(ix | (iy << 8)); lnf[count] = nf; } } for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ ) { barrier(CLK_LOCAL_MEM_FENCE); int nrects = lcount[0]; if( nrects == 0 ) break; barrier(CLK_LOCAL_MEM_FENCE); if( lidx == 0 ) lcount[0] = 0; { #if NODE_COUNT == 1 __global const Stump* stump = (__global const Stump*)nodes + stages[stageIdx].first; #else __global const Node* node = nodes + stages[stageIdx].first*NODE_COUNT; __global const float* leaves = leaves0 + stages[stageIdx].first*(NODE_COUNT+1); #endif int nparts = LOCAL_SIZE / nrects; int ntrees = stages[stageIdx].ntrees; int ntrees_p = (ntrees + nparts - 1)/nparts; int nr = lidx / nparts; int partidx = -1, idxval = 0; float partsum = 0.f, nf = 0.f; if( nr < nrects ) { partidx = lidx % nparts; idxval = lbuf[nr]; nf = lnf[nr]; { int ntrees0 = ntrees_p*partidx; int ntrees1 = min(ntrees0 + ntrees_p, ntrees); int ix1 = idxval & 255, iy1 = idxval >> 8; #if SUM_BUF_SIZE > 0 __local const int* psum = ibuf + mad24(iy1, SUM_BUF_STEP, ix1); #else __global const int* psum = psum0 + mad24(iy1, sumstep, ix1); #endif #if NODE_COUNT == 1 for( i = ntrees0; i < ntrees1; i++ ) { float4 st = stump[i].st; __global const OptHaarFeature* f = optfeatures + as_int(st.x); float4 weight = f->weight; int4 ofs = f->ofs[0]; float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x; ofs = f->ofs[1]; sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval); //if( weight.z > 0 ) if( fabs(weight.z) > 0 ) { ofs = f->ofs[2]; sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval); } partsum += (sval < st.y*nf) ? st.z : st.w; } #else for( i = ntrees0; i < ntrees1; i++ ) { int idx = 0; do { int4 n = node[i*2 + idx].n; __global const OptHaarFeature* f = optfeatures + n.x; float4 weight = f->weight; int4 ofs = f->ofs[0]; float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x; ofs = f->ofs[1]; sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval); if( weight.z > 0 ) { ofs = f->ofs[2]; sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval); } idx = (sval < as_float(n.y)*nf) ? n.z : n.w; } while(idx > 0); partsum += leaves[i*3-idx]; } #endif } } lpartsum[lidx] = partsum; barrier(CLK_LOCAL_MEM_FENCE); if( partidx == 0 ) { float s = lpartsum[nr*nparts]; for( i = 1; i < nparts; i++ ) s += lpartsum[i + nr*nparts]; if( s >= stages[stageIdx].threshold ) { int count = atomic_inc(lcount); lbuf[count] = idxval; lnf[count] = nf; } } } } barrier(CLK_LOCAL_MEM_FENCE); if( stageIdx == N_STAGES ) { int nrects = lcount[0]; if( lidx < nrects ) { int nfaces = atomic_inc(facepos); if( nfaces < MAX_FACES ) { volatile __global int* face = facepos + 1 + nfaces*3; int val = lbuf[lidx]; face[0] = scaleIdx; face[1] = ix0 + (val & 255); face[2] = iy0 + (val >> 8); } } } } } } #endif #ifdef LBP #undef CALC_SUM_OFS_ #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \ ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3]) __kernel void runLBPClassifierStumpSimple( int nscales, __global const ScaleData* scaleData, __global const int* sum, int _sumstep, int sumoffset, __global const OptLBPFeature* optfeatures, __global const Stage* stages, __global const Stump* stumps, __global const int* bitsets, int bitsetSize, volatile __global int* facepos, int2 windowsize) { int lx = get_local_id(0); int ly = get_local_id(1); int local_size_x = get_local_size(0); int local_size_y = get_local_size(1); int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0); int ngroups = get_num_groups(0)*get_num_groups(1); int scaleIdx, tileIdx, stageIdx; int sumstep = (int)(_sumstep/sizeof(int)); for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- ) { __global const ScaleData* s = scaleData + scaleIdx; int ystep = s->ystep; int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0)); int2 ntiles = (int2)((worksize.x/ystep + local_size_x-1)/local_size_x, (worksize.y/ystep + local_size_y-1)/local_size_y); int totalTiles = ntiles.x*ntiles.y; for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups ) { int iy = mad24((tileIdx / ntiles.x), local_size_y, ly) * ystep; int ix = mad24((tileIdx % ntiles.x), local_size_x, lx) * ystep; if( ix < worksize.x && iy < worksize.y ) { __global const int* p = sum + mad24(iy, sumstep, ix) + s->layer_ofs; __global const Stump* stump = stumps; __global const int* bitset = bitsets; for( stageIdx = 0; stageIdx < N_STAGES; stageIdx++ ) { int i, ntrees = stages[stageIdx].ntrees; float s = 0.f; for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize ) { float4 st = stump->st; __global const OptLBPFeature* f = optfeatures + as_int(st.x); int16 ofs = f->ofs; int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p ); int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0 idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1 idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2 mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5 mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8 mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7 mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6 mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7 s += (bitset[idx] & (1 << mask)) ? st.z : st.w; } if( s < stages[stageIdx].threshold ) break; } if( stageIdx == N_STAGES ) { int nfaces = atomic_inc(facepos); if( nfaces < MAX_FACES ) { volatile __global int* face = facepos + 1 + nfaces*3; face[0] = scaleIdx; face[1] = ix; face[2] = iy; } } } } } } __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1))) void runLBPClassifierStump( int nscales, __global const ScaleData* scaleData, __global const int* sum, int _sumstep, int sumoffset, __global const OptLBPFeature* optfeatures, __global const Stage* stages, __global const Stump* stumps, __global const int* bitsets, int bitsetSize, volatile __global int* facepos, int2 windowsize) { int lx = get_local_id(0); int ly = get_local_id(1); int groupIdx = get_group_id(0); int i, ngroups = get_global_size(0)/LOCAL_SIZE_X; int scaleIdx, tileIdx, stageIdx; int sumstep = (int)(_sumstep/sizeof(int)); int lidx = ly*LOCAL_SIZE_X + lx; #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y) __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*3/2+1]; #if SUM_BUF_SIZE > 0 __local int* ibuf = lstore; __local int* lcount = ibuf + SUM_BUF_SIZE; #else __local int* lcount = lstore; #endif __local float* lpartsum = (__local float*)(lcount + 1); __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE); for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- ) { __global const ScaleData* s = scaleData + scaleIdx; int ystep = s->ystep; int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0)); int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X, (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y); int totalTiles = ntiles.x*ntiles.y; for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups ) { int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X; int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y; int ix = lx, iy = ly; __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs; if( ix0 >= worksize.x || iy0 >= worksize.y ) continue; #if SUM_BUF_SIZE > 0 for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 ) { int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP; vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i); } barrier(CLK_LOCAL_MEM_FENCE); #endif if( lidx == 0 ) lcount[0] = 0; barrier(CLK_LOCAL_MEM_FENCE); if( ix0 + ix < worksize.x && iy0 + iy < worksize.y ) { __global const Stump* stump = stumps; __global const int* bitset = bitsets; #if SUM_BUF_SIZE > 0 __local const int* p = ibuf + mad24(iy, SUM_BUF_STEP, ix); #else __global const int* p = psum0 + mad24(iy, sumstep, ix); #endif for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ ) { int ntrees = stages[stageIdx].ntrees; float s = 0.f; for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize ) { float4 st = stump->st; __global const OptLBPFeature* f = optfeatures + as_int(st.x); int16 ofs = f->ofs; int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p ); int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0 idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1 idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2 mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5 mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8 mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7 mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6 mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7 s += (bitset[idx] & (1 << mask)) ? st.z : st.w; } if( s < stages[stageIdx].threshold ) break; } if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) ) { int count = atomic_inc(lcount); lbuf[count] = (int)(ix | (iy << 8)); } } for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ ) { int nrects = lcount[0]; barrier(CLK_LOCAL_MEM_FENCE); if( nrects == 0 ) break; if( lidx == 0 ) lcount[0] = 0; { __global const Stump* stump = stumps + stages[stageIdx].first; __global const int* bitset = bitsets + stages[stageIdx].first*bitsetSize; int nparts = LOCAL_SIZE / nrects; int ntrees = stages[stageIdx].ntrees; int ntrees_p = (ntrees + nparts - 1)/nparts; int nr = lidx / nparts; int partidx = -1, idxval = 0; float partsum = 0.f, nf = 0.f; if( nr < nrects ) { partidx = lidx % nparts; idxval = lbuf[nr]; { int ntrees0 = ntrees_p*partidx; int ntrees1 = min(ntrees0 + ntrees_p, ntrees); int ix1 = idxval & 255, iy1 = idxval >> 8; #if SUM_BUF_SIZE > 0 __local const int* p = ibuf + mad24(iy1, SUM_BUF_STEP, ix1); #else __global const int* p = psum0 + mad24(iy1, sumstep, ix1); #endif for( i = ntrees0; i < ntrees1; i++ ) { float4 st = stump[i].st; __global const OptLBPFeature* f = optfeatures + as_int(st.x); int16 ofs = f->ofs; #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \ ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3]) int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p ); int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0 idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1 idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2 mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5 mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8 mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7 mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6 mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7 partsum += (bitset[i*bitsetSize + idx] & (1 << mask)) ? st.z : st.w; } } } lpartsum[lidx] = partsum; barrier(CLK_LOCAL_MEM_FENCE); if( partidx == 0 ) { float s = lpartsum[nr*nparts]; for( i = 1; i < nparts; i++ ) s += lpartsum[i + nr*nparts]; if( s >= stages[stageIdx].threshold ) { int count = atomic_inc(lcount); lbuf[count] = idxval; } } } } barrier(CLK_LOCAL_MEM_FENCE); if( stageIdx == N_STAGES ) { int nrects = lcount[0]; if( lidx < nrects ) { int nfaces = atomic_inc(facepos); if( nfaces < MAX_FACES ) { volatile __global int* face = facepos + 1 + nfaces*3; int val = lbuf[lidx]; face[0] = scaleIdx; face[1] = ix0 + (val & 255); face[2] = iy0 + (val >> 8); } } } } } } #endif