• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #include "common.hpp"
41 
42 #include <limits>
43 
44 namespace CAROTENE_NS {
45 
countNonZero(const Size2D & _size,const u8 * srcBase,ptrdiff_t srcStride)46 s32 countNonZero(const Size2D &_size,
47                  const u8 * srcBase, ptrdiff_t srcStride)
48 {
49     internal::assertSupportedConfiguration();
50 #ifdef CAROTENE_NEON
51     Size2D size(_size);
52     if (srcStride == (ptrdiff_t)(size.width))
53     {
54         size.width *= size.height;
55         size.height = 1;
56     }
57     size_t roiw16 = size.width & ~15u;
58     s32 result = 0;
59     for(size_t k = 0; k < size.height; ++k)
60     {
61         const u8* src = internal::getRowPtr( srcBase,  srcStride, k);
62         size_t i = 0;
63 
64         #define COUNTNONZERO8U_BLOCK_SIZE (16*255)
65         uint8x16_t vc1 = vmovq_n_u8(1);
66         for (; i < roiw16;)
67         {
68             size_t lim = std::min(i + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16;
69             uint8x16_t vs = vmovq_n_u8(0);
70 
71             for (; i <= lim; i+= 16)
72             {
73                 internal::prefetch(src + i);
74                 uint8x16_t vln = vld1q_u8(src + i);
75                 uint8x16_t vnz = vminq_u8(vln, vc1);
76                 vs = vaddq_u8(vs, vnz);
77             }
78 
79             uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs));
80             uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
81 
82             s32 s[2];
83             vst1_u32((u32*)s, vs2);
84 
85             if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 2GB of non-zeros...
86             {
87                 return 0x7fFFffFF;
88             }
89             result += (s[0] += s[1]);
90             if (s[0] < 0 || result < 0)
91             {
92                 return 0x7fFFffFF;
93             }
94         }
95         for (; i < size.width; i++)
96             result += (src[i] != 0)?1:0;
97         if (result < 0)//saturate in case of overflow ~ 2GB of non-zeros...
98         {
99             return 0x7fFFffFF;
100         }
101     }
102     return result;
103 #else
104     (void)_size;
105     (void)srcBase;
106     (void)srcStride;
107 
108     return 0;
109 #endif
110 }
111 
countNonZero(const Size2D & _size,const u16 * srcBase,ptrdiff_t srcStride)112 s32 countNonZero(const Size2D &_size,
113                  const u16 * srcBase, ptrdiff_t srcStride)
114 {
115     internal::assertSupportedConfiguration();
116 #ifdef CAROTENE_NEON
117     Size2D size(_size);
118     if (srcStride == (ptrdiff_t)(size.width))
119     {
120         size.width *= size.height;
121         size.height = 1;
122     }
123     size_t roiw8 = size.width & ~7u;
124     s32 result = 0;
125     for(size_t k = 0; k < size.height; ++k)
126     {
127         const u16* src = internal::getRowPtr( srcBase,  srcStride, k);
128         size_t i = 0;
129 
130         #define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1))
131         uint16x8_t vc1 = vmovq_n_u16(1);
132         for (; i < roiw8;)
133         {
134             size_t lim = std::min(i + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8;
135             uint16x8_t vs = vmovq_n_u16(0);
136 
137             for (; i <= lim; i+= 8)
138             {
139                 internal::prefetch(src + i);
140                 uint16x8_t vln = vld1q_u16(src + i);
141                 uint16x8_t vnz = vminq_u16(vln, vc1);
142                 vs = vaddq_u16(vs, vnz);
143             }
144 
145             uint32x4_t vs4 = vpaddlq_u16(vs);
146             uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
147 
148             s32 s[2];
149             vst1_u32((u32*)s, vs2);
150 
151             if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 4GB of non-zeros...
152             {
153                 return 0x7fFFffFF;
154             }
155             result += (s[0] += s[1]);
156             if (s[0] < 0 || result < 0)
157             {
158                 return 0x7fFFffFF;
159             }
160         }
161         for (; i < size.width; i++)
162             result += (src[i] != 0)?1:0;
163         if (result < 0)//saturate in case of overflow ~ 4GB of non-zeros...
164         {
165             return 0x7fFFffFF;
166         }
167     }
168     return result;
169 #else
170     (void)_size;
171     (void)srcBase;
172     (void)srcStride;
173 
174     return 0;
175 #endif
176 }
177 
/*
 * Counts the non-zero elements of a 32-bit signed integer image.
 *
 * _size     - image dimensions (width and height in elements)
 * srcBase   - pointer to the first row
 * srcStride - row step passed on to internal::getRowPtr()
 *
 * Returns the number of non-zero elements, clamped to 0x7fFFffFF when the
 * count does not fit into s32. When CAROTENE_NEON is not defined the function
 * is a stub that returns 0.
 */
s32 countNonZero(const Size2D &_size,
                 const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const s32 saturated = 0x7fFFffFF;
    const u32 maxCount = 0x7fFFffFF;

    Size2D size(_size);
    // NOTE(review): srcStride is compared with the width in elements; if
    // strides are expressed in bytes this never matches tightly packed s32
    // rows - confirm the intended unit.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        // process all rows as one long row
        size.width *= size.height;
        size.height = 1;
    }
    // size_t-wide mask; "~3u" is a 32-bit constant and would also clear the
    // upper half of a 64-bit width
    const size_t roiw4 = size.width & ~(size_t)3;
    const uint32x4_t vc1 = vmovq_n_u32(1);

    s32 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        // the elements are only tested against zero, so they are read as unsigned
        const u32* src = (const u32*)internal::getRowPtr(srcBase, srcStride, k);
        // size_t index: a 32-bit index would wrap before reaching a width above 2^32
        size_t i = 0;

        uint32x4_t vs = vmovq_n_u32(0);
        for (; i < roiw4; i += 4)
        {
            internal::prefetch(src + i);
            uint32x4_t vln = vld1q_u32(src + i);
            uint32x4_t vnz = vminq_u32(vln, vc1); // 1 where the element is non-zero
            vs = vqaddq_u32(vs, vnz);             // saturating, lanes never wrap
        }

        uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));

        u32 lanes[2];
        vst1_u32(lanes, vs2);

        // All checks compare against the remaining headroom so that no
        // addition can overflow (signed overflow is undefined behaviour).
        if (lanes[0] > maxCount || lanes[1] > maxCount - lanes[0])
            return saturated;
        const u32 rowCount = lanes[0] + lanes[1];
        if (rowCount > maxCount - (u32)result)
            return saturated;
        result += (s32)rowCount;

        // scalar tail: the elements that do not fill a whole vector
        for (; i < size.width; i++)
        {
            if (src[i] != 0)
            {
                if (result == saturated)
                    return saturated;
                ++result;
            }
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
238 
countNonZero(const Size2D & _size,const f32 * srcBase,ptrdiff_t srcStride)239 s32 countNonZero(const Size2D &_size,
240                  const f32 * srcBase, ptrdiff_t srcStride)
241 {
242     internal::assertSupportedConfiguration();
243 #ifdef CAROTENE_NEON
244     Size2D size(_size);
245     if (srcStride == (ptrdiff_t)(size.width))
246     {
247         size.width *= size.height;
248         size.height = 1;
249     }
250     size_t roiw4 = size.width & ~3u;
251     s32 result = 0;
252     for(size_t k = 0; k < size.height; ++k)
253     {
254         const f32* src = internal::getRowPtr( srcBase,  srcStride, k);
255         size_t i = 0;
256 
257         float32x4_t vc0 = vmovq_n_f32(0);
258         int32x4_t vs = vmovq_n_s32(0);
259 
260         for (; i < roiw4; i += 4 )
261         {
262             internal::prefetch(src + i);
263             float32x4_t vln = vld1q_f32(src + i);
264             int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0)));
265             vs = vqaddq_s32(vs, vnz);
266         }
267 
268         int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs)));
269 
270         int s[2];
271         vst1_s32(s, vs2);
272 
273         result += (s[0] += s[1]);
274         if (s[0] < 0 || result < 0)//case of overflow ~ 8GB of non-zeros...
275         {
276             return 0x7fFFffFF;
277         }
278 
279         for (; i < size.width; i++)
280             result += (src[i] < std::numeric_limits<float>::min() && src[i] > -std::numeric_limits<float>::min())?0:1;
281 
282         if (result < 0)
283         {
284             return 0x7fFFffFF;
285         }
286     }
287     return result;
288 #else
289     (void)_size;
290     (void)srcBase;
291     (void)srcStride;
292 
293     return 0;
294 #endif
295 }
296 
countNonZero(const Size2D & _size,const f64 * srcBase,ptrdiff_t srcStride)297 s32 countNonZero(const Size2D &_size,
298                  const f64 * srcBase, ptrdiff_t srcStride)
299 {
300     internal::assertSupportedConfiguration();
301 #ifdef CAROTENE_NEON
302     Size2D size(_size);
303     if (srcStride == (ptrdiff_t)(size.width))
304     {
305         size.width *= size.height;
306         size.height = 1;
307     }
308     size_t roiw8 = size.width & ~7u;
309     size_t roiw4 = size.width & ~3u;
310     size_t roiw2 = size.width & ~1u;
311     uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero
312     uint32x4_t vc0 = vmovq_n_u32(0);
313 
314     s32 result = 0;
315     for(size_t k = 0; k < size.height; ++k)
316     {
317         const f64* src = internal::getRowPtr( srcBase,  srcStride, k);
318         size_t i = 0;
319 
320         int32x2_t vs1 = vmov_n_s32(0);
321         int32x2_t vs2 = vmov_n_s32(0);
322         int32x2_t vs3 = vmov_n_s32(0);
323         int32x2_t vs4 = vmov_n_s32(0);
324 
325         for (; i < roiw8; i += 8 )
326         {
327             internal::prefetch(src + i + 6);
328             uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
329             uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
330             uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4));
331             uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6));
332 
333             uint64x2_t vm1 = vandq_u64(vln1, vmask1);
334             uint64x2_t vm2 = vandq_u64(vln2, vmask1);
335             uint64x2_t vm3 = vandq_u64(vln3, vmask1);
336             uint64x2_t vm4 = vandq_u64(vln4, vmask1);
337 
338             uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
339             uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
340             uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0);
341             uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0);
342 
343             uint32x4_t vlx1 = vmvnq_u32(vequ1);
344             uint32x4_t vlx2 = vmvnq_u32(vequ2);
345             uint32x4_t vlx3 = vmvnq_u32(vequ3);
346             uint32x4_t vlx4 = vmvnq_u32(vequ4);
347 
348             int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
349             int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
350             int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3)));
351             int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4)));
352 
353             vs1 = vqadd_s32(vs1, vnz1);
354             vs2 = vqadd_s32(vs2, vnz2);
355             vs3 = vqadd_s32(vs3, vnz3);
356             vs4 = vqadd_s32(vs4, vnz4);
357         }
358 
359         if (i < roiw4)
360         {
361             internal::prefetch(src + i + 2);
362             uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
363             uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
364 
365             uint64x2_t vm1 = vandq_u64(vln1, vmask1);
366             uint64x2_t vm2 = vandq_u64(vln2, vmask1);
367 
368             uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
369             uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
370 
371             uint32x4_t vlx1 = vmvnq_u32(vequ1);
372             uint32x4_t vlx2 = vmvnq_u32(vequ2);
373 
374             int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
375             int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
376 
377             vs1 = vqadd_s32(vs1, vnz1);
378             vs2 = vqadd_s32(vs2, vnz2);
379             i += 4;
380         }
381 
382         if (i < roiw2)
383         {
384             internal::prefetch(src + i);
385             uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
386 
387             uint64x2_t vm1 = vandq_u64(vln1, vmask1);
388 
389             uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
390 
391             uint32x4_t vlx1 = vmvnq_u32(vequ1);
392 
393             int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
394 
395             vs1 = vqadd_s32(vs1, vnz1);
396             i += 2;
397         }
398 
399         vs1 = vqadd_s32(vs1, vs2);
400         vs3 = vqadd_s32(vs3, vs4);
401         vs1 = vqadd_s32(vs1, vs3);
402         int32x2_t vsneg = vqneg_s32(vs1);
403 
404         s32 s[2];
405         vst1_s32(s, vsneg);
406 
407         result += (s[0] += s[1]);
408         if (s[0] < 0 || result < 0)//case of overflow ~ 16GB of non-zeros...
409         {
410             return 0x7fFFffFF;
411         }
412 
413         for (; i < size.width; i++)
414             result += (src[i] < std::numeric_limits<double>::min() && src[i] > -std::numeric_limits<double>::min())?0:1;
415         if (result < 0)
416         {
417             return 0x7fFFffFF;
418         }
419     }
420     return result;
421 #else
422     (void)_size;
423     (void)srcBase;
424     (void)srcStride;
425 
426     return 0;
427 #endif
428 }
429 
430 } // namespace CAROTENE_NS
431