1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "common.hpp"
41
42 #include <limits>
43
44 namespace CAROTENE_NS {
45
countNonZero(const Size2D & _size,const u8 * srcBase,ptrdiff_t srcStride)46 s32 countNonZero(const Size2D &_size,
47 const u8 * srcBase, ptrdiff_t srcStride)
48 {
49 internal::assertSupportedConfiguration();
50 #ifdef CAROTENE_NEON
51 Size2D size(_size);
52 if (srcStride == (ptrdiff_t)(size.width))
53 {
54 size.width *= size.height;
55 size.height = 1;
56 }
57 size_t roiw16 = size.width & ~15u;
58 s32 result = 0;
59 for(size_t k = 0; k < size.height; ++k)
60 {
61 const u8* src = internal::getRowPtr( srcBase, srcStride, k);
62 size_t i = 0;
63
64 #define COUNTNONZERO8U_BLOCK_SIZE (16*255)
65 uint8x16_t vc1 = vmovq_n_u8(1);
66 for (; i < roiw16;)
67 {
68 size_t lim = std::min(i + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16;
69 uint8x16_t vs = vmovq_n_u8(0);
70
71 for (; i <= lim; i+= 16)
72 {
73 internal::prefetch(src + i);
74 uint8x16_t vln = vld1q_u8(src + i);
75 uint8x16_t vnz = vminq_u8(vln, vc1);
76 vs = vaddq_u8(vs, vnz);
77 }
78
79 uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs));
80 uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
81
82 s32 s[2];
83 vst1_u32((u32*)s, vs2);
84
85 if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 2GB of non-zeros...
86 {
87 return 0x7fFFffFF;
88 }
89 result += (s[0] += s[1]);
90 if (s[0] < 0 || result < 0)
91 {
92 return 0x7fFFffFF;
93 }
94 }
95 for (; i < size.width; i++)
96 result += (src[i] != 0)?1:0;
97 if (result < 0)//saturate in case of overflow ~ 2GB of non-zeros...
98 {
99 return 0x7fFFffFF;
100 }
101 }
102 return result;
103 #else
104 (void)_size;
105 (void)srcBase;
106 (void)srcStride;
107
108 return 0;
109 #endif
110 }
111
countNonZero(const Size2D & _size,const u16 * srcBase,ptrdiff_t srcStride)112 s32 countNonZero(const Size2D &_size,
113 const u16 * srcBase, ptrdiff_t srcStride)
114 {
115 internal::assertSupportedConfiguration();
116 #ifdef CAROTENE_NEON
117 Size2D size(_size);
118 if (srcStride == (ptrdiff_t)(size.width))
119 {
120 size.width *= size.height;
121 size.height = 1;
122 }
123 size_t roiw8 = size.width & ~7u;
124 s32 result = 0;
125 for(size_t k = 0; k < size.height; ++k)
126 {
127 const u16* src = internal::getRowPtr( srcBase, srcStride, k);
128 size_t i = 0;
129
130 #define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1))
131 uint16x8_t vc1 = vmovq_n_u16(1);
132 for (; i < roiw8;)
133 {
134 size_t lim = std::min(i + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8;
135 uint16x8_t vs = vmovq_n_u16(0);
136
137 for (; i <= lim; i+= 8)
138 {
139 internal::prefetch(src + i);
140 uint16x8_t vln = vld1q_u16(src + i);
141 uint16x8_t vnz = vminq_u16(vln, vc1);
142 vs = vaddq_u16(vs, vnz);
143 }
144
145 uint32x4_t vs4 = vpaddlq_u16(vs);
146 uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
147
148 s32 s[2];
149 vst1_u32((u32*)s, vs2);
150
151 if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 4GB of non-zeros...
152 {
153 return 0x7fFFffFF;
154 }
155 result += (s[0] += s[1]);
156 if (s[0] < 0 || result < 0)
157 {
158 return 0x7fFFffFF;
159 }
160 }
161 for (; i < size.width; i++)
162 result += (src[i] != 0)?1:0;
163 if (result < 0)//saturate in case of overflow ~ 4GB of non-zeros...
164 {
165 return 0x7fFFffFF;
166 }
167 }
168 return result;
169 #else
170 (void)_size;
171 (void)srcBase;
172 (void)srcStride;
173
174 return 0;
175 #endif
176 }
177
/*
 * Counts the non-zero elements of a 32-bit integer image.
 *
 * _size     - image dimensions (width and height, in elements)
 * srcBase   - pointer to the first element of the first row
 * srcStride - row stride, forwarded unchanged to internal::getRowPtr
 *
 * Elements are read as unsigned 32-bit values; only "zero or not" matters.
 *
 * Returns the number of non-zero elements, saturated to 0x7fFFffFF.
 * Builds without CAROTENE_NEON ignore the arguments and return 0.
 */
s32 countNonZero(const Size2D &_size,
                 const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const s32 maxCount = 0x7fFFffFF;

    Size2D size(_size);
    // NOTE(review): the stride is compared against the width in elements;
    // confirm the stride units expected by internal::getRowPtr.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        // The stride equals the row width: walk the whole image as one long row.
        size.width *= size.height;
        size.height = 1;
    }

    // The mask must be size_t wide; a 32-bit mask (~3u) would also clear the
    // upper half of a 64-bit width.
    const size_t roiw4 = size.width & ~(size_t)3;
    const uint32x4_t vc1 = vmovq_n_u32(1);

    s32 result = 0;
    for (size_t k = 0; k < size.height; ++k)
    {
        const u32* src = (const u32*)internal::getRowPtr(srcBase, srcStride, k);
        // size_t index: a 32-bit index could never reach a width of 2^32 or more
        size_t i = 0;

        uint32x4_t vs = vmovq_n_u32(0);
        for (; i < roiw4; i += 4)
        {
            internal::prefetch(src + i);
            const uint32x4_t vln = vld1q_u32(src + i);
            // min(x, 1) is 1 for every non-zero element; lanes saturate instead of wrapping
            vs = vqaddq_u32(vs, vminq_u32(vln, vc1));
        }

        const uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));

        u32 s[2];
        vst1_u32(s, vs2);

        // saturate before adding, so neither the lane sum nor the total overflows
        const u32 room = (u32)(maxCount - result);
        if (s[0] > room || s[1] > room - s[0])
        {
            return maxCount;
        }
        result += (s32)(s[0] + s[1]);

        // scalar tail: the last (width % 4) elements of the row
        for (; i < size.width; i++)
        {
            if (src[i] != 0)
            {
                if (result == maxCount)
                {
                    return maxCount;
                }
                ++result;
            }
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
238
countNonZero(const Size2D & _size,const f32 * srcBase,ptrdiff_t srcStride)239 s32 countNonZero(const Size2D &_size,
240 const f32 * srcBase, ptrdiff_t srcStride)
241 {
242 internal::assertSupportedConfiguration();
243 #ifdef CAROTENE_NEON
244 Size2D size(_size);
245 if (srcStride == (ptrdiff_t)(size.width))
246 {
247 size.width *= size.height;
248 size.height = 1;
249 }
250 size_t roiw4 = size.width & ~3u;
251 s32 result = 0;
252 for(size_t k = 0; k < size.height; ++k)
253 {
254 const f32* src = internal::getRowPtr( srcBase, srcStride, k);
255 size_t i = 0;
256
257 float32x4_t vc0 = vmovq_n_f32(0);
258 int32x4_t vs = vmovq_n_s32(0);
259
260 for (; i < roiw4; i += 4 )
261 {
262 internal::prefetch(src + i);
263 float32x4_t vln = vld1q_f32(src + i);
264 int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0)));
265 vs = vqaddq_s32(vs, vnz);
266 }
267
268 int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs)));
269
270 int s[2];
271 vst1_s32(s, vs2);
272
273 result += (s[0] += s[1]);
274 if (s[0] < 0 || result < 0)//case of overflow ~ 8GB of non-zeros...
275 {
276 return 0x7fFFffFF;
277 }
278
279 for (; i < size.width; i++)
280 result += (src[i] < std::numeric_limits<float>::min() && src[i] > -std::numeric_limits<float>::min())?0:1;
281
282 if (result < 0)
283 {
284 return 0x7fFFffFF;
285 }
286 }
287 return result;
288 #else
289 (void)_size;
290 (void)srcBase;
291 (void)srcStride;
292
293 return 0;
294 #endif
295 }
296
countNonZero(const Size2D & _size,const f64 * srcBase,ptrdiff_t srcStride)297 s32 countNonZero(const Size2D &_size,
298 const f64 * srcBase, ptrdiff_t srcStride)
299 {
300 internal::assertSupportedConfiguration();
301 #ifdef CAROTENE_NEON
302 Size2D size(_size);
303 if (srcStride == (ptrdiff_t)(size.width))
304 {
305 size.width *= size.height;
306 size.height = 1;
307 }
308 size_t roiw8 = size.width & ~7u;
309 size_t roiw4 = size.width & ~3u;
310 size_t roiw2 = size.width & ~1u;
311 uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero
312 uint32x4_t vc0 = vmovq_n_u32(0);
313
314 s32 result = 0;
315 for(size_t k = 0; k < size.height; ++k)
316 {
317 const f64* src = internal::getRowPtr( srcBase, srcStride, k);
318 size_t i = 0;
319
320 int32x2_t vs1 = vmov_n_s32(0);
321 int32x2_t vs2 = vmov_n_s32(0);
322 int32x2_t vs3 = vmov_n_s32(0);
323 int32x2_t vs4 = vmov_n_s32(0);
324
325 for (; i < roiw8; i += 8 )
326 {
327 internal::prefetch(src + i + 6);
328 uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
329 uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
330 uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4));
331 uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6));
332
333 uint64x2_t vm1 = vandq_u64(vln1, vmask1);
334 uint64x2_t vm2 = vandq_u64(vln2, vmask1);
335 uint64x2_t vm3 = vandq_u64(vln3, vmask1);
336 uint64x2_t vm4 = vandq_u64(vln4, vmask1);
337
338 uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
339 uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
340 uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0);
341 uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0);
342
343 uint32x4_t vlx1 = vmvnq_u32(vequ1);
344 uint32x4_t vlx2 = vmvnq_u32(vequ2);
345 uint32x4_t vlx3 = vmvnq_u32(vequ3);
346 uint32x4_t vlx4 = vmvnq_u32(vequ4);
347
348 int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
349 int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
350 int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3)));
351 int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4)));
352
353 vs1 = vqadd_s32(vs1, vnz1);
354 vs2 = vqadd_s32(vs2, vnz2);
355 vs3 = vqadd_s32(vs3, vnz3);
356 vs4 = vqadd_s32(vs4, vnz4);
357 }
358
359 if (i < roiw4)
360 {
361 internal::prefetch(src + i + 2);
362 uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
363 uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
364
365 uint64x2_t vm1 = vandq_u64(vln1, vmask1);
366 uint64x2_t vm2 = vandq_u64(vln2, vmask1);
367
368 uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
369 uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
370
371 uint32x4_t vlx1 = vmvnq_u32(vequ1);
372 uint32x4_t vlx2 = vmvnq_u32(vequ2);
373
374 int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
375 int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
376
377 vs1 = vqadd_s32(vs1, vnz1);
378 vs2 = vqadd_s32(vs2, vnz2);
379 i += 4;
380 }
381
382 if (i < roiw2)
383 {
384 internal::prefetch(src + i);
385 uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
386
387 uint64x2_t vm1 = vandq_u64(vln1, vmask1);
388
389 uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
390
391 uint32x4_t vlx1 = vmvnq_u32(vequ1);
392
393 int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
394
395 vs1 = vqadd_s32(vs1, vnz1);
396 i += 2;
397 }
398
399 vs1 = vqadd_s32(vs1, vs2);
400 vs3 = vqadd_s32(vs3, vs4);
401 vs1 = vqadd_s32(vs1, vs3);
402 int32x2_t vsneg = vqneg_s32(vs1);
403
404 s32 s[2];
405 vst1_s32(s, vsneg);
406
407 result += (s[0] += s[1]);
408 if (s[0] < 0 || result < 0)//case of overflow ~ 16GB of non-zeros...
409 {
410 return 0x7fFFffFF;
411 }
412
413 for (; i < size.width; i++)
414 result += (src[i] < std::numeric_limits<double>::min() && src[i] > -std::numeric_limits<double>::min())?0:1;
415 if (result < 0)
416 {
417 return 0x7fFFffFF;
418 }
419 }
420 return result;
421 #else
422 (void)_size;
423 (void)srcBase;
424 (void)srcStride;
425
426 return 0;
427 #endif
428 }
429
430 } // namespace CAROTENE_NS
431