/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
39 
40 #include "common.hpp"
41 
42 namespace CAROTENE_NS {
43 
#ifdef CAROTENE_NEON

namespace {

/*
 * Scalar scan of src[j0, j1) that records where minVal and maxVal occur.
 *
 * For every index j in [j0, j1) whose element equals maxVal (respectively
 * minVal), the pair (j, i) is written to two consecutive slots of maxLocPtr
 * (respectively minLocPtr), starting at the current counter value, but only
 * while that counter is below the matching capacity. The counter is advanced
 * by 2 for every match whether or not the pair was stored, so on return it
 * may exceed the capacity.
 *
 * Counters and capacities are expressed in size_t slots, i.e. two per
 * location; the callers in this file convert to and from location units.
 * An element equal to both minVal and maxVal is recorded in both arrays.
 *
 * src            row data, indexed by j
 * j0, j1         half-open index range to examine
 * i              value stored as the second coordinate of each pair
 * minVal/maxVal  values to look for
 * minLocPtr/maxLocPtr           output slot arrays
 * minLocCount/maxLocCount       running slot counters (in/out)
 * minLocCapacity/maxLocCapacity slot capacities of the output arrays
 */
template <typename T>
void process(const T * src, size_t j0, size_t j1, size_t i,
             T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
             T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    for (size_t j = j0; j < j1; ++j)
    {
        T val = src[j];

        if (val == maxVal)
        {
            // Store only while there is room; keep counting regardless.
            if (maxLocCount < maxLocCapacity)
            {
                maxLocPtr[maxLocCount] = j;
                maxLocPtr[maxLocCount + 1] = i;
            }
            maxLocCount += 2;
        }

        if (val == minVal)
        {
            // Store only while there is room; keep counting regardless.
            if (minLocCount < minLocCapacity)
            {
                minLocPtr[minLocCount] = j;
                minLocPtr[minLocCount + 1] = i;
            }
            minLocCount += 2;
        }
    }
}

} // namespace

#endif
82 
fillMinMaxLocs(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 minVal,size_t * minLocPtr,s32 & minLocCount,s32 minLocCapacity,u8 maxVal,size_t * maxLocPtr,s32 & maxLocCount,s32 maxLocCapacity)83 void fillMinMaxLocs(const Size2D & size,
84                     const u8 * srcBase, ptrdiff_t srcStride,
85                     u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
86                     u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
87 {
88     internal::assertSupportedConfiguration();
89 #ifdef CAROTENE_NEON
90     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
91     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
92 
93     uint8x16_t v_maxval16 = vdupq_n_u8(maxVal), v_minval16 = vdupq_n_u8(minVal);
94     uint8x8_t v_maxval8 = vdup_n_u8(maxVal), v_minval8 = vdup_n_u8(minVal);
95 
96     u64 mask[2] = { 0ul };
97 
98     minLocCapacity <<= 1;
99     maxLocCapacity <<= 1;
100 
101     for (size_t i = 0; i < size.height; ++i)
102     {
103         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
104         size_t j = 0;
105 
106         for ( ; j < roiw16; j += 16)
107         {
108             internal::prefetch(src + j);
109             uint8x16_t v_src = vld1q_u8(src + j);
110 
111             uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16);
112             uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16);
113             uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask);
114 
115             vst1q_u8((u8 *)&mask[0], v_mask);
116 
117             if (mask[0])
118                 process(src, j, j + 8, i,
119                         minVal, minLocPtr, minLocCount, minLocCapacity,
120                         maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
121             if (mask[1])
122                 process(src, j + 8, j + 16, i,
123                         minVal, minLocPtr, minLocCount, minLocCapacity,
124                         maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
125         }
126         for ( ; j < roiw8; j += 8)
127         {
128             uint8x8_t v_src = vld1_u8(src + j);
129 
130             uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8);
131             uint8x8_t v_minmask = vceq_u8(v_src, v_minval8);
132             uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask);
133 
134             vst1_u8((u8 *)&mask[0], v_mask);
135 
136             if (mask[0])
137                 process(src, j, j + 8, i,
138                         minVal, minLocPtr, minLocCount, minLocCapacity,
139                         maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
140         }
141 
142         process(src, j, size.width, i,
143                 minVal, minLocPtr, minLocCount, minLocCapacity,
144                 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
145     }
146 
147     minLocCount >>= 1;
148     maxLocCount >>= 1;
149 #else
150     (void)size;
151     (void)srcBase;
152     (void)srcStride;
153     (void)minVal;
154     (void)minLocPtr;
155     (void)minLocCount;
156     (void)minLocCapacity;
157     (void)maxVal;
158     (void)maxLocPtr;
159     (void)maxLocCount;
160     (void)maxLocCapacity;
161 #endif
162 }
163 
fillMinMaxLocs(const Size2D & size,const u16 * srcBase,ptrdiff_t srcStride,u16 minVal,size_t * minLocPtr,s32 & minLocCount,s32 minLocCapacity,u16 maxVal,size_t * maxLocPtr,s32 & maxLocCount,s32 maxLocCapacity)164 void fillMinMaxLocs(const Size2D & size,
165                     const u16 * srcBase, ptrdiff_t srcStride,
166                     u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
167                     u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
168 {
169     internal::assertSupportedConfiguration();
170 #ifdef CAROTENE_NEON
171     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
172     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
173 
174     uint16x8_t v_maxval8 = vdupq_n_u16(maxVal),
175                v_minval8 = vdupq_n_u16(minVal);
176     u64 mask[2] = { 0ul };
177 
178     minLocCapacity <<= 1;
179     maxLocCapacity <<= 1;
180 
181     for (size_t i = 0; i < size.height; ++i)
182     {
183         const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
184         size_t j = 0;
185 
186         for ( ; j < roiw16; j += 16)
187         {
188             internal::prefetch(src + j);
189             uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
190 
191             uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8));
192             uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8));
193 
194             vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
195 
196             if (mask[0])
197                 process(src, j, j + 8, i,
198                         minVal, minLocPtr, minLocCount, minLocCapacity,
199                         maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
200             if (mask[1])
201                 process(src, j + 8, j + 16, i,
202                         minVal, minLocPtr, minLocCount, minLocCapacity,
203                         maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
204         }
205         for ( ; j < roiw8; j += 8)
206         {
207             internal::prefetch(src + j);
208             uint16x8_t v_src = vld1q_u16(src + j);
209 
210             uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8);
211             uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8);
212             uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
213 
214             vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
215 
216             if (mask[0])
217                 process(src, j, j + 8, i,
218                         minVal, minLocPtr, minLocCount, minLocCapacity,
219                         maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
220         }
221 
222         process(src, j, size.width, i,
223                 minVal, minLocPtr, minLocCount, minLocCapacity,
224                 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
225     }
226 
227     minLocCount >>= 1;
228     maxLocCount >>= 1;
229 #else
230     (void)size;
231     (void)srcBase;
232     (void)srcStride;
233     (void)minVal;
234     (void)minLocPtr;
235     (void)minLocCount;
236     (void)minLocCapacity;
237     (void)maxVal;
238     (void)maxLocPtr;
239     (void)maxLocCount;
240     (void)maxLocCapacity;
241 #endif
242 }
243 
fillMinMaxLocs(const Size2D & size,const s16 * srcBase,ptrdiff_t srcStride,s16 minVal,size_t * minLocPtr,s32 & minLocCount,s32 minLocCapacity,s16 maxVal,size_t * maxLocPtr,s32 & maxLocCount,s32 maxLocCapacity)244 void fillMinMaxLocs(const Size2D & size,
245                     const s16 * srcBase, ptrdiff_t srcStride,
246                     s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
247                     s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
248 {
249     internal::assertSupportedConfiguration();
250 #ifdef CAROTENE_NEON
251     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
252     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
253 
254     int16x8_t v_maxval8 = vdupq_n_s16(maxVal),
255               v_minval8 = vdupq_n_s16(minVal);
256     u64 mask[2] = { 0ul };
257 
258     minLocCapacity <<= 1;
259     maxLocCapacity <<= 1;
260 
261     for (size_t i = 0; i < size.height; ++i)
262     {
263         const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
264         size_t j = 0;
265 
266         for ( ; j < roiw16; j += 16)
267         {
268             internal::prefetch(src + j);
269             int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
270 
271             uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8));
272             uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8));
273 
274             vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
275 
276             if (mask[0])
277                 process(src, j, j + 8, i,
278                         minVal, minLocPtr, minLocCount, minLocCapacity,
279                         maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
280             if (mask[1])
281                 process(src, j + 8, j + 16, i,
282                         minVal, minLocPtr, minLocCount, minLocCapacity,
283                         maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
284         }
285         for ( ; j < roiw8; j += 8)
286         {
287             internal::prefetch(src + j);
288             int16x8_t v_src = vld1q_s16(src + j);
289 
290             uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8);
291             uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8);
292             uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
293 
294             vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
295 
296             if (mask[0])
297                 process(src, j, j + 8, i,
298                         minVal, minLocPtr, minLocCount, minLocCapacity,
299                         maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
300         }
301 
302         process(src, j, size.width, i,
303                 minVal, minLocPtr, minLocCount, minLocCapacity,
304                 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
305     }
306 
307     minLocCount >>= 1;
308     maxLocCount >>= 1;
309 #else
310     (void)size;
311     (void)srcBase;
312     (void)srcStride;
313     (void)minVal;
314     (void)minLocPtr;
315     (void)minLocCount;
316     (void)minLocCapacity;
317     (void)maxVal;
318     (void)maxLocPtr;
319     (void)maxLocCount;
320     (void)maxLocCapacity;
321 #endif
322 }
323 
fillMinMaxLocs(const Size2D & size,const s32 * srcBase,ptrdiff_t srcStride,s32 minVal,size_t * minLocPtr,s32 & minLocCount,s32 minLocCapacity,s32 maxVal,size_t * maxLocPtr,s32 & maxLocCount,s32 maxLocCapacity)324 void fillMinMaxLocs(const Size2D & size,
325                     const s32 * srcBase, ptrdiff_t srcStride,
326                     s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
327                     s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
328 {
329     internal::assertSupportedConfiguration();
330 #ifdef CAROTENE_NEON
331     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
332 
333     int32x4_t v_maxval4 = vdupq_n_s32(maxVal),
334               v_minval4 = vdupq_n_s32(minVal);
335     u64 mask = 0ul;
336 
337     minLocCapacity <<= 1;
338     maxLocCapacity <<= 1;
339 
340     for (size_t i = 0; i < size.height; ++i)
341     {
342         const s32 * src = internal::getRowPtr(srcBase, srcStride, i);
343         size_t j = 0;
344 
345         for ( ; j < roiw8; j += 8)
346         {
347             internal::prefetch(src + j);
348             int32x4_t v_src0 = vld1q_s32(src + j), v_src1 = vld1q_s32(src + j + 4);
349 
350             uint32x4_t v_mask0 = vorrq_u32(vceqq_s32(v_src0, v_maxval4), vceqq_s32(v_src0, v_minval4));
351             uint32x4_t v_mask1 = vorrq_u32(vceqq_s32(v_src1, v_maxval4), vceqq_s32(v_src1, v_minval4));
352 
353             vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
354 
355             if (mask)
356                 process(src, j, j + 8, i,
357                         minVal, minLocPtr, minLocCount, minLocCapacity,
358                         maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
359         }
360 
361         process(src, j, size.width, i,
362                 minVal, minLocPtr, minLocCount, minLocCapacity,
363                 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
364     }
365 
366     minLocCount >>= 1;
367     maxLocCount >>= 1;
368 #else
369     (void)size;
370     (void)srcBase;
371     (void)srcStride;
372     (void)minVal;
373     (void)minLocPtr;
374     (void)minLocCount;
375     (void)minLocCapacity;
376     (void)maxVal;
377     (void)maxLocPtr;
378     (void)maxLocCount;
379     (void)maxLocCapacity;
380 #endif
381 }
382 
fillMinMaxLocs(const Size2D & size,const u32 * srcBase,ptrdiff_t srcStride,u32 minVal,size_t * minLocPtr,s32 & minLocCount,s32 minLocCapacity,u32 maxVal,size_t * maxLocPtr,s32 & maxLocCount,s32 maxLocCapacity)383 void fillMinMaxLocs(const Size2D & size,
384                     const u32 * srcBase, ptrdiff_t srcStride,
385                     u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
386                     u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
387 {
388     internal::assertSupportedConfiguration();
389 #ifdef CAROTENE_NEON
390     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
391 
392     uint32x4_t v_maxval4 = vdupq_n_u32(maxVal),
393                v_minval4 = vdupq_n_u32(minVal);
394     u64 mask = 0ul;
395 
396     minLocCapacity <<= 1;
397     maxLocCapacity <<= 1;
398 
399     for (size_t i = 0; i < size.height; ++i)
400     {
401         const u32 * src = internal::getRowPtr(srcBase, srcStride, i);
402         size_t j = 0;
403 
404         for ( ; j < roiw8; j += 8)
405         {
406             internal::prefetch(src + j);
407             uint32x4_t v_src0 = vld1q_u32(src + j), v_src1 = vld1q_u32(src + j + 4);
408 
409             uint32x4_t v_mask0 = vorrq_u32(vceqq_u32(v_src0, v_maxval4), vceqq_u32(v_src0, v_minval4));
410             uint32x4_t v_mask1 = vorrq_u32(vceqq_u32(v_src1, v_maxval4), vceqq_u32(v_src1, v_minval4));
411 
412             vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
413 
414             if (mask)
415                 process(src, j, j + 8, i,
416                         minVal, minLocPtr, minLocCount, minLocCapacity,
417                         maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
418         }
419 
420         process(src, j, size.width, i,
421                 minVal, minLocPtr, minLocCount, minLocCapacity,
422                 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
423     }
424 
425     minLocCount >>= 1;
426     maxLocCount >>= 1;
427 #else
428     (void)size;
429     (void)srcBase;
430     (void)srcStride;
431     (void)minVal;
432     (void)minLocPtr;
433     (void)minLocCount;
434     (void)minLocCapacity;
435     (void)maxVal;
436     (void)maxLocPtr;
437     (void)maxLocCount;
438     (void)maxLocCapacity;
439 #endif
440 }
441 
442 } // namespace CAROTENE_NS
443