1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "common.hpp"
41
42 namespace CAROTENE_NS {
43
44 #ifdef CAROTENE_NEON
45
46 namespace {
47
/*
 * Records one (column, row) location in a flat coordinate array.
 *
 *   locPtr      - destination array laid out as x0, y0, x1, y1, ...
 *   locCount    - number of array slots consumed so far; it is advanced by two
 *                 on every call, even when nothing could be stored, so the
 *                 caller can tell how many matches were seen in total
 *   locCapacity - number of slots available in locPtr
 *   x, y        - column and row index of the match
 *
 * The pair is written only when BOTH slots fit. Checking just the first slot
 * would write one element past the end whenever locCount or locCapacity is odd.
 */
inline void storeLocation(size_t * locPtr, s32 & locCount, s32 locCapacity, size_t x, size_t y)
{
    // Once 0 <= locCount < locCapacity holds, the subtraction cannot overflow.
    if (locCount >= 0 && locCount < locCapacity && locCapacity - locCount >= 2)
    {
        locPtr[locCount] = x;
        locPtr[locCount + 1] = y;
    }
    locCount += 2;
}

/*
 * Scalar scan of columns [j0, j1) of one row, recording every element that
 * equals minVal and/or maxVal.
 *
 *   src            - pointer to the first element of the row
 *   j0, j1         - half-open column range to inspect
 *   i              - row index, stored as the second coordinate of each match
 *   minVal, maxVal - values to look for
 *   minLocPtr / maxLocPtr           - flat (column, row) output arrays
 *   minLocCount / maxLocCount       - slots consumed so far (two per match),
 *                                     updated in place
 *   minLocCapacity / maxLocCapacity - slots available in the output arrays
 *
 * Matches that do not fit into an output array are still counted.
 */
template <typename T>
void process(const T * src, size_t j0, size_t j1, size_t i,
             T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
             T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
    for (size_t j = j0; j < j1; ++j)
    {
        const T val = src[j];

        // Two independent tests rather than if/else: when minVal == maxVal the
        // element has to be reported in both lists.
        if (val == maxVal)
            storeLocation(maxLocPtr, maxLocCount, maxLocCapacity, j, i);

        if (val == minVal)
            storeLocation(minLocPtr, minLocCount, minLocCapacity, j, i);
    }
}
78
79 } // namespace
80
81 #endif
82
fillMinMaxLocs(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 minVal,size_t * minLocPtr,s32 & minLocCount,s32 minLocCapacity,u8 maxVal,size_t * maxLocPtr,s32 & maxLocCount,s32 maxLocCapacity)83 void fillMinMaxLocs(const Size2D & size,
84 const u8 * srcBase, ptrdiff_t srcStride,
85 u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
86 u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
87 {
88 internal::assertSupportedConfiguration();
89 #ifdef CAROTENE_NEON
90 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
91 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
92
93 uint8x16_t v_maxval16 = vdupq_n_u8(maxVal), v_minval16 = vdupq_n_u8(minVal);
94 uint8x8_t v_maxval8 = vdup_n_u8(maxVal), v_minval8 = vdup_n_u8(minVal);
95
96 u64 mask[2] = { 0ul };
97
98 minLocCapacity <<= 1;
99 maxLocCapacity <<= 1;
100
101 for (size_t i = 0; i < size.height; ++i)
102 {
103 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
104 size_t j = 0;
105
106 for ( ; j < roiw16; j += 16)
107 {
108 internal::prefetch(src + j);
109 uint8x16_t v_src = vld1q_u8(src + j);
110
111 uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16);
112 uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16);
113 uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask);
114
115 vst1q_u8((u8 *)&mask[0], v_mask);
116
117 if (mask[0])
118 process(src, j, j + 8, i,
119 minVal, minLocPtr, minLocCount, minLocCapacity,
120 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
121 if (mask[1])
122 process(src, j + 8, j + 16, i,
123 minVal, minLocPtr, minLocCount, minLocCapacity,
124 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
125 }
126 for ( ; j < roiw8; j += 8)
127 {
128 uint8x8_t v_src = vld1_u8(src + j);
129
130 uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8);
131 uint8x8_t v_minmask = vceq_u8(v_src, v_minval8);
132 uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask);
133
134 vst1_u8((u8 *)&mask[0], v_mask);
135
136 if (mask[0])
137 process(src, j, j + 8, i,
138 minVal, minLocPtr, minLocCount, minLocCapacity,
139 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
140 }
141
142 process(src, j, size.width, i,
143 minVal, minLocPtr, minLocCount, minLocCapacity,
144 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
145 }
146
147 minLocCount >>= 1;
148 maxLocCount >>= 1;
149 #else
150 (void)size;
151 (void)srcBase;
152 (void)srcStride;
153 (void)minVal;
154 (void)minLocPtr;
155 (void)minLocCount;
156 (void)minLocCapacity;
157 (void)maxVal;
158 (void)maxLocPtr;
159 (void)maxLocCount;
160 (void)maxLocCapacity;
161 #endif
162 }
163
fillMinMaxLocs(const Size2D & size,const u16 * srcBase,ptrdiff_t srcStride,u16 minVal,size_t * minLocPtr,s32 & minLocCount,s32 minLocCapacity,u16 maxVal,size_t * maxLocPtr,s32 & maxLocCount,s32 maxLocCapacity)164 void fillMinMaxLocs(const Size2D & size,
165 const u16 * srcBase, ptrdiff_t srcStride,
166 u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
167 u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
168 {
169 internal::assertSupportedConfiguration();
170 #ifdef CAROTENE_NEON
171 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
172 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
173
174 uint16x8_t v_maxval8 = vdupq_n_u16(maxVal),
175 v_minval8 = vdupq_n_u16(minVal);
176 u64 mask[2] = { 0ul };
177
178 minLocCapacity <<= 1;
179 maxLocCapacity <<= 1;
180
181 for (size_t i = 0; i < size.height; ++i)
182 {
183 const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
184 size_t j = 0;
185
186 for ( ; j < roiw16; j += 16)
187 {
188 internal::prefetch(src + j);
189 uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
190
191 uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8));
192 uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8));
193
194 vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
195
196 if (mask[0])
197 process(src, j, j + 8, i,
198 minVal, minLocPtr, minLocCount, minLocCapacity,
199 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
200 if (mask[1])
201 process(src, j + 8, j + 16, i,
202 minVal, minLocPtr, minLocCount, minLocCapacity,
203 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
204 }
205 for ( ; j < roiw8; j += 8)
206 {
207 internal::prefetch(src + j);
208 uint16x8_t v_src = vld1q_u16(src + j);
209
210 uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8);
211 uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8);
212 uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
213
214 vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
215
216 if (mask[0])
217 process(src, j, j + 8, i,
218 minVal, minLocPtr, minLocCount, minLocCapacity,
219 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
220 }
221
222 process(src, j, size.width, i,
223 minVal, minLocPtr, minLocCount, minLocCapacity,
224 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
225 }
226
227 minLocCount >>= 1;
228 maxLocCount >>= 1;
229 #else
230 (void)size;
231 (void)srcBase;
232 (void)srcStride;
233 (void)minVal;
234 (void)minLocPtr;
235 (void)minLocCount;
236 (void)minLocCapacity;
237 (void)maxVal;
238 (void)maxLocPtr;
239 (void)maxLocCount;
240 (void)maxLocCapacity;
241 #endif
242 }
243
fillMinMaxLocs(const Size2D & size,const s16 * srcBase,ptrdiff_t srcStride,s16 minVal,size_t * minLocPtr,s32 & minLocCount,s32 minLocCapacity,s16 maxVal,size_t * maxLocPtr,s32 & maxLocCount,s32 maxLocCapacity)244 void fillMinMaxLocs(const Size2D & size,
245 const s16 * srcBase, ptrdiff_t srcStride,
246 s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
247 s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
248 {
249 internal::assertSupportedConfiguration();
250 #ifdef CAROTENE_NEON
251 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
252 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
253
254 int16x8_t v_maxval8 = vdupq_n_s16(maxVal),
255 v_minval8 = vdupq_n_s16(minVal);
256 u64 mask[2] = { 0ul };
257
258 minLocCapacity <<= 1;
259 maxLocCapacity <<= 1;
260
261 for (size_t i = 0; i < size.height; ++i)
262 {
263 const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
264 size_t j = 0;
265
266 for ( ; j < roiw16; j += 16)
267 {
268 internal::prefetch(src + j);
269 int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
270
271 uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8));
272 uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8));
273
274 vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
275
276 if (mask[0])
277 process(src, j, j + 8, i,
278 minVal, minLocPtr, minLocCount, minLocCapacity,
279 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
280 if (mask[1])
281 process(src, j + 8, j + 16, i,
282 minVal, minLocPtr, minLocCount, minLocCapacity,
283 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
284 }
285 for ( ; j < roiw8; j += 8)
286 {
287 internal::prefetch(src + j);
288 int16x8_t v_src = vld1q_s16(src + j);
289
290 uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8);
291 uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8);
292 uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
293
294 vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
295
296 if (mask[0])
297 process(src, j, j + 8, i,
298 minVal, minLocPtr, minLocCount, minLocCapacity,
299 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
300 }
301
302 process(src, j, size.width, i,
303 minVal, minLocPtr, minLocCount, minLocCapacity,
304 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
305 }
306
307 minLocCount >>= 1;
308 maxLocCount >>= 1;
309 #else
310 (void)size;
311 (void)srcBase;
312 (void)srcStride;
313 (void)minVal;
314 (void)minLocPtr;
315 (void)minLocCount;
316 (void)minLocCapacity;
317 (void)maxVal;
318 (void)maxLocPtr;
319 (void)maxLocCount;
320 (void)maxLocCapacity;
321 #endif
322 }
323
fillMinMaxLocs(const Size2D & size,const s32 * srcBase,ptrdiff_t srcStride,s32 minVal,size_t * minLocPtr,s32 & minLocCount,s32 minLocCapacity,s32 maxVal,size_t * maxLocPtr,s32 & maxLocCount,s32 maxLocCapacity)324 void fillMinMaxLocs(const Size2D & size,
325 const s32 * srcBase, ptrdiff_t srcStride,
326 s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
327 s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
328 {
329 internal::assertSupportedConfiguration();
330 #ifdef CAROTENE_NEON
331 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
332
333 int32x4_t v_maxval4 = vdupq_n_s32(maxVal),
334 v_minval4 = vdupq_n_s32(minVal);
335 u64 mask = 0ul;
336
337 minLocCapacity <<= 1;
338 maxLocCapacity <<= 1;
339
340 for (size_t i = 0; i < size.height; ++i)
341 {
342 const s32 * src = internal::getRowPtr(srcBase, srcStride, i);
343 size_t j = 0;
344
345 for ( ; j < roiw8; j += 8)
346 {
347 internal::prefetch(src + j);
348 int32x4_t v_src0 = vld1q_s32(src + j), v_src1 = vld1q_s32(src + j + 4);
349
350 uint32x4_t v_mask0 = vorrq_u32(vceqq_s32(v_src0, v_maxval4), vceqq_s32(v_src0, v_minval4));
351 uint32x4_t v_mask1 = vorrq_u32(vceqq_s32(v_src1, v_maxval4), vceqq_s32(v_src1, v_minval4));
352
353 vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
354
355 if (mask)
356 process(src, j, j + 8, i,
357 minVal, minLocPtr, minLocCount, minLocCapacity,
358 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
359 }
360
361 process(src, j, size.width, i,
362 minVal, minLocPtr, minLocCount, minLocCapacity,
363 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
364 }
365
366 minLocCount >>= 1;
367 maxLocCount >>= 1;
368 #else
369 (void)size;
370 (void)srcBase;
371 (void)srcStride;
372 (void)minVal;
373 (void)minLocPtr;
374 (void)minLocCount;
375 (void)minLocCapacity;
376 (void)maxVal;
377 (void)maxLocPtr;
378 (void)maxLocCount;
379 (void)maxLocCapacity;
380 #endif
381 }
382
fillMinMaxLocs(const Size2D & size,const u32 * srcBase,ptrdiff_t srcStride,u32 minVal,size_t * minLocPtr,s32 & minLocCount,s32 minLocCapacity,u32 maxVal,size_t * maxLocPtr,s32 & maxLocCount,s32 maxLocCapacity)383 void fillMinMaxLocs(const Size2D & size,
384 const u32 * srcBase, ptrdiff_t srcStride,
385 u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
386 u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
387 {
388 internal::assertSupportedConfiguration();
389 #ifdef CAROTENE_NEON
390 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
391
392 uint32x4_t v_maxval4 = vdupq_n_u32(maxVal),
393 v_minval4 = vdupq_n_u32(minVal);
394 u64 mask = 0ul;
395
396 minLocCapacity <<= 1;
397 maxLocCapacity <<= 1;
398
399 for (size_t i = 0; i < size.height; ++i)
400 {
401 const u32 * src = internal::getRowPtr(srcBase, srcStride, i);
402 size_t j = 0;
403
404 for ( ; j < roiw8; j += 8)
405 {
406 internal::prefetch(src + j);
407 uint32x4_t v_src0 = vld1q_u32(src + j), v_src1 = vld1q_u32(src + j + 4);
408
409 uint32x4_t v_mask0 = vorrq_u32(vceqq_u32(v_src0, v_maxval4), vceqq_u32(v_src0, v_minval4));
410 uint32x4_t v_mask1 = vorrq_u32(vceqq_u32(v_src1, v_maxval4), vceqq_u32(v_src1, v_minval4));
411
412 vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
413
414 if (mask)
415 process(src, j, j + 8, i,
416 minVal, minLocPtr, minLocCount, minLocCapacity,
417 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
418 }
419
420 process(src, j, size.width, i,
421 minVal, minLocPtr, minLocCount, minLocCapacity,
422 maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
423 }
424
425 minLocCount >>= 1;
426 maxLocCount >>= 1;
427 #else
428 (void)size;
429 (void)srcBase;
430 (void)srcStride;
431 (void)minVal;
432 (void)minLocPtr;
433 (void)minLocCount;
434 (void)minLocCapacity;
435 (void)maxVal;
436 (void)maxLocPtr;
437 (void)maxLocCount;
438 (void)maxLocCapacity;
439 #endif
440 }
441
442 } // namespace CAROTENE_NS
443