1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "common.hpp"
41 #include "vtransform.hpp"
42
43 namespace CAROTENE_NS {
44
45 #ifdef CAROTENE_NEON
46
47 namespace {
48
49 template <typename T, typename WT>
50 struct AddWrap
51 {
52 typedef T type;
53
operator ()CAROTENE_NS::__anond1c967710111::AddWrap54 void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
55 const typename internal::VecTraits<T>::vec128 & v_src1,
56 typename internal::VecTraits<T>::vec128 & v_dst) const
57 {
58 v_dst = internal::vaddq(v_src0, v_src1);
59 }
60
operator ()CAROTENE_NS::__anond1c967710111::AddWrap61 void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
62 const typename internal::VecTraits<T>::vec64 & v_src1,
63 typename internal::VecTraits<T>::vec64 & v_dst) const
64 {
65 v_dst = internal::vadd(v_src0, v_src1);
66 }
67
operator ()CAROTENE_NS::__anond1c967710111::AddWrap68 void operator() (const T * src0, const T * src1, T * dst) const
69 {
70 dst[0] = (T)((WT)src0[0] + (WT)src1[0]);
71 }
72 };
73
74 template <typename T, typename WT>
75 struct AddSaturate
76 {
77 typedef T type;
78
operator ()CAROTENE_NS::__anond1c967710111::AddSaturate79 void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
80 const typename internal::VecTraits<T>::vec128 & v_src1,
81 typename internal::VecTraits<T>::vec128 & v_dst) const
82 {
83 v_dst = internal::vqaddq(v_src0, v_src1);
84 }
85
operator ()CAROTENE_NS::__anond1c967710111::AddSaturate86 void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
87 const typename internal::VecTraits<T>::vec64 & v_src1,
88 typename internal::VecTraits<T>::vec64 & v_dst) const
89 {
90 v_dst = internal::vqadd(v_src0, v_src1);
91 }
92
operator ()CAROTENE_NS::__anond1c967710111::AddSaturate93 void operator() (const T * src0, const T * src1, T * dst) const
94 {
95 dst[0] = internal::saturate_cast<T>((WT)src0[0] + (WT)src1[0]);
96 }
97 };
98
99 } // namespace
100
101 #endif
102
add(const Size2D & size,const u8 * src0Base,ptrdiff_t src0Stride,const u8 * src1Base,ptrdiff_t src1Stride,u8 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)103 void add(const Size2D &size,
104 const u8 * src0Base, ptrdiff_t src0Stride,
105 const u8 * src1Base, ptrdiff_t src1Stride,
106 u8 *dstBase, ptrdiff_t dstStride,
107 CONVERT_POLICY policy)
108 {
109 internal::assertSupportedConfiguration();
110 #ifdef CAROTENE_NEON
111 if (policy == CONVERT_POLICY_SATURATE)
112 {
113 internal::vtransform(size,
114 src0Base, src0Stride,
115 src1Base, src1Stride,
116 dstBase, dstStride,
117 AddSaturate<u8, u16>());
118 }
119 else
120 {
121 internal::vtransform(size,
122 src0Base, src0Stride,
123 src1Base, src1Stride,
124 dstBase, dstStride,
125 AddWrap<u8, u16>());
126 }
127 #else
128 (void)size;
129 (void)src0Base;
130 (void)src0Stride;
131 (void)src1Base;
132 (void)src1Stride;
133 (void)dstBase;
134 (void)dstStride;
135 (void)policy;
136 #endif
137 }
138
add(const Size2D & size,const s8 * src0Base,ptrdiff_t src0Stride,const s8 * src1Base,ptrdiff_t src1Stride,s8 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)139 void add(const Size2D &size,
140 const s8 * src0Base, ptrdiff_t src0Stride,
141 const s8 * src1Base, ptrdiff_t src1Stride,
142 s8 *dstBase, ptrdiff_t dstStride,
143 CONVERT_POLICY policy)
144 {
145 internal::assertSupportedConfiguration();
146 #ifdef CAROTENE_NEON
147 if (policy == CONVERT_POLICY_SATURATE)
148 {
149 internal::vtransform(size,
150 src0Base, src0Stride,
151 src1Base, src1Stride,
152 dstBase, dstStride,
153 AddSaturate<s8, s16>());
154 }
155 else
156 {
157 internal::vtransform(size,
158 src0Base, src0Stride,
159 src1Base, src1Stride,
160 dstBase, dstStride,
161 AddWrap<s8, s16>());
162 }
163 #else
164 (void)size;
165 (void)src0Base;
166 (void)src0Stride;
167 (void)src1Base;
168 (void)src1Stride;
169 (void)dstBase;
170 (void)dstStride;
171 (void)policy;
172 #endif
173 }
174
add(const Size2D & size,const u8 * src0Base,ptrdiff_t src0Stride,const u8 * src1Base,ptrdiff_t src1Stride,s16 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY)175 void add(const Size2D &size,
176 const u8 * src0Base, ptrdiff_t src0Stride,
177 const u8 * src1Base, ptrdiff_t src1Stride,
178 s16 *dstBase, ptrdiff_t dstStride,
179 CONVERT_POLICY)
180 {
181 internal::assertSupportedConfiguration();
182 #ifdef CAROTENE_NEON
183 size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
184 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
185
186 for (size_t i = 0; i < size.height; ++i)
187 {
188 const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
189 const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
190 u16 * dst = internal::getRowPtr((u16 *)dstBase, dstStride, i);
191 size_t j = 0;
192
193 for (; j < roiw32; j += 32)
194 {
195 internal::prefetch(src0 + j);
196 internal::prefetch(src1 + j);
197 uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16);
198 uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16);
199 vst1q_u16(dst + j, vaddl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10)));
200 vst1q_u16(dst + j + 8, vaddl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10)));
201 vst1q_u16(dst + j + 16, vaddl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11)));
202 vst1q_u16(dst + j + 24, vaddl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11)));
203 }
204 for (; j < roiw8; j += 8)
205 {
206 uint8x8_t v_src0 = vld1_u8(src0 + j);
207 uint8x8_t v_src1 = vld1_u8(src1 + j);
208 vst1q_u16(dst + j, vaddl_u8(v_src0, v_src1));
209 }
210
211 for (; j < size.width; j++)
212 dst[j] = (u16)src0[j] + (u16)src1[j];
213 }
214 #else
215 (void)size;
216 (void)src0Base;
217 (void)src0Stride;
218 (void)src1Base;
219 (void)src1Stride;
220 (void)dstBase;
221 (void)dstStride;
222 #endif
223 }
224
add(const Size2D & size,const u8 * src0Base,ptrdiff_t src0Stride,const s16 * src1Base,ptrdiff_t src1Stride,s16 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)225 void add(const Size2D &size,
226 const u8 * src0Base, ptrdiff_t src0Stride,
227 const s16 * src1Base, ptrdiff_t src1Stride,
228 s16 *dstBase, ptrdiff_t dstStride,
229 CONVERT_POLICY policy)
230 {
231 internal::assertSupportedConfiguration();
232 #ifdef CAROTENE_NEON
233 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
234 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
235
236 for (size_t i = 0; i < size.height; ++i)
237 {
238 const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
239 const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
240 s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
241 size_t j = 0;
242
243 if (policy == CONVERT_POLICY_SATURATE)
244 {
245 for (; j < roiw16; j += 16)
246 {
247 internal::prefetch(src0 + j);
248 internal::prefetch(src1 + j);
249 uint8x16_t v_src0 = vld1q_u8(src0 + j);
250 int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
251 int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
252 int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
253 int16x8_t v_dst0 = vqaddq_s16(v_src00, v_src10);
254 int16x8_t v_dst1 = vqaddq_s16(v_src01, v_src11);
255 vst1q_s16(dst + j, v_dst0);
256 vst1q_s16(dst + j + 8, v_dst1);
257 }
258 for (; j < roiw8; j += 8)
259 {
260 int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
261 int16x8_t v_src1 = vld1q_s16(src1 + j);
262 int16x8_t v_dst = vqaddq_s16(v_src0, v_src1);
263 vst1q_s16(dst + j, v_dst);
264 }
265
266 for (; j < size.width; j++)
267 dst[j] = internal::saturate_cast<s16>((s32)src0[j] + (s32)src1[j]);
268 }
269 else
270 {
271 for (; j < roiw16; j += 16)
272 {
273 internal::prefetch(src0 + j);
274 internal::prefetch(src1 + j);
275 uint8x16_t v_src0 = vld1q_u8(src0 + j);
276 int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
277 int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
278 int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
279 int16x8_t v_dst0 = vaddq_s16(v_src00, v_src10);
280 int16x8_t v_dst1 = vaddq_s16(v_src01, v_src11);
281 vst1q_s16(dst + j, v_dst0);
282 vst1q_s16(dst + j + 8, v_dst1);
283 }
284 for (; j < roiw8; j += 8)
285 {
286 int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
287 int16x8_t v_src1 = vld1q_s16(src1 + j);
288 int16x8_t v_dst = vaddq_s16(v_src0, v_src1);
289 vst1q_s16(dst + j, v_dst);
290 }
291
292 for (; j < size.width; j++)
293 dst[j] = (s16)((s32)src0[j] + (s32)src1[j]);
294 }
295 }
296 #else
297 (void)size;
298 (void)src0Base;
299 (void)src0Stride;
300 (void)src1Base;
301 (void)src1Stride;
302 (void)dstBase;
303 (void)dstStride;
304 (void)policy;
305 #endif
306 }
307
add(const Size2D & size,const s16 * src0Base,ptrdiff_t src0Stride,const s16 * src1Base,ptrdiff_t src1Stride,s16 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)308 void add(const Size2D &size,
309 const s16 * src0Base, ptrdiff_t src0Stride,
310 const s16 * src1Base, ptrdiff_t src1Stride,
311 s16 *dstBase, ptrdiff_t dstStride,
312 CONVERT_POLICY policy)
313 {
314 internal::assertSupportedConfiguration();
315 #ifdef CAROTENE_NEON
316 if (policy == CONVERT_POLICY_SATURATE)
317 {
318 internal::vtransform(size,
319 src0Base, src0Stride,
320 src1Base, src1Stride,
321 dstBase, dstStride,
322 AddSaturate<s16, s32>());
323 }
324 else
325 {
326 internal::vtransform(size,
327 src0Base, src0Stride,
328 src1Base, src1Stride,
329 dstBase, dstStride,
330 AddWrap<s16, s32>());
331 }
332 #else
333 (void)size;
334 (void)src0Base;
335 (void)src0Stride;
336 (void)src1Base;
337 (void)src1Stride;
338 (void)dstBase;
339 (void)dstStride;
340 (void)policy;
341 #endif
342 }
343
add(const Size2D & size,const u16 * src0Base,ptrdiff_t src0Stride,const u16 * src1Base,ptrdiff_t src1Stride,u16 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)344 void add(const Size2D &size,
345 const u16 * src0Base, ptrdiff_t src0Stride,
346 const u16 * src1Base, ptrdiff_t src1Stride,
347 u16 * dstBase, ptrdiff_t dstStride,
348 CONVERT_POLICY policy)
349 {
350 internal::assertSupportedConfiguration();
351 #ifdef CAROTENE_NEON
352 if (policy == CONVERT_POLICY_SATURATE)
353 {
354 internal::vtransform(size,
355 src0Base, src0Stride,
356 src1Base, src1Stride,
357 dstBase, dstStride,
358 AddSaturate<u16, u32>());
359 }
360 else
361 {
362 internal::vtransform(size,
363 src0Base, src0Stride,
364 src1Base, src1Stride,
365 dstBase, dstStride,
366 AddWrap<u16, u32>());
367 }
368 #else
369 (void)size;
370 (void)src0Base;
371 (void)src0Stride;
372 (void)src1Base;
373 (void)src1Stride;
374 (void)dstBase;
375 (void)dstStride;
376 (void)policy;
377 #endif
378 }
379
add(const Size2D & size,const s32 * src0Base,ptrdiff_t src0Stride,const s32 * src1Base,ptrdiff_t src1Stride,s32 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)380 void add(const Size2D &size,
381 const s32 * src0Base, ptrdiff_t src0Stride,
382 const s32 * src1Base, ptrdiff_t src1Stride,
383 s32 *dstBase, ptrdiff_t dstStride,
384 CONVERT_POLICY policy)
385 {
386 internal::assertSupportedConfiguration();
387 #ifdef CAROTENE_NEON
388 if (policy == CONVERT_POLICY_SATURATE)
389 {
390 internal::vtransform(size,
391 src0Base, src0Stride,
392 src1Base, src1Stride,
393 dstBase, dstStride,
394 AddSaturate<s32, s64>());
395 }
396 else
397 {
398 internal::vtransform(size,
399 src0Base, src0Stride,
400 src1Base, src1Stride,
401 dstBase, dstStride,
402 AddWrap<s32, s64>());
403 }
404 #else
405 (void)size;
406 (void)src0Base;
407 (void)src0Stride;
408 (void)src1Base;
409 (void)src1Stride;
410 (void)dstBase;
411 (void)dstStride;
412 (void)policy;
413 #endif
414 }
415
add(const Size2D & size,const u32 * src0Base,ptrdiff_t src0Stride,const u32 * src1Base,ptrdiff_t src1Stride,u32 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)416 void add(const Size2D &size,
417 const u32 * src0Base, ptrdiff_t src0Stride,
418 const u32 * src1Base, ptrdiff_t src1Stride,
419 u32 * dstBase, ptrdiff_t dstStride,
420 CONVERT_POLICY policy)
421 {
422 internal::assertSupportedConfiguration();
423 #ifdef CAROTENE_NEON
424 if (policy == CONVERT_POLICY_SATURATE)
425 {
426 internal::vtransform(size,
427 src0Base, src0Stride,
428 src1Base, src1Stride,
429 dstBase, dstStride,
430 AddSaturate<u32, u64>());
431 }
432 else
433 {
434 internal::vtransform(size,
435 src0Base, src0Stride,
436 src1Base, src1Stride,
437 dstBase, dstStride,
438 AddWrap<u32, u64>());
439 }
440 #else
441 (void)size;
442 (void)src0Base;
443 (void)src0Stride;
444 (void)src1Base;
445 (void)src1Stride;
446 (void)dstBase;
447 (void)dstStride;
448 (void)policy;
449 #endif
450 }
451
add(const Size2D & size,const f32 * src0Base,ptrdiff_t src0Stride,const f32 * src1Base,ptrdiff_t src1Stride,f32 * dstBase,ptrdiff_t dstStride)452 void add(const Size2D &size,
453 const f32 * src0Base, ptrdiff_t src0Stride,
454 const f32 * src1Base, ptrdiff_t src1Stride,
455 f32 * dstBase, ptrdiff_t dstStride)
456 {
457 internal::assertSupportedConfiguration();
458 #ifdef CAROTENE_NEON
459 internal::vtransform(size,
460 src0Base, src0Stride,
461 src1Base, src1Stride,
462 dstBase, dstStride,
463 AddWrap<f32, f32>());
464 #else
465 (void)size;
466 (void)src0Base;
467 (void)src0Stride;
468 (void)src1Base;
469 (void)src1Stride;
470 (void)dstBase;
471 (void)dstStride;
472 #endif
473 }
474
475 } // namespace CAROTENE_NS
476