• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #include "common.hpp"
41 #include "vtransform.hpp"
42 
43 namespace CAROTENE_NS {
44 
45 #ifdef CAROTENE_NEON
46 
47 namespace {
48 
49 template <typename T, typename WT>
50 struct AddWrap
51 {
52     typedef T type;
53 
operator ()CAROTENE_NS::__anond1c967710111::AddWrap54     void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
55                      const typename internal::VecTraits<T>::vec128 & v_src1,
56                      typename internal::VecTraits<T>::vec128 & v_dst) const
57     {
58         v_dst = internal::vaddq(v_src0, v_src1);
59     }
60 
operator ()CAROTENE_NS::__anond1c967710111::AddWrap61     void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
62                      const typename internal::VecTraits<T>::vec64 & v_src1,
63                      typename internal::VecTraits<T>::vec64 & v_dst) const
64     {
65         v_dst = internal::vadd(v_src0, v_src1);
66     }
67 
operator ()CAROTENE_NS::__anond1c967710111::AddWrap68     void operator() (const T * src0, const T * src1, T * dst) const
69     {
70         dst[0] = (T)((WT)src0[0] + (WT)src1[0]);
71     }
72 };
73 
74 template <typename T, typename WT>
75 struct AddSaturate
76 {
77     typedef T type;
78 
operator ()CAROTENE_NS::__anond1c967710111::AddSaturate79     void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
80                      const typename internal::VecTraits<T>::vec128 & v_src1,
81                      typename internal::VecTraits<T>::vec128 & v_dst) const
82     {
83         v_dst = internal::vqaddq(v_src0, v_src1);
84     }
85 
operator ()CAROTENE_NS::__anond1c967710111::AddSaturate86     void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
87                      const typename internal::VecTraits<T>::vec64 & v_src1,
88                      typename internal::VecTraits<T>::vec64 & v_dst) const
89     {
90         v_dst = internal::vqadd(v_src0, v_src1);
91     }
92 
operator ()CAROTENE_NS::__anond1c967710111::AddSaturate93     void operator() (const T * src0, const T * src1, T * dst) const
94     {
95         dst[0] = internal::saturate_cast<T>((WT)src0[0] + (WT)src1[0]);
96     }
97 };
98 
99 } // namespace
100 
101 #endif
102 
add(const Size2D & size,const u8 * src0Base,ptrdiff_t src0Stride,const u8 * src1Base,ptrdiff_t src1Stride,u8 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)103 void add(const Size2D &size,
104          const u8 * src0Base, ptrdiff_t src0Stride,
105          const u8 * src1Base, ptrdiff_t src1Stride,
106          u8 *dstBase, ptrdiff_t dstStride,
107          CONVERT_POLICY policy)
108 {
109     internal::assertSupportedConfiguration();
110 #ifdef CAROTENE_NEON
111     if (policy == CONVERT_POLICY_SATURATE)
112     {
113         internal::vtransform(size,
114                              src0Base, src0Stride,
115                              src1Base, src1Stride,
116                              dstBase, dstStride,
117                              AddSaturate<u8, u16>());
118     }
119     else
120     {
121         internal::vtransform(size,
122                              src0Base, src0Stride,
123                              src1Base, src1Stride,
124                              dstBase, dstStride,
125                              AddWrap<u8, u16>());
126     }
127 #else
128     (void)size;
129     (void)src0Base;
130     (void)src0Stride;
131     (void)src1Base;
132     (void)src1Stride;
133     (void)dstBase;
134     (void)dstStride;
135     (void)policy;
136 #endif
137 }
138 
add(const Size2D & size,const s8 * src0Base,ptrdiff_t src0Stride,const s8 * src1Base,ptrdiff_t src1Stride,s8 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)139 void add(const Size2D &size,
140          const s8 * src0Base, ptrdiff_t src0Stride,
141          const s8 * src1Base, ptrdiff_t src1Stride,
142          s8 *dstBase, ptrdiff_t dstStride,
143          CONVERT_POLICY policy)
144 {
145     internal::assertSupportedConfiguration();
146 #ifdef CAROTENE_NEON
147     if (policy == CONVERT_POLICY_SATURATE)
148     {
149         internal::vtransform(size,
150                              src0Base, src0Stride,
151                              src1Base, src1Stride,
152                              dstBase, dstStride,
153                              AddSaturate<s8, s16>());
154     }
155     else
156     {
157         internal::vtransform(size,
158                              src0Base, src0Stride,
159                              src1Base, src1Stride,
160                              dstBase, dstStride,
161                              AddWrap<s8, s16>());
162     }
163 #else
164     (void)size;
165     (void)src0Base;
166     (void)src0Stride;
167     (void)src1Base;
168     (void)src1Stride;
169     (void)dstBase;
170     (void)dstStride;
171     (void)policy;
172 #endif
173 }
174 
add(const Size2D & size,const u8 * src0Base,ptrdiff_t src0Stride,const u8 * src1Base,ptrdiff_t src1Stride,s16 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY)175 void add(const Size2D &size,
176          const u8 * src0Base, ptrdiff_t src0Stride,
177          const u8 * src1Base, ptrdiff_t src1Stride,
178          s16 *dstBase, ptrdiff_t dstStride,
179          CONVERT_POLICY)
180 {
181     internal::assertSupportedConfiguration();
182 #ifdef CAROTENE_NEON
183     size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
184     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
185 
186     for (size_t i = 0; i < size.height; ++i)
187     {
188         const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
189         const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
190         u16 * dst = internal::getRowPtr((u16 *)dstBase, dstStride, i);
191         size_t j = 0;
192 
193         for (; j < roiw32; j += 32)
194         {
195             internal::prefetch(src0 + j);
196             internal::prefetch(src1 + j);
197             uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16);
198             uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16);
199             vst1q_u16(dst + j, vaddl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10)));
200             vst1q_u16(dst + j + 8, vaddl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10)));
201             vst1q_u16(dst + j + 16, vaddl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11)));
202             vst1q_u16(dst + j + 24, vaddl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11)));
203         }
204         for (; j < roiw8; j += 8)
205         {
206             uint8x8_t v_src0 = vld1_u8(src0 + j);
207             uint8x8_t v_src1 = vld1_u8(src1 + j);
208             vst1q_u16(dst + j, vaddl_u8(v_src0, v_src1));
209         }
210 
211         for (; j < size.width; j++)
212             dst[j] = (u16)src0[j] + (u16)src1[j];
213     }
214 #else
215     (void)size;
216     (void)src0Base;
217     (void)src0Stride;
218     (void)src1Base;
219     (void)src1Stride;
220     (void)dstBase;
221     (void)dstStride;
222 #endif
223 }
224 
add(const Size2D & size,const u8 * src0Base,ptrdiff_t src0Stride,const s16 * src1Base,ptrdiff_t src1Stride,s16 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)225 void add(const Size2D &size,
226          const u8 * src0Base, ptrdiff_t src0Stride,
227          const s16 * src1Base, ptrdiff_t src1Stride,
228          s16 *dstBase, ptrdiff_t dstStride,
229          CONVERT_POLICY policy)
230 {
231     internal::assertSupportedConfiguration();
232 #ifdef CAROTENE_NEON
233     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
234     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
235 
236     for (size_t i = 0; i < size.height; ++i)
237     {
238         const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
239         const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
240         s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
241         size_t j = 0;
242 
243         if (policy == CONVERT_POLICY_SATURATE)
244         {
245             for (; j < roiw16; j += 16)
246             {
247                 internal::prefetch(src0 + j);
248                 internal::prefetch(src1 + j);
249                 uint8x16_t v_src0 = vld1q_u8(src0 + j);
250                 int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
251                 int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
252                 int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
253                 int16x8_t v_dst0 = vqaddq_s16(v_src00, v_src10);
254                 int16x8_t v_dst1 = vqaddq_s16(v_src01, v_src11);
255                 vst1q_s16(dst + j, v_dst0);
256                 vst1q_s16(dst + j + 8, v_dst1);
257             }
258             for (; j < roiw8; j += 8)
259             {
260                 int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
261                 int16x8_t v_src1 = vld1q_s16(src1 + j);
262                 int16x8_t v_dst = vqaddq_s16(v_src0, v_src1);
263                 vst1q_s16(dst + j, v_dst);
264             }
265 
266             for (; j < size.width; j++)
267                 dst[j] = internal::saturate_cast<s16>((s32)src0[j] + (s32)src1[j]);
268         }
269         else
270         {
271             for (; j < roiw16; j += 16)
272             {
273                 internal::prefetch(src0 + j);
274                 internal::prefetch(src1 + j);
275                 uint8x16_t v_src0 = vld1q_u8(src0 + j);
276                 int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
277                 int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
278                 int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
279                 int16x8_t v_dst0 = vaddq_s16(v_src00, v_src10);
280                 int16x8_t v_dst1 = vaddq_s16(v_src01, v_src11);
281                 vst1q_s16(dst + j, v_dst0);
282                 vst1q_s16(dst + j + 8, v_dst1);
283             }
284             for (; j < roiw8; j += 8)
285             {
286                 int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
287                 int16x8_t v_src1 = vld1q_s16(src1 + j);
288                 int16x8_t v_dst = vaddq_s16(v_src0, v_src1);
289                 vst1q_s16(dst + j, v_dst);
290             }
291 
292             for (; j < size.width; j++)
293                 dst[j] = (s16)((s32)src0[j] + (s32)src1[j]);
294         }
295     }
296 #else
297     (void)size;
298     (void)src0Base;
299     (void)src0Stride;
300     (void)src1Base;
301     (void)src1Stride;
302     (void)dstBase;
303     (void)dstStride;
304     (void)policy;
305 #endif
306 }
307 
add(const Size2D & size,const s16 * src0Base,ptrdiff_t src0Stride,const s16 * src1Base,ptrdiff_t src1Stride,s16 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)308 void add(const Size2D &size,
309          const s16 * src0Base, ptrdiff_t src0Stride,
310          const s16 * src1Base, ptrdiff_t src1Stride,
311          s16 *dstBase, ptrdiff_t dstStride,
312          CONVERT_POLICY policy)
313 {
314     internal::assertSupportedConfiguration();
315 #ifdef CAROTENE_NEON
316         if (policy == CONVERT_POLICY_SATURATE)
317     {
318         internal::vtransform(size,
319                              src0Base, src0Stride,
320                              src1Base, src1Stride,
321                              dstBase, dstStride,
322                              AddSaturate<s16, s32>());
323     }
324     else
325     {
326         internal::vtransform(size,
327                              src0Base, src0Stride,
328                              src1Base, src1Stride,
329                              dstBase, dstStride,
330                              AddWrap<s16, s32>());
331     }
332 #else
333     (void)size;
334     (void)src0Base;
335     (void)src0Stride;
336     (void)src1Base;
337     (void)src1Stride;
338     (void)dstBase;
339     (void)dstStride;
340     (void)policy;
341 #endif
342 }
343 
add(const Size2D & size,const u16 * src0Base,ptrdiff_t src0Stride,const u16 * src1Base,ptrdiff_t src1Stride,u16 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)344 void add(const Size2D &size,
345          const u16 * src0Base, ptrdiff_t src0Stride,
346          const u16 * src1Base, ptrdiff_t src1Stride,
347          u16 * dstBase, ptrdiff_t dstStride,
348          CONVERT_POLICY policy)
349 {
350     internal::assertSupportedConfiguration();
351 #ifdef CAROTENE_NEON
352         if (policy == CONVERT_POLICY_SATURATE)
353     {
354         internal::vtransform(size,
355                              src0Base, src0Stride,
356                              src1Base, src1Stride,
357                              dstBase, dstStride,
358                              AddSaturate<u16, u32>());
359     }
360     else
361     {
362         internal::vtransform(size,
363                              src0Base, src0Stride,
364                              src1Base, src1Stride,
365                              dstBase, dstStride,
366                              AddWrap<u16, u32>());
367     }
368 #else
369     (void)size;
370     (void)src0Base;
371     (void)src0Stride;
372     (void)src1Base;
373     (void)src1Stride;
374     (void)dstBase;
375     (void)dstStride;
376     (void)policy;
377 #endif
378 }
379 
add(const Size2D & size,const s32 * src0Base,ptrdiff_t src0Stride,const s32 * src1Base,ptrdiff_t src1Stride,s32 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)380 void add(const Size2D &size,
381          const s32 * src0Base, ptrdiff_t src0Stride,
382          const s32 * src1Base, ptrdiff_t src1Stride,
383          s32 *dstBase, ptrdiff_t dstStride,
384          CONVERT_POLICY policy)
385 {
386     internal::assertSupportedConfiguration();
387 #ifdef CAROTENE_NEON
388         if (policy == CONVERT_POLICY_SATURATE)
389     {
390         internal::vtransform(size,
391                              src0Base, src0Stride,
392                              src1Base, src1Stride,
393                              dstBase, dstStride,
394                              AddSaturate<s32, s64>());
395     }
396     else
397     {
398         internal::vtransform(size,
399                              src0Base, src0Stride,
400                              src1Base, src1Stride,
401                              dstBase, dstStride,
402                              AddWrap<s32, s64>());
403     }
404 #else
405     (void)size;
406     (void)src0Base;
407     (void)src0Stride;
408     (void)src1Base;
409     (void)src1Stride;
410     (void)dstBase;
411     (void)dstStride;
412     (void)policy;
413 #endif
414 }
415 
add(const Size2D & size,const u32 * src0Base,ptrdiff_t src0Stride,const u32 * src1Base,ptrdiff_t src1Stride,u32 * dstBase,ptrdiff_t dstStride,CONVERT_POLICY policy)416 void add(const Size2D &size,
417          const u32 * src0Base, ptrdiff_t src0Stride,
418          const u32 * src1Base, ptrdiff_t src1Stride,
419          u32 * dstBase, ptrdiff_t dstStride,
420          CONVERT_POLICY policy)
421 {
422     internal::assertSupportedConfiguration();
423 #ifdef CAROTENE_NEON
424         if (policy == CONVERT_POLICY_SATURATE)
425     {
426         internal::vtransform(size,
427                              src0Base, src0Stride,
428                              src1Base, src1Stride,
429                              dstBase, dstStride,
430                              AddSaturate<u32, u64>());
431     }
432     else
433     {
434         internal::vtransform(size,
435                              src0Base, src0Stride,
436                              src1Base, src1Stride,
437                              dstBase, dstStride,
438                              AddWrap<u32, u64>());
439     }
440 #else
441     (void)size;
442     (void)src0Base;
443     (void)src0Stride;
444     (void)src1Base;
445     (void)src1Stride;
446     (void)dstBase;
447     (void)dstStride;
448     (void)policy;
449 #endif
450 }
451 
add(const Size2D & size,const f32 * src0Base,ptrdiff_t src0Stride,const f32 * src1Base,ptrdiff_t src1Stride,f32 * dstBase,ptrdiff_t dstStride)452 void add(const Size2D &size,
453          const f32 * src0Base, ptrdiff_t src0Stride,
454          const f32 * src1Base, ptrdiff_t src1Stride,
455          f32 * dstBase, ptrdiff_t dstStride)
456 {
457     internal::assertSupportedConfiguration();
458 #ifdef CAROTENE_NEON
459     internal::vtransform(size,
460                          src0Base, src0Stride,
461                          src1Base, src1Stride,
462                          dstBase, dstStride,
463                          AddWrap<f32, f32>());
464 #else
465     (void)size;
466     (void)src0Base;
467     (void)src0Stride;
468     (void)src1Base;
469     (void)src1Stride;
470     (void)dstBase;
471     (void)dstStride;
472 #endif
473 }
474 
475 } // namespace CAROTENE_NS
476