/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "common.hpp"
#include "vtransform.hpp"

#include <cstring>
#include <cfloat>
#include <cmath>
#include <limits>

namespace CAROTENE_NS {

namespace {

#ifdef CAROTENE_NEON

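// Round to nearest with ties away from zero: add +/-0.5, taking the sign of
// each lane from v, so that the subsequent truncating float-to-int conversion
// rounds instead of truncating.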
inline float32x4_t vroundq(const float32x4_t& v)
{
    const int32x4_t signMask = vdupq_n_s32(1 << 31), half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
    float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
    return vaddq_f32(v, v_addition);
}

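// Saturating element-wise division: dst = saturate(v1 * scale / v2).
// The generic version widens each half of the vector (vmovl), recurses until it
// reaches the 32-bit specializations, then narrows back with saturation
// (vqmovn). The 32-bit specializations do the work in float via a vector
// reciprocal and round with vroundq/vround before converting back.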
template <typename T>
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)),
                                                            internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)),
                                                            internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
template <>
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); }
template <>
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); }

inline float32x2_t vround(const float32x2_t& v)
{
    const int32x2_t signMask = vdup_n_s32(1 << 31), half = vreinterpret_s32_f32(vdup_n_f32(0.5f));
    float32x2_t v_addition = vreinterpret_f32_s32(vorr_s32(half, vand_s32(signMask, vreinterpret_s32_f32(v))));
    return vadd_f32(v, v_addition);
}

template <typename T>
inline T divSaturate(const T &v1, const T &v2, const float scale)
{
    return internal::vqmovn(divSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
template <>
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2))))); }
template <>
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2))))); }


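// Wrapping element-wise division: dst = (T)(v1 * scale / v2).
// Same widen/recurse/narrow structure as divSaturateQ, but narrows with vmovn
// (truncating, wrap-around on overflow) and converts without the +/-0.5
// rounding step, so the quotient is truncated toward zero.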
template <typename T>
inline T divWrapQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)),
                                                       internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)),
                                                       internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
template <>
inline int32x4_t divWrapQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
template <>
inline uint32x4_t divWrapQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }

template <typename T>
inline T divWrap(const T &v1, const T &v2, const float scale)
{
    return internal::vmovn(divWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
template <>
inline int32x2_t divWrap<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
template <>
inline uint32x2_t divWrap<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }

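// vtstq/vtst applied to (x, x) yields an all-ones lane where x is nonzero and
// zero otherwise; the kernels below AND this mask with the quotient to force
// the result to 0 wherever the divisor is 0.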
inline  uint8x16_t vtstq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vtstq_u8 (v0, v1); }
inline  uint16x8_t vtstq(const uint16x8_t  & v0, const uint16x8_t  & v1) { return vtstq_u16(v0, v1); }
inline  uint32x4_t vtstq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vtstq_u32(v0, v1); }
inline   int8x16_t vtstq(const int8x16_t   & v0, const int8x16_t   & v1) { return vreinterpretq_s8_u8  (vtstq_s8 (v0, v1)); }
inline   int16x8_t vtstq(const int16x8_t   & v0, const int16x8_t   & v1) { return vreinterpretq_s16_u16(vtstq_s16(v0, v1)); }
inline   int32x4_t vtstq(const int32x4_t   & v0, const int32x4_t   & v1) { return vreinterpretq_s32_u32(vtstq_s32(v0, v1)); }

inline   uint8x8_t vtst(const uint8x8_t   & v0, const uint8x8_t   & v1) { return vtst_u8 (v0, v1); }
inline  uint16x4_t vtst(const uint16x4_t  & v0, const uint16x4_t  & v1) { return vtst_u16(v0, v1); }
inline  uint32x2_t vtst(const uint32x2_t  & v0, const uint32x2_t  & v1) { return vtst_u32(v0, v1); }
inline    int8x8_t vtst(const int8x8_t    & v0, const int8x8_t    & v1) { return vreinterpret_s8_u8  (vtst_s8 (v0, v1)); }
inline   int16x4_t vtst(const int16x4_t   & v0, const int16x4_t   & v1) { return vreinterpret_s16_u16(vtst_s16(v0, v1)); }
inline   int32x2_t vtst(const int32x2_t   & v0, const int32x2_t   & v1) { return vreinterpret_s32_u32(vtst_s32(v0, v1)); }
#endif

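// Element-wise scaled integer division: dst = policy(scale * src0 / src1),
// with dst forced to 0 wherever src1 is 0. Each row is processed in 128-bit
// chunks, then 64-bit chunks, then a scalar tail.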
template <typename T>
void div(const Size2D &size,
         const T * src0Base, ptrdiff_t src0Stride,
         const T * src1Base, ptrdiff_t src1Stride,
         T * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;

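    // Fast path: when scale is 0, or for integer types when |scale| * max()
    // is below 1 (every scaled quotient then lies strictly between -1 and 1),
    // the destination is simply zero-filled.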
    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         (scale * std::numeric_limits<T>::max()) <  1.0f &&
         (scale * std::numeric_limits<T>::max()) > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }

    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divSaturateQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divSaturate(v_src0, v_src1, scale)));
            }
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale * src0[j] / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divWrapQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divWrap(v_src0, v_src1, scale)));
            }
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale * src0[j] / src1[j])) : 0;
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}

#ifdef CAROTENE_NEON

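// Reciprocal helpers: compute scale / v2 per lane via a vector reciprocal,
// narrowing back with saturation (recipSaturateQ / recipSaturate) or with
// wrap-around (recipWrapQ / recipWrap), mirroring the div* helpers above.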
template <typename T>
inline T recipSaturateQ(const T &v2, const float scale)
{
    return internal::vcombine(internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
template <>
inline int32x4_t recipSaturateQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipSaturateQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }

template <typename T>
inline T recipSaturate(const T &v2, const float scale)
{
    return internal::vqmovn(recipSaturateQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipSaturate<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipSaturate<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }


template <typename T>
inline T recipWrapQ(const T &v2, const float scale)
{
    return internal::vcombine(internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
template <>
inline int32x4_t recipWrapQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipWrapQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }

template <typename T>
inline T recipWrap(const T &v2, const float scale)
{
    return internal::vmovn(recipWrapQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipWrap<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipWrap<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
#endif

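// Element-wise scaled reciprocal: dst = policy(scale / src1), with dst forced
// to 0 wherever src1 is 0. Same row/vector/tail structure as div() above.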
template <typename T>
void recip(const Size2D &size,
           const T * src1Base, ptrdiff_t src1Stride,
           T * dstBase, ptrdiff_t dstStride,
           f32 scale,
           CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;

    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         scale <  1.0f &&
         scale > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }

    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);

                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipSaturateQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipSaturate(v_src1, scale)));
            }
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);

                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipWrapQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipWrap(v_src1, scale)));
            }
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale / src1[j])) : 0;
            }
        }
    }
#else
    (void)size;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}

}

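// Public entry points: thin wrappers that instantiate the div<T> kernel for
// each supported element type.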
void div(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         u8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const s8 * src0Base, ptrdiff_t src0Stride,
         const s8 * src1Base, ptrdiff_t src1Stride,
         s8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const u16 * src0Base, ptrdiff_t src0Stride,
         const u16 * src1Base, ptrdiff_t src1Stride,
         u16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const s32 * src0Base, ptrdiff_t src0Stride,
         const s32 * src1Base, ptrdiff_t src1Stride,
         s32 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

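// f32 overload: plain floating-point division, with lanes whose divisor is
// exactly 0 cleared to 0 (vceq builds the zero mask, vbic clears those lanes).
// The scale == 1 case skips the extra multiply by scale.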
void div(const Size2D &size,
         const f32 * src0Base, ptrdiff_t src0Stride,
         const f32 * src1Base, ptrdiff_t src1Stride,
         f32 * dstBase, ptrdiff_t dstStride,
         f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }

    float32x4_t v_zero = vdupq_n_f32(0.0f);

    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;

    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);

                uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
                                   vreinterpretq_u32_f32(vmulq_f32(v_src0, internal::vrecpq_f32(v_src1))), v_mask)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);

                uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
                vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
                                  vreinterpret_u32_f32(vmul_f32(v_src0, internal::vrecp_f32(v_src1))), v_mask)));
            }

            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? src0[j] / src1[j] : 0.0f;
            }
        }
    }
    else
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);

                uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
                                   vreinterpretq_u32_f32(vmulq_f32(vmulq_n_f32(v_src0, scale),
                                                         internal::vrecpq_f32(v_src1))), v_mask)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);

                uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
                vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
                                  vreinterpret_u32_f32(vmul_f32(vmul_n_f32(v_src0, scale),
                                                                internal::vrecp_f32(v_src1))), v_mask)));
            }

            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? src0[j] * scale / src1[j] : 0.0f;
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}

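// Public entry points for reciprocal(): thin wrappers that instantiate the
// recip<T> kernel for each supported element type.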
void reciprocal(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const s8 * srcBase, ptrdiff_t srcStride,
                s8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                u16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const s16 * srcBase, ptrdiff_t srcStride,
                s16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const s32 * srcBase, ptrdiff_t srcStride,
                s32 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s32>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

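// f32 overload of reciprocal(): dst = scale / src (or 1 / src when scale == 1),
// with lanes whose input is exactly 0 cleared to 0.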
void reciprocal(const Size2D &size,
                const f32 * srcBase, ptrdiff_t srcStride,
                f32 * dstBase, ptrdiff_t dstStride,
                f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }

    float32x4_t v_zero = vdupq_n_f32(0.0f);

    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;

    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);

                float32x4_t v_src1 = vld1q_f32(src1 + j);

                uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
                                   vreinterpretq_u32_f32(internal::vrecpq_f32(v_src1)), v_mask)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);

                uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
                vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
                                  vreinterpret_u32_f32(internal::vrecp_f32(v_src1)), v_mask)));
            }

            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? 1.0f / src1[j] : 0;
            }
        }
    }
    else
    {
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);

                float32x4_t v_src1 = vld1q_f32(src1 + j);

                uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
                                   vreinterpretq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(v_src1),
                                                                     scale)),v_mask)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);

                uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
                vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
                                  vreinterpret_u32_f32(vmul_n_f32(internal::vrecp_f32(v_src1),
                                                                  scale)), v_mask)));
            }

            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? scale / src1[j] : 0;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}

} // namespace CAROTENE_NS