1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "common.hpp"
41 #include "vtransform.hpp"
42
43 #include <cstring>
44 #include <cfloat>
45 #include <cmath>
46 #include <limits>
47
48 namespace CAROTENE_NS {
49
50 namespace {
51
52 #ifdef CAROTENE_NEON
53
// Prepare a vector for round-half-away-from-zero conversion: add +/-0.5 with
// the same sign as each lane of v, so that the caller's truncating
// float->int conversion (vcvtq_*) rounds to the nearest integer.
//
// The sign-bit mask is obtained from numeric_limits instead of "1 << 31":
// shifting a 1 into the sign bit of a signed int is undefined behaviour in
// C++11 (and only implementation-defined later), while INT32_MIN has exactly
// the 0x80000000 bit pattern we need.
inline float32x4_t vroundq(const float32x4_t& v)
{
    const int32x4_t signMask = vdupq_n_s32(std::numeric_limits<s32>::min()),
                    half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
    // half | (signMask & v) == +0.5f or -0.5f, matching v's per-lane sign
    float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
    return vaddq_f32(v, v_addition);
}
60
// Saturating scaled division on a full 128-bit vector of narrow elements:
// widen each half (vmovl), recurse until the 32-bit specializations below
// are reached, then narrow the results back with saturation (vqmovn).
template <typename T>
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)),
                                                            internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)),
                                                            internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: compute scale * v1 / v2 in f32 via a reciprocal
// estimate (internal::vrecpq_f32), then round half away from zero (vroundq)
// before the truncating conversion back to integer.
template <>
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); }
template <>
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); }
76
// 64-bit companion of vroundq: add +/-0.5 with the same per-lane sign as v
// so the caller's truncating float->int conversion rounds to nearest.
//
// Uses numeric_limits for the 0x80000000 sign-bit mask; the previous
// "1 << 31" shifted into the sign bit of an int, which is undefined
// behaviour in C++11 and implementation-defined thereafter.
inline float32x2_t vround(const float32x2_t& v)
{
    const int32x2_t signMask = vdup_n_s32(std::numeric_limits<s32>::min()),
                    half = vreinterpret_s32_f32(vdup_n_f32(0.5f));
    // half | (signMask & v) == +0.5f or -0.5f, matching v's per-lane sign
    float32x2_t v_addition = vreinterpret_f32_s32(vorr_s32(half, vand_s32(signMask, vreinterpret_s32_f32(v))));
    return vadd_f32(v, v_addition);
}
83
// Saturating scaled division on a 64-bit (half-register) vector: widen once
// to 128 bits, defer to divSaturateQ, and narrow back with saturation.
template <typename T>
inline T divSaturate(const T &v1, const T &v2, const float scale)
{
    return internal::vqmovn(divSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
// 32-bit base cases, mirroring the divSaturateQ specializations: divide via
// reciprocal estimate, then round half away from zero (vround) before the
// truncating conversion back to integer.
template <>
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2))))); }
template <>
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2))))); }
95
96
// Wrapping (modulo) scaled division on a full 128-bit vector of narrow
// elements: widen each half, recurse down to the 32-bit specializations,
// then narrow back with plain truncation (vmovn), discarding high bits
// C-style instead of saturating.
template <typename T>
inline T divWrapQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)),
                                                       internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)),
                                                       internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: scale * v1 / v2 via reciprocal estimate; no vroundq
// here (unlike divSaturateQ), so the f32->int conversion truncates toward
// zero, matching the scalar wrap path's use of trunc().
template <>
inline int32x4_t divWrapQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
template <>
inline uint32x4_t divWrapQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
112
// Wrapping scaled division on a 64-bit (half-register) vector: widen once,
// defer to divWrapQ, and narrow back with truncation.
template <typename T>
inline T divWrap(const T &v1, const T &v2, const float scale)
{
    return internal::vmovn(divWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
// 32-bit base cases: divide via reciprocal estimate; truncating f32->int
// conversion (no rounding), matching the wrap policy.
template <>
inline int32x2_t divWrap<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
template <>
inline uint32x2_t divWrap<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
124
// "Test bits" overloads used to build zero-divisor masks: each result lane
// is all-ones when (v0 & v1) is non-zero and all-zeros otherwise. The
// callers below pass the divisor vector as both operands, producing a mask
// of lanes whose divisor is non-zero; ANDing the quotient with that mask
// forces every x/0 lane to 0. Signed variants reinterpret the unsigned
// result back to the signed lane type expected by internal::vandq/vand.
inline uint8x16_t vtstq(const uint8x16_t & v0, const uint8x16_t & v1) { return vtstq_u8 (v0, v1); }
inline uint16x8_t vtstq(const uint16x8_t & v0, const uint16x8_t & v1) { return vtstq_u16(v0, v1); }
inline uint32x4_t vtstq(const uint32x4_t & v0, const uint32x4_t & v1) { return vtstq_u32(v0, v1); }
inline int8x16_t vtstq(const int8x16_t & v0, const int8x16_t & v1) { return vreinterpretq_s8_u8 (vtstq_s8 (v0, v1)); }
inline int16x8_t vtstq(const int16x8_t & v0, const int16x8_t & v1) { return vreinterpretq_s16_u16(vtstq_s16(v0, v1)); }
inline int32x4_t vtstq(const int32x4_t & v0, const int32x4_t & v1) { return vreinterpretq_s32_u32(vtstq_s32(v0, v1)); }

// 64-bit (half-register) counterparts of the vtstq overloads above.
inline uint8x8_t vtst(const uint8x8_t & v0, const uint8x8_t & v1) { return vtst_u8 (v0, v1); }
inline uint16x4_t vtst(const uint16x4_t & v0, const uint16x4_t & v1) { return vtst_u16(v0, v1); }
inline uint32x2_t vtst(const uint32x2_t & v0, const uint32x2_t & v1) { return vtst_u32(v0, v1); }
inline int8x8_t vtst(const int8x8_t & v0, const int8x8_t & v1) { return vreinterpret_s8_u8 (vtst_s8 (v0, v1)); }
inline int16x4_t vtst(const int16x4_t & v0, const int16x4_t & v1) { return vreinterpret_s16_u16(vtst_s16(v0, v1)); }
inline int32x2_t vtst(const int32x2_t & v0, const int32x2_t & v1) { return vreinterpret_s32_u32(vtst_s32(v0, v1)); }
138 #endif
139
// Element-wise scaled integer division: dst = convert(scale * src0 / src1),
// with dst forced to 0 wherever src1 == 0.
//
// size        ROI dimensions (width in elements, height in rows)
// src0Base/src0Stride   numerator rows
// src1Base/src1Stride   denominator rows
// dstBase/dstStride     destination rows
// scale       multiplicative factor applied to every quotient
// cpolicy     CONVERT_POLICY_SATURATE clamps to T's range with rounding;
//             CONVERT_POLICY_WRAP truncates and wraps C-style
template <typename T>
void div(const Size2D &size,
         const T * src0Base, ptrdiff_t src0Stride,
         const T * src1Base, ptrdiff_t src1Stride,
         T * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;

    // Early out: when scale is 0, or T is integral and |scale * max(T)| < 1
    // (every possible quotient magnitude stays below 1), the output is all
    // zeros, so just memset each row.
    // NOTE(review): under the saturate policy a quotient in [0.5, 1) would
    // otherwise round to +/-1, not 0 -- confirm this shortcut matches the
    // reference behaviour.
    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         (scale * std::numeric_limits<T>::max()) < 1.0f &&
         (scale * std::numeric_limits<T>::max()) > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }

    // Iteration limits: full 128-bit steps run while j < width - step + 1;
    // a 64-bit loop then handles a shorter remainder, and a scalar loop the
    // final few elements.
    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);

                // vtstq(x, x): all-ones in lanes where divisor != 0;
                // AND zeroes out the divide-by-zero lanes.
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divSaturateQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divSaturate(v_src0, v_src1, scale)));
            }
            // Scalar tail: exact division with rounding saturate_cast.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale * src0[j] / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divWrapQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divWrap(v_src0, v_src1, scale)));
            }
            // Scalar tail: truncate toward zero, then wrap via s32 -> T cast.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale * src0[j] / src1[j])) : 0;
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
244
245 #ifdef CAROTENE_NEON
246
// Saturating scaled reciprocal on a full 128-bit vector of narrow elements:
// widen each half, recurse down to the 32-bit specializations, then narrow
// back with saturation (vqmovn).
template <typename T>
inline T recipSaturateQ(const T &v2, const float scale)
{
    return internal::vcombine(internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: scale / v2 via reciprocal estimate in f32.
// NOTE(review): unlike divSaturateQ there is no vroundq here, so the
// f32->int conversion truncates toward zero rather than rounding, while the
// scalar tail of recip() uses a rounding saturate_cast -- confirm intended.
template <>
inline int32x4_t recipSaturateQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipSaturateQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }

// 64-bit (half-register) variant: widen once and defer to recipSaturateQ.
template <typename T>
inline T recipSaturate(const T &v2, const float scale)
{
    return internal::vqmovn(recipSaturateQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipSaturate<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipSaturate<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
272
273
// Wrapping scaled reciprocal on a full 128-bit vector of narrow elements:
// widen each half, recurse down to the 32-bit specializations, then narrow
// back with plain truncation (vmovn), wrapping C-style.
template <typename T>
inline T recipWrapQ(const T &v2, const float scale)
{
    return internal::vcombine(internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: scale / v2 via reciprocal estimate; truncating
// f32->int conversion, matching the wrap policy's scalar trunc() tail.
template <>
inline int32x4_t recipWrapQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipWrapQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }

// 64-bit (half-register) variant: widen once and defer to recipWrapQ.
template <typename T>
inline T recipWrap(const T &v2, const float scale)
{
    return internal::vmovn(recipWrapQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipWrap<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipWrap<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
299 #endif
300
// Element-wise scaled reciprocal: dst = convert(scale / src1), with dst
// forced to 0 wherever src1 == 0.
//
// size        ROI dimensions (width in elements, height in rows)
// src1Base/src1Stride   divisor rows
// dstBase/dstStride     destination rows
// scale       numerator applied to every element
// cpolicy     CONVERT_POLICY_SATURATE clamps to T's range with rounding;
//             CONVERT_POLICY_WRAP truncates and wraps C-style
template <typename T>
void recip(const Size2D &size,
           const T * src1Base, ptrdiff_t src1Stride,
           T * dstBase, ptrdiff_t dstStride,
           f32 scale,
           CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;

    // Early out: scale == 0, or |scale| < 1 for integral T (|scale/src1|
    // can then never reach 1), makes the whole output zero.
    // NOTE(review): under the saturate policy scale in [0.5, 1) with
    // src1 == 1 would otherwise round to 1 -- confirm this shortcut matches
    // the reference behaviour.
    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         scale < 1.0f &&
         scale > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }

    // Iteration limits for the 128-bit, 64-bit and scalar stages.
    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);

                vec128 v_src1 = internal::vld1q(src1 + j);

                // vtstq(x, x): all-ones where src1 != 0; AND zeroes the
                // divide-by-zero lanes.
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipSaturateQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipSaturate(v_src1, scale)));
            }
            // Scalar tail: exact division with rounding saturate_cast.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);

                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipWrapQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipWrap(v_src1, scale)));
            }
            // Scalar tail: truncate toward zero, then wrap via s32 -> T cast.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale / src1[j])) : 0;
            }
        }
    }
#else
    (void)size;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
395
396 }
397
// Public per-type entry points: each forwards to the templated div<T>
// implementation above (explicit template argument required -- an
// unqualified call would recurse into these overloads).
void div(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         u8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const s8 * src0Base, ptrdiff_t src0Stride,
         const s8 * src1Base, ptrdiff_t src1Stride,
         s8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const u16 * src0Base, ptrdiff_t src0Stride,
         const u16 * src1Base, ptrdiff_t src1Stride,
         u16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const s32 * src0Base, ptrdiff_t src0Stride,
         const s32 * src1Base, ptrdiff_t src1Stride,
         s32 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
447
// Floating-point element-wise division: dst = src0 * scale / src1, with dst
// forced to 0 wherever src1 == 0. A dedicated fast path avoids the scale
// multiply when scale == 1.
void div(const Size2D &size,
         const f32 * src0Base, ptrdiff_t src0Stride,
         const f32 * src1Base, ptrdiff_t src1Stride,
         f32 * dstBase, ptrdiff_t dstStride,
         f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // scale == 0 makes the whole output zero: just memset each row.
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }

    float32x4_t v_zero = vdupq_n_f32(0.0f);

    // 4-wide loop runs while j < width - 3; 2-wide while j < width - 1;
    // scalar loop finishes the remainder.
    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;

    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        // Fast path: scale is (effectively) 1, skip the multiply by scale.
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);

                // Lanes where the divisor is exactly 0 are cleared via
                // vbic (result & ~mask).
                uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
                    vreinterpretq_u32_f32(vmulq_f32(v_src0, internal::vrecpq_f32(v_src1))), v_mask)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);

                uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
                vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
                    vreinterpret_u32_f32(vmul_f32(v_src0, internal::vrecp_f32(v_src1))), v_mask)));
            }

            // Scalar tail: exact division.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? src0[j] / src1[j] : 0.0f;
            }
        }
    }
    else
    {
        // General path: multiply the numerator by scale first.
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);

                uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
                    vreinterpretq_u32_f32(vmulq_f32(vmulq_n_f32(v_src0, scale),
                                                    internal::vrecpq_f32(v_src1))), v_mask)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);

                uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
                vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
                    vreinterpret_u32_f32(vmul_f32(vmul_n_f32(v_src0, scale),
                                                  internal::vrecp_f32(v_src1))), v_mask)));
            }

            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? src0[j] * scale / src1[j] : 0.0f;
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
560
// Public per-type entry points: each forwards to the templated recip<T>
// implementation above.
void reciprocal(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const s8 * srcBase, ptrdiff_t srcStride,
                s8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                u16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const s16 * srcBase, ptrdiff_t srcStride,
                s16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const s32 * srcBase, ptrdiff_t srcStride,
                s32 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s32>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
605
// Floating-point element-wise reciprocal: dst = scale / src, with dst forced
// to 0 wherever src == 0. A dedicated fast path avoids the scale multiply
// when scale == 1.
void reciprocal(const Size2D &size,
                const f32 * srcBase, ptrdiff_t srcStride,
                f32 * dstBase, ptrdiff_t dstStride,
                f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // scale == 0 makes the whole output zero: just memset each row.
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }

    float32x4_t v_zero = vdupq_n_f32(0.0f);

    // 4-wide loop runs while j < width - 3; 2-wide while j < width - 1;
    // scalar loop finishes the remainder.
    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;

    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        // Fast path: scale is (effectively) 1, plain reciprocal.
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);

                float32x4_t v_src1 = vld1q_f32(src1 + j);

                // Lanes with a zero divisor are cleared via vbic
                // (result & ~mask).
                uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
                    vreinterpretq_u32_f32(internal::vrecpq_f32(v_src1)), v_mask)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);

                uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
                vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
                    vreinterpret_u32_f32(internal::vrecp_f32(v_src1)), v_mask)));
            }

            // Scalar tail: exact reciprocal.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? 1.0f / src1[j] : 0;
            }
        }
    }
    else
    {
        // General path: multiply the reciprocal by scale.
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);

                float32x4_t v_src1 = vld1q_f32(src1 + j);

                uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
                    vreinterpretq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(v_src1),
                                                      scale)),v_mask)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);

                uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
                vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
                    vreinterpret_u32_f32(vmul_n_f32(internal::vrecp_f32(v_src1),
                                                    scale)), v_mask)));
            }

            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? scale / src1[j] : 0;
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
707
708 } // namespace CAROTENE_NS
709