1 /*
2 * Copyright (c) 2016-2020 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24 #include "src/core/NEON/kernels/NEMagnitudePhaseKernel.h"
25
26 #include "arm_compute/core/Error.h"
27 #include "arm_compute/core/Helpers.h"
28 #include "arm_compute/core/IAccessWindow.h"
29 #include "arm_compute/core/ITensor.h"
30 #include "arm_compute/core/Validate.h"
31 #include "src/core/helpers/AutoConfiguration.h"
32 #include "src/core/helpers/WindowHelpers.h"
33
34 #include <arm_neon.h>
35 #include <cstdint>
36
37 using namespace arm_compute;
38
39 namespace arm_compute
40 {
41 class Coordinates;
42 } // namespace arm_compute
43
44 namespace
45 {
46 // Defines for computing atan2
47 constexpr float SCALE_FACTOR = 0.7111111111111111f;
48 constexpr float PI = 3.141592653589793f;
49 constexpr float SCALE_180 = 180.0f / PI;
50 constexpr float SCALE_360 = SCALE_180 * SCALE_FACTOR;
51 constexpr float PI_4 = 0.7853981633974483f;
52 constexpr float COEFF1 = 0.0663f;
53 constexpr float COEFF2 = 0.2447f;
54 } // namespace
55
56 namespace
57 {
inv(float32x4_t x)58 inline float32x4_t inv(float32x4_t x)
59 {
60 float32x4_t result = vrecpeq_f32(x);
61 result = vmulq_f32(vrecpsq_f32(x, result), result);
62 return result;
63 }
64
atan2_0_360(float32x4_t gx,float32x4_t gy)65 inline float32x4_t atan2_0_360(float32x4_t gx, float32x4_t gy)
66 {
67 const float32x4_t zero = vdupq_n_f32(0.0f);
68 const float32x4_t epsilon = vdupq_n_f32(1e-9f);
69 const float32x4_t piover4 = vdupq_n_f32(PI_4);
70 const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
71 const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
72 const float32x4_t ninety = vdupq_n_f32(90.0f * SCALE_FACTOR);
73 const float32x4_t oneeighty = vdupq_n_f32(180.0f * SCALE_FACTOR);
74 const float32x4_t threesixty = vdupq_n_f32(360.0f * SCALE_FACTOR);
75 const float32x4_t scale = vdupq_n_f32(SCALE_360);
76
77 float32x4_t abs_gx = vabsq_f32(gx);
78 float32x4_t abs_gy = vabsq_f32(gy);
79 float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
80 float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
81 float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
82 float32x4_t absz = vabsq_f32(z);
83 float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
84
85 /* Compute y = pi/4 * x - x*(abs(x)-1)*(0.2447+0.0663 * abs(x) */
86 float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
87 result = vmulq_f32(result, term);
88 result = vmlaq_f32(result, piover4, z);
89
90 /* Radians to degrees conversion with applied a scale factor in order to have the result [0, 255] */
91 result = vmulq_f32(result, scale);
92
93 /* If z > 1, result = 90 - result */
94 result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
95
96 /* Choose correct quadrant */
97 result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
98 result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
99
100 return result;
101 }
102
atan2_0_180(float32x4_t gx,float32x4_t gy)103 inline float32x4_t atan2_0_180(float32x4_t gx, float32x4_t gy)
104 {
105 const float32x4_t zero = vdupq_n_f32(0.0f);
106 const float32x4_t epsilon = vdupq_n_f32(1e-9f); // epsilon used to avoiding division by 0
107 const float32x4_t piover4 = vdupq_n_f32(PI_4);
108 const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
109 const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
110 const float32x4_t ninety = vdupq_n_f32(90.0f);
111 const float32x4_t oneeighty = vdupq_n_f32(180.0f);
112 const float32x4_t threesixty = vdupq_n_f32(360.0f);
113 const float32x4_t scale = vdupq_n_f32(SCALE_180);
114
115 float32x4_t abs_gx = vabsq_f32(gx);
116 float32x4_t abs_gy = vabsq_f32(gy);
117 float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
118 float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
119 float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
120 float32x4_t absz = vabsq_f32(z);
121
122 /* Compute y = pi/4 * z - z*(abs(z)-1)*(0.2447+0.0663 * abs(z) */
123 float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
124 float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
125 result = vmulq_f32(result, term);
126 result = vmlaq_f32(result, piover4, z);
127
128 /* Radians to degrees conversion */
129 result = vmulq_f32(result, scale);
130
131 /* If z > 1, result = 90 - result */
132 result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
133
134 /* Choose correct quadrant */
135 result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
136 result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
137 result = vbslq_f32(vcgtq_f32(result, oneeighty), vsubq_f32(result, oneeighty), result);
138
139 return result;
140 }
141
invsqrtv(float32x4_t x)142 inline float32x4_t invsqrtv(float32x4_t x)
143 {
144 float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
145
146 sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
147 sqrt_reciprocal);
148 sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
149 sqrt_reciprocal);
150
151 return sqrt_reciprocal;
152 }
153
sqrtv(float32x4_t x)154 inline float32x4_t sqrtv(float32x4_t x)
155 {
156 float32x4_t res = vdupq_n_f32(0.5f);
157 return vmlaq_f32(res, x, invsqrtv(x));
158 }
159
magnitude_l2(int16x8_t input1,int16x8_t input2)160 inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
161 {
162 const int32x4x2_t square_x =
163 {
164 {
165 vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
166 vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
167 }
168 };
169
170 const int32x4x2_t square_y =
171 {
172 {
173 vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
174 vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
175 }
176 };
177
178 const uint32x4x2_t sum =
179 {
180 {
181 vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]), vreinterpretq_u32_s32(square_y.val[0])),
182 vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]), vreinterpretq_u32_s32(square_y.val[1]))
183 }
184 };
185
186 const float32x4x2_t res =
187 {
188 {
189 sqrtv(vcvtq_f32_u32(sum.val[0])),
190 sqrtv(vcvtq_f32_u32(sum.val[1]))
191 }
192 };
193
194 return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
195 vqmovn_s32(vcvtq_s32_f32(res.val[1])));
196 }
197
magnitude_l1(int16x8_t input1,int16x8_t input2)198 inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
199 {
200 /* Saturating add */
201 return vqaddq_s16(vqabsq_s16(input1), vqabsq_s16(input2));
202 }
203
phase_signed(int16x8_t input1,int16x8_t input2)204 inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
205 {
206 const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
207
208 float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
209 float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
210 float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
211 float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
212
213 /* Compute fast atan2 */
214 float32x4_t angle_high = atan2_0_360(inputx_f32_high, inputy_f32_high);
215 float32x4_t angle_low = atan2_0_360(inputx_f32_low, inputy_f32_low);
216
217 angle_high = vaddq_f32(angle_high, zeropointfive);
218 angle_low = vaddq_f32(angle_low, zeropointfive);
219
220 return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
221 vqmovun_s32(vcvtq_s32_f32(angle_high))));
222 }
223
phase_unsigned(int16x8_t input1,int16x8_t input2)224 inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
225 {
226 const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
227
228 float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
229 float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
230 float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
231 float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
232
233 /* Compute fast atan2 */
234 float32x4_t angle_high = atan2_0_180(inputx_f32_high, inputy_f32_high);
235 float32x4_t angle_low = atan2_0_180(inputx_f32_low, inputy_f32_low);
236
237 angle_high = vaddq_f32(angle_high, zeropointfive);
238 angle_low = vaddq_f32(angle_low, zeropointfive);
239
240 return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
241 vqmovun_s32(vcvtq_s32_f32(angle_high))));
242 }
243 } // namespace
244
245 template <MagnitudeType mag_type, PhaseType phase_type>
NEMagnitudePhaseKernel()246 NEMagnitudePhaseKernel<mag_type, phase_type>::NEMagnitudePhaseKernel()
247 : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
248 {
249 }
250
251 template <MagnitudeType mag_type, PhaseType phase_type>
configure(const ITensor * gx,const ITensor * gy,ITensor * magnitude,ITensor * phase)252 void NEMagnitudePhaseKernel<mag_type, phase_type>::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase)
253 {
254 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16);
255 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16);
256 ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
257
258 const bool run_mag = magnitude != nullptr;
259 const bool run_phase = phase != nullptr;
260
261 if(run_mag)
262 {
263 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16);
264 }
265
266 if(run_phase)
267 {
268 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
269 }
270
271 _gx = gx;
272 _gy = gy;
273 _magnitude = magnitude;
274 _phase = phase;
275
276 if(run_mag && run_phase)
277 {
278 /* Run magnitude and phase */
279 _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase;
280 }
281 else
282 {
283 if(run_mag)
284 {
285 /* Run magnitude */
286 _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude;
287 }
288 else if(run_phase)
289 {
290 /* Run phase */
291 _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::phase;
292 }
293 else
294 {
295 ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
296 }
297 }
298
299 constexpr unsigned int num_elems_processed_per_iteration = 16;
300
301 // Configure kernel window
302 Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
303 AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
304 AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
305
306 update_window_and_padding(win,
307 AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
308 AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
309 magnitude_access,
310 phase_access);
311
312 ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
313 gy->info()->valid_region());
314
315 magnitude_access.set_valid_region(win, valid_region);
316 phase_access.set_valid_region(win, valid_region);
317
318 INEKernel::configure(win);
319 }
320
321 template <MagnitudeType mag_type, PhaseType phase_type>
magnitude(const Window & window)322 void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude(const Window &window)
323 {
324 Iterator gx(_gx, window);
325 Iterator gy(_gy, window);
326 Iterator magnitude(_magnitude, window);
327
328 execute_window_loop(window, [&](const Coordinates &)
329 {
330 const int16x8x2_t input1 =
331 {
332 {
333 vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
334 vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
335 }
336 };
337
338 const int16x8x2_t input2 =
339 {
340 {
341 vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
342 vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
343 }
344 };
345
346 /* Compute magnitude */
347 int16x8x2_t mag{ {} };
348
349 if(MagnitudeType::L2NORM == mag_type)
350 {
351 mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
352 mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
353 }
354 else
355 {
356 mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
357 mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
358 }
359
360 /* Store magnitude */
361 vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
362 vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
363 },
364 gx, gy, magnitude);
365 }
366
367 template <MagnitudeType mag_type, PhaseType phase_type>
phase(const Window & window)368 void NEMagnitudePhaseKernel<mag_type, phase_type>::phase(const Window &window)
369 {
370 Iterator gx(_gx, window);
371 Iterator gy(_gy, window);
372 Iterator phase(_phase, window);
373
374 execute_window_loop(window, [&](const Coordinates &)
375 {
376 const int16x8x2_t input1 =
377 {
378 {
379 vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
380 vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
381 }
382 };
383
384 const int16x8x2_t input2 =
385 {
386 {
387 vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
388 vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
389 }
390 };
391
392 /* Compute phase */
393 uint8x8x2_t vphase{ {} };
394
395 if(PhaseType::SIGNED == phase_type)
396 {
397 vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
398 vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
399 }
400 else
401 {
402 vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
403 vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
404 }
405
406 /* Store phase */
407 vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
408 },
409 gx, gy, phase);
410 }
411
412 template <MagnitudeType mag_type, PhaseType phase_type>
magnitude_phase(const Window & window)413 void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase(const Window &window)
414 {
415 Iterator gx(_gx, window);
416 Iterator gy(_gy, window);
417 Iterator magnitude(_magnitude, window);
418 Iterator phase(_phase, window);
419
420 execute_window_loop(window, [&](const Coordinates &)
421 {
422 const int16x8x2_t input1 =
423 {
424 {
425 vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
426 vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
427 }
428 };
429
430 const int16x8x2_t input2 =
431 {
432 {
433 vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
434 vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
435 }
436 };
437
438 /* Compute magnitude */
439 int16x8x2_t mag{ {} };
440
441 if(MagnitudeType::L2NORM == mag_type)
442 {
443 mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
444 mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
445 }
446 else
447 {
448 mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
449 mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
450 }
451
452 /* Store magnitude */
453 vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
454 vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
455
456 /* Compute phase */
457 uint8x8x2_t vphase{ {} };
458
459 if(PhaseType::SIGNED == phase_type)
460 {
461 vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
462 vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
463 }
464 else
465 {
466 vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
467 vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
468 }
469
470 /* Store phase */
471 vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
472 },
473 gx, gy, magnitude, phase);
474 }
475
476 template <MagnitudeType mag_type, PhaseType phase_type>
run(const Window & window,const ThreadInfo & info)477 void NEMagnitudePhaseKernel<mag_type, phase_type>::run(const Window &window, const ThreadInfo &info)
478 {
479 ARM_COMPUTE_UNUSED(info);
480 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
481 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
482 ARM_COMPUTE_ERROR_ON(_func == nullptr);
483
484 (this->*_func)(window);
485 }
486
487 template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
488 template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
489 template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
490 template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
491