• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/core/NEON/kernels/NEMagnitudePhaseKernel.h"
25 
26 #include "arm_compute/core/Error.h"
27 #include "arm_compute/core/Helpers.h"
28 #include "arm_compute/core/IAccessWindow.h"
29 #include "arm_compute/core/ITensor.h"
30 #include "arm_compute/core/Validate.h"
31 #include "src/core/helpers/AutoConfiguration.h"
32 #include "src/core/helpers/WindowHelpers.h"
33 
34 #include <arm_neon.h>
35 #include <cstdint>
36 
37 using namespace arm_compute;
38 
39 namespace arm_compute
40 {
41 class Coordinates;
42 } // namespace arm_compute
43 
44 namespace
45 {
46 // Defines for computing atan2
47 constexpr float SCALE_FACTOR = 0.7111111111111111f;
48 constexpr float PI           = 3.141592653589793f;
49 constexpr float SCALE_180    = 180.0f / PI;
50 constexpr float SCALE_360    = SCALE_180 * SCALE_FACTOR;
51 constexpr float PI_4         = 0.7853981633974483f;
52 constexpr float COEFF1       = 0.0663f;
53 constexpr float COEFF2       = 0.2447f;
54 } // namespace
55 
56 namespace
57 {
inv(float32x4_t x)58 inline float32x4_t inv(float32x4_t x)
59 {
60     float32x4_t result = vrecpeq_f32(x);
61     result             = vmulq_f32(vrecpsq_f32(x, result), result);
62     return result;
63 }
64 
atan2_0_360(float32x4_t gx,float32x4_t gy)65 inline float32x4_t atan2_0_360(float32x4_t gx, float32x4_t gy)
66 {
67     const float32x4_t zero       = vdupq_n_f32(0.0f);
68     const float32x4_t epsilon    = vdupq_n_f32(1e-9f);
69     const float32x4_t piover4    = vdupq_n_f32(PI_4);
70     const float32x4_t coeff1     = vdupq_n_f32(COEFF1);
71     const float32x4_t coeff2     = vdupq_n_f32(COEFF2);
72     const float32x4_t ninety     = vdupq_n_f32(90.0f * SCALE_FACTOR);
73     const float32x4_t oneeighty  = vdupq_n_f32(180.0f * SCALE_FACTOR);
74     const float32x4_t threesixty = vdupq_n_f32(360.0f * SCALE_FACTOR);
75     const float32x4_t scale      = vdupq_n_f32(SCALE_360);
76 
77     float32x4_t abs_gx = vabsq_f32(gx);
78     float32x4_t abs_gy = vabsq_f32(gy);
79     float32x4_t tmin   = vminq_f32(abs_gx, abs_gy);
80     float32x4_t tmax   = vmaxq_f32(abs_gx, abs_gy);
81     float32x4_t z      = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
82     float32x4_t absz   = vabsq_f32(z);
83     float32x4_t term   = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
84 
85     /* Compute y = pi/4 * x - x*(abs(x)-1)*(0.2447+0.0663 * abs(x) */
86     float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
87     result             = vmulq_f32(result, term);
88     result             = vmlaq_f32(result, piover4, z);
89 
90     /* Radians to degrees conversion with applied a scale factor in order to have the result [0, 255]  */
91     result = vmulq_f32(result, scale);
92 
93     /* If z > 1, result = 90 - result */
94     result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
95 
96     /* Choose correct quadrant */
97     result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
98     result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
99 
100     return result;
101 }
102 
atan2_0_180(float32x4_t gx,float32x4_t gy)103 inline float32x4_t atan2_0_180(float32x4_t gx, float32x4_t gy)
104 {
105     const float32x4_t zero       = vdupq_n_f32(0.0f);
106     const float32x4_t epsilon    = vdupq_n_f32(1e-9f); // epsilon used to avoiding division by 0
107     const float32x4_t piover4    = vdupq_n_f32(PI_4);
108     const float32x4_t coeff1     = vdupq_n_f32(COEFF1);
109     const float32x4_t coeff2     = vdupq_n_f32(COEFF2);
110     const float32x4_t ninety     = vdupq_n_f32(90.0f);
111     const float32x4_t oneeighty  = vdupq_n_f32(180.0f);
112     const float32x4_t threesixty = vdupq_n_f32(360.0f);
113     const float32x4_t scale      = vdupq_n_f32(SCALE_180);
114 
115     float32x4_t abs_gx = vabsq_f32(gx);
116     float32x4_t abs_gy = vabsq_f32(gy);
117     float32x4_t tmin   = vminq_f32(abs_gx, abs_gy);
118     float32x4_t tmax   = vmaxq_f32(abs_gx, abs_gy);
119     float32x4_t z      = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
120     float32x4_t absz   = vabsq_f32(z);
121 
122     /* Compute y = pi/4 * z - z*(abs(z)-1)*(0.2447+0.0663 * abs(z) */
123     float32x4_t term   = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
124     float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
125     result             = vmulq_f32(result, term);
126     result             = vmlaq_f32(result, piover4, z);
127 
128     /* Radians to degrees conversion */
129     result = vmulq_f32(result, scale);
130 
131     /* If z > 1, result = 90 - result */
132     result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
133 
134     /* Choose correct quadrant */
135     result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
136     result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
137     result = vbslq_f32(vcgtq_f32(result, oneeighty), vsubq_f32(result, oneeighty), result);
138 
139     return result;
140 }
141 
invsqrtv(float32x4_t x)142 inline float32x4_t invsqrtv(float32x4_t x)
143 {
144     float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
145 
146     sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
147                                 sqrt_reciprocal);
148     sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
149                                 sqrt_reciprocal);
150 
151     return sqrt_reciprocal;
152 }
153 
sqrtv(float32x4_t x)154 inline float32x4_t sqrtv(float32x4_t x)
155 {
156     float32x4_t res = vdupq_n_f32(0.5f);
157     return vmlaq_f32(res, x, invsqrtv(x));
158 }
159 
magnitude_l2(int16x8_t input1,int16x8_t input2)160 inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
161 {
162     const int32x4x2_t square_x =
163     {
164         {
165             vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
166             vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
167         }
168     };
169 
170     const int32x4x2_t square_y =
171     {
172         {
173             vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
174             vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
175         }
176     };
177 
178     const uint32x4x2_t sum =
179     {
180         {
181             vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]), vreinterpretq_u32_s32(square_y.val[0])),
182             vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]), vreinterpretq_u32_s32(square_y.val[1]))
183         }
184     };
185 
186     const float32x4x2_t res =
187     {
188         {
189             sqrtv(vcvtq_f32_u32(sum.val[0])),
190             sqrtv(vcvtq_f32_u32(sum.val[1]))
191         }
192     };
193 
194     return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
195                         vqmovn_s32(vcvtq_s32_f32(res.val[1])));
196 }
197 
magnitude_l1(int16x8_t input1,int16x8_t input2)198 inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
199 {
200     /* Saturating add */
201     return vqaddq_s16(vqabsq_s16(input1), vqabsq_s16(input2));
202 }
203 
phase_signed(int16x8_t input1,int16x8_t input2)204 inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
205 {
206     const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
207 
208     float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
209     float32x4_t inputx_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
210     float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
211     float32x4_t inputy_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
212 
213     /* Compute fast atan2 */
214     float32x4_t angle_high = atan2_0_360(inputx_f32_high, inputy_f32_high);
215     float32x4_t angle_low  = atan2_0_360(inputx_f32_low, inputy_f32_low);
216 
217     angle_high = vaddq_f32(angle_high, zeropointfive);
218     angle_low  = vaddq_f32(angle_low, zeropointfive);
219 
220     return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
221                                   vqmovun_s32(vcvtq_s32_f32(angle_high))));
222 }
223 
phase_unsigned(int16x8_t input1,int16x8_t input2)224 inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
225 {
226     const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
227 
228     float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
229     float32x4_t inputx_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
230     float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
231     float32x4_t inputy_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
232 
233     /* Compute fast atan2 */
234     float32x4_t angle_high = atan2_0_180(inputx_f32_high, inputy_f32_high);
235     float32x4_t angle_low  = atan2_0_180(inputx_f32_low, inputy_f32_low);
236 
237     angle_high = vaddq_f32(angle_high, zeropointfive);
238     angle_low  = vaddq_f32(angle_low, zeropointfive);
239 
240     return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
241                                   vqmovun_s32(vcvtq_s32_f32(angle_high))));
242 }
243 } // namespace
244 
245 template <MagnitudeType mag_type, PhaseType phase_type>
NEMagnitudePhaseKernel()246 NEMagnitudePhaseKernel<mag_type, phase_type>::NEMagnitudePhaseKernel()
247     : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
248 {
249 }
250 
251 template <MagnitudeType mag_type, PhaseType phase_type>
configure(const ITensor * gx,const ITensor * gy,ITensor * magnitude,ITensor * phase)252 void NEMagnitudePhaseKernel<mag_type, phase_type>::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase)
253 {
254     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16);
255     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16);
256     ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
257 
258     const bool run_mag   = magnitude != nullptr;
259     const bool run_phase = phase != nullptr;
260 
261     if(run_mag)
262     {
263         ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16);
264     }
265 
266     if(run_phase)
267     {
268         ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
269     }
270 
271     _gx        = gx;
272     _gy        = gy;
273     _magnitude = magnitude;
274     _phase     = phase;
275 
276     if(run_mag && run_phase)
277     {
278         /* Run magnitude and phase */
279         _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase;
280     }
281     else
282     {
283         if(run_mag)
284         {
285             /* Run magnitude */
286             _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude;
287         }
288         else if(run_phase)
289         {
290             /* Run phase */
291             _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::phase;
292         }
293         else
294         {
295             ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
296         }
297     }
298 
299     constexpr unsigned int num_elems_processed_per_iteration = 16;
300 
301     // Configure kernel window
302     Window                 win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
303     AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
304     AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
305 
306     update_window_and_padding(win,
307                               AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
308                               AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
309                               magnitude_access,
310                               phase_access);
311 
312     ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
313                                                        gy->info()->valid_region());
314 
315     magnitude_access.set_valid_region(win, valid_region);
316     phase_access.set_valid_region(win, valid_region);
317 
318     INEKernel::configure(win);
319 }
320 
321 template <MagnitudeType mag_type, PhaseType phase_type>
magnitude(const Window & window)322 void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude(const Window &window)
323 {
324     Iterator gx(_gx, window);
325     Iterator gy(_gy, window);
326     Iterator magnitude(_magnitude, window);
327 
328     execute_window_loop(window, [&](const Coordinates &)
329     {
330         const int16x8x2_t input1 =
331         {
332             {
333                 vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
334                 vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
335             }
336         };
337 
338         const int16x8x2_t input2 =
339         {
340             {
341                 vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
342                 vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
343             }
344         };
345 
346         /* Compute magnitude */
347         int16x8x2_t mag{ {} };
348 
349         if(MagnitudeType::L2NORM == mag_type)
350         {
351             mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
352             mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
353         }
354         else
355         {
356             mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
357             mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
358         }
359 
360         /* Store magnitude */
361         vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
362         vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
363     },
364     gx, gy, magnitude);
365 }
366 
367 template <MagnitudeType mag_type, PhaseType phase_type>
phase(const Window & window)368 void NEMagnitudePhaseKernel<mag_type, phase_type>::phase(const Window &window)
369 {
370     Iterator gx(_gx, window);
371     Iterator gy(_gy, window);
372     Iterator phase(_phase, window);
373 
374     execute_window_loop(window, [&](const Coordinates &)
375     {
376         const int16x8x2_t input1 =
377         {
378             {
379                 vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
380                 vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
381             }
382         };
383 
384         const int16x8x2_t input2 =
385         {
386             {
387                 vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
388                 vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
389             }
390         };
391 
392         /* Compute phase */
393         uint8x8x2_t vphase{ {} };
394 
395         if(PhaseType::SIGNED == phase_type)
396         {
397             vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
398             vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
399         }
400         else
401         {
402             vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
403             vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
404         }
405 
406         /* Store phase */
407         vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
408     },
409     gx, gy, phase);
410 }
411 
412 template <MagnitudeType mag_type, PhaseType phase_type>
magnitude_phase(const Window & window)413 void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase(const Window &window)
414 {
415     Iterator gx(_gx, window);
416     Iterator gy(_gy, window);
417     Iterator magnitude(_magnitude, window);
418     Iterator phase(_phase, window);
419 
420     execute_window_loop(window, [&](const Coordinates &)
421     {
422         const int16x8x2_t input1 =
423         {
424             {
425                 vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
426                 vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
427             }
428         };
429 
430         const int16x8x2_t input2 =
431         {
432             {
433                 vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
434                 vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
435             }
436         };
437 
438         /* Compute magnitude */
439         int16x8x2_t mag{ {} };
440 
441         if(MagnitudeType::L2NORM == mag_type)
442         {
443             mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
444             mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
445         }
446         else
447         {
448             mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
449             mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
450         }
451 
452         /* Store magnitude */
453         vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
454         vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
455 
456         /* Compute phase */
457         uint8x8x2_t vphase{ {} };
458 
459         if(PhaseType::SIGNED == phase_type)
460         {
461             vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
462             vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
463         }
464         else
465         {
466             vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
467             vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
468         }
469 
470         /* Store phase */
471         vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
472     },
473     gx, gy, magnitude, phase);
474 }
475 
476 template <MagnitudeType mag_type, PhaseType phase_type>
run(const Window & window,const ThreadInfo & info)477 void NEMagnitudePhaseKernel<mag_type, phase_type>::run(const Window &window, const ThreadInfo &info)
478 {
479     ARM_COMPUTE_UNUSED(info);
480     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
481     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
482     ARM_COMPUTE_ERROR_ON(_func == nullptr);
483 
484     (this->*_func)(window);
485 }
486 
487 template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
488 template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
489 template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
490 template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
491