• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/core/NEON/kernels/NEHarrisCornersKernel.h"
25 
26 #include "arm_compute/core/Coordinates.h"
27 #include "arm_compute/core/Error.h"
28 #include "arm_compute/core/Helpers.h"
29 #include "arm_compute/core/TensorInfo.h"
30 #include "arm_compute/core/Types.h"
31 #include "arm_compute/core/Utils.h"
32 #include "arm_compute/core/Validate.h"
33 #include "arm_compute/core/Window.h"
34 #include "src/core/helpers/AutoConfiguration.h"
35 #include "src/core/helpers/WindowHelpers.h"
36 
37 #include <algorithm>
38 #include <arm_neon.h>
39 #include <cmath>
40 #include <cstddef>
41 
42 using namespace arm_compute;
43 
// Explicit instantiation of the kernel class template for block sizes 3, 5 and 7.
44 template class arm_compute::NEHarrisScoreKernel<3>;
45 template class arm_compute::NEHarrisScoreKernel<5>;
46 template class arm_compute::NEHarrisScoreKernel<7>;
// Explicit instantiation of the default constructor for the same three block sizes.
47 template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
48 template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
49 template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
50 
51 namespace
52 {
/** Compute the thresholded Harris response for four lanes at once.
 *
 * Per lane the response is det - sensitivity * trace^2, where
 * det = gx2 * gy2 - gxgy * gxgy and trace = gx2 + gy2.
 *
 * @param[in] gx2             Gx^2 term, one value per lane.
 * @param[in] gy2             Gy^2 term, one value per lane.
 * @param[in] gxgy            Gx*Gy term, one value per lane.
 * @param[in] sensitivity     Factor applied to trace^2, one value per lane.
 * @param[in] strength_thresh Threshold the response is compared against, one value per lane.
 *
 * @return The response in every lane where it is strictly greater than
 *         @p strength_thresh, and 0.0f in all other lanes.
 */
inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
{
    // trace^2 = (gx2 + gy2)^2
    const float32x4_t trace    = vaddq_f32(gx2, gy2);
    const float32x4_t trace_sq = vmulq_f32(trace, trace);

    // det = gx2 * gy2 - gxgy^2
    const float32x4_t determinant = vmlsq_f32(vmulq_f32(gx2, gy2), gxgy, gxgy);

    // response = det - sensitivity * trace^2
    const float32x4_t response = vmlsq_f32(determinant, sensitivity, trace_sq);

    // Keep only the lanes whose response exceeds the threshold
    const uint32x4_t above_thresh = vcgtq_f32(response, strength_thresh);
    return vbslq_f32(above_thresh, response, vdupq_n_f32(0.0f));
}
71 
/** Accumulate one row of a 3-wide window into the Gx^2, Gy^2 and Gx*Gy sums.
 *
 * The low/high pairs are treated as eight consecutive gradient values. After
 * scaling by @p norm_factor, output lane i accumulates the products of
 * elements i, i + 1 and i + 2 of that sequence.
 *
 * @param[in]      low_gx      First four Gx values of the row segment.
 * @param[in]      low_gy      First four Gy values of the row segment.
 * @param[in]      high_gx     Next four Gx values of the row segment.
 * @param[in]      high_gy     Next four Gy values of the row segment.
 * @param[in, out] gx2         Running sum of Gx*Gx.
 * @param[in, out] gy2         Running sum of Gy*Gy.
 * @param[in, out] gxgy        Running sum of Gx*Gy.
 * @param[in]      norm_factor Scale applied to every gradient before multiplying.
 */
inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
                                              float32x4_t norm_factor)
{
    // Scaled gradients
    const float32x4_t gx_lo = vmulq_f32(low_gx, norm_factor);
    const float32x4_t gy_lo = vmulq_f32(low_gy, norm_factor);
    const float32x4_t gx_hi = vmulq_f32(high_gx, norm_factor);
    const float32x4_t gy_hi = vmulq_f32(high_gy, norm_factor);

    // Adds the contribution of one window column to the three running sums
    const auto accumulate = [&](float32x4_t gx, float32x4_t gy)
    {
        gx2  = vmlaq_f32(gx2, gx, gx);
        gy2  = vmlaq_f32(gy2, gy, gy);
        gxgy = vmlaq_f32(gxgy, gx, gy);
    };

    // Left, middle and right columns, in that order
    accumulate(gx_lo, gy_lo);
    accumulate(vextq_f32(gx_lo, gx_hi, 1), vextq_f32(gy_lo, gy_hi, 1));
    accumulate(vextq_f32(gx_lo, gx_hi, 2), vextq_f32(gy_lo, gy_hi, 2));
}
103 
/** Accumulate one row of a 5-wide window into the Gx^2, Gy^2 and Gx*Gy sums.
 *
 * The low/high pairs are treated as eight consecutive gradient values. After
 * scaling by @p norm_factor, output lane i accumulates the products of
 * elements i .. i + 4 of that sequence.
 *
 * @param[in]      low_gx      First four Gx values of the row segment.
 * @param[in]      low_gy      First four Gy values of the row segment.
 * @param[in]      high_gx     Next four Gx values of the row segment.
 * @param[in]      high_gy     Next four Gy values of the row segment.
 * @param[in, out] gx2         Running sum of Gx*Gx.
 * @param[in, out] gy2         Running sum of Gy*Gy.
 * @param[in, out] gxgy        Running sum of Gx*Gy.
 * @param[in]      norm_factor Scale applied to every gradient before multiplying.
 */
inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
                                              float32x4_t norm_factor)
{
    // Outermost columns (L2 and R2) are the scaled inputs themselves
    const float32x4_t gx_l2 = vmulq_f32(low_gx, norm_factor);
    const float32x4_t gy_l2 = vmulq_f32(low_gy, norm_factor);
    const float32x4_t gx_r2 = vmulq_f32(high_gx, norm_factor);
    const float32x4_t gy_r2 = vmulq_f32(high_gy, norm_factor);

    // Inner columns (L1, M, R1) are the same sequence shifted by 1, 2 and 3 elements
    const float32x4_t gx_l1 = vextq_f32(gx_l2, gx_r2, 1);
    const float32x4_t gy_l1 = vextq_f32(gy_l2, gy_r2, 1);
    const float32x4_t gx_m  = vextq_f32(gx_l2, gx_r2, 2);
    const float32x4_t gy_m  = vextq_f32(gy_l2, gy_r2, 2);
    const float32x4_t gx_r1 = vextq_f32(gx_l2, gx_r2, 3);
    const float32x4_t gy_r1 = vextq_f32(gy_l2, gy_r2, 3);

    // Gx*Gx
    gx2 = vmlaq_f32(gx2, gx_l2, gx_l2);
    gx2 = vmlaq_f32(gx2, gx_l1, gx_l1);
    gx2 = vmlaq_f32(gx2, gx_m, gx_m);
    gx2 = vmlaq_f32(gx2, gx_r1, gx_r1);
    gx2 = vmlaq_f32(gx2, gx_r2, gx_r2);

    // Gy*Gy
    gy2 = vmlaq_f32(gy2, gy_l2, gy_l2);
    gy2 = vmlaq_f32(gy2, gy_l1, gy_l1);
    gy2 = vmlaq_f32(gy2, gy_m, gy_m);
    gy2 = vmlaq_f32(gy2, gy_r1, gy_r1);
    gy2 = vmlaq_f32(gy2, gy_r2, gy_r2);

    // Gx*Gy
    gxgy = vmlaq_f32(gxgy, gx_l2, gy_l2);
    gxgy = vmlaq_f32(gxgy, gx_l1, gy_l1);
    gxgy = vmlaq_f32(gxgy, gx_m, gy_m);
    gxgy = vmlaq_f32(gxgy, gx_r1, gy_r1);
    gxgy = vmlaq_f32(gxgy, gx_r2, gy_r2);
}
158 
/** Accumulate one row of a 7-wide window into the Gx^2, Gy^2 and Gx*Gy sums.
 *
 * The low/high/high1 triples are treated as twelve consecutive gradient
 * values. After scaling by @p norm_factor, output lane i accumulates the
 * products of elements i .. i + 6 of that sequence.
 *
 * @param[in]      low_gx      First four Gx values of the row segment.
 * @param[in]      low_gy      First four Gy values of the row segment.
 * @param[in]      high_gx     Next four Gx values of the row segment.
 * @param[in]      high_gy     Next four Gy values of the row segment.
 * @param[in]      high_gx1    Last four Gx values of the row segment.
 * @param[in]      high_gy1    Last four Gy values of the row segment.
 * @param[in, out] gx2         Running sum of Gx*Gx.
 * @param[in, out] gy2         Running sum of Gy*Gy.
 * @param[in, out] gxgy        Running sum of Gx*Gy.
 * @param[in]      norm_factor Scale applied to every gradient before multiplying.
 */
inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
                                              float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
{
    // Scaled gradients: elements 0..3, 4..7 and 8..11 of the row segment
    const float32x4_t gx_a = vmulq_f32(low_gx, norm_factor);
    const float32x4_t gy_a = vmulq_f32(low_gy, norm_factor);
    const float32x4_t gx_b = vmulq_f32(high_gx, norm_factor);
    const float32x4_t gy_b = vmulq_f32(high_gy, norm_factor);
    const float32x4_t gx_c = vmulq_f32(high_gx1, norm_factor);
    const float32x4_t gy_c = vmulq_f32(high_gy1, norm_factor);

    // Adds the contribution of one window column to the three running sums
    const auto accumulate = [&](float32x4_t gx, float32x4_t gy)
    {
        gx2  = vmlaq_f32(gx2, gx, gx);
        gy2  = vmlaq_f32(gy2, gy, gy);
        gxgy = vmlaq_f32(gxgy, gx, gy);
    };

    // L3, L2, L1 and M come from the first two vectors
    accumulate(gx_a, gy_a);
    accumulate(vextq_f32(gx_a, gx_b, 1), vextq_f32(gy_a, gy_b, 1));
    accumulate(vextq_f32(gx_a, gx_b, 2), vextq_f32(gy_a, gy_b, 2));
    accumulate(vextq_f32(gx_a, gx_b, 3), vextq_f32(gy_a, gy_b, 3));

    // R1 is the second vector as-is
    accumulate(gx_b, gy_b);

    // R2 and R3 straddle the second and third vectors
    accumulate(vextq_f32(gx_b, gx_c, 1), vextq_f32(gy_b, gy_c, 1));
    accumulate(vextq_f32(gx_b, gx_c, 2), vextq_f32(gy_b, gy_c, 2));
}
241 
/** Harris score over a 3x3 window for S16 gradients; writes eight F32 scores.
 *
 * @param[in]  input1_ptr         Pointer to the Gx value of the first output position.
 * @param[in]  input2_ptr         Pointer to the Gy value of the first output position.
 * @param[out] output_ptr         Destination for the eight scores.
 * @param[in]  input_stride       Distance between consecutive input rows, in elements.
 * @param[in]  in_norm_factor     Scale applied to every gradient before multiplying.
 * @param[in]  in_sensitivity     Factor applied to trace^2 in the score.
 * @param[in]  in_strength_thresh Scores not strictly above this value are stored as 0.
 */
inline void harris_score3x3_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // Widen one half of an S16 vector to four F32 lanes
    const auto lower_f32 = [](int16x8_t v)
    {
        return vcvtq_f32_s32(vmovl_s16(vget_low_s16(v)));
    };
    const auto upper_f32 = [](int16x8_t v)
    {
        return vcvtq_f32_s32(vmovl_s16(vget_high_s16(v)));
    };

    // The window starts one column to the left of the output position
    const auto gx_base = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
    const auto gy_base = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
    const auto output  = static_cast<float *__restrict>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Gx^2, Gy^2 and Gx*Gy sums for outputs 0..3 (first) and 4..7 (second)
    float32x4_t gx2_first   = vdupq_n_f32(0.0f);
    float32x4_t gy2_first   = vdupq_n_f32(0.0f);
    float32x4_t gxgy_first  = vdupq_n_f32(0.0f);
    float32x4_t gx2_second  = vdupq_n_f32(0.0f);
    float32x4_t gy2_second  = vdupq_n_f32(0.0f);
    float32x4_t gxgy_second = vdupq_n_f32(0.0f);

    // Rows above, at and below the output position
    for(int row = -1; row <= 1; ++row)
    {
        const int16_t *gx_row = gx_base + row * input_stride;
        const int16_t *gy_row = gy_base + row * input_stride;

        // Two overlapping 8-element loads, 4 elements apart: one per group of four outputs
        const int16x8_t gx_a = vld1q_s16(gx_row);
        const int16x8_t gy_a = vld1q_s16(gy_row);
        const int16x8_t gx_b = vld1q_s16(gx_row + 4);
        const int16x8_t gy_b = vld1q_s16(gy_row + 4);

        harris_score1x3_FLOAT_FLOAT_FLOAT(lower_f32(gx_a), lower_f32(gy_a), upper_f32(gx_a), upper_f32(gy_a), gx2_first, gy2_first, gxgy_first, norm_factor);
        harris_score1x3_FLOAT_FLOAT_FLOAT(lower_f32(gx_b), lower_f32(gy_b), upper_f32(gx_b), upper_f32(gy_b), gx2_second, gy2_second, gxgy_second, norm_factor);
    }

    // Score and store both groups
    vst1q_f32(output + 0, harris_score(gx2_first, gy2_first, gxgy_first, sensitivity, strength_thresh));
    vst1q_f32(output + 4, harris_score(gx2_second, gy2_second, gxgy_second, sensitivity, strength_thresh));
}
355 
/** Harris score over a 3x3 window for S32 gradients; writes eight F32 scores.
 *
 * @param[in]  input1_ptr         Pointer to the Gx value of the first output position.
 * @param[in]  input2_ptr         Pointer to the Gy value of the first output position.
 * @param[out] output_ptr         Destination for the eight scores.
 * @param[in]  input_stride       Distance between consecutive input rows, in elements.
 * @param[in]  in_norm_factor     Scale applied to every gradient before multiplying.
 * @param[in]  in_sensitivity     Factor applied to trace^2 in the score.
 * @param[in]  in_strength_thresh Scores not strictly above this value are stored as 0.
 */
inline void harris_score3x3_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // The window starts one column to the left of the output position
    const auto gx_base = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
    const auto gy_base = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
    const auto output  = static_cast<float *__restrict>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Gx^2, Gy^2 and Gx*Gy sums for outputs 0..3 (first) and 4..7 (second)
    float32x4_t gx2_first   = vdupq_n_f32(0.0f);
    float32x4_t gy2_first   = vdupq_n_f32(0.0f);
    float32x4_t gxgy_first  = vdupq_n_f32(0.0f);
    float32x4_t gx2_second  = vdupq_n_f32(0.0f);
    float32x4_t gy2_second  = vdupq_n_f32(0.0f);
    float32x4_t gxgy_second = vdupq_n_f32(0.0f);

    // Rows above, at and below the output position
    for(int row = -1; row <= 1; ++row)
    {
        const int32_t *gx_row = gx_base + row * input_stride;
        const int32_t *gy_row = gy_base + row * input_stride;

        // Three consecutive vectors; the middle one is the high half of the
        // first group and the low half of the second
        const float32x4_t gx_a = vcvtq_f32_s32(vld1q_s32(gx_row));
        const float32x4_t gy_a = vcvtq_f32_s32(vld1q_s32(gy_row));
        const float32x4_t gx_b = vcvtq_f32_s32(vld1q_s32(gx_row + 4));
        const float32x4_t gy_b = vcvtq_f32_s32(vld1q_s32(gy_row + 4));
        const float32x4_t gx_c = vcvtq_f32_s32(vld1q_s32(gx_row + 8));
        const float32x4_t gy_c = vcvtq_f32_s32(vld1q_s32(gy_row + 8));

        harris_score1x3_FLOAT_FLOAT_FLOAT(gx_a, gy_a, gx_b, gy_b, gx2_first, gy2_first, gxgy_first, norm_factor);
        harris_score1x3_FLOAT_FLOAT_FLOAT(gx_b, gy_b, gx_c, gy_c, gx2_second, gy2_second, gxgy_second, norm_factor);
    }

    // Score and store both groups
    vst1q_f32(output + 0, harris_score(gx2_first, gy2_first, gxgy_first, sensitivity, strength_thresh));
    vst1q_f32(output + 4, harris_score(gx2_second, gy2_second, gxgy_second, sensitivity, strength_thresh));
}
445 
/** Harris score over a 5x5 window for S16 gradients; writes eight F32 scores.
 *
 * @param[in]  input1_ptr         Pointer to the Gx value of the first output position.
 * @param[in]  input2_ptr         Pointer to the Gy value of the first output position.
 * @param[out] output_ptr         Destination for the eight scores.
 * @param[in]  input_stride       Distance between consecutive input rows, in elements.
 * @param[in]  in_norm_factor     Scale applied to every gradient before multiplying.
 * @param[in]  in_sensitivity     Factor applied to trace^2 in the score.
 * @param[in]  in_strength_thresh Scores not strictly above this value are stored as 0.
 */
inline void harris_score5x5_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // Widen one half of an S16 vector to four F32 lanes
    const auto lower_f32 = [](int16x8_t v)
    {
        return vcvtq_f32_s32(vmovl_s16(vget_low_s16(v)));
    };
    const auto upper_f32 = [](int16x8_t v)
    {
        return vcvtq_f32_s32(vmovl_s16(vget_high_s16(v)));
    };

    // The window starts two columns to the left of the output position
    const auto gx_base = static_cast<const int16_t *__restrict>(input1_ptr) - 2;
    const auto gy_base = static_cast<const int16_t *__restrict>(input2_ptr) - 2;
    const auto output  = static_cast<float *__restrict>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Gx^2, Gy^2 and Gx*Gy sums for outputs 0..3 (first) and 4..7 (second)
    float32x4_t gx2_first   = vdupq_n_f32(0.0f);
    float32x4_t gy2_first   = vdupq_n_f32(0.0f);
    float32x4_t gxgy_first  = vdupq_n_f32(0.0f);
    float32x4_t gx2_second  = vdupq_n_f32(0.0f);
    float32x4_t gy2_second  = vdupq_n_f32(0.0f);
    float32x4_t gxgy_second = vdupq_n_f32(0.0f);

    // Two rows above through two rows below the output position
    for(int row = -2; row <= 2; ++row)
    {
        const int16_t *gx_row = gx_base + row * input_stride;
        const int16_t *gy_row = gy_base + row * input_stride;

        // Two overlapping 8-element loads, 4 elements apart: one per group of four outputs
        const int16x8_t gx_a = vld1q_s16(gx_row);
        const int16x8_t gy_a = vld1q_s16(gy_row);
        const int16x8_t gx_b = vld1q_s16(gx_row + 4);
        const int16x8_t gy_b = vld1q_s16(gy_row + 4);

        harris_score1x5_FLOAT_FLOAT_FLOAT(lower_f32(gx_a), lower_f32(gy_a), upper_f32(gx_a), upper_f32(gy_a), gx2_first, gy2_first, gxgy_first, norm_factor);
        harris_score1x5_FLOAT_FLOAT_FLOAT(lower_f32(gx_b), lower_f32(gy_b), upper_f32(gx_b), upper_f32(gy_b), gx2_second, gy2_second, gxgy_second, norm_factor);
    }

    // Score and store both groups
    vst1q_f32(output + 0, harris_score(gx2_first, gy2_first, gxgy_first, sensitivity, strength_thresh));
    vst1q_f32(output + 4, harris_score(gx2_second, gy2_second, gxgy_second, sensitivity, strength_thresh));
}
530 
/** Harris score over a 5x5 window for S32 gradients; writes eight F32 scores.
 *
 * @param[in]  input1_ptr         Pointer to the Gx value of the first output position.
 * @param[in]  input2_ptr         Pointer to the Gy value of the first output position.
 * @param[out] output_ptr         Destination for the eight scores.
 * @param[in]  input_stride       Distance between consecutive input rows, in elements.
 * @param[in]  in_norm_factor     Scale applied to every gradient before multiplying.
 * @param[in]  in_sensitivity     Factor applied to trace^2 in the score.
 * @param[in]  in_strength_thresh Scores not strictly above this value are stored as 0.
 */
inline void harris_score5x5_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // The window starts two columns to the left of the output position
    const auto gx_base = static_cast<const int32_t *__restrict>(input1_ptr) - 2;
    const auto gy_base = static_cast<const int32_t *__restrict>(input2_ptr) - 2;
    const auto output  = static_cast<float *__restrict>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Gx^2, Gy^2 and Gx*Gy sums for outputs 0..3 (first) and 4..7 (second)
    float32x4_t gx2_first   = vdupq_n_f32(0.0f);
    float32x4_t gy2_first   = vdupq_n_f32(0.0f);
    float32x4_t gxgy_first  = vdupq_n_f32(0.0f);
    float32x4_t gx2_second  = vdupq_n_f32(0.0f);
    float32x4_t gy2_second  = vdupq_n_f32(0.0f);
    float32x4_t gxgy_second = vdupq_n_f32(0.0f);

    // Two rows above through two rows below the output position
    for(int row = -2; row <= 2; ++row)
    {
        const int32_t *gx_row = gx_base + row * input_stride;
        const int32_t *gy_row = gy_base + row * input_stride;

        // Three consecutive vectors; the middle one is the high half of the
        // first group and the low half of the second
        const float32x4_t gx_a = vcvtq_f32_s32(vld1q_s32(gx_row));
        const float32x4_t gy_a = vcvtq_f32_s32(vld1q_s32(gy_row));
        const float32x4_t gx_b = vcvtq_f32_s32(vld1q_s32(gx_row + 4));
        const float32x4_t gy_b = vcvtq_f32_s32(vld1q_s32(gy_row + 4));
        const float32x4_t gx_c = vcvtq_f32_s32(vld1q_s32(gx_row + 8));
        const float32x4_t gy_c = vcvtq_f32_s32(vld1q_s32(gy_row + 8));

        harris_score1x5_FLOAT_FLOAT_FLOAT(gx_a, gy_a, gx_b, gy_b, gx2_first, gy2_first, gxgy_first, norm_factor);
        harris_score1x5_FLOAT_FLOAT_FLOAT(gx_b, gy_b, gx_c, gy_c, gx2_second, gy2_second, gxgy_second, norm_factor);
    }

    // Score and store both groups
    vst1q_f32(output + 0, harris_score(gx2_first, gy2_first, gxgy_first, sensitivity, strength_thresh));
    vst1q_f32(output + 4, harris_score(gx2_second, gy2_second, gxgy_second, sensitivity, strength_thresh));
}
605 
/** Harris score over a 7x7 window for S16 gradients; writes four F32 scores.
 *
 * @param[in]  input1_ptr         Pointer to the Gx value of the first output position.
 * @param[in]  input2_ptr         Pointer to the Gy value of the first output position.
 * @param[out] output_ptr         Destination for the four scores.
 * @param[in]  input_stride       Distance between consecutive input rows, in elements.
 * @param[in]  in_norm_factor     Scale applied to every gradient before multiplying.
 * @param[in]  in_sensitivity     Factor applied to trace^2 in the score.
 * @param[in]  in_strength_thresh Scores not strictly above this value are stored as 0.
 */
inline void harris_score7x7_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // Widen four S16 values to four F32 lanes
    const auto to_f32 = [](int16x4_t v)
    {
        return vcvtq_f32_s32(vmovl_s16(v));
    };

    // The window starts three columns to the left of the output position
    const auto gx_base = static_cast<const int16_t *__restrict>(input1_ptr) - 3;
    const auto gy_base = static_cast<const int16_t *__restrict>(input2_ptr) - 3;
    const auto output  = static_cast<float *__restrict>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Gx^2, Gy^2 and Gx*Gy sums
    float32x4_t gx2  = vdupq_n_f32(0.0f);
    float32x4_t gy2  = vdupq_n_f32(0.0f);
    float32x4_t gxgy = vdupq_n_f32(0.0f);

    // Three rows above through three rows below the output position
    for(int row = -3; row <= 3; ++row)
    {
        const int16_t *gx_row = gx_base + row * input_stride;
        const int16_t *gy_row = gy_base + row * input_stride;

        // Twelve consecutive values per row: an 8-element load followed by a 4-element load
        const int16x8_t gx_head = vld1q_s16(gx_row);
        const int16x8_t gy_head = vld1q_s16(gy_row);
        const int16x4_t gx_tail = vld1_s16(gx_row + 8);
        const int16x4_t gy_tail = vld1_s16(gy_row + 8);

        harris_score1x7_FLOAT_FLOAT_FLOAT(to_f32(vget_low_s16(gx_head)), to_f32(vget_low_s16(gy_head)),
                                          to_f32(vget_high_s16(gx_head)), to_f32(vget_high_s16(gy_head)),
                                          to_f32(gx_tail), to_f32(gy_tail),
                                          gx2, gy2, gxgy, norm_factor);
    }

    // Score and store
    vst1q_f32(output, harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh));
}
651 
/** Harris score over a 7x7 window for S32 gradients; writes four F32 scores.
 *
 * @param[in]  input1_ptr         Pointer to the Gx value of the first output position.
 * @param[in]  input2_ptr         Pointer to the Gy value of the first output position.
 * @param[out] output_ptr         Destination for the four scores.
 * @param[in]  input_stride       Distance between consecutive input rows, in elements.
 * @param[in]  in_norm_factor     Scale applied to every gradient before multiplying.
 * @param[in]  in_sensitivity     Factor applied to trace^2 in the score.
 * @param[in]  in_strength_thresh Scores not strictly above this value are stored as 0.
 */
inline void harris_score7x7_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // The window starts three columns to the left of the output position
    const auto gx_base = static_cast<const int32_t *__restrict>(input1_ptr) - 3;
    const auto gy_base = static_cast<const int32_t *__restrict>(input2_ptr) - 3;
    const auto output  = static_cast<float *__restrict>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Gx^2, Gy^2 and Gx*Gy sums
    float32x4_t gx2  = vdupq_n_f32(0.0f);
    float32x4_t gy2  = vdupq_n_f32(0.0f);
    float32x4_t gxgy = vdupq_n_f32(0.0f);

    // Three rows above through three rows below the output position
    for(int row = -3; row <= 3; ++row)
    {
        const int32_t *gx_row = gx_base + row * input_stride;
        const int32_t *gy_row = gy_base + row * input_stride;

        // Twelve consecutive values per row, as three 4-element vectors
        const float32x4_t gx_a = vcvtq_f32_s32(vld1q_s32(gx_row));
        const float32x4_t gy_a = vcvtq_f32_s32(vld1q_s32(gy_row));
        const float32x4_t gx_b = vcvtq_f32_s32(vld1q_s32(gx_row + 4));
        const float32x4_t gy_b = vcvtq_f32_s32(vld1q_s32(gy_row + 4));
        const float32x4_t gx_c = vcvtq_f32_s32(vld1q_s32(gx_row + 8));
        const float32x4_t gy_c = vcvtq_f32_s32(vld1q_s32(gy_row + 8));

        harris_score1x7_FLOAT_FLOAT_FLOAT(gx_a, gy_a, gx_b, gy_b, gx_c, gy_c, gx2, gy2, gxgy, norm_factor);
    }

    // Score and store
    vst1q_f32(output, harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh));
}
696 
697 } // namespace
698 
INEHarrisScoreKernel()699 INEHarrisScoreKernel::INEHarrisScoreKernel()
700     : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
701 {
702 }
703 
704 template <int32_t block_size>
NEHarrisScoreKernel()705 NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
706     : INEHarrisScoreKernel(), _func(nullptr)
707 {
708 }
709 
710 template <int32_t block_size>
run(const Window & window,const ThreadInfo & info)711 void NEHarrisScoreKernel<block_size>::run(const Window &window, const ThreadInfo &info)
712 {
713     ARM_COMPUTE_UNUSED(info);
714     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
715     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
716     ARM_COMPUTE_ERROR_ON(_func == nullptr);
717 
718     Iterator input1(_input1, window);
719     Iterator input2(_input2, window);
720     Iterator output(_output, window);
721 
722     const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
723 
724     execute_window_loop(window, [&](const Coordinates &)
725     {
726         (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
727     },
728     input1, input2, output);
729 }
730 
731 template <int32_t block_size>
border_size() const732 BorderSize        NEHarrisScoreKernel<block_size>::border_size() const
733 {
734     return _border_size;
735 }
736 
737 template <int32_t block_size>
configure(const IImage * input1,const IImage * input2,IImage * output,float norm_factor,float strength_thresh,float sensitivity,bool border_undefined)738 void NEHarrisScoreKernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
739                                                 bool border_undefined)
740 {
741     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
742     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
743     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
744     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
745     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
746     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
747     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
748     ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
749 
750     _input1          = input1;
751     _input2          = input2;
752     _output          = output;
753     _sensitivity     = sensitivity;
754     _strength_thresh = strength_thresh;
755     _norm_factor     = norm_factor;
756     _border_size     = BorderSize(block_size / 2);
757 
758     if(input1->info()->data_type() == DataType::S16)
759     {
760         switch(block_size)
761         {
762             case 3:
763                 _func = &harris_score3x3_S16_S16_FLOAT;
764                 break;
765             case 5:
766                 _func = &harris_score5x5_S16_S16_FLOAT;
767                 break;
768             case 7:
769                 _func = &harris_score7x7_S16_S16_FLOAT;
770                 break;
771             default:
772                 ARM_COMPUTE_ERROR("Invalid block size");
773                 break;
774         }
775     }
776     else
777     {
778         switch(block_size)
779         {
780             case 3:
781                 _func = &harris_score3x3_S32_S32_FLOAT;
782                 break;
783             case 5:
784                 _func = &harris_score5x5_S32_S32_FLOAT;
785                 break;
786             case 7:
787                 _func = &harris_score7x7_S32_S32_FLOAT;
788                 break;
789             default:
790                 ARM_COMPUTE_ERROR("Invalid block size");
791                 break;
792         }
793     }
794 
795     ARM_COMPUTE_ERROR_ON(nullptr == _func);
796 
797     constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4;
798     constexpr unsigned int num_elems_read_per_iteration      = block_size != 7 ? 16 : 12;
799     constexpr unsigned int num_elems_written_per_iteration   = block_size != 7 ? 8 : 4;
800     constexpr unsigned int num_rows_read_per_iteration       = block_size;
801 
802     // Configure kernel window
803     Window                 win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
804     AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
805 
806     update_window_and_padding(win,
807                               AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
808                               AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
809                               output_access);
810 
811     ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
812                                                        input2->info()->valid_region());
813 
814     output_access.set_valid_region(win, valid_region, border_undefined, border_size());
815 
816     INEKernel::configure(win);
817 }
818