• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H
25 #define SRC_CORE_NEON_KERNELS_QUANTIZED_H
26 
27 #include "arm_compute/core/Types.h"
28 #include "arm_compute/core/utils/misc/Traits.h"
29 #include "src/core/NEON/NEAsymm.h"
30 #include "src/core/NEON/NEFixedPoint.h"
31 #include "src/core/NEON/NEMath.h"
32 #include "src/core/NEON/wrapper/wrapper.h"
33 #include "src/core/helpers/PoolingHelpers.h"
34 #include <arm_neon.h>
35 
36 namespace arm_compute
37 {
38 namespace cpu
39 {
40 template <typename T>
poolingMxN_q8_neon_nhwc(const ITensor * src,ITensor * dst0,ITensor * dst1,PoolingLayerInfo & pool_info,const Window & window_src,const Window & window)41 void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
42 {
43     ARM_COMPUTE_UNUSED(dst1);
44 
45     const int window_start_x     = window.x().start();
46     const int window_end_x       = window.x().end();
47     const int window_step_x      = 16;
48     const int window_half_step_x = window_step_x / 2;
49 
50     Window window_out = window;
51     window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
52 
53     Iterator in(src, window_src);
54     Iterator out(dst0, window_out);
55 
56     using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
57     using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
58     using q16_t   = typename wrapper::traits::promote_t<T>;
59     using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
60     using q32_t   = typename wrapper::traits::promote_t<q16_t>;
61     using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
62 
63     const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
64     const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
65     const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
66     const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
67     const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
68     const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
69 
70     int pool_stride_x = 0;
71     int pool_stride_y = 0;
72     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
73     const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
74     const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
75 
76     const float32x4_t             half_scale_v = vdupq_n_f32(0.5f);
77     const UniformQuantizationInfo src_qinfo    = src->info()->quantization_info().uniform();
78     const UniformQuantizationInfo dst_qinfo    = dst0->info()->quantization_info().uniform();
79 
80     const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
81     // "new_offset" doesn't have to consider the "half_scale_v" in its computation
82     // With a requantization performed in a single step there won't be uncertainties introduced
83     const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
84 
85     const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
86     const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
87     const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
88 
89     execute_window_loop(window_out, [&](const Coordinates & id)
90     {
91         const int idx_width    = id.y() * pool_stride_x;
92         const int idx_height   = id.z() * pool_stride_y;
93         const int pool_limit_y = pool_pad_top - idx_height;
94         const int pool_limit_x = pool_pad_left - idx_width;
95 
96         const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
97         const int pool_end_y   = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
98         const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
99         const int pool_end_x   = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
100 
101         int x_off = window_start_x;
102         for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
103         {
104             if(pool_info.pool_type != PoolingType::MAX)
105             {
106                 q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
107                 q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
108                 q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
109                 q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
110 
111                 // Calculate scale
112                 const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
113                                                                pool_stride_y);
114 
115                 // Perform pooling
116                 for(int y = pool_start_y; y < pool_end_y; ++y)
117                 {
118                     for(int x = pool_start_x; x < pool_end_x; ++x)
119                     {
120                         const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
121                                                                                          (src->info()->strides_in_bytes().z())) + x_off);
122 
123                         const q16x8_t data_q16  = wrapper::vmovl(wrapper::vgetlow(data));
124                         const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
125                         vres1                   = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
126                         vres2                   = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
127                         vres3                   = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
128                         vres4                   = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
129                     }
130                 }
131 
132                 if(src_qinfo != dst_qinfo)
133                 {
134                     const float32x4x4_t vres =
135                     {
136                         {
137                             vcvtq_f32_q32(vres1),
138                             vcvtq_f32_q32(vres2),
139                             vcvtq_f32_q32(vres3),
140                             vcvtq_f32_q32(vres4),
141                         }
142                     };
143                     const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
144                     // Store result
145                     wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
146                     wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
147                 }
148                 else
149                 {
150                     const float32x4_t scale_v = vdupq_n_f32(scale);
151                     // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
152                     vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
153                     vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
154                     vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
155                     vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
156 
157                     const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
158                     const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
159                     // Store result
160                     wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
161                     wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
162                 }
163             }
164             else
165             {
166                 q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
167 
168                 for(int y = pool_start_y; y < pool_end_y; ++y)
169                 {
170                     for(int x = pool_start_x; x < pool_end_x; ++x)
171                     {
172                         const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
173                                                                                          (src->info()->strides_in_bytes().z())) + x_off);
174                         vres               = wrapper::vmax(vres, data);
175                     }
176                 }
177 
178                 // Store result
179                 wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
180                                 requant_qinfo) :
181                                 vres);
182             }
183         }
184 
185         if(pool_info.pool_type == PoolingType::MAX)
186         {
187             for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
188             {
189                 q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
190                 for(int y = pool_start_y; y < pool_end_y; ++y)
191                 {
192                     for(int x = pool_start_x; x < pool_end_x; ++x)
193                     {
194                         const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
195                                                                                        (src->info()->strides_in_bytes().z())) + x_off);
196                         vres              = wrapper::vmax(vres, data);
197                     }
198                 }
199 
200                 // Store result
201                 wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
202                                 (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
203             }
204         }
205 
206         // Left-overs loop
207         for(; x_off < window_end_x; ++x_off)
208         {
209             if(pool_info.pool_type != PoolingType::MAX)
210             {
211                 q32_t res = static_cast<q32_t>(0.f);
212 
213                 // Calculate scale
214                 const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
215                                                                pool_stride_y);
216 
217                 // Perform pooling
218                 for(int y = pool_start_y; y < pool_end_y; ++y)
219                 {
220                     for(int x = pool_start_x; x < pool_end_x; ++x)
221                     {
222                         const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
223                                                                      (src->info()->strides_in_bytes().z())) + x_off);
224                         res += data;
225                     }
226                 }
227 
228                 if(src_qinfo != dst_qinfo)
229                 {
230                     const float res_f           = static_cast<float>(res);
231                     const float new_scale       = quant_rescale / scale;
232                     const auto  requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
233 
234                     // Store result
235                     *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
236                 }
237                 else
238                 {
239                     // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
240                     res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
241 
242                     // Store result
243                     *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
244                 }
245             }
246             else
247             {
248                 T res = std::numeric_limits<T>::min();
249 
250                 for(int y = pool_start_y; y < pool_end_y; ++y)
251                 {
252                     for(int x = pool_start_x; x < pool_end_x; ++x)
253                     {
254                         const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
255                                                                      (src->info()->strides_in_bytes().z())) + x_off);
256                         res          = std::max(res, data);
257                     }
258                 }
259 
260                 // Store result
261                 if(src_qinfo != dst_qinfo)
262                 {
263                     const float res_f                           = static_cast<float>(res);
264                     *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
265                 }
266                 else
267                 {
268                     *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
269                 }
270             }
271         }
272 
273     },
274     in, out);
275 }
276 
277 #if defined(ENABLE_NCHW_KERNELS)
/** Scale eight packed 16-bit accumulated sums in place by the reciprocal of each
 * lane's pooling-region size (used by the NCHW average-pooling kernels).
 *
 * Lane i of @p v holds the sum for the output at horizontal position
 * (id.x() + id_offset) + i * step. Each lane's region is clipped against
 * @p upper_bound_w / @p upper_bound_h and, when @p exclude_padding is true,
 * against the tensor origin, so border outputs are averaged only over the
 * elements that actually contribute.
 *
 * @param[in]     exclude_padding If true, padded positions are excluded from the divisor.
 * @param[in,out] v               Vector of eight 16-bit sums, scaled lane by lane in place.
 * @param[in]     id              Coordinates of the first output element covered by @p v.
 * @param[in]     id_offset       Horizontal offset (in output elements) added to id.x().
 * @param[in]     step            Horizontal distance (in output elements) between consecutive lanes.
 * @param[in]     pool_size       Pooling window size (same extent in x and y).
 * @param[in]     upper_bound_w   Exclusive horizontal clip bound for the region end.
 * @param[in]     upper_bound_h   Exclusive vertical clip bound for the region end.
 * @param[in]     pad_x           Left padding.
 * @param[in]     pad_y           Top padding.
 * @param[in]     stride_x        Horizontal pooling stride.
 * @param[in]     stride_y        Vertical pooling stride.
 */
template <typename T, typename TVec>
inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
                               const int pool_size, const int upper_bound_w, const int upper_bound_h,
                               const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    int       start_x = (id.x() + id_offset) * stride_x - pad_x;
    int       start_y = id.y() * stride_y - pad_y;
    const int end_y   = std::min(start_y + pool_size, upper_bound_h);
    if(exclude_padding)
    {
        // Clamp the vertical start so rows in the top padding do not count.
        start_y = std::max(0, start_y);
    }

    // Extract the lanes so each can be scaled with its own divisor.
    std::array<T, 8> elems =
    {
        {
            wrapper::vgetlane(v, 0),
            wrapper::vgetlane(v, 1),
            wrapper::vgetlane(v, 2),
            wrapper::vgetlane(v, 3),
            wrapper::vgetlane(v, 4),
            wrapper::vgetlane(v, 5),
            wrapper::vgetlane(v, 6),
            wrapper::vgetlane(v, 7),
        }
    };

    for(auto &el : elems)
    {
        int       c_start_x = start_x;
        const int end_x     = std::min(c_start_x + pool_size, upper_bound_w);
        if(exclude_padding)
        {
            c_start_x = std::max(0, c_start_x);
        }
        // Divisor is the clipped region area; the float product is converted
        // back to the integer lane type (truncation towards zero).
        float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
        el *= scale;
        start_x += step * stride_x;
    }

    // Pack the scaled lanes back into the vector.
    v = wrapper::vsetlane(elems[0], v, 0);
    v = wrapper::vsetlane(elems[1], v, 1);
    v = wrapper::vsetlane(elems[2], v, 2);
    v = wrapper::vsetlane(elems[3], v, 3);
    v = wrapper::vsetlane(elems[4], v, 4);
    v = wrapper::vsetlane(elems[5], v, 5);
    v = wrapper::vsetlane(elems[6], v, 6);
    v = wrapper::vsetlane(elems[7], v, 7);
}
327 
328 template <typename T>
load16_boundary_aware(int srcw,int srch,int pad_l,int pad_r,int pad_t,int pad_b,int x,int y,const T * ptr,T fval)329 auto load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval)
330 {
331     ARM_COMPUTE_UNUSED(pad_b, pad_r);
332     T vec[16];
333     //handle reading a row out of the tensor
334     const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
335     for(int i = 0; i < 16; i++)
336     {
337         if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
338         {
339             vec[i] = *(ptr + i);
340         }
341         else
342         {
343             vec[i] = fval;
344         }
345     }
346     return wrapper::vloadq(vec);
347 }
348 
/** Store up to sixteen results, clipping writes at the destination width.
 *
 * With @p deinterleave set, @p lower fills the even output positions and
 * @p upper the odd ones; otherwise @p lower fills the first 8 positions and
 * @p upper the next 8. Elements whose destination index would reach or exceed
 * @p dst_w are dropped.
 *
 * @param[in]  x     Horizontal output position of the first element.
 * @param[in]  dst_w Destination width (exclusive bound).
 * @param[in]  lower First 8-element result vector.
 * @param[in]  upper Second 8-element result vector.
 * @param[out] ptr   Destination pointer.
 */
template <typename T, typename V, bool deinterleave>
inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &upper, T *ptr)
{
    if(deinterleave)
    {
        // Even lanes come from 'lower', odd lanes from 'upper'.
        for(int i = 0; (x + i * 2) < dst_w && i < 8; ++i)
        {
            ptr[i * 2] = lower[i];
        }
        for(int i = 0; (x + i * 2 + 1) < dst_w && i < 8; ++i)
        {
            ptr[i * 2 + 1] = upper[i];
        }
    }
    else
    {
        // Contiguous store: 'lower' then 'upper'.
        for(int i = 0; (x + i) < dst_w && i < 8; ++i)
        {
            ptr[i] = lower[i];
        }
        for(int i = 0; (x + i + 8) < dst_w && i < 8; ++i)
        {
            ptr[i + 8] = upper[i];
        }
    }
}
375 
/** Store up to eight results, dropping elements whose destination index
 * (starting at @p x) would reach or exceed @p dst_w.
 *
 * @param[in]  x     Horizontal output position of the first element.
 * @param[in]  dst_w Destination width (exclusive bound).
 * @param[in]  v     8-element result vector.
 * @param[out] ptr   Destination pointer.
 */
template <typename T, typename V>
inline void write8_boundary_aware(int x, int dst_w, const V &v, T *ptr)
{
    for(int i = 0; (x + i) < dst_w && i < 8; ++i)
    {
        ptr[i] = v[i];
    }
}
384 
/** Quantized 2x2 pooling for NCHW tensors (8-bit element types).
 *
 * Loads 16 input elements per iteration; for stride_x == 1 it additionally
 * computes a shifted "upper" result so 16 outputs are produced and written
 * interleaved, otherwise 8 outputs are written contiguously.
 *
 * @param[in]  src        Source tensor.
 * @param[out] dst0       Destination tensor.
 * @param[in]  dst1       Unused (no indices output in this kernel).
 * @param[in]  pool_info  Pooling parameters (type, pad/stride, exclude_padding, ...).
 * @param[in]  window_src Execution window over the source tensor.
 * @param[in]  window     Execution window over the destination tensor.
 */
template <typename T>
void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    /** SIMD vector types */
    using q8x8_t    = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t   = typename wrapper::traits::neon_vector<T, 16>::type;
    using q16_t     = typename wrapper::traits::promote_t<T>;
    using q16x4_t   = typename wrapper::traits::neon_vector<q16_t, 4>::type;
    using q16x8_t   = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;

    constexpr int pool_size       = 2;
    int           pool_stride_x   = 0;
    int           pool_stride_y   = 0;
    const int     pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int     pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int     pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int     pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int                     upper_bound_w        = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int                     upper_bound_h        = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    // Base pointers to the two rows of the 2x2 window, shifted back by the padding.
    const T *const                src_top_ptr          = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
    const T *const                src_bottom_ptr       = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
    const int                     scale_step_x         = (pool_stride_x == 1) ? 2 : 1;
    const UniformQuantizationInfo src_qinfo            = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo dst_qinfo            = dst0->info()->quantization_info().uniform();
    const bool                    have_different_qinfo = src_qinfo != dst_qinfo;

    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
    const int                     src_w          = src->info()->dimension(0);
    const int                     src_h          = src->info()->dimension(1);
    const int                     dst_w          = dst0->info()->dimension(0);

    // Out-of-bounds loads are replaced by a value that is neutral for the pooling type
    // (minimum for MAX, zero for the summing/average path).
    const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? std::numeric_limits<T>::min() : T(0);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto x_val   = id.x() * pool_stride_x;
        const auto y_val_0 = id.y() * pool_stride_y;
        const auto y_val_1 = (id.y() * pool_stride_y) + 1;

        auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                              x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
        auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                                 x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);

        q8x8_t lower_res = {};
        q8x8_t upper_res = {};

        if(pool_info.pool_type != PoolingType::MAX)
        {
            // Widen to 16 bit so the 2x2 sums cannot overflow.
            const q16x8x2_t top_data_q16    = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
            const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };

            // Add rows
            const q16x8x2_t vrsum =
            {
                {
                    wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
                    wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
                }
            };

            // Pair-wise add row data
            const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
            const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));

            q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);

            // Scale lower result
            scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x,
                                               pool_size, upper_bound_w, upper_bound_h,
                                               pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
            lower_res = wrapper::vmovn(res_lower);

            // Compute upper result for stride_x == 1
            if(pool_stride_x == 1)
            {
                // Shifted row sum
                const q16x8x2_t vrsum_shifted =
                {
                    {
                        wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
                        wrapper::vext_1(vrsum.val[1], vrsum.val[1])
                    }
                };

                // Pair-wise add shifted row
                q16x8_t res_upper = wrapper::vcombine(
                                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
                                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));

                // Scale upper result
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                upper_res = wrapper::vmovn(res_upper);
            }
        }
        else
        {
            // MAX pooling: max of the two rows, then pair-wise max along x.
            const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
            lower_res              = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
            if(pool_stride_x == 1)
            {
                const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
                upper_res                      = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
            }
        }

        if(have_different_qinfo)
        {
            // Requantize both halves in one go when src and dst quantization differ.
            const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
            lower_res                  = wrapper::vgetlow(requantized_dst);
            upper_res                  = wrapper::vgethigh(requantized_dst);
        }
        auto out_ptr = reinterpret_cast<T *>(out.ptr());
        // Store result
        if(pool_stride_x == 1)
        {
            // Interleave lower/upper results into consecutive output positions.
            write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr);
        }
        else
        {
            write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr);
        }
    },
    in, out);
}
520 
521 template <typename T>
pooling3_quantized_neon_nchw(const ITensor * src,ITensor * dst0,ITensor * dst1,PoolingLayerInfo & pool_info,const Window & window_src,const Window & window)522 void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
523 {
524     ARM_COMPUTE_UNUSED(dst1);
525     Iterator in(src, window_src);
526     Iterator out(dst0, window);
527 
528     /** SIMD vector types */
529     using q8x8_t    = typename wrapper::traits::neon_vector<T, 8>::type;
530     using q8x16_t   = typename wrapper::traits::neon_vector<T, 16>::type;
531     using q8x8x2_t  = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
532     using q16_t     = typename wrapper::traits::promote_t<T>;
533     using q16x8_t   = typename wrapper::traits::neon_vector<q16_t, 8>::type;
534     using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
535 
536     constexpr int pool_size       = 3;
537     const int     pool_pad_right  = pool_info.pad_stride_info.pad_right();
538     const int     pool_pad_top    = pool_info.pad_stride_info.pad_top();
539     const int     pool_pad_left   = pool_info.pad_stride_info.pad_left();
540     const int     pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
541     int           pool_stride_x   = 0;
542     int           pool_stride_y   = 0;
543     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
544     const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
545     const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
546 
547     const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
548     const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
549 
550     const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
551     const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
552     const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
553 
554     const T *const src_top_ptr    = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
555     const T *const src_middle_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
556     const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
557 
558     const int src_w      = src->info()->dimension(0);
559     const int src_h      = src->info()->dimension(1);
560     const T   fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
561     const int dst_w      = dst0->info()->dimension(0);
562 
563     execute_window_loop(window, [&](const Coordinates & id)
564     {
565         const auto x_val   = id.x() * pool_stride_x;
566         const auto y_val_0 = id.y() * pool_stride_y;
567         const auto y_val_1 = (id.y() * pool_stride_y) + 1;
568         const auto y_val_2 = (id.y() * pool_stride_y) + 2;
569 
570         auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
571                                               x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
572         auto middle_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
573                                                  x_val, y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value);
574         auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
575                                                  x_val, y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
576 
577         q8x8_t  fres  = {};
578         q8x16_t fqres = {};
579 
580         if(pool_info.pool_type == PoolingType::AVG)
581         {
582             // Convert data to u16
583             const q16x8x2_t top_data_q16    = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
584             const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
585             const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
586 
587             // Calculate row sums
588             const q16x8x2_t vrsum =
589             {
590                 {
591                     wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
592                     wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
593                 }
594             };
595             const q16x8x2_t vrsum_shifted_1 =
596             {
597                 {
598                     wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
599                     wrapper::vext_1(vrsum.val[1], vrsum.val[1])
600                 }
601             };
602             const q16x8x2_t vrsum_shifted_2 =
603             {
604                 {
605                     wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
606                     wrapper::vext_2(vrsum.val[1], vrsum.val[1])
607                 }
608             };
609             // Calculate final sum
610             q16x8x2_t final_sum =
611             {
612                 {
613                     wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
614                     wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
615                 }
616             };
617             if(pool_stride_x == 2)
618             {
619                 q16x8_t res =
620                 {
621                     wrapper::vgetlane(final_sum.val[0], 0),
622                     wrapper::vgetlane(final_sum.val[0], 2),
623                     wrapper::vgetlane(final_sum.val[0], 4),
624                     wrapper::vgetlane(final_sum.val[0], 6),
625                     wrapper::vgetlane(final_sum.val[1], 0),
626                     wrapper::vgetlane(final_sum.val[1], 2),
627                     wrapper::vgetlane(final_sum.val[1], 4),
628                     wrapper::vgetlane(final_sum.val[1], 6),
629                 };
630 
631                 scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1,
632                                                    pool_size, upper_bound_w, upper_bound_h,
633                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
634                 fres = wrapper::vmovn(res);
635             }
636             else
637             {
638                 // Scale lower result
639                 scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1,
640                                                    pool_size, upper_bound_w, upper_bound_h,
641                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
642                 // Scale lower result
643                 scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1,
644                                                    pool_size, upper_bound_w, upper_bound_h,
645                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
646                 fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
647             }
648         }
649         else
650         {
651             const q8x16_t max_data        = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
652             const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
653             const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
654             const q8x16_t final_max       = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
655 
656             if(pool_stride_x == 2)
657             {
658                 const q8x8x2_t      table      = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
659                 static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
660                 fres                           = wrapper::vtbl(table, lookup_val);
661             }
662             else
663             {
664                 fqres = final_max;
665             }
666         }
667 
668         // Store result
669         if(pool_stride_x == 1)
670         {
671             if(src_qinfo != dst_qinfo)
672             {
673                 fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
674             }
675             write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr()));
676         }
677         else
678         {
679             if(src_qinfo != dst_qinfo)
680             {
681                 fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
682             }
683             write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr()));
684         }
685     },
686     in, out);
687 }
688 
689 template <typename T>
poolingMxN_quantized_neon_nchw(const ITensor * src,ITensor * dst0,ITensor * dst1,PoolingLayerInfo & pool_info,const Window & window_src,const Window & window)690 void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
691 {
692     ARM_COMPUTE_UNUSED(dst1);
693     Iterator in(src, window_src);
694     Iterator out(dst0, window);
695 
696     /** SIMD vector types */
697     using q16_t = typename wrapper::traits::promote_t<T>;
698     using q32_t = typename wrapper::traits::promote_t<q16_t>;
699 
700     const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
701     const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
702     const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
703     const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
704     const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
705     const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
706     int       pool_stride_x   = 0;
707     int       pool_stride_y   = 0;
708     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
709     const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
710     const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
711 
712     const UniformQuantizationInfo &src_qinfo        = src->info()->quantization_info().uniform();
713     const UniformQuantizationInfo &dst_qinfo        = dst0->info()->quantization_info().uniform();
714     const int                      src_w            = src->info()->dimension(0);
715     const int                      src_h            = src->info()->dimension(1);
716     const T                        fill_value       = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
717     const int                      stridex_in_bytes = static_cast<int>(src->info()->strides_in_bytes().x());
718     const int                      stridey_in_bytes = static_cast<int>(src->info()->strides_in_bytes().y());
719 
720     execute_window_loop(window, [&](const Coordinates & id)
721     {
722         T res = std::numeric_limits<T>::min();
723 
724         if(pool_info.pool_type != PoolingType::MAX)
725         {
726             q32_t sres = 0;
727 
728             // Calculate scale
729             const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
730                                                            pool_stride_y);
731 
732             // Perform pooling
733             for(int y = 0; y < pool_size_y; ++y)
734             {
735                 for(int x = 0; x < pool_size_x; ++x)
736                 {
737                     const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
738 
739                     const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
740                     const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
741                     const T   data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
742                     sres += data;
743                 }
744             }
745             // Divide by scale
746             res = static_cast<T>(support::cpp11::round(sres * scale));
747         }
748         else
749         {
750             for(int y = 0; y < pool_size_y; ++y)
751             {
752                 for(int x = 0; x < pool_size_x; ++x)
753                 {
754                     const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
755 
756                     const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
757                     const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
758                     const T   data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
759                     res            = std::max(res, data);
760                 }
761             }
762         }
763         // Store result
764         res                                 = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) : res;
765         *(reinterpret_cast<T *>(out.ptr())) = res;
766     },
767     in, out);
768 }
769 #endif /* defined(ENABLE_NCHW_KERNELS) */
770 } // namespace cpu
771 } // namespace arm_compute
772 
773 #endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H
774