• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2017-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/core/NEON/kernels/NEPoolingLayerKernel.h"
25 
26 #include "arm_compute/core/Error.h"
27 #include "arm_compute/core/Helpers.h"
28 #include "arm_compute/core/ITensor.h"
29 #include "arm_compute/core/TensorInfo.h"
30 #include "arm_compute/core/Utils.h"
31 #include "arm_compute/core/Validate.h"
32 #include "arm_compute/core/Window.h"
33 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
34 #include "src/core/AccessWindowStatic.h"
35 #include "src/core/CPP/Validate.h"
36 #include "src/core/NEON/NEAsymm.h"
37 #include "src/core/NEON/NEFixedPoint.h"
38 #include "src/core/NEON/NEMath.h"
39 #include "src/core/helpers/AutoConfiguration.h"
40 #include "src/core/helpers/WindowHelpers.h"
41 #include "support/ToolchainSupport.h"
42 
43 #include "src/core/NEON/wrapper/wrapper.h"
44 #include <algorithm>
45 #include <arm_neon.h>
46 #include <cmath>
47 #include <limits>
48 #include <set>
49 #include <string>
50 #include <tuple>
51 
52 namespace arm_compute
53 {
54 using namespace misc::shape_calculator;
55 
56 namespace
57 {
58 template <typename T>
59 inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
quantize(float val,const UniformQuantizationInfo & info)60 quantize(float val, const UniformQuantizationInfo &info)
61 {
62     return quantize_qasymm8_signed(val, info);
63 }
64 
65 template <typename T>
66 inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
quantize(float val,const UniformQuantizationInfo & info)67 quantize(float val, const UniformQuantizationInfo &info)
68 {
69     return quantize_qasymm8(val, info);
70 }
71 
calculate_avg_scale(bool exclude_padding,DataLayout data_layout,const Coordinates & id,const int pool_size_x,const int pool_size_y,const int upper_bound_w,const int upper_bound_h,const int pad_x,const int pad_y,const int stride_x,const int stride_y)72 inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
73                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
74 {
75     const unsigned int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
76     const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
77 
78     int start_x = id[idx_width] * stride_x - pad_x;
79     int start_y = id[idx_height] * stride_y - pad_y;
80 
81     const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
82     const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
83     if(exclude_padding)
84     {
85         start_x = std::max(0, start_x);
86         start_y = std::max(0, start_y);
87     }
88     return 1.f / ((end_y - start_y) * (end_x - start_x));
89 }
90 
91 template <typename T, typename TVec>
scale_vector_q16x8(bool exclude_padding,TVec & v,const Coordinates & id,int id_offset,int step,const int pool_size,const int upper_bound_w,const int upper_bound_h,const int pad_x,const int pad_y,const int stride_x,const int stride_y)92 inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
93                                const int pool_size, const int upper_bound_w, const int upper_bound_h,
94                                const int pad_x, const int pad_y, const int stride_x, const int stride_y)
95 {
96     int       start_x = (id.x() + id_offset) * stride_x - pad_x;
97     int       start_y = id.y() * stride_y - pad_y;
98     const int end_y   = std::min(start_y + pool_size, upper_bound_h);
99     if(exclude_padding)
100     {
101         start_y = std::max(0, start_y);
102     }
103 
104     std::array<T, 8> elems =
105     {
106         {
107             wrapper::vgetlane(v, 0),
108             wrapper::vgetlane(v, 1),
109             wrapper::vgetlane(v, 2),
110             wrapper::vgetlane(v, 3),
111             wrapper::vgetlane(v, 4),
112             wrapper::vgetlane(v, 5),
113             wrapper::vgetlane(v, 6),
114             wrapper::vgetlane(v, 7),
115         }
116     };
117 
118     for(auto &el : elems)
119     {
120         int       c_start_x = start_x;
121         const int end_x     = std::min(c_start_x + pool_size, upper_bound_w);
122         if(exclude_padding)
123         {
124             c_start_x = std::max(0, c_start_x);
125         }
126         float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
127         el *= scale;
128         start_x += step * stride_x;
129     }
130 
131     v = wrapper::vsetlane(elems[0], v, 0);
132     v = wrapper::vsetlane(elems[1], v, 1);
133     v = wrapper::vsetlane(elems[2], v, 2);
134     v = wrapper::vsetlane(elems[3], v, 3);
135     v = wrapper::vsetlane(elems[4], v, 4);
136     v = wrapper::vsetlane(elems[5], v, 5);
137     v = wrapper::vsetlane(elems[6], v, 6);
138     v = wrapper::vsetlane(elems[7], v, 7);
139 }
140 
/** Validate tensor infos and pooling configuration.
 *
 *  Returns an error Status at the first violated precondition, Status{} on success.
 *  The macro checks expand to early returns, so their order matters (null check first).
 *
 *  NOTE(review): pooled_w is taken by non-const reference but is never written here —
 *  presumably to mirror a caller-side signature; confirm before changing.
 */
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info,
                          unsigned int &pooled_w, unsigned int pooled_h, const ITensorInfo *indices, Size2D pool_size)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);

    // NOTE(review): pool_stride_x/y are extracted but not used by any check below.
    int                 pool_stride_x   = 0;
    int                 pool_stride_y   = 0;
    PoolingType         pool_type       = pool_info.pool_type;
    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();

    // F16 input is rejected when the CPU lacks FP16 support.
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
    if(indices)
    {
        // Extraction of max-pooling indices is only implemented for float inputs and U32 indices.
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
    }
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
    // L2 pooling needs square/sqrt and is not defined for quantized types.
    ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
    // Quantized NHWC AVG pooling that counts padding in the divisor is unsupported.
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
                                    && (input->data_layout() == DataLayout::NHWC),
                                    "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");

    // Checks against an already-initialized output (total_size() == 0 means auto-init later).
    if(output->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
                                    || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));

        if(indices)
        {
            // Indices extraction is restricted to 2x2 pools; indices tensor must match the pooled shape.
            ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
            ARM_COMPUTE_RETURN_ERROR_ON((indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
                                        || (indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
        }
    }

    return Status{};
}
182 
/** Sanity-check the pool size before it is fed to scaled_dimensions (a zero dimension
 *  would divide by zero there). Each macro stringifies its condition into the error
 *  message, so the two checks are kept separate on purpose.
 */
Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
{
    ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
    ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);

    return Status{};
}
190 
/** Auto-initialize output/indices, pick the per-iteration element counts for the
 *  selected kernel variant, and (for NCHW) build the execution window and border size.
 *
 *  Outputs via reference: num_elems_processed_per_iteration and border_size.
 *  Returns {error, window}; the error is set when the required padding could not be
 *  applied by update_window_and_padding.
 *
 *  NOTE(review): pooled_w/pooled_h are passed by value and immediately recomputed via
 *  scaled_dimensions below, so the incoming values are effectively ignored — confirm
 *  against callers before simplifying.
 */
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
                                                        unsigned int &num_elems_processed_per_iteration,
                                                        BorderSize   &border_size,
                                                        unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
{
    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
    if(indices)
    {
        // Indices auto initialization if not yet initialized; same shape as the output,
        // but U32 since each element stores the offset of the max element.
        auto_init_if_empty(*indices, (input->clone()->set_tensor_shape(compute_pool_shape(*input,
                                                                                          pool_info)))
                           .set_data_type(DataType::U32) /* we store the offset to the element */);
    }
    // DataLayout::UNKNOWN in pool_info means "inherit from the input tensor".
    const auto          data_layout                  = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
    unsigned int        num_elems_read_per_iteration = 0;
    unsigned int        num_elems_horizontal_window  = 0;
    int                 pool_stride_x                = 0;
    int                 pool_stride_y                = 0;
    const int           idx_width                    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const int           idx_height                   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
    const int           input_width                  = input->dimension(idx_width);
    const int           input_height                 = input->dimension(idx_height);
    const PadStrideInfo pad_stride_info              = pool_info.pad_stride_info;
    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
    const int  pool_pad_right  = pad_stride_info.pad_right();
    const int  pool_pad_top    = pad_stride_info.pad_top();
    const int  pool_pad_left   = pad_stride_info.pad_left();
    const int  pool_pad_bottom = pad_stride_info.pad_bottom();
    const bool is_square       = pool_size_x == pool_size_y;

    // Check output dimensions
    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
                                                     input->dimension(idx_height),
                                                     pool_size_x,
                                                     pool_size_y,
                                                     pad_stride_info);

    // If it's not squared (no vectorized fast path) the generic MxN kernel is executed:
    // one element per iteration.
    num_elems_read_per_iteration      = 1;
    num_elems_processed_per_iteration = 1;
    num_elems_horizontal_window       = 1;

    // Vectorized kernels exist only for square pools of specific sizes; the counts below
    // must match the loads/stores performed by the corresponding poolingN_* implementations.
    if(is_square)
    {
        switch(input->data_type())
        {
            case DataType::QASYMM8:
            case DataType::QASYMM8_SIGNED:
                switch(pool_size_x)
                {
                    case 2:
                        num_elems_read_per_iteration      = 16;
                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 8 : 16;
                        break;
                    case 3:
                        num_elems_read_per_iteration      = 16;
                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 8 : 16;
                        break;
                    default:
                        break;
                }
                break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            case DataType::F16:
                switch(pool_size_x)
                {
                    case 2:
                    case 3:
                        num_elems_read_per_iteration      = 4;
                        num_elems_processed_per_iteration = 1;
                        num_elems_horizontal_window       = 1;
                        break;
                    default:
                        break;
                }
                break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
            case DataType::F32:
                switch(pool_size_x)
                {
                    case 2:
                        num_elems_read_per_iteration = 2;
                        break;
                    case 3:
                        num_elems_read_per_iteration = 4; // We use vload4 for pooling3
                        break;
                    case 7:
                        num_elems_read_per_iteration = 8; // We use vload8 for pooling7
                        break;
                    default:
                        break;
                }
                num_elems_processed_per_iteration = 1;
                num_elems_horizontal_window       = 1;
                break;
            default:
                ARM_COMPUTE_ERROR("Element size not supported");
                break;
        }
    }

    bool   window_changed = false;
    Window win{};
    // Only NCHW uses a padded execution window; NHWC windows are configured by the caller.
    if(data_layout == DataLayout::NCHW)
    {
        // Number of iterations in X dimension
        const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
        // Upper limit for the number of right/bottom border elements that are accessed
        const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
        // Grow the border if the vectorized reads overrun the user-requested padding.
        border_size             = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
        border_size.right       = std::max(upper_bound_w, pool_pad_right);
        border_size.bottom      = std::max(upper_bound_h, pool_pad_bottom);
        TensorShape output_shape{ input->tensor_shape() };
        output_shape.set(0, pooled_w);
        output_shape.set(1, pooled_h);
        TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
        win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
        AccessWindowStatic     input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
        AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
        if(indices)
        {
            AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window);
            window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
        }
        else
        {
            window_changed = update_window_and_padding(win, input_access, output_access);
        }
        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
    }

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
329 
/** Convert a float32x4_t to the 32-bit integer NEON vector type selected by T
 *  (uint32x4_t or int32x4_t). Primary template is declared only; the two
 *  specializations below dispatch to the matching intrinsic.
 */
template <typename T>
inline T vcvtq_q32_f32(float32x4_t values);

template <>
inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
{
    return vcvtq_u32_f32(values);
}

template <>
inline int32x4_t vcvtq_q32_f32(float32x4_t values)
{
    return vcvtq_s32_f32(values);
}
344 
/** Inverse of vcvtq_q32_f32: convert a 32-bit integer NEON vector (uint32x4_t or
 *  int32x4_t) back to float32x4_t via the matching intrinsic.
 */
template <typename T>
inline float32x4_t vcvtq_f32_q32(T values);

template <>
inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
{
    return vcvtq_f32_u32(values);
}

template <>
inline float32x4_t vcvtq_f32_q32(int32x4_t values)
{
    return vcvtq_f32_s32(values);
}
359 
/** Requantize a float accumulator (16 lanes as 4x float32x4_t) to 8-bit, folding the
 *  pooling scale into the quantization scale: new_scale = quant_rescale / scale_pooling.
 *  Tout selects signed (int8x16_t) or unsigned (uint8x16_t) output.
 */
template <typename Tout>
inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);

template <>
inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
{
    const float new_scale = quant_rescale / scale_pooling;
    return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
}

template <>
inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
{
    const float new_scale = quant_rescale / scale_pooling;
    return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
}
376 
/** Requantize two 8-lane 8-bit vectors into one 16-lane 8-bit vector.
 *  Each input lane is widened 8-bit -> 16-bit -> 32-bit, converted to float, then the
 *  16 float lanes are quantized with requant_qinfo.
 */
template <typename Tin, typename Tout>
inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);

template <>
inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
{
    // Widen u8 -> u16 -> u32 per half, then convert each 4-lane group to float.
    const float32x4x4_t acc =
    {
        {
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
        }
    };
    return vquantize(acc, requant_qinfo);
}

template <>
inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
{
    // Widen s8 -> s16 -> s32 per half, then convert each 4-lane group to float.
    const float32x4x4_t acc =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
        }
    };
    return vquantize_signed(acc, requant_qinfo);
}
409 
/** Requantize a single 8-lane 8-bit vector in place (returns the requantized 8 lanes).
 *  Same widening scheme as the two-vector overload, but only 8 float lanes (4x2).
 */
template <typename T>
inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);

template <>
inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x2_t acc =
    {
        {
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
        }
    };
    return vquantize(acc, requant_qinfo);
}

template <>
inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x2_t acc =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
        }
    };
    return vquantize_signed(acc, requant_qinfo);
}
438 
439 } // namespace
440 
/** Default constructor: all state is set by configure(); _func stays nullptr until then. */
NEPoolingLayerKernel::NEPoolingLayerKernel()
    : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
{
}
445 
/** Border needed around the input; computed in configure() (non-zero only for NCHW). */
BorderSize NEPoolingLayerKernel::border_size() const
{
    return _border_size;
}
450 
/** Configure the kernel: validate the configuration, select the specialized pooling
 *  function for (data type, layout, pool size, stride), and set up the execution window.
 *
 *  @param input     Source tensor (QASYMM8/QASYMM8_SIGNED/F16/F32).
 *  @param output    Destination tensor (auto-initialized during validation if empty).
 *  @param pool_info Pooling parameters (type, size, strides, padding, layout).
 *  @param indices   Optional tensor receiving max-pool element offsets (may be nullptr).
 */
void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
    const PadStrideInfo pad_stride_info   = pool_info.pad_stride_info;
    const bool          is_global_pooling = pool_info.is_global_pooling;
    const int           pool_stride_x     = pad_stride_info.stride().first;

    // Get data layout (pool_info may override the input's layout unless UNKNOWN)
    const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);

    // Update pool size in case of global pooling: the window covers the whole plane
    const Size2D pool_size(
        is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width,
        is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height);

    // Validate pool info before calling scaled_dimensions (guards against division by zero)
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));

    // Check output dimensions
    unsigned int pooled_w;
    unsigned int pooled_h;
    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
                                                     input->info()->dimension(idx_height),
                                                     pool_size.x(),
                                                     pool_size.y(),
                                                     pad_stride_info);

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, (indices) ? indices->info() : nullptr, pool_size));

    // Set instance variables
    _input       = input;
    _output      = output;
    _indices     = indices;
    _pool_info   = pool_info;
    // NOTE(review): unlike the local `data_layout` above, this does NOT honor a
    // pool_info.data_layout override — confirm this asymmetry is intentional.
    _data_layout = input->info()->data_layout();
    _is_square   = (pool_size.x() == pool_size.y());

    // Get data type
    const DataType data_type = input->info()->data_type();
    const bool     is_nchw   = _data_layout == DataLayout::NCHW;

    // Select the specialized member function. NCHW has fast paths for square 2x2/3x3
    // pools with stride < 3; everything else falls back to the generic MxN kernels.
    if(data_type == DataType::QASYMM8)
    {
        if(!is_nchw)
        {
            _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
        }
        else
        {
            if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
            {
                _func = &NEPoolingLayerKernel::pooling2_q8_nchw<uint8_t>;
            }
            else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
            {
                _func = &NEPoolingLayerKernel::pooling3_q8_nchw<uint8_t>;
            }
            else
            {
                _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<uint8_t>;
            }
        }
    }
    else if(data_type == DataType::QASYMM8_SIGNED)
    {
        if(!is_nchw)
        {
            _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
        }
        else
        {
            if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
            {
                _func = &NEPoolingLayerKernel::pooling2_q8_nchw<int8_t>;
            }
            else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
            {
                _func = &NEPoolingLayerKernel::pooling3_q8_nchw<int8_t>;
            }
            else
            {
                _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<int8_t>;
            }
        }
    }
    else if(data_type == DataType::F16)
    {
        if(!is_nchw)
        {
            _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
        }
        else
        {
            if(_is_square)
            {
                switch(pool_size.x())
                {
                    case 2:
                    {
                        _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
                    }
                    break;
                    case 3:
                    {
                        _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
                    }
                    break;
                    default:
                    {
                        _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
                        break;
                    }
                }
            }
            else
            {
                _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
            }
        }
    }
    else if(data_type == DataType::F32)
    {
        if(!is_nchw)
        {
            _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
        }
        else
        {
            if(_is_square)
            {
                switch(pool_size.x())
                {
                    case 2:
                    {
                        _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
                        break;
                    }
                    case 3:
                    {
                        _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
                        break;
                    }
                    case 7:
                    {
                        _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
                        break;
                    }
                    default:
                    {
                        _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
                        break;
                    }
                }
            }
            else
            {
                _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
            }
        }
    }

    if(!is_nchw)
    {
        // Configure kernel window: NHWC kernels need no padding, so a plain max window
        // over the output with the full tensor as valid region is sufficient.
        Window      win = calculate_max_window(*output->info(), Steps());
        Coordinates coord;
        coord.set_num_dimensions(output->info()->num_dimensions());
        output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
        INEKernel::configure(win);
    }
    else
    {
        // Configure kernel window: NCHW needs border/padding bookkeeping, delegated to
        // validate_and_configure_window (also fills _num_elems_processed_per_iteration
        // and _border_size).
        auto win_config = validate_and_configure_window(input->info(), output->info(), (indices) ? indices->info() : nullptr,
                                                        pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
        ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
        INEKernel::configure(win_config.second);
    }
}
633 
/** Translate a byte offset into the padded input tensor into the offset the element
 *  would have in an unpadded tensor (used to produce max-pooling indices).
 *
 *  @tparam T element type; sizeof(T) converts element counts to bytes.
 *
 *  NOTE(review): the arithmetic below depends on the exact padding layout produced by
 *  the window configuration; verify against the indices consumers before modifying.
 */
template <typename T>
inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y)
{
    const int pad_left    = info.padding().left;
    const int pad_right   = info.padding().right;
    const int pad_top     = info.padding().top;
    const int pad_bottom  = info.padding().bottom;
    const int in_stride_y = static_cast<int>(info.strides_in_bytes().y());
    const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]);
    const int pad_horiz   = pad_left + pad_right;
    const int pad_vert    = pad_top + pad_bottom;

    if(info.data_layout() == DataLayout::NCHW)
    {
        const uint32_t offset_base = padded_offset
                                     - sizeof(T) * pad_horiz * id.y() * pool_stride_y                                            /* subtract padding elems per row */
                                     - pad_top * sizeof(T)                                                                       /* top padding */
                                     - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
                                     - in_stride_w * id[3];                                                                      /* batch offset */

        return offset_base;
    }
    else
    {
        const uint32_t offset_base = padded_offset
                                     - sizeof(T) * pad_horiz * id.y() * pool_stride_x                          // subtract padding elems per row
                                     - pad_top * sizeof(T)                                                     // top padding
                                     - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
                                     - in_stride_w * id[3];                                                    // batch offset

        return offset_base;
    }
}
667 
/** 2x2 pooling (AVG or MAX) over quantized 8-bit NCHW input (T = uint8_t or int8_t).
 *
 * Loads two adjacent input rows per iteration, pools pair-wise horizontally in a
 * promoted 16-bit domain (for AVG) or directly in 8 bit (for MAX), and
 * requantizes the result when input and output quantization infos differ.
 *
 * @param window_input    Input iteration window (may start inside the padded border).
 * @param window          Output iteration window.
 * @param pooling_type    AVG or MAX.
 * @param exclude_padding Forwarded to the scaling helper; also removes the padded
 *                        border from the upper bounds used for scaling.
 */
668 template <typename T>
pooling2_q8_nchw(const Window & window_input,const Window & window,PoolingType pooling_type,bool exclude_padding)669 void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
670 {
671     Iterator input(_input, window_input);
672     Iterator output(_output, window);
673 
674     /** NEON vector types */
675     using q8x8_t    = typename wrapper::traits::neon_vector<T, 8>::type;
676     using q8x16_t   = typename wrapper::traits::neon_vector<T, 16>::type;
677     using q8x8x2_t  = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
678     using q16_t     = typename wrapper::traits::promote_t<T>;
679     using q16x4_t   = typename wrapper::traits::neon_vector<q16_t, 4>::type;
680     using q16x8_t   = typename wrapper::traits::neon_vector<q16_t, 8>::type;
681     using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
682 
683     constexpr int pool_size       = 2;
684     int           pool_stride_x   = 0;
685     int           pool_stride_y   = 0;
686     const int     pool_pad_right  = _pool_info.pad_stride_info.pad_right();
687     const int     pool_pad_top    = _pool_info.pad_stride_info.pad_top();
688     const int     pool_pad_left   = _pool_info.pad_stride_info.pad_left();
689     const int     pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
690     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
691     const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
692     const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
693 
    // Base pointers to the two rows of the 2x2 window; negative coordinates step
    // into the padded border so the window can start on the pad area.
694     const T *const input_top_ptr    = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
695     const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
696 
    // With stride 1 both halves of the vector are produced, so the scale helper
    // advances by 2 lanes per output; with stride 2 only the lower half is used.
697     const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
698 
699     const UniformQuantizationInfo input_qinfo          = _input->info()->quantization_info().uniform();
700     const UniformQuantizationInfo output_qinfo         = _output->info()->quantization_info().uniform();
701     const bool                    have_different_qinfo = input_qinfo != output_qinfo;
702 
    // Precompute the input->output requantization parameters (used only when the
    // two quantization infos differ).
703     const float                   requant_scale  = output_qinfo.scale / input_qinfo.scale;
704     const int32_t                 requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
705     const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
706 
707     execute_window_loop(window, [&](const Coordinates & id)
708     {
709         const auto top_data    = wrapper::vloadq(input_top_ptr + input.offset());
710         const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
711         q8x8_t     lower_res   = {};
712         q8x8_t     upper_res   = {};
713 
714         if(pooling_type != PoolingType::MAX)
715         {
            // AVG path: widen to 16 bit so the 4-element sums cannot overflow.
716             const q16x8x2_t top_data_q16    = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
717             const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
718 
719             // Add rows
720             const q16x8x2_t vrsum =
721             {
722                 {
723                     wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
724                     wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
725                 }
726             };
727 
728             // Pair-wise add row data
729             const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
730             const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
731 
732             q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
733 
734             // Scale lower result
735             scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_lower, id, 0, scale_step_x,
736                                                pool_size, upper_bound_w, upper_bound_h,
737                                                pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
738             lower_res = wrapper::vmovn(res_lower);
739 
740             // Compute upper result for stride_x == 1
741             if(pool_stride_x == 1)
742             {
                // Shift the row sums by one element so the pair-wise add below
                // produces the windows starting at odd columns.
743                 // Shifted row sum
744                 const q16x8x2_t vrsum_shifted =
745                 {
746                     {
747                         wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
748                         wrapper::vext_1(vrsum.val[1], vrsum.val[1])
749                     }
750                 };
751 
752                 // Pair-wise add shifted row
753                 q16x8_t res_upper = wrapper::vcombine(
754                                         wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
755                                         wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
756 
757                 // Scale upper result
758                 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_upper, id, 1, 2,
759                                                    pool_size, upper_bound_w, upper_bound_h,
760                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
761                 upper_res = wrapper::vmovn(res_upper);
762             }
763         }
764         else
765         {
            // MAX path: stays in 8 bit; vertical max then horizontal pair-wise max.
766             const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
767             lower_res              = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
768             if(pool_stride_x == 1)
769             {
770                 const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
771                 upper_res                      = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
772             }
773         }
774 
775         if(have_different_qinfo)
776         {
777             const auto requantized_output = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
778             lower_res                     = wrapper::vgetlow(requantized_output);
779             upper_res                     = wrapper::vgethigh(requantized_output);
780         }
781 
782         // Store result
783         if(pool_stride_x == 1)
784         {
            // Stride 1 produced 16 outputs (both halves); stride 2 only 8.
785             const q8x8x2_t res = { { lower_res, upper_res } };
786             wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res);
787         }
788         else
789         {
790             wrapper::vstore(reinterpret_cast<T *>(output.ptr()), lower_res);
791         }
792     },
793     input, output);
794 }
795 
/** 3x3 pooling (AVG, L2 or MAX) over FP16 NCHW input.
 *
 * Only compiled when FP16 vector arithmetic is available; otherwise raises an
 * error at runtime. Produces one scalar output per window iteration.
 *
 * @param window_input    Input iteration window (may start inside the padded border).
 * @param window          Output iteration window.
 * @param pooling_type    AVG, L2 or MAX.
 * @param exclude_padding Forwarded to calculate_avg_scale; also removes the padded
 *                        border from the upper bounds used for scaling.
 */
pooling3_f16_nchw(const Window & window_input,const Window & window,PoolingType pooling_type,bool exclude_padding)796 void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
797 {
798     ARM_COMPUTE_UNUSED(pooling_type);
799     ARM_COMPUTE_UNUSED(exclude_padding);
800 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
801     Iterator input(_input, window_input);
802     Iterator output(_output, window);
803 
804     constexpr const int pool_size       = 3;
805     const int           pool_pad_right  = _pool_info.pad_stride_info.pad_right();
806     const int           pool_pad_top    = _pool_info.pad_stride_info.pad_top();
807     const int           pool_pad_left   = _pool_info.pad_stride_info.pad_left();
808     const int           pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
809     int                 pool_stride_x   = 0;
810     int                 pool_stride_y   = 0;
811     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
812     const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
813     const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
814 
    // Base pointers to the three rows of the 3x3 window; negative coordinates
    // step into the padded border so the window can start on the pad area.
815     const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
816     const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
817     const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
818 
819     execute_window_loop(window, [&](const Coordinates & id)
820     {
821         float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
822         float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
823         float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
824         float16x4_t res         = {};
825 
826         // Get power of 2 in case of l2 pooling
827         if(pooling_type == PoolingType::L2)
828         {
829             top_data    = vmul_f16(top_data, top_data);
830             middle_data = vmul_f16(middle_data, middle_data);
831             bottom_data = vmul_f16(bottom_data, bottom_data);
832         }
833 
834         if(pooling_type != PoolingType::MAX)
835         {
836             // Calculate scale
837             const float       scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
838             const float16x4_t scale_v = vdup_n_f16(scale);
            // Sum the three rows, then reduce the first three lanes horizontally
            // (lane 3 is zeroed so it does not contribute to the sum).
839             // Perform pooling
840             const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
841             res                        = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
842             res                        = vmul_f16(vpadd_f16(res, res), scale_v);
843         }
844         else
845         {
            // Max of the three rows, then horizontal max of the first three lanes
            // (lane 3 is forced to a very small value so it never wins).
846             const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
847             res                        = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
848             res                        = vpmax_f16(res, res);
849         }
850 
851         // Calculate square-root in case of l2 pooling
852         if(pooling_type == PoolingType::L2)
853         {
854             res = vinv_f16(vinvsqrt_f16(res));
855         }
856 
857         *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
858     },
859     input, output);
860 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
861     ARM_COMPUTE_UNUSED(window_input);
862     ARM_COMPUTE_UNUSED(window);
863     ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
864 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
865 }
866 
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Widen the two lowest half-precision lanes of @p input to single precision.
 *
 * Enabled only when T is float16_t; together with the float overload this lets
 * generic code call f16_to_f32<T>() regardless of the element type.
 */
template <typename T>
inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
f16_to_f32(float16x4_t input)
{
    const float lane0 = static_cast<float>(vget_lane_f16(input, 0));
    const float lane1 = static_cast<float>(vget_lane_f16(input, 1));
    const float32x2_t widened = { lane0, lane1 };
    return widened;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
876 
877 template <typename T>
878 inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
f16_to_f32(float32x2_t input)879 f16_to_f32(float32x2_t input)
880 {
881     return input;
882 }
883 
/** 2x2 max pooling over NCHW input that also records the flattened (unpadded)
 *  index of the winning element, for later use by max-unpooling.
 *
 * T is the element type (float or float16_t — see the f16_to_f32 overloads);
 * comparisons are done in f32. Ties are resolved in favour of the earlier
 * element (top row before bottom, left lane before right).
 *
 * @param window_input Input iteration window (may start inside the padded border).
 * @param window       Output/indices iteration window.
 */
884 template <typename T>
pooling2_nchw_maxpool_indices(const Window & window_input,const Window & window)885 void NEPoolingLayerKernel::pooling2_nchw_maxpool_indices(const Window &window_input, const Window &window)
886 {
887     Iterator  input(_input, window_input);
888     Iterator  output(_output, window);
889     Iterator  indices(_indices, window);
890     const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
891     const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
892     int       pool_stride_x = 0;
893     int       pool_stride_y = 0;
894     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
895     const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
896     const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
897     const int            pad_left         = _input->info()->padding().left;
898     const int            pad_right        = _input->info()->padding().right;
899     const int            in_stride_y      = static_cast<int>(_input->info()->strides_in_bytes().y());
900 
901     execute_window_loop(window, [&](const Coordinates & id)
902     {
903         auto        top_data        = wrapper::vload(reinterpret_cast<const T *>(input_top_ptr + input.offset()));
904         auto        bottom_data     = wrapper::vload(reinterpret_cast<const T *>(input_bottom_ptr + input.offset()));
905         float32x2_t top_data_f32    = f16_to_f32<T>(top_data);
906         float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
907 
908         // Calculate max data, compare top first, then bottom, to make sure the first max is recorded.
909         const float32x2_t max_data_top         = vpmax_f32(top_data_f32, top_data_f32);
910         const float32x2_t max_data_bottom      = vpmax_f32(bottom_data_f32, bottom_data_f32);
911         const float32x2_t max_data             = vmax_f32(max_data_top, max_data_bottom);
912         *(reinterpret_cast<T *>(output.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
913 
914         // Calculate max data index, which will be used in max unpool.
        // offset_base is a byte offset into the unpadded buffer; dividing by
        // sizeof(T) turns it into an element index. The bottom-row index skips
        // one padded row stride minus the horizontal border.
915         const uint32_t   offset_base              = offset_no_padding<T>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
916         const uint32_t   offset_top               = (uint32_t)(offset_base / sizeof(T));
917         const uint32_t   offset_bottom            = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
918         const uint32x2_t voffset_top              = { offset_top, offset_top + 1u };
919         const uint32x2_t voffset_bottom           = { offset_bottom, offset_bottom + 1u };
        // vbsl picks, per row, the index of the larger lane (>= keeps the left
        // lane on ties); the final vbsl picks between the two row winners.
920         const uint32x2_t tmp_indices_top          = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
921         const uint32x2_t tmp_indices_bottom       = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
922         *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
923     },
924     input, output, indices);
925 }
926 
pooling2_f16_nchw(const Window & window_input,const Window & window,PoolingType pooling_type,bool exclude_padding)927 void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
928 {
929     ARM_COMPUTE_UNUSED(pooling_type);
930     ARM_COMPUTE_UNUSED(exclude_padding);
931 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
932     if(pooling_type == PoolingType::MAX && _indices)
933     {
934         pooling2_nchw_maxpool_indices<float16_t>(window_input, window);
935     }
936     else
937     {
938         Iterator      input(_input, window_input);
939         Iterator      output(_output, window);
940         constexpr int pool_size       = 2;
941         const int     pool_pad_right  = _pool_info.pad_stride_info.pad_right();
942         const int     pool_pad_top    = _pool_info.pad_stride_info.pad_top();
943         const int     pool_pad_left   = _pool_info.pad_stride_info.pad_left();
944         const int     pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
945         int           pool_stride_x, pool_stride_y = 0;
946         std::tie(pool_stride_x, pool_stride_y)     = _pool_info.pad_stride_info.stride();
947         const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
948         const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
949 
950         const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
951         const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
952 
953         execute_window_loop(window, [&](const Coordinates & id)
954         {
955             float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
956             float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
957             float16x4_t res         = {};
958 
959             // Get power of 2 in case of l2 pooling
960             if(pooling_type == PoolingType::L2)
961             {
962                 top_data    = vmul_f16(top_data, top_data);
963                 bottom_data = vmul_f16(bottom_data, bottom_data);
964             }
965 
966             if(pooling_type != PoolingType::MAX)
967             {
968                 const float       scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
969                 const float16x4_t scale_v = vdup_n_f16(scale);
970 
971                 const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
972                 res                        = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
973             }
974             else
975             {
976                 const float16x4_t max_data = vmax_f16(top_data, bottom_data);
977                 res                        = vpmax_f16(max_data, max_data);
978             }
979 
980             // Calculate square-root in case of l2 pooling
981             if(pooling_type == PoolingType::L2)
982             {
983                 res = vinv_f16(vinvsqrt_f16(res));
984             }
985 
986             // Store result
987             *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
988         },
989         input, output);
990     }
991 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
992     ARM_COMPUTE_UNUSED(window_input);
993     ARM_COMPUTE_UNUSED(window);
994     ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
995 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
996 }
997 
/** 3x3 pooling (AVG or MAX) over quantized 8-bit NCHW input (T = uint8_t or int8_t).
 *
 * Combines three input rows vertically, then adds/maxes three horizontally
 * shifted copies to form each 3-wide window. AVG works in a promoted 16-bit
 * domain to avoid overflow; results are narrowed back to 8 bit and requantized
 * when the input and output quantization infos differ.
 *
 * @param window_input    Input iteration window (may start inside the padded border).
 * @param window          Output iteration window.
 * @param pooling_type    AVG or MAX.
 * @param exclude_padding Forwarded to the scaling helper; also removes the padded
 *                        border from the upper bounds used for scaling.
 */
998 template <typename T>
pooling3_q8_nchw(const Window & window_input,const Window & window,PoolingType pooling_type,bool exclude_padding)999 void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1000 {
1001     Iterator input(_input, window_input);
1002     Iterator output(_output, window);
1003 
1004     /** NEON vector types */
1005     using q8x8_t    = typename wrapper::traits::neon_vector<T, 8>::type;
1006     using q8x16_t   = typename wrapper::traits::neon_vector<T, 16>::type;
1007     using q8x8x2_t  = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
1008     using q16_t     = typename wrapper::traits::promote_t<T>;
1009     using q16x8_t   = typename wrapper::traits::neon_vector<q16_t, 8>::type;
1010     using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
1011 
1012     constexpr int pool_size       = 3;
1013     const int     pool_pad_right  = _pool_info.pad_stride_info.pad_right();
1014     const int     pool_pad_top    = _pool_info.pad_stride_info.pad_top();
1015     const int     pool_pad_left   = _pool_info.pad_stride_info.pad_left();
1016     const int     pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1017     int           pool_stride_x   = 0;
1018     int           pool_stride_y   = 0;
1019     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1020     const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1021     const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1022 
1023     const UniformQuantizationInfo &input_qinfo  = _input->info()->quantization_info().uniform();
1024     const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
1025 
    // Precompute the input->output requantization parameters (used only when the
    // two quantization infos differ).
1026     const float                   requant_scale  = output_qinfo.scale / input_qinfo.scale;
1027     const int32_t                 requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
1028     const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
1029 
    // Base pointers to the three rows of the 3x3 window; negative coordinates
    // step into the padded border so the window can start on the pad area.
1030     const T *const input_top_ptr    = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
1031     const T *const input_middle_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
1032     const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
1033 
1034     execute_window_loop(window, [&](const Coordinates & id)
1035     {
1036         const auto top_data    = wrapper::vloadq(input_top_ptr + input.offset());
1037         const auto middle_data = wrapper::vloadq(input_middle_ptr + input.offset());
1038         const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
        // fres holds the stride-2 (8-output) result, fqres the stride-1
        // (16-output) result; only one of them is produced per iteration.
1039         q8x8_t     fres        = {};
1040         q8x16_t    fqres       = {};
1041 
1042         if(pooling_type == PoolingType::AVG)
1043         {
1044             // Convert data to u16
1045             const q16x8x2_t top_data_q16    = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
1046             const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
1047             const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
1048 
1049             // Calculate row sums
1050             const q16x8x2_t vrsum =
1051             {
1052                 {
1053                     wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
1054                     wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
1055                 }
1056             };
            // Shift the row sums by one and two elements; adding the three
            // copies yields the 3-wide horizontal window sum per lane.
1057             const q16x8x2_t vrsum_shifted_1 =
1058             {
1059                 {
1060                     wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
1061                     wrapper::vext_1(vrsum.val[1], vrsum.val[1])
1062                 }
1063             };
1064             const q16x8x2_t vrsum_shifted_2 =
1065             {
1066                 {
1067                     wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
1068                     wrapper::vext_2(vrsum.val[1], vrsum.val[1])
1069                 }
1070             };
1071             // Calculate final sum
1072             q16x8x2_t final_sum =
1073             {
1074                 {
1075                     wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
1076                     wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
1077                 }
1078             };
1079             if(pool_stride_x == 2)
1080             {
                // With stride 2 only every other lane starts a window: gather
                // the even lanes into one vector before scaling.
1081                 q16x8_t res =
1082                 {
1083                     wrapper::vgetlane(final_sum.val[0], 0),
1084                     wrapper::vgetlane(final_sum.val[0], 2),
1085                     wrapper::vgetlane(final_sum.val[0], 4),
1086                     wrapper::vgetlane(final_sum.val[0], 6),
1087                     wrapper::vgetlane(final_sum.val[1], 0),
1088                     wrapper::vgetlane(final_sum.val[1], 2),
1089                     wrapper::vgetlane(final_sum.val[1], 4),
1090                     wrapper::vgetlane(final_sum.val[1], 6),
1091                 };
1092 
1093                 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res, id, 0, 1,
1094                                                    pool_size, upper_bound_w, upper_bound_h,
1095                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1096                 fres = wrapper::vmovn(res);
1097             }
1098             else
1099             {
1100                 // Scale lower result
1101                 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[0], id, 0, 1,
1102                                                    pool_size, upper_bound_w, upper_bound_h,
1103                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1104                 // Scale higher result
1105                 scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[1], id, 8, 1,
1106                                                    pool_size, upper_bound_w, upper_bound_h,
1107                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1108                 fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
1109             }
1110         }
1111         else
1112         {
            // MAX path: stays in 8 bit; vertical max of the three rows, then max
            // with the one- and two-element shifted copies for the 3-wide window.
1113             const q8x16_t max_data        = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
1114             const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
1115             const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
1116             const q8x16_t final_max       = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
1117 
1118             if(pool_stride_x == 2)
1119             {
                // Table lookup gathers the even lanes (window starts) in one go.
1120                 const q8x8x2_t      table      = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
1121                 static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
1122                 fres                           = wrapper::vtbl(table, lookup_val);
1123             }
1124             else
1125             {
1126                 fqres = final_max;
1127             }
1128         }
1129 
1130         // Store result
1131         if(pool_stride_x == 1)
1132         {
1133             if(input_qinfo != output_qinfo)
1134             {
1135                 fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
1136             }
1137             wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fqres);
1138         }
1139         else
1140         {
1141             if(input_qinfo != output_qinfo)
1142             {
1143                 fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
1144             }
1145             wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fres);
1146         }
1147     },
1148     input, output);
1149 }
1150 
poolingMxN_f16_nchw(const Window & window_input,const Window & window,PoolingType pooling_type,bool exclude_padding)1151 void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1152 {
1153     ARM_COMPUTE_UNUSED(pooling_type);
1154     ARM_COMPUTE_UNUSED(exclude_padding);
1155 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1156     Iterator input(_input, window_input);
1157     Iterator output(_output, window);
1158 
1159     const int pool_size_x     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1160     const int pool_size_y     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1161     const int pool_pad_right  = _pool_info.pad_stride_info.pad_right();
1162     const int pool_pad_top    = _pool_info.pad_stride_info.pad_top();
1163     const int pool_pad_left   = _pool_info.pad_stride_info.pad_left();
1164     const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1165     int       pool_stride_x   = 0;
1166     int       pool_stride_y   = 0;
1167     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1168     const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1169     const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1170 
1171     execute_window_loop(window, [&](const Coordinates & id)
1172     {
1173         float16_t   res  = 0.0f;
1174         float16x8_t vres = vdupq_n_f16(0.0f);
1175 
1176         if(pooling_type != PoolingType::MAX)
1177         {
1178             // Calculate scale
1179             const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1180 
1181             // Perform pooling
1182 
1183             for(int y = 0; y < pool_size_y; ++y)
1184             {
1185                 int x = 0;
1186                 for(; x <= (pool_size_x - 8); x += 8)
1187                 {
1188                     const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1189                                                                                            (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
1190 
1191                     // Get power of 2 in case of l2 pooling and accumulate
1192                     if(pooling_type == PoolingType::L2)
1193                     {
1194                         vres = vaddq_f16(vres, vmulq_f16(data, data));
1195                     }
1196                     else
1197                     {
1198                         vres = vaddq_f16(vres, data);
1199                     }
1200                 }
1201 
1202                 // Leftover for loop
1203                 for(; x < pool_size_x; ++x)
1204                 {
1205                     float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1206                                                                            + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
1207 
1208                     // Get power of 2 in case of l2 pooling
1209                     if(pooling_type == PoolingType::L2)
1210                     {
1211                         data *= data;
1212                     }
1213 
1214                     res += data;
1215                 }
1216             }
1217 
1218             // Reduction
1219             float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
1220             res += vget_lane_f16(tmp, 0);
1221             res += vget_lane_f16(tmp, 1);
1222             res += vget_lane_f16(tmp, 2);
1223             res += vget_lane_f16(tmp, 3);
1224 
1225             // Divide by scale
1226             res *= scale;
1227         }
1228         else
1229         {
1230             float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1231             res              = std::numeric_limits<float>::lowest();
1232 
1233             for(int y = 0; y < pool_size_y; ++y)
1234             {
1235                 int x = 0;
1236                 for(; x <= (pool_size_x - 8); x += 8)
1237                 {
1238                     const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
1239                                                                                            (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
1240                     vres                   = vmaxq_f16(vres, data);
1241                 }
1242 
1243                 // Leftover for loop
1244                 for(; x < pool_size_x; ++x)
1245                 {
1246                     const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
1247                                                                                  + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
1248                     res = std::max(res, data);
1249                 }
1250             }
1251 
1252             float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
1253             res             = std::max(res, vget_lane_f16(tmp, 0));
1254             res             = std::max(res, vget_lane_f16(tmp, 1));
1255             res             = std::max(res, vget_lane_f16(tmp, 2));
1256             res             = std::max(res, vget_lane_f16(tmp, 3));
1257         }
1258 
1259         // Calculate square-root in case of l2 pooling
1260         if(pooling_type == PoolingType::L2)
1261         {
1262             res = std::sqrt(res);
1263         }
1264 
1265         // Store result
1266         *(reinterpret_cast<float16_t *>(output.ptr())) = res;
1267     },
1268     input, output);
1269 
1270 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1271     ARM_COMPUTE_UNUSED(window_input);
1272     ARM_COMPUTE_UNUSED(window);
1273     ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1274 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1275 }
1276 
1277 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** 2x2 max pooling for FP16 NHWC tensors that additionally records, for every
 * output element, the flattened (padding-free) input offset of the maximum
 * value into the _indices tensor.
 *
 * The x dimension (channels in NHWC) is vectorized 8 fp16 lanes at a time,
 * with a scalar leftover loop for the remaining channels.
 *
 * @param window_input Input region the pooling window is allowed to read.
 * @param window       Execution window over the output tensor.
 */
void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window)
{
    const int window_start_x = window.x().start();
    const int window_end_x   = window.x().end();
    const int window_step_x  = 8; // 8 fp16 lanes per 128-bit NEON register

    // Collapse DimX of the output window: channel traversal is done manually
    // inside the loop body (x_off), so each iteration handles a full row of channels.
    Window window_out = window;
    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(_input, window_input);
    Iterator output(_output, window_out);
    Iterator indices(_indices, window_out);

    const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = _pool_info.pad_stride_info.pad_left();

    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();

    // Strides (in bytes) used both to address the 2x2 neighbourhood and to
    // convert element positions into padding-free index offsets below.
    const int pad_right   = _input->info()->padding().right;
    const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
    const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());

    execute_window_loop(window_out, [&](const Coordinates & id)
    {
        // In NHWC, id.y()/id.z() are the output's width/height coordinates.
        const int idx_width    = id.y() * pool_stride_x;
        const int idx_height   = id.z() * pool_stride_y;
        const int pool_limit_y = pool_pad_top - idx_height;
        const int pool_limit_x = pool_pad_left - idx_width;

        // Clamp the start of the 2x2 window so it never reads before the valid region.
        const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
        const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
        // Byte offsets of the four elements of the 2x2 window:
        // x0 = (x, y), x1 = (x+1, y), x2 = (x, y+1), x3 = (x+1, y+1).
        const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
                                 (_input->info()->strides_in_bytes().z());
        const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
                                 (_input->info()->strides_in_bytes().z());
        const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
                                 (_input->info()->strides_in_bytes().z());
        const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
                                 (_input->info()->strides_in_bytes().z());

        int x_off = window_start_x;
        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
        {
            const auto  in_x0_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset) + x_off;
            const auto  in_x1_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset) + x_off;
            const auto  in_x2_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset) + x_off;
            const auto  in_x3_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset) + x_off;
            const auto  v_x0      = vld1q_f16(in_x0_ptr);
            const auto  v_x1      = vld1q_f16(in_x1_ptr);
            const auto  v_x2      = vld1q_f16(in_x2_ptr);
            const auto  v_x3      = vld1q_f16(in_x3_ptr);
            float16x8_t vres      = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
            // Store result
            vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + x_off, vres);

            // Translate the (padded) buffer offsets of the four candidates into
            // padding-free element indices relative to the logical tensor.
            const uint32_t   offset_base    = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
            const uint32_t   offset_x0      = (uint32_t)offset_base / sizeof(float16_t) + x_off;
            const uint32_t   offset_x1      = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
            const uint32_t   offset_x2      = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
            const uint32_t   offset_x3      = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;
            // Build per-lane index vectors. NOTE(review): the u32->u16 narrowing
            // (vmovn_u32) below presumably assumes all candidate offsets fit in
            // 16 bits; indices would wrap for larger tensors — confirm upstream.
            const uint32x4_t voffset_x0_0   = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
            const uint32x4_t voffset_x0_1   = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
            const uint16x8_t voffset_x0     = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1));
            const uint32x4_t voffset_x1_0   = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
            const uint32x4_t voffset_x1_1   = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 };
            const uint16x8_t voffset_x1     = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1));
            const uint32x4_t voffset_x2_0   = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
            const uint32x4_t voffset_x2_1   = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
            const uint16x8_t voffset_x2     = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1));
            const uint32x4_t voffset_x3_0   = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
            const uint32x4_t voffset_x3_1   = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };
            const uint16x8_t voffset_x3     = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1));
            // Tournament selection of the argmax: x0 vs x1, x2 vs x3, then winners.
            // vcgeq (>=) makes ties resolve to the lower offset, matching the
            // scalar leftover loop below.
            const uint16x8_t tmp_indices0   = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1);
            const uint16x8_t tmp_indices1   = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3);
            const uint16x8_t tmp_indices2   = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1);
            // Widen back to u32 for the indices tensor.
            const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2));
            const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2));
            // Store indicies
            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indeces3_0);
            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indeces3_1);
        }

        // Left-overs loop: scalar replay of the vector body for the tail channels.
        for(; x_off < window_end_x; ++x_off)
        {
            const auto x0  = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset) + x_off);
            const auto x1  = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset) + x_off);
            const auto x2  = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset) + x_off);
            const auto x3  = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset) + x_off);
            float16_t  res = std::max(std::max(x2, x3), std::max(x0, x1));

            // Store result
            *(reinterpret_cast<float16_t *>(output.ptr()) + x_off) = res;

            const uint32_t offset_base = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
            const uint32_t offset_x0   = (uint32_t)offset_base / sizeof(float16_t) + x_off;
            const uint32_t offset_x1   = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
            const uint32_t offset_x2   = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
            const uint32_t offset_x3   = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;
            const uint32_t tmp_idx0    = (x0 >= x1) ? offset_x0 : offset_x1;
            const uint32_t tmp_idx1    = (x2 >= x3) ? offset_x2 : offset_x3;
            const uint32_t tmp_idx2    = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;

            // Store indices
            *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
        }
    },
    input, output, indices);
}
1389 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1390 
poolingMxN_f16_nhwc(const Window & window_input,const Window & window,PoolingType pooling_type,bool exclude_padding)1391 void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1392 {
1393     ARM_COMPUTE_UNUSED(pooling_type);
1394     ARM_COMPUTE_UNUSED(exclude_padding);
1395 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1396     if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
1397     {
1398         pooling2_f16_nhwc_maxpool_indices(window_input, window);
1399     }
1400     const int window_start_x = window.x().start();
1401     const int window_end_x   = window.x().end();
1402     const int window_step_x  = 8;
1403 
1404     Window window_out = window;
1405     window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
1406 
1407     Iterator input(_input, window_input);
1408     Iterator output(_output, window_out);
1409 
1410     const int pool_size_x     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1411     const int pool_size_y     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1412     const int pool_pad_right  = _pool_info.pad_stride_info.pad_right();
1413     const int pool_pad_top    = _pool_info.pad_stride_info.pad_top();
1414     const int pool_pad_left   = _pool_info.pad_stride_info.pad_left();
1415     const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1416     int       pool_stride_x   = 0;
1417     int       pool_stride_y   = 0;
1418     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1419     const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1420     const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1421 
1422     float16x8_t vres;
1423 
1424     execute_window_loop(window_out, [&](const Coordinates & id)
1425     {
1426         const int idx_width    = id.y() * pool_stride_x;
1427         const int idx_height   = id.z() * pool_stride_y;
1428         const int pool_limit_y = pool_pad_top - idx_height;
1429         const int pool_limit_x = pool_pad_left - idx_width;
1430 
1431         const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1432         const int pool_end_y   = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1433         const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1434         const int pool_end_x   = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1435 
1436         int x_off = window_start_x;
1437         for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
1438         {
1439             if(pooling_type != PoolingType::MAX)
1440             {
1441                 // Calculate scale
1442                 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1443                                                         pool_stride_y);
1444                 const float16x8_t scale_v = vdupq_n_f16(scale);
1445 
1446                 // Perform pooling
1447                 vres = vdupq_n_f16(0.0f);
1448                 for(int y = pool_start_y; y < pool_end_y; ++y)
1449                 {
1450                     for(int x = pool_start_x; x < pool_end_x; ++x)
1451                     {
1452                         const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1453                                                                                                (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())) + x_off);
1454 
1455                         // Get power of 2 in case of l2 pooling and accumulate
1456                         if(pooling_type == PoolingType::L2)
1457                         {
1458                             vres = vaddq_f16(vres, vmulq_f16(data, data));
1459                         }
1460                         else
1461                         {
1462                             vres = vaddq_f16(vres, data);
1463                         }
1464                     }
1465                 }
1466                 // Divide by scale
1467                 vres = vmulq_f16(vres, scale_v);
1468             }
1469             else
1470             {
1471                 vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1472 
1473                 for(int y = pool_start_y; y < pool_end_y; ++y)
1474                 {
1475                     for(int x = pool_start_x; x < pool_end_x; ++x)
1476                     {
1477                         const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1478                                                                                                (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())) + x_off);
1479                         vres                   = vmaxq_f16(vres, data);
1480                     }
1481                 }
1482             }
1483 
1484             // Calculate square-root in case of l2 pooling
1485             if(pooling_type == PoolingType::L2)
1486             {
1487                 float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
1488                 vres                        = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
1489             }
1490 
1491             // Store result
1492             vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + x_off, vres);
1493         }
1494 
1495         // Left-overs loop
1496         for(; x_off < window_end_x; ++x_off)
1497         {
1498             float16_t res = 0.0f;
1499 
1500             if(pooling_type != PoolingType::MAX)
1501             {
1502                 // Calculate scale
1503                 const float16_t scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1504                                                             pool_stride_y);
1505 
1506                 for(int y = pool_start_y; y < pool_end_y; ++y)
1507                 {
1508                     for(int x = pool_start_x; x < pool_end_x; ++x)
1509                     {
1510                         const float data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1511                                                                                  (_input->info()->strides_in_bytes().z())) + x_off);
1512 
1513                         // Get power of 2 in case of l2 pooling and accumulate
1514                         if(pooling_type == PoolingType::L2)
1515                         {
1516                             res += data * data;
1517                         }
1518                         else
1519                         {
1520                             res += data;
1521                         }
1522                     }
1523                 }
1524 
1525                 // Divide by scale
1526                 res *= scale;
1527             }
1528             else
1529             {
1530                 res = std::numeric_limits<float>::lowest();
1531                 for(int y = pool_start_y; y < pool_end_y; ++y)
1532                 {
1533                     for(int x = pool_start_x; x < pool_end_x; ++x)
1534                     {
1535                         const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1536                                                                                      (_input->info()->strides_in_bytes().z())) + x_off);
1537                         res                  = std::max(res, data);
1538                     }
1539                 }
1540             }
1541 
1542             // Calculate square-root in case of l2 pooling
1543             if(pooling_type == PoolingType::L2)
1544             {
1545                 res = std::sqrt(res);
1546             }
1547 
1548             // Store result
1549             *(reinterpret_cast<float16_t *>(output.ptr()) + x_off) = res;
1550         }
1551     },
1552     input, output);
1553 
1554 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1555     ARM_COMPUTE_UNUSED(window_input);
1556     ARM_COMPUTE_UNUSED(window);
1557     ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1558 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1559 }
1560 
/** MxN pooling (MAX / AVG / L2) for FP32 NCHW tensors.
 *
 * Processes the pool window row by row: 4 fp32 lanes at a time along x with a
 * scalar leftover loop, then reduces the vector accumulator to a scalar.
 *
 * @param window_input    Input region the pooling window is allowed to read.
 * @param window          Execution window over the output tensor.
 * @param pooling_type    MAX, AVG or L2 pooling.
 * @param exclude_padding True to exclude padded elements from the averaging scale.
 */
void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
    Iterator input(_input, window_input);
    Iterator output(_output, window);

    // Global pooling covers the whole spatial extent (x = width, y = height in NCHW).
    const int pool_size_x     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
    const int pool_size_y     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
    const int pool_pad_right  = _pool_info.pad_stride_info.pad_right();
    const int pool_pad_top    = _pool_info.pad_stride_info.pad_top();
    const int pool_pad_left   = _pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
    int       pool_stride_x   = 0;
    int       pool_stride_y   = 0;
    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
    // Bounds used by calculate_avg_scale to clip the averaging denominator.
    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        float res = 0.0f;

        if(pooling_type != PoolingType::MAX)
        {
            // Calculate scale
            const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

            // Perform pooling
            float32x4_t vres = vdupq_n_f32(0.0f);

            for(int y = 0; y < pool_size_y; ++y)
            {
                int x = 0;
                // Vectorized accumulation, 4 fp32 lanes at a time.
                for(; x <= (pool_size_x - 4); x += 4)
                {
                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
                                                                                       (_input->info()->strides_in_bytes().y())));

                    // Get power of 2 in case of l2 pooling and accumulate
                    if(pooling_type == PoolingType::L2)
                    {
                        vres = vmlaq_f32(vres, data, data);
                    }
                    else
                    {
                        vres = vaddq_f32(vres, data);
                    }
                }

                // Leftover for loop
                for(; x < pool_size_x; ++x)
                {
                    float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
                                                                   (_input->info()->strides_in_bytes().y())));

                    // Get power of 2 in case of l2 pooling
                    if(pooling_type == PoolingType::L2)
                    {
                        data *= data;
                    }

                    res += data;
                }
            }

#if defined(__aarch64__)
            // Reduction operation available on 64 bit architectures only
            res += vaddvq_f32(vres);
#else  // __aarch64__
            // Reduction: pairwise-add the four lanes down to one.
            float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
            tmp             = vpadd_f32(tmp, tmp);

            res += vget_lane_f32(tmp, 0);
#endif // __aarch64__
            // Divide by scale
            res *= scale;
        }
        else
        {
            // MAX pooling: seed both accumulators with the smallest float.
            float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
            res              = std::numeric_limits<float>::lowest();

            for(int y = 0; y < pool_size_y; ++y)
            {
                int x = 0;
                for(; x <= (pool_size_x - 4); x += 4)
                {
                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
                                                                                       (_input->info()->strides_in_bytes().y())));
                    vres                   = vmaxq_f32(vres, data);
                }

                // Leftover for loop
                for(; x < pool_size_x; ++x)
                {
                    const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
                                                                         (_input->info()->strides_in_bytes().y())));
                    res              = std::max(res, data);
                }
            }
#if defined(__aarch64__)
            // Reduction operation available on 64 bit architectures only
            res = std::max(vmaxvq_f32(vres), res);
#else  // __aarch64__
            // Pairwise-max the four lanes down to one.
            float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
            tmp             = vpmax_f32(tmp, tmp);

            res = std::max(res, vget_lane_f32(tmp, 0));
#endif // __aarch64__
        }

        // Calculate square-root in case of l2 pooling
        if(pooling_type == PoolingType::L2)
        {
            res = std::sqrt(res);
        }

        // Store result
        *(reinterpret_cast<float *>(output.ptr())) = res;
    },
    input, output);
}
1683 
pooling2_f32_nchw(const Window & window_input,const Window & window,PoolingType pooling_type,bool exclude_padding)1684 void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type,
1685                                              bool exclude_padding)
1686 {
1687     if(pooling_type == PoolingType::MAX && _indices)
1688     {
1689         pooling2_nchw_maxpool_indices<float>(window_input, window);
1690     }
1691     else
1692     {
1693         Iterator      input(_input, window_input);
1694         Iterator      output(_output, window);
1695         constexpr int pool_size       = 2;
1696         const int     pool_pad_right  = _pool_info.pad_stride_info.pad_right();
1697         const int     pool_pad_top    = _pool_info.pad_stride_info.pad_top();
1698         const int     pool_pad_left   = _pool_info.pad_stride_info.pad_left();
1699         const int     pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1700         int           pool_stride_x   = 0;
1701         int           pool_stride_y   = 0;
1702         std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1703         const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1704         const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1705 
1706         const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1707         const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1708 
1709         execute_window_loop(window, [&](const Coordinates & id)
1710         {
1711             const auto  in_top_ptr    = reinterpret_cast<const float *>(input_top_ptr + input.offset());
1712             const auto  in_bottom_ptr = reinterpret_cast<const float *>(input_bottom_ptr + input.offset());
1713             float32x2_t top_data      = vld1_f32(in_top_ptr);
1714             float32x2_t bottom_data   = vld1_f32(in_bottom_ptr);
1715             float32x2_t res           = {};
1716             float       final_res     = 0;
1717             // Get power of 2 in case of l2 pooling
1718             if(pooling_type == PoolingType::L2)
1719             {
1720                 top_data    = vmul_f32(top_data, top_data);
1721                 bottom_data = vmul_f32(bottom_data, bottom_data);
1722             }
1723 
1724             if(pooling_type != PoolingType::MAX)
1725             {
1726                 // Calculate scale
1727                 float             scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1728                 const float32x2_t scale_v = vdup_n_f32(scale);
1729 
1730                 // Perform pooling
1731                 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1732                 res                        = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1733             }
1734             else
1735             {
1736                 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1737                 res                        = vpmax_f32(max_data, max_data);
1738             }
1739             final_res = vget_lane_f32(res, 0);
1740 
1741             // Calculate square-root in case of l2 pooling
1742             if(pooling_type == PoolingType::L2)
1743             {
1744                 final_res = sqrt(final_res);
1745             }
1746 
1747             // Store result
1748             *(reinterpret_cast<float *>(output.ptr())) = final_res;
1749         },
1750         input, output);
1751     }
1752 }
1753 
/** 3x3 pooling for FP32 tensors in NCHW layout.
 *
 * Three row pointers (top/middle/bottom) are pre-offset into the padded
 * region by (-pad_left, -pad_top [+ row]). Each iteration loads four floats
 * per row even though only three columns belong to the window; the surplus
 * fourth lane is neutralised before the horizontal reduction (0 for sums,
 * -FLT_MAX for max). For L2 pooling the inputs are squared up front and a
 * square root is applied to the final scalar result.
 *
 * @param window_input    Window over the input tensor driving input.offset().
 * @param window          Window over the output tensor.
 * @param pooling_type    MAX, AVG or L2.
 * @param exclude_padding True to exclude padding elements from the averaging divisor.
 */
pooling3_f32_nchw(const Window & window_input,const Window & window,PoolingType pooling_type,bool exclude_padding)1754 void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1755 {
1756     Iterator input(_input, window_input);
1757     Iterator output(_output, window);
1758 
1759     constexpr const int pool_size       = 3;
1760     const int           pool_pad_right  = _pool_info.pad_stride_info.pad_right();
1761     const int           pool_pad_top    = _pool_info.pad_stride_info.pad_top();
1762     const int           pool_pad_left   = _pool_info.pad_stride_info.pad_left();
1763     const int           pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1764     int                 pool_stride_x   = 0;
1765     int                 pool_stride_y   = 0;
1766     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
     // Upper bounds passed to calculate_avg_scale(); right/bottom padding is
     // included only when it must count towards the averaging divisor.
1767     const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1768     const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1769 
     // Base pointers of the three rows of the 3x3 window.
1770     const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1771     const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1772     const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
1773 
1774     execute_window_loop(window, [&](const Coordinates & id)
1775     {
        // Four contiguous floats per row; only the first three are window columns.
1776         float32x4_t top_data    = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1777         float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1778         float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1779         float32x2_t res         = {};
1780         float       final_res   = 0;
1781 
1782         // Get power of 2 in case of l2 pooling
1783         if(pooling_type == PoolingType::L2)
1784         {
1785             top_data    = vmulq_f32(top_data, top_data);
1786             middle_data = vmulq_f32(middle_data, middle_data);
1787             bottom_data = vmulq_f32(bottom_data, bottom_data);
1788         }
1789 
1790         if(pooling_type != PoolingType::MAX)
1791         {
1792             // Calculate scale
1793             float             scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1794             const float32x2_t scale_v = vdup_n_f32(scale);
1795 
1796             // Perform pooling
1797             const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
            // Zero lane 3 (the unused fourth column) so only the three window
            // columns contribute, then reduce horizontally and apply the scale.
1798             res                        = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1799             res                        = vmul_f32(vpadd_f32(res, res), scale_v);
1800         }
1801         else
1802         {
1803             const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
            // Replace the unused lane 3 with -FLT_MAX so it can never win the max.
1804             res                        = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1805             res                        = vpmax_f32(res, res);
1806         }
1807         final_res = vget_lane_f32(res, 0);
1808 
1809         // Calculate square-root in case of l2 pooling
1810         if(pooling_type == PoolingType::L2)
1811         {
1812             final_res = sqrt(final_res);
1813         }
1814 
1815         // Store result
1816         *(reinterpret_cast<float *>(output.ptr())) = final_res;
1817     },
1818     input, output);
1819 }
1820 
/** 7x7 pooling for FP32 tensors in NCHW layout.
 *
 * Seven row base pointers are precomputed, each pre-offset by the left/top
 * padding. Per row, vld2q_f32 de-interleaves eight contiguous floats into
 * even lanes (val[0] = cols 0,2,4,6) and odd lanes (val[1] = cols 1,3,5,7);
 * column 7 is the surplus element and lives in val[1] lane 3, which is
 * neutralised (0 for sums, -FLT_MAX for max) so exactly seven columns
 * contribute. For L2 pooling the loaded data is squared before accumulation
 * and a square root is applied to the final scalar result.
 *
 * @param window_input    Window over the input tensor driving input.offset().
 * @param window          Window over the output tensor.
 * @param pooling_type    MAX, AVG or L2.
 * @param exclude_padding True to exclude padding elements from the averaging divisor.
 */
pooling7_f32_nchw(const Window & window_input,const Window & window,PoolingType pooling_type,bool exclude_padding)1821 void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1822 {
1823     Iterator input(_input, window_input);
1824     Iterator output(_output, window);
1825 
1826     constexpr const int pool_size       = 7;
1827     const int           pool_pad_right  = _pool_info.pad_stride_info.pad_right();
1828     const int           pool_pad_top    = _pool_info.pad_stride_info.pad_top();
1829     const int           pool_pad_left   = _pool_info.pad_stride_info.pad_left();
1830     const int           pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1831     int                 pool_stride_x   = 0;
1832     int                 pool_stride_y   = 0;
1833     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
     // Bounds used by calculate_avg_scale(); padding counts only when not excluded.
1834     const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1835     const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1836 
     // One base pointer per window row, rows 0..6.
1837     std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1838     for(int i = 0; i < pool_size; ++i)
1839     {
1840         input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
1841     }
1842 
1843     execute_window_loop(window, [&](const Coordinates & id)
1844     {
1845         float32x2_t res       = {};
1846         float       final_res = 0.f;
1847         if(pooling_type != PoolingType::MAX)
1848         {
1849             // Calculate scale
1850             float             scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1851             const float32x2_t scale_v = vdup_n_f32(scale);
1852 
            // Perform pooling: de-interleaved load of 8 floats (row 0).
1854             float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1855             // Get power of 2 in case of l2 pooling
1856             if(pooling_type == PoolingType::L2)
1857             {
1858                 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1859                 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1860             }
            // val[1] lane 3 is column 7 (outside the 7-wide window) -> zeroed.
1861             float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
1862             for(int i = 1; i < pool_size; ++i)
1863             {
1864                 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1865                 // Get power of 2 in case of l2 pooling
1866                 if(pooling_type == PoolingType::L2)
1867                 {
1868                     data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1869                     data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1870                 }
1871                 sum_data = vaddq_f32(sum_data, data.val[0]);
1872                 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1873             }
            // Horizontal reduction of the 4 partial sums, then scale.
1874             res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1875             res = vmul_f32(vpadd_f32(res, res), scale_v);
1876         }
1877         else
1878         {
1879             float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1880             for(int i = 1; i < pool_size; ++i)
1881             {
1882                 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1883                 max_data                 = vmax2q_f32(max_data, data);
1884             }
            // Column 7 (val[1] lane 3) is replaced by -FLT_MAX so it cannot win.
1885             res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1886             res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1887             res = vpmax_f32(res, res);
1888         }
1889         final_res = vget_lane_f32(res, 0);
1890 
1891         // Calculate square-root in case of l2 pooling
1892         if(pooling_type == PoolingType::L2)
1893         {
1894             final_res = sqrt(final_res);
1895         }
1896 
1897         // Store result
1898         *(reinterpret_cast<float *>(output.ptr())) = final_res;
1899     },
1900     input, output);
1901 }
1902 
/** Generic MxN pooling for FP32 tensors in NHWC layout.
 *
 * The special case of 2x2 MAX pooling with an indices tensor is delegated to
 * pooling2_f32_nhwc_maxpool_indices(). Otherwise the kernel iterates over the
 * output window with the channel dimension (DimX) collapsed, vectorising over
 * channels four floats at a time with a scalar left-over loop. The pooling
 * window is clamped per output element (pool_start/pool_end) so only valid
 * input rows/columns are read. For L2 pooling the squares are accumulated and
 * a per-lane square root is taken at the end.
 *
 * @param window_input    Window over the input tensor (y/z ranges bound the valid region).
 * @param window          Window over the output tensor.
 * @param pooling_type    MAX, AVG or L2.
 * @param exclude_padding True to exclude padding elements from the averaging divisor.
 */
poolingMxN_f32_nhwc(const Window & window_input,const Window & window,PoolingType pooling_type,bool exclude_padding)1903 void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1904 {
1905     if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
1906     {
1907         pooling2_f32_nhwc_maxpool_indices(window_input, window);
1908     }
1909     else
1910     {
1911         const int window_start_x = window.x().start();
1912         const int window_end_x   = window.x().end();
1913         const int window_step_x  = 4;
1914 
        // Collapse DimX (channels) on the output window: the channel loop is
        // driven manually by x_off below.
1915         Window window_out = window;
1916         window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
1917 
1918         Iterator input(_input, window_input);
1919         Iterator output(_output, window_out);
1920 
        // NHWC: width is tensor dim 1 (y), height is dim 2 (z).
1921         const int pool_size_x     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1922         const int pool_size_y     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1923         const int pool_pad_right  = _pool_info.pad_stride_info.pad_right();
1924         const int pool_pad_top    = _pool_info.pad_stride_info.pad_top();
1925         const int pool_pad_left   = _pool_info.pad_stride_info.pad_left();
1926         const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1927         int       pool_stride_x   = 0;
1928         int       pool_stride_y   = 0;
1929         std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1930         const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1931         const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1932 
1933         float32x4_t vres;
1934 
1935         execute_window_loop(window_out, [&](const Coordinates & id)
1936         {
            // Top-left corner of this output element's pooling window in
            // input coordinates, and the clamped [start, end) ranges.
1937             const int idx_width    = id.y() * pool_stride_x;
1938             const int idx_height   = id.z() * pool_stride_y;
1939             const int pool_limit_y = pool_pad_top - idx_height;
1940             const int pool_limit_x = pool_pad_left - idx_width;
1941 
1942             const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1943             const int pool_end_y   = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1944             const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1945             const int pool_end_x   = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1946 
            // Vectorised channel loop: 4 channels per iteration.
1947             int x_off = window_start_x;
1948             for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
1949             {
1950                 if(pooling_type != PoolingType::MAX)
1951                 {
1952                     // Calculate scale
1953                     const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1954                                                             pool_stride_y);
1955                     const float32x4_t scale_v = vdupq_n_f32(scale);
1956 
1957                     // Perform pooling
1958                     vres = vdupq_n_f32(0.0f);
1959 
1960                     for(int y = pool_start_y; y < pool_end_y; ++y)
1961                     {
1962                         for(int x = pool_start_x; x < pool_end_x; ++x)
1963                         {
                            // Address (x, y) of the window via the width (y) and
                            // height (z) byte strides; x_off indexes the channel.
1964                             const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1965                                                                                                (_input->info()->strides_in_bytes().z())) + x_off);
1966 
1967                             // Get power of 2 in case of l2 pooling and accumulate
1968                             if(pooling_type == PoolingType::L2)
1969                             {
1970                                 vres = vmlaq_f32(vres, data, data);
1971                             }
1972                             else
1973                             {
1974                                 vres = vaddq_f32(vres, data);
1975                             }
1976                         }
1977                     }
1978                     // Divide by scale
1979                     vres = vmulq_f32(vres, scale_v);
1980                 }
1981                 else
1982                 {
1983                     vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1984                     for(int y = pool_start_y; y < pool_end_y; ++y)
1985                     {
1986                         for(int x = pool_start_x; x < pool_end_x; ++x)
1987                         {
1988                             const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1989                                                                                                (_input->info()->strides_in_bytes().z())) + x_off);
1990                             vres                   = vmaxq_f32(vres, data);
1991                         }
1992                     }
1993                 }
1994 
1995                 // Calculate square-root in case of l2 pooling
1996                 if(pooling_type == PoolingType::L2)
1997                 {
1998                     float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
1999                                            static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
2000                                            static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
2001                                            static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
2002                                          };
2003                     vres = l2_res;
2004                 }
2005 
2006                 // Store result
2007                 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres);
2008             }
2009 
2010             // Left-overs loop
2011             for(; x_off < window_end_x; ++x_off)
2012             {
2013                 float res = 0.0f;
2014 
2015                 if(pooling_type != PoolingType::MAX)
2016                 {
2017                     // Calculate scale
2018                     const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2019                                                             pool_stride_y);
2020 
2021                     for(int y = pool_start_y; y < pool_end_y; ++y)
2022                     {
2023                         for(int x = pool_start_x; x < pool_end_x; ++x)
2024                         {
2025                             const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2026                                                                                  (_input->info()->strides_in_bytes().z())) + x_off);
2027 
2028                             // Get power of 2 in case of l2 pooling and accumulate
2029                             if(pooling_type == PoolingType::L2)
2030                             {
2031                                 res += data * data;
2032                             }
2033                             else
2034                             {
2035                                 res += data;
2036                             }
2037                         }
2038                     }
2039 
2040                     // Divide by scale
2041                     res *= scale;
2042                 }
2043                 else
2044                 {
2045                     res = std::numeric_limits<float>::lowest();
2046                     for(int y = pool_start_y; y < pool_end_y; ++y)
2047                     {
2048                         for(int x = pool_start_x; x < pool_end_x; ++x)
2049                         {
2050                             const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2051                                                                                  (_input->info()->strides_in_bytes().z())) + x_off);
2052                             res              = std::max(res, data);
2053                         }
2054                     }
2055                 }
2056 
2057                 // Calculate square-root in case of l2 pooling
2058                 if(pooling_type == PoolingType::L2)
2059                 {
2060                     res = std::sqrt(res);
2061                 }
2062 
2063                 // Store result
2064                 *(reinterpret_cast<float *>(output.ptr()) + x_off) = res;
2065             }
2066         },
2067         input, output);
2068     }
2069 }
2070 
/** 2x2 MAX pooling for FP32 NHWC that also writes, per output element, the
 * linear index of the maximum element into the _indices tensor.
 *
 * The four window elements are x0=(x,y), x1=(x+1,y), x2=(x,y+1), x3=(x+1,y+1).
 * Ties are resolved with >=, so the earlier element of each comparison
 * (x0 over x1, x2 over x3, and the top row's winner over the bottom row's)
 * takes precedence. Indices are element offsets derived from
 * offset_no_padding(); the pad_right corrections convert the padded byte
 * strides into unpadded element counts (assumes the usual ACL padded layout —
 * verify against offset_no_padding if touched). Channels are processed four
 * at a time with a scalar left-over loop.
 *
 * @param window_input Window over the input tensor.
 * @param window       Window over the output (and indices) tensor.
 */
pooling2_f32_nhwc_maxpool_indices(const Window & window_input,const Window & window)2071 void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window)
2072 {
2073     const int window_start_x = window.x().start();
2074     const int window_end_x   = window.x().end();
2075     const int window_step_x  = 4;
2076 
    // Collapse the channel dimension; it is walked manually via x_off.
2077     Window window_out = window;
2078     window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
2079 
2080     Iterator input(_input, window_input);
2081     Iterator output(_output, window_out);
2082     Iterator indices(_indices, window_out);
2083 
2084     const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
2085     const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2086 
2087     int pool_stride_x = 0;
2088     int pool_stride_y = 0;
2089     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
2090 
2091     float32x4_t vres;
2092     float       res;
2093 
2094     const int pad_right   = _input->info()->padding().right;
2095     const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
2096     const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
2097 
2098     execute_window_loop(window_out, [&](const Coordinates & id)
2099     {
2100         const int idx_width    = id.y() * pool_stride_x;
2101         const int idx_height   = id.z() * pool_stride_y;
2102         const int pool_limit_y = pool_pad_top - idx_height;
2103         const int pool_limit_x = pool_pad_left - idx_width;
2104 
2105         const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
2106         const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
2107 
        // Byte offsets of the four window corners relative to input.ptr():
        // x0=(x,y), x1=(x+1,y), x2=(x,y+1), x3=(x+1,y+1).
2108         const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
2109                                  (_input->info()->strides_in_bytes().z());
2110         const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
2111                                  (_input->info()->strides_in_bytes().z());
2112         const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
2113                                  (_input->info()->strides_in_bytes().z());
2114         const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
2115                                  (_input->info()->strides_in_bytes().z());
2116 
2117         int x_off = window_start_x;
2118         for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
2119         {
2120             const auto in_x0_ptr = reinterpret_cast<const float *>(input.ptr() + in_x0_offset);
2121             const auto in_x1_ptr = reinterpret_cast<const float *>(input.ptr() + in_x1_offset);
2122             const auto in_x2_ptr = reinterpret_cast<const float *>(input.ptr() + in_x2_offset);
2123             const auto in_x3_ptr = reinterpret_cast<const float *>(input.ptr() + in_x3_offset);
2124             const auto v_x0      = vld1q_f32(in_x0_ptr + x_off);
2125             const auto v_x1      = vld1q_f32(in_x1_ptr + x_off);
2126             const auto v_x2      = vld1q_f32(in_x2_ptr + x_off);
2127             const auto v_x3      = vld1q_f32(in_x3_ptr + x_off);
2128             vres                 = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
2129             // Store result
2130             vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres);
2131 
            // Per-corner element indices (unpadded); x1/x3 shift one column,
            // x2/x3 shift one row, each corrected for the row padding.
2132             const uint32_t   offset_base  = offset_no_padding<float>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
2133             const uint32_t   offset_x0    = (uint32_t)offset_base / sizeof(float) + x_off;
2134             const uint32_t   offset_x1    = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
2135             const uint32_t   offset_x2    = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
2136             const uint32_t   offset_x3    = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
2137             const uint32x4_t voffset_x0   = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
2138             const uint32x4_t voffset_x1   = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
2139             const uint32x4_t voffset_x2   = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
2140             const uint32x4_t voffset_x3   = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
            // Select the index of the winning element; >= keeps the earlier one on ties.
2141             const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
2142             const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
2143             const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
2144 
2145             // Store indices
2146             vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2);
2147         }
2148 
2149         // Left-overs loop
2150         for(; x_off < window_end_x; ++x_off)
2151         {
2152             const auto x0 = *(reinterpret_cast<const float *>(input.ptr() + in_x0_offset) + x_off);
2153             const auto x1 = *(reinterpret_cast<const float *>(input.ptr() + in_x1_offset) + x_off);
2154             const auto x2 = *(reinterpret_cast<const float *>(input.ptr() + in_x2_offset) + x_off);
2155             const auto x3 = *(reinterpret_cast<const float *>(input.ptr() + in_x3_offset) + x_off);
2156             res           = std::max(std::max(x2, x3), std::max(x0, x1));
2157 
2158             // Store result
2159             *(reinterpret_cast<float *>(output.ptr()) + x_off) = res;
2160 
            // Scalar replica of the vector index computation above.
2161             const uint32_t offset_base = offset_no_padding<float>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
2162             const uint32_t offset_x0   = (uint32_t)offset_base / sizeof(float) + x_off;
2163             const uint32_t offset_x1   = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
2164             const uint32_t offset_x2   = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
2165             const uint32_t offset_x3   = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
2166             const uint32_t tmp_idx0    = (x0 >= x1) ? offset_x0 : offset_x1;
2167             const uint32_t tmp_idx1    = (x2 >= x3) ? offset_x2 : offset_x3;
2168             const uint32_t tmp_idx2    = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
2169 
2170             // Store indices
2171             *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
2172         }
2173     },
2174     input, output, indices);
2175 }
2176 
/** MxN pooling for quantized 8-bit tensors in NCHW layout.
 *
 * @tparam T 8-bit element type (the q8x8_t/q16/q32 wrapper aliases below
 *           widen it twice for overflow-free accumulation).
 *
 * Any non-MAX pooling type is handled as average: 8-lane loads are widened
 * to 16 then 32 bits and accumulated, leftovers go into a scalar 32-bit sum,
 * and the total is multiplied by the 1/elements scale with rounding. MAX
 * reduces 8 lanes with vmax then a vpmax cascade, merging with the scalar
 * leftover maximum. If input and output quantization infos differ the result
 * is requantized (dequantize with input qinfo, quantize with output qinfo)
 * before storing.
 *
 * @param window_input    Window over the input tensor.
 * @param window          Window over the output tensor.
 * @param pooling_type    MAX or an averaging type.
 * @param exclude_padding True to exclude padding elements from the averaging divisor.
 */
2177 template <typename T>
poolingMxN_q8_nchw(const Window & window_input,const Window & window,PoolingType pooling_type,bool exclude_padding)2178 void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
2179 {
2180     Iterator input(_input, window_input);
2181     Iterator output(_output, window);
2182 
2183     /** NEON vector types */
2184     using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
2185     using q16_t   = typename wrapper::traits::promote_t<T>;
2186     using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
2187     using q32_t   = typename wrapper::traits::promote_t<q16_t>;
2188     using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
2189 
    // NCHW: width is tensor dim 0 (x), height is dim 1 (y).
2190     const int pool_size_x     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
2191     const int pool_size_y     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
2192     const int pool_pad_right  = _pool_info.pad_stride_info.pad_right();
2193     const int pool_pad_top    = _pool_info.pad_stride_info.pad_top();
2194     const int pool_pad_left   = _pool_info.pad_stride_info.pad_left();
2195     const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
2196     int       pool_stride_x   = 0;
2197     int       pool_stride_y   = 0;
2198     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
2199     const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
2200     const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
2201 
2202     const UniformQuantizationInfo &input_qinfo  = _input->info()->quantization_info().uniform();
2203     const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
2204 
2205     execute_window_loop(window, [&](const Coordinates & id)
2206     {
        // Also serves as the running maximum seed for the scalar MAX path.
2207         T res = std::numeric_limits<T>::min();
2208 
2209         if(pooling_type != PoolingType::MAX)
2210         {
2211             q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2212             q32_t   sres = 0;
2213 
2214             // Calculate scale
2215             const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
2216 
2217             // Perform pooling
2218             for(int y = 0; y < pool_size_y; ++y)
2219             {
2220                 int x = 0;
2221                 for(; x <= (pool_size_x - 8); x += 8)
2222                 {
2223                     const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2224                                                                                    (_input->info()->strides_in_bytes().y())));
2225 
                    // Widen 8x8-bit -> 8x16-bit, then pairwise-add the halves
                    // into the 4x32-bit accumulator (no overflow possible).
2226                     const q16x8_t data_q16 = wrapper::vmovl(data);
2227                     vres                   = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
2228                 }
2229 
2230                 // Leftover for loop
2231                 for(; x < pool_size_x; ++x)
2232                 {
2233                     T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2234                                                            (_input->info()->strides_in_bytes().y())));
2235                     sres += data;
2236                 }
2237             }
2238 
            // Reduction: fold the 4 accumulator lanes into the scalar sum.
2240             const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres));
2241             sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1);
2242 
2243             // Divide by scale
2244             res = static_cast<T>(support::cpp11::round(sres * scale));
2245         }
2246         else
2247         {
2248             q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
2249 
2250             for(int y = 0; y < pool_size_y; ++y)
2251             {
2252                 int x = 0;
2253                 for(; x <= (pool_size_x - 8); x += 8)
2254                 {
2255                     const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2256                                                                                    (_input->info()->strides_in_bytes().y())));
2257                     vres              = wrapper::vmax(vres, data);
2258                 }
2259                 // Leftover for loop
2260                 for(; x < pool_size_x; ++x)
2261                 {
2262                     const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2263                                                                  (_input->info()->strides_in_bytes().y())));
2264                     res          = std::max(res, data);
2265                 }
2266             }
2267 
            // Reduce max: three pairwise steps collapse 8 lanes into lane 0.
2269             vres = wrapper::vpmax(vres, vres);
2270             vres = wrapper::vpmax(vres, vres);
2271             vres = wrapper::vpmax(vres, vres);
2272 
2273             // Get max value: merge vector and scalar-leftover maxima.
2274             res = std::max(res, wrapper::vgetlane(vres, 0));
2275         }
2276         // Store result, requantizing if the output qinfo differs from the input's.
2277         res                                    = (input_qinfo != output_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, input_qinfo), output_qinfo) : res;
2278         *(reinterpret_cast<T *>(output.ptr())) = res;
2279     },
2280     input, output);
2281 }
2282 
2283 template <typename T>
poolingMxN_q8_nhwc(const Window & window_input,const Window & window,PoolingType pooling_type,bool exclude_padding)2284 void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
2285 {
2286     const int window_start_x     = window.x().start();
2287     const int window_end_x       = window.x().end();
2288     const int window_step_x      = 16;
2289     const int window_half_step_x = window_step_x / 2;
2290 
2291     Window window_out = window;
2292     window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
2293 
2294     Iterator input(_input, window_input);
2295     Iterator output(_output, window_out);
2296 
2297     using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
2298     using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
2299     using q16_t   = typename wrapper::traits::promote_t<T>;
2300     using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
2301     using q32_t   = typename wrapper::traits::promote_t<q16_t>;
2302     using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
2303 
2304     const int pool_size_x     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
2305     const int pool_size_y     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
2306     const int pool_pad_right  = _pool_info.pad_stride_info.pad_right();
2307     const int pool_pad_top    = _pool_info.pad_stride_info.pad_top();
2308     const int pool_pad_left   = _pool_info.pad_stride_info.pad_left();
2309     const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
2310 
2311     int pool_stride_x = 0;
2312     int pool_stride_y = 0;
2313     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
2314     const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
2315     const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
2316 
2317     const float32x4_t             half_scale_v = vdupq_n_f32(0.5f);
2318     const UniformQuantizationInfo input_qinfo  = _input->info()->quantization_info().uniform();
2319     const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
2320 
2321     const float quant_rescale = output_qinfo.scale / input_qinfo.scale;
2322     // "new_offset" doesn't have to consider the "half_scale_v" in its computation
2323     // With a requantization performed in a single step there won't be uncertainties introduced
2324     const int32_t new_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / quant_rescale);
2325 
2326     const float                   requant_scale  = output_qinfo.scale / input_qinfo.scale;
2327     const int32_t                 requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
2328     const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
2329 
2330     execute_window_loop(window_out, [&](const Coordinates & id)
2331     {
2332         const int idx_width    = id.y() * pool_stride_x;
2333         const int idx_height   = id.z() * pool_stride_y;
2334         const int pool_limit_y = pool_pad_top - idx_height;
2335         const int pool_limit_x = pool_pad_left - idx_width;
2336 
2337         const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
2338         const int pool_end_y   = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
2339         const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
2340         const int pool_end_x   = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
2341 
2342         int x_off = window_start_x;
2343         for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
2344         {
2345             if(pooling_type != PoolingType::MAX)
2346             {
2347                 q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2348                 q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2349                 q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2350                 q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2351 
2352                 // Calculate scale
2353                 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2354                                                         pool_stride_y);
2355 
2356                 // Perform pooling
2357                 for(int y = pool_start_y; y < pool_end_y; ++y)
2358                 {
2359                     for(int x = pool_start_x; x < pool_end_x; ++x)
2360                     {
2361                         const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2362                                                                                          (_input->info()->strides_in_bytes().z())) + x_off);
2363 
2364                         const q16x8_t data_q16  = wrapper::vmovl(wrapper::vgetlow(data));
2365                         const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
2366                         vres1                   = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
2367                         vres2                   = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
2368                         vres3                   = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
2369                         vres4                   = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
2370                     }
2371                 }
2372 
2373                 if(input_qinfo != output_qinfo)
2374                 {
2375                     const float32x4x4_t vres =
2376                     {
2377                         {
2378                             vcvtq_f32_q32(vres1),
2379                             vcvtq_f32_q32(vres2),
2380                             vcvtq_f32_q32(vres3),
2381                             vcvtq_f32_q32(vres4),
2382                         }
2383                     };
2384                     const auto requantized_output = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
2385                     // Store result
2386                     wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, wrapper::vgetlow(requantized_output));
2387                     wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, wrapper::vgethigh(requantized_output));
2388                 }
2389                 else
2390                 {
2391                     const float32x4_t scale_v = vdupq_n_f32(scale);
2392                     // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
2393                     vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
2394                     vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
2395                     vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
2396                     vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
2397 
2398                     const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
2399                     const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
2400                     // Store result
2401                     wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, res1);
2402                     wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, res2);
2403                 }
2404             }
2405             else
2406             {
2407                 q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
2408 
2409                 for(int y = pool_start_y; y < pool_end_y; ++y)
2410                 {
2411                     for(int x = pool_start_x; x < pool_end_x; ++x)
2412                     {
2413                         const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2414                                                                                          (_input->info()->strides_in_bytes().z())) + x_off);
2415                         vres               = wrapper::vmax(vres, data);
2416                     }
2417                 }
2418 
2419                 // Store result
2420                 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
2421                                 requant_qinfo) :
2422                                 vres);
2423             }
2424         }
2425 
2426         if(pooling_type == PoolingType::MAX)
2427         {
2428             for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
2429             {
2430                 q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
2431                 for(int y = pool_start_y; y < pool_end_y; ++y)
2432                 {
2433                     for(int x = pool_start_x; x < pool_end_x; ++x)
2434                     {
2435                         const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2436                                                                                        (_input->info()->strides_in_bytes().z())) + x_off);
2437                         vres              = wrapper::vmax(vres, data);
2438                     }
2439                 }
2440 
2441                 // Store result
2442                 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off,
2443                                 (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
2444             }
2445         }
2446 
2447         // Left-overs loop
2448         for(; x_off < window_end_x; ++x_off)
2449         {
2450             if(pooling_type != PoolingType::MAX)
2451             {
2452                 q32_t res = static_cast<q32_t>(0.f);
2453 
2454                 // Calculate scale
2455                 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2456                                                         pool_stride_y);
2457 
2458                 // Perform pooling
2459                 for(int y = pool_start_y; y < pool_end_y; ++y)
2460                 {
2461                     for(int x = pool_start_x; x < pool_end_x; ++x)
2462                     {
2463                         const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2464                                                                      (_input->info()->strides_in_bytes().z())) + x_off);
2465                         res += data;
2466                     }
2467                 }
2468 
2469                 if(input_qinfo != output_qinfo)
2470                 {
2471                     const float res_f              = static_cast<float>(res);
2472                     const float new_scale          = quant_rescale / scale;
2473                     const auto  requantized_output = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
2474 
2475                     // Store result
2476                     *(reinterpret_cast<T *>(output.ptr()) + x_off) = requantized_output;
2477                 }
2478                 else
2479                 {
2480                     // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
2481                     res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
2482 
2483                     // Store result
2484                     *(reinterpret_cast<T *>(output.ptr()) + x_off) = res;
2485                 }
2486             }
2487             else
2488             {
2489                 T res = std::numeric_limits<T>::min();
2490 
2491                 for(int y = pool_start_y; y < pool_end_y; ++y)
2492                 {
2493                     for(int x = pool_start_x; x < pool_end_x; ++x)
2494                     {
2495                         const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2496                                                                      (_input->info()->strides_in_bytes().z())) + x_off);
2497                         res          = std::max(res, data);
2498                     }
2499                 }
2500 
2501                 // Store result
2502                 if(input_qinfo != output_qinfo)
2503                 {
2504                     const float res_f                              = static_cast<float>(res);
2505                     *(reinterpret_cast<T *>(output.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
2506                 }
2507                 else
2508                 {
2509                     *(reinterpret_cast<T *>(output.ptr()) + x_off) = res;
2510                 }
2511             }
2512         }
2513 
2514     },
2515     input, output);
2516 }
2517 
validate(const ITensorInfo * input,const ITensorInfo * output,const PoolingLayerInfo & pool_info,const ITensorInfo * indices)2518 Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
2519 {
2520     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
2521 
2522     unsigned int pooled_w                          = 0;
2523     unsigned int pooled_h                          = 0;
2524     unsigned int num_elems_processed_per_iteration = 0;
2525     BorderSize   border_size(0);
2526 
2527     const bool   is_global_pooling = pool_info.is_global_pooling;
2528     unsigned int pool_size_x       = 0;
2529     unsigned int pool_size_y       = 0;
2530 
2531     // Get data layout
2532     const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
2533     const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
2534     const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
2535 
2536     pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
2537     pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
2538 
2539     // Validate pool info before calling scaled_dimensions
2540     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
2541 
2542     // Check output dimensions
2543     std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
2544                                                      input->dimension(idx_height),
2545                                                      pool_size_x,
2546                                                      pool_size_y,
2547                                                      pool_info.pad_stride_info);
2548 
2549     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, indices, Size2D(pool_size_x, pool_size_y)));
2550     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
2551                                                               (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
2552                                                               pool_size_x, pool_size_y)
2553                                 .first);
2554 
2555     return Status{};
2556 }
2557 
run(const Window & window,const ThreadInfo & info)2558 void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
2559 {
2560     ARM_COMPUTE_UNUSED(info);
2561     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
2562     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
2563     ARM_COMPUTE_ERROR_ON(_func == nullptr);
2564 
2565     const unsigned int pool_stride_x   = _pool_info.pad_stride_info.stride().first;
2566     const unsigned int pool_stride_y   = _pool_info.pad_stride_info.stride().second;
2567     const unsigned int pool_size       = _pool_info.pool_size.width;
2568     const bool         exclude_padding = _pool_info.exclude_padding;
2569 
2570     Window window_input(window);
2571     if(_data_layout == DataLayout::NCHW)
2572     {
2573         // Set step for input in x and y direction for the input
2574         unsigned int window_x_inc = 0;
2575         switch(_input->info()->data_type())
2576         {
2577             case DataType::QASYMM8:
2578             case DataType::QASYMM8_SIGNED:
2579             {
2580                 window_x_inc = pool_stride_x;
2581                 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
2582                 {
2583                     window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
2584                 }
2585                 break;
2586             }
2587 
2588             case DataType::F16:
2589             case DataType::F32:
2590             {
2591                 window_x_inc = pool_stride_x;
2592                 break;
2593             }
2594             default:
2595             {
2596                 ARM_COMPUTE_ERROR("Not supported");
2597             }
2598         }
2599         window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
2600         window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
2601     }
2602     else
2603     {
2604         window_input.set(Window::DimX, Window::Dimension(0, 1, 1));
2605         window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
2606         window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
2607     }
2608 
2609     // Run function
2610     (this->*_func)(window_input, window, _pool_info.pool_type, exclude_padding);
2611 }
2612 } // namespace arm_compute
2613