/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEPoolingLayerKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/AccessWindowStatic.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/ToolchainSupport.h"

#include "src/core/NEON/wrapper/wrapper.h"
#include <algorithm>
#include <arm_neon.h>
#include <cmath>
#include <limits>
#include <set>
#include <string>
#include <tuple>

namespace arm_compute
{
using namespace misc::shape_calculator;

namespace
{
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
quantize(float val, const UniformQuantizationInfo &info)
{
    return quantize_qasymm8_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
quantize(float val, const UniformQuantizationInfo &info)
{
    return quantize_qasymm8(val, info);
}

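// Helper returning the reciprocal of the number of elements that actually
// contribute to an average-pooled output sample. Worked example (illustrative
// values only): pool 3x3, stride 1, pad 1, exclude_padding = true; for the
// top-left output the window starts at (-1, -1), is clamped to (0, 0) and ends
// at (2, 2), so the scale is 1 / (2 * 2) = 0.25 rather than 1 / 9.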
inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
                                 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    const unsigned int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);

    int start_x = id[idx_width] * stride_x - pad_x;
    int start_y = id[idx_height] * stride_y - pad_y;

    const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
    const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
    if(exclude_padding)
    {
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    return 1.f / ((end_y - start_y) * (end_x - start_x));
}

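// Applies the same per-window averaging scale as calculate_avg_scale, but to
// eight q16 accumulator lanes at once: each lane belongs to a different output
// x position, so each lane gets its own clamped window bounds and divisor.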
template <typename T, typename TVec>
inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
                               const int pool_size, const int upper_bound_w, const int upper_bound_h,
                               const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    int       start_x = (id.x() + id_offset) * stride_x - pad_x;
    int       start_y = id.y() * stride_y - pad_y;
    const int end_y   = std::min(start_y + pool_size, upper_bound_h);
    if(exclude_padding)
    {
        start_y = std::max(0, start_y);
    }

    std::array<T, 8> elems =
    {
        {
            wrapper::vgetlane(v, 0),
            wrapper::vgetlane(v, 1),
            wrapper::vgetlane(v, 2),
            wrapper::vgetlane(v, 3),
            wrapper::vgetlane(v, 4),
            wrapper::vgetlane(v, 5),
            wrapper::vgetlane(v, 6),
            wrapper::vgetlane(v, 7),
        }
    };

    for(auto &el : elems)
    {
        int       c_start_x = start_x;
        const int end_x     = std::min(c_start_x + pool_size, upper_bound_w);
        if(exclude_padding)
        {
            c_start_x = std::max(0, c_start_x);
        }
        float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
        el *= scale;
        start_x += step * stride_x;
    }

    v = wrapper::vsetlane(elems[0], v, 0);
    v = wrapper::vsetlane(elems[1], v, 1);
    v = wrapper::vsetlane(elems[2], v, 2);
    v = wrapper::vsetlane(elems[3], v, 3);
    v = wrapper::vsetlane(elems[4], v, 4);
    v = wrapper::vsetlane(elems[5], v, 5);
    v = wrapper::vsetlane(elems[6], v, 6);
    v = wrapper::vsetlane(elems[7], v, 7);
}

Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info,
                          unsigned int &pooled_w, unsigned int pooled_h, const ITensorInfo *indices, Size2D pool_size)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);

    int                 pool_stride_x   = 0;
    int                 pool_stride_y   = 0;
    PoolingType         pool_type       = pool_info.pool_type;
    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();

    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
    if(indices)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
    }
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
                                    && (input->data_layout() == DataLayout::NHWC),
                                    "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");

    if(output->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
                                    || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));

        if(indices)
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
            ARM_COMPUTE_RETURN_ERROR_ON((indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
                                        || (indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
        }
    }

    return Status{};
}

Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
{
    ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
    ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);

    return Status{};
}

std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
                                                        unsigned int &num_elems_processed_per_iteration,
                                                        BorderSize   &border_size,
                                                        unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
{
    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
    if(indices)
    {
        // Indices auto initialization if not yet initialized
        auto_init_if_empty(*indices, (input->clone()->set_tensor_shape(compute_pool_shape(*input,
                                                                                          pool_info)))
                           .set_data_type(DataType::U32) /* we store the offset to the element */);
    }
    const auto          data_layout                  = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
    unsigned int        num_elems_read_per_iteration = 0;
    unsigned int        num_elems_horizontal_window  = 0;
    int                 pool_stride_x                = 0;
    int                 pool_stride_y                = 0;
    const int           idx_width                    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const int           idx_height                   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
    const int           input_width                  = input->dimension(idx_width);
    const int           input_height                 = input->dimension(idx_height);
    const PadStrideInfo pad_stride_info              = pool_info.pad_stride_info;
    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
    const int  pool_pad_right  = pad_stride_info.pad_right();
    const int  pool_pad_top    = pad_stride_info.pad_top();
    const int  pool_pad_left   = pad_stride_info.pad_left();
    const int  pool_pad_bottom = pad_stride_info.pad_bottom();
    const bool is_square       = pool_size_x == pool_size_y;

    // Check output dimensions
    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
                                                     input->dimension(idx_height),
                                                     pool_size_x,
                                                     pool_size_y,
                                                     pad_stride_info);

    // If the pool region is not square, or not one of the optimized sizes, the generic MxN kernel is executed
    num_elems_read_per_iteration      = 1;
    num_elems_processed_per_iteration = 1;
    num_elems_horizontal_window       = 1;

    if(is_square)
    {
        switch(input->data_type())
        {
            case DataType::QASYMM8:
            case DataType::QASYMM8_SIGNED:
                switch(pool_size_x)
                {
                    case 2:
                        num_elems_read_per_iteration      = 16;
                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 8 : 16;
                        break;
                    case 3:
                        num_elems_read_per_iteration      = 16;
                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 8 : 16;
                        break;
                    default:
                        break;
                }
                break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            case DataType::F16:
                switch(pool_size_x)
                {
                    case 2:
                    case 3:
                        num_elems_read_per_iteration      = 4;
                        num_elems_processed_per_iteration = 1;
                        num_elems_horizontal_window       = 1;
                        break;
                    default:
                        break;
                }
                break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
            case DataType::F32:
                switch(pool_size_x)
                {
                    case 2:
                        num_elems_read_per_iteration = 2;
                        break;
                    case 3:
                        num_elems_read_per_iteration = 4; // We use vload4 for pooling3
                        break;
                    case 7:
                        num_elems_read_per_iteration = 8; // We use vload8 for pooling7
                        break;
                    default:
                        break;
                }
                num_elems_processed_per_iteration = 1;
                num_elems_horizontal_window       = 1;
                break;
            default:
                ARM_COMPUTE_ERROR("Element size not supported");
                break;
        }
    }

    bool   window_changed = false;
    Window win{};
    if(data_layout == DataLayout::NCHW)
    {
        // Number of iterations in X dimension
        const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
        // Upper limit for the number of right/bottom border elements that are accessed
        const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
        border_size        = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
        border_size.right  = std::max(upper_bound_w, pool_pad_right);
        border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
        TensorShape output_shape{ input->tensor_shape() };
        output_shape.set(0, pooled_w);
        output_shape.set(1, pooled_h);
        TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
        win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
        AccessWindowStatic     input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
        AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
        if(indices)
        {
            AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window);
            window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
        }
        else
        {
            window_changed = update_window_and_padding(win, input_access, output_access);
        }
        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
    }

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}

template <typename T>
inline T vcvtq_q32_f32(float32x4_t values);

template <>
inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
{
    return vcvtq_u32_f32(values);
}

template <>
inline int32x4_t vcvtq_q32_f32(float32x4_t values)
{
    return vcvtq_s32_f32(values);
}

template <typename T>
inline float32x4_t vcvtq_f32_q32(T values);

template <>
inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
{
    return vcvtq_f32_u32(values);
}

template <>
inline float32x4_t vcvtq_f32_q32(int32x4_t values)
{
    return vcvtq_f32_s32(values);
}

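// Requantization helpers for average pooling: the accumulator is kept in
// float, and the pooling divisor is folded into the quantization scale
// (new_scale = quant_rescale / scale_pooling) so that a single vquantize /
// vquantize_signed call performs both the averaging and the requantization.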
template <typename Tout>
inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);

template <>
inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
{
    const float new_scale = quant_rescale / scale_pooling;
    return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
}

template <>
inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
{
    const float new_scale = quant_rescale / scale_pooling;
    return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
}

template <typename Tin, typename Tout>
inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);

template <>
inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x4_t acc =
    {
        {
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
        }
    };
    return vquantize(acc, requant_qinfo);
}

template <>
inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x4_t acc =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
        }
    };
    return vquantize_signed(acc, requant_qinfo);
}

template <typename T>
inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);

template <>
inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x2_t acc =
    {
        {
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
        }
    };
    return vquantize(acc, requant_qinfo);
}

template <>
inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x2_t acc =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
        }
    };
    return vquantize_signed(acc, requant_qinfo);
}

} // namespace

NEPoolingLayerKernel::NEPoolingLayerKernel()
    : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
{
}

BorderSize NEPoolingLayerKernel::border_size() const
{
    return _border_size;
}

void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
    const PadStrideInfo pad_stride_info   = pool_info.pad_stride_info;
    const bool          is_global_pooling = pool_info.is_global_pooling;
    const int           pool_stride_x     = pad_stride_info.stride().first;

    // Get data layout
    const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);

    // Update pool size in case of global pooling
    const Size2D pool_size(
        is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width,
        is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height);

    // Validate pool info before calling scaled_dimensions
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y()));

    // Check output dimensions
    unsigned int pooled_w;
    unsigned int pooled_h;
    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
                                                     input->info()->dimension(idx_height),
                                                     pool_size.x(),
                                                     pool_size.y(),
                                                     pad_stride_info);

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, (indices) ? indices->info() : nullptr, pool_size));

    // Set instance variables
    _input       = input;
    _output      = output;
    _indices     = indices;
    _pool_info   = pool_info;
    _data_layout = input->info()->data_layout();
    _is_square   = (pool_size.x() == pool_size.y());

    // Get data type
    const DataType data_type = input->info()->data_type();
    const bool     is_nchw   = _data_layout == DataLayout::NCHW;

    if(data_type == DataType::QASYMM8)
    {
        if(!is_nchw)
        {
            _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<uint8_t>;
        }
        else
        {
            if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
            {
                _func = &NEPoolingLayerKernel::pooling2_q8_nchw<uint8_t>;
            }
            else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
            {
                _func = &NEPoolingLayerKernel::pooling3_q8_nchw<uint8_t>;
            }
            else
            {
                _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<uint8_t>;
            }
        }
    }
    else if(data_type == DataType::QASYMM8_SIGNED)
    {
        if(!is_nchw)
        {
            _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc<int8_t>;
        }
        else
        {
            if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square)
            {
                _func = &NEPoolingLayerKernel::pooling2_q8_nchw<int8_t>;
            }
            else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square)
            {
                _func = &NEPoolingLayerKernel::pooling3_q8_nchw<int8_t>;
            }
            else
            {
                _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<int8_t>;
            }
        }
    }
    else if(data_type == DataType::F16)
    {
        if(!is_nchw)
        {
            _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc;
        }
        else
        {
            if(_is_square)
            {
                switch(pool_size.x())
                {
                    case 2:
                    {
                        _func = &NEPoolingLayerKernel::pooling2_f16_nchw;
                        break;
                    }
                    case 3:
                    {
                        _func = &NEPoolingLayerKernel::pooling3_f16_nchw;
                        break;
                    }
                    default:
                    {
                        _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
                        break;
                    }
                }
            }
            else
            {
                _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw;
            }
        }
    }
    else if(data_type == DataType::F32)
    {
        if(!is_nchw)
        {
            _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc;
        }
        else
        {
            if(_is_square)
            {
                switch(pool_size.x())
                {
                    case 2:
                    {
                        _func = &NEPoolingLayerKernel::pooling2_f32_nchw;
                        break;
                    }
                    case 3:
                    {
                        _func = &NEPoolingLayerKernel::pooling3_f32_nchw;
                        break;
                    }
                    case 7:
                    {
                        _func = &NEPoolingLayerKernel::pooling7_f32_nchw;
                        break;
                    }
                    default:
                    {
                        _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
                        break;
                    }
                }
            }
            else
            {
                _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw;
            }
        }
    }

    if(!is_nchw)
    {
        // Configure kernel window
        Window      win = calculate_max_window(*output->info(), Steps());
        Coordinates coord;
        coord.set_num_dimensions(output->info()->num_dimensions());
        output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
        INEKernel::configure(win);
    }
    else
    {
        // Configure kernel window
        auto win_config = validate_and_configure_window(input->info(), output->info(), (indices) ? indices->info() : nullptr,
                                                        pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y());
        ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
        INEKernel::configure(win_config.second);
    }
}

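// Strips the padding contribution from a byte offset into the padded source
// buffer; callers divide the result by sizeof(T) to obtain the element index
// in the equivalent unpadded tensor, which is the coordinate system the max
// unpooling layer expects for its indices.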
template <typename T>
inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y)
{
    const int pad_left    = info.padding().left;
    const int pad_right   = info.padding().right;
    const int pad_top     = info.padding().top;
    const int pad_bottom  = info.padding().bottom;
    const int in_stride_y = static_cast<int>(info.strides_in_bytes().y());
    const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]);
    const int pad_horiz   = pad_left + pad_right;
    const int pad_vert    = pad_top + pad_bottom;

    if(info.data_layout() == DataLayout::NCHW)
    {
        const uint32_t offset_base = padded_offset
                                     - sizeof(T) * pad_horiz * id.y() * pool_stride_y                                            /* subtract padding elems per row */
                                     - pad_top * sizeof(T)                                                                       /* top padding */
                                     - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
                                     - in_stride_w * id[3];

        return offset_base;
    }
    else
    {
        const uint32_t offset_base = padded_offset
                                     - sizeof(T) * pad_horiz * id.y() * pool_stride_x                          // subtract padding elems per row
                                     - pad_top * sizeof(T)                                                     // top padding
                                     - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
                                     - in_stride_w * id[3];

        return offset_base;
    }
}

template <typename T>
void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
    Iterator input(_input, window_input);
    Iterator output(_output, window);

    /** NEON vector types */
    using q8x8_t    = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t   = typename wrapper::traits::neon_vector<T, 16>::type;
    using q8x8x2_t  = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
    using q16_t     = typename wrapper::traits::promote_t<T>;
    using q16x4_t   = typename wrapper::traits::neon_vector<q16_t, 4>::type;
    using q16x8_t   = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;

    constexpr int pool_size       = 2;
    int           pool_stride_x   = 0;
    int           pool_stride_y   = 0;
    const int     pool_pad_right  = _pool_info.pad_stride_info.pad_right();
    const int     pool_pad_top    = _pool_info.pad_stride_info.pad_top();
    const int     pool_pad_left   = _pool_info.pad_stride_info.pad_left();
    const int     pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);

    const T *const input_top_ptr    = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
    const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));

    const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;

    const UniformQuantizationInfo input_qinfo          = _input->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qinfo         = _output->info()->quantization_info().uniform();
    const bool                    have_different_qinfo = input_qinfo != output_qinfo;

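    // Requantization derivation: quantizing a raw input code q with
    // UniformQuantizationInfo(requant_scale, requant_offset) computes
    // q / requant_scale + requant_offset = (q - offset_in) * scale_in / scale_out + offset_out
    // (up to rounding and saturation), i.e. the same real value re-expressed
    // in the output's quantized space.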
    const float                   requant_scale  = output_qinfo.scale / input_qinfo.scale;
    const int32_t                 requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto top_data    = wrapper::vloadq(input_top_ptr + input.offset());
        const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
        q8x8_t     lower_res   = {};
        q8x8_t     upper_res   = {};

        if(pooling_type != PoolingType::MAX)
        {
            const q16x8x2_t top_data_q16    = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
            const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };

            // Add rows
            const q16x8x2_t vrsum =
            {
                {
                    wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
                    wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
                }
            };

            // Pair-wise add row data
            const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
            const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));

            q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);

            // Scale lower result
            scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_lower, id, 0, scale_step_x,
                                               pool_size, upper_bound_w, upper_bound_h,
                                               pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
            lower_res = wrapper::vmovn(res_lower);

            // Compute upper result for stride_x == 1
            if(pool_stride_x == 1)
            {
                // Shifted row sum
                const q16x8x2_t vrsum_shifted =
                {
                    {
                        wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
                        wrapper::vext_1(vrsum.val[1], vrsum.val[1])
                    }
                };

                // Pair-wise add shifted row
                q16x8_t res_upper = wrapper::vcombine(
                                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
                                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));

                // Scale upper result
                scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res_upper, id, 1, 2,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                upper_res = wrapper::vmovn(res_upper);
            }
        }
        else
        {
            const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
            lower_res              = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
            if(pool_stride_x == 1)
            {
                const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
                upper_res                      = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
            }
        }

        if(have_different_qinfo)
        {
            const auto requantized_output = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
            lower_res                     = wrapper::vgetlow(requantized_output);
            upper_res                     = wrapper::vgethigh(requantized_output);
        }

        // Store result
        if(pool_stride_x == 1)
        {
            const q8x8x2_t res = { { lower_res, upper_res } };
            wrapper::vstore(reinterpret_cast<T *>(output.ptr()), res);
        }
        else
        {
            wrapper::vstore(reinterpret_cast<T *>(output.ptr()), lower_res);
        }
    },
    input, output);
}

void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
    ARM_COMPUTE_UNUSED(pooling_type);
    ARM_COMPUTE_UNUSED(exclude_padding);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    Iterator input(_input, window_input);
    Iterator output(_output, window);

    constexpr const int pool_size       = 3;
    const int           pool_pad_right  = _pool_info.pad_stride_info.pad_right();
    const int           pool_pad_top    = _pool_info.pad_stride_info.pad_top();
    const int           pool_pad_left   = _pool_info.pad_stride_info.pad_left();
    const int           pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
    int                 pool_stride_x   = 0;
    int                 pool_stride_y   = 0;
    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);

    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
    const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));

    execute_window_loop(window, [&](const Coordinates & id)
    {
        float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
        float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(input_middle_ptr + input.offset()));
        float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
        float16x4_t res         = {};

        // Get power of 2 in case of l2 pooling
        if(pooling_type == PoolingType::L2)
        {
            top_data    = vmul_f16(top_data, top_data);
            middle_data = vmul_f16(middle_data, middle_data);
            bottom_data = vmul_f16(bottom_data, bottom_data);
        }

        if(pooling_type != PoolingType::MAX)
        {
            // Calculate scale
            const float       scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
            const float16x4_t scale_v = vdup_n_f16(scale);
            // Perform pooling
            const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
            res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
            res = vmul_f16(vpadd_f16(res, res), scale_v);
        }
        else
        {
            const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
            res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
            res = vpmax_f16(res, res);
        }

        // Calculate square-root in case of l2 pooling
        if(pooling_type == PoolingType::L2)
        {
            res = vinv_f16(vinvsqrt_f16(res));
        }

        *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
    },
    input, output);
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
    ARM_COMPUTE_UNUSED(window_input);
    ARM_COMPUTE_UNUSED(window);
    ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}

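// The two f16_to_f32 overloads below let pooling2_nchw_maxpool_indices<T> be
// written once against float32x2_t: float16_t lanes are widened to f32, while
// float input is already in the right format and passes through unchanged.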
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <typename T>
inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
f16_to_f32(float16x4_t input)
{
    float32x2_t output = { static_cast<float>(vget_lane_f16(input, 0)), static_cast<float>(vget_lane_f16(input, 1)) };
    return output;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

template <typename T>
inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
f16_to_f32(float32x2_t input)
{
    return input;
}

template <typename T>
void NEPoolingLayerKernel::pooling2_nchw_maxpool_indices(const Window &window_input, const Window &window)
{
    Iterator  input(_input, window_input);
    Iterator  output(_output, window);
    Iterator  indices(_indices, window);
    const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
    int       pool_stride_x = 0;
    int       pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
    const int            pad_left         = _input->info()->padding().left;
    const int            pad_right        = _input->info()->padding().right;
    const int            in_stride_y      = static_cast<int>(_input->info()->strides_in_bytes().y());

    execute_window_loop(window, [&](const Coordinates & id)
    {
        auto        top_data        = wrapper::vload(reinterpret_cast<const T *>(input_top_ptr + input.offset()));
        auto        bottom_data     = wrapper::vload(reinterpret_cast<const T *>(input_bottom_ptr + input.offset()));
        float32x2_t top_data_f32    = f16_to_f32<T>(top_data);
        float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);

        // Calculate max data, compare top first, then bottom, to make sure the first max is recorded.
        const float32x2_t max_data_top    = vpmax_f32(top_data_f32, top_data_f32);
        const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
        const float32x2_t max_data        = vmax_f32(max_data_top, max_data_bottom);
        *(reinterpret_cast<T *>(output.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));

        // Calculate max data index, which will be used in max unpooling.
        const uint32_t   offset_base        = offset_no_padding<T>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
        const uint32_t   offset_top         = (uint32_t)(offset_base / sizeof(T));
        const uint32_t   offset_bottom      = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
        const uint32x2_t voffset_top        = { offset_top, offset_top + 1u };
        const uint32x2_t voffset_bottom     = { offset_bottom, offset_bottom + 1u };
        const uint32x2_t tmp_indices_top    = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
        const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
        *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
    },
    input, output, indices);
}

void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
    ARM_COMPUTE_UNUSED(pooling_type);
    ARM_COMPUTE_UNUSED(exclude_padding);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if(pooling_type == PoolingType::MAX && _indices)
    {
        pooling2_nchw_maxpool_indices<float16_t>(window_input, window);
    }
    else
    {
        Iterator      input(_input, window_input);
        Iterator      output(_output, window);
        constexpr int pool_size       = 2;
        const int     pool_pad_right  = _pool_info.pad_stride_info.pad_right();
        const int     pool_pad_top    = _pool_info.pad_stride_info.pad_top();
        const int     pool_pad_left   = _pool_info.pad_stride_info.pad_left();
        const int     pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
        int           pool_stride_x   = 0;
        int           pool_stride_y   = 0;
        std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
        const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
        const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);

        const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
        const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));

        execute_window_loop(window, [&](const Coordinates & id)
        {
            float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
            float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
            float16x4_t res         = {};

            // Get power of 2 in case of l2 pooling
            if(pooling_type == PoolingType::L2)
            {
                top_data    = vmul_f16(top_data, top_data);
                bottom_data = vmul_f16(bottom_data, bottom_data);
            }

            if(pooling_type != PoolingType::MAX)
            {
                const float       scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                const float16x4_t scale_v = vdup_n_f16(scale);

                const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
                res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
            }
            else
            {
                const float16x4_t max_data = vmax_f16(top_data, bottom_data);
                res = vpmax_f16(max_data, max_data);
            }

            // Calculate square-root in case of l2 pooling
            if(pooling_type == PoolingType::L2)
            {
                res = vinv_f16(vinvsqrt_f16(res));
            }

            // Store result
            *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
        },
        input, output);
    }
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
    ARM_COMPUTE_UNUSED(window_input);
    ARM_COMPUTE_UNUSED(window);
    ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}

template <typename T>
void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
    Iterator input(_input, window_input);
    Iterator output(_output, window);

    /** NEON vector types */
    using q8x8_t    = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t   = typename wrapper::traits::neon_vector<T, 16>::type;
    using q8x8x2_t  = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
    using q16_t     = typename wrapper::traits::promote_t<T>;
    using q16x8_t   = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;

    constexpr int pool_size       = 3;
    const int     pool_pad_right  = _pool_info.pad_stride_info.pad_right();
    const int     pool_pad_top    = _pool_info.pad_stride_info.pad_top();
    const int     pool_pad_left   = _pool_info.pad_stride_info.pad_left();
    const int     pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
    int           pool_stride_x   = 0;
    int           pool_stride_y   = 0;
    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);

    const UniformQuantizationInfo &input_qinfo  = _input->info()->quantization_info().uniform();
    const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();

    const float                   requant_scale  = output_qinfo.scale / input_qinfo.scale;
    const int32_t                 requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);

    const T *const input_top_ptr    = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
    const T *const input_middle_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
    const T *const input_bottom_ptr = reinterpret_cast<const T *>(_input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto top_data    = wrapper::vloadq(input_top_ptr + input.offset());
        const auto middle_data = wrapper::vloadq(input_middle_ptr + input.offset());
        const auto bottom_data = wrapper::vloadq(input_bottom_ptr + input.offset());
        q8x8_t     fres        = {};
        q8x16_t    fqres       = {};

        if(pooling_type == PoolingType::AVG)
        {
            // Convert data to u16
            const q16x8x2_t top_data_q16    = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
            const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
            const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };

            // Calculate row sums
            const q16x8x2_t vrsum =
            {
                {
                    wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
                    wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
                }
            };
            const q16x8x2_t vrsum_shifted_1 =
            {
                {
                    wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
                    wrapper::vext_1(vrsum.val[1], vrsum.val[1])
                }
            };
            const q16x8x2_t vrsum_shifted_2 =
            {
                {
                    wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
                    wrapper::vext_2(vrsum.val[1], vrsum.val[1])
                }
            };
            // Calculate final sum
            q16x8x2_t final_sum =
            {
                {
                    wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
                    wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
                }
            };
            if(pool_stride_x == 2)
            {
                q16x8_t res =
                {
                    wrapper::vgetlane(final_sum.val[0], 0),
                    wrapper::vgetlane(final_sum.val[0], 2),
                    wrapper::vgetlane(final_sum.val[0], 4),
                    wrapper::vgetlane(final_sum.val[0], 6),
                    wrapper::vgetlane(final_sum.val[1], 0),
                    wrapper::vgetlane(final_sum.val[1], 2),
                    wrapper::vgetlane(final_sum.val[1], 4),
                    wrapper::vgetlane(final_sum.val[1], 6),
                };

                scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, res, id, 0, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                fres = wrapper::vmovn(res);
            }
            else
            {
                // Scale lower result
                scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[0], id, 0, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                // Scale upper result
                scale_vector_q16x8<q16_t, q16x8_t>(exclude_padding, final_sum.val[1], id, 8, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
            }
        }
        else
        {
            const q8x16_t max_data        = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
            const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
            const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
            const q8x16_t final_max       = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);

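            // With stride 2 only every other lane of final_max is a valid output,
            // so a byte-wise table lookup with indices {0, 2, ..., 14} gathers the
            // eight even lanes into a single 8-element vector before the store.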
            if(pool_stride_x == 2)
            {
                const q8x8x2_t      table      = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
                static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
                fres = wrapper::vtbl(table, lookup_val);
            }
            else
            {
                fqres = final_max;
            }
        }

        // Store result
        if(pool_stride_x == 1)
        {
            if(input_qinfo != output_qinfo)
            {
                fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
            }
            wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fqres);
        }
        else
        {
            if(input_qinfo != output_qinfo)
            {
                fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
            }
            wrapper::vstore(reinterpret_cast<T *>(output.ptr()), fres);
        }
    },
    input, output);
}

void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
{
    ARM_COMPUTE_UNUSED(pooling_type);
    ARM_COMPUTE_UNUSED(exclude_padding);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    Iterator input(_input, window_input);
    Iterator output(_output, window);

    const int pool_size_x     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
    const int pool_size_y     = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
    const int pool_pad_right  = _pool_info.pad_stride_info.pad_right();
    const int pool_pad_top    = _pool_info.pad_stride_info.pad_top();
    const int pool_pad_left   = _pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
    int       pool_stride_x   = 0;
    int       pool_stride_y   = 0;
    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        float16_t   res  = 0.0f;
        float16x8_t vres = vdupq_n_f16(0.0f);

        if(pooling_type != PoolingType::MAX)
        {
            // Calculate scale
            const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

            // Perform pooling
            for(int y = 0; y < pool_size_y; ++y)
            {
                int x = 0;
                for(; x <= (pool_size_x - 8); x += 8)
                {
                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
                                                                                           (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));

                    // Get power of 2 in case of l2 pooling and accumulate
                    if(pooling_type == PoolingType::L2)
                    {
                        vres = vaddq_f16(vres, vmulq_f16(data, data));
                    }
                    else
                    {
                        vres = vaddq_f16(vres, data);
                    }
                }

                // Leftover for loop
                for(; x < pool_size_x; ++x)
                {
                    float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
                                                                           + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));

                    // Get power of 2 in case of l2 pooling
                    if(pooling_type == PoolingType::L2)
                    {
                        data *= data;
                    }

                    res += data;
                }
            }

            // Reduction
            float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
            res += vget_lane_f16(tmp, 0);
            res += vget_lane_f16(tmp, 1);
            res += vget_lane_f16(tmp, 2);
            res += vget_lane_f16(tmp, 3);

            // Divide by scale
            res *= scale;
        }
        else
        {
            float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
            res = std::numeric_limits<float>::lowest();

            for(int y = 0; y < pool_size_y; ++y)
            {
                int x = 0;
                for(; x <= (pool_size_x - 8); x += 8)
                {
                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) +
                                                                                           (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
                    vres = vmaxq_f16(vres, data);
                }

                // Leftover for loop
                for(; x < pool_size_x; ++x)
                {
                    const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x())
                                                                                 + (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().y())));
                    res = std::max(res, data);
                }
            }

            float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
            res = std::max(res, vget_lane_f16(tmp, 0));
            res = std::max(res, vget_lane_f16(tmp, 1));
            res = std::max(res, vget_lane_f16(tmp, 2));
            res = std::max(res, vget_lane_f16(tmp, 3));
        }

        // Calculate square-root in case of l2 pooling
        if(pooling_type == PoolingType::L2)
        {
            res = std::sqrt(res);
        }

        // Store result
        *(reinterpret_cast<float16_t *>(output.ptr())) = res;
    },
    input, output);

#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
    ARM_COMPUTE_UNUSED(window_input);
    ARM_COMPUTE_UNUSED(window);
    ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}

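// In NHWC the channel dimension is innermost, so the 2x2 max pooling with
// indices below vectorizes across channels: eight f16 channels are processed
// per iteration, and the four samples of the pooling window are loaded at
// x/y-stride byte offsets from the same channel position.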
1277 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
pooling2_f16_nhwc_maxpool_indices(const Window & window_input,const Window & window)1278 void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window)
1279 {
1280 const int window_start_x = window.x().start();
1281 const int window_end_x = window.x().end();
1282 const int window_step_x = 8;
1283
1284 Window window_out = window;
1285 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
1286
1287 Iterator input(_input, window_input);
1288 Iterator output(_output, window_out);
1289 Iterator indices(_indices, window_out);
1290
1291 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1292 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1293
1294 int pool_stride_x = 0;
1295 int pool_stride_y = 0;
1296 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1297
1298 const int pad_right = _input->info()->padding().right;
1299 const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
1300 const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
1301
1302 execute_window_loop(window_out, [&](const Coordinates & id)
1303 {
1304 const int idx_width = id.y() * pool_stride_x;
1305 const int idx_height = id.z() * pool_stride_y;
1306 const int pool_limit_y = pool_pad_top - idx_height;
1307 const int pool_limit_x = pool_pad_left - idx_width;
1308
1309 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1310 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1311 const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
1312 (_input->info()->strides_in_bytes().z());
1313 const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
1314 (_input->info()->strides_in_bytes().z());
1315 const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
1316 (_input->info()->strides_in_bytes().z());
1317 const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
1318 (_input->info()->strides_in_bytes().z());
1319
1320 int x_off = window_start_x;
1321 for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
1322 {
1323 const auto in_x0_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset) + x_off;
1324 const auto in_x1_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset) + x_off;
1325 const auto in_x2_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset) + x_off;
1326 const auto in_x3_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset) + x_off;
1327 const auto v_x0 = vld1q_f16(in_x0_ptr);
1328 const auto v_x1 = vld1q_f16(in_x1_ptr);
1329 const auto v_x2 = vld1q_f16(in_x2_ptr);
1330 const auto v_x3 = vld1q_f16(in_x3_ptr);
1331 float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
1332 // Store result
1333 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + x_off, vres);
1334
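// Compute the candidate indices of the four window elements. offset_no_padding()
// yields the byte offset of the current element in the *unpadded* tensor, which
// is converted to an element index; the pad_right terms rebase the neighbouring
// column/row offsets (derived from the padded strides) onto the same unpadded
// coordinate system. The indices are narrowed to u16 for the vector selects
// below and widened back to u32 for storage, so this path assumes they fit in
// 16 bits.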
1335 const uint32_t offset_base = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
1336 const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
1337 const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
1338 const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
1339 const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;
1340 const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
1341 const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
1342 const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1));
1343 const uint32x4_t voffset_x1_0 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
1344 const uint32x4_t voffset_x1_1 = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 };
1345 const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1));
1346 const uint32x4_t voffset_x2_0 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
1347 const uint32x4_t voffset_x2_1 = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
1348 const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1));
1349 const uint32x4_t voffset_x3_0 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
1350 const uint32x4_t voffset_x3_1 = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };
1351 const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1));
1352 const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1);
1353 const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3);
1354 const uint16x8_t tmp_indices2 = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1);
1355                 const uint32x4_t tmp_indices3_0 = vmovl_u16(vget_low_u16(tmp_indices2));
1356                 const uint32x4_t tmp_indices3_1 = vmovl_u16(vget_high_u16(tmp_indices2));
1357                 // Store indices
1358                 vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices3_0);
1359                 vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indices3_1);
1360 }
1361
1362 // Left-overs loop
1363 for(; x_off < window_end_x; ++x_off)
1364 {
1365 const auto x0 = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset) + x_off);
1366 const auto x1 = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset) + x_off);
1367 const auto x2 = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset) + x_off);
1368 const auto x3 = *(reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset) + x_off);
1369 float16_t res = std::max(std::max(x2, x3), std::max(x0, x1));
1370
1371 // Store result
1372 *(reinterpret_cast<float16_t *>(output.ptr()) + x_off) = res;
1373
1374 const uint32_t offset_base = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
1375 const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
1376 const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
1377 const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
1378 const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;
1379 const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
1380 const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
1381 const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
1382
1383 // Store indices
1384 *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
1385 }
1386 },
1387 input, output, indices);
1388 }
1389 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1390
1391 void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1392 {
1393 ARM_COMPUTE_UNUSED(pooling_type);
1394 ARM_COMPUTE_UNUSED(exclude_padding);
1395 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1396     if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
1397     {
1398         pooling2_f16_nhwc_maxpool_indices(window_input, window);
         // The specialised kernel fully writes both the output and the indices; return
         // here so the generic path below is not redundantly re-run over the same output.
         return;
1399     }
1400 const int window_start_x = window.x().start();
1401 const int window_end_x = window.x().end();
1402 const int window_step_x = 8;
1403
1404 Window window_out = window;
1405 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
1406
1407 Iterator input(_input, window_input);
1408 Iterator output(_output, window_out);
1409
1410 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1411 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1412 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1413 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1414 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1415 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1416 int pool_stride_x = 0;
1417 int pool_stride_y = 0;
1418 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1419 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1420 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1421
1422 float16x8_t vres;
1423
1424 execute_window_loop(window_out, [&](const Coordinates & id)
1425 {
1426 const int idx_width = id.y() * pool_stride_x;
1427 const int idx_height = id.z() * pool_stride_y;
1428 const int pool_limit_y = pool_pad_top - idx_height;
1429 const int pool_limit_x = pool_pad_left - idx_width;
1430
1431 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1432 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1433 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1434 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1435
1436 int x_off = window_start_x;
1437 for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
1438 {
1439 if(pooling_type != PoolingType::MAX)
1440 {
1441 // Calculate scale
1442 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1443 pool_stride_y);
1444 const float16x8_t scale_v = vdupq_n_f16(scale);
1445
1446 // Perform pooling
1447 vres = vdupq_n_f16(0.0f);
1448 for(int y = pool_start_y; y < pool_end_y; ++y)
1449 {
1450 for(int x = pool_start_x; x < pool_end_x; ++x)
1451 {
1452 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1453 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())) + x_off);
1454
1455 // Get power of 2 in case of l2 pooling and accumulate
1456 if(pooling_type == PoolingType::L2)
1457 {
1458 vres = vaddq_f16(vres, vmulq_f16(data, data));
1459 }
1460 else
1461 {
1462 vres = vaddq_f16(vres, data);
1463 }
1464 }
1465 }
1466 // Divide by scale
1467 vres = vmulq_f16(vres, scale_v);
1468 }
1469 else
1470 {
1471 vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
1472
1473 for(int y = pool_start_y; y < pool_end_y; ++y)
1474 {
1475 for(int x = pool_start_x; x < pool_end_x; ++x)
1476 {
1477 const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) +
1478 (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z())) + x_off);
1479 vres = vmaxq_f16(vres, data);
1480 }
1481 }
1482 }
1483
1484 // Calculate square-root in case of l2 pooling
1485 if(pooling_type == PoolingType::L2)
1486 {
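                    // sqrt(x) computed as x * rsqrt(x): vrsqrteq_f16 gives a rough estimate of
                    // 1/sqrt(x) and vrsqrtsq_f16 supplies one Newton-Raphson correction factor
                    // ((3 - x*e*e) / 2), which refines the estimate before the final multiply.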
1487 float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
1488 vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
1489 }
1490
1491 // Store result
1492 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + x_off, vres);
1493 }
1494
1495 // Left-overs loop
1496 for(; x_off < window_end_x; ++x_off)
1497 {
1498 float16_t res = 0.0f;
1499
1500 if(pooling_type != PoolingType::MAX)
1501 {
1502 // Calculate scale
1503 const float16_t scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1504 pool_stride_y);
1505
1506 for(int y = pool_start_y; y < pool_end_y; ++y)
1507 {
1508 for(int x = pool_start_x; x < pool_end_x; ++x)
1509 {
1510 const float data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1511 (_input->info()->strides_in_bytes().z())) + x_off);
1512
1513 // Get power of 2 in case of l2 pooling and accumulate
1514 if(pooling_type == PoolingType::L2)
1515 {
1516 res += data * data;
1517 }
1518 else
1519 {
1520 res += data;
1521 }
1522 }
1523 }
1524
1525 // Divide by scale
1526 res *= scale;
1527 }
1528 else
1529 {
1530 res = std::numeric_limits<float>::lowest();
1531 for(int y = pool_start_y; y < pool_end_y; ++y)
1532 {
1533 for(int x = pool_start_x; x < pool_end_x; ++x)
1534 {
1535 const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1536 (_input->info()->strides_in_bytes().z())) + x_off);
1537 res = std::max(res, data);
1538 }
1539 }
1540 }
1541
1542 // Calculate square-root in case of l2 pooling
1543 if(pooling_type == PoolingType::L2)
1544 {
1545 res = std::sqrt(res);
1546 }
1547
1548 // Store result
1549 *(reinterpret_cast<float16_t *>(output.ptr()) + x_off) = res;
1550 }
1551 },
1552 input, output);
1553
1554 #else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1555 ARM_COMPUTE_UNUSED(window_input);
1556 ARM_COMPUTE_UNUSED(window);
1557 ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
1558 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1559 }
1560
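// Generic MxN F32 pooling for NCHW: each output element reduces its pool window
// by accumulating/comparing four floats at a time along x, with a scalar tail,
// and finishes with a horizontal reduction (vaddvq/vmaxvq on AArch64, pairwise
// vpadd/vpmax otherwise).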
1561 void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1562 {
1563 Iterator input(_input, window_input);
1564 Iterator output(_output, window);
1565
1566 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
1567 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
1568 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1569 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1570 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1571 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1572 int pool_stride_x = 0;
1573 int pool_stride_y = 0;
1574 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1575 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1576 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1577
1578 execute_window_loop(window, [&](const Coordinates & id)
1579 {
1580 float res = 0.0f;
1581
1582 if(pooling_type != PoolingType::MAX)
1583 {
1584 // Calculate scale
1585 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1586
1587 // Perform pooling
1588 float32x4_t vres = vdupq_n_f32(0.0f);
1589
1590 for(int y = 0; y < pool_size_y; ++y)
1591 {
1592 int x = 0;
1593 for(; x <= (pool_size_x - 4); x += 4)
1594 {
1595 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1596 (_input->info()->strides_in_bytes().y())));
1597
1598 // Get power of 2 in case of l2 pooling and accumulate
1599 if(pooling_type == PoolingType::L2)
1600 {
1601 vres = vmlaq_f32(vres, data, data);
1602 }
1603 else
1604 {
1605 vres = vaddq_f32(vres, data);
1606 }
1607 }
1608
1609 // Leftover for loop
1610 for(; x < pool_size_x; ++x)
1611 {
1612 float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1613 (_input->info()->strides_in_bytes().y())));
1614
1615 // Get power of 2 in case of l2 pooling
1616 if(pooling_type == PoolingType::L2)
1617 {
1618 data *= data;
1619 }
1620
1621 res += data;
1622 }
1623 }
1624
1625 #if defined(__aarch64__)
1626 // Reduction operation available on 64 bit architectures only
1627 res += vaddvq_f32(vres);
1628 #else // __aarch64__
1629 // Reduction
1630 float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
1631 tmp = vpadd_f32(tmp, tmp);
1632
1633 res += vget_lane_f32(tmp, 0);
1634 #endif // __aarch64__
1635 // Divide by scale
1636 res *= scale;
1637 }
1638 else
1639 {
1640 float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1641 res = std::numeric_limits<float>::lowest();
1642
1643 for(int y = 0; y < pool_size_y; ++y)
1644 {
1645 int x = 0;
1646 for(; x <= (pool_size_x - 4); x += 4)
1647 {
1648 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1649 (_input->info()->strides_in_bytes().y())));
1650 vres = vmaxq_f32(vres, data);
1651 }
1652
1653 // Leftover for loop
1654 for(; x < pool_size_x; ++x)
1655 {
1656 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
1657 (_input->info()->strides_in_bytes().y())));
1658 res = std::max(res, data);
1659 }
1660 }
1661 #if defined(__aarch64__)
1662 // Reduction operation available on 64 bit architectures only
1663 res = std::max(vmaxvq_f32(vres), res);
1664 #else // __aarch64__
1665 float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
1666 tmp = vpmax_f32(tmp, tmp);
1667
1668 res = std::max(res, vget_lane_f32(tmp, 0));
1669 #endif // __aarch64__
1670 }
1671
1672 // Calculate square-root in case of l2 pooling
1673 if(pooling_type == PoolingType::L2)
1674 {
1675 res = std::sqrt(res);
1676 }
1677
1678 // Store result
1679 *(reinterpret_cast<float *>(output.ptr())) = res;
1680 },
1681 input, output);
1682 }
1683
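// Specialised 2x2 F32 pooling for NCHW: a two-element pair is loaded from the
// top and bottom rows, combined vertically (add or max), and reduced with a
// single pairwise horizontal operation.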
1684 void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type,
1685 bool exclude_padding)
1686 {
1687 if(pooling_type == PoolingType::MAX && _indices)
1688 {
1689 pooling2_nchw_maxpool_indices<float>(window_input, window);
1690 }
1691 else
1692 {
1693 Iterator input(_input, window_input);
1694 Iterator output(_output, window);
1695 constexpr int pool_size = 2;
1696 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1697 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1698 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1699 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1700 int pool_stride_x = 0;
1701 int pool_stride_y = 0;
1702 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1703 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1704 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1705
1706 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1707 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1708
1709 execute_window_loop(window, [&](const Coordinates & id)
1710 {
1711 const auto in_top_ptr = reinterpret_cast<const float *>(input_top_ptr + input.offset());
1712 const auto in_bottom_ptr = reinterpret_cast<const float *>(input_bottom_ptr + input.offset());
1713 float32x2_t top_data = vld1_f32(in_top_ptr);
1714 float32x2_t bottom_data = vld1_f32(in_bottom_ptr);
1715 float32x2_t res = {};
1716 float final_res = 0;
1717 // Get power of 2 in case of l2 pooling
1718 if(pooling_type == PoolingType::L2)
1719 {
1720 top_data = vmul_f32(top_data, top_data);
1721 bottom_data = vmul_f32(bottom_data, bottom_data);
1722 }
1723
1724 if(pooling_type != PoolingType::MAX)
1725 {
1726 // Calculate scale
1727 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1728 const float32x2_t scale_v = vdup_n_f32(scale);
1729
1730 // Perform pooling
1731 const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
1732 res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
1733 }
1734 else
1735 {
1736 const float32x2_t max_data = vmax_f32(top_data, bottom_data);
1737 res = vpmax_f32(max_data, max_data);
1738 }
1739 final_res = vget_lane_f32(res, 0);
1740
1741 // Calculate square-root in case of l2 pooling
1742 if(pooling_type == PoolingType::L2)
1743 {
1744                 final_res = std::sqrt(final_res);
1745 }
1746
1747 // Store result
1748 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1749 },
1750 input, output);
1751 }
1752 }
1753
1754 void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1755 {
1756 Iterator input(_input, window_input);
1757 Iterator output(_output, window);
1758
1759 constexpr const int pool_size = 3;
1760 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1761 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1762 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1763 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1764 int pool_stride_x = 0;
1765 int pool_stride_y = 0;
1766 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1767 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1768 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1769
1770 const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
1771 const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
1772 const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
1773
1774 execute_window_loop(window, [&](const Coordinates & id)
1775 {
1776 float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
1777 float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
1778 float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
1779 float32x2_t res = {};
1780 float final_res = 0;
1781
1782 // Get power of 2 in case of l2 pooling
1783 if(pooling_type == PoolingType::L2)
1784 {
1785 top_data = vmulq_f32(top_data, top_data);
1786 middle_data = vmulq_f32(middle_data, middle_data);
1787 bottom_data = vmulq_f32(bottom_data, bottom_data);
1788 }
1789
1790 if(pooling_type != PoolingType::MAX)
1791 {
1792 // Calculate scale
1793 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1794 const float32x2_t scale_v = vdup_n_f32(scale);
1795
1796 // Perform pooling
1797 const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
1798 res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
1799 res = vmul_f32(vpadd_f32(res, res), scale_v);
1800 }
1801 else
1802 {
1803 const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
1804 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
1805 res = vpmax_f32(res, res);
1806 }
1807 final_res = vget_lane_f32(res, 0);
1808
1809 // Calculate square-root in case of l2 pooling
1810 if(pooling_type == PoolingType::L2)
1811 {
1812             final_res = std::sqrt(final_res);
1813 }
1814
1815 // Store result
1816 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1817 },
1818 input, output);
1819 }
1820
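// Specialised 7x7 F32 pooling for NCHW: vld2q_f32 de-interleaves eight
// consecutive floats into even/odd 4-lane vectors; the eighth, out-of-window
// element (lane 3 of val[1]) is neutralised with the reduction identity
// (0 for sums, -FLT_MAX for max) before the horizontal reduction.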
1821 void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1822 {
1823 Iterator input(_input, window_input);
1824 Iterator output(_output, window);
1825
1826 constexpr const int pool_size = 7;
1827 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1828 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1829 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1830 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1831 int pool_stride_x = 0;
1832 int pool_stride_y = 0;
1833 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1834 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
1835 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
1836
1837 std::array<const uint8_t *, pool_size> input_ptrs{ {} };
1838 for(int i = 0; i < pool_size; ++i)
1839 {
1840 input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
1841 }
1842
1843 execute_window_loop(window, [&](const Coordinates & id)
1844 {
1845 float32x2_t res = {};
1846 float final_res = 0.f;
1847 if(pooling_type != PoolingType::MAX)
1848 {
1849 // Calculate scale
1850 float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
1851 const float32x2_t scale_v = vdup_n_f32(scale);
1852
1853 // Perform pooling
1854 float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1855 // Get power of 2 in case of l2 pooling
1856 if(pooling_type == PoolingType::L2)
1857 {
1858 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1859 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1860 }
1861 float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
1862 for(int i = 1; i < pool_size; ++i)
1863 {
1864 data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1865 // Get power of 2 in case of l2 pooling
1866 if(pooling_type == PoolingType::L2)
1867 {
1868 data.val[0] = vmulq_f32(data.val[0], data.val[0]);
1869 data.val[1] = vmulq_f32(data.val[1], data.val[1]);
1870 }
1871 sum_data = vaddq_f32(sum_data, data.val[0]);
1872 sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
1873 }
1874 res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
1875 res = vmul_f32(vpadd_f32(res, res), scale_v);
1876 }
1877 else
1878 {
1879 float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[0] + input.offset()));
1880 for(int i = 1; i < pool_size; ++i)
1881 {
1882 const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(input_ptrs[i] + input.offset()));
1883 max_data = vmax2q_f32(max_data, data);
1884 }
1885 res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
1886 res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
1887 res = vpmax_f32(res, res);
1888 }
1889 final_res = vget_lane_f32(res, 0);
1890
1891 // Calculate square-root in case of l2 pooling
1892 if(pooling_type == PoolingType::L2)
1893 {
1894             final_res = std::sqrt(final_res);
1895 }
1896
1897 // Store result
1898 *(reinterpret_cast<float *>(output.ptr())) = final_res;
1899 },
1900 input, output);
1901 }
1902
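// Generic MxN F32 pooling for NHWC: channels occupy the innermost dimension, so
// the kernel vectorises over four channels per iteration (x_off) while the
// spatial pool window is walked through the y/z byte strides.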
1903 void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
1904 {
1905 if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
1906 {
1907 pooling2_f32_nhwc_maxpool_indices(window_input, window);
1908 }
1909 else
1910 {
1911 const int window_start_x = window.x().start();
1912 const int window_end_x = window.x().end();
1913 const int window_step_x = 4;
1914
1915 Window window_out = window;
1916 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
1917
1918 Iterator input(_input, window_input);
1919 Iterator output(_output, window_out);
1920
1921 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
1922 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
1923 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
1924 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
1925 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
1926 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
1927 int pool_stride_x = 0;
1928 int pool_stride_y = 0;
1929 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
1930 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
1931 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
1932
1933 float32x4_t vres;
1934
1935 execute_window_loop(window_out, [&](const Coordinates & id)
1936 {
1937 const int idx_width = id.y() * pool_stride_x;
1938 const int idx_height = id.z() * pool_stride_y;
1939 const int pool_limit_y = pool_pad_top - idx_height;
1940 const int pool_limit_x = pool_pad_left - idx_width;
1941
1942 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
1943 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
1944 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
1945 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
1946
1947 int x_off = window_start_x;
1948 for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
1949 {
1950 if(pooling_type != PoolingType::MAX)
1951 {
1952 // Calculate scale
1953 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
1954 pool_stride_y);
1955 const float32x4_t scale_v = vdupq_n_f32(scale);
1956
1957 // Perform pooling
1958 vres = vdupq_n_f32(0.0f);
1959
1960 for(int y = pool_start_y; y < pool_end_y; ++y)
1961 {
1962 for(int x = pool_start_x; x < pool_end_x; ++x)
1963 {
1964 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1965 (_input->info()->strides_in_bytes().z())) + x_off);
1966
1967 // Get power of 2 in case of l2 pooling and accumulate
1968 if(pooling_type == PoolingType::L2)
1969 {
1970 vres = vmlaq_f32(vres, data, data);
1971 }
1972 else
1973 {
1974 vres = vaddq_f32(vres, data);
1975 }
1976 }
1977 }
1978 // Divide by scale
1979 vres = vmulq_f32(vres, scale_v);
1980 }
1981 else
1982 {
1983 vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
1984 for(int y = pool_start_y; y < pool_end_y; ++y)
1985 {
1986 for(int x = pool_start_x; x < pool_end_x; ++x)
1987 {
1988 const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
1989 (_input->info()->strides_in_bytes().z())) + x_off);
1990 vres = vmaxq_f32(vres, data);
1991 }
1992 }
1993 }
1994
1995 // Calculate square-root in case of l2 pooling
1996 if(pooling_type == PoolingType::L2)
1997 {
1998 float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
1999 static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
2000 static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
2001 static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
2002 };
2003 vres = l2_res;
2004 }
2005
2006 // Store result
2007 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres);
2008 }
2009
2010 // Left-overs loop
2011 for(; x_off < window_end_x; ++x_off)
2012 {
2013 float res = 0.0f;
2014
2015 if(pooling_type != PoolingType::MAX)
2016 {
2017 // Calculate scale
2018 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2019 pool_stride_y);
2020
2021 for(int y = pool_start_y; y < pool_end_y; ++y)
2022 {
2023 for(int x = pool_start_x; x < pool_end_x; ++x)
2024 {
2025 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2026 (_input->info()->strides_in_bytes().z())) + x_off);
2027
2028 // Get power of 2 in case of l2 pooling and accumulate
2029 if(pooling_type == PoolingType::L2)
2030 {
2031 res += data * data;
2032 }
2033 else
2034 {
2035 res += data;
2036 }
2037 }
2038 }
2039
2040 // Divide by scale
2041 res *= scale;
2042 }
2043 else
2044 {
2045 res = std::numeric_limits<float>::lowest();
2046 for(int y = pool_start_y; y < pool_end_y; ++y)
2047 {
2048 for(int x = pool_start_x; x < pool_end_x; ++x)
2049 {
2050 const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2051 (_input->info()->strides_in_bytes().z())) + x_off);
2052 res = std::max(res, data);
2053 }
2054 }
2055 }
2056
2057 // Calculate square-root in case of l2 pooling
2058 if(pooling_type == PoolingType::L2)
2059 {
2060 res = std::sqrt(res);
2061 }
2062
2063 // Store result
2064 *(reinterpret_cast<float *>(output.ptr()) + x_off) = res;
2065 }
2066 },
2067 input, output);
2068 }
2069 }
2070
2071 void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window)
2072 {
2073 const int window_start_x = window.x().start();
2074 const int window_end_x = window.x().end();
2075 const int window_step_x = 4;
2076
2077 Window window_out = window;
2078 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
2079
2080 Iterator input(_input, window_input);
2081 Iterator output(_output, window_out);
2082 Iterator indices(_indices, window_out);
2083
2084 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2085 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2086
2087 int pool_stride_x = 0;
2088 int pool_stride_y = 0;
2089 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
2090
2091 float32x4_t vres;
2092 float res;
2093
2094 const int pad_right = _input->info()->padding().right;
2095 const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
2096 const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
2097
2098 execute_window_loop(window_out, [&](const Coordinates & id)
2099 {
2100 const int idx_width = id.y() * pool_stride_x;
2101 const int idx_height = id.z() * pool_stride_y;
2102 const int pool_limit_y = pool_pad_top - idx_height;
2103 const int pool_limit_x = pool_pad_left - idx_width;
2104
2105 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
2106 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
2107
2108 const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
2109 (_input->info()->strides_in_bytes().z());
2110 const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
2111 (_input->info()->strides_in_bytes().z());
2112 const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
2113 (_input->info()->strides_in_bytes().z());
2114 const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
2115 (_input->info()->strides_in_bytes().z());
2116
2117 int x_off = window_start_x;
2118 for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
2119 {
2120 const auto in_x0_ptr = reinterpret_cast<const float *>(input.ptr() + in_x0_offset);
2121 const auto in_x1_ptr = reinterpret_cast<const float *>(input.ptr() + in_x1_offset);
2122 const auto in_x2_ptr = reinterpret_cast<const float *>(input.ptr() + in_x2_offset);
2123 const auto in_x3_ptr = reinterpret_cast<const float *>(input.ptr() + in_x3_offset);
2124 const auto v_x0 = vld1q_f32(in_x0_ptr + x_off);
2125 const auto v_x1 = vld1q_f32(in_x1_ptr + x_off);
2126 const auto v_x2 = vld1q_f32(in_x2_ptr + x_off);
2127 const auto v_x3 = vld1q_f32(in_x3_ptr + x_off);
2128 vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
2129 // Store result
2130 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres);
2131
2132 const uint32_t offset_base = offset_no_padding<float>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
2133 const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off;
2134 const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
2135 const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
2136 const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
2137 const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
2138 const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
2139 const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
2140 const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
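            // Select the indices with the same comparison tree used for the values: each
            // vbslq keeps the offset of the greater operand, so tmp_indices2 ends up
            // holding, per channel, the index of the overall 2x2 maximum.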
2141 const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
2142 const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
2143 const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
2144
2145 // Store indices
2146 vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2);
2147 }
2148
2149 // Left-overs loop
2150 for(; x_off < window_end_x; ++x_off)
2151 {
2152 const auto x0 = *(reinterpret_cast<const float *>(input.ptr() + in_x0_offset) + x_off);
2153 const auto x1 = *(reinterpret_cast<const float *>(input.ptr() + in_x1_offset) + x_off);
2154 const auto x2 = *(reinterpret_cast<const float *>(input.ptr() + in_x2_offset) + x_off);
2155 const auto x3 = *(reinterpret_cast<const float *>(input.ptr() + in_x3_offset) + x_off);
2156 res = std::max(std::max(x2, x3), std::max(x0, x1));
2157
2158 // Store result
2159 *(reinterpret_cast<float *>(output.ptr()) + x_off) = res;
2160
2161 const uint32_t offset_base = offset_no_padding<float>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
2162 const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off;
2163 const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
2164 const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
2165 const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
2166 const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
2167 const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
2168 const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
2169
2170 // Store indices
2171 *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
2172 }
2173 },
2174 input, output, indices);
2175 }
2176
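// Generic MxN pooling for quantized NCHW tensors (QASYMM8/QASYMM8_SIGNED).
// Average pooling accumulates through progressively wider types
// (8-bit -> 16-bit -> 32-bit) to avoid overflow, then scales and rounds in
// float; max pooling reduces directly in the 8-bit domain.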
2177 template <typename T>
2178 void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
2179 {
2180 Iterator input(_input, window_input);
2181 Iterator output(_output, window);
2182
2183 /** NEON vector types */
2184 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
2185 using q16_t = typename wrapper::traits::promote_t<T>;
2186 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
2187 using q32_t = typename wrapper::traits::promote_t<q16_t>;
2188 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
2189
2190 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().x() : _pool_info.pool_size.width;
2191 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.height;
2192 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
2193 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2194 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2195 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
2196 int pool_stride_x = 0;
2197 int pool_stride_y = 0;
2198 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
2199 const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
2200 const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
2201
2202 const UniformQuantizationInfo &input_qinfo = _input->info()->quantization_info().uniform();
2203 const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
2204
2205 execute_window_loop(window, [&](const Coordinates & id)
2206 {
2207 T res = std::numeric_limits<T>::min();
2208
2209 if(pooling_type != PoolingType::MAX)
2210 {
2211 q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2212 q32_t sres = 0;
2213
2214 // Calculate scale
2215 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
2216
2217 // Perform pooling
2218 for(int y = 0; y < pool_size_y; ++y)
2219 {
2220 int x = 0;
2221 for(; x <= (pool_size_x - 8); x += 8)
2222 {
2223 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2224 (_input->info()->strides_in_bytes().y())));
2225
2226 const q16x8_t data_q16 = wrapper::vmovl(data);
2227 vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
2228 }
2229
2230 // Leftover for loop
2231 for(; x < pool_size_x; ++x)
2232 {
2233 T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2234 (_input->info()->strides_in_bytes().y())));
2235 sres += data;
2236 }
2237 }
2238
2239 // Reduction
2240 const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres));
2241 sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1);
2242
2243 // Divide by scale
2244 res = static_cast<T>(support::cpp11::round(sres * scale));
2245 }
2246 else
2247 {
2248 q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
2249
2250 for(int y = 0; y < pool_size_y; ++y)
2251 {
2252 int x = 0;
2253 for(; x <= (pool_size_x - 8); x += 8)
2254 {
2255 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2256 (_input->info()->strides_in_bytes().y())));
2257 vres = wrapper::vmax(vres, data);
2258 }
2259 // Leftover for loop
2260 for(; x < pool_size_x; ++x)
2261 {
2262 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
2263 (_input->info()->strides_in_bytes().y())));
2264 res = std::max(res, data);
2265 }
2266 }
2267
2268 // Reduce max
2269 vres = wrapper::vpmax(vres, vres);
2270 vres = wrapper::vpmax(vres, vres);
2271 vres = wrapper::vpmax(vres, vres);
2272
2273 // Get max value
2274 res = std::max(res, wrapper::vgetlane(vres, 0));
2275 }
2276 // Store result
2277 res = (input_qinfo != output_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, input_qinfo), output_qinfo) : res;
2278 *(reinterpret_cast<T *>(output.ptr())) = res;
2279 },
2280 input, output);
2281 }
2282
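// Generic MxN pooling for quantized NHWC tensors: sixteen channels are processed
// per iteration, with an additional 8-channel step for MAX pooling and a scalar
// left-over loop. If the input and output quantization infos differ, results are
// requantized on the fly using requant_qinfo below.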
2283 template <typename T>
2284 void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
2285 {
2286 const int window_start_x = window.x().start();
2287 const int window_end_x = window.x().end();
2288 const int window_step_x = 16;
2289 const int window_half_step_x = window_step_x / 2;
2290
2291 Window window_out = window;
2292 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
2293
2294 Iterator input(_input, window_input);
2295 Iterator output(_output, window_out);
2296
2297 using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
2298 using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
2299 using q16_t = typename wrapper::traits::promote_t<T>;
2300 using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
2301 using q32_t = typename wrapper::traits::promote_t<q16_t>;
2302 using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
2303
2304 const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width;
2305 const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
2306 const int pool_pad_right = _pool_info.pad_stride_info.pad_right();
2307 const int pool_pad_top = _pool_info.pad_stride_info.pad_top();
2308 const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
2309 const int pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
2310
2311 int pool_stride_x = 0;
2312 int pool_stride_y = 0;
2313 std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
2314 const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
2315 const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
2316
2317 const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
2318 const UniformQuantizationInfo input_qinfo = _input->info()->quantization_info().uniform();
2319 const UniformQuantizationInfo output_qinfo = _output->info()->quantization_info().uniform();
2320
2321 const float quant_rescale = output_qinfo.scale / input_qinfo.scale;
2322 // "new_offset" doesn't have to consider the "half_scale_v" in its computation
2323 // With a requantization performed in a single step there won't be uncertainties introduced
2324 const int32_t new_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / quant_rescale);
2325
2326 const float requant_scale = output_qinfo.scale / input_qinfo.scale;
2327 const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale);
2328 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
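    // Single-step requantization: with r = in_scale * (q_in - in_offset) and
    // q_out = r / out_scale + out_offset, substitution gives
    //   q_out = q_in / requant_scale + (out_offset - in_offset / requant_scale)
    // where requant_scale = out_scale / in_scale. Quantizing the raw input value
    // with requant_qinfo therefore performs dequantize + quantize in one pass.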
2329
2330 execute_window_loop(window_out, [&](const Coordinates & id)
2331 {
2332 const int idx_width = id.y() * pool_stride_x;
2333 const int idx_height = id.z() * pool_stride_y;
2334 const int pool_limit_y = pool_pad_top - idx_height;
2335 const int pool_limit_x = pool_pad_left - idx_width;
2336
2337 const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
2338 const int pool_end_y = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
2339 const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
2340 const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
2341
2342 int x_off = window_start_x;
2343 for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
2344 {
2345 if(pooling_type != PoolingType::MAX)
2346 {
2347 q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2348 q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2349 q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2350 q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
2351
2352 // Calculate scale
2353 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2354 pool_stride_y);
2355
2356 // Perform pooling
2357 for(int y = pool_start_y; y < pool_end_y; ++y)
2358 {
2359 for(int x = pool_start_x; x < pool_end_x; ++x)
2360 {
2361 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2362 (_input->info()->strides_in_bytes().z())) + x_off);
2363
2364 const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
2365 const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
2366 vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
2367 vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
2368 vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
2369 vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
2370 }
2371 }
2372
2373 if(input_qinfo != output_qinfo)
2374 {
2375 const float32x4x4_t vres =
2376 {
2377 {
2378 vcvtq_f32_q32(vres1),
2379 vcvtq_f32_q32(vres2),
2380 vcvtq_f32_q32(vres3),
2381 vcvtq_f32_q32(vres4),
2382 }
2383 };
2384 const auto requantized_output = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
2385 // Store result
2386 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, wrapper::vgetlow(requantized_output));
2387 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, wrapper::vgethigh(requantized_output));
2388 }
2389 else
2390 {
2391 const float32x4_t scale_v = vdupq_n_f32(scale);
2392 // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
2393 vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
2394 vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
2395 vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
2396 vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
2397
2398 const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
2399 const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
2400 // Store result
2401 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, res1);
2402 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, res2);
2403 }
2404 }
2405 else
2406 {
2407 q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
2408
2409 for(int y = pool_start_y; y < pool_end_y; ++y)
2410 {
2411 for(int x = pool_start_x; x < pool_end_x; ++x)
2412 {
2413 const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2414 (_input->info()->strides_in_bytes().z())) + x_off);
2415 vres = wrapper::vmax(vres, data);
2416 }
2417 }
2418
2419 // Store result
2420 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
2421 requant_qinfo) :
2422 vres);
2423 }
2424 }
2425
2426 if(pooling_type == PoolingType::MAX)
2427 {
2428 for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
2429 {
2430 q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
2431 for(int y = pool_start_y; y < pool_end_y; ++y)
2432 {
2433 for(int x = pool_start_x; x < pool_end_x; ++x)
2434 {
2435 const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2436 (_input->info()->strides_in_bytes().z())) + x_off);
2437 vres = wrapper::vmax(vres, data);
2438 }
2439 }
2440
2441 // Store result
2442 wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off,
2443 (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
2444 }
2445 }
2446
2447 // Left-overs loop
2448 for(; x_off < window_end_x; ++x_off)
2449 {
2450 if(pooling_type != PoolingType::MAX)
2451 {
2452 q32_t res = static_cast<q32_t>(0.f);
2453
2454 // Calculate scale
2455 const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
2456 pool_stride_y);
2457
2458 // Perform pooling
2459 for(int y = pool_start_y; y < pool_end_y; ++y)
2460 {
2461 for(int x = pool_start_x; x < pool_end_x; ++x)
2462 {
2463 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2464 (_input->info()->strides_in_bytes().z())) + x_off);
2465 res += data;
2466 }
2467 }
2468
2469 if(input_qinfo != output_qinfo)
2470 {
2471 const float res_f = static_cast<float>(res);
2472 const float new_scale = quant_rescale / scale;
2473 const auto requantized_output = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
2474
2475 // Store result
2476 *(reinterpret_cast<T *>(output.ptr()) + x_off) = requantized_output;
2477 }
2478 else
2479 {
2480 // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
2481 res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
2482
2483 // Store result
2484 *(reinterpret_cast<T *>(output.ptr()) + x_off) = res;
2485 }
2486 }
2487 else
2488 {
2489 T res = std::numeric_limits<T>::min();
2490
2491 for(int y = pool_start_y; y < pool_end_y; ++y)
2492 {
2493 for(int x = pool_start_x; x < pool_end_x; ++x)
2494 {
2495 const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
2496 (_input->info()->strides_in_bytes().z())) + x_off);
2497 res = std::max(res, data);
2498 }
2499 }
2500
2501 // Store result
2502 if(input_qinfo != output_qinfo)
2503 {
2504 const float res_f = static_cast<float>(res);
2505 *(reinterpret_cast<T *>(output.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
2506 }
2507 else
2508 {
2509 *(reinterpret_cast<T *>(output.ptr()) + x_off) = res;
2510 }
2511 }
2512 }
2514     },
2515 input, output);
2516 }
2517
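// validate() performs the same checks as kernel configuration: it derives the
// pooled output dimensions via scaled_dimensions() and dry-runs the window
// configuration on cloned tensor infos, so support can be checked without
// configuring a kernel.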
2518 Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
2519 {
2520 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
2521
2522 unsigned int pooled_w = 0;
2523 unsigned int pooled_h = 0;
2524 unsigned int num_elems_processed_per_iteration = 0;
2525 BorderSize border_size(0);
2526
2527 const bool is_global_pooling = pool_info.is_global_pooling;
2528 unsigned int pool_size_x = 0;
2529 unsigned int pool_size_y = 0;
2530
2531 // Get data layout
2532 const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->data_layout() : pool_info.data_layout;
2533 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
2534 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
2535
2536 pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size.width;
2537 pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size.height;
2538
2539 // Validate pool info before calling scaled_dimensions
2540 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
2541
2542 // Check output dimensions
2543 std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
2544 input->dimension(idx_height),
2545 pool_size_x,
2546 pool_size_y,
2547 pool_info.pad_stride_info);
2548
2549 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, indices, Size2D(pool_size_x, pool_size_y)));
2550 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
2551 (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
2552 pool_size_x, pool_size_y)
2553 .first);
2554
2555 return Status{};
2556 }
2557
2558 void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
2559 {
2560 ARM_COMPUTE_UNUSED(info);
2561 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
2562 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
2563 ARM_COMPUTE_ERROR_ON(_func == nullptr);
2564
2565 const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first;
2566 const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second;
2567 const unsigned int pool_size = _pool_info.pool_size.width;
2568 const bool exclude_padding = _pool_info.exclude_padding;
2569
2570 Window window_input(window);
2571 if(_data_layout == DataLayout::NCHW)
2572 {
2573         // Set the input window step in the x and y directions
2574 unsigned int window_x_inc = 0;
2575 switch(_input->info()->data_type())
2576 {
2577 case DataType::QASYMM8:
2578 case DataType::QASYMM8_SIGNED:
2579 {
2580 window_x_inc = pool_stride_x;
2581 if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
2582 {
2583 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
2584 }
2585 break;
2586 }
2587
2588 case DataType::F16:
2589 case DataType::F32:
2590 {
2591 window_x_inc = pool_stride_x;
2592 break;
2593 }
2594 default:
2595 {
2596 ARM_COMPUTE_ERROR("Not supported");
2597 }
2598 }
2599 window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
2600 window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
2601 }
2602 else
2603 {
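        // NHWC: the kernel iterates over channels internally, so collapse the x
        // dimension to a single step and let the y/z window dimensions walk the
        // input's width/height using the pool strides.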
2604 window_input.set(Window::DimX, Window::Dimension(0, 1, 1));
2605 window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
2606 window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
2607 }
2608
2609 // Run function
2610 (this->*_func)(window_input, window, _pool_info.pool_type, exclude_padding);
2611 }
2612 } // namespace arm_compute
2613