/*
 * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H
#define SRC_CORE_NEON_KERNELS_QUANTIZED_H

#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/PoolingHelpers.h"
#include <arm_neon.h>

namespace arm_compute
{
namespace cpu
{
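/** Quantized MxN pooling (max / average) for NHWC tensors, templated on the 8-bit element type (uint8_t or int8_t).
 *
 * Sums are accumulated in widened integer registers, 16 channels per vector
 * iteration (with a half-step and a scalar left-overs loop for the tail), and
 * the result is requantized when source and destination quantization differ.
 */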
template <typename T>
void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);

    const int window_start_x     = window.x().start();
    const int window_end_x       = window.x().end();
    const int window_step_x      = 16;
    const int window_half_step_x = window_step_x / 2;

    Window window_out = window;
    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator in(src, window_src);
    Iterator out(dst0, window_out);

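    // Widened accumulator types: 8-bit data is promoted to 16-bit and then to
    // 32-bit so the pooling sums cannot overflow.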
    using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
    using q16_t   = typename wrapper::traits::promote_t<T>;
    using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q32_t   = typename wrapper::traits::promote_t<q16_t>;
    using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;

    const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
    const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
    const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();

    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);

    const float32x4_t             half_scale_v = vdupq_n_f32(0.5f);
    const UniformQuantizationInfo src_qinfo    = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo dst_qinfo    = dst0->info()->quantization_info().uniform();

    const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
    // "new_offset" does not need to account for "half_scale_v" in its computation:
    // the requantization is performed in a single step, so no extra rounding error is introduced.
    const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);

    const float   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
    const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
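    // The requantization parameters above follow from composing dequantization
    // with quantization in a single step:
    //   q_dst = (q_src - src_offset) * src_scale / dst_scale + dst_offset
    //         = q_src / requant_scale + (dst_offset - src_offset / requant_scale)
    //         = q_src / requant_scale + requant_offset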

    execute_window_loop(window_out, [&](const Coordinates &id)
    {
        const int idx_width    = id.y() * pool_stride_x;
        const int idx_height   = id.z() * pool_stride_y;
        const int pool_limit_y = pool_pad_top - idx_height;
        const int pool_limit_x = pool_pad_left - idx_width;

        const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
        const int pool_end_y   = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
        const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
        const int pool_end_x   = std::min(pool_size_x, window_src.y().end() + pool_limit_x);

        int x_off = window_start_x;
        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
        {
            if(pool_info.pool_type != PoolingType::MAX)
            {
                q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0), wrapper::traits::vector_128_tag{});
                q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0), wrapper::traits::vector_128_tag{});
                q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0), wrapper::traits::vector_128_tag{});
                q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0), wrapper::traits::vector_128_tag{});

                // Calculate scale
                const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y,
                                                               upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

                // Perform pooling
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
                                                                                         (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);

                        const q16x8_t data_q16  = wrapper::vmovl(wrapper::vgetlow(data));
                        const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
                        vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
                        vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
                        vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
                        vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
                    }
                }

                if(src_qinfo != dst_qinfo)
                {
                    const float32x4x4_t vres =
                    {
                        {
                            vcvtq_f32_q32(vres1),
                            vcvtq_f32_q32(vres2),
                            vcvtq_f32_q32(vres3),
                            vcvtq_f32_q32(vres4),
                        }
                    };
                    const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
                    // Store result
                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
                }
                else
                {
                    const float32x4_t scale_v = vdupq_n_f32(scale);
                    // Multiply by scale (the reciprocal of the pooling-region size) and add 0.5f to round to nearest instead of rounding towards zero
                    vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
                    vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
                    vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
                    vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));

                    const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
                    const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
                    // Store result
                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
                }
            }
            else
            {
                q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});

                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
                                                                                         (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);
                        vres = wrapper::vmax(vres, data);
                    }
                }

                // Store result
                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
                                (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres), requant_qinfo) : vres);
            }
        }

        if(pool_info.pool_type == PoolingType::MAX)
        {
            for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
            {
                q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
                                                                                       (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);
                        vres = wrapper::vmax(vres, data);
                    }
                }

                // Store result
                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
                                (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
            }
        }

        // Left-overs loop
        for(; x_off < window_end_x; ++x_off)
        {
            if(pool_info.pool_type != PoolingType::MAX)
            {
                q32_t res = 0;

                // Calculate scale
                const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y,
                                                               upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

                // Perform pooling
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
                                                                     (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);
                        res += data;
                    }
                }

                if(src_qinfo != dst_qinfo)
                {
                    const float res_f           = static_cast<float>(res);
                    const float new_scale       = quant_rescale / scale;
                    const auto  requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));

                    // Store result
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
                }
                else
                {
                    // Multiply by scale and add 0.5f to round to nearest instead of rounding towards zero
                    res = static_cast<T>(0.5f + static_cast<float>(res) * scale);

                    // Store result
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
                }
            }
            else
            {
                T res = std::numeric_limits<T>::min();

                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
                                                                     (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);
                        res = std::max(res, data);
                    }
                }

                // Store result
                if(src_qinfo != dst_qinfo)
                {
                    const float res_f = static_cast<float>(res);
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
                }
                else
                {
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
                }
            }
        }

    },
    in, out);
}

#if defined(ENABLE_NCHW_KERNELS)
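/** Scale each 16-bit lane of @p v by the reciprocal of its pooling-region size.
 *
 * Each lane corresponds to a different output column, so the region size (and
 * therefore the scale) is recomputed per lane, honouring padding exclusion.
 */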
template <typename T, typename TVec>
inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
                               const int pool_size, const int upper_bound_w, const int upper_bound_h,
                               const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    int       start_x = (id.x() + id_offset) * stride_x - pad_x;
    int       start_y = id.y() * stride_y - pad_y;
    const int end_y   = std::min(start_y + pool_size, upper_bound_h);
    if(exclude_padding)
    {
        start_y = std::max(0, start_y);
    }

    std::array<T, 8> elems =
    {
        {
            wrapper::vgetlane(v, 0),
            wrapper::vgetlane(v, 1),
            wrapper::vgetlane(v, 2),
            wrapper::vgetlane(v, 3),
            wrapper::vgetlane(v, 4),
            wrapper::vgetlane(v, 5),
            wrapper::vgetlane(v, 6),
            wrapper::vgetlane(v, 7),
        }
    };

    for(auto &el : elems)
    {
        int       c_start_x = start_x;
        const int end_x     = std::min(c_start_x + pool_size, upper_bound_w);
        if(exclude_padding)
        {
            c_start_x = std::max(0, c_start_x);
        }
        float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
        el *= scale;
        start_x += step * stride_x;
    }

    v = wrapper::vsetlane(elems[0], v, 0);
    v = wrapper::vsetlane(elems[1], v, 1);
    v = wrapper::vsetlane(elems[2], v, 2);
    v = wrapper::vsetlane(elems[3], v, 3);
    v = wrapper::vsetlane(elems[4], v, 4);
    v = wrapper::vsetlane(elems[5], v, 5);
    v = wrapper::vsetlane(elems[6], v, 6);
    v = wrapper::vsetlane(elems[7], v, 7);
}
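/** Load 16 elements, substituting @p fval for any lane that falls in the padding region.
 *
 * @p x and @p y are coordinates in the padded input space; lanes outside
 * [pad_l, srcw + pad_l) x [pad_t, srch + pad_t) are filled with @p fval.
 */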
template <typename T>
auto load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval)
{
    ARM_COMPUTE_UNUSED(pad_b, pad_r);
    T vec[16];
    // Handle reading a row that lies partially or fully outside the tensor
    const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
    for(int i = 0; i < 16; i++)
    {
        if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
        {
            vec[i] = *(ptr + i);
        }
        else
        {
            vec[i] = fval;
        }
    }
    return wrapper::vloadq(vec);
}
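/** Store up to 16 results without writing past the end of the destination row.
 *
 * When @p deinterleave is true, @p lower holds the even output columns and
 * @p upper the odd ones (the stride-1 fast path), so the two vectors are
 * interleaved on store.
 */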
template <typename T, typename V, bool deinterleave>
inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &upper, T *ptr)
{
    if(deinterleave)
    {
        for(int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i)
        {
            *(ptr + i * 2) = lower[i];
        }
        for(int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i)
        {
            *(ptr + 1 + i * 2) = upper[i];
        }
    }
    else
    {
        for(int i = 0; i < 8 && (i + x) < dst_w; ++i)
        {
            *(ptr + i) = lower[i];
        }
        for(int i = 0; i < 8 && (i + x + 8) < dst_w; ++i)
        {
            *(ptr + i + 8) = upper[i];
        }
    }
}
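/** Store up to 8 results, clamped to the destination row width. */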
template <typename T, typename V>
inline void write8_boundary_aware(int x, int dst_w, const V &v, T *ptr)
{
    for(int i = 0; i < 8 && (i + x) < dst_w; ++i)
    {
        *(ptr + i) = v[i];
    }
}

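/** Quantized 2x2 pooling (max / average) for NCHW tensors.
 *
 * Two input rows are processed per iteration; for stride_x == 1 a shifted copy
 * of the row sums provides the odd output columns, so up to 16 outputs are
 * produced at once.
 */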
template <typename T>
void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    /** SIMD vector types */
    using q8x8_t    = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t   = typename wrapper::traits::neon_vector<T, 16>::type;
    using q16_t     = typename wrapper::traits::promote_t<T>;
    using q16x4_t   = typename wrapper::traits::neon_vector<q16_t, 4>::type;
    using q16x8_t   = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;

    constexpr int pool_size       = 2;
    int           pool_stride_x   = 0;
    int           pool_stride_y   = 0;
    const int     pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int     pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int     pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int     pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    const T *const src_top_ptr    = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
    const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
    const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
    const UniformQuantizationInfo src_qinfo            = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo dst_qinfo            = dst0->info()->quantization_info().uniform();
    const bool                    have_different_qinfo = src_qinfo != dst_qinfo;

    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
    const int                     src_w          = src->info()->dimension(0);
    const int                     src_h          = src->info()->dimension(1);
    const int                     dst_w          = dst0->info()->dimension(0);

    const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? std::numeric_limits<T>::min() : T(0);

    execute_window_loop(window, [&](const Coordinates &id)
    {
        const auto x_val   = id.x() * pool_stride_x;
        const auto y_val_0 = id.y() * pool_stride_y;
        const auto y_val_1 = (id.y() * pool_stride_y) + 1;

        auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                              x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
        auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                                 x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);

        q8x8_t lower_res = {};
        q8x8_t upper_res = {};

        if(pool_info.pool_type != PoolingType::MAX)
        {
            const q16x8x2_t top_data_q16    = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
            const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };

            // Add rows
            const q16x8x2_t vrsum =
            {
                {
                    wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
                    wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
                }
            };

            // Pair-wise add row data
            const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
            const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));

            q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);

            // Scale lower result
            scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x,
                                               pool_size, upper_bound_w, upper_bound_h,
                                               pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
            lower_res = wrapper::vmovn(res_lower);

            // Compute upper result for stride_x == 1
            if(pool_stride_x == 1)
            {
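                // With stride 1, the pooling windows of adjacent output columns
                // overlap by one element; vext shifts the row sums left by one
                // lane so the same pair-wise adds yield the odd output columns.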
                // Shifted row sum
                const q16x8x2_t vrsum_shifted =
                {
                    {
                        wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
                        wrapper::vext_1(vrsum.val[1], vrsum.val[1])
                    }
                };

                // Pair-wise add shifted row
                q16x8_t res_upper = wrapper::vcombine(
                                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
                                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));

                // Scale upper result
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                upper_res = wrapper::vmovn(res_upper);
            }
        }
        else
        {
            const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
            lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
            if(pool_stride_x == 1)
            {
                const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
                upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
            }
        }

        if(have_different_qinfo)
        {
            const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
            lower_res = wrapper::vgetlow(requantized_dst);
            upper_res = wrapper::vgethigh(requantized_dst);
        }
        auto out_ptr = reinterpret_cast<T *>(out.ptr());
        // Store result
        if(pool_stride_x == 1)
        {
            write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr);
        }
        else
        {
            write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr);
        }
    },
    in, out);
}
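/** Quantized 3x3 pooling (max / average) for NCHW tensors.
 *
 * Three input rows are combined, then two shifted copies of the row sums (or
 * row maxima) cover the 3-wide horizontal extent. For stride_x == 2 the even
 * lanes are extracted to form the output.
 */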
template <typename T>
void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    /** SIMD vector types */
    using q8x8_t    = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t   = typename wrapper::traits::neon_vector<T, 16>::type;
    using q8x8x2_t  = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
    using q16_t     = typename wrapper::traits::promote_t<T>;
    using q16x8_t   = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;

    constexpr int pool_size       = 3;
    const int     pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int     pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int     pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int     pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int           pool_stride_x   = 0;
    int           pool_stride_y   = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);

    const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();

    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);

    const T *const src_top_ptr    = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
    const T *const src_middle_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
    const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));

    const int src_w      = src->info()->dimension(0);
    const int src_h      = src->info()->dimension(1);
    const T   fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
    const int dst_w      = dst0->info()->dimension(0);

    execute_window_loop(window, [&](const Coordinates &id)
    {
        const auto x_val   = id.x() * pool_stride_x;
        const auto y_val_0 = id.y() * pool_stride_y;
        const auto y_val_1 = (id.y() * pool_stride_y) + 1;
        const auto y_val_2 = (id.y() * pool_stride_y) + 2;

        auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                              x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
        auto middle_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                                 x_val, y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value);
        auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                                 x_val, y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);

        q8x8_t  fres  = {};
        q8x16_t fqres = {};

        if(pool_info.pool_type == PoolingType::AVG)
        {
            // Convert data to 16-bit
            const q16x8x2_t top_data_q16    = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
            const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
            const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };

            // Calculate row sums
            const q16x8x2_t vrsum =
            {
                {
                    wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
                    wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
                }
            };
            const q16x8x2_t vrsum_shifted_1 =
            {
                {
                    wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
                    wrapper::vext_1(vrsum.val[1], vrsum.val[1])
                }
            };
            const q16x8x2_t vrsum_shifted_2 =
            {
                {
                    wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
                    wrapper::vext_2(vrsum.val[1], vrsum.val[1])
                }
            };
            // Calculate final sum
            q16x8x2_t final_sum =
            {
                {
                    wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
                    wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
                }
            };
            if(pool_stride_x == 2)
            {
                q16x8_t res =
                {
                    wrapper::vgetlane(final_sum.val[0], 0),
                    wrapper::vgetlane(final_sum.val[0], 2),
                    wrapper::vgetlane(final_sum.val[0], 4),
                    wrapper::vgetlane(final_sum.val[0], 6),
                    wrapper::vgetlane(final_sum.val[1], 0),
                    wrapper::vgetlane(final_sum.val[1], 2),
                    wrapper::vgetlane(final_sum.val[1], 4),
                    wrapper::vgetlane(final_sum.val[1], 6),
                };

                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                fres = wrapper::vmovn(res);
            }
            else
            {
                // Scale lower result
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                // Scale upper result
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
            }
        }
        else
        {
            const q8x16_t max_data        = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
            const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
            const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
            const q8x16_t final_max       = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);

            if(pool_stride_x == 2)
            {
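                // The table lookup with indices {0, 2, ..., 14} selects every
                // second lane of final_max, implementing the stride-2 horizontal
                // subsampling in a single instruction.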
                const q8x8x2_t      table      = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
                static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
                fres = wrapper::vtbl(table, lookup_val);
            }
            else
            {
                fqres = final_max;
            }
        }

        // Store result
        if(pool_stride_x == 1)
        {
            if(src_qinfo != dst_qinfo)
            {
                fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
            }
            write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr()));
        }
        else
        {
            if(src_qinfo != dst_qinfo)
            {
                fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
            }
            write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr()));
        }
    },
    in, out);
}
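/** Generic quantized MxN pooling (max / average) for NCHW tensors.
 *
 * Computes one output element per window iteration with scalar arithmetic,
 * substituting the fill value for reads that land in the padding region.
 */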
template <typename T>
void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    /** SIMD vector types */
    using q16_t = typename wrapper::traits::promote_t<T>;
    using q32_t = typename wrapper::traits::promote_t<q16_t>;

    const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
    const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
    const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int       pool_stride_x   = 0;
    int       pool_stride_y   = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);

    const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
    const int                      src_w     = src->info()->dimension(0);
    const int                      src_h     = src->info()->dimension(1);
    const T   fill_value        = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
    const int stridex_in_bytes  = static_cast<int>(src->info()->strides_in_bytes().x());
    const int stridey_in_bytes  = static_cast<int>(src->info()->strides_in_bytes().y());

    execute_window_loop(window, [&](const Coordinates &id)
    {
        T res = std::numeric_limits<T>::min();

        if(pool_info.pool_type != PoolingType::MAX)
        {
            q32_t sres = 0;

            // Calculate scale
            const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y,
                                                           upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

            // Perform pooling
            for(int y = 0; y < pool_size_y; ++y)
            {
                for(int x = 0; x < pool_size_x; ++x)
                {
                    const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);

                    const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
                    const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
                    const T   data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
                    sres += data;
                }
            }
            // Multiply by scale (1 / pooling-region size) and round to nearest
            res = static_cast<T>(support::cpp11::round(sres * scale));
        }
        else
        {
            for(int y = 0; y < pool_size_y; ++y)
            {
                for(int x = 0; x < pool_size_x; ++x)
                {
                    const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);

                    const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
                    const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
                    const T   data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
                    res = std::max(res, data);
                }
            }
        }
        // Store result
        res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) : res;
        *(reinterpret_cast<T *>(out.ptr())) = res;
    },
    in, out);
}
#endif /* defined(ENABLE_NCHW_KERNELS) */
} // namespace cpu
} // namespace arm_compute

#endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H