• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/core/NEON/kernels/NEScaleKernel.h"
25 
26 #include "arm_compute/core/Helpers.h"
27 #include "arm_compute/core/Window.h"
28 #include "arm_compute/core/utils/misc/Utility.h"
29 #include "src/core/AccessWindowStatic.h"
30 #include "src/core/CPP/Validate.h"
31 #include "src/core/NEON/wrapper/wrapper.h"
32 #include "src/core/helpers/AutoConfiguration.h"
33 #include "src/core/helpers/ScaleHelpers.h"
34 #include "src/core/helpers/WindowHelpers.h"
35 #include "src/core/utils/ScaleUtils.h"
36 #include "support/Rounding.h"
37 
38 #include <arm_neon.h>
39 #include <map>
40 
41 namespace arm_compute
42 {
43 namespace
44 {
/** Blend four samples using bilinear interpolation weights.
 *
 * The callers in this file pass the samples located at (x, y), (x + 1, y),
 * (x, y + 1) and (x + 1, y + 1), in that order.
 *
 * @param[in] a00    Sample weighted by (1 - dx_val) * (1 - dy_val).
 * @param[in] a01    Sample weighted by dx_val * (1 - dy_val).
 * @param[in] a10    Sample weighted by (1 - dx_val) * dy_val.
 * @param[in] a11    Sample weighted by dx_val * dy_val.
 * @param[in] dx_val Horizontal interpolation weight.
 * @param[in] dy_val Vertical interpolation weight.
 *
 * @return The weighted sum of the four samples.
 */
inline float compute_bilinear(float a00, float a01, float a10, float a11, float dx_val, float dy_val)
{
    const float inv_dx = 1.0f - dx_val;
    const float inv_dy = 1.0f - dy_val;

    // Accumulate the contributions in argument order (a00, a01, a10, a11)
    float blended = a00 * (inv_dx * inv_dy);
    blended += a01 * (dx_val * inv_dy);
    blended += a10 * (inv_dx * dy_val);
    blended += a11 * (dx_val * dy_val);
    return blended;
}
56 
validate_arguments(const ITensorInfo * input,const ITensorInfo * dx,const ITensorInfo * dy,const ITensorInfo * offsets,ITensorInfo * output,const ScaleKernelInfo & info)57 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
58                           const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info)
59 {
60     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
61     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
62     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
63     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
64     ARM_COMPUTE_RETURN_ERROR_ON(output == input);
65     ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
66     ARM_COMPUTE_UNUSED(info.constant_border_value);
67     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported");
68 
69     const DataLayout data_layout   = input->data_layout();
70     const auto       width_index   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
71     const auto       height_index  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
72     const auto       output_width  = output->dimension(width_index);
73     const auto       output_height = output->dimension(height_index);
74     ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0);
75     ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0);
76 
77     if(info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
78     {
79         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
80     }
81 
82     if(info.interpolation_policy == InterpolationPolicy::BILINEAR)
83     {
84         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
85         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
86         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
87     }
88 
89     ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
90 
91     if(info.interpolation_policy == InterpolationPolicy::AREA)
92     {
93         ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW);
94         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
95     }
96 
97     return Status{};
98 }
99 } // namespace
100 
NEScaleKernel()101 NEScaleKernel::NEScaleKernel()
102     : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_mode(), _constant_border_value(PixelValue()), _sampling_offset(0),
103       _align_corners(false)
104 {
105 }
106 
configure(const ITensor * input,const ITensor * dx,const ITensor * dy,const ITensor * offsets,ITensor * output,const ScaleKernelInfo & info)107 void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets,
108                               ITensor *output, const ScaleKernelInfo &info)
109 {
110     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
111     // Perform validation step
112     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
113                                                   dx != nullptr ? dx->info() : nullptr,
114                                                   dy != nullptr ? dy->info() : nullptr,
115                                                   offsets != nullptr ? offsets->info() : nullptr,
116                                                   output->info(),
117                                                   info));
118 
119     // Get data layout and width/height indices
120     const DataLayout data_layout = input->info()->data_layout();
121     const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
122     const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
123 
124     _input                 = input;
125     _output                = output;
126     _offsets               = offsets;
127     _dx                    = dx;
128     _dy                    = dy;
129     _policy                = info.interpolation_policy;
130     _border_mode           = info.border_mode;
131     _constant_border_value = info.constant_border_value;
132     _align_corners         = info.align_corners;
133 
134     if(info.sampling_policy == SamplingPolicy::CENTER)
135     {
136         _sampling_offset = 0.5f;
137     }
138 
139     // Compute the ratio between source width/height and destination width/height
140     const auto wr = scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), _align_corners);
141     const auto hr = scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), _align_corners);
142 
143     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
144     const auto policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy;
145 
146     if(_border_mode == BorderMode::UNDEFINED)
147     {
148         _border_mode           = BorderMode::CONSTANT;
149         _constant_border_value = PixelValue();
150     }
151     std::string function_to_call("scale_");
152     function_to_call += string_from_data_type(_input->info()->data_type()) + "_";
153     function_to_call += string_from_data_layout(_input->info()->data_layout()) + "_";
154     function_to_call += string_from_interpolation_policy(policy_to_use);
155 
156     static std::map<std::string, ScaleFunctionPtr> map_function =
157     {
158         { "scale_U8_NCHW_AREA_CONSTANT", &NEScaleKernel::scale_area_nchw_u8 },
159 
160         { "scale_U8_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw<uint8_t> },
161         { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<uint8_t> },
162 
163         { "scale_U8_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc<uint8_t> },
164         { "scale_U8_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<uint8_t> },
165 
166         { "scale_QASYMM8_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm<uint8_t> },
167         { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<uint8_t> },
168 
169         { "scale_QASYMM8_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm<uint8_t> },
170         { "scale_QASYMM8_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<uint8_t> },
171 
172         { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm<int8_t> },
173         { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<uint8_t> },
174 
175         { "scale_QASYMM8_SIGNED_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm<int8_t> },
176         { "scale_QASYMM8_SIGNED_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<uint8_t> },
177 
178         { "scale_S16_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw<int16_t> },
179         { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<uint16_t> },
180 
181         { "scale_S16_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc<int16_t> },
182         { "scale_S16_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<uint16_t> },
183 
184 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
185         { "scale_F16_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw<float16_t> },
186         { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<uint16_t> },
187 
188         { "scale_F16_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc<float16_t> },
189         { "scale_F16_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<uint16_t> },
190 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
191 
192         { "scale_F32_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw<float> },
193         { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<float> },
194 
195         { "scale_F32_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc<float> },
196         { "scale_F32_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<float> },
197     };
198     auto it = map_function.find(function_to_call);
199     if(it != map_function.end())
200     {
201         _func = it->second;
202     }
203 
204     // Configure window
205     Window      win = calculate_max_window(*output->info(), Steps());
206     Coordinates coord;
207     coord.set_num_dimensions(output->info()->num_dimensions());
208     output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
209     INEKernel::configure(win);
210 }
211 
212 template <typename T>
scale_nearest_nchw(const Window & window)213 void NEScaleKernel::scale_nearest_nchw(const Window &window)
214 {
215     const size_t in_stride_x = _input->info()->dimension(0) + _input->info()->padding().left + _input->info()->padding().right;
216 
217     // Compute the ratio between source height and destination height
218     const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
219 
220     // Don't increment in X and Y direction for the input tensor
221     // A pointer to the start of this plane is needed as base for the precomputed offsets
222     Window win_in(window);
223     win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
224     win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
225 
226     // Set offsets window
227     Window win_off;
228     win_off.set(Window::DimX, window[Window::DimX]);
229     win_off.set(Window::DimY, window[Window::DimY]);
230     for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
231     {
232         win_off.set(d, Window::Dimension(0, 0, 0));
233     }
234 
235     // Create iterators
236     Iterator in(_input, win_in);
237     Iterator out(_output, window);
238     Iterator offsets(_offsets, win_off);
239     execute_window_loop(window, [&](const Coordinates & id)
240     {
241         const auto    offsets_ptr         = reinterpret_cast<const int32_t *>(offsets.ptr());
242         const auto    in_yi               = static_cast<int32_t>(_align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
243         const int32_t offset_row          = in_yi * in_stride_x;
244         *reinterpret_cast<T *>(out.ptr()) = *(reinterpret_cast<const T *>(in.ptr()) + offsets_ptr[0] + offset_row);
245     },
246     in, offsets, out);
247 }
248 
249 template <typename T>
scale_bilinear_nchw(const Window & window)250 void NEScaleKernel::scale_bilinear_nchw(const Window &window)
251 {
252     // Compute the ratio between source height and destination height
253     const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
254     Window     win_off;
255     win_off.set(Window::DimX, window.x());
256     win_off.set(Window::DimY, window.y());
257 
258     // Don't increment in X and Y direction for the input tensor
259     // A pointer to the start of this plane is needed as base for the precomputed offsets
260     Window win_in(window);
261     win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
262     win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
263 
264     for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
265     {
266         win_off.set(d, Window::Dimension(0, 0, 0));
267     }
268 
269     Iterator in(_input, win_in);
270     Iterator out(_output, window);
271     Iterator offsets(_offsets, win_off);
272     Iterator dx(_dx, win_off);
273     Iterator dy(_dy, win_off);
274 
275     const int32_t in_dim_w    = _input->info()->dimension(0);
276     const int32_t in_dim_h    = _input->info()->dimension(1);
277     const int32_t in_stride_w = in_dim_w + _input->info()->padding().left + _input->info()->padding().right;
278 
279     if(_border_mode == BorderMode::CONSTANT)
280     {
281 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
282         using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
283 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
284         using ConstType = T;
285 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
286         const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
287         execute_window_loop(window, [&](const Coordinates & id)
288         {
289             const int32_t index_h       = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
290             const auto    index_w       = *(reinterpret_cast<const int32_t *>(offsets.ptr()));
291             const auto    dx_val        = *(reinterpret_cast<const float *>(dx.ptr()));
292             const auto    dy_val        = *(reinterpret_cast<const float *>(dy.ptr()));
293             const auto    pixel_row_ptr = reinterpret_cast<const T *>(in.ptr());
294 
295             const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) : const_border_value;
296             const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) : const_border_value;
297             const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h
298                               && index_h < in_dim_h - 1) ?
299                              (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) :
300                              const_border_value;
301             const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h
302                               && index_h < in_dim_h - 1) ?
303                              (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) :
304                              const_border_value;
305 
306             *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
307         },
308         in, offsets, dx, dy, out);
309     }
310     else if(_border_mode == BorderMode::REPLICATE)
311     {
312         execute_window_loop(window, [&](const Coordinates & id)
313         {
314             const int  index_h       = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
315             const auto index_w       = *(reinterpret_cast<const int32_t *>(offsets.ptr()));
316             const auto dx_val        = *(reinterpret_cast<const float *>(dx.ptr()));
317             const auto dy_val        = *(reinterpret_cast<const float *>(dy.ptr()));
318             const auto pixel_row_ptr = reinterpret_cast<const T *>(in.ptr());
319 
320             auto clamped_x  = utility::clamp<int>(index_w, 0, in_dim_w - 1);
321             auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
322             auto clamped_y  = utility::clamp<int>(index_h, 0, in_dim_h - 1);
323             auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
324 
325             const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w);
326             const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w);
327             const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w);
328             const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w);
329 
330             *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
331         },
332         in, offsets, dx, dy, out);
333     }
334     else
335     {
336         ARM_COMPUTE_ERROR("Not implemented");
337     }
338 }
339 
scale_area_nchw_u8(const Window & window)340 void NEScaleKernel::scale_area_nchw_u8(const Window &window)
341 {
342     using namespace scale_helpers;
343 
344     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
345 
346     // Don't increment in width/height/channels for the input tensor
347     // A pointer to the start of this plane is needed as base for the precomputed offsets
348     Window win_in(window);
349     win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
350     win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
351     win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
352 
353     Iterator in(_input, win_in);
354     Iterator out(_output, window);
355 
356     const auto   wr        = scale_utils::calculate_resize_ratio(_input->info()->dimension(0), _output->info()->dimension(0), _align_corners);
357     const auto   hr        = scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
358     const auto   w         = _input->info()->dimension(0);
359     const auto   h         = _input->info()->dimension(1);
360     const size_t in_stride = _input->info()->strides_in_bytes()[1];
361 
362     execute_window_loop(window, [&](const Coordinates & id)
363     {
364         const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
365 
366         uint8x8_t tmp0 = vdup_n_u8(0);
367         tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0);
368         tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1);
369         tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2);
370         tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3);
371         tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4);
372         tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5);
373         tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6);
374         tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7);
375 
376         uint8x8_t tmp1 = vdup_n_u8(0);
377         tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0);
378         tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1);
379         tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2);
380         tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3);
381         tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4);
382         tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5);
383         tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6);
384         tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7);
385 
386         vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
387     },
388     in, out);
389 }
390 
391 template <typename T>
scale_nearest_nhwc(const Window & window)392 void NEScaleKernel::scale_nearest_nhwc(const Window &window)
393 {
394     const size_t in_stride_c  = _input->info()->dimension(0) + _input->info()->padding().left + _input->info()->padding().right;
395     const size_t in_stride_w  = _input->info()->dimension(1) + _input->info()->padding().top + _input->info()->padding().bottom;
396     const size_t in_stride_wc = in_stride_w * in_stride_c;
397     const size_t in_dim_h     = _input->info()->dimension(2);
398 
399     // Compute the ratio between source height and destination height
400     const auto hr             = scale_utils::calculate_resize_ratio(in_dim_h, _output->info()->dimension(2), _align_corners);
401     const auto window_start_x = static_cast<int32_t>(window.x().start());
402     const auto window_end_x   = static_cast<int32_t>(window.x().end());
403     const int  window_step_x  = 16 / sizeof(T);
404 
405     Window win(window);
406     win.set(Window::DimX, Window::Dimension(0, 1, 1));
407     Iterator out(_output, win);
408 
409     const uint8_t     *in_ptr_start        = _input->buffer() + _input->info()->offset_first_element_in_bytes();
410     const unsigned int in_stride_bytes_hwc = _input->info()->strides_in_bytes()[3];
411 
412     execute_window_loop(win, [&](const Coordinates & id)
413     {
414         const int32_t offset     = *reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
415         const auto    in_hi      = static_cast<int>(_align_corners ? utils::rounding::round_half_away_from_zero((id.z() + _sampling_offset) * hr) : std::floor((id.z() + _sampling_offset) * hr));
416         const int     offset_row = in_hi * in_stride_wc;
417         int32_t       x          = window_start_x;
418         const T      *in_ptr     = reinterpret_cast<const T *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
419 
420         for(; x <= window_end_x - window_step_x; x += window_step_x)
421         {
422             wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x,
423                             wrapper::vloadq(in_ptr + offset + offset_row + x));
424         }
425         for(; x < window_end_x; ++x)
426         {
427             *(reinterpret_cast<T *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
428         }
429     },
430     out);
431 }
432 
433 template <typename T>
scale_bilinear_nhwc(const Window & window)434 void NEScaleKernel::scale_bilinear_nhwc(const Window &window)
435 {
436     // Compute the ratio between source height and destination height
437     const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(2), _output->info()->dimension(2), _align_corners);
438 
439     Iterator  out(_output, window);
440     const int in_stride_c  = _input->info()->dimension(0) + _input->info()->padding().left + _input->info()->padding().right;
441     const int in_dim_w     = _input->info()->dimension(1);
442     const int in_dim_h     = _input->info()->dimension(2);
443     const int in_stride_wc = in_stride_c * (in_dim_w + _input->info()->padding().top + _input->info()->padding().bottom);
444 
445     // Don't increment in Y and Z direction for the input tensor
446     // A pointer to the start of this plane is needed as base for the precomputed offsets
447     Window win_in(window);
448     win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
449     win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
450     Iterator in(_input, win_in);
451 
452     if(_border_mode == BorderMode::CONSTANT)
453     {
454 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
455         using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
456 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
457         using ConstType = T;
458 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
459         const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
460         execute_window_loop(window, [&](const Coordinates & id)
461         {
462             const auto    offset = *reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id.y(), id.z())));
463             const auto    dx_val = *reinterpret_cast<const float *>(_dx->ptr_to_element(Coordinates(id.y(), id.z())));
464             const auto    dy_val = *reinterpret_cast<const float *>(_dy->ptr_to_element(Coordinates(id.y(), id.z())));
465             const int32_t in_hi  = std::floor((id.z() + _sampling_offset) * hr - _sampling_offset);
466             const T      *in_ptr = reinterpret_cast<const T *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
467 
468             const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
469             const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
470             const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
471             const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
472 
473             *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
474         },
475         in, out);
476     }
477     else if(_border_mode == BorderMode::REPLICATE)
478     {
479         execute_window_loop(window, [&](const Coordinates & id)
480         {
481             const auto offset = *reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id.y(), id.z())));
482             const auto dx_val = *reinterpret_cast<const float *>(_dx->ptr_to_element(Coordinates(id.y(), id.z())));
483             const auto dy_val = *reinterpret_cast<const float *>(_dy->ptr_to_element(Coordinates(id.y(), id.z())));
484             const int  in_hi  = std::floor((id.z() + _sampling_offset) * hr - _sampling_offset);
485 
486             auto clamped_w  = utility::clamp<int>(offset, 0, in_dim_w - 1);
487             auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
488             auto clamped_h  = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
489             auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
490 
491             const auto a00 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
492             const auto a01 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
493             const auto a10 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
494             const auto a11 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
495 
496             *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
497         },
498         in, out);
499     }
500     else
501     {
502         ARM_COMPUTE_ERROR("Not implemented");
503     }
504 }
505 
506 template <typename T>
scale_bilinear_qasymm(const Window & window)507 void NEScaleKernel::scale_bilinear_qasymm(const Window &window)
508 {
509     // Get data layout and width/height indices
510     const DataLayout data_layout = _input->info()->data_layout();
511     const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
512     const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
513 
514     // Compute the ratio between source height and destination height
515     const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(idx_height), _output->info()->dimension(idx_height), _align_corners);
516     Window     win_off;
517     win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
518     win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
519 
520     // Don't increment in X and Y direction for the input tensor
521     // A pointer to the start of this plane is needed as base for the precomputed offsets
522     Window win_in(window);
523     win_in.set(idx_width, Window::Dimension(0, 0, 0));
524     win_in.set(idx_height, Window::Dimension(0, 0, 0));
525 
526     for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
527     {
528         win_off.set(d, Window::Dimension(0, 0, 0));
529     }
530 
531     Iterator in(_input, win_in);
532     Iterator out(_output, window);
533 
534     const int32_t in_dim_w = _input->info()->dimension(idx_width);
535     const int32_t in_dim_h = _input->info()->dimension(idx_height);
536     const int32_t stride_w = _input->info()->strides_in_bytes()[idx_width];
537     const int32_t stride_h = _input->info()->strides_in_bytes()[idx_height];
538 
539     const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
540     const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
541 
542     if(_border_mode == BorderMode::CONSTANT)
543     {
544 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
545         using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
546 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
547         using ConstType = T;
548 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
549         const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
550         execute_window_loop(window, [&](const Coordinates & id)
551         {
552             const int32_t index_h       = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
553             const int32_t index_w       = *(reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
554             const auto    dx_val        = *(reinterpret_cast<const float *>(_dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
555             const auto    dy_val        = *(reinterpret_cast<const float *>(_dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
556             const auto    pixel_row_ptr = reinterpret_cast<const T *>(in.ptr());
557 
558             const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
559                              (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
560                              const_border_value;
561             const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
562                              (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
563                              const_border_value;
564             const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
565                              (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
566                              const_border_value;
567             const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
568                              (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
569                              const_border_value;
570 
571             const float inp00                 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
572             const float inp01                 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
573             const float inp10                 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
574             const float inp11                 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
575             *reinterpret_cast<T *>(out.ptr()) = Qasymm8QuantizationHelper<T>::quantize(compute_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
576         },
577         in, out);
578     }
579     else if(_border_mode == BorderMode::REPLICATE)
580     {
581         execute_window_loop(window, [&](const Coordinates & id)
582         {
583             const int     index_h       = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
584             const int32_t index_w       = *(reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
585             const auto    dx_val        = *(reinterpret_cast<const float *>(_dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
586             const auto    dy_val        = *(reinterpret_cast<const float *>(_dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
587             const auto    pixel_row_ptr = reinterpret_cast<const T *>(in.ptr());
588 
589             auto clamped_w  = utility::clamp<int>(index_w, 0, in_dim_w - 1);
590             auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
591             auto clamped_h  = utility::clamp<int>(index_h, 0, in_dim_h - 1);
592             auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
593 
594             const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
595             const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
596             const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
597             const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
598 
599             const float inp00                 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
600             const float inp01                 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
601             const float inp10                 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
602             const float inp11                 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
603             *reinterpret_cast<T *>(out.ptr()) = Qasymm8QuantizationHelper<T>::quantize(compute_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
604         },
605         in, out);
606     }
607     else
608     {
609         ARM_COMPUTE_ERROR("Not implemented");
610     }
611 }
612 
validate(const ITensorInfo * input,const ITensorInfo * dx,const ITensorInfo * dy,const ITensorInfo * offsets,ITensorInfo * output,const ScaleKernelInfo & info)613 Status NEScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
614                                const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info)
615 {
616     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info));
617     return Status{};
618 }
619 
run(const Window & window,const ThreadInfo & info)620 void NEScaleKernel::run(const Window &window, const ThreadInfo &info)
621 {
622     ARM_COMPUTE_UNUSED(info);
623     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
624     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
625     ARM_COMPUTE_ERROR_ON(_func == nullptr);
626 
627     (this->*_func)(window);
628 }
629 } // namespace arm_compute
630