• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"
25 
26 #include "arm_compute/core/Error.h"
27 #include "arm_compute/core/Helpers.h"
28 #include "arm_compute/core/ITensor.h"
29 #include "arm_compute/core/TensorInfo.h"
30 #include "arm_compute/core/Validate.h"
31 #include "src/core/CPP/Validate.h"
32 #include "src/core/NEON/NEFixedPoint.h"
33 #include "src/core/NEON/NEMath.h"
34 #include "src/core/NEON/wrapper/wrapper.h"
35 #include "src/core/helpers/AutoConfiguration.h"
36 #include "src/core/helpers/WindowHelpers.h"
37 #include "support/SaturateCast.h"
38 
39 using namespace arm_compute;
40 
41 namespace
42 {
validate_arguments(const ITensorInfo * input,const ITensorInfo * output,ConvertPolicy policy,uint32_t shift)43 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
44 {
45     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
46     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
47     ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(input);
48     ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(output);
49     ARM_COMPUTE_UNUSED(policy);
50     ARM_COMPUTE_RETURN_ERROR_ON(input == output);
51     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
52                                                          DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
53                                                          DataType::F32, DataType::S32);
54     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
55                                                          DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
56                                                          DataType::U32, DataType::S32, DataType::F32);
57     ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
58 
59     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8_SIGNED && (output->data_type() != DataType::S16 && output->data_type() != DataType::S32
60                                                                                        && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
61                                     "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
62 
63     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
64                                                                                 && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
65                                     "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
66 
67     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
68                                                                            && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
69                                     "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
70 
71     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32),
72                                     "Only data_types supported [in] U16 ->  [out] U8, U32");
73 
74     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::U8 && output->data_type() != DataType::S32),
75                                     "Only data_types supported [in] S16 ->  [out] U8, S32");
76 
77     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::BFLOAT16 && output->data_type() != DataType::F32,
78                                     "Only data_types supported [in] BFLOAT16 ->  [out] F32");
79 
80     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
81                                                                             && output->data_type() != DataType::U8
82                                                                             && output->data_type() != DataType::F32 && output->data_type() != DataType::S32),
83                                     "Only data_types supported [in] F16 ->  [out] QASYMM8, F32, S32, U8");
84 
85     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
86                                                                             && output->data_type() != DataType::F16 && output->data_type() != DataType::BFLOAT16
87                                                                             && output->data_type() != DataType::S32 && output->data_type() != DataType::U8),
88                                     "Only data_types supported [in] F32 ->  [out] QASYMM8, BFLOAT16, F16, S32, U8");
89 
90     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
91                                                                             && output->data_type() != DataType::F16
92                                                                             && output->data_type() != DataType::F32 && output->data_type() != DataType::U8),
93                                     "Only data_types supported [in] S32 ->  [out] QASYMM8, F16, F32, U8");
94 
95     // Validate in case of configured output
96     if(output->total_size() > 0)
97     {
98         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
99     }
100 
101     return Status{};
102 }
103 } // namespace
104 
NEDepthConvertLayerKernel()105 NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
106     : _input(nullptr), _output(nullptr), _policy(), _shift(0)
107 {
108 }
109 
configure(const ITensor * input,ITensor * output,ConvertPolicy policy,uint32_t shift)110 void NEDepthConvertLayerKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
111 {
112     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
113 
114     // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
115     set_shape_if_empty(*output->info(), input->info()->tensor_shape());
116 
117     _input  = input;
118     _output = output;
119     _policy = policy;
120     _shift  = shift;
121 
122     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift));
123 
124     // Configure kernel window
125     Window      win = calculate_max_window(*input->info(), Steps());
126     Coordinates coord;
127     coord.set_num_dimensions(output->info()->num_dimensions());
128     output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
129 
130     ICPPKernel::configure(win);
131 }
132 
validate(const ITensorInfo * input,const ITensorInfo * output,ConvertPolicy policy,uint32_t shift)133 Status NEDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
134 {
135     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, shift));
136     return Status{};
137 }
138 
run(const Window & window,const ThreadInfo & info)139 void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info)
140 {
141     ARM_COMPUTE_UNUSED(info);
142     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
143     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
144     ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
145     ARM_COMPUTE_ERROR_ON(_input == _output);
146 
147     const auto window_start_x = static_cast<int>(window.x().start());
148     const auto window_end_x   = static_cast<int>(window.x().end());
149     const int  window_step_x  = 16;
150 
151     Window win{ window };
152     win.set(Window::DimX, Window::Dimension(0, 1, 1));
153 
154     Iterator input(_input, win);
155     Iterator output(_output, win);
156 
157     switch(_input->info()->data_type())
158     {
159         case DataType::QASYMM8_SIGNED:
160         {
161             const int16x8_t b = vdupq_n_s16(_shift);
162 
163             switch(_output->info()->data_type())
164             {
165                 case DataType::S16:
166                 {
167                     /* Up-conversion QASYMM8_SIGNED -> S16 */
168                     execute_window_loop(win, [&](const Coordinates &)
169                     {
170                         const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
171                         const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
172                         int        x          = window_start_x;
173 
174                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
175                         {
176                             const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
177 
178                             const int16x8x2_t texels =
179                             {
180                                 {
181                                     vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
182                                     vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
183                                 }
184                             };
185 
186                             vst1q_s16(output_ptr + x, texels.val[0]);
187                             vst1q_s16(output_ptr + x + 8, texels.val[1]);
188                         }
189 
190                         // Compute left-over elements
191                         for(; x < window_end_x; ++x)
192                         {
193                             *(output_ptr + x) = static_cast<int16_t>(*(input_ptr + x) << _shift);
194                         }
195                     },
196                     input, output);
197                     break;
198                 }
199                 case DataType::S32:
200                 {
201                     /* Up-conversion QASYMM8_SIGNED -> S32 */
202                     execute_window_loop(win, [&](const Coordinates &)
203                     {
204                         const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
205                         const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
206                         int        x          = window_start_x;
207 
208                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
209                         {
210                             const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
211 
212                             const int16x8x2_t texels =
213                             {
214                                 {
215                                     vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
216                                     vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
217                                 }
218                             };
219 
220                             vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
221                             vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
222                             vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
223                             vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
224                         }
225 
226                         // Compute left-over elements
227                         for(; x < window_end_x; ++x)
228                         {
229                             *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
230                         }
231                     },
232                     input, output);
233                     break;
234                 }
235                 case DataType::F32:
236                 {
237                     /* Up-conversion QASYMM8_SIGNED -> F32 */
238                     execute_window_loop(win, [&](const Coordinates &)
239                     {
240                         const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
241                         const auto output_ptr = reinterpret_cast<float *>(output.ptr());
242 
243                         int x = window_start_x;
244                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
245                         {
246                             const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<int8_t *>(input.ptr()));
247 
248                             const int16x8x2_t texels =
249                             {
250                                 {
251                                     vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
252                                     vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
253                                 }
254                             };
255                             vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
256                             vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
257                             vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
258                             vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
259                         }
260 
261                         // Compute left-over elements
262                         for(; x < window_end_x; ++x)
263                         {
264                             *(output_ptr + x) = static_cast<float>(*(input_ptr + x) << _shift);
265                         }
266                     },
267                     input, output);
268                     break;
269                 }
270 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
271                 case DataType::F16:
272                 {
273                     /* Up-conversion QASYMM8_SIGNED -> F16 */
274                     execute_window_loop(win, [&](const Coordinates &)
275                     {
276                         const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
277                         const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
278                         int        x          = window_start_x;
279 
280                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
281                         {
282                             const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
283 
284                             const int16x8x2_t texels =
285                             {
286                                 {
287                                     vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
288                                     vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
289                                 }
290                             };
291                             vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
292                             vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
293                         }
294 
295                         // Compute left-over elements
296                         for(; x < window_end_x; ++x)
297                         {
298                             *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
299                         }
300                     },
301                     input, output);
302                     break;
303                 }
304 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
305 
306                 default:
307                     ARM_COMPUTE_ERROR("Output data type not supported");
308             }
309             break;
310         }
311 
312         case DataType::QASYMM8:
313         case DataType::U8:
314         {
315             const int16x8_t b = vdupq_n_s16(_shift);
316 
317             switch(_output->info()->data_type())
318             {
319                 case DataType::S16:
320                 {
321                     /* Up-conversion U8 -> S16 */
322                     execute_window_loop(win, [&](const Coordinates &)
323                     {
324                         const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
325                         const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
326 
327                         int x = window_start_x;
328                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
329                         {
330                             const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
331 
332                             const int16x8x2_t texels =
333                             {
334                                 {
335                                     vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
336                                     vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
337                                 }
338                             };
339 
340                             vst1q_s16(output_ptr + x, texels.val[0]);
341                             vst1q_s16(output_ptr + x + 8, texels.val[1]);
342                         }
343 
344                         // Compute left-over elements
345                         for(; x < window_end_x; ++x)
346                         {
347                             auto in           = static_cast<int32_t>(*(input_ptr + x));
348                             *(output_ptr + x) = in << _shift;
349                         }
350                     },
351                     input, output);
352                     break;
353                 }
354                 case DataType::S32:
355                 {
356                     /* Up-conversion U8 -> S32 */
357                     execute_window_loop(win, [&](const Coordinates &)
358                     {
359                         const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
360                         const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
361 
362                         int x = window_start_x;
363                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
364                         {
365                             const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
366 
367                             const int16x8x2_t texels =
368                             {
369                                 {
370                                     vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
371                                     vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
372                                 }
373                             };
374 
375                             vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
376                             vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
377                             vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
378                             vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
379                         }
380 
381                         // Compute left-over elements
382                         for(; x < window_end_x; ++x)
383                         {
384                             auto in           = static_cast<uint32_t>(*(input_ptr + x));
385                             *(output_ptr + x) = in << _shift;
386                         }
387                     },
388                     input, output);
389                     break;
390                 }
391                 case DataType::F32:
392                 {
393                     /* Up-conversion U8 -> F32 */
394                     execute_window_loop(win, [&](const Coordinates &)
395                     {
396                         const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
397                         const auto output_ptr = reinterpret_cast<float *>(output.ptr());
398 
399                         int x = window_start_x;
400                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
401                         {
402                             const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
403 
404                             const int16x8x2_t texels =
405                             {
406                                 {
407                                     vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
408                                     vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
409                                 }
410                             };
411                             vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
412                             vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
413                             vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
414                             vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
415                         }
416 
417                         // Compute left-over elements
418                         for(; x < window_end_x; ++x)
419                         {
420                             auto in           = static_cast<uint32_t>(*(input_ptr + x));
421                             *(output_ptr + x) = static_cast<float>(in << _shift);
422                         }
423                     },
424                     input, output);
425                     break;
426                 }
427 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
428                 case DataType::F16:
429                 {
430                     /* Up-conversion U8 -> F16 */
431                     execute_window_loop(win, [&](const Coordinates &)
432                     {
433                         const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
434                         const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
435 
436                         int x = window_start_x;
437                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
438                         {
439                             const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
440 
441                             const int16x8x2_t texels =
442                             {
443                                 {
444                                     vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
445                                     vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
446                                 }
447                             };
448                             vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
449                             vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
450                         }
451 
452                         // Compute left-over elements
453                         for(; x < window_end_x; ++x)
454                         {
455                             *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
456                         }
457                     },
458                     input, output);
459                     break;
460                 }
461 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
462                 case DataType::U16:
463                 {
464                     /* Up-conversion U8 -> U16 */
465                     execute_window_loop(win, [&](const Coordinates &)
466                     {
467                         const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
468                         const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
469 
470                         int x = window_start_x;
471                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
472                         {
473                             const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
474 
475                             const uint16x8x2_t texels =
476                             {
477                                 {
478                                     vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b),
479                                     vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b)
480                                 }
481                             };
482 
483                             vst1q_u16(output_ptr + x, texels.val[0]);
484                             vst1q_u16(output_ptr + x + 8, texels.val[1]);
485                         }
486 
487                         // Compute left-over elements
488                         for(; x < window_end_x; ++x)
489                         {
490                             *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x)) << _shift;
491                         }
492                     },
493                     input, output);
494                     break;
495                 }
496                 default:
497                     ARM_COMPUTE_ERROR("Output data type not supported");
498             }
499             break;
500         }
501         case DataType::S16:
502         {
503             switch(_output->info()->data_type())
504             {
505                 case DataType::QASYMM8_SIGNED:
506                 {
507                     const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
508 
509                     /* Down-conversion S16 -> QASYMM8_SIGNED */
510                     if(ConvertPolicy::SATURATE == _policy)
511                     {
512                         execute_window_loop(win, [&](const Coordinates &)
513                         {
514                             const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
515                             const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
516 
517                             int x = window_start_x;
518                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
519                             {
520                                 const int16x8x2_t texels =
521                                 {
522                                     {
523                                         vqshlq_s16(vld1q_s16(input_ptr + x), b),
524                                         vqshlq_s16(vld1q_s16(input_ptr + x + 8), b)
525                                     }
526                                 };
527 
528                                 vst1q_s8(output_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
529                             }
530 
531                             // Compute left-over elements
532                             for(; x < window_end_x; ++x)
533                             {
534                                 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
535                             }
536                         },
537                         input, output);
538                     }
539                     else
540                     {
541                         execute_window_loop(win, [&](const Coordinates &)
542                         {
543                             const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
544                             const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
545 
546                             int x = window_start_x;
547                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
548                             {
549                                 const int16x8x2_t texels =
550                                 {
551                                     {
552                                         vshlq_s16(vld1q_s16(input_ptr + x), b),
553                                         vshlq_s16(vld1q_s16(input_ptr + x + 8), b)
554                                     }
555                                 };
556 
557                                 vst1q_s8(output_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
558                             }
559 
560                             // Compute left-over elements
561                             for(; x < window_end_x; ++x)
562                             {
563                                 *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
564                             }
565                         },
566                         input, output);
567                     }
568                     break;
569                 }
570                 case DataType::U8:
571                 {
572                     const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
573 
574                     /* Down-conversion S16 -> U8 */
575                     if(ConvertPolicy::SATURATE == _policy)
576                     {
577                         execute_window_loop(win, [&](const Coordinates &)
578                         {
579                             const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
580                             const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
581 
582                             int x = window_start_x;
583                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
584                             {
585                                 const int16x8x2_t texels =
586                                 {
587                                     {
588                                         vqshlq_s16(vld1q_s16(input_ptr + x), b),
589                                         vqshlq_s16(vld1q_s16(input_ptr + x + 8), b)
590                                     }
591                                 };
592 
593                                 vst1q_u8(output_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
594                             }
595 
596                             // Compute left-over elements
597                             for(; x < window_end_x; ++x)
598                             {
599                                 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
600                             }
601                         },
602                         input, output);
603                     }
604                     else
605                     {
606                         execute_window_loop(win, [&](const Coordinates &)
607                         {
608                             const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
609                             const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
610 
611                             int x = window_start_x;
612                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
613                             {
614                                 const int16x8x2_t texels =
615                                 {
616                                     {
617                                         vshlq_s16(vld1q_s16(input_ptr + x), b),
618                                         vshlq_s16(vld1q_s16(input_ptr + x + 8), b)
619                                     }
620                                 };
621 
622                                 vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
623                                                                      vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
624                             }
625 
626                             // Compute left-over elements
627                             for(; x < window_end_x; ++x)
628                             {
629                                 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
630                             }
631                         },
632                         input, output);
633                     }
634                     break;
635                 }
636                 case DataType::S32:
637                 {
638                     const int32x4_t b = vdupq_n_s32(_shift);
639 
640                     /* Up-conversion S16 -> S32 */
641                     execute_window_loop(win, [&](const Coordinates &)
642                     {
643                         const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
644                         const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
645 
646                         int x = window_start_x;
647                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
648                         {
649                             const int16x8x2_t texels =
650                             {
651                                 {
652                                     vld1q_s16(input_ptr + x),
653                                     vld1q_s16(input_ptr + x + 8)
654                                 }
655                             };
656 
657                             const int32x4x4_t texels_s32 =
658                             {
659                                 {
660                                     vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
661                                     vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
662                                     vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
663                                     vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
664                                 }
665                             };
666 
667                             vst1q_s32(output_ptr + x, texels_s32.val[0]);
668                             vst1q_s32(output_ptr + x + 4, texels_s32.val[1]);
669                             vst1q_s32(output_ptr + x + 8, texels_s32.val[2]);
670                             vst1q_s32(output_ptr + x + 12, texels_s32.val[3]);
671                         }
672 
673                         // Compute left-over elements
674                         for(; x < window_end_x; ++x)
675                         {
676                             *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
677                         }
678                     },
679                     input, output);
680                     break;
681                 }
682                 default:
683                     ARM_COMPUTE_ERROR("Output data type not supported");
684             }
685             break;
686         }
687         case DataType::U16:
688         {
689             switch(_output->info()->data_type())
690             {
691                 case DataType::U8:
692                 {
693                     const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
694 
695                     /* Down-conversion U16 -> U8 */
696                     if(ConvertPolicy::SATURATE == _policy)
697                     {
698                         execute_window_loop(win, [&](const Coordinates &)
699                         {
700                             const auto input_ptr  = reinterpret_cast<const uint16_t *>(input.ptr());
701                             const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
702 
703                             int x = window_start_x;
704                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
705                             {
706                                 const uint16x8x2_t texels =
707                                 {
708                                     {
709                                         vqshlq_u16(vld1q_u16(input_ptr + x), b),
710                                         vqshlq_u16(vld1q_u16(input_ptr + x + 8), b)
711                                     }
712                                 };
713 
714                                 vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
715                             }
716 
717                             // Compute left-over elements
718                             for(; x < window_end_x; ++x)
719                             {
720                                 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
721                             }
722                         },
723                         input, output);
724                     }
725                     else
726                     {
727                         execute_window_loop(win, [&](const Coordinates &)
728                         {
729                             const auto input_ptr  = reinterpret_cast<const uint16_t *>(input.ptr());
730                             const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
731 
732                             int x = window_start_x;
733                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
734                             {
735                                 const uint16x8x2_t texels =
736                                 {
737                                     {
738                                         vshlq_u16(vld1q_u16(input_ptr + x), b),
739                                         vshlq_u16(vld1q_u16(input_ptr + x + 8), b)
740                                     }
741                                 };
742 
743                                 vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
744                             }
745 
746                             // Compute left-over elements
747                             for(; x < window_end_x; ++x)
748                             {
749                                 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
750                             }
751 
752                         },
753                         input, output);
754                     }
755                     break;
756                 }
757                 case DataType::U32:
758                 {
759                     const int32x4_t b = vdupq_n_s32(_shift);
760 
761                     /* Up-conversion U16 -> U32 */
762                     execute_window_loop(win, [&](const Coordinates &)
763                     {
764                         const auto input_ptr  = reinterpret_cast<const uint16_t *>(input.ptr());
765                         const auto output_ptr = reinterpret_cast<uint32_t *>(output.ptr());
766 
767                         int x = window_start_x;
768                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
769                         {
770                             const uint16x8x2_t texels =
771                             {
772                                 {
773                                     vld1q_u16(input_ptr + x),
774                                     vld1q_u16(input_ptr + x + 8)
775                                 }
776                             };
777 
778                             vst1q_u32(output_ptr + x, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b));
779                             vst1q_u32(output_ptr + x + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b));
780                             vst1q_u32(output_ptr + x + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b));
781                             vst1q_u32(output_ptr + x + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b));
782                         }
783                         // Compute left-over elements
784                         for(; x < window_end_x; ++x)
785                         {
786                             *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) << _shift);
787                         }
788 
789                     },
790                     input, output);
791                     break;
792                 }
793                 default:
794                     ARM_COMPUTE_ERROR("Output data type not supported");
795             }
796             break;
797         }
798 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
799         case DataType::BFLOAT16:
800             switch(_output->info()->data_type())
801             {
802                 case DataType::F32:
803                 {
804                     /* Up-conversion BFLOAT16 -> F32 */
805                     execute_window_loop(win, [&](const Coordinates &)
806                     {
807                         const auto input_ptr  = reinterpret_cast<const bfloat16 *>(input.ptr());
808                         const auto output_ptr = reinterpret_cast<float *>(output.ptr());
809 
810                         int x = window_start_x;
811                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
812                         {
813                             const uint16x8x2_t texels =
814                             {
815                                 {
816                                     vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())),
817                                     vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8)
818                                 }
819                             };
820 
821                             vst1q_f32(reinterpret_cast<float *>(output.ptr()),
822                                       vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16)));
823                             vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4,
824                                       vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16)));
825                             vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8,
826                                       vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16)));
827                             vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12,
828                                       vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16)));
829                         }
830 
831                         for(; x < window_end_x; ++x)
832                         {
833                             *(output_ptr + x) = float(*(input_ptr + x));
834                         }
835                     },
836                     input, output);
837                     break;
838                 }
839                 default:
840                     ARM_COMPUTE_ERROR("Output data type unsupported");
841             }
842             break;
843 #endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
844 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
845         case DataType::F16:
846             switch(_output->info()->data_type())
847             {
848                 case DataType::QASYMM8_SIGNED:
849                 {
850                     const float16_t   scale_s = 1 << _shift;
851                     const float16x8_t scale   = vdupq_n_f16(scale_s);
852 
853                     /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
854                     execute_window_loop(win, [&](const Coordinates &)
855                     {
856                         const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
857                         const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
858 
859                         int x = window_start_x;
860                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
861                         {
862                             const float16x8x2_t texels =
863                             {
864                                 {
865                                     vmulq_f16(vld1q_f16(input_ptr + x), scale),
866                                     vmulq_f16(vld1q_f16(input_ptr + x + 8), scale),
867                                 }
868                             };
869 
870                             vst1q_s8(output_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
871                         }
872 
873                         // Compute left-over elements
874                         for(; x < window_end_x; ++x)
875                         {
876                             *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
877                         }
878                     },
879                     input, output);
880                     break;
881                 }
882                 case DataType::QASYMM8:
883                 case DataType::U8:
884                 {
885                     const float16_t   scale_s = 1 << _shift;
886                     const float16x8_t scale   = vdupq_n_f16(scale_s);
887 
888                     /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
889                     execute_window_loop(win, [&](const Coordinates &)
890                     {
891                         const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
892                         const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
893 
894                         int x = window_start_x;
895                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
896                         {
897                             const float16x8x2_t texels =
898                             {
899                                 {
900                                     vmulq_f16(vld1q_f16(input_ptr + x), scale),
901                                     vmulq_f16(vld1q_f16(input_ptr + x + 8), scale),
902                                 }
903                             };
904 
905                             vst1q_u8(output_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
906                         }
907 
908                         // Compute left-over elements
909                         for(; x < window_end_x; ++x)
910                         {
911                             *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
912                         }
913 
914                     },
915                     input, output);
916                     break;
917                 }
918                 case DataType::F32:
919                 {
920                     const float       scale_s = 1 << _shift;
921                     const float32x4_t scale   = vdupq_n_f32(scale_s);
922 
923                     /* Up-conversion F16 -> F32 */
924                     execute_window_loop(win, [&](const Coordinates &)
925                     {
926                         const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
927                         const auto output_ptr = reinterpret_cast<float *>(output.ptr());
928 
929                         int x = window_start_x;
930                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
931                         {
932                             const float16x8x2_t texels =
933                             {
934                                 {
935                                     vld1q_f16(input_ptr + x),
936                                     vld1q_f16(input_ptr + x + 8)
937                                 }
938                             };
939                             vst1q_f32(output_ptr + x, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale));
940                             vst1q_f32(output_ptr + x + 4, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale));
941                             vst1q_f32(output_ptr + x + 8, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale));
942                             vst1q_f32(output_ptr + x + 12, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale));
943                         }
944 
945                         // Compute left-over elements
946                         for(; x < window_end_x; ++x)
947                         {
948                             *(output_ptr + x) = static_cast<float>(*(input_ptr + x) * scale_s);
949                         }
950                     },
951                     input, output);
952                     break;
953                 }
954                 case DataType::S32:
955                 {
956                     const float       scale_s = 1 << _shift;
957                     const float32x4_t scale   = vdupq_n_f32(scale_s);
958 
959                     /* Up-conversion F16 -> S32 */
960                     execute_window_loop(win, [&](const Coordinates &)
961                     {
962                         const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
963                         const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
964 
965                         int x = window_start_x;
966                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
967                         {
968                             const float16x8x2_t texels =
969                             {
970                                 {
971                                     vld1q_f16(input_ptr + x),
972                                     vld1q_f16(input_ptr + x + 8)
973                                 }
974                             };
975 
976                             vst1q_s32(output_ptr + x, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale)));
977                             vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale)));
978                             vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale)));
979                             vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale)));
980                         }
981 
982                         // Compute left-over elements
983                         for(; x < window_end_x; ++x)
984                         {
985                             *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
986                         }
987                     },
988                     input, output);
989                     break;
990                 }
991                 default:
992                     ARM_COMPUTE_ERROR("Output data type not supported");
993             }
994             break;
995 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
996         case DataType::F32:
997             switch(_output->info()->data_type())
998             {
999 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1000                 case DataType::F16:
1001                 {
1002                     const float       scale_s = 1.f / (1 << _shift);
1003                     const float32x4_t scale   = vdupq_n_f32(scale_s);
1004 
1005                     /* Down-conversion F32 -> F16 */
1006                     execute_window_loop(win, [&](const Coordinates &)
1007                     {
1008                         const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
1009                         const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
1010 
1011                         int x = window_start_x;
1012                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
1013                         {
1014                             const float32x4x4_t texels =
1015                             {
1016                                 {
1017                                     vmulq_f32(vld1q_f32(input_ptr + x), scale),
1018                                     vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1019                                     vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1020                                     vmulq_f32(vld1q_f32(input_ptr + x + 12), scale)
1021                                 }
1022                             };
1023 
1024                             vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
1025                             vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
1026                         }
1027 
1028                         // Compute left-over elements
1029                         for(; x < window_end_x; ++x)
1030                         {
1031                             *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
1032                         }
1033                     },
1034                     input, output);
1035                     break;
1036                 }
1037 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1038 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
1039                 case DataType::BFLOAT16:
1040                 {
1041                     /* Down-conversion F32 -> BFLOAT16 */
1042                     execute_window_loop(win, [&](const Coordinates &)
1043                     {
1044                         const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
1045                         const auto output_ptr = reinterpret_cast<bfloat16 *>(output.ptr());
1046 
1047                         int x = window_start_x;
1048                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
1049                         {
1050                             wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(input.ptr()),
1051                                                    reinterpret_cast<uint16_t *>(output.ptr()));
1052                             wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(input.ptr()) + 8,
1053                                                    reinterpret_cast<uint16_t *>(output.ptr()) + 8);
1054                         }
1055 
1056                         for(; x < window_end_x; ++x)
1057                         {
1058                             *(output_ptr + x) = *(input_ptr + x);
1059                         }
1060                     },
1061                     input, output);
1062                     break;
1063                 }
1064 #endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
1065                 case DataType::S32:
1066                 {
1067                     const float       scale_s = 1.f / (1 << _shift);
1068                     const float32x4_t scale   = vdupq_n_f32(scale_s);
1069 
1070                     /* Conversion F32 -> S32 */
1071                     execute_window_loop(win, [&](const Coordinates &)
1072                     {
1073                         const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
1074                         const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
1075 
1076                         int x = window_start_x;
1077                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
1078                         {
1079                             const float32x4x4_t texels =
1080                             {
1081                                 {
1082                                     vmulq_f32(vld1q_f32(input_ptr + x), scale),
1083                                     vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1084                                     vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1085                                     vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
1086                                 }
1087                             };
1088 
1089                             vst1q_s32(output_ptr + x, vcvtq_s32_f32(texels.val[0]));
1090                             vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
1091                             vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
1092                             vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
1093                         }
1094 
1095                         // Compute left-over elements
1096                         for(; x < window_end_x; ++x)
1097                         {
1098                             *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
1099                         }
1100                     },
1101                     input, output);
1102                     break;
1103                 }
1104                 case DataType::QASYMM8:
1105                 case DataType::U8:
1106                 {
1107                     const float       scale_s = 1.f / (1 << _shift);
1108                     const float32x4_t scale   = vdupq_n_f32(scale_s);
1109 
1110                     /* Down-conversion F32 -> U8 */
1111                     execute_window_loop(win, [&](const Coordinates &)
1112                     {
1113                         const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
1114                         const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
1115 
1116                         int x = window_start_x;
1117                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
1118                         {
1119                             const float32x4x4_t texels =
1120                             {
1121                                 {
1122                                     vmulq_f32(vld1q_f32(input_ptr + x), scale),
1123                                     vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1124                                     vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1125                                     vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
1126                                 }
1127                             };
1128 
1129                             vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
1130                             vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
1131                         }
1132 
1133                         // Compute left-over elements
1134                         for(; x < window_end_x; ++x)
1135                         {
1136                             *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
1137                         }
1138                     },
1139                     input, output);
1140                     break;
1141                 }
1142                 case DataType::QASYMM8_SIGNED:
1143                 {
1144                     const float       scale_s = 1.f / (1 << _shift);
1145                     const float32x4_t scale   = vdupq_n_f32(scale_s);
1146 
1147                     /* Down-conversion F32 -> QASYMM8_SIGNED */
1148                     execute_window_loop(win, [&](const Coordinates &)
1149                     {
1150                         const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
1151                         const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
1152 
1153                         int x = window_start_x;
1154                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
1155                         {
1156                             const float32x4x4_t texels =
1157                             {
1158                                 {
1159                                     vmulq_f32(vld1q_f32(input_ptr + x), scale),
1160                                     vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1161                                     vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1162                                     vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
1163                                 }
1164                             };
1165 
1166                             vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
1167                             vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
1168                         }
1169                         // Compute left-over elements
1170                         for(; x < window_end_x; ++x)
1171                         {
1172                             *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
1173                         }
1174                     },
1175                     input, output);
1176                     break;
1177                 }
1178 
1179                 default:
1180                     ARM_COMPUTE_ERROR("Output data type not supported");
1181             }
1182             break;
1183 
1184         case DataType::S32:
1185             switch(_output->info()->data_type())
1186             {
1187 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1188                 case DataType::F16:
1189                 {
1190                     const float       scale_s = 1.f / (1 << _shift);
1191                     const float32x4_t scale   = vdupq_n_f32(scale_s);
1192 
1193                     /* Down-conversion S32 -> F16 */
1194                     execute_window_loop(win, [&](const Coordinates &)
1195                     {
1196                         const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
1197                         const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
1198 
1199                         int x = window_start_x;
1200                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
1201                         {
1202                             const float32x4x4_t texels =
1203                             {
1204                                 {
1205                                     vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x)), scale),
1206                                     vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 4)), scale),
1207                                     vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 8)), scale),
1208                                     vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 12)), scale)
1209                                 }
1210                             };
1211 
1212                             vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
1213                             vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
1214                         }
1215 
1216                         // Compute left-over elements
1217                         for(; x < window_end_x; ++x)
1218                         {
1219                             *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
1220                         }
1221                     },
1222                     input, output);
1223                     break;
1224                 }
1225 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1226                 case DataType::F32:
1227                 {
1228                     const int       scale_s = 1.f / (1 << _shift);
1229                     const int32x4_t scale   = vdupq_n_s32(scale_s);
1230 
1231                     /* Conversion S32 -> F32 */
1232                     execute_window_loop(win, [&](const Coordinates &)
1233                     {
1234                         const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
1235                         const auto output_ptr = reinterpret_cast<float *>(output.ptr());
1236 
1237                         int x = window_start_x;
1238                         for(; x <= (window_end_x - window_step_x); x += window_step_x)
1239                         {
1240                             const int32x4x4_t texels =
1241                             {
1242                                 {
1243                                     vmulq_s32(vld1q_s32(input_ptr + x), scale),
1244                                     vmulq_s32(vld1q_s32(input_ptr + x + 4), scale),
1245                                     vmulq_s32(vld1q_s32(input_ptr + x + 8), scale),
1246                                     vmulq_s32(vld1q_s32(input_ptr + x + 12), scale),
1247                                 }
1248                             };
1249 
1250                             vst1q_f32(output_ptr + x, vcvtq_f32_s32(texels.val[0]));
1251                             vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
1252                             vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
1253                             vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
1254                         }
1255 
1256                         // Compute left-over elements
1257                         for(; x < window_end_x; ++x)
1258                         {
1259                             *(output_ptr + x) = static_cast<float>(*(input_ptr + x) * scale_s);
1260                         }
1261                     },
1262                     input, output);
1263                     break;
1264                 }
1265                 case DataType::QASYMM8_SIGNED:
1266                 {
1267                     const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));
1268 
1269                     /* Down-conversion S32 -> QASYMM8_SIGNED */
1270                     if(ConvertPolicy::SATURATE == _policy)
1271                     {
1272                         execute_window_loop(win, [&](const Coordinates &)
1273                         {
1274                             const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
1275                             const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
1276 
1277                             int x = window_start_x;
1278                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
1279                             {
1280                                 const int32x4x4_t texels =
1281                                 {
1282                                     {
1283                                         vqshlq_s32(vld1q_s32(input_ptr + x), b),
1284                                         vqshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1285                                         vqshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1286                                         vqshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1287                                     }
1288                                 };
1289                                 vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
1290                                 vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
1291                             }
1292 
1293                             // Compute left-over elements
1294                             for(; x < window_end_x; ++x)
1295                             {
1296                                 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
1297                             }
1298                         },
1299                         input, output);
1300                     }
1301                     else
1302                     {
1303                         execute_window_loop(win, [&](const Coordinates &)
1304                         {
1305                             const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
1306                             const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
1307 
1308                             int x = window_start_x;
1309                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
1310                             {
1311                                 const int32x4x4_t texels =
1312                                 {
1313                                     {
1314                                         vshlq_s32(vld1q_s32(input_ptr + x), b),
1315                                         vshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1316                                         vshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1317                                         vshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1318                                     }
1319                                 };
1320 
1321                                 vst1_s8(output_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
1322                                 vst1_s8(output_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
1323                             }
1324 
1325                             // Compute left-over elements
1326                             for(; x < window_end_x; ++x)
1327                             {
1328                                 *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
1329                             }
1330                         },
1331                         input, output);
1332                     }
1333                     break;
1334                 }
1335                 case DataType::QASYMM8:
1336                 case DataType::U8:
1337                 {
1338                     const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));
1339 
1340                     /* Down-conversion S32 -> U8 */
1341                     if(ConvertPolicy::SATURATE == _policy)
1342                     {
1343                         execute_window_loop(win, [&](const Coordinates &)
1344                         {
1345                             const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
1346                             const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
1347 
1348                             int x = window_start_x;
1349                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
1350                             {
1351                                 const int32x4x4_t texels =
1352                                 {
1353                                     {
1354                                         vqshlq_s32(vld1q_s32(input_ptr + x), b),
1355                                         vqshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1356                                         vqshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1357                                         vqshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1358                                     }
1359                                 };
1360                                 vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
1361                                 vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
1362                             }
1363 
1364                             // Compute left-over elements
1365                             for(; x < window_end_x; ++x)
1366                             {
1367                                 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
1368                             }
1369                         },
1370                         input, output);
1371                     }
1372                     else
1373                     {
1374                         execute_window_loop(win, [&](const Coordinates &)
1375                         {
1376                             const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
1377                             const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
1378 
1379                             int x = window_start_x;
1380                             for(; x <= (window_end_x - window_step_x); x += window_step_x)
1381                             {
1382                                 const int32x4x4_t texels =
1383                                 {
1384                                     {
1385                                         vshlq_s32(vld1q_s32(input_ptr + x), b),
1386                                         vshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1387                                         vshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1388                                         vshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1389                                     }
1390                                 };
1391 
1392                                 vst1_u8(output_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
1393                                 vst1_u8(output_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
1394                             }
1395 
1396                             // Compute left-over elements
1397                             for(; x < window_end_x; ++x)
1398                             {
1399                                 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
1400                             }
1401                         },
1402                         input, output);
1403                     }
1404                     break;
1405                 }
1406                 default:
1407                     ARM_COMPUTE_ERROR("Output data type not supported");
1408             }
1409             break;
1410         default:
1411             ARM_COMPUTE_ERROR("Not supported");
1412     }
1413 }
1414