/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/SaturateCast.h"

using namespace arm_compute;

namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
{
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(input);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(output);
    ARM_COMPUTE_UNUSED(policy);
    ARM_COMPUTE_RETURN_ERROR_ON(input == output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
                                                         DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
                                                         DataType::F32, DataType::S32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
                                                         DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
                                                         DataType::U32, DataType::S32, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
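    // Note: shift selects a power-of-two rescaling applied as part of the conversion (a left shift
    // on integer up-conversions, a right shift on integer down-conversions), hence the 0..7 bound.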

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8_SIGNED && (output->data_type() != DataType::S16 && output->data_type() != DataType::S32
                                                                                       && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
                                    "Only data_types supported [in] QASYMM8_SIGNED -> [out] S16, S32, F16, F32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
                                                                                && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
                                    "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
                                                                           && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
                                    "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32),
                                    "Only data_types supported [in] U16 -> [out] U8, U32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::U8 && output->data_type() != DataType::S32),
                                    "Only data_types supported [in] S16 -> [out] QASYMM8_SIGNED, U8, S32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::BFLOAT16 && output->data_type() != DataType::F32,
                                    "Only data_types supported [in] BFLOAT16 -> [out] F32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
                                                                            && output->data_type() != DataType::U8
                                                                            && output->data_type() != DataType::F32 && output->data_type() != DataType::S32),
                                    "Only data_types supported [in] F16 -> [out] QASYMM8_SIGNED, QASYMM8, U8, F32, S32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
                                                                            && output->data_type() != DataType::F16 && output->data_type() != DataType::BFLOAT16
                                                                            && output->data_type() != DataType::S32 && output->data_type() != DataType::U8),
                                    "Only data_types supported [in] F32 -> [out] QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
                                                                            && output->data_type() != DataType::F16
                                                                            && output->data_type() != DataType::F32 && output->data_type() != DataType::U8),
                                    "Only data_types supported [in] S32 -> [out] QASYMM8_SIGNED, QASYMM8, F16, F32, U8");

    // Validate in case of configured output
    if(output->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
    }

    return Status{};
}
} // namespace

NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
    : _input(nullptr), _output(nullptr), _policy(), _shift(0)
{
}

void NEDepthConvertLayerKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

    // Auto-initialize the output shape if it is not set yet (only the shape can be auto-configured; the data type must be given)
    set_shape_if_empty(*output->info(), input->info()->tensor_shape());

    _input  = input;
    _output = output;
    _policy = policy;
    _shift  = shift;

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift));

    // Configure kernel window
    Window win = calculate_max_window(*input->info(), Steps());
    Coordinates coord;
    coord.set_num_dimensions(output->info()->num_dimensions());
    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));

    ICPPKernel::configure(win);
}

Status NEDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, shift));
    return Status{};
}
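
// Minimal usage sketch (illustrative only, not part of this translation unit). It assumes the
// arm_compute runtime headers (Tensor, NEScheduler) are available; the shapes and types are
// arbitrary examples:
//
//     Tensor src, dst;
//     src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::U8));
//     dst.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::S16));
//     NEDepthConvertLayerKernel kernel;
//     kernel.configure(&src, &dst, ConvertPolicy::SATURATE, 0 /* shift */);
//     src.allocator()->allocate();
//     dst.allocator()->allocate();
//     // ... fill src ...
//     NEScheduler::get().schedule(&kernel, Window::DimY);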

void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
    ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
    ARM_COMPUTE_ERROR_ON(_input == _output);

    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());
    const int  window_step_x  = 16;

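    // Collapse the window's X dimension to a single step: each execution of the loop bodies below
    // then processes one full row, vectorized in chunks of window_step_x elements with a scalar
    // tail loop for the leftovers.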
    Window win{ window };
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(_input, win);
    Iterator output(_output, win);

    switch(_input->info()->data_type())
    {
        case DataType::QASYMM8_SIGNED:
        {
            const int16x8_t b = vdupq_n_s16(_shift);
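            // vshlq_s16 shifts each lane by the signed per-lane count in b; a positive count is a
            // left shift, so b applies the 2^shift up-scaling required by this conversion.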

            switch(_output->info()->data_type())
            {
                case DataType::S16:
                {
                    /* Up-conversion QASYMM8_SIGNED -> S16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
                        int        x          = window_start_x;

                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
                                    vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
                                }
                            };

                            vst1q_s16(output_ptr + x, texels.val[0]);
                            vst1q_s16(output_ptr + x + 8, texels.val[1]);
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<int16_t>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::S32:
                {
                    /* Up-conversion QASYMM8_SIGNED -> S32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
                        int        x          = window_start_x;

                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
                                    vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
                                }
                            };

                            vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
                            vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
                            vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
                            vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::F32:
                {
                    /* Up-conversion QASYMM8_SIGNED -> F32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            // Index the load by x; loading from input.ptr() here would re-read the row start every iteration
                            const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
                                    vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
                                }
                            };
                            vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
                            vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
                            vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
                            vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                {
                    /* Up-conversion QASYMM8_SIGNED -> F16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
                        int        x          = window_start_x;

                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
                                    vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
                                }
                            };
                            vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
                            vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;
        }

        case DataType::QASYMM8:
        case DataType::U8:
        {
            const int16x8_t b = vdupq_n_s16(_shift);

            switch(_output->info()->data_type())
            {
                case DataType::S16:
                {
                    /* Up-conversion U8 -> S16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
                                }
                            };

                            vst1q_s16(output_ptr + x, texels.val[0]);
                            vst1q_s16(output_ptr + x + 8, texels.val[1]);
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            auto in           = static_cast<int32_t>(*(input_ptr + x));
                            *(output_ptr + x) = in << _shift;
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::S32:
                {
                    /* Up-conversion U8 -> S32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
                                }
                            };

                            vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
                            vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
                            vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
                            vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            auto in           = static_cast<uint32_t>(*(input_ptr + x));
                            *(output_ptr + x) = in << _shift;
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::F32:
                {
                    /* Up-conversion U8 -> F32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
                                }
                            };
                            vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
                            vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
                            vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
                            vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            auto in           = static_cast<uint32_t>(*(input_ptr + x));
                            *(output_ptr + x) = static_cast<float>(in << _shift);
                        }
                    },
                    input, output);
                    break;
                }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                {
                    /* Up-conversion U8 -> F16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
                                }
                            };
                            vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
                            vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::U16:
                {
                    /* Up-conversion U8 -> U16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);

                            const uint16x8x2_t texels =
                            {
                                {
                                    vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b),
                                    vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b)
                                }
                            };

                            vst1q_u16(output_ptr + x, texels.val[0]);
                            vst1q_u16(output_ptr + x + 8, texels.val[1]);
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x)) << _shift;
                        }
                    },
                    input, output);
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;
        }
        case DataType::S16:
        {
            switch(_output->info()->data_type())
            {
                case DataType::QASYMM8_SIGNED:
                {
                    const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
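                    // A negative per-lane count makes vshlq/vqshlq shift right, implementing the
                    // 2^-shift down-scaling; the saturating vq* variants clamp instead of wrapping.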

                    /* Down-conversion S16 -> QASYMM8_SIGNED */
                    if(ConvertPolicy::SATURATE == _policy)
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int16x8x2_t texels =
                                {
                                    {
                                        vqshlq_s16(vld1q_s16(input_ptr + x), b),
                                        vqshlq_s16(vld1q_s16(input_ptr + x + 8), b)
                                    }
                                };

                                vst1q_s8(output_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    else
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int16x8x2_t texels =
                                {
                                    {
                                        vshlq_s16(vld1q_s16(input_ptr + x), b),
                                        vshlq_s16(vld1q_s16(input_ptr + x + 8), b)
                                    }
                                };

                                vst1q_s8(output_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    break;
                }
                case DataType::U8:
                {
                    const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));

                    /* Down-conversion S16 -> U8 */
                    if(ConvertPolicy::SATURATE == _policy)
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int16x8x2_t texels =
                                {
                                    {
                                        vqshlq_s16(vld1q_s16(input_ptr + x), b),
                                        vqshlq_s16(vld1q_s16(input_ptr + x + 8), b)
                                    }
                                };

                                vst1q_u8(output_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    else
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int16x8x2_t texels =
                                {
                                    {
                                        vshlq_s16(vld1q_s16(input_ptr + x), b),
                                        vshlq_s16(vld1q_s16(input_ptr + x + 8), b)
                                    }
                                };

                                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
                                                                     vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    break;
                }
                case DataType::S32:
                {
                    const int32x4_t b = vdupq_n_s32(_shift);

                    /* Up-conversion S16 -> S32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const int16x8x2_t texels =
                            {
                                {
                                    vld1q_s16(input_ptr + x),
                                    vld1q_s16(input_ptr + x + 8)
                                }
                            };

                            const int32x4x4_t texels_s32 =
                            {
                                {
                                    vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
                                    vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
                                    vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
                                    vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
                                }
                            };

                            vst1q_s32(output_ptr + x, texels_s32.val[0]);
                            vst1q_s32(output_ptr + x + 4, texels_s32.val[1]);
                            vst1q_s32(output_ptr + x + 8, texels_s32.val[2]);
                            vst1q_s32(output_ptr + x + 12, texels_s32.val[3]);
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;
        }
        case DataType::U16:
        {
            switch(_output->info()->data_type())
            {
                case DataType::U8:
                {
                    const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));

                    /* Down-conversion U16 -> U8 */
                    if(ConvertPolicy::SATURATE == _policy)
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const uint16_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const uint16x8x2_t texels =
                                {
                                    {
                                        vqshlq_u16(vld1q_u16(input_ptr + x), b),
                                        vqshlq_u16(vld1q_u16(input_ptr + x + 8), b)
                                    }
                                };

                                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    else
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const uint16_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const uint16x8x2_t texels =
                                {
                                    {
                                        vshlq_u16(vld1q_u16(input_ptr + x), b),
                                        vshlq_u16(vld1q_u16(input_ptr + x + 8), b)
                                    }
                                };

                                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    break;
                }
                case DataType::U32:
                {
                    const int32x4_t b = vdupq_n_s32(_shift);

                    /* Up-conversion U16 -> U32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const uint16_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<uint32_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const uint16x8x2_t texels =
                            {
                                {
                                    vld1q_u16(input_ptr + x),
                                    vld1q_u16(input_ptr + x + 8)
                                }
                            };

                            vst1q_u32(output_ptr + x, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b));
                            vst1q_u32(output_ptr + x + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b));
                            vst1q_u32(output_ptr + x + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b));
                            vst1q_u32(output_ptr + x + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b));
                        }
                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;
        }
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
        case DataType::BFLOAT16:
            switch(_output->info()->data_type())
            {
                case DataType::F32:
                {
                    /* Up-conversion BFLOAT16 -> F32 */
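                    // bfloat16 is the upper half of an IEEE-754 binary32 value: widening the raw
                    // bits to 32 and shifting them left by 16 reconstructs the exact f32 value.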
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const bfloat16 *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            // Index the loads and stores by x; using input.ptr()/output.ptr() here
                            // would re-process the first 16 elements of the row on every iteration
                            const uint16x8x2_t texels =
                            {
                                {
                                    vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptr) + x),
                                    vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptr) + x + 8)
                                }
                            };

                            vst1q_f32(output_ptr + x,
                                      vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16)));
                            vst1q_f32(output_ptr + x + 4,
                                      vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16)));
                            vst1q_f32(output_ptr + x + 8,
                                      vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16)));
                            vst1q_f32(output_ptr + x + 12,
                                      vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16)));
                        }

                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = float(*(input_ptr + x));
                        }
                    },
                    input, output);
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Output data type unsupported");
            }
            break;
#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            switch(_output->info()->data_type())
            {
                case DataType::QASYMM8_SIGNED:
                {
                    const float16_t   scale_s = 1 << _shift;
                    const float16x8_t scale   = vdupq_n_f16(scale_s);
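                    // The bit-shift cannot be applied to float lanes directly, so it is folded into
                    // a multiplication by 2^shift before the values are converted and narrowed.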

                    /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float16x8x2_t texels =
                            {
                                {
                                    vmulq_f16(vld1q_f16(input_ptr + x), scale),
                                    vmulq_f16(vld1q_f16(input_ptr + x + 8), scale),
                                }
                            };

                            vst1q_s8(output_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::QASYMM8:
                case DataType::U8:
                {
                    const float16_t   scale_s = 1 << _shift;
                    const float16x8_t scale   = vdupq_n_f16(scale_s);

                    /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float16x8x2_t texels =
                            {
                                {
                                    vmulq_f16(vld1q_f16(input_ptr + x), scale),
                                    vmulq_f16(vld1q_f16(input_ptr + x + 8), scale),
                                }
                            };

                            vst1q_u8(output_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::F32:
                {
                    const float       scale_s = 1 << _shift;
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Up-conversion F16 -> F32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float16x8x2_t texels =
                            {
                                {
                                    vld1q_f16(input_ptr + x),
                                    vld1q_f16(input_ptr + x + 8)
                                }
                            };
                            vst1q_f32(output_ptr + x, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale));
                            vst1q_f32(output_ptr + x + 4, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale));
                            vst1q_f32(output_ptr + x + 8, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale));
                            vst1q_f32(output_ptr + x + 12, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::S32:
                {
                    const float       scale_s = 1 << _shift;
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Up-conversion F16 -> S32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float16x8x2_t texels =
                            {
                                {
                                    vld1q_f16(input_ptr + x),
                                    vld1q_f16(input_ptr + x + 8)
                                }
                            };

                            vst1q_s32(output_ptr + x, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale)));
                            vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale)));
                            vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale)));
                            vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale)));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            switch(_output->info()->data_type())
            {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                {
                    const float       scale_s = 1.f / (1 << _shift);
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Down-conversion F32 -> F16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vmulq_f32(vld1q_f32(input_ptr + x), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 12), scale)
                                }
                            };

                            vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
                            vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
                case DataType::BFLOAT16:
                {
                    /* Down-conversion F32 -> BFLOAT16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<bfloat16 *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            // Convert 8 floats per call, indexing by x so each iteration advances through the row
                            wrapper::vcvt_bf16_f32(input_ptr + x, reinterpret_cast<uint16_t *>(output_ptr) + x);
                            wrapper::vcvt_bf16_f32(input_ptr + x + 8, reinterpret_cast<uint16_t *>(output_ptr) + x + 8);
                        }

                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = *(input_ptr + x);
                        }
                    },
                    input, output);
                    break;
                }
#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
                case DataType::S32:
                {
                    const float       scale_s = 1.f / (1 << _shift);
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Conversion F32 -> S32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vmulq_f32(vld1q_f32(input_ptr + x), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
                                }
                            };

                            vst1q_s32(output_ptr + x, vcvtq_s32_f32(texels.val[0]));
                            vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
                            vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
                            vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::QASYMM8:
                case DataType::U8:
                {
                    const float       scale_s = 1.f / (1 << _shift);
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Down-conversion F32 -> U8 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vmulq_f32(vld1q_f32(input_ptr + x), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
                                }
                            };

                            vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
                            vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::QASYMM8_SIGNED:
                {
                    const float       scale_s = 1.f / (1 << _shift);
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Down-conversion F32 -> QASYMM8_SIGNED */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vmulq_f32(vld1q_f32(input_ptr + x), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
                                }
                            };

                            vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
                            vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
                        }
                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }

                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;

        case DataType::S32:
            switch(_output->info()->data_type())
            {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                {
                    const float       scale_s = 1.f / (1 << _shift);
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Down-conversion S32 -> F16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x)), scale),
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 4)), scale),
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 8)), scale),
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 12)), scale)
                                }
                            };

                            vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
                            vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                case DataType::F32:
                {
                    // Scale in the float domain: the scale must be a float (as an int, 1.f / (1 << shift)
                    // truncates to 0 for any non-zero shift) and is applied after converting the lanes
                    const float       scale_s = 1.f / (1 << _shift);
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Conversion S32 -> F32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x)), scale),
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 4)), scale),
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 8)), scale),
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 12)), scale),
                                }
                            };

                            vst1q_f32(output_ptr + x, texels.val[0]);
                            vst1q_f32(output_ptr + x + 4, texels.val[1]);
                            vst1q_f32(output_ptr + x + 8, texels.val[2]);
                            vst1q_f32(output_ptr + x + 12, texels.val[3]);
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float>(*(input_ptr + x)) * scale_s;
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::QASYMM8_SIGNED:
                {
                    const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));

                    /* Down-conversion S32 -> QASYMM8_SIGNED */
                    if(ConvertPolicy::SATURATE == _policy)
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int32x4x4_t texels =
                                {
                                    {
                                        vqshlq_s32(vld1q_s32(input_ptr + x), b),
                                        vqshlq_s32(vld1q_s32(input_ptr + x + 4), b),
                                        vqshlq_s32(vld1q_s32(input_ptr + x + 8), b),
                                        vqshlq_s32(vld1q_s32(input_ptr + x + 12), b)
                                    }
                                };
                                vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
                                vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    else
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int32x4x4_t texels =
                                {
                                    {
                                        vshlq_s32(vld1q_s32(input_ptr + x), b),
                                        vshlq_s32(vld1q_s32(input_ptr + x + 4), b),
                                        vshlq_s32(vld1q_s32(input_ptr + x + 8), b),
                                        vshlq_s32(vld1q_s32(input_ptr + x + 12), b)
                                    }
                                };

                                vst1_s8(output_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
                                vst1_s8(output_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    break;
                }
                case DataType::QASYMM8:
                case DataType::U8:
                {
                    const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));

                    /* Down-conversion S32 -> U8 */
                    if(ConvertPolicy::SATURATE == _policy)
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int32x4x4_t texels =
                                {
                                    {
                                        vqshlq_s32(vld1q_s32(input_ptr + x), b),
                                        vqshlq_s32(vld1q_s32(input_ptr + x + 4), b),
                                        vqshlq_s32(vld1q_s32(input_ptr + x + 8), b),
                                        vqshlq_s32(vld1q_s32(input_ptr + x + 12), b)
                                    }
                                };
                                vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
                                vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    else
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int32x4x4_t texels =
                                {
                                    {
                                        vshlq_s32(vld1q_s32(input_ptr + x), b),
                                        vshlq_s32(vld1q_s32(input_ptr + x + 4), b),
                                        vshlq_s32(vld1q_s32(input_ptr + x + 8), b),
                                        vshlq_s32(vld1q_s32(input_ptr + x + 12), b)
                                    }
                                };

                                vst1_u8(output_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
                                vst1_u8(output_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;
        default:
            ARM_COMPUTE_ERROR("Not supported");
    }
}