1 /*
2 * Copyright (c) 2017-2020 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24 #include "src/core/NEON/kernels/NEDequantizationLayerKernel.h"
25
26 #include "arm_compute/core/Error.h"
27 #include "arm_compute/core/Helpers.h"
28 #include "arm_compute/core/Utils.h"
29 #include "arm_compute/core/Validate.h"
30 #include "arm_compute/core/Window.h"
31 #include "src/core/AccessWindowStatic.h"
32 #include "src/core/CPP/Validate.h"
33 #include "src/core/NEON/NEAsymm.h"
34 #include "src/core/NEON/NESymm.h"
35 #include "src/core/NEON/wrapper/wrapper.h"
36 #include "src/core/helpers/AutoConfiguration.h"
37 #include "src/core/helpers/WindowHelpers.h"
38
39 #include <arm_neon.h>
40
41 namespace arm_compute
42 {
43 namespace
44 {
validate_arguments(const ITensorInfo * input,const ITensorInfo * output)45 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
46 {
47 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
48 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
49
50 if(output->tensor_shape().total_size() > 0)
51 {
52 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
53 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
54 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
55 }
56
57 return Status{};
58 }
59
validate_and_configure_window(ITensorInfo * input,ITensorInfo * output)60 std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
61 {
62 // Configure kernel window
63 Window win = calculate_max_window(*input, Steps());
64
65 // Output tensor auto initialization if not yet initialized
66 auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
67
68 // NEDequantizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
69 Coordinates coord;
70 coord.set_num_dimensions(output->num_dimensions());
71 output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
72
73 return std::make_tuple(Status{}, win);
74 }
75
76 template <typename T>
store_result(T * ptr,const float32x4x4_t & v)77 inline void store_result(T *ptr, const float32x4x4_t &v)
78 {
79 ARM_COMPUTE_UNUSED(ptr, v);
80 }
81
82 template <>
store_result(float * ptr,const float32x4x4_t & v)83 inline void store_result<float>(float *ptr, const float32x4x4_t &v)
84 {
85 wrapper::vstore(ptr, v.val[0]);
86 wrapper::vstore(ptr + 4, v.val[1]);
87 wrapper::vstore(ptr + 8, v.val[2]);
88 wrapper::vstore(ptr + 12, v.val[3]);
89 }
90
91 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
92 template <>
store_result(float16_t * ptr,const float32x4x4_t & v)93 inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
94 {
95 wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
96 wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3])));
97 }
98 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
99
100 template <typename T>
store_result(T * ptr,const float32x4x2_t & v)101 inline void store_result(T *ptr, const float32x4x2_t &v)
102 {
103 ARM_COMPUTE_UNUSED(ptr, v);
104 }
105
106 template <>
store_result(float * ptr,const float32x4x2_t & v)107 inline void store_result<float>(float *ptr, const float32x4x2_t &v)
108 {
109 wrapper::vstore(ptr, v.val[0]);
110 wrapper::vstore(ptr + 4, v.val[1]);
111 }
112
113 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
114 template <>
store_result(float16_t * ptr,const float32x4x2_t & v)115 inline void store_result<float16_t>(float16_t *ptr, const float32x4x2_t &v)
116 {
117 wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
118 }
119 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
120
121 template <typename TOut, typename TIn>
run_dequantization_qasymm8(const ITensor * input,ITensor * output,const Window & window)122 void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window)
123 {
124 const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
125 const float scale = qinfo.scale;
126 const int32_t offset = qinfo.offset;
127
128 const int window_step_x = 16;
129 const auto window_start_x = static_cast<int>(window.x().start());
130 const auto window_end_x = static_cast<int>(window.x().end());
131
132 // Collapse window and reset first dimension to handle tail calculations manually
133 Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
134 win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
135
136 // Create iterators
137 Iterator in(input, win_collapsed);
138 Iterator out(output, win_collapsed);
139
140 execute_window_loop(win_collapsed, [&](const Coordinates &)
141 {
142 const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr());
143 const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
144
145 int x = window_start_x;
146 for(; x <= (window_end_x - window_step_x); x += window_step_x)
147 {
148 const auto vin = wrapper::vloadq(in_ptr + x);
149 const auto vdeq = vdequantize(vin, scale, offset);
150
151 store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq);
152 }
153
154 // Compute left-over elements
155 for(; x < window_end_x; ++x)
156 {
157 auto val = *(in_ptr + x);
158 *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo));
159 }
160 },
161 in, out);
162 }
163
164 template <typename T>
run_dequantization_qsymm8_per_channel_nchw(const ITensor * input,ITensor * output,const Window & window)165 void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *output, const Window &window)
166 {
167 const auto scale = input->info()->quantization_info().scale();
168
169 const int window_step_x = 16;
170 const auto window_start_x = static_cast<int>(window.x().start());
171 const auto window_end_x = static_cast<int>(window.x().end());
172
173 // Reset first dimension to handle tail calculations manually
174 Window win(window);
175 win.set(Window::DimX, Window::Dimension(0, 1, 1));
176
177 // Create iterators
178 Iterator in(input, win);
179 Iterator out(output, win);
180
181 execute_window_loop(win, [&](const Coordinates & id)
182 {
183 const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
184 const auto out_ptr = reinterpret_cast<T *>(out.ptr());
185
186 int x = window_start_x;
187 for(; x <= (window_end_x - window_step_x); x += window_step_x)
188 {
189 const auto vin = wrapper::vloadq(in_ptr + x);
190 const auto vdeq = vdequantize(vin, scale[id.z()]);
191
192 store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
193 }
194
195 // Compute left-over elements
196 for(; x < window_end_x; ++x)
197 {
198 int8_t val = *(in_ptr + x);
199 *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()]));
200 }
201 },
202 in, out);
203 }
204
205 template <typename T>
run_dequantization_qsymm8_per_channel_nhwc(const ITensor * input,ITensor * output,const Window & window)206 void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *output, const Window &window)
207 {
208 const auto scale = input->info()->quantization_info().scale();
209
210 const int window_step_x = 16;
211 const auto window_start_x = static_cast<int>(window.x().start());
212 const auto window_end_x = static_cast<int>(window.x().end());
213
214 // Reset first dimension to handle tail calculations manually
215 Window win(window);
216 win.set(Window::DimX, Window::Dimension(0, 1, 1));
217
218 // Create iterators
219 Iterator in(input, win);
220 Iterator out(output, win);
221
222 execute_window_loop(win, [&](const Coordinates &)
223 {
224 const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
225 const auto out_ptr = reinterpret_cast<T *>(out.ptr());
226
227 int x = window_start_x;
228 for(; x <= (window_end_x - window_step_x); x += window_step_x)
229 {
230 const float32x4x4_t vscale =
231 {
232 {
233 scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3],
234 scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7],
235 scale[x + 8], scale[x + 9], scale[x + 10], scale[x + 11],
236 scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15]
237 }
238 };
239 const auto vin = wrapper::vloadq(in_ptr + x);
240 const auto vdeq = vdequantize(vin, vscale);
241
242 store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
243 }
244
245 // Compute left-over elements
246 for(; x < window_end_x; ++x)
247 {
248 int8_t val = *(in_ptr + x);
249 *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x]));
250 }
251 },
252 in, out);
253 }
254
255 template <typename T>
run_dequantization_qsymm8(const ITensor * input,ITensor * output,const Window & window)256 void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window)
257 {
258 const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
259 const float scale = qinfo.scale;
260
261 const int window_step_x = 16;
262 const auto window_start_x = static_cast<int>(window.x().start());
263 const auto window_end_x = static_cast<int>(window.x().end());
264
265 // Collapse window and reset first dimension to handle tail calculations manually
266 Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
267 win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
268
269 // Create iterators
270 Iterator in(input, win_collapsed);
271 Iterator out(output, win_collapsed);
272
273 execute_window_loop(win_collapsed, [&](const Coordinates &)
274 {
275 const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
276 const auto out_ptr = reinterpret_cast<T *>(out.ptr());
277
278 int x = window_start_x;
279 for(; x <= (window_end_x - window_step_x); x += window_step_x)
280 {
281 const auto vin = wrapper::vloadq(in_ptr + x);
282 const auto vdeq = vdequantize(vin, scale);
283
284 store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
285 }
286
287 // Compute left-over elements
288 for(; x < window_end_x; ++x)
289 {
290 int8_t val = *(in_ptr + x);
291 *(out_ptr + x) = static_cast<T>(dequantize(val, scale));
292 }
293 },
294 in, out);
295 }
296
297 template <typename T>
run_dequantization_qsymm16(const ITensor * input,ITensor * output,const Window & window)298 void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window)
299 {
300 const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
301 const float scale = qinfo.scale;
302
303 const int window_step_x = 8;
304 const auto window_start_x = static_cast<int>(window.x().start());
305 const auto window_end_x = static_cast<int>(window.x().end());
306
307 // Collapse window and reset first dimension to handle tail calculations manually
308 Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
309 win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
310
311 // Create iterators
312 Iterator in(input, win_collapsed);
313 Iterator out(output, win_collapsed);
314
315 execute_window_loop(win_collapsed, [&](const Coordinates &)
316 {
317 const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr());
318 const auto out_ptr = reinterpret_cast<T *>(out.ptr());
319
320 int x = window_start_x;
321 for(; x <= (window_end_x - window_step_x); x += window_step_x)
322 {
323 const auto vin = wrapper::vloadq(in_ptr + x);
324 const auto vdeq = vdequantize_int16(vin, scale);
325
326 store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
327 }
328
329 // Compute left-over elements
330 for(; x < window_end_x; ++x)
331 {
332 int16_t val = *(in_ptr + x);
333 *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale));
334 }
335 },
336 in, out);
337 }
338
339 template <typename T>
run_dequantization_core(const ITensor * input,ITensor * output,const Window & window)340 void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
341 {
342 switch(input->info()->data_type())
343 {
344 case DataType::QASYMM8:
345 run_dequantization_qasymm8<T, uint8_t>(input, output, window);
346 break;
347 case DataType::QASYMM8_SIGNED:
348 run_dequantization_qasymm8<T, int8_t>(input, output, window);
349 break;
350 case DataType::QSYMM8_PER_CHANNEL:
351 input->info()->data_layout() == DataLayout::NHWC ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window);
352 break;
353 case DataType::QSYMM8:
354 run_dequantization_qsymm8<T>(input, output, window);
355 break;
356 case DataType::QSYMM16:
357 run_dequantization_qsymm16<T>(input, output, window);
358 break;
359 default:
360 ARM_COMPUTE_ERROR("Unsupported data type.");
361 }
362 }
363 } // namespace
364
NEDequantizationLayerKernel()365 NEDequantizationLayerKernel::NEDequantizationLayerKernel()
366 : _input(nullptr), _output(nullptr)
367 {
368 }
369
configure(const ITensor * input,ITensor * output)370 void NEDequantizationLayerKernel::configure(const ITensor *input, ITensor *output)
371 {
372 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
373 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
374
375 _input = input;
376 _output = output;
377
378 // Configure kernel window
379 auto win_config = validate_and_configure_window(input->info(), output->info());
380
381 ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
382
383 INEKernel::configure(std::get<1>(win_config));
384 }
385
validate(const ITensorInfo * input,const ITensorInfo * output)386 Status NEDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
387 {
388 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
389 ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
390 return Status{};
391 }
392
run(const Window & window,const ThreadInfo & info)393 void NEDequantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
394 {
395 ARM_COMPUTE_UNUSED(info);
396 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
397 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
398
399 switch(_output->info()->data_type())
400 {
401 case DataType::F32:
402 run_dequantization_core<float>(_input, _output, window);
403 break;
404 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
405 case DataType::F16:
406 run_dequantization_core<float16_t>(_input, _output, window);
407 break;
408 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
409 default:
410 ARM_COMPUTE_ERROR("Unsupported data type.");
411 }
412 }
413 } // namespace arm_compute
414