/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEIm2ColKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <tuple>

using namespace arm_compute;
using namespace misc::shape_calculator;

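// NEIm2ColKernel rearranges (kernel_width x kernel_height x depth) input patches into rows of a
// matrix ("im2col"), so that a convolution can subsequently be computed as a single matrix
// multiplication. When has_bias is set, each row is extended with a trailing 1. The helpers in
// the anonymous namespace below validate the arguments, configure the execution window and
// perform the actual per-patch linearization for the NCHW and NHWC layouts.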
namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
                          bool has_bias, const Size2D &dilation, unsigned int num_groups)
{
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias);
    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on NEON");

    if(output->total_size() > 0)
    {
        TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false));
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
    }

    return Status{};
}

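// Computes the convolved output dimensions, auto-initializes the output tensor info if it is
// still empty and builds the execution window: one step per convolved (x, y) position, with the
// channel dimension collapsed since each step linearizes a full patch across all channels.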
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
                                                        bool has_bias, const Size2D &dilation)
{
    const unsigned int width_idx   = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
    const unsigned int height_idx  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
    const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);

    std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input->dimension(width_idx), input->dimension(height_idx),
                                                                             kernel_dims.width, kernel_dims.height,
                                                                             conv_info, dilation);

    // Output tensor auto initialization if not yet initialized
    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false)));

    Window win = calculate_max_window(*input, Steps());
    win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1));
    win.set(height_idx, Window::Dimension(0, convolved_dims.second, 1));
    win.set(channel_idx, Window::Dimension(0, 1, 1));

    // The NEIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));

    return std::make_pair(Status{}, win);
}

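// Copies one (kernel_width x kernel_height x kernel_depth) patch of an NCHW input into a
// contiguous output row, channel plane by channel plane, writing pad_value for out-of-bounds
// coordinates when has_pads is true. For example, a 3x3 kernel over 3 input channels produces
// 27 consecutive values per patch (28 when a bias term of 1 is appended). The main loop handles
// three channels per iteration; the tail loop covers the remainder.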
template <typename T, bool has_pads>
inline void linearize_volume_nchw(const uint8_t *const in_ptr,
                                  T                   *out_ptr,
                                  bool                 has_bias,
                                  int                  top_left_x,
                                  int                  top_left_y,
                                  int                  kernel_width,
                                  int                  kernel_height,
                                  int                  kernel_depth,
                                  int                  input_w,
                                  int                  input_h,
                                  int                  input_stride_x,
                                  int                  input_stride_y,
                                  int                  input_stride_z,
                                  int                  pad_value,
                                  int                  dilation_x,
                                  int                  dilation_y)
{
    const int kernel_size2 = kernel_width * kernel_height;
    const int x_e          = top_left_x + kernel_width * dilation_x;
    const int y_e          = top_left_y + kernel_height * dilation_y;

    // Linearize volume
    int d = 0;
    // This for loop linearizes a volume with 3 slices at a time. This allows:
    // 1) to reduce the iterations of the outer for loop "d"
    // 2) to have an optimized im2col for the first convolution layer, where we usually have 3 IFMs
    for(; d <= (kernel_depth - 3); d += 3)
    {
        for(int y = top_left_y; y < y_e; y += dilation_y)
        {
            if((y < 0 || y >= input_h) && has_pads)
            {
                // All the values will be the offset (will be zeros when not quantized)
                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                {
                    *(out_ptr + 0 * kernel_size2) = pad_value;
                    *(out_ptr + 1 * kernel_size2) = pad_value;
                    *(out_ptr + 2 * kernel_size2) = pad_value;
                }
            }
            else
            {
                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                {
                    if((x < 0 || x >= input_w) && has_pads)
                    {
                        *(out_ptr + 0 * kernel_size2) = pad_value;
                        *(out_ptr + 1 * kernel_size2) = pad_value;
                        *(out_ptr + 2 * kernel_size2) = pad_value;
                    }
                    else
                    {
                        *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
                        *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
                        *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
                    }
                }
            }
        }
        out_ptr += 2 * kernel_size2;
    }

    // Left over
    for(; d < kernel_depth; d++)
    {
        for(int y = top_left_y; y < y_e; y += dilation_y)
        {
            if((y < 0 || y >= input_h) && has_pads)
            {
                // All the values will be the offset (will be zeros when not quantized)
                memset(static_cast<void *>(out_ptr), pad_value, kernel_width * sizeof(T));
                out_ptr += kernel_width;
            }
            else
            {
                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                {
                    if((x < 0 || x >= input_w) && has_pads)
                    {
                        *out_ptr = pad_value;
                    }
                    else
                    {
                        *out_ptr = *(reinterpret_cast<const T *>(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
                    }
                }
            }
        }
    }

    // Append 1 if the convolution layer has biases
    if(has_bias)
    {
        *out_ptr = static_cast<T>(1);
    }
}

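// NHWC variant: channels are the innermost dimension, so kernel rows that lie fully inside the
// input can be copied in bulk. When there is no dilation along x, the patch does not touch the
// borders and the row stride equals input_c * element_size, a single memcpy moves
// input_c * kernel_width values per kernel row; otherwise the row is assembled pixel by pixel,
// with pad_value filling the out-of-bounds positions.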
template <typename T, bool has_pads>
inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
                                  T                   *out_ptr,
                                  bool                 has_bias,
                                  int                  start_x,
                                  int                  start_y,
                                  int                  kernel_width,
                                  int                  kernel_height,
                                  int                  input_w,
                                  int                  input_h,
                                  int                  input_c,
                                  int                  input_stride_y,
                                  int                  input_stride_z,
                                  int                  pad_value,
                                  int                  dilation_x,
                                  int                  dilation_y)
{
    const int end_x        = start_x + kernel_width * dilation_x;
    const int end_y        = start_y + kernel_height * dilation_y;
    const int pad_quant    = kernel_width * input_c;
    const int element_size = static_cast<int>(sizeof(T));
    if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == input_c * element_size))
    {
        for(int y = start_y; y < end_y; y += dilation_y)
        {
            //optimized for no dilation and no boundary pixels
            memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
            out_ptr += input_c * kernel_width;
        }
    }
    else
    {
        for(int y = start_y; y < end_y; y += dilation_y)
        {
            if(y < 0 || y >= input_h)
            {
                memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size);
                out_ptr += pad_quant;
            }
            else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size)
            {
                for(int x = start_x; x < end_x; x += dilation_x)
                {
                    if(x < 0 || x >= input_w)
                    {
                        memset(static_cast<void *>(out_ptr), pad_value, input_c * element_size);
                        out_ptr += input_c;
                    }
                    else
                    {
                        memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * element_size);
                        out_ptr += input_c;
                    }
                }
            }
            else
            {
                //optimized for no dilation and no boundary pixels
                memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
                out_ptr += input_c * kernel_width;
            }
        }
    }
    // Append 1 if the convolution layer has biases
    if(has_bias)
    {
        *out_ptr = static_cast<T>(1);
    }
}
} // namespace

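// Iterates over the output window (one step per convolved output position), computes the
// top-left input coordinate of the corresponding patch from the stride and padding, and hands
// the patch over to the NCHW or NHWC linearizer selected at compile time via the template flags.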
template <typename T, bool has_pads, bool is_nchw>
void NEIm2ColKernel::run_im2col(const Window &window)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    const unsigned int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
    const unsigned int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
    const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);

    const int input_w        = _input->info()->dimension(width_idx);
    const int input_h        = _input->info()->dimension(height_idx);
    const int input_c        = _input->info()->dimension(channel_idx);
    const int input_stride_x = _input->info()->strides_in_bytes().x();
    const int input_stride_y = _input->info()->strides_in_bytes().y();
    const int input_stride_z = _input->info()->strides_in_bytes().z();
    const int pad_left       = _conv_info.pad_left();
    const int pad_top        = _conv_info.pad_top();
    const int stride_x       = _conv_info.stride().first;
    const int stride_y       = _conv_info.stride().second;
    const int pad_value      = is_data_type_quantized(_input->info()->data_type()) ? _input->info()->quantization_info().uniform().offset : 0;

    Window window_in_out(window);
    // The first three dimensions of the input and output are increased by the inner loops
    window_in_out.set(Window::DimX, Window::Dimension(0, 0, 0));
    window_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
    window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));

    // Create iterators
    Iterator in(_input, window_in_out);
    Iterator out(_output, window_in_out);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const int start_w = id[width_idx] * stride_x - pad_left;
        const int start_h = id[height_idx] * stride_y - pad_top;

        // Get pointers
        const uint8_t *const input_ptr  = in.ptr();
        auto                 output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * _output->info()->strides_in_bytes().y());

        // Linearize volume
        if(is_nchw)
        {
            linearize_volume_nchw<T, has_pads>(input_ptr,
                                               output_ptr,
                                               _has_bias,
                                               start_w,
                                               start_h,
                                               _kernel_width,
                                               _kernel_height,
                                               input_c,
                                               input_w,
                                               input_h,
                                               input_stride_x,
                                               input_stride_y,
                                               input_stride_z,
                                               pad_value,
                                               _dilation.x(),
                                               _dilation.y());
        }
        else
        {
            linearize_volume_nhwc<T, has_pads>(input_ptr,
                                               output_ptr,
                                               _has_bias,
                                               start_w,
                                               start_h,
                                               _kernel_width,
                                               _kernel_height,
                                               input_w,
                                               input_h,
                                               input_c,
                                               input_stride_y,
                                               input_stride_z,
                                               pad_value,
                                               _dilation.x(),
                                               _dilation.y());
        }
    },
    in, out);
}

NEIm2ColKernel::NEIm2ColKernel()
    : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U), _data_layout(DataLayout::UNKNOWN)
{
}

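// Stores the convolution parameters and binds _func to the run_im2col instantiation that matches
// the data layout, the data type and whether the convolution uses padding; the has_pads template
// flag distinguishes padded convolutions so that the NCHW path can compile out the per-element
// boundary checks when there is no padding.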
void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
                               bool has_bias, const Size2D &dilation, unsigned int num_groups)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
    ARM_COMPUTE_UNUSED(num_groups);

    _data_layout                  = input->info()->data_layout();
    const unsigned int width_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
    const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);

    _input          = input;
    _output         = output;
    _conv_info      = conv_info;
    _kernel_width   = kernel_dims.width;
    _kernel_height  = kernel_dims.height;
    _dilation       = dilation;
    _convolved_dims = scaled_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx),
                                        _kernel_width, _kernel_height,
                                        _conv_info, _dilation);
    _has_bias = has_bias;

    if(_data_layout == DataLayout::NCHW)
    {
        switch(_input->info()->data_type())
        {
            case DataType::F32:
                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float, false, true> : &NEIm2ColKernel::run_im2col<float, true, true>;
                break;
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
            case DataType::BFLOAT16:
                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<bfloat16, false, true> : &NEIm2ColKernel::run_im2col<bfloat16, true, true>;
                break;
#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            case DataType::F16:
                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float16_t, false, true> : &NEIm2ColKernel::run_im2col<float16_t, true, true>;
                break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
            case DataType::QASYMM8_SIGNED:
            case DataType::QASYMM8:
                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<qasymm8_t, false, true> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, true>;
                break;
            default:
                ARM_COMPUTE_ERROR("Data type not supported");
                break;
        }
    }
    else
    {
        switch(_input->info()->data_type())
        {
            case DataType::F32:
                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float, false, false> : &NEIm2ColKernel::run_im2col<float, true, false>;
                break;
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
            case DataType::BFLOAT16:
                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<bfloat16, false, false> : &NEIm2ColKernel::run_im2col<bfloat16, true, false>;
                break;
#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            case DataType::F16:
                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float16_t, false, false> : &NEIm2ColKernel::run_im2col<float16_t, true, false>;
                break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
            case DataType::QASYMM8:
                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<uint8_t, false, false> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, false>;
                break;
            case DataType::QASYMM8_SIGNED:
                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<int8_t, false, false> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, false>;
                break;
            default:
                ARM_COMPUTE_ERROR("Data type not supported");
                break;
        }
    }

    // Configure kernel window
    auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    INEKernel::configure(win_config.second);
}

Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
                                bool has_bias, const Size2D &dilation, unsigned int num_groups)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation).first);
    return Status{};
}

void NEIm2ColKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    (this->*_func)(window);
}
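
// A minimal usage sketch. In practice this kernel is driven internally by the library's
// convolution functions (e.g. NEGEMMConvolutionLayer); the tensor setup and the scheduling call
// below are assumptions based on how other NEON kernels of this library version are exercised,
// not part of this file:
//
//   Tensor src, dst;
//   src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
//
//   NEIm2ColKernel im2col;
//   // 3x3 kernel, stride 1, 1-pixel padding, no bias (dilation/num_groups use the header defaults)
//   im2col.configure(&src, &dst, Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1), false);
//
//   src.allocator()->allocate(); // configure() auto-initializes dst's TensorInfo
//   dst.allocator()->allocate();
//   // ... fill src ...
//   NEScheduler::get().schedule(&im2col, Window::DimY);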