• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2019-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
25 
26 #include "arm_compute/core/CL/CLHelpers.h"
27 #include "arm_compute/core/CL/CLKernelLibrary.h"
28 #include "arm_compute/core/CL/ICLTensor.h"
29 #include "arm_compute/core/Helpers.h"
30 #include "arm_compute/core/TensorInfo.h"
31 #include "arm_compute/core/Utils.h"
32 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
33 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
34 #include "src/core/CL/CLValidate.h"
35 #include "src/core/CL/ICLKernel.h"
36 #include "src/core/helpers/AutoConfiguration.h"
37 #include "src/core/helpers/WindowHelpers.h"
38 #include "support/StringSupport.h"
39 
40 namespace arm_compute
41 {
42 namespace
43 {
validate_arguments(const ITensorInfo * input,const ITensorInfo * weights,const ITensorInfo * biases,const ITensorInfo * output,const DWCWeightsKernelInfo & dwc_weights_info,const DWCKernelInfo & dwc_info,const PadStrideInfo & conv_info,unsigned int depth_multiplier,const Size2D & dilation,const ITensorInfo * output_multipliers,const ITensorInfo * output_shifts)44 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
45                           const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
46                           const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
47 {
48     ARM_COMPUTE_UNUSED(dwc_info);
49     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
50     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
51     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
52     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
53     ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1 && dwc_weights_info.n0 != 1);
54     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1);
55     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().second < 1);
56     ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
57     const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
58     ARM_COMPUTE_UNUSED(idx_c);
59     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * depth_multiplier));
60 
61     const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
62 
63     const bool is_quantized = is_data_type_quantized(input->data_type());
64 
65     if(biases != nullptr)
66     {
67         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]);
68         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
69 
70         if(is_quantized)
71         {
72             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
73         }
74         else
75         {
76             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
77         }
78     }
79 
80     if(is_quantized)
81     {
82         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
83         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
84         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
85         ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
86         ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
87 
88         if(is_data_type_quantized_per_channel(weights->data_type()))
89         {
90             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
91             ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0));
92             ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_shifts->dimension(0));
93         }
94         else
95         {
96             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
97             ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0));
98             ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0));
99         }
100     }
101     else
102     {
103         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
104     }
105 
106     if(output->total_size() != 0)
107     {
108         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
109         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
110     }
111 
112     if(is_data_type_quantized(input->data_type()))
113     {
114         const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
115         const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
116         const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
117 
118         float multiplier        = iq_info.scale * wq_info.scale / oq_info.scale;
119         int   output_multiplier = 0;
120         int   output_shift      = 0;
121         ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
122     }
123 
124     return Status{};
125 }
126 } // namespace
127 
CLDepthwiseConvolutionLayerNativeKernel()128 CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel()
129     : _input(nullptr),
130       _weights(nullptr),
131       _biases(nullptr),
132       _output(nullptr),
133       _depth_multiplier(1),
134       _output_multipliers(nullptr),
135       _output_shifts(nullptr),
136       _is_quantized(false)
137 {
138 }
139 
configure(const ICLTensor * input,const ICLTensor * weights,const ICLTensor * biases,ICLTensor * output,const DWCWeightsKernelInfo & dwc_weights_info,const DWCKernelInfo & dwc_info,const PadStrideInfo & conv_info,unsigned int depth_multiplier,const Size2D & dilation,const ICLTensor * output_multipliers,const ICLTensor * output_shifts)140 void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
141                                                         const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
142                                                         const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
143 {
144     configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts);
145 }
146 
configure(const CLCompileContext & compile_context,const ICLTensor * input,const ICLTensor * weights,const ICLTensor * biases,ICLTensor * output,const DWCWeightsKernelInfo & dwc_weights_info,const DWCKernelInfo & dwc_info,const PadStrideInfo & conv_info,unsigned int depth_multiplier,const Size2D & dilation,const ICLTensor * output_multipliers,const ICLTensor * output_shifts)147 void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
148                                                         const DWCWeightsKernelInfo &dwc_weights_info,
149                                                         const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
150                                                         const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
151 {
152     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
153     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
154                                                   dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
155                                                   (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr));
156 
157     auto padding_info = get_padding_info({ input, output });
158 
159     const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info, depth_multiplier, dilation);
160     auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
161 
162     _input              = input;
163     _output             = output;
164     _weights            = weights;
165     _biases             = biases;
166     _depth_multiplier   = depth_multiplier;
167     _output_multipliers = output_multipliers;
168     _output_shifts      = output_shifts;
169     _is_quantized       = is_data_type_quantized(input->info()->data_type());
170 
171     const unsigned int n0 = adjust_vec_size(dwc_weights_info.n0, input->info()->dimension(0));
172 
173     CLBuildOptions build_opts;
174     build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
175     build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1, "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(_output->info()->dimension(2))));
176     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
177     build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(dwc_info.activation_info.activation())));
178     build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
179     build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
180     build_opts.add_option("-DSRC_DIM1=" + support::cpp11::to_string(_input->info()->dimension(1)));
181     build_opts.add_option("-DSRC_DIM2=" + support::cpp11::to_string(_input->info()->dimension(2)));
182     build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(weights->info()->dimension(1)));
183     build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(weights->info()->dimension(2)));
184     build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
185     build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
186     build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
187     build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
188     build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
189     build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
190     build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(_input->info()->dimension(0) % n0));
191 
192     std::string kernel_name = (_is_quantized) ? "dwc_MxN_native_quantized8_nhwc" : "dwc_MxN_native_fp_nhwc";
193 
194     if(_is_quantized)
195     {
196         const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
197         const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform();
198         const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
199 
200         build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset));
201         build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset));
202         build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset));
203         build_opts.add_option_if(is_data_type_quantized_per_channel(weights->info()->data_type()), "-DPER_CHANNEL_QUANTIZATION");
204 
205         // Compute non-per-channel multiplier and shift anyway to make OpenCL kernel simpler
206         float multiplier        = iq_info.scale * wq_info.scale / oq_info.scale;
207         int   output_multiplier = 0;
208         int   output_shift      = 0;
209         quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
210         build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
211         build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
212 
213         if(dwc_info.activation_info.enabled())
214         {
215             int a_val{};
216             int b_val{};
217             std::tie(b_val, a_val) = get_quantized_activation_min_max(dwc_info.activation_info, input->info()->data_type(), oq_info);
218 
219             const int o1 = oq_info.offset;
220 
221             build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
222             build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
223             build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
224 
225             const float s1 = iq_info.scale;
226             build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
227             build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
228         }
229 
230         build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
231         build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type()));
232     }
233     else
234     {
235         build_opts.add_option_if(dwc_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(dwc_info.activation_info.a()));
236         build_opts.add_option_if(dwc_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(dwc_info.activation_info.b()));
237     }
238 
239     Window win = calculate_max_window(*(output->info()), Steps(n0));
240     ICLKernel::configure_internal(win);
241 
242     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
243 
244     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
245 
246     // Set config_id for enabling LWS tuning
247     _config_id = kernel_name;
248     _config_id += "_";
249     _config_id += support::cpp11::to_string(input->info()->dimension(0));
250     _config_id += "_";
251     _config_id += support::cpp11::to_string(input->info()->dimension(1));
252     _config_id += "_";
253     _config_id += support::cpp11::to_string(input->info()->dimension(2));
254     _config_id += "_";
255     _config_id += support::cpp11::to_string(output->info()->dimension(0));
256     _config_id += "_";
257     _config_id += support::cpp11::to_string(output->info()->dimension(1));
258     _config_id += "_";
259     _config_id += support::cpp11::to_string(output->info()->dimension(2));
260     _config_id += "_";
261     _config_id += string_from_data_type(input->info()->data_type());
262 }
263 
validate(const ITensorInfo * input,const ITensorInfo * weights,const ITensorInfo * biases,const ITensorInfo * output,const DWCWeightsKernelInfo & dwc_weights_info,const DWCKernelInfo & dwc_info,const PadStrideInfo & conv_info,unsigned int depth_multiplier,const Size2D & dilation,const ITensorInfo * output_multipliers,const ITensorInfo * output_shifts)264 Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
265                                                          const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info,
266                                                          unsigned int depth_multiplier, const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
267 {
268     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts));
269     return Status{};
270 }
271 
run(const Window & window,cl::CommandQueue & queue)272 void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::CommandQueue &queue)
273 {
274     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
275     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
276 
277     // Collapse window
278     Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
279     Window slice_in         = window.first_slice_window_4D();
280     Window slice_out        = window_collapsed.first_slice_window_4D();
281 
282     if(_depth_multiplier != 1)
283     {
284         ARM_COMPUTE_ERROR_ON(slice_out.x().step() != 1);
285         slice_out.set(Window::DimX, Window::Dimension(0, _input->info()->tensor_shape()[0], 1));
286     }
287 
288     unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor();
289 
290     // Set output multipliers in case of quantized data type
291     if(_is_quantized)
292     {
293         add_1D_tensor_argument(idx, _output_multipliers, slice_in);
294         add_1D_tensor_argument(idx, _output_shifts, slice_in);
295     }
296 
297     if(_biases != nullptr)
298     {
299         add_1D_tensor_argument(idx, _biases, slice_in);
300     }
301 
302     do
303     {
304         idx = 0;
305         add_4D_tensor_argument(idx, _input, slice_in);
306         add_4D_tensor_argument(idx, _output, slice_out);
307         add_3D_tensor_argument(idx, _weights, slice_out);
308         enqueue(queue, *this, slice_out, lws_hint());
309     }
310     while(window_collapsed.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
311 }
312 } // namespace arm_compute
313