• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2019-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
26 
27 #include "arm_compute/core/ITensor.h"
28 #include "arm_compute/core/Utils.h"
29 #include "arm_compute/core/utils/misc/InfoHelpers.h"
30 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
31 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
32 #include "src/core/CPP/Validate.h"
33 #include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
34 #include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp"
35 #include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp"
36 #include "src/core/helpers/AutoConfiguration.h"
37 
38 #include "arm_compute/runtime/NEON/NEScheduler.h"
39 
40 #include "support/MemorySupport.h"
41 
42 #include <set>
43 
44 namespace arm_compute
45 {
46 namespace
47 {
get_qasymm8_convolver(int kernel_size,int stride_x,int n_batches,int in_rows,int in_cols,int n_channels,int dilation_factor,neon_convolution_kernels::ActivationFunction activation,const qasymm8::QAsymm8Params & wqinfo,const qasymm8::QAsymm8Params & iqinfo,const qasymm8::QAsymm8Params & oqinfo,const qasymm8::QAsymm8RescaleParams & rescale_params,int padding_top,int padding_left,int padding_bottom,int padding_right)48 std::unique_ptr<depthwise::IDepthwiseConvolution> get_qasymm8_convolver(int kernel_size, int stride_x,
49                                                                         int n_batches, int in_rows, int in_cols, int n_channels,
50                                                                         int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
51                                                                         const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo,
52                                                                         const qasymm8::QAsymm8RescaleParams &rescale_params,
53                                                                         int padding_top, int padding_left, int padding_bottom, int padding_right)
54 {
55     switch(kernel_size)
56     {
57         case 3:
58         {
59             switch(stride_x)
60             {
61                 case 1:
62                     return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
63                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
64                 case 2:
65                     return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
66                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
67                 default:
68                     return nullptr;
69             }
70         }
71         case 5:
72         {
73             switch(stride_x)
74             {
75                 case 1:
76                     return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 1, 1>>(
77                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
78                 case 2:
79                     return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 2, 2>>(
80                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
81                 default:
82                     return nullptr;
83             }
84         }
85         default:
86             return nullptr;
87     }
88 }
89 
get_qsymm8_perchannel_convolver(int kernel_size,int stride_x,int n_batches,int in_rows,int in_cols,int n_channels,neon_convolution_kernels::ActivationFunction activation,const qsymm8::QSymm8PerChannelParams & wqinfo,const qasymm8::QAsymm8Params & iqinfo,const qasymm8::QAsymm8Params & oqinfo,const qsymm8::QSymm8PerChannelRescaleParams & rescale_params,int padding_top,int padding_left,int padding_bottom,int padding_right)90 std::unique_ptr<depthwise::IDepthwiseConvolution> get_qsymm8_perchannel_convolver(int kernel_size, int stride_x,
91                                                                                   int n_batches, int in_rows, int in_cols, int n_channels,
92                                                                                   neon_convolution_kernels::ActivationFunction activation,
93                                                                                   const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo,
94                                                                                   const qsymm8::QSymm8PerChannelRescaleParams &rescale_params,
95                                                                                   int padding_top, int padding_left, int padding_bottom, int padding_right)
96 {
97     switch(kernel_size)
98     {
99         case 3:
100         {
101             switch(stride_x)
102             {
103                 case 1:
104                     return arm_compute::support::cpp14::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
105                                n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
106                 case 2:
107                     return arm_compute::support::cpp14::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
108                                n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
109                 default:
110                     return nullptr;
111             }
112         }
113         case 5:
114         {
115             switch(stride_x)
116             {
117                 case 1:
118                     return arm_compute::support::cpp14::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>>(
119                                n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
120                 case 2:
121                     return arm_compute::support::cpp14::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>>(
122                                n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
123                 default:
124                     return nullptr;
125             }
126         }
127         default:
128             return nullptr;
129     }
130 }
131 
132 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
get_fp16_convolver(int kernel_size,int stride_x,int n_batches,int in_rows,int in_cols,int n_channels,int dilation_factor,neon_convolution_kernels::ActivationFunction activation,int padding_top,int padding_left,int padding_bottom,int padding_right)133 std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp16_convolver(int kernel_size, int stride_x,
134                                                                      int n_batches, int in_rows, int in_cols, int n_channels,
135                                                                      int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
136                                                                      int padding_top, int padding_left, int padding_bottom, int padding_right)
137 {
138     switch(kernel_size)
139     {
140         case 3:
141         {
142             switch(stride_x)
143             {
144                 case 1:
145                     return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
146                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
147                 case 2:
148                     return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
149                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
150                 default:
151                     return nullptr;
152             }
153         }
154         case 5:
155         {
156             switch(stride_x)
157             {
158                 case 1:
159                     return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>>(
160                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
161                 case 2:
162                     return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>>(
163                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
164                 default:
165                     return nullptr;
166             }
167         }
168         default:
169             return nullptr;
170     }
171 }
172 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
173 
get_fp32_convolver(int kernel_size,int stride_x,int n_batches,int in_rows,int in_cols,int n_channels,int dilation_factor,neon_convolution_kernels::ActivationFunction activation,int padding_top,int padding_left,int padding_bottom,int padding_right)174 std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp32_convolver(int kernel_size, int stride_x,
175                                                                      int n_batches, int in_rows, int in_cols, int n_channels,
176                                                                      int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
177                                                                      int padding_top, int padding_left, int padding_bottom, int padding_right)
178 {
179     switch(kernel_size)
180     {
181         case 3:
182         {
183             switch(stride_x)
184             {
185                 case 1:
186                     return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
187                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
188                 case 2:
189                     return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
190                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
191                 default:
192                     return nullptr;
193             }
194         }
195         case 5:
196         {
197             switch(stride_x)
198             {
199                 case 1:
200                     return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>>(
201                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
202                 case 2:
203                     return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>>(
204                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
205                 default:
206                     return nullptr;
207             }
208         }
209         default:
210             return nullptr;
211     }
212 }
213 
/** Build the assembly depthwise convolver matching the tensors' data types
 * and quantization scheme.
 *
 * Dispatches to one of: the uniform QASYMM8 path, the per-channel QSYMM8
 * weights path, or the float (F16/F32) path. Returns nullptr when no
 * assembly routine matches.
 *
 * NOTE(review): the shape indexing below (batches at [3], channels at x())
 * assumes NHWC-ordered tensors — consistent with the NHWC handling in
 * is_optimized_supported(); confirm callers only reach here with NHWC.
 */
std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor      *input,
                                                                   const ITensor      *weights,
                                                                   ITensor            *output,
                                                                   PadStrideInfo       conv_info,
                                                                   ActivationLayerInfo act_info,
                                                                   const Size2D       &dilation)
{
    ARM_COMPUTE_UNUSED(dilation);
    const DataType    data_type = input->info()->data_type();
    const TensorShape shape     = input->info()->tensor_shape();

    const int n_batches       = shape[3];
    const int in_rows         = shape.z();
    const int in_cols         = shape.y();
    const int n_channels      = shape.x();
    const int dilation_factor = dilation.x(); // x/y dilation assumed equal (checked by is_optimized_supported)
    const int padding_top     = conv_info.pad_top();
    const int padding_left    = conv_info.pad_left();
    const int padding_bottom  = conv_info.pad_bottom();
    const int padding_right   = conv_info.pad_right();

    // Uniform: both input and weights QASYMM8. Per-channel: QASYMM8 input with QSYMM8_PER_CHANNEL weights.
    const bool is_uniform_quantized    = (data_type == DataType::QASYMM8) && (weights->info()->data_type() == DataType::QASYMM8);
    const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL);

    const unsigned int stride_x    = conv_info.stride().first;
    const unsigned int kernel_size = weights->info()->tensor_shape().y();

    // Map activation function to the assembly kernels' enum; anything other
    // than ReLU/ReLU6 maps to None (validate() rejects other activations).
    neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None;
    if(arm_compute::utils::info_helpers::is_relu(act_info))
    {
        activation = neon_convolution_kernels::ActivationFunction::ReLU;
    }
    else if(arm_compute::utils::info_helpers::is_relu6(act_info))
    {
        activation = neon_convolution_kernels::ActivationFunction::ReLU6;
    }

    // Create quantized convolver
    if(is_uniform_quantized)
    {
        const UniformQuantizationInfo input_qinfo   = input->info()->quantization_info().uniform();
        const UniformQuantizationInfo weights_qinfo = weights->info()->quantization_info().uniform();
        const UniformQuantizationInfo output_qinfo  = output->info()->quantization_info().uniform();

        // Check that quantization info are in the range [0, 255]
        ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
        ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255);
        ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
        const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
        const qasymm8::QAsymm8Params wqinfo{ static_cast<uint8_t>(weights_qinfo.offset), weights_qinfo.scale };
        const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };

        // Calculate rescale parameters: quantize the combined requantization
        // multiplier (in_scale * w_scale / out_scale) into a fixed-point
        // multiplier + right shift for the assembly kernel.
        const float fmultipler  = iqinfo.scale * wqinfo.scale / oqinfo.scale;
        int32_t     qmultiplier = 0;
        int32_t     qshift      = 0;
        quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift);
        qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler);

        return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation,
                                     wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
    }
    else if(is_perchannel_quantized)
    {
        const UniformQuantizationInfo input_qinfo   = input->info()->quantization_info().uniform();
        const QuantizationInfo        weights_qinfo = weights->info()->quantization_info();
        const UniformQuantizationInfo output_qinfo  = output->info()->quantization_info().uniform();

        // Check that quantization info are in the range [0, 255]
        ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
        ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
        const qasymm8::QAsymm8Params         iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
        const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() };
        const qasymm8::QAsymm8Params         oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };

        // Calculate rescale parameters: one fixed-point multiplier + shift
        // per output channel, derived from that channel's weight scale.
        std::vector<float>   fmultipliers;
        std::vector<int32_t> qmultipliers;
        std::vector<int32_t> qshifts;

        for(auto const s : wqinfo.scales)
        {
            const float fmultipler  = iqinfo.scale * s / oqinfo.scale;
            int32_t     qmultiplier = 0;
            int32_t     qshift      = 0;
            quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift);
            fmultipliers.push_back(fmultipler);
            qmultipliers.push_back(qmultiplier);
            qshifts.push_back(qshift);
        }

        qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers);

        return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation,
                                               wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
    }
    else
    {
        // Create float convolver
        switch(data_type)
        {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            case DataType::F16:
            {
                return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
            }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            case DataType::F32:
            {
                return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
            }
            default:
                return nullptr;
        }
    }
}
331 } // namespace
332 
/** Pimpl holding the assembly convolver and its ACL kernel wrapper, keeping
 * the assembly kernel headers out of the public function header. */
struct NEDepthwiseConvolutionAssemblyDispatch::LocalImpl
{
    std::unique_ptr<depthwise::IDepthwiseConvolution> _dwc_assembly_kernel{ nullptr }; // Assembly depthwise convolver, created in configure()
    NEDepthwiseConvolutionAssemblyKernelWrapper       _dwc_acl_kernel{};               // INEKernel-style wrapper configured over the convolver above
};
338 
339 #ifndef DOXYGEN_SKIP_THIS
/** Constructor: registers the (optional) memory manager with the memory
 * group and allocates the pimpl; all tensor pointers start null until
 * configure() is called. */
NEDepthwiseConvolutionAssemblyDispatch::NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<arm_compute::IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false),
      _pImpl(support::cpp14::make_unique<LocalImpl>())
{
}
345 #endif /* DOXYGEN_SKIP_THIS */
346 
// Out-of-line defaulted destructor: required so unique_ptr<LocalImpl> is
// destroyed where LocalImpl is a complete type (pimpl idiom).
NEDepthwiseConvolutionAssemblyDispatch::~NEDepthwiseConvolutionAssemblyDispatch() = default;
348 
/** Configure the dispatch: validate arguments, auto-initialize the output,
 * build the assembly convolver and its kernel wrapper, and set up the
 * workspace and packed-weights tensors.
 *
 * @param input            Source tensor.
 * @param weights          Depthwise weights tensor.
 * @param bias             (Optional) Bias tensor; may be nullptr.
 * @param output           Destination tensor (auto-initialized if empty).
 * @param conv_info        Padding and stride information.
 * @param depth_multiplier Channel multiplier (unused here; validate() enforces it).
 * @param act_info         (Optional) Activation to fuse.
 * @param dilation         (Optional) Kernel dilation.
 */
void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor             *input,
                                                       const ITensor             *weights,
                                                       const ITensor             *bias,
                                                       ITensor                   *output,
                                                       const PadStrideInfo       &conv_info,
                                                       unsigned int               depth_multiplier,
                                                       const ActivationLayerInfo &act_info,
                                                       const Size2D              &dilation)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_UNUSED(depth_multiplier);
    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionAssemblyDispatch::validate(input->info(),
                                                                                weights->info(),
                                                                                bias != nullptr ? bias->info() : nullptr,
                                                                                output->info(),
                                                                                conv_info,
                                                                                depth_multiplier,
                                                                                act_info,
                                                                                dilation));

    // Output auto inizialitation if not yet initialized
    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
    auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));

    _input       = input;
    _weights     = weights;
    _bias        = bias;
    _output      = output;
    _is_prepared = false; // weight packing is redone lazily (presumably in prepare() — not shown in this chunk)

    // Create convolver
    _pImpl->_dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info, dilation);
    ARM_COMPUTE_ERROR_ON(_pImpl->_dwc_assembly_kernel == nullptr);

    // Create assembly kernel wrapper
    _pImpl->_dwc_acl_kernel.configure(_pImpl->_dwc_assembly_kernel.get());

    constexpr size_t alignment = 128; // byte alignment for workspace / packed-weights buffers

    // Create workspace: a raw byte buffer (S8) sized for all scheduler threads,
    // managed by the memory group so it can be shared across functions.
    const unsigned int num_threads    = NEScheduler::get().num_threads();
    const size_t       workspace_size = _pImpl->_dwc_assembly_kernel->get_working_space_size(num_threads);
    ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !");
    _workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size }, 1, DataType::S8), alignment);
    _memory_group.manage(&_workspace);
    _workspace.allocator()->allocate();

    // Create packing tensor: initialized here but not allocated — allocation
    // is deferred (not visible in this chunk; presumably done in prepare()).
    const size_t pack_tensor_size = _pImpl->_dwc_assembly_kernel->get_packed_params_size();
    ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !");
    _packed_weights.allocator()->init(TensorInfo(TensorShape{ pack_tensor_size }, 1, DataType::S8), alignment);
}
401 
/** Static validation of the dispatch arguments.
 *
 * Checks data types/layouts, that the configuration is supported by the
 * assembly routines, the fused activation, bias shape, output shape, and
 * that every requantization multiplier is representable (< 1).
 *
 * @return A Status; an error Status if any check fails.
 */
Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo         *input,
                                                        const ITensorInfo         *weights,
                                                        const ITensorInfo         *bias,
                                                        const ITensorInfo         *output,
                                                        const PadStrideInfo       &conv_info,
                                                        unsigned int               depth_multiplier,
                                                        const ActivationLayerInfo &act_info,
                                                        const Size2D              &dilation)
{
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
    // Per-channel weights legitimately differ in type from the QASYMM8 input;
    // every other combination must match exactly.
    if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
    }
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);

    // Validate convolver
    ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation));

    // Validate activation: only none / ReLU / ReLU6 can be fused.
    const bool is_relu  = arm_compute::utils::info_helpers::is_relu(act_info);
    const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !(is_relu || is_relu6));

    // Check bias: must be 1D with one value per output channel.
    if(bias != nullptr)
    {
        unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
        ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx));
    }

    // Check output: if already initialized it must match the computed shape/type.
    if(output->total_size() != 0)
    {
        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
    }

    // The uniform quantization case will only have 1 scale value in the weights quantization info.
    // Each combined multiplier must be < 1 because the kernels use
    // calculate_quantized_multiplier_less_than_one for requantization.
    // (For float types the scale list is empty, so this loop is a no-op.)
    const UniformQuantizationInfo input_qinfo   = input->quantization_info().uniform();
    const QuantizationInfo        weights_qinfo = weights->quantization_info();
    const UniformQuantizationInfo output_qinfo  = output->quantization_info().uniform();
    for(auto const s : weights_qinfo.scale())
    {
        const float fmultipler = input_qinfo.scale * s / output_qinfo.scale;
        ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f);
    }

    return Status{};
}
455 
/** Check whether this (input, weights, conv_info, depth_multiplier, dilation)
 * combination can be handled by the optimized assembly routines.
 *
 * Requirements enforced below: float or QASYMM8 input; square 3x3 or 5x5
 * kernel; equal strides of 1 or 2; SAME or VALID padding; depth multiplier
 * of 1; and restricted dilation support.
 *
 * @return True if the assembly path supports the configuration.
 */
bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input,
                                                                    const ITensorInfo *weights,
                                                                    PadStrideInfo      conv_info,
                                                                    unsigned int       depth_multiplier,
                                                                    const Size2D      &dilation)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);

    // Reshape input shape if in NHWC format: permute to NCHW dimension order
    // so calculate_same_pad below can be called with DataLayout::NCHW.
    const DataLayout data_layout = input->data_layout();
    TensorShape      in_shape{ input->tensor_shape() };
    if(data_layout == DataLayout::NHWC)
    {
        in_shape.set(Window::DimX, input->tensor_shape().y());
        in_shape.set(Window::DimY, input->tensor_shape().z());
        in_shape.set(Window::DimZ, input->tensor_shape().x());
    }

    // Check data type
    // TODO (COMPMID-3004): Add assembly optimized routine for QASYMM8_SIGNED NEDepthwiseConvolutionLayer
    const DataType input_type            = input->data_type();
    const bool     is_input_type_valid   = is_data_type_float(input_type) || input_type == DataType::QASYMM8;
    const DataType weights_type          = weights->data_type();
    const bool     is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED
                                           || weights_type == DataType::QSYMM8_PER_CHANNEL;

    // Check weighs size: kernel must be square and 3x3 or 5x5.
    std::set<unsigned int> supported_kernel_sizes = { 3, 5 };
    const unsigned int     width_idx              = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const unsigned int     height_idx             = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
    const unsigned int     kernel_w               = weights->dimension(width_idx);
    const unsigned int     kernel_h               = weights->dimension(height_idx);
    bool                   weights_supported      = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0);

    // Check for supported strides: x and y strides must match and be 1 or 2.
    const auto &strides           = conv_info.stride();
    bool        supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));

    // Check for supported padding: only SAME (matches calculate_same_pad) or VALID (all zeros).
    const auto    pad_top           = conv_info.pad_top();
    const auto    pad_right         = conv_info.pad_right();
    const auto    pad_bottom        = conv_info.pad_bottom();
    const auto    pad_left          = conv_info.pad_left();
    PadStrideInfo same_pad          = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), conv_info, DataLayout::NCHW, dilation);
    bool          is_same_padding   = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
    bool          is_valid_padding  = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
    bool          supported_padding = is_same_padding || is_valid_padding;
    // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported
    bool is_dilation_supported = ((dilation == Size2D(1U, 1U)) || ((dilation.x() == dilation.y()) && strides.first == 1));

    // Per-channel quantized weights additionally require no dilation at all.
    if(weights_type == DataType::QSYMM8_PER_CHANNEL)
    {
        is_dilation_supported = is_dilation_supported && (dilation == Size2D(1U, 1U));
    }

    return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_supported;
}
513 
run()514 void NEDepthwiseConvolutionAssemblyDispatch::run()
515 {
516     // Prepare assembly kernel
517     prepare();
518 
519     MemoryGroupResourceScope scope_mg(_memory_group);
520 
521     // Setup inputs/outputs
522     ARM_COMPUTE_ERROR_ON(_workspace.buffer() == nullptr);
523     _pImpl->_dwc_assembly_kernel->set_working_space(static_cast<void *>(_workspace.buffer()));
524 
525     ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
526     const int   input_element_size = _input->info()->element_size();
527     const int   input_batch_stride = _input->info()->strides_in_bytes()[3] / input_element_size;
528     const int   input_row_stride   = _input->info()->strides_in_bytes().z() / input_element_size;
529     const int   input_col_stride   = _input->info()->strides_in_bytes().y() / input_element_size;
530     const void *input_ptr          = _input->buffer() + _input->info()->offset_first_element_in_bytes();
531     _pImpl->_dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
532 
533     ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
534     const int output_element_size = _output->info()->element_size();
535     const int output_batch_stride = _output->info()->strides_in_bytes()[3] / output_element_size;
536     const int output_row_stride   = _output->info()->strides_in_bytes().z() / output_element_size;
537     const int output_col_stride   = _output->info()->strides_in_bytes().y() / output_element_size;
538     void     *output_ptr          = _output->buffer() + _output->info()->offset_first_element_in_bytes();
539     _pImpl->_dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
540 
541     // Schedule assembly kernel
542     NEScheduler::get().schedule(&_pImpl->_dwc_acl_kernel, Window::DimX);
543 }
544 
prepare()545 void NEDepthwiseConvolutionAssemblyDispatch::prepare()
546 {
547     if(!_is_prepared)
548     {
549         _packed_weights.allocator()->allocate();
550         ARM_COMPUTE_ERROR_ON(_packed_weights.buffer() == nullptr);
551 
552         // Pack weights and bias
553         const int weights_element_size = _weights->info()->element_size();
554         const int weights_row_stride   = _weights->info()->strides_in_bytes().z() / weights_element_size;
555         const int weights_col_stride   = _weights->info()->strides_in_bytes().y() / weights_element_size;
556         _pImpl->_dwc_assembly_kernel->pack_params(_packed_weights.buffer(),
557                                                   _weights->buffer() + _weights->info()->offset_first_element_in_bytes(),
558                                                   weights_row_stride,
559                                                   weights_col_stride,
560                                                   (_bias != nullptr) ? _bias->buffer() : nullptr);
561         _pImpl->_dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer());
562 
563         _weights->mark_as_unused();
564         if(_bias != nullptr)
565         {
566             _bias->mark_as_unused();
567         }
568         _is_prepared = true;
569     }
570 }
571 } // namespace arm_compute
572