• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2018-2019 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #pragma once
26 #include "depthwise.hpp"
27 #include "qasymm8.hpp"
28 #include "qsymm8.hpp"
29 #pragma once
30 
31 using namespace neon_convolution_kernels;
32 using namespace qasymm8;
33 
// Lane-wise saturating, rounding "doubling high" multiply of two int32x4
// vectors: per lane, sat((2 * lhs * rhs + 2^30) >> 31). This is the core
// fixed-point rescale primitive used by the quantised kernels below.
inline int32x4_t saturating_doubling_high_mul(const int32x4_t& lhs, const int32x4_t& rhs)
{
  const int32x4_t product = vqrdmulhq_s32(lhs, rhs);
  return product;
}
38 
// Overload taking a scalar multiplier: every lane of `lhs` is multiplied by
// the same scalar `rhs` using the saturating, rounding doubling-high-multiply.
inline int32x4_t saturating_doubling_high_mul(const int32x4_t& lhs, const int32_t& rhs)
{
  const int32x4_t product = vqrdmulhq_n_s32(lhs, rhs);
  return product;
}
43 
// Scalar overload: widens `lhs` into a 2-lane vector, performs the saturating
// rounding doubling-high-multiply, and extracts lane 0 as the result.
inline int32_t saturating_doubling_high_mul(const int32_t& lhs, const int32_t& rhs)
{
  const int32x2_t lhs_vec = vdup_n_s32(lhs);
  const int32x2_t product = vqrdmulh_n_s32(lhs_vec, rhs);
  return vget_lane_s32(product, 0);
}
48 
// Lane-wise rounding division of `x` by a power of two. `shift` holds the
// NEGATED exponent in each lane (vrshlq_s32 shifts right for negative
// amounts, with round-half-up). The fixup subtracts 1 from negative inputs
// first so that halfway values round away from zero instead of towards
// +infinity; saturating add guards INT32_MIN. NOTE(review): this mirrors
// gemmlowp's RoundingDivideByPOT — keep the two in sync.
inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift)
{
  const int32x4_t correction = vshrq_n_s32(vandq_s32(x, shift), 31);
  return vrshlq_s32(vqaddq_s32(x, correction), shift);
}
55 
// Lane-wise rounding division of `x` by 2^exponent, ties rounding away from
// zero. Broadcasts the negated exponent into a shift vector, then applies the
// same negative-input fixup + rounding right shift as the vector-shift
// overload above.
inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
{
  const int32x4_t neg_shift  = vdupq_n_s32(-exponent);
  const int32x4_t correction = vshrq_n_s32(vandq_s32(x, neg_shift), 31);
  return vrshlq_s32(vqaddq_s32(x, correction), neg_shift);
}
63 
// 2-lane variant of the rounding divide-by-power-of-two: identical fixup and
// rounding right shift as the 4-lane overload, using the 64-bit NEON forms.
inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
{
  const int32x2_t neg_shift  = vdup_n_s32(-exponent);
  const int32x2_t correction = vshr_n_s32(vand_s32(x, neg_shift), 31);
  return vrshl_s32(vqadd_s32(x, correction), neg_shift);
}
71 
// Scalar rounding divide-by-2^exponent: broadcast into a 2-lane vector,
// delegate to the int32x2_t overload, and read back lane 0.
inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
{
  const int32x2_t widened = vdup_n_s32(x);
  const int32x2_t divided = rounding_divide_by_exp2(widened, exponent);
  return vget_lane_s32(divided, 0);
}
77 
78 namespace depthwise
79 {
80 
81 namespace nck = neon_convolution_kernels;
82 
// Depthwise convolution with asymmetrically-quantised uint8 inputs, weights
// and outputs and int32 accumulators. CRTP-derives from
// DepthwiseConvolutionBase, passing itself as the derived type so the base
// can statically dispatch into the tile kernels declared below. All member
// functions except the declarations themselves are defined out of line.
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols,
  StrideRows, StrideCols,
  uint8_t, int32_t, uint8_t,
  QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
>
{
  // Shorthand for the CRTP base; befriended so the base can reach the
  // protected tile kernels through the derived type.
  using Base = DepthwiseConvolutionBase<
    OutputTileRows, OutputTileCols,
    KernelRows, KernelCols,
    StrideRows, StrideCols,
    uint8_t, int32_t, uint8_t,
    QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
  >;
  friend Base;
  using InputType = typename Base::InputType;
  using OutputType = typename Base::OutputType;

  public:
    // Construct from input geometry (output size derived by the base),
    // with rescale parameters computed from the quantisation params.
    QAsymm8DepthwiseConvolution(
      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
      nck::ActivationFunction activation,
      const qasymm8::QAsymm8Params& weight_quantisation,
      const qasymm8::QAsymm8Params& input_quantisation,
      const qasymm8::QAsymm8Params& output_quantisation,
      unsigned int padding_top,
      unsigned int padding_left,
      unsigned int padding_bottom,
      unsigned int padding_right
    );

    // As above, but with an explicitly-specified output size.
    QAsymm8DepthwiseConvolution(
      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
      int n_output_rows, int n_output_cols,
      nck::ActivationFunction activation,
      const qasymm8::QAsymm8Params& weight_quantisation,
      const qasymm8::QAsymm8Params& input_quantisation,
      const qasymm8::QAsymm8Params& output_quantisation,
      unsigned int padding_top,
      unsigned int padding_left,
      unsigned int padding_bottom,
      unsigned int padding_right
    );

    // Construct with caller-supplied requantisation (multiplier/shift)
    // parameters instead of deriving them from the scales.
    QAsymm8DepthwiseConvolution(
      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
      nck::ActivationFunction activation,
      const qasymm8::QAsymm8Params& weight_quantisation,
      const qasymm8::QAsymm8Params& input_quantisation,
      const qasymm8::QAsymm8Params& output_quantisation,
      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
      unsigned int padding_top,
      unsigned int padding_left,
      unsigned int padding_bottom,
      unsigned int padding_right
    );

    // Explicit output size AND explicit rescale parameters.
    QAsymm8DepthwiseConvolution(
      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
      int n_output_rows, int n_output_cols,
      nck::ActivationFunction activation,
      const qasymm8::QAsymm8Params& weight_quantisation,
      const qasymm8::QAsymm8Params& input_quantisation,
      const qasymm8::QAsymm8Params& output_quantisation,
      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
      unsigned int padding_top,
      unsigned int padding_left,
      unsigned int padding_bottom,
      unsigned int padding_right
    );

  protected:
    // Value used to pad the input tensor — presumably the input zero-point,
    // so padding contributes zero in the dequantised domain (TODO confirm
    // against the out-of-line definition).
    uint8_t _input_padding_value(void) const;

    // Repack weights (and optional biases) from the given strided layout
    // into the kernel's packed-parameter buffer.
    void _pack_params(
      void *buffer,
      const void *weights,
      unsigned int weight_row_stride,
      unsigned int weight_col_stride,
      const void *biases=nullptr
    ) const;

    // Compute one output tile from a strided input patch.
    template <nck::ActivationFunction Activation>
    void execute_tile(
      int n_channels,
      const void* packed_params,
      const uint8_t* inptr,
      unsigned int in_row_stride,
      unsigned int in_col_stride,
      uint8_t* outptr,
      unsigned int out_row_stride,
      unsigned int out_col_stride
    );

    // Compute one output tile from pre-resolved per-element pointer arrays
    // (used when rows/columns are not uniformly strided).
    template <nck::ActivationFunction Activation>
    void execute_tile(
      int n_channels,
      const void* packed_params,
      const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
      uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
    );

  private:
    // Quantization parameters
    const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant;
    // NOTE(review): name lacks the leading underscore used by the members
    // above; renaming would require touching the out-of-line definitions.
    const qasymm8::QAsymm8RescaleParams rescale_parameters;
};
196 
// Hybrid-quantised depthwise convolution: symmetric per-channel int8 weights
// with asymmetric uint8 activations (uint8 in/out, int32 accumulators).
// Mirrors QAsymm8DepthwiseConvolution but carries per-channel weight
// quantisation and per-channel rescale parameters.
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
class QSymm8HybridPerChannelDepthwiseConvolution : public DepthwiseConvolutionBase<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols,
  StrideRows, StrideCols,
  uint8_t, int32_t, uint8_t,
  QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
>
{
  // Shorthand for the CRTP base; befriended so the base can reach the
  // protected tile kernels through the derived type.
  using Base = DepthwiseConvolutionBase<
    OutputTileRows, OutputTileCols,
    KernelRows, KernelCols,
    StrideRows, StrideCols,
    uint8_t, int32_t, uint8_t,
    QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
  >;
  friend Base;
  using InputType = typename Base::InputType;
  using OutputType = typename Base::OutputType;

  public:
  // Construct with per-channel weight quantisation; rescale parameters are
  // derived from the quantisation params.
  QSymm8HybridPerChannelDepthwiseConvolution(
      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
      nck::ActivationFunction activation,
      const qsymm8::QSymm8PerChannelParams& weight_quantisation,
      const qasymm8::QAsymm8Params& input_quantisation,
      const qasymm8::QAsymm8Params& output_quantisation,
      unsigned int padding_top,
      unsigned int padding_left,
      unsigned int padding_bottom,
      unsigned int padding_right
    );

  // As above, but with caller-supplied per-channel rescale parameters.
  QSymm8HybridPerChannelDepthwiseConvolution(
      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
      nck::ActivationFunction activation,
      const qsymm8::QSymm8PerChannelParams& weight_quantisation,
      const qasymm8::QAsymm8Params& input_quantisation,
      const qasymm8::QAsymm8Params& output_quantisation,
      const qsymm8::QSymm8PerChannelRescaleParams& rescale_parameters,
      unsigned int padding_top,
      unsigned int padding_left,
      unsigned int padding_bottom,
      unsigned int padding_right
    );

  // Packed buffer size: per channel, the int8 kernel weights plus three
  // int32 values (presumably bias, multiplier and shift — TODO confirm
  // against _pack_params' definition).
  size_t get_packed_params_size(void) const override
  {
      return this->n_channels() * (sizeof(int8_t)*KernelRows*KernelCols + 3*sizeof(int32_t));

  }

  protected:
    // Value used to pad the input tensor — presumably the input zero-point
    // (TODO confirm against the out-of-line definition).
    uint8_t _input_padding_value(void) const;

    // Repack weights (and optional biases) from the given strided layout
    // into the kernel's packed-parameter buffer.
    void _pack_params(
      void *buffer,
      const void *weights,
      unsigned int weight_row_stride,
      unsigned int weight_col_stride,
      const void *biases=nullptr
    ) const;

    // Compute one output tile from a strided input patch.
    template <nck::ActivationFunction Activation>
    void execute_tile(
      int n_channels,
      const void* packed_params,
      const uint8_t* inptr,
      unsigned int in_row_stride,
      unsigned int in_col_stride,
      uint8_t* outptr,
      unsigned int out_row_stride,
      unsigned int out_col_stride
    );

    // Compute one output tile from pre-resolved per-element pointer arrays.
    template <nck::ActivationFunction Activation>
    void execute_tile(
      int n_channels,
      const void* packed_params,
      const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
      uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
    );

  private:
    // Quantization parameters
    const qsymm8::QSymm8PerChannelParams _weights_quant;
    const qasymm8::QAsymm8Params _input_quant, _output_quant;
    const qsymm8::QSymm8PerChannelRescaleParams _rescale_parameters;
};
290 
291 }  // namespace depthwise
292