/*
 * Copyright (c) 2018-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once
#include "depthwise.hpp"
#include "qasymm8.hpp"
#include "qsymm8.hpp"
#include <arm_neon.h> // NEON vector types and intrinsics used by the helpers below

using namespace neon_convolution_kernels;
using namespace qasymm8;

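// The helpers below are the fixed-point requantisation primitives used by the
// quantised depthwise kernels: a saturating doubling high multiply (SQRDMULH)
// and a rounding, sign-correct divide by a power of two. Together they apply a
// quantised multiplier/shift pair to an int32 accumulator.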
inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32x4_t& b)
{
  return vqrdmulhq_s32(a, b);
}

inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
{
  return vqrdmulhq_n_s32(a, b);
}

inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
{
  return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
}

inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift)
{
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
  const int32x4_t fixed = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed, shift);
}

inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
{
  const int32x4_t shift = vdupq_n_s32(-exponent);
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
  const int32x4_t fixed = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed, shift);
}

inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
{
  const int32x2_t shift = vdup_n_s32(-exponent);
  const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
  const int32x2_t fixed = vqadd_s32(x, fixup);
  return vrshl_s32(fixed, shift);
}

inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
{
  const int32x2_t xs = vdup_n_s32(x);
  return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
}
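
// Illustrative sketch only (not part of the original interface): a typical
// requantisation step combines the two helpers above, multiplying an int32
// accumulator by a fixed-point multiplier and then dividing by a power of two
// with round-to-nearest. The name below is hypothetical.
//
//   inline int32_t requantise(int32_t acc, int32_t multiplier, int32_t shift)
//   {
//     return rounding_divide_by_exp2(
//         saturating_doubling_high_mul(acc, multiplier), shift);
//   }
//
// The int32x4_t overloads perform the same computation on four channels at a
// time for the vectorised kernels.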

namespace depthwise
{

namespace nck = neon_convolution_kernels;

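// Depthwise convolution on asymmetric 8-bit (QAsymm8) inputs, weights and
// outputs, accumulating in int32 and requantising to uint8 on output. The
// template parameters select the output tile size, kernel size and stride of
// the specialised kernel.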
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols,
  StrideRows, StrideCols,
  uint8_t, int32_t, uint8_t,
  QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
>
{
  using Base = DepthwiseConvolutionBase<
    OutputTileRows, OutputTileCols,
    KernelRows, KernelCols,
    StrideRows, StrideCols,
    uint8_t, int32_t, uint8_t,
    QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
  >;
  friend Base;
  using InputType = typename Base::InputType;
  using OutputType = typename Base::OutputType;

  public:
  QAsymm8DepthwiseConvolution(
    int n_batches, int n_input_rows, int n_input_cols, int n_channels,
    nck::ActivationFunction activation,
    const qasymm8::QAsymm8Params& weight_quantisation,
    const qasymm8::QAsymm8Params& input_quantisation,
    const qasymm8::QAsymm8Params& output_quantisation,
    unsigned int padding_top,
    unsigned int padding_left,
    unsigned int padding_bottom,
    unsigned int padding_right
  );

  QAsymm8DepthwiseConvolution(
    int n_batches, int n_input_rows, int n_input_cols, int n_channels,
    int n_output_rows, int n_output_cols,
    nck::ActivationFunction activation,
    const qasymm8::QAsymm8Params& weight_quantisation,
    const qasymm8::QAsymm8Params& input_quantisation,
    const qasymm8::QAsymm8Params& output_quantisation,
    unsigned int padding_top,
    unsigned int padding_left,
    unsigned int padding_bottom,
    unsigned int padding_right
  );

  QAsymm8DepthwiseConvolution(
    int n_batches, int n_input_rows, int n_input_cols, int n_channels,
    nck::ActivationFunction activation,
    const qasymm8::QAsymm8Params& weight_quantisation,
    const qasymm8::QAsymm8Params& input_quantisation,
    const qasymm8::QAsymm8Params& output_quantisation,
    const qasymm8::QAsymm8RescaleParams& rescale_parameters,
    unsigned int padding_top,
    unsigned int padding_left,
    unsigned int padding_bottom,
    unsigned int padding_right
  );

  QAsymm8DepthwiseConvolution(
    int n_batches, int n_input_rows, int n_input_cols, int n_channels,
    int n_output_rows, int n_output_cols,
    nck::ActivationFunction activation,
    const qasymm8::QAsymm8Params& weight_quantisation,
    const qasymm8::QAsymm8Params& input_quantisation,
    const qasymm8::QAsymm8Params& output_quantisation,
    const qasymm8::QAsymm8RescaleParams& rescale_parameters,
    unsigned int padding_top,
    unsigned int padding_left,
    unsigned int padding_bottom,
    unsigned int padding_right
  );

  protected:
  uint8_t _input_padding_value(void) const;

  void _pack_params(
    void *buffer,
    const void *weights,
    unsigned int weight_row_stride,
    unsigned int weight_col_stride,
    const void *biases=nullptr
  ) const;

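  // Two tile-execution entry points are declared below: one walks the input
  // and output tensors from base pointers using row/column strides, the other
  // receives precomputed pointer arrays for every element of the inner input
  // tile and output tile.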
  template <nck::ActivationFunction Activation>
  void execute_tile(
    int n_channels,
    const void* packed_params,
    const uint8_t* inptr,
    unsigned int in_row_stride,
    unsigned int in_col_stride,
    uint8_t* outptr,
    unsigned int out_row_stride,
    unsigned int out_col_stride
  );

  template <nck::ActivationFunction Activation>
  void execute_tile(
    int n_channels,
    const void* packed_params,
    const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
    uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
  );

  private:
  // Quantization parameters
  const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant;
  const qasymm8::QAsymm8RescaleParams rescale_parameters;
};

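// Illustrative sketch only: a possible instantiation for a 3x3, stride-1
// kernel producing 2x2 output tiles. The tensor shape, activation value and
// quantisation objects are assumptions, not values taken from this header.
//
//   using QConv3x3 = QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>;
//   QConv3x3 conv(
//     1, 224, 224, 32,                // batches, input rows, cols, channels
//     nck::ActivationFunction::ReLU,  // assumed enumerator
//     weight_qparams, input_qparams, output_qparams,
//     1, 1, 1, 1                      // padding: top, left, bottom, right
//   );

// Hybrid quantised depthwise convolution: weights are symmetric 8-bit with
// per-channel scales (QSymm8PerChannelParams), while inputs and outputs use
// asymmetric 8-bit (QAsymm8Params) quantisation.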
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
class QSymm8HybridPerChannelDepthwiseConvolution : public DepthwiseConvolutionBase<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols,
  StrideRows, StrideCols,
  uint8_t, int32_t, uint8_t,
  QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
>
{
  using Base = DepthwiseConvolutionBase<
    OutputTileRows, OutputTileCols,
    KernelRows, KernelCols,
    StrideRows, StrideCols,
    uint8_t, int32_t, uint8_t,
    QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
  >;
  friend Base;
  using InputType = typename Base::InputType;
  using OutputType = typename Base::OutputType;

  public:
  QSymm8HybridPerChannelDepthwiseConvolution(
    int n_batches, int n_input_rows, int n_input_cols, int n_channels,
    nck::ActivationFunction activation,
    const qsymm8::QSymm8PerChannelParams& weight_quantisation,
    const qasymm8::QAsymm8Params& input_quantisation,
    const qasymm8::QAsymm8Params& output_quantisation,
    unsigned int padding_top,
    unsigned int padding_left,
    unsigned int padding_bottom,
    unsigned int padding_right
  );

  QSymm8HybridPerChannelDepthwiseConvolution(
    int n_batches, int n_input_rows, int n_input_cols, int n_channels,
    nck::ActivationFunction activation,
    const qsymm8::QSymm8PerChannelParams& weight_quantisation,
    const qasymm8::QAsymm8Params& input_quantisation,
    const qasymm8::QAsymm8Params& output_quantisation,
    const qsymm8::QSymm8PerChannelRescaleParams& rescale_parameters,
    unsigned int padding_top,
    unsigned int padding_left,
    unsigned int padding_bottom,
    unsigned int padding_right
  );

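  // Per channel, the packed parameter buffer holds the KernelRows x KernelCols
  // int8 weights plus three int32 values (presumably the bias and the
  // per-channel requantisation multiplier and shift), which is what the size
  // computation below accounts for.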
  size_t get_packed_params_size(void) const override
  {
    return this->n_channels() * (sizeof(int8_t)*KernelRows*KernelCols + 3*sizeof(int32_t));
  }

  protected:
  uint8_t _input_padding_value(void) const;

  void _pack_params(
    void *buffer,
    const void *weights,
    unsigned int weight_row_stride,
    unsigned int weight_col_stride,
    const void *biases=nullptr
  ) const;

  template <nck::ActivationFunction Activation>
  void execute_tile(
    int n_channels,
    const void* packed_params,
    const uint8_t* inptr,
    unsigned int in_row_stride,
    unsigned int in_col_stride,
    uint8_t* outptr,
    unsigned int out_row_stride,
    unsigned int out_col_stride
  );

  template <nck::ActivationFunction Activation>
  void execute_tile(
    int n_channels,
    const void* packed_params,
    const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
    uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
  );

  private:
  // Quantization parameters
  const qsymm8::QSymm8PerChannelParams _weights_quant;
  const qasymm8::QAsymm8Params _input_quant, _output_quant;
  const qsymm8::QSymm8PerChannelRescaleParams _rescale_parameters;
};

} // namespace depthwise