/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <xtensa/tie/xt_hifi2.h>

#include <limits>  // std::numeric_limits, used for the activation clamp below.

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/xtensa-hifimini/fixedpoint_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa-hifimini/utils.h"

namespace tflite {
namespace ops {
namespace micro {
namespace depthwise_conv {
namespace xtensa {
namespace hifimini {

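// Per-channel quantized depthwise convolution specialized for HiFi Mini:
// 8-bit input and filter values are widened into 24-bit P registers,
// multiply-accumulated into a 56-bit Q accumulator, and the result is
// requantized with the per-channel multiplier/shift, offset, clamped to the
// activation range, and stored back as int8.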
inline void DepthwiseConvPerChannel(
    const DepthwiseParams& params, const int32* output_multiplier,
    const int32* output_shift, const RuntimeShape& input_shape,
    const int8* input_data, const RuntimeShape& filter_shape,
    const int8* filter_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    int8* output_data) {
  // Get parameters.
  // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const int32 input_offset = params.input_offset;
  const int32 output_offset = params.output_offset;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;

  const int batches = input_shape.Dims(0);

  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);

  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int filter_depth = filter_shape.Dims(3);

  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);

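  // Convert the scalar offsets and activation bounds into the 24x2 P and
  // 56-bit Q register formats once, up front, so the inner loops below only
  // operate on pre-converted register values.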
  ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
  ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
  ae_q56s output_activation_min_56 = AE_CVTQ48A32S(output_activation_min);
  ae_q56s output_activation_max_56 = AE_CVTQ48A32S(output_activation_max);

  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      for (int out_x = 0; out_x < output_width; ++out_x) {
        const int in_x_origin = (out_x * stride_width) - pad_width;
        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
          for (int m = 0; m < depth_multiplier; ++m) {
            const int output_channel = m + in_channel * depth_multiplier;
            ae_q56s acc_56 = AE_ZEROQ56();
            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
              const int in_y = in_y_origin + dilation_height_factor * filter_y;
              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                const int in_x = in_x_origin + dilation_width_factor * filter_x;
                // Zero padding by omitting the areas outside the image.
                const bool is_point_inside_image =
                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height);

                if (is_point_inside_image) {
                  // Find current input index, minus 2 for Xtensa load
                  // alignments:
                  // TODO(b/147322595): Consider doing these offset calculations
                  // with intrinsics:
                  int input_idx =
                      ((batch * input_height + in_y) * input_width + in_x) *
                          input_depth +
                      (in_channel);
                  int32 input_val = input_data[input_idx];

                  // Find current filter index, minus 2 for Xtensa load
                  // alignments:
                  int filter_idx =
                      ((filter_y)*filter_width + filter_x) * filter_depth +
                      (output_channel);
                  int32 filter_val = filter_data[filter_idx];

                  // Load 8bit value as int32 into a 24x24 register and right
                  // shift into 24bit space. Note: value is duplicated in the HH
                  // and LL register - but all calculations are done on the HH
                  // side.
                  ae_p24x2s input_val_24x2 = AE_CONVERT_INT32_24x2(input_val);

                  // Add input offset (24bit aligned):
                  input_val_24x2 =
                      AE_P24S_ADDS_P24X2S(input_val_24x2, input_offset_24x2);

                  // Load filter 8bit value into 24bit alignment:
                  ae_p24x2s filter_val_24x2 = AE_CONVERT_INT32_24x2(filter_val);

                  // Multiply and accumulate the HH side of each 24x24 PR
                  // register:
                  AE_MULAS56P24S_HH(acc_56, filter_val_24x2, input_val_24x2);
                }
              }
            }

            // Left shift from 48bit alignment to 32bit:
            acc_56 = AE_Q56S_SLAI(acc_56, 16);

            if (bias_data) {
              // Load and add bias at 32bit alignment:
              ae_q56s bias_56 = AE_CVTQ48A32S(bias_data[output_channel]);
              acc_56 = AE_ADDQ56(acc_56, bias_56);
            }

            // Shift from 32bit alignment to 24bit alignment and place back on
            // the PR register:
            acc_56 = AE_Q56S_SLAI(acc_56, 8);
            ae_p24x2s acc_24x2 = AE_TRUNCP24Q48(acc_56);

            // Apply quantized multiplier and accumulate result at 48bit
            // alignment:
            acc_56 = micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
                acc_24x2, output_multiplier[output_channel],
                output_shift[output_channel]);

            // Shift from 48bit aligned to 32bit:
            acc_56 = AE_Q56S_SLAI(acc_56, 16);

            // Add output offset, cap activation, and assign to the output:
            acc_56 = AE_ADDQ56(acc_56, output_offset_56);
            acc_56 = AE_MINQ56S(acc_56, output_activation_max_56);
            acc_56 = AE_MAXQ56S(acc_56, output_activation_min_56);

            int output_idx =
                ((batch * output_height + out_y) * output_width + out_x) *
                    output_depth +
                output_channel;
            output_data[output_idx] =
                static_cast<int8_t>(AE_TRUNCA32Q48(acc_56));
          }
        }
      }
    }
  }
}

}  // namespace hifimini
}  // namespace xtensa

namespace {

constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 256;

struct OpData {
  TfLitePaddingValues padding;
  // The scaling factor from input to output (aka the 'real multiplier') can
  // be represented as a fixed point multiplier plus a left shift.
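  // For example, under the usual TFLite Q31 convention (an assumption here,
  // not spelled out in this file), a real multiplier of 0.75 would be stored
  // as output_multiplier = 0x60000000 (0.75 * 2^31) with output_shift = 0.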
  int32_t output_multiplier;
  int output_shift;

  // Per channel output multiplier and shift.
  // TODO(b/141139247): Allocate these dynamically when possible.
  int32_t per_channel_output_multiplier[kMaxChannels];
  int32_t per_channel_output_shift[kMaxChannels];

  // The range of the fused activation layer. For example for kNone and
  // uint8_t these would be 0 and 255.
  int32_t output_activation_min;
  int32_t output_activation_max;
};

// These constants are specific to the music detect model; they exist until
// (b/132070898) is fixed.
static const int kMaxOpDataSize = 6;
static int kStaticOpDataCounter = 0;
static OpData kStaticOpData[kMaxOpDataSize];
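// Each call to Prepare() below claims the next kStaticOpData slot, so a model
// may contain at most kMaxOpDataSize depthwise conv nodes until the scratch
// allocation tracked in b/132070898 replaces this scheme.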

TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
                             TfLiteDepthwiseConvParams* params, int width,
                             int height, int filter_width, int filter_height,
                             const TfLiteType data_type, OpData* data) {
  bool has_bias = node->inputs->size == 3;
  // Check number of inputs/outputs.
  TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);

  int unused_output_height, unused_output_width;
  data->padding = ComputePaddingHeightWidth(
      params->stride_height, params->stride_width, 1, 1, height, width,
      filter_height, filter_width, params->padding, &unused_output_height,
      &unused_output_width);

  // Note that quantized inference requires that all tensors have their
  // parameters set. This is usually done during quantized training.
  if (data_type != kTfLiteFloat32) {
    const TfLiteTensor* input = GetInput(context, node, kInputTensor);
    const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
    const TfLiteTensor* bias =
        GetOptionalInputTensor(context, node, kBiasTensor);
    TfLiteTensor* output = GetOutput(context, node, kOutputTensor);

    // TODO(b/148610881): Consider calculating quantized params at int24
    // precision:
    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
        context, input, filter, bias, output, params->activation,
        &data->output_multiplier, &data->output_shift,
        &data->output_activation_min, &data->output_activation_max,
        data->per_channel_output_multiplier,
        reinterpret_cast<int*>(data->per_channel_output_shift)));
  }
  return kTfLiteOk;
}

}  // namespace

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  return nullptr;
}

void Free(TfLiteContext* context, void* buffer) {}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* params =
      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);

  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
  const TfLiteTensor* bias =
      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;

  // TODO(b/132070898): Use statically slotted OpData structures until a
  // scratch memory API is ready.
  OpData* op_data = &kStaticOpData[kStaticOpDataCounter++];
  node->user_data = op_data;

  const TfLiteType data_type = input->type;
  int width = SizeOfDimension(input, 2);
  int height = SizeOfDimension(input, 1);
  int filter_width = SizeOfDimension(filter, 2);
  int filter_height = SizeOfDimension(filter, 1);

  // All per-channel quantized tensors need valid zero point and scale arrays.
  if (input->type == kTfLiteInt8) {
    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
                      kTfLiteAffineQuantization);

    const auto* affine_quantization =
        reinterpret_cast<TfLiteAffineQuantization*>(
            filter->quantization.params);
    TF_LITE_ENSURE(context, affine_quantization);
    TF_LITE_ENSURE(context, affine_quantization->scale);
    TF_LITE_ENSURE(context, affine_quantization->zero_point);
    // Depthwise conv is quantized along dimension 3:
    // https://www.tensorflow.org/lite/performance/quantization_spec
    TF_LITE_ENSURE_EQ(context, filter->dims->data[3],
                      affine_quantization->scale->size);
    TF_LITE_ENSURE_EQ(context, filter->dims->data[3],
                      affine_quantization->zero_point->size);
  }

  TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
                                        filter_width, filter_height, data_type,
                                        op_data));
  return kTfLiteOk;
}

void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
                             TfLiteDepthwiseConvParams* params, OpData* data,
                             const TfLiteTensor* input,
                             const TfLiteTensor* filter,
                             const TfLiteTensor* bias, TfLiteTensor* output) {
  DepthwiseParams op_params;
  op_params.padding_type = PaddingType::kSame;
  op_params.padding_values.width = data->padding.width;
  op_params.padding_values.height = data->padding.height;
  op_params.stride_width = params->stride_width;
  op_params.stride_height = params->stride_height;
  op_params.dilation_width_factor = params->dilation_width_factor;
  op_params.dilation_height_factor = params->dilation_height_factor;
  op_params.depth_multiplier = params->depth_multiplier;
  op_params.input_offset = -input->params.zero_point;
  op_params.weights_offset = 0;
  op_params.output_offset = output->params.zero_point;
  // TODO(b/130439627): Use calculated value for clamping.
  op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
  op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();

  xtensa::hifimini::DepthwiseConvPerChannel(
      op_params, data->per_channel_output_multiplier,
      data->per_channel_output_shift, GetTensorShape(input),
      GetTensorData<int8>(input), GetTensorShape(filter),
      GetTensorData<int8>(filter), GetTensorShape(bias),
      GetTensorData<int32>(bias), GetTensorShape(output),
      GetTensorData<int8>(output));
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params =
      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
  auto* op_data = reinterpret_cast<OpData*>(node->user_data);

  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
  const TfLiteTensor* bias =
      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;

  // TODO(b/147710241): Consider whether float conv and quantized conv should
  // be separate ops to avoid dispatch overhead here.
  switch (input->type) {  // Already know in/out types are same.
    case kTfLiteInt8:
      EvalQuantizedPerChannel(context, node, params, op_data, input, filter,
                              bias, output);
      break;
    default:
      context->ReportError(context, "Type %s (%d) not supported.",
                           TfLiteTypeGetName(input->type), input->type);
      return kTfLiteError;
  }
  return kTfLiteOk;
}

}  // namespace depthwise_conv

TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
  static TfLiteRegistration r = {depthwise_conv::Init, depthwise_conv::Free,
                                 depthwise_conv::Prepare, depthwise_conv::Eval};
  return &r;
}
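
// For reference only, a sketch of how this registration is typically wired
// into a resolver; MicroMutableOpResolver and its AddBuiltin() signature are
// assumptions here, not defined in this file:
//
//   static tflite::MicroMutableOpResolver resolver;
//   resolver.AddBuiltin(tflite::BuiltinOperator_DEPTHWISE_CONV_2D,
//                       tflite::ops::micro::Register_DEPTHWISE_CONV_2D());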

}  // namespace micro
}  // namespace ops
}  // namespace tflite