/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <xtensa/tie/xt_hifi2.h>

#include <limits>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/xtensa-hifimini/fixedpoint_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa-hifimini/utils.h"

namespace tflite {
namespace ops {
namespace micro {
namespace depthwise_conv {
namespace xtensa {
namespace hifimini {

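// Per-channel quantized depthwise convolution for int8 data. Input and filter
// values are widened into 24-bit lanes, multiply-accumulated into a 56-bit
// accumulator, then requantized with the per-channel multiplier/shift before
// being clamped and written back as int8.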
inline void DepthwiseConvPerChannel(
    const DepthwiseParams& params, const int32* output_multiplier,
    const int32* output_shift, const RuntimeShape& input_shape,
    const int8* input_data, const RuntimeShape& filter_shape,
    const int8* filter_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    int8* output_data) {
  // Get parameters.
  // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const int32 input_offset = params.input_offset;
  const int32 output_offset = params.output_offset;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;

  const int batches = input_shape.Dims(0);

  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);

  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int filter_depth = filter_shape.Dims(3);

  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);

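  // Convert the offsets and activation bounds into the AE register formats
  // once here, so the conversions stay out of the convolution loops below.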
  ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
  ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
  ae_q56s output_activation_min_56 = AE_CVTQ48A32S(output_activation_min);
  ae_q56s output_activation_max_56 = AE_CVTQ48A32S(output_activation_max);

  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      for (int out_x = 0; out_x < output_width; ++out_x) {
        const int in_x_origin = (out_x * stride_width) - pad_width;
        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
          for (int m = 0; m < depth_multiplier; ++m) {
            const int output_channel = m + in_channel * depth_multiplier;
            ae_q56s acc_56 = AE_ZEROQ56();
            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
              const int in_y = in_y_origin + dilation_height_factor * filter_y;
              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                const int in_x = in_x_origin + dilation_width_factor * filter_x;
                // Zero padding by omitting the areas outside the image.
                const bool is_point_inside_image =
                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height);

                if (is_point_inside_image) {
                  // Find current input index, minus 2 for Xtensa load
                  // alignments:
                  // TODO(b/147322595): Consider doing these offset calculations
                  // with intrinsics:
                  int input_idx =
                      ((batch * input_height + in_y) * input_width + in_x) *
                          input_depth +
                      (in_channel);
                  int32 input_val = input_data[input_idx];

                  // Find current filter index, minus 2 for Xtensa load
                  // alignments:
                  int filter_idx =
                      ((filter_y)*filter_width + filter_x) * filter_depth +
                      (output_channel);
                  int32 filter_val = filter_data[filter_idx];

                  // Load 8bit value as int32 into a 24x24 register and right
                  // shift into 24bit space. Note: value is duplicated in the HH
                  // and LL register - but all calculations are done on the HH
                  // side.
                  ae_p24x2s input_val_24x2 = AE_CONVERT_INT32_24x2(input_val);

                  // Add input offset (24bit aligned):
                  input_val_24x2 =
                      AE_P24S_ADDS_P24X2S(input_val_24x2, input_offset_24x2);

                  // Load filter 8bit value into 24bit alignment:
                  ae_p24x2s filter_val_24x2 = AE_CONVERT_INT32_24x2(filter_val);

                  // Multiply and accumulate the HH side of each 24x24 PR
                  // register:
                  AE_MULAS56P24S_HH(acc_56, filter_val_24x2, input_val_24x2);
                }
              }
            }

            // Left shift from 48bit alignment to 32bit:
            acc_56 = AE_Q56S_SLAI(acc_56, 16);

            if (bias_data) {
              // Load and add bias at 32bit alignment:
              ae_q56s bias_56 = AE_CVTQ48A32S(bias_data[output_channel]);
              acc_56 = AE_ADDQ56(acc_56, bias_56);
            }

            // Shift from 32bit alignment to 24bit alignment and place back on
            // the PR register:
            acc_56 = AE_Q56S_SLAI(acc_56, 8);
            ae_p24x2s acc_24x2 = AE_TRUNCP24Q48(acc_56);

            // Apply quantized multiplier and accumulate result at 48bit
            // alignment:
            acc_56 = micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
                acc_24x2, output_multiplier[output_channel],
                output_shift[output_channel]);

            // Shift from 48bit aligned to 32bit:
            acc_56 = AE_Q56S_SLAI(acc_56, 16);

            // Add output offset, cap activation, and assign to the output:
            acc_56 = AE_ADDQ56(acc_56, output_offset_56);
            acc_56 = AE_MINQ56S(acc_56, output_activation_max_56);
            acc_56 = AE_MAXQ56S(acc_56, output_activation_min_56);

            int output_idx =
                ((batch * output_height + out_y) * output_width + out_x) *
                    output_depth +
                output_channel;
            output_data[output_idx] =
                static_cast<int8_t>(AE_TRUNCA32Q48(acc_56));
          }
        }
      }
    }
  }
}

}  // namespace hifimini
}  // namespace xtensa

namespace {

constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
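// Upper bound on the number of output channels supported by the statically
// sized per-channel quantization arrays in OpData below.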
constexpr int kMaxChannels = 256;

struct OpData {
  TfLitePaddingValues padding;
  // The scaling factor from input to output (aka the 'real multiplier') can
  // be represented as a fixed point multiplier plus a left shift.
  int32_t output_multiplier;
  int output_shift;

  // Per channel output multiplier and shift.
  // TODO(b/141139247): Allocate these dynamically when possible.
  int32_t per_channel_output_multiplier[kMaxChannels];
  int32_t per_channel_output_shift[kMaxChannels];

  // The range of the fused activation layer. For example for kNone and
  // uint8_t these would be 0 and 255.
  int32_t output_activation_min;
  int32_t output_activation_max;
};

// These constants represent constants specific to the music detect model.
// They exist until (b/132070898) is fixed.
static const int kMaxOpDataSize = 6;
static int kStaticOpDataCounter = 0;
static OpData kStaticOpData[kMaxOpDataSize];

TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
                             TfLiteDepthwiseConvParams* params, int width,
                             int height, int filter_width, int filter_height,
                             const TfLiteType data_type, OpData* data) {
  bool has_bias = node->inputs->size == 3;
  // Check number of inputs/outputs
  TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);

  int unused_output_height, unused_output_width;
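  // Note: padding is computed with dilation factors fixed at 1; the kernel
  // applies the dilation factors from the op params itself.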
  data->padding = ComputePaddingHeightWidth(
      params->stride_height, params->stride_width, 1, 1, height, width,
      filter_height, filter_width, params->padding, &unused_output_height,
      &unused_output_width);

  // Note that quantized inference requires that all tensors have their
  // parameters set. This is usually done during quantized training.
  if (data_type != kTfLiteFloat32) {
    const TfLiteTensor* input = GetInput(context, node, kInputTensor);
    const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
    const TfLiteTensor* bias =
        GetOptionalInputTensor(context, node, kBiasTensor);
    TfLiteTensor* output = GetOutput(context, node, kOutputTensor);

    // TODO(b/148610881): Consider calculating quantized params at int24
    // calculations:
    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
        context, input, filter, bias, output, params->activation,
        &data->output_multiplier, &data->output_shift,
        &data->output_activation_min, &data->output_activation_max,
        data->per_channel_output_multiplier,
        reinterpret_cast<int*>(data->per_channel_output_shift)));
  }
  return kTfLiteOk;
}

}  // namespace

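// No per-op buffers are allocated in Init/Free; Prepare() instead hands out a
// slot from the statically allocated kStaticOpData array above.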
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  return nullptr;
}

void Free(TfLiteContext* context, void* buffer) {}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* params =
      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);

  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
  const TfLiteTensor* bias =
      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;

  // TODO(b/132070898): Use statically slotted OpData structures until a
  // scratch memory API is ready.
  OpData* op_data = &kStaticOpData[kStaticOpDataCounter++];
  node->user_data = op_data;

  const TfLiteType data_type = input->type;
  int width = SizeOfDimension(input, 2);
  int height = SizeOfDimension(input, 1);
  int filter_width = SizeOfDimension(filter, 2);
  int filter_height = SizeOfDimension(filter, 1);

  // All per-channel quantized tensors need valid zero point and scale arrays.
  if (input->type == kTfLiteInt8) {
    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
                      kTfLiteAffineQuantization);

    const auto* affine_quantization =
        reinterpret_cast<TfLiteAffineQuantization*>(
            filter->quantization.params);
    TF_LITE_ENSURE(context, affine_quantization);
    TF_LITE_ENSURE(context, affine_quantization->scale);
    TF_LITE_ENSURE(context, affine_quantization->zero_point);
    // Depthwise conv is quantized along dimension 3:
    // https://www.tensorflow.org/lite/performance/quantization_spec
    TF_LITE_ENSURE_EQ(context, filter->dims->data[3],
                      affine_quantization->scale->size);
    TF_LITE_ENSURE_EQ(context, filter->dims->data[3],
                      affine_quantization->zero_point->size);
  }

  TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
                                        filter_width, filter_height, data_type,
                                        op_data));
  return kTfLiteOk;
}

void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
                             TfLiteDepthwiseConvParams* params, OpData* data,
                             const TfLiteTensor* input,
                             const TfLiteTensor* filter,
                             const TfLiteTensor* bias, TfLiteTensor* output) {
  DepthwiseParams op_params;
  op_params.padding_type = PaddingType::kSame;
  op_params.padding_values.width = data->padding.width;
  op_params.padding_values.height = data->padding.height;
  op_params.stride_width = params->stride_width;
  op_params.stride_height = params->stride_height;
  op_params.dilation_width_factor = params->dilation_width_factor;
  op_params.dilation_height_factor = params->dilation_height_factor;
  op_params.depth_multiplier = params->depth_multiplier;
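  // The kernel adds input_offset to every input value, so the input zero
  // point is negated here; the output zero point is added back after
  // requantization.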
  op_params.input_offset = -input->params.zero_point;
  op_params.weights_offset = 0;
  op_params.output_offset = output->params.zero_point;
  // TODO(b/130439627): Use calculated value for clamping.
  op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
  op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();

  xtensa::hifimini::DepthwiseConvPerChannel(
      op_params, data->per_channel_output_multiplier,
      data->per_channel_output_shift, GetTensorShape(input),
      GetTensorData<int8>(input), GetTensorShape(filter),
      GetTensorData<int8>(filter), GetTensorShape(bias),
      GetTensorData<int32>(bias), GetTensorShape(output),
      GetTensorData<int8>(output));
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params =
      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
  auto* op_data = reinterpret_cast<OpData*>(node->user_data);

  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
  const TfLiteTensor* bias =
      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;

  // TODO(b/147710241): Consider whether float conv and quantized conv should be
  // separate ops to avoid dispatch overhead here.
  switch (input->type) {  // Already know in/out types are same.
    case kTfLiteInt8:
      EvalQuantizedPerChannel(context, node, params, op_data, input, filter,
                              bias, output);
      break;
    default:
      context->ReportError(context, "Type %s (%d) not supported.",
                           TfLiteTypeGetName(input->type), input->type);
      return kTfLiteError;
  }
  return kTfLiteOk;
}

}  // namespace depthwise_conv

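// Registration for the op; the TFLite Micro runtime dispatches through these
// Init/Free/Prepare/Eval function pointers.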
TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
  static TfLiteRegistration r = {depthwise_conv::Init, depthwise_conv::Free,
                                 depthwise_conv::Prepare, depthwise_conv::Eval};
  return &r;
}

}  // namespace micro
}  // namespace ops
}  // namespace tflite