/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/add.h"

#include <stddef.h>
#include <stdint.h>

#include <algorithm>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/add.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"

namespace tflite {
namespace ops {
namespace builtin {
namespace add {
// This file has three implementations of Add.
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  kNeonOptimized,
};
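// Added note for readers: each KernelType above is instantiated through a
// corresponding registration at the bottom of this file (Register_ADD_REF,
// Register_ADD_GENERIC_OPT, and Register_ADD_NEON_OPT); Register_ADD picks
// between the optimized variants depending on whether USE_NEON is defined.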

constexpr int kInputTensor1 = 0;
constexpr int kInputTensor2 = 1;
constexpr int kOutputTensor = 0;

struct OpData {
  // These fields are used in both the general 8-bit -> 8-bit quantized path,
  // and the special 16-bit -> 16-bit quantized path.
  int input1_shift;
  int input2_shift;
  int32 output_activation_min;
  int32 output_activation_max;

  // These fields are used only in the general 8-bit -> 8-bit quantized path.
  int32 input1_multiplier;
  int32 input2_multiplier;
  int32 output_multiplier;
  int output_shift;
  int left_shift;
  int32 input1_offset;
  int32 input2_offset;
  int32 output_offset;

  // This parameter indicates whether the scale parameter is a power of two.
  // It is used in the 16-bit -> 16-bit quantization path.
  bool pot_scale_int16;
};

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  auto* data = new OpData;
  return data;
}

void Free(TfLiteContext* context, void* buffer) {
  delete reinterpret_cast<OpData*>(buffer);
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);

  const TfLiteTensor* input1;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor1, &input1));
  const TfLiteTensor* input2;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor2, &input2));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context,
                    GetOutputSafe(context, node, kOutputTensor, &output));

  TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
  output->type = input2->type;

  const bool requires_broadcast = !HaveSameShapes(input1, input2);

  TfLiteIntArray* output_size = nullptr;
  if (requires_broadcast) {
    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
                                   context, input1, input2, &output_size));
  } else {
    output_size = TfLiteIntArrayCopy(input1->dims);
  }

  // 8-bit -> 8-bit general quantized path, with general rescalings,
  // as well as int16 -> int16 with general rescalings.

  // There are two implementations of the ADD operator for 16-bit
  // input/output, depending on whether the scale parameter is a power of
  // two. Currently only the implementation for the general case is used,
  // but the other implementation is still needed for older model versions.
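  // Illustrative note (not from the original source): a scale such as
  // 0.0009765625 (= 2^-10) is a power of two, so CheckedLog2 below would
  // report success with a rounded log2 of -10, whereas a scale such as
  // 0.001 is not, and would route the op to the general-rescaling path.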
  bool general_scale_int16 = false;

  bool input1_scale_is_pot = false;
  bool input2_scale_is_pot = false;
  bool output_scale_is_pot = false;

  int input1_scale_log2_rounded{0};
  int input2_scale_log2_rounded{0};
  int output_scale_log2_rounded{0};

  if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
      output->type == kTfLiteInt16) {
    // In the case of int16, quantization is symmetric and
    // the zero point should be zero.
    TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

    general_scale_int16 = !params || !params->pot_scale_int16;

    if (!general_scale_int16) {
      // Do the preparation for the case where the scale parameter is a
      // power of two.

      input1_scale_is_pot =
          CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);

      input2_scale_is_pot =
          CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);

      output_scale_is_pot =
          CheckedLog2(output->params.scale, &output_scale_log2_rounded);

      general_scale_int16 =
          !input1_scale_is_pot || !input2_scale_is_pot || !output_scale_is_pot;
    }
  }

  data->pot_scale_int16 = !general_scale_int16;

  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
      general_scale_int16) {
    // 8-bit -> 8-bit general quantized path, with general rescalings,
    // as well as 16-bit -> 16-bit with general rescalings.
    data->input1_offset = -input1->params.zero_point;
    data->input2_offset = -input2->params.zero_point;
    data->output_offset = output->params.zero_point;

    // The shift is set to 15 for 16-bit inputs and 20 for 8-bit inputs.
    // In the 16-bit case, 65535 << 15 is less than 1 << 31, so the addition
    // still fits in a 32-bit accumulator.
    data->left_shift = general_scale_int16 ? 15 : 20;
    const double twice_max_input_scale =
        2 * std::max(input1->params.scale, input2->params.scale);
    const double real_input1_multiplier =
        input1->params.scale / twice_max_input_scale;
    const double real_input2_multiplier =
        input2->params.scale / twice_max_input_scale;
    const double real_output_multiplier =
        twice_max_input_scale /
        ((1 << data->left_shift) * output->params.scale);
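    // Worked example (illustrative numbers, not from the original source):
    // with input1 scale 0.5, input2 scale 0.25, output scale 1.0, and the
    // 8-bit left_shift of 20, twice_max_input_scale = 1.0, so
    // real_input1_multiplier = 0.5, real_input2_multiplier = 0.25, and
    // real_output_multiplier = 1.0 / (2^20 * 1.0) ~= 9.5e-7. All three are
    // below 1.0, as required by QuantizeMultiplierSmallerThanOneExp below.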

    QuantizeMultiplierSmallerThanOneExp(
        real_input1_multiplier, &data->input1_multiplier, &data->input1_shift);

    QuantizeMultiplierSmallerThanOneExp(
        real_input2_multiplier, &data->input2_multiplier, &data->input2_shift);

    QuantizeMultiplierSmallerThanOneExp(
        real_output_multiplier, &data->output_multiplier, &data->output_shift);
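    // Sketch of how these parameters are consumed at Eval time (a summary of
    // the reference quantized Add kernel, not code from this file): each
    // element is processed roughly as
    //   shifted1 = (input1_val + input1_offset) << left_shift
    //   scaled1  = MultiplyByQuantizedMultiplierSmallerThanOneExp(
    //                  shifted1, input1_multiplier, input1_shift)
    //   (likewise for input2), then
    //   raw_sum  = scaled1 + scaled2
    //   out      = MultiplyByQuantizedMultiplierSmallerThanOneExp(
    //                  raw_sum, output_multiplier, output_shift) + output_offset
    //   clamped to [output_activation_min, output_activation_max].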

    TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
        context, params->activation, output, &data->output_activation_min,
        &data->output_activation_max));
  } else if (output->type == kTfLiteInt16) {
    // 16-bit -> 16-bit special quantized path, supporting only a rather
    // narrow case of quantization parameters: zero_points must all be 0
    // ("symmetric quantization") and scales must be power-of-two (which
    // we abbreviate as "POT" below). The intended use case for this path
    // is in LSTM cells, where, due to the constraints of implementing
    // some of the math in these LSTM cells in fixed-point arithmetic,
    // we need to have such symmetric, power-of-two quantization
    // (Fixed-point formats are inherently symmetric, power-of-two).
    TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

    TF_LITE_ENSURE(context, input1_scale_is_pot);
    TF_LITE_ENSURE(context, input2_scale_is_pot);
    TF_LITE_ENSURE(context, output_scale_is_pot);

    data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
    data->input2_shift = input2_scale_log2_rounded - output_scale_log2_rounded;

    // Shifting of one input is supported. The graph quantization should ensure
    // that the other input matches the output.
    TF_LITE_ENSURE(context, data->input1_shift == 0 || data->input2_shift == 0);
    TF_LITE_ENSURE(context, data->input1_shift <= 0);
    TF_LITE_ENSURE(context, data->input2_shift <= 0);
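    // Worked example (illustrative numbers, not from the original source):
    // if input1 scale = 2^-12 and output scale = 2^-10, then
    // input1_shift = -12 - (-10) = -2, i.e. input1 is rescaled down by two
    // bits to match the coarser output scale. The checks above require that
    // at most one input needs such a shift and that it is never positive.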

    TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
        context, params->activation, output, &data->output_activation_min,
        &data->output_activation_max));
  }

  return context->ResizeTensor(context, output, output_size);
}

template <KernelType kernel_type>
void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params,
             const OpData* data, const TfLiteTensor* input1,
             const TfLiteTensor* input2, TfLiteTensor* output) {
  tflite::ArithmeticParams op_params;
  const bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
      GetTensorShape(input1), GetTensorShape(input2), &op_params);
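  // Added note: ProcessBroadcastShapes returns true when the two input shapes
  // differ and broadcasting is required, and it also fills op_params with the
  // broadcast information that the optimized kernels expect.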
#define TF_LITE_ADD(type, opname, data_type)                             \
  data_type output_activation_min, output_activation_max;                \
  CalculateActivationRange(params->activation, &output_activation_min,   \
                           &output_activation_max);                      \
  SetActivationParams(output_activation_min, output_activation_max,      \
                      &op_params);                                       \
  type::opname(op_params, GetTensorShape(input1),                        \
               GetTensorData<data_type>(input1), GetTensorShape(input2), \
               GetTensorData<data_type>(input2), GetTensorShape(output), \
               GetTensorData<data_type>(output))
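  // For example (expansion shown for clarity, not present in the original),
  // TF_LITE_ADD(optimized_ops, Add, float) expands to code that computes the
  // float activation range, stores it in op_params, and then calls
  //   optimized_ops::Add(op_params, GetTensorShape(input1),
  //                      GetTensorData<float>(input1), GetTensorShape(input2),
  //                      GetTensorData<float>(input2), GetTensorShape(output),
  //                      GetTensorData<float>(output));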
  if (output->type == kTfLiteInt32) {
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int32_t);
      } else {
        TF_LITE_ADD(reference_ops, Add, int32_t);
      }
    } else {
      if (need_broadcast) {
        TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, int32_t);
      } else {
        TF_LITE_ADD(optimized_ops, Add, int32_t);
      }
    }
  } else if (output->type == kTfLiteInt64) {
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int64_t);
      } else {
        TF_LITE_ADD(reference_ops, Add, int64_t);
      }
    } else {
      if (need_broadcast) {
        TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, int64_t);
      } else {
        TF_LITE_ADD(optimized_ops, Add, int64_t);
      }
    }
  } else if (output->type == kTfLiteFloat32) {
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, float);
      } else {
        TF_LITE_ADD(reference_ops, Add, float);
      }
    } else {
      if (need_broadcast) {
        TF_LITE_ADD(optimized_ops, BroadcastAddDispatch, float);
      } else {
        TF_LITE_ADD(optimized_ops, Add, float);
      }
    }
  }
#undef TF_LITE_ADD
}

template <KernelType kernel_type>
TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                              TfLiteAddParams* params, const OpData* data,
                              const TfLiteTensor* input1,
                              const TfLiteTensor* input2,
                              TfLiteTensor* output) {
  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
      !data->pot_scale_int16) {
    tflite::ArithmeticParams op_params;
    op_params.left_shift = data->left_shift;
    op_params.input1_offset = data->input1_offset;
    op_params.input1_multiplier = data->input1_multiplier;
    op_params.input1_shift = data->input1_shift;
    op_params.input2_offset = data->input2_offset;
    op_params.input2_multiplier = data->input2_multiplier;
    op_params.input2_shift = data->input2_shift;
    op_params.output_offset = data->output_offset;
    op_params.output_multiplier = data->output_multiplier;
    op_params.output_shift = data->output_shift;
    SetActivationParams(data->output_activation_min,
                        data->output_activation_max, &op_params);
    bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
        GetTensorShape(input1), GetTensorShape(input2), &op_params);
#define TF_LITE_ADD(type, opname, dtype)                             \
  type::opname(op_params, GetTensorShape(input1),                    \
               GetTensorData<dtype>(input1), GetTensorShape(input2), \
               GetTensorData<dtype>(input2), GetTensorShape(output), \
               GetTensorData<dtype>(output));
    if (output->type == kTfLiteInt8) {
      if (kernel_type == kReference) {
        if (need_broadcast) {
          TF_LITE_ADD(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
        } else {
          TF_LITE_ADD(reference_integer_ops, Add, int8_t);
        }
      } else {
        if (need_broadcast) {
          TF_LITE_ADD(optimized_integer_ops, BroadcastAddDispatch, int8_t);
        } else {
          TF_LITE_ADD(optimized_integer_ops, Add, int8_t);
        }
      }
    } else if (output->type == kTfLiteInt16) {
      if (need_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int16_t);
      } else {
        if (kernel_type == kReference) {
          reference_ops::Add(
              op_params, GetTensorShape(input1), GetTensorData<int16_t>(input1),
              GetTensorShape(input2), GetTensorData<int16_t>(input2),
              GetTensorShape(output), GetTensorData<int16_t>(output), false);
        } else {
          TF_LITE_ADD(optimized_integer_ops, Add, int16_t);
        }
      }
    } else {
      if (kernel_type == kReference) {
        if (need_broadcast) {
          TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, uint8_t);
        } else {
          TF_LITE_ADD(reference_ops, Add, uint8_t);
        }
      } else {
        if (need_broadcast) {
          TF_LITE_ADD(optimized_ops, BroadcastAddDispatch, uint8_t);
        } else {
          TF_LITE_ADD(optimized_ops, Add, uint8_t);
        }
      }
    }
#undef TF_LITE_ADD
  } else if (output->type == kTfLiteInt16) {
    tflite::ArithmeticParams op_params;
    op_params.input1_shift = data->input1_shift;
    op_params.input2_shift = data->input2_shift;
    SetActivationParams(data->output_activation_min,
                        data->output_activation_max, &op_params);
#define TF_LITE_ADD(type, opname)                                      \
  type::opname(op_params, GetTensorShape(input1),                      \
               GetTensorData<int16_t>(input1), GetTensorShape(input2), \
               GetTensorData<int16_t>(input2), GetTensorShape(output), \
               GetTensorData<int16_t>(output))
    // The quantized version of Add doesn't support activations, so we
    // always use BroadcastAdd.
    if (kernel_type == kReference) {
      TF_LITE_ADD(reference_ops, Add);
    } else {
      TF_LITE_ADD(optimized_ops, Add);
    }
#undef TF_LITE_ADD
  }

  return kTfLiteOk;
}

template <KernelType kernel_type>
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  const TfLiteTensor* input1;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor1, &input1));
  const TfLiteTensor* input2;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor2, &input2));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context,
                    GetOutputSafe(context, node, kOutputTensor, &output));

  if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32 ||
      output->type == kTfLiteInt64) {
    EvalAdd<kernel_type>(context, node, params, data, input1, input2, output);
  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
             output->type == kTfLiteInt16) {
    TF_LITE_ENSURE_OK(context,
                      EvalAddQuantized<kernel_type>(context, node, params, data,
                                                    input1, input2, output));
  } else {
    TF_LITE_UNSUPPORTED_TYPE(context, output->type, "Add");
  }

  return kTfLiteOk;
}

}  // namespace add

TfLiteRegistration* Register_ADD_REF() {
  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                 add::Eval<add::kReference>};
  return &r;
}

TfLiteRegistration* Register_ADD_GENERIC_OPT() {
  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                 add::Eval<add::kGenericOptimized>};
  return &r;
}

TfLiteRegistration* Register_ADD_NEON_OPT() {
  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                 add::Eval<add::kNeonOptimized>};
  return &r;
}

TfLiteRegistration* Register_ADD() {
#ifdef USE_NEON
  return Register_ADD_NEON_OPT();
#else
  return Register_ADD_GENERIC_OPT();
#endif
}
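// Added note for context: these registration functions are consumed by a
// builtin op resolver outside this file. For example, register.cc wires this
// kernel up with a call along the lines of
//   AddBuiltin(BuiltinOperator_ADD, Register_ADD(), /*min_version=*/1,
//              /*max_version=*/...);
// where the supported version range depends on the TensorFlow Lite release.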

}  // namespace builtin
}  // namespace ops
}  // namespace tflite