/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/sub.h"

#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <limits>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/add.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"

namespace tflite {
namespace ops {
namespace builtin {
namespace sub {

// This file has three implementations of Sub.
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  kNeonOptimized,
};

constexpr int kInputTensor1 = 0;
constexpr int kInputTensor2 = 1;
constexpr int kOutputTensor = 0;

struct OpData {
  bool requires_broadcast;

  // These fields are used in both the general 8-bit -> 8-bit quantized path
  // and the special 16-bit -> 16-bit quantized path.
  int input1_shift;
  int input2_shift;
  int32 output_activation_min;
  int32 output_activation_max;

  // These fields are used only in the general 8-bit -> 8-bit quantized path.
  int32 input1_multiplier;
  int32 input2_multiplier;
  int32 output_multiplier;
  int output_shift;
  int left_shift;
  int32 input1_offset;
  int32 input2_offset;
  int32 output_offset;

  // Indicates whether the scale parameter is a power of two.
  // Used only in the 16-bit -> 16-bit quantized path.
  bool pot_scale_int16;
};
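
// Reading ahead in this file: the *_offset fields hold the negated input zero
// points (and the plain output zero point), left_shift is the pre-scaling
// shift applied before the fixed-point multipliers, and input2_multiplier is
// negated in PrepareGeneralSubOp so that EvalQuantized can reuse the
// quantized add kernels to compute a subtraction.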

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  auto* data = new OpData;
  data->requires_broadcast = false;
  return data;
}

void Free(TfLiteContext* context, void* buffer) {
  delete reinterpret_cast<OpData*>(buffer);
}

TfLiteStatus PrepareGeneralSubOp(TfLiteContext* context,
                                 const TfLiteTensor* input_1,
                                 const TfLiteTensor* input_2,
                                 TfLiteTensor* output, TfLiteSubParams* params,
                                 OpData* op_params, int op_sign) {
  TF_LITE_ENSURE(context, output->type == kTfLiteUInt8 ||
                              output->type == kTfLiteInt8 ||
                              output->type == kTfLiteInt16);
  const auto& input1_quantization_params = input_1->params;
  const auto& input2_quantization_params = input_2->params;
  const auto& output_quantization_params = output->params;
  int32_t integer_type_min = 0;
  int32_t integer_type_max = 0;
  if (output->type == kTfLiteUInt8) {
    integer_type_min = std::numeric_limits<uint8_t>::min();
    integer_type_max = std::numeric_limits<uint8_t>::max();
  } else if (output->type == kTfLiteInt16) {
    integer_type_min = std::numeric_limits<int16_t>::min();
    integer_type_max = std::numeric_limits<int16_t>::max();
  } else {
    // output->type == kTfLiteInt8
    integer_type_min = std::numeric_limits<int8_t>::min();
    integer_type_max = std::numeric_limits<int8_t>::max();
  }

  TF_LITE_ENSURE(context,
                 input1_quantization_params.zero_point >= integer_type_min);
  TF_LITE_ENSURE(context,
                 input1_quantization_params.zero_point <= integer_type_max);
  TF_LITE_ENSURE(context,
                 input2_quantization_params.zero_point >= integer_type_min);
  TF_LITE_ENSURE(context,
                 input2_quantization_params.zero_point <= integer_type_max);
  TF_LITE_ENSURE(context,
                 output_quantization_params.zero_point >= integer_type_min);
  TF_LITE_ENSURE(context,
                 output_quantization_params.zero_point <= integer_type_max);

  op_params->input1_offset = -input1_quantization_params.zero_point;
  op_params->input2_offset = -input2_quantization_params.zero_point;
  op_params->output_offset = output_quantization_params.zero_point;

  // The shift is set to 15 for the 16-bit case and to 20 for the 8-bit case.
  // For 16-bit inputs, 65535 << 15 is still less than 1 << 31, so the scaled
  // addition fits in a 32-bit accumulator.
  op_params->left_shift = output->type == kTfLiteInt16 ? 15 : 20;
  const double twice_max_input_scale =
      2 * std::max(input1_quantization_params.scale,
                   input2_quantization_params.scale);
  const double real_input1_multiplier =
      input1_quantization_params.scale / twice_max_input_scale;
  const double real_input2_multiplier =
      input2_quantization_params.scale / twice_max_input_scale;
  const double real_output_multiplier =
      twice_max_input_scale /
      ((1 << op_params->left_shift) * output_quantization_params.scale);
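  // Illustrative numbers (hypothetical scales, not from a real model): with
  // 8-bit inputs where input1 scale = 0.02, input2 scale = 0.01 and output
  // scale = 0.05, twice_max_input_scale = 0.04, so
  //   real_input1_multiplier = 0.02 / 0.04 = 0.5
  //   real_input2_multiplier = 0.01 / 0.04 = 0.25
  //   real_output_multiplier = 0.04 / ((1 << 20) * 0.05) ~= 7.6e-7
  // Each real multiplier is then encoded below as a 32-bit fixed-point
  // multiplier plus a shift by QuantizeMultiplierSmallerThanOneExp.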

  tflite::QuantizeMultiplierSmallerThanOneExp(real_input1_multiplier,
                                              &op_params->input1_multiplier,
                                              &op_params->input1_shift);
  tflite::QuantizeMultiplierSmallerThanOneExp(real_input2_multiplier,
                                              &op_params->input2_multiplier,
                                              &op_params->input2_shift);
  op_params->input2_multiplier *= op_sign;
  tflite::QuantizeMultiplierSmallerThanOneExp(real_output_multiplier,
                                              &op_params->output_multiplier,
                                              &op_params->output_shift);

  TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
      context, params->activation, output, &op_params->output_activation_min,
      &op_params->output_activation_max));

  return kTfLiteOk;
}

TfLiteStatus PrepareInt16SubOpPOT(TfLiteContext* context,
                                  const TfLiteTensor* input1,
                                  const TfLiteTensor* input2,
                                  TfLiteTensor* output, TfLiteSubParams* params,
                                  OpData* data) {
  // 16bit -> 16bit special quantized path, supporting only a rather
  // narrow case of quantization parameters: zero_points must all be 0
  // ("symmetric quantization") and scales must be power-of-two (which
  // we abbreviate as "POT" below). The intended use case for this path
  // is in LSTM cells, where, due to the constraints of implementing
  // some of the math in these LSTM cells in fixed-point arithmetic,
  // we need to have such symmetric, power-of-two quantization
  // (Fixed-point formats are inherently symmetric, power-of-two).
  TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
  TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
  TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

  int input1_scale_log2_rounded;
  bool input1_scale_is_pot =
      CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
  TF_LITE_ENSURE(context, input1_scale_is_pot);

  int input2_scale_log2_rounded;
  bool input2_scale_is_pot =
      CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
  TF_LITE_ENSURE(context, input2_scale_is_pot);

  int output_scale_log2_rounded;
  bool output_scale_is_pot =
      CheckedLog2(output->params.scale, &output_scale_log2_rounded);
  TF_LITE_ENSURE(context, output_scale_is_pot);

  data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
  data->input2_shift = input2_scale_log2_rounded - output_scale_log2_rounded;

  // Shifting of one input is supported. The graph quantization should ensure
  // that the other input matches the output.
  TF_LITE_ENSURE(context, data->input1_shift == 0 || data->input2_shift == 0);
  TF_LITE_ENSURE(context, data->input1_shift <= 0);
  TF_LITE_ENSURE(context, data->input2_shift <= 0);
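  // Illustrative (hypothetical power-of-two scales): if input1 scale = 2^-12,
  // input2 scale = 2^-11 and output scale = 2^-11, CheckedLog2 yields -12,
  // -11 and -11, so input1_shift = -12 - (-11) = -1 and input2_shift = 0,
  // which satisfies the checks above.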

  TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
      context, params->activation, output, &data->output_activation_min,
      &data->output_activation_max));
  return kTfLiteOk;
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  OpData* data = reinterpret_cast<OpData*>(node->user_data);
  auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);

  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);

  const TfLiteTensor* input1;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor1, &input1));
  const TfLiteTensor* input2;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor2, &input2));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context,
                    GetOutputSafe(context, node, kOutputTensor, &output));

  TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
  output->type = input2->type;

  data->requires_broadcast = !HaveSameShapes(input1, input2);

  TfLiteIntArray* output_size = nullptr;
  if (data->requires_broadcast) {
    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
                                   context, input1, input2, &output_size));
  } else {
    output_size = TfLiteIntArrayCopy(input1->dims);
  }
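  // Illustrative broadcast case (hypothetical shapes): for input1 of shape
  // [2, 1, 3] and input2 of shape [1, 4, 3] the shapes differ, so
  // requires_broadcast is true and CalculateShapeForBroadcast resizes the
  // output to [2, 4, 3] (per dimension, the sizes must match or be 1).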

  // 8-bit -> 8-bit general quantized path, with general rescalings, as well
  // as 16-bit -> 16-bit with general rescalings.

  // There are two implementations of the SUB operator for 16-bit inputs,
  // depending on whether the scale parameter is a power of 2 or not.
  // Currently only the implementation for the general case is used, but the
  // POT implementation is still needed for older model versions.
  bool general_scale_int16 = false;

  bool input1_scale_is_pot = false;
  bool input2_scale_is_pot = false;
  bool output_scale_is_pot = false;

  int input1_scale_log2_rounded{0};
  int input2_scale_log2_rounded{0};
  int output_scale_log2_rounded{0};

  if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
      output->type == kTfLiteInt16) {
    TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

    general_scale_int16 = !params || !params->pot_scale_int16;

    if (!general_scale_int16) {
      // Do preparation for the case where the scale parameter is a power of 2.
      input1_scale_is_pot =
          CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);

      input2_scale_is_pot =
          CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);

      output_scale_is_pot =
          CheckedLog2(output->params.scale, &output_scale_log2_rounded);

      general_scale_int16 =
          !input1_scale_is_pot || !input2_scale_is_pot || !output_scale_is_pot;
    }
  }

  data->pot_scale_int16 = !general_scale_int16;

  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
      general_scale_int16) {
    TF_LITE_ENSURE_OK(context, PrepareGeneralSubOp(context, input1, input2,
                                                   output, params, data, -1));
  } else if (output->type == kTfLiteInt16) {
    // LSTM-special case with scale parameter of POT
    TF_LITE_ENSURE_OK(context, PrepareInt16SubOpPOT(context, input1, input2,
                                                    output, params, data));
  }

  return context->ResizeTensor(context, output, output_size);
}

template <KernelType kernel_type, typename data_type>
void EvalSubImpl(TfLiteContext* context, TfLiteNode* node,
                 TfLiteSubParams* params, const OpData* data,
                 const TfLiteTensor* input1, const TfLiteTensor* input2,
                 bool requires_broadcast, TfLiteTensor* output) {
  data_type output_activation_min, output_activation_max;
  CalculateActivationRange(params->activation, &output_activation_min,
                           &output_activation_max);
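  // For example, with float data and kTfLiteActRelu6 this yields the range
  // [0, 6]; with kTfLiteActNone it is the full numeric range of data_type.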
  tflite::ArithmeticParams op_params;
  SetActivationParams(output_activation_min, output_activation_max, &op_params);

  switch (kernel_type) {
    case kReference:
      if (requires_broadcast) {
        reference_ops::BroadcastSubSlow(
            op_params, GetTensorShape(input1), GetTensorData<data_type>(input1),
            GetTensorShape(input2), GetTensorData<data_type>(input2),
            GetTensorShape(output), GetTensorData<data_type>(output));
      } else {
        reference_ops::SubWithActivation(
            op_params, GetTensorShape(input1), GetTensorData<data_type>(input1),
            GetTensorShape(input2), GetTensorData<data_type>(input2),
            GetTensorShape(output), GetTensorData<data_type>(output));
      }
      break;
    case kGenericOptimized:
    case kNeonOptimized:
      if (requires_broadcast) {
        optimized_ops::BroadcastSubSlow(
            op_params, GetTensorShape(input1), GetTensorData<data_type>(input1),
            GetTensorShape(input2), GetTensorData<data_type>(input2),
            GetTensorShape(output), GetTensorData<data_type>(output));
      } else {
        optimized_ops::SubWithActivation(
            op_params, GetTensorShape(input1), GetTensorData<data_type>(input1),
            GetTensorShape(input2), GetTensorData<data_type>(input2),
            GetTensorShape(output), GetTensorData<data_type>(output));
      }
      break;
  }
}

template <KernelType kernel_type>
void EvalSub(TfLiteContext* context, TfLiteNode* node, TfLiteSubParams* params,
             const OpData* data, const TfLiteTensor* input1,
             const TfLiteTensor* input2, TfLiteTensor* output) {
  const bool requires_broadcast = data->requires_broadcast;
  switch (output->type) {
    case kTfLiteInt32:
      EvalSubImpl<kernel_type, int32_t>(context, node, params, data, input1,
                                        input2, requires_broadcast, output);
      break;
    case kTfLiteFloat32:
      EvalSubImpl<kernel_type, float>(context, node, params, data, input1,
                                      input2, requires_broadcast, output);
      break;
    case kTfLiteInt64:
      EvalSubImpl<kernel_type, int64_t>(context, node, params, data, input1,
                                        input2, requires_broadcast, output);
      break;

    default:
      TF_LITE_KERNEL_LOG(context, "output type %s is not supported.",
                         TfLiteTypeGetName(output->type));
  }
}

template <KernelType kernel_type>
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                   TfLiteSubParams* params, const OpData* data,
                   const TfLiteTensor* input1, const TfLiteTensor* input2,
                   TfLiteTensor* output) {
  tflite::ArithmeticParams op_params;
  op_params.left_shift = data->left_shift;
  op_params.input1_offset = data->input1_offset;
  op_params.input1_multiplier = data->input1_multiplier;
  op_params.input1_shift = data->input1_shift;
  op_params.input2_offset = data->input2_offset;
  op_params.input2_multiplier = data->input2_multiplier;
  op_params.input2_shift = data->input2_shift;
  op_params.output_offset = data->output_offset;
  op_params.output_multiplier = data->output_multiplier;
  op_params.output_shift = data->output_shift;
  SetActivationParams(data->output_activation_min, data->output_activation_max,
                      &op_params);

  const bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
      GetTensorShape(input1), GetTensorShape(input2), &op_params);

#define TF_LITE_SUB(type, opname, data_type)                             \
  type::opname(op_params, GetTensorShape(input1),                        \
               GetTensorData<data_type>(input1), GetTensorShape(input2), \
               GetTensorData<data_type>(input2), GetTensorShape(output), \
               GetTensorData<data_type>(output))
  // NOTE: We are using the add kernels. This is possible because the second
  // input's multiplier was already negated (via op_sign) in
  // PrepareGeneralSubOp before being passed down here.
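  // Schematically, a - b is evaluated as a + (-b): the add kernels compute
  //   ((a + input1_offset) * m1 + (b + input2_offset) * m2), then rescale by
  // the output multiplier, and with m2 already negated this produces the
  // subtraction result.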
  if (output->type == kTfLiteInt8) {
    if (need_broadcast) {
      TF_LITE_SUB(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
    } else {
      TF_LITE_SUB(reference_integer_ops, Add, int8_t);
    }
  } else if (!data->pot_scale_int16) {
    if (need_broadcast) {
      TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, int16_t);
    } else {
      reference_ops::Add(op_params, GetTensorShape(input1),
                         GetTensorData<int16_t>(input1), GetTensorShape(input2),
                         GetTensorData<int16_t>(input2), GetTensorShape(output),
                         GetTensorData<int16_t>(output), false);
    }
  } else if (output->type == kTfLiteUInt8) {
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, uint8_t);
      } else {
        TF_LITE_SUB(reference_ops, Add, uint8_t);
      }
    } else {
      if (need_broadcast) {
        optimized_ops::BroadcastAddDispatch(
            op_params, GetTensorShape(input1), GetTensorData<uint8_t>(input1),
            GetTensorShape(input2), GetTensorData<uint8_t>(input2),
            GetTensorShape(output), GetTensorData<uint8_t>(output));
      } else {
        TF_LITE_SUB(optimized_ops, Add, uint8_t);
      }
    }
  } else {
    // In the case of 16-bit sub with POT scaling, we use the sub kernels as
    // there is no multiplier to negate to reuse the add kernels.
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_SUB(reference_ops, BroadcastSub16POTSlow, int16_t);
      } else {
        TF_LITE_SUB(reference_ops, Sub16, int16_t);
      }
    } else {
      if (need_broadcast) {
        TF_LITE_SUB(optimized_ops, BroadcastSub16POTSlow, int16_t);
      } else {
        TF_LITE_SUB(optimized_ops, Sub16, int16_t);
      }
    }
  }
#undef TF_LITE_SUB
}

template <KernelType kernel_type>
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  const TfLiteTensor* input1;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor1, &input1));
  const TfLiteTensor* input2;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor2, &input2));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context,
                    GetOutputSafe(context, node, kOutputTensor, &output));

  if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32 ||
      output->type == kTfLiteInt64) {
    EvalSub<kernel_type>(context, node, params, data, input1, input2, output);
  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
             output->type == kTfLiteInt16) {
    EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
                               output);
  } else {
    context->ReportError(
        context,
        "output type %d is not supported, requires "
        "float|int32|int64|uint8|int8|int16 types.",
        output->type);
    return kTfLiteError;
  }

  return kTfLiteOk;
}

}  // namespace sub

TfLiteRegistration* Register_SUB_REF() {
  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                 sub::Eval<sub::kReference>};
  return &r;
}

TfLiteRegistration* Register_SUB_GENERIC_OPT() {
  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                 sub::Eval<sub::kGenericOptimized>};
  return &r;
}

TfLiteRegistration* Register_SUB_NEON_OPT() {
  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                 sub::Eval<sub::kNeonOptimized>};
  return &r;
}

TfLiteRegistration* Register_SUB() {
#ifdef USE_NEON
  return Register_SUB_NEON_OPT();
#else
  return Register_SUB_GENERIC_OPT();
#endif
}
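
// Sketch of typical wiring (illustrative, not part of this file): the builtin
// op resolver maps the SUB builtin to the registration returned above,
// roughly
//
//   tflite::MutableOpResolver resolver;
//   resolver.AddBuiltin(tflite::BuiltinOperator_SUB,
//                       tflite::ops::builtin::Register_SUB());
//
// so the choice between the NEON and the generic optimized kernel is made at
// compile time via USE_NEON.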

}  // namespace builtin
}  // namespace ops
}  // namespace tflite