/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/sub.h"

#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <limits>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/add.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"

namespace tflite {
namespace ops {
namespace builtin {
namespace sub {
// This file has three implementations of Sub.
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  kNeonOptimized,
};
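// Register_SUB() below selects kNeonOptimized when built with USE_NEON and
// kGenericOptimized otherwise; kReference is exposed via Register_SUB_REF().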

constexpr int kInputTensor1 = 0;
constexpr int kInputTensor2 = 1;
constexpr int kOutputTensor = 0;

struct OpData {
  bool requires_broadcast;

  // These fields are used in both the general 8-bit -> 8-bit quantized path
  // and the special 16-bit -> 16-bit quantized path.
  int input1_shift;
  int input2_shift;
  int32 output_activation_min;
  int32 output_activation_max;

  // These fields are used only in the general 8-bit -> 8-bit quantized path.
  int32 input1_multiplier;
  int32 input2_multiplier;
  int32 output_multiplier;
  int output_shift;
  int left_shift;
  int32 input1_offset;
  int32 input2_offset;
  int32 output_offset;

  // Indicates whether the scale parameters are powers of two.
  // Used only in the 16-bit -> 16-bit quantized path.
  bool pot_scale_int16;
};

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  auto* data = new OpData;
  data->requires_broadcast = false;
  return data;
}

void Free(TfLiteContext* context, void* buffer) {
  delete reinterpret_cast<OpData*>(buffer);
}

TfLiteStatus PrepareGeneralSubOp(TfLiteContext* context,
                                 const TfLiteTensor* input_1,
                                 const TfLiteTensor* input_2,
                                 TfLiteTensor* output, TfLiteSubParams* params,
                                 OpData* op_params, int op_sign) {
  TF_LITE_ENSURE(context, output->type == kTfLiteUInt8 ||
                              output->type == kTfLiteInt8 ||
                              output->type == kTfLiteInt16);
  const auto& input1_quantization_params = input_1->params;
  const auto& input2_quantization_params = input_2->params;
  const auto& output_quantization_params = output->params;
  int32_t integer_type_min = 0;
  int32_t integer_type_max = 0;
  if (output->type == kTfLiteUInt8) {
    integer_type_min = std::numeric_limits<uint8_t>::min();
    integer_type_max = std::numeric_limits<uint8_t>::max();
  } else if (output->type == kTfLiteInt16) {
    integer_type_min = std::numeric_limits<int16_t>::min();
    integer_type_max = std::numeric_limits<int16_t>::max();
  } else {
    // output->type == kTfLiteInt8
    integer_type_min = std::numeric_limits<int8_t>::min();
    integer_type_max = std::numeric_limits<int8_t>::max();
  }

  TF_LITE_ENSURE(context,
                 input1_quantization_params.zero_point >= integer_type_min);
  TF_LITE_ENSURE(context,
                 input1_quantization_params.zero_point <= integer_type_max);
  TF_LITE_ENSURE(context,
                 input2_quantization_params.zero_point >= integer_type_min);
  TF_LITE_ENSURE(context,
                 input2_quantization_params.zero_point <= integer_type_max);
  TF_LITE_ENSURE(context,
                 output_quantization_params.zero_point >= integer_type_min);
  TF_LITE_ENSURE(context,
                 output_quantization_params.zero_point <= integer_type_max);

  op_params->input1_offset = -input1_quantization_params.zero_point;
  op_params->input2_offset = -input2_quantization_params.zero_point;
  op_params->output_offset = output_quantization_params.zero_point;

  // The shift is set to 15 for 16-bit inputs and to 20 for 8-bit inputs,
  // respectively. In the 16-bit case we have 65535 << 15, which is less than
  // 1 << 31, so the addition still fits in a 32-bit accumulator.
  op_params->left_shift = output->type == kTfLiteInt16 ? 15 : 20;
  const double twice_max_input_scale =
      2 * std::max(input1_quantization_params.scale,
                   input2_quantization_params.scale);
  const double real_input1_multiplier =
      input1_quantization_params.scale / twice_max_input_scale;
  const double real_input2_multiplier =
      input2_quantization_params.scale / twice_max_input_scale;
  const double real_output_multiplier =
      twice_max_input_scale /
      ((1 << op_params->left_shift) * output_quantization_params.scale);
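  // Illustrative example (not part of the original source): with input scales
  // s1 = 0.5 and s2 = 0.25 and an 8-bit output scale s_out = 0.5,
  // twice_max_input_scale = 1.0, so real_input1_multiplier = 0.5 and
  // real_input2_multiplier = 0.25, i.e. both inputs are rescaled onto the
  // common grid of twice_max_input_scale. The output multiplier,
  // 1.0 / ((1 << 20) * 0.5), then undoes the left shift and converts the
  // accumulated result back to the output scale.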

  tflite::QuantizeMultiplierSmallerThanOneExp(real_input1_multiplier,
                                              &op_params->input1_multiplier,
                                              &op_params->input1_shift);
  tflite::QuantizeMultiplierSmallerThanOneExp(real_input2_multiplier,
                                              &op_params->input2_multiplier,
                                              &op_params->input2_shift);
  op_params->input2_multiplier *= op_sign;
  tflite::QuantizeMultiplierSmallerThanOneExp(real_output_multiplier,
                                              &op_params->output_multiplier,
                                              &op_params->output_shift);

  TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
      context, params->activation, output, &op_params->output_activation_min,
      &op_params->output_activation_max));

  return kTfLiteOk;
}

TfLiteStatus PrepareInt16SubOpPOT(TfLiteContext* context,
                                  const TfLiteTensor* input1,
                                  const TfLiteTensor* input2,
                                  TfLiteTensor* output, TfLiteSubParams* params,
                                  OpData* data) {
  // 16bit -> 16bit special quantized path, supporting only a rather
  // narrow case of quantization parameters: zero_points must all be 0
  // ("symmetric quantization") and scales must be power-of-two (which
  // we abbreviate as "POT" below). The intended use case for this path
  // is in LSTM cells, where, due to the constraints of implementing
  // some of the math in these LSTM cells in fixed-point arithmetic,
  // we need to have such symmetric, power-of-two quantization
  // (Fixed-point formats are inherently symmetric, power-of-two).
  TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
  TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
  TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

  int input1_scale_log2_rounded;
  bool input1_scale_is_pot =
      CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
  TF_LITE_ENSURE(context, input1_scale_is_pot);

  int input2_scale_log2_rounded;
  bool input2_scale_is_pot =
      CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
  TF_LITE_ENSURE(context, input2_scale_is_pot);

  int output_scale_log2_rounded;
  bool output_scale_is_pot =
      CheckedLog2(output->params.scale, &output_scale_log2_rounded);
  TF_LITE_ENSURE(context, output_scale_is_pot);

  data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
  data->input2_shift = input2_scale_log2_rounded - output_scale_log2_rounded;

  // Shifting of one input is supported. The graph quantization should ensure
  // that the other input matches the output.
  TF_LITE_ENSURE(context, data->input1_shift == 0 || data->input2_shift == 0);
  TF_LITE_ENSURE(context, data->input1_shift <= 0);
  TF_LITE_ENSURE(context, data->input2_shift <= 0);
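  // Note: non-positive shifts mean each input scale is at least as fine as
  // the output scale, so rescaling an input onto the output grid is a right
  // shift and cannot overflow.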

  TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
      context, params->activation, output, &data->output_activation_min,
      &data->output_activation_max));
  return kTfLiteOk;
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  OpData* data = reinterpret_cast<OpData*>(node->user_data);
  auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);

  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);

  const TfLiteTensor* input1;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor1, &input1));
  const TfLiteTensor* input2;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor2, &input2));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context,
                    GetOutputSafe(context, node, kOutputTensor, &output));

  TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
  output->type = input2->type;

  data->requires_broadcast = !HaveSameShapes(input1, input2);

  TfLiteIntArray* output_size = nullptr;
  if (data->requires_broadcast) {
    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
                                   context, input1, input2, &output_size));
  } else {
    output_size = TfLiteIntArrayCopy(input1->dims);
  }

  // 8-bit -> 8-bit general quantized path with general rescalings, as well as
  // 16-bit -> 16-bit with general rescalings.

  // There are two implementations of the SUB operator for 16-bit inputs,
  // depending on whether the scale parameter is a power of two or not.
  // Currently only the implementation for the general case is used, but the
  // other implementation is kept for older model versions.
  bool general_scale_int16 = false;

  bool input1_scale_is_pot = false;
  bool input2_scale_is_pot = false;
  bool output_scale_is_pot = false;

  int input1_scale_log2_rounded{0};
  int input2_scale_log2_rounded{0};
  int output_scale_log2_rounded{0};

  if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
      output->type == kTfLiteInt16) {
    TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

    general_scale_int16 = !params || !params->pot_scale_int16;

    if (!general_scale_int16) {
      // Prepare for the case where the scale parameters are powers of two.
      input1_scale_is_pot =
          CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);

      input2_scale_is_pot =
          CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);

      output_scale_is_pot =
          CheckedLog2(output->params.scale, &output_scale_log2_rounded);

      general_scale_int16 =
          !input1_scale_is_pot || !input2_scale_is_pot || !output_scale_is_pot;
    }
  }

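  // pot_scale_int16 selects the evaluation path in EvalQuantized: when false,
  // the general rescaling path reuses the int16 Add kernels; when true, the
  // dedicated power-of-two Sub16 kernels are used instead.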
  data->pot_scale_int16 = !general_scale_int16;

  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
      general_scale_int16) {
    TF_LITE_ENSURE_OK(context, PrepareGeneralSubOp(context, input1, input2,
                                                   output, params, data, -1));
  } else if (output->type == kTfLiteInt16) {
    // LSTM-special case with power-of-two scale parameters.
    TF_LITE_ENSURE_OK(context, PrepareInt16SubOpPOT(context, input1, input2,
                                                    output, params, data));
  }

  return context->ResizeTensor(context, output, output_size);
}

template <KernelType kernel_type, typename data_type>
void EvalSubImpl(TfLiteContext* context, TfLiteNode* node,
                 TfLiteSubParams* params, const OpData* data,
                 const TfLiteTensor* input1, const TfLiteTensor* input2,
                 bool requires_broadcast, TfLiteTensor* output) {
  data_type output_activation_min, output_activation_max;
  CalculateActivationRange(params->activation, &output_activation_min,
                           &output_activation_max);
  tflite::ArithmeticParams op_params;
  SetActivationParams(output_activation_min, output_activation_max, &op_params);

  switch (kernel_type) {
    case kReference:
      if (requires_broadcast) {
        reference_ops::BroadcastSubSlow(
            op_params, GetTensorShape(input1), GetTensorData<data_type>(input1),
            GetTensorShape(input2), GetTensorData<data_type>(input2),
            GetTensorShape(output), GetTensorData<data_type>(output));
      } else {
        reference_ops::SubWithActivation(
            op_params, GetTensorShape(input1), GetTensorData<data_type>(input1),
            GetTensorShape(input2), GetTensorData<data_type>(input2),
            GetTensorShape(output), GetTensorData<data_type>(output));
      }
      break;
    case kGenericOptimized:
    case kNeonOptimized:
      if (requires_broadcast) {
        optimized_ops::BroadcastSubSlow(
            op_params, GetTensorShape(input1), GetTensorData<data_type>(input1),
            GetTensorShape(input2), GetTensorData<data_type>(input2),
            GetTensorShape(output), GetTensorData<data_type>(output));
      } else {
        optimized_ops::SubWithActivation(
            op_params, GetTensorShape(input1), GetTensorData<data_type>(input1),
            GetTensorShape(input2), GetTensorData<data_type>(input2),
            GetTensorShape(output), GetTensorData<data_type>(output));
      }
      break;
  }
}

template <KernelType kernel_type>
void EvalSub(TfLiteContext* context, TfLiteNode* node, TfLiteSubParams* params,
             const OpData* data, const TfLiteTensor* input1,
             const TfLiteTensor* input2, TfLiteTensor* output) {
  const bool requires_broadcast = data->requires_broadcast;
  switch (output->type) {
    case kTfLiteInt32:
      EvalSubImpl<kernel_type, int32_t>(context, node, params, data, input1,
                                        input2, requires_broadcast, output);
      break;
    case kTfLiteFloat32:
      EvalSubImpl<kernel_type, float>(context, node, params, data, input1,
                                      input2, requires_broadcast, output);
      break;
    case kTfLiteInt64:
      EvalSubImpl<kernel_type, int64_t>(context, node, params, data, input1,
                                        input2, requires_broadcast, output);
      break;

    default:
      TF_LITE_KERNEL_LOG(context, "output type %s is not supported.",
                         TfLiteTypeGetName(output->type));
  }
}

template <KernelType kernel_type>
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                   TfLiteSubParams* params, const OpData* data,
                   const TfLiteTensor* input1, const TfLiteTensor* input2,
                   TfLiteTensor* output) {
  tflite::ArithmeticParams op_params;
  op_params.left_shift = data->left_shift;
  op_params.input1_offset = data->input1_offset;
  op_params.input1_multiplier = data->input1_multiplier;
  op_params.input1_shift = data->input1_shift;
  op_params.input2_offset = data->input2_offset;
  op_params.input2_multiplier = data->input2_multiplier;
  op_params.input2_shift = data->input2_shift;
  op_params.output_offset = data->output_offset;
  op_params.output_multiplier = data->output_multiplier;
  op_params.output_shift = data->output_shift;
  SetActivationParams(data->output_activation_min, data->output_activation_max,
                      &op_params);

  const bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
      GetTensorShape(input1), GetTensorShape(input2), &op_params);
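  // need_broadcast selects between the broadcasting and elementwise kernel
  // variants below; when broadcasting is required, ProcessBroadcastShapes also
  // records the broadcast shape information in op_params.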

#define TF_LITE_SUB(type, opname, data_type)                             \
  type::opname(op_params, GetTensorShape(input1),                        \
               GetTensorData<data_type>(input1), GetTensorShape(input2), \
               GetTensorData<data_type>(input2), GetTensorShape(output), \
               GetTensorData<data_type>(output))
  // NOTE: We are using the add kernels. This is possible as the second
  // value's multiplier is negated before being passed down.
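  // (Prepare calls PrepareGeneralSubOp with op_sign == -1, which flips the
  // sign of input2_multiplier, so the Add kernels below effectively compute
  // input1 - input2.)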
  if (output->type == kTfLiteInt8) {
    if (need_broadcast) {
      TF_LITE_SUB(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
    } else {
      TF_LITE_SUB(reference_integer_ops, Add, int8_t);
    }
  } else if (!data->pot_scale_int16) {
    if (need_broadcast) {
      TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, int16_t);
    } else {
      reference_ops::Add(op_params, GetTensorShape(input1),
                         GetTensorData<int16_t>(input1), GetTensorShape(input2),
                         GetTensorData<int16_t>(input2), GetTensorShape(output),
                         GetTensorData<int16_t>(output), false);
    }
  } else if (output->type == kTfLiteUInt8) {
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, uint8_t);
      } else {
        TF_LITE_SUB(reference_ops, Add, uint8_t);
      }
    } else {
      if (need_broadcast) {
        optimized_ops::BroadcastAddDispatch(
            op_params, GetTensorShape(input1), GetTensorData<uint8_t>(input1),
            GetTensorShape(input2), GetTensorData<uint8_t>(input2),
            GetTensorShape(output), GetTensorData<uint8_t>(output));
      } else {
        TF_LITE_SUB(optimized_ops, Add, uint8_t);
      }
    }
  } else {
    // In the case of 16-bit sub with POT scaling, we use the sub kernels,
    // as there is no multiplier to negate and the add kernels cannot be
    // reused.
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_SUB(reference_ops, BroadcastSub16POTSlow, int16_t);
      } else {
        TF_LITE_SUB(reference_ops, Sub16, int16_t);
      }
    } else {
      if (need_broadcast) {
        TF_LITE_SUB(optimized_ops, BroadcastSub16POTSlow, int16_t);
      } else {
        TF_LITE_SUB(optimized_ops, Sub16, int16_t);
      }
    }
  }
#undef TF_LITE_SUB
}

template <KernelType kernel_type>
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  const TfLiteTensor* input1;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor1, &input1));
  const TfLiteTensor* input2;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor2, &input2));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context,
                    GetOutputSafe(context, node, kOutputTensor, &output));

  if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32 ||
      output->type == kTfLiteInt64) {
    EvalSub<kernel_type>(context, node, params, data, input1, input2, output);
  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
             output->type == kTfLiteInt16) {
    EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
                               output);
  } else {
    context->ReportError(
        context,
        "output type %d is not supported, requires float|uint8|int32 types.",
        output->type);
    return kTfLiteError;
  }

  return kTfLiteOk;
}

}  // namespace sub

TfLiteRegistration* Register_SUB_REF() {
  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                 sub::Eval<sub::kReference>};
  return &r;
}

TfLiteRegistration* Register_SUB_GENERIC_OPT() {
  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                 sub::Eval<sub::kGenericOptimized>};
  return &r;
}

TfLiteRegistration* Register_SUB_NEON_OPT() {
  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                 sub::Eval<sub::kNeonOptimized>};
  return &r;
}

TfLiteRegistration* Register_SUB() {
#ifdef USE_NEON
  return Register_SUB_NEON_OPT();
#else
  return Register_SUB_GENERIC_OPT();
#endif
}

}  // namespace builtin
}  // namespace ops
}  // namespace tflite