/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_

#include <stdint.h>

#include <algorithm>
#include <limits>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {

inline void SubNonBroadcast(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const float* input1_data,
                            const RuntimeShape& input2_shape,
                            const float* input2_data,
                            const RuntimeShape& output_shape,
                            float* output_data) {
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] - input2_data[i], params.float_activation_min,
        params.float_activation_max);
  }
}

inline void SubNonBroadcast(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const int32_t* input1_data,
                            const RuntimeShape& input2_shape,
                            const int32_t* input2_data,
                            const RuntimeShape& output_shape,
                            int32_t* output_data) {
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] - input2_data[i], params.quantized_activation_min,
        params.quantized_activation_max);
  }
}

// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
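//
// To make the broadcast semantics concrete: with input1_shape = {2, 4} and
// input2_shape = {1, 4}, NdArrayDescsForElementwiseBroadcast assigns desc2 a
// stride of 0 along the broadcast (row) dimension, so the single row of
// input2 is reused against both rows of input1, i.e.
// output[r][c] = input1[r][c] - input2[0][c].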
template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const float* input1_data,
                             const RuntimeShape& input2_shape,
                             const float* input2_data,
                             const RuntimeShape& output_shape,
                             float* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/float");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
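  //
  // For example, with extents (batches, height, width, depth) = (2, 3, 4, 5)
  // in this row-major layout, element (b, y, x, c) sits at flat index
  // ((b * 3 + y) * 4 + x) * 5 + c, so consecutive channel values are adjacent
  // in memory.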
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.float_activation_min, params.float_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const uint8_t* input1_data,
                             const RuntimeShape& input2_shape,
                             const uint8_t* input2_data,
                             const RuntimeShape& output_shape,
                             uint8_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8_t");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
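  //
  // The quantized pipeline below mirrors the quantized Add kernel: each input
  // is re-centered by its zero-point offset, up-shifted by left_shift for
  // headroom, and rescaled onto a common scale via its per-input multiplier;
  // the difference is then rescaled to the output scale, re-offset by the
  // output zero point, and clamped to the activation range.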
  auto sub_func = [&](int indexes[N]) {
    const int32_t input1_val =
        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t input2_val =
        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sub, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<uint8_t>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const int32_t* input1_data,
                             const RuntimeShape& input2_shape,
                             const int32_t* input2_data,
                             const RuntimeShape& output_shape,
                             int32_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.quantized_activation_min, params.quantized_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const int8_t* input1_data,
                             const RuntimeShape& input2_shape,
                             const int8_t* input2_data,
                             const RuntimeShape& output_shape,
                             int8_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8_t");
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    const int32_t input1_val =
        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t input2_val =
        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sub, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<int8_t>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <int N = 5>
void BroadcastSubSlow(const ArithmeticParams& params,
                      const RuntimeShape& input1_shape,
                      const int64_t* input1_data,
                      const RuntimeShape& input2_shape,
                      const int64_t* input2_data,
                      const RuntimeShape& output_shape, int64_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.int64_activation_min, params.int64_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <typename T, int N = 5>
void BroadcastSubSlow(const ArithmeticParams& params,
                      const RuntimeShape& input1_shape, const T* input1_data,
                      const RuntimeShape& input2_shape, const T* input2_data,
                      const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/templated");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.quantized_activation_min, params.quantized_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

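// "16POT" refers to 16-bit inputs whose scales are powers of two; under that
// assumption, rescaling needs only gemmlowp::RoundingDivideByPOT (a rounding
// arithmetic right shift by -input_shift) instead of the fixed-point multiply
// used in the other quantized paths.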
template <int N = 5>
inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
                                  const RuntimeShape& input1_shape,
                                  const int16_t* input1_data,
                                  const RuntimeShape& input2_shape,
                                  const int16_t* input2_data,
                                  const RuntimeShape& output_shape,
                                  int16_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    const int32_t input1_val = input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t input2_val = input2_data[SubscriptToIndex(desc2, indexes)];
    const int32_t scaled_input1_val =
        gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
    const int32_t scaled_input2_val =
        gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
    const int32_t raw_output = scaled_input1_val - scaled_input2_val;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<int16_t>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

// Element-wise Sub that can often be used for inner loop of broadcast sub as
// well as the non-broadcast sub.
inline void SubElementwise(int size, const ArithmeticParams& params,
                           const uint8_t* input1_data,
                           const uint8_t* input2_data, uint8_t* output_data) {
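  // In the TFLite kernels these offsets are the negated input zero points;
  // for uint8_t data the zero point lies in [0, 255], so each offset falls
  // within the open interval (-256, 256) checked below.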
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);

  for (int i = 0; i < size; ++i) {
    const int32_t input1_val = params.input1_offset + input1_data[i];
    const int32_t input2_val = params.input2_offset + input2_data[i];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sub, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<uint8_t>(clamped_output);
  }
}

// Element-wise Sub that can often be used for inner loop of broadcast sub as
// well as the non-broadcast sub.
inline void SubElementwise(int size, const ArithmeticParams& params,
                           const int8_t* input1_data, const int8_t* input2_data,
                           int8_t* output_data) {
  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);

  for (int i = 0; i < size; ++i) {
    const int32_t input1_val = params.input1_offset + input1_data[i];
    const int32_t input2_val = params.input2_offset + input2_data[i];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sub, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<int8_t>(clamped_output);
  }
}

inline void Sub(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8_t* input1_data,
                const RuntimeShape& input2_shape, const uint8_t* input2_data,
                const RuntimeShape& output_shape, uint8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  SubElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void Sub(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int8_t* input1_data,
                const RuntimeShape& input2_shape, const int8_t* input2_data,
                const RuntimeShape& output_shape, int8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
  SubElementwise(flat_size, params, input1_data, input2_data, output_data);
}

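// Generic 4-D broadcast Sub. Note that, unlike SubWithActivation below, this
// path writes the raw difference and applies no activation clamping.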
template <typename T>
void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
         const T* input1_data, const RuntimeShape& input2_shape,
         const T* input2_data, const RuntimeShape& output_shape,
         T* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
        }
      }
    }
  }
}

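// Overloads that pick the activation bounds matching the element type, so the
// templated SubWithActivation below can clamp with the appropriate fields of
// ArithmeticParams.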
inline void SetActivationMinMax(const ArithmeticParams& params,
                                int32_t* activation_min,
                                int32_t* activation_max) {
  *activation_min = params.quantized_activation_min;
  *activation_max = params.quantized_activation_max;
}

inline void SetActivationMinMax(const ArithmeticParams& params,
                                float* activation_min, float* activation_max) {
  *activation_min = params.float_activation_min;
  *activation_max = params.float_activation_max;
}

inline void SetActivationMinMax(const ArithmeticParams& params,
                                int64_t* activation_min,
                                int64_t* activation_max) {
  *activation_min = params.int64_activation_min;
  *activation_max = params.int64_activation_max;
}

template <typename T>
inline void SubWithActivation(
    const ArithmeticParams& params, const RuntimeShape& input1_shape,
    const T* input1_data, const RuntimeShape& input2_shape,
    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("SubWithActivation");
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  T activation_min, activation_max;
  SetActivationMinMax(params, &activation_min, &activation_max);

  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] - input2_data[i], activation_min, activation_max);
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_