/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_

#include <algorithm>
#include <cstdint>
#include <limits>
#include <type_traits>

#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"

namespace tflite {

namespace reference_ops {

template <typename T>
inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const T* input1_data,
                const RuntimeShape& input2_shape, const T* input2_data,
                const RuntimeShape& output_shape, T* output_data) {
  T activation_min, activation_max;
  GetActivationParams(params, &activation_min, &activation_max);

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] + input2_data[i], activation_min, activation_max);
  }
}
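
// Example usage of the generic Add above (a minimal sketch; the shape and the
// data values are illustrative, not taken from the TFLite test suite):
//   ArithmeticParams params;
//   SetActivationParams(std::numeric_limits<float>::lowest(),
//                       std::numeric_limits<float>::max(), &params);
//   const RuntimeShape shape({1, 2, 2, 1});
//   const float in1[4] = {1, 2, 3, 4};
//   const float in2[4] = {5, 6, 7, 8};
//   float out[4];
//   Add(params, shape, in1, shape, in2, shape, out);  // out = {6, 8, 10, 12}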

// Element-wise add that can often be used for the inner loop of a broadcast
// add as well as the non-broadcast add.

// This function is used for 8-bit as well as for 16-bit, but the accumulator
// is 32-bit in both cases. Overflow cannot occur thanks to the choice of the
// shift (20 or 15, respectively - see add.cc for more comments).
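//
// A sketch of the quantized arithmetic performed per element, where a stored
// value q represents the real value scale * (q + offset) (in TFLite the
// offsets are the negated zero points):
//   1. Add the offset and shift left by params.left_shift so that the
//      smaller-than-one fixed-point multipliers below retain precision.
//   2. Rescale each input to a common scale with
//      MultiplyByQuantizedMultiplierSmallerThanOneExp.
//   3. Add the two 32-bit values, rescale the sum to the output scale, add
//      the output offset, and clamp to the quantized activation range.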
template <typename T>
inline void AddElementwise(int size, const ArithmeticParams& params,
                           const T* input1_data, const T* input2_data,
                           T* output_data) {
  TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
  TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
  TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
  TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());

  for (int i = 0; i < size; ++i) {
    const int32_t input1_val = params.input1_offset + input1_data[i];
    const int32_t input2_val = params.input2_offset + input2_data[i];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sum, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<T>(clamped_output);
  }
}

// Scalar-broadcast add that can be used for the inner loop of a more general
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
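//
// Because input1 is a single scalar value, its offset-corrected, shifted and
// rescaled contribution is computed once before the loop and reused for
// every element of input2.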
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
                               uint8_t input1_data, const uint8_t* input2_data,
                               uint8_t* output_data) {
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);

  const int32_t input1_val = params.input1_offset + input1_data;
  const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
  const int32_t scaled_input1_val =
      MultiplyByQuantizedMultiplierSmallerThanOneExp(
          shifted_input1_val, params.input1_multiplier, params.input1_shift);
  for (int i = 0; i < size; ++i) {
    const int32_t input2_val = params.input2_offset + input2_data[i];
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sum, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<uint8_t>(clamped_output);
  }
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8_t* input1_data,
                const RuntimeShape& input2_shape, const uint8_t* input2_data,
                const RuntimeShape& output_shape, uint8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void AddGeneralParamScale(const ArithmeticParams& params,
                                 const RuntimeShape& input1_shape,
                                 const int16_t* input1_data,
                                 const RuntimeShape& input2_shape,
                                 const int16_t* input2_data,
                                 const RuntimeShape& output_shape,
                                 int16_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  int max_value = std::numeric_limits<int16_t>::max();

  TFLITE_DCHECK_GT(params.input1_offset, -max_value);
  TFLITE_DCHECK_GT(params.input2_offset, -max_value);
  TFLITE_DCHECK_LT(params.input1_offset, max_value);
  TFLITE_DCHECK_LT(params.input2_offset, max_value);
  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

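// Add for int16 inputs. When pot_scale is true, the input and output scales
// are assumed to be related by powers of two (hence the name; see add.cc), so
// the inputs can be aligned by a simple rounding right shift and summed with
// gemmlowp's saturating fixed-point arithmetic: the DCHECKs below require
// both input shifts to be non-positive and at least one of them to be zero.
// Otherwise the general rescaling path of AddGeneralParamScale is taken.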
inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16_t* input1_data,
                const RuntimeShape& input2_shape, const int16_t* input2_data,
                const RuntimeShape& output_shape, int16_t* output_data,
                bool pot_scale = true) {
  if (!pot_scale) {
    AddGeneralParamScale(params, input1_shape, input1_data, input2_shape,
                         input2_data, output_shape, output_data);
    return;
  }

  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  const int input1_shift = params.input1_shift;
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  const int16_t output_activation_min = params.quantized_activation_min;
  const int16_t output_activation_max = params.quantized_activation_max;

  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
  TFLITE_DCHECK_LE(input1_shift, 0);
  TFLITE_DCHECK_LE(params.input2_shift, 0);
  const int16_t* not_shift_input =
      input1_shift == 0 ? input1_data : input2_data;
  const int16_t* shift_input = input1_shift == 0 ? input2_data : input1_data;
  const int input_right_shift =
      input1_shift == 0 ? -params.input2_shift : -input1_shift;

  for (int i = 0; i < flat_size; i++) {
    // F0 uses 0 integer bits, range [-1, 1].
    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;

    F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
    F0 scaled_input = F0::FromRaw(
        gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
    F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
    const int16_t raw_output = result.raw();
    const int16_t clamped_output = std::min(
        output_activation_max, std::max(output_activation_min, raw_output));
    output_data[i] = clamped_output;
  }
}

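// The two BroadcastAdd4DSlow overloads below are selected at compile time
// with std::enable_if on is_small_integer<T>: the first handles types that
// can be added directly (e.g. float), while the second handles the small
// quantized integer types and mirrors the rescaling logic of AddElementwise.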
template <typename T>
inline typename std::enable_if<!is_small_integer<T>::value, void>::type
BroadcastAdd4DSlow(const ArithmeticParams& params,
                   const RuntimeShape& input1_shape, const T* input1_data,
                   const RuntimeShape& input2_shape, const T* input2_data,
                   const RuntimeShape& output_shape, T* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  T activation_min, activation_max;
  GetActivationParams(params, &activation_min, &activation_max);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
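  //
  // As a concrete sketch of that layout (assuming the usual row-major NHWC
  // indexing of RuntimeShape), the flat index produced by Offset is
  //   Offset(shape, b, y, x, c) = ((b * height + y) * width + x) * depth + c.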
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              ActivationFunctionWithMinMax<T>(
                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
                  activation_min, activation_max);
        }
      }
    }
  }
}

// This function is used for 8-bit as well as for 16-bit, but the accumulator
// is 32-bit in both cases. Overflow cannot occur thanks to the choice of the
// shift (20 or 15, respectively - see add.cc for more comments).
template <typename T>
inline typename std::enable_if<is_small_integer<T>::value, void>::type
BroadcastAdd4DSlow(const ArithmeticParams& params,
                   const RuntimeShape& input1_shape, const T* input1_data,
                   const RuntimeShape& input2_shape, const T* input2_data,
                   const RuntimeShape& output_shape, T* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          const int32_t input1_val =
              params.input1_offset +
              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
          const int32_t input2_val =
              params.input2_offset +
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
          const int32_t shifted_input1_val =
              input1_val * (1 << params.left_shift);
          const int32_t shifted_input2_val =
              input2_val * (1 << params.left_shift);
          const int32_t scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input1_val, params.input1_multiplier,
                  params.input1_shift);
          const int32_t scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input2_val, params.input2_multiplier,
                  params.input2_shift);
          const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
          const int32_t raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  raw_sum, params.output_multiplier, params.output_shift) +
              params.output_offset;
          const int32_t clamped_output =
              std::min(params.quantized_activation_max,
                       std::max(params.quantized_activation_min, raw_output));
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              static_cast<T>(clamped_output);
        }
      }
    }
  }
}

inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8_t* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8_t* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8_t* output_data) {
  ArithmeticParams switched_params = unswitched_params;
  switched_params.input1_offset = unswitched_params.input2_offset;
  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
  switched_params.input1_shift = unswitched_params.input2_shift;
  switched_params.input2_offset = unswitched_params.input1_offset;
  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
  switched_params.input2_shift = unswitched_params.input1_shift;

  const bool use_unswitched =
      unswitched_params.broadcast_category ==
      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;

  const ArithmeticParams& params =
      use_unswitched ? unswitched_params : switched_params;
  const uint8_t* input1_data =
      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
  const uint8_t* input2_data =
      use_unswitched ? unswitched_input2_data : unswitched_input1_data;

  // Fivefold nested loops. The second input resets its position for each
  // iteration of the second loop. The first input resets its position at the
  // beginning of the fourth loop. The innermost loop is an elementwise add of
  // sections of the arrays.
  uint8_t* output_data_ptr = output_data;
  const uint8_t* input1_data_ptr = input1_data;
  const uint8_t* input2_data_reset = input2_data;
  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
  // between input shapes. y3 for input 1 is always broadcast, and so the
  // dimension there is 1, whereas optionally y1 might be broadcast for
  // input 2. Put another way,
  //   input1.shape.FlatSize = y0 * y1 * y2 * y4,
  //   input2.shape.FlatSize = y0 * y2 * y3 * y4.
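  //
  // As an illustrative example (not drawn from add.cc): broadcasting an
  // input1 of shape {2, 1, 5} against an input2 of shape {2, 3, 5} can be
  // described by y0 = 1, y1 = 1, y2 = 2, y3 = 3, y4 = 5, giving
  //   input1.shape.FlatSize = 1 * 1 * 2 * 5 = 10,
  //   input2.shape.FlatSize = 1 * 2 * 3 * 5 = 30.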
  int y0 = params.broadcast_shape[0];
  int y1 = params.broadcast_shape[1];
  int y2 = params.broadcast_shape[2];
  int y3 = params.broadcast_shape[3];
  int y4 = params.broadcast_shape[4];
  if (y4 > 1) {
    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
    // dimension.
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8_t* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          for (int i3 = 0; i3 < y3; ++i3) {
            AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
                           output_data_ptr);
            input2_data_ptr += y4;
            output_data_ptr += y4;
          }
          // We have broadcast y4 of input1 data y3 times, and now move on.
          input1_data_ptr += y4;
        }
      }
      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
      input2_data_reset = input2_data_ptr;
    }
  } else {
    // Special case of y4 == 1, in which the innermost loop is a single
    // element and can be combined with the next (y3) as an inner broadcast.
    //
    // Note that this handles the case of pure scalar broadcast when
    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
    // broadcast with batch (as y2 > 1).
    //
    // NOTE: The process is the same as the above general case except
    // simplified for y4 == 1 and the loop over y3 is contained within the
    // AddScalarBroadcast function.
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8_t* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
                             output_data_ptr);
          input2_data_ptr += y3;
          output_data_ptr += y3;
          input1_data_ptr += 1;
        }
      }
      input2_data_reset = input2_data_ptr;
    }
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_