1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
17
#include <algorithm>
#include <cstdint>
#include <limits>
#include <type_traits>

#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
23
24 namespace tflite {
25
26 namespace reference_ops {
27
28 template <typename T>
Add(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)29 inline void Add(const ArithmeticParams& params,
30 const RuntimeShape& input1_shape, const T* input1_data,
31 const RuntimeShape& input2_shape, const T* input2_data,
32 const RuntimeShape& output_shape, T* output_data) {
33 T activation_min, activation_max;
34 GetActivationParams(params, &activation_min, &activation_max);
35
36 const int flat_size =
37 MatchingElementsSize(input1_shape, input2_shape, output_shape);
38 for (int i = 0; i < flat_size; ++i) {
39 output_data[i] = ActivationFunctionWithMinMax(
40 input1_data[i] + input2_data[i], activation_min, activation_max);
41 }
42 }
43
44 // Element-wise add that can often be used for inner loop of broadcast add as
45 // well as the non-broadcast add.
46
47 // This function is used for 8-bit as well as for 16-bit, but the accumulator
48 // is 32-bit for both cases. The overflow does not happen due to the
49 // choice of the shift (20 or 15, accordingly - see add.cc for more comments).
50 template <typename T>
AddElementwise(int size,const ArithmeticParams & params,const T * input1_data,const T * input2_data,T * output_data)51 inline void AddElementwise(int size, const ArithmeticParams& params,
52 const T* input1_data, const T* input2_data,
53 T* output_data) {
54 TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
55 TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
56 TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
57 TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());
58
59 for (int i = 0; i < size; ++i) {
60 const int32_t input1_val = params.input1_offset + input1_data[i];
61 const int32_t input2_val = params.input2_offset + input2_data[i];
62 const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
63 const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
64 const int32_t scaled_input1_val =
65 MultiplyByQuantizedMultiplierSmallerThanOneExp(
66 shifted_input1_val, params.input1_multiplier, params.input1_shift);
67 const int32_t scaled_input2_val =
68 MultiplyByQuantizedMultiplierSmallerThanOneExp(
69 shifted_input2_val, params.input2_multiplier, params.input2_shift);
70 const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
71 const int32_t raw_output =
72 MultiplyByQuantizedMultiplierSmallerThanOneExp(
73 raw_sum, params.output_multiplier, params.output_shift) +
74 params.output_offset;
75 const int32_t clamped_output =
76 std::min(params.quantized_activation_max,
77 std::max(params.quantized_activation_min, raw_output));
78 output_data[i] = static_cast<T>(clamped_output);
79 }
80 }
81
82 // Scalar-broadcast add that can be used for inner loop of more general
83 // broadcast add, so that, for example, scalar-broadcast with batch will still
84 // be fast.
AddScalarBroadcast(int size,const ArithmeticParams & params,uint8_t input1_data,const uint8_t * input2_data,uint8_t * output_data)85 inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
86 uint8_t input1_data, const uint8_t* input2_data,
87 uint8_t* output_data) {
88 TFLITE_DCHECK_GT(params.input1_offset, -256);
89 TFLITE_DCHECK_GT(params.input2_offset, -256);
90 TFLITE_DCHECK_LT(params.input1_offset, 256);
91 TFLITE_DCHECK_LT(params.input2_offset, 256);
92
93 const int32_t input1_val = params.input1_offset + input1_data;
94 const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
95 const int32_t scaled_input1_val =
96 MultiplyByQuantizedMultiplierSmallerThanOneExp(
97 shifted_input1_val, params.input1_multiplier, params.input1_shift);
98 for (int i = 0; i < size; ++i) {
99 const int32_t input2_val = params.input2_offset + input2_data[i];
100 const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
101 const int32_t scaled_input2_val =
102 MultiplyByQuantizedMultiplierSmallerThanOneExp(
103 shifted_input2_val, params.input2_multiplier, params.input2_shift);
104 const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
105 const int32_t raw_output =
106 MultiplyByQuantizedMultiplierSmallerThanOneExp(
107 raw_sum, params.output_multiplier, params.output_shift) +
108 params.output_offset;
109 const int32_t clamped_output =
110 std::min(params.quantized_activation_max,
111 std::max(params.quantized_activation_min, raw_output));
112 output_data[i] = static_cast<uint8_t>(clamped_output);
113 }
114 }
115
Add(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8_t * input1_data,const RuntimeShape & input2_shape,const uint8_t * input2_data,const RuntimeShape & output_shape,uint8_t * output_data)116 inline void Add(const ArithmeticParams& params,
117 const RuntimeShape& input1_shape, const uint8_t* input1_data,
118 const RuntimeShape& input2_shape, const uint8_t* input2_data,
119 const RuntimeShape& output_shape, uint8_t* output_data) {
120 TFLITE_DCHECK_LE(params.quantized_activation_min,
121 params.quantized_activation_max);
122 const int flat_size =
123 MatchingElementsSize(input1_shape, input2_shape, output_shape);
124
125 TFLITE_DCHECK_GT(params.input1_offset, -256);
126 TFLITE_DCHECK_GT(params.input2_offset, -256);
127 TFLITE_DCHECK_LT(params.input1_offset, 256);
128 TFLITE_DCHECK_LT(params.input2_offset, 256);
129 AddElementwise(flat_size, params, input1_data, input2_data, output_data);
130 }
131
AddGeneralParamScale(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int16_t * input1_data,const RuntimeShape & input2_shape,const int16_t * input2_data,const RuntimeShape & output_shape,int16_t * output_data)132 inline void AddGeneralParamScale(const ArithmeticParams& params,
133 const RuntimeShape& input1_shape,
134 const int16_t* input1_data,
135 const RuntimeShape& input2_shape,
136 const int16_t* input2_data,
137 const RuntimeShape& output_shape,
138 int16_t* output_data) {
139 TFLITE_DCHECK_LE(params.quantized_activation_min,
140 params.quantized_activation_max);
141 const int flat_size =
142 MatchingElementsSize(input1_shape, input2_shape, output_shape);
143
144 int max_value = std::numeric_limits<int16_t>::max();
145
146 TFLITE_DCHECK_GT(params.input1_offset, -max_value);
147 TFLITE_DCHECK_GT(params.input2_offset, -max_value);
148 TFLITE_DCHECK_LT(params.input1_offset, max_value);
149 TFLITE_DCHECK_LT(params.input2_offset, max_value);
150 AddElementwise(flat_size, params, input1_data, input2_data, output_data);
151 }
152
153 inline void Add(const ArithmeticParams& params,
154 const RuntimeShape& input1_shape, const int16_t* input1_data,
155 const RuntimeShape& input2_shape, const int16_t* input2_data,
156 const RuntimeShape& output_shape, int16_t* output_data,
157 bool pot_scale = true) {
158 if (!pot_scale) {
159 AddGeneralParamScale(params, input1_shape, input1_data, input2_shape,
160 input2_data, output_shape, output_data);
161 return;
162 }
163
164 TFLITE_DCHECK_LE(params.quantized_activation_min,
165 params.quantized_activation_max);
166
167 const int input1_shift = params.input1_shift;
168 const int flat_size =
169 MatchingElementsSize(input1_shape, input2_shape, output_shape);
170 const int16_t output_activation_min = params.quantized_activation_min;
171 const int16_t output_activation_max = params.quantized_activation_max;
172
173 TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
174 TFLITE_DCHECK_LE(input1_shift, 0);
175 TFLITE_DCHECK_LE(params.input2_shift, 0);
176 const int16_t* not_shift_input =
177 input1_shift == 0 ? input1_data : input2_data;
178 const int16_t* shift_input = input1_shift == 0 ? input2_data : input1_data;
179 const int input_right_shift =
180 input1_shift == 0 ? -params.input2_shift : -input1_shift;
181
182 for (int i = 0; i < flat_size; i++) {
183 // F0 uses 0 integer bits, range [-1, 1].
184 using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
185
186 F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
187 F0 scaled_input = F0::FromRaw(
188 gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
189 F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
190 const int16_t raw_output = result.raw();
191 const int16_t clamped_output = std::min(
192 output_activation_max, std::max(output_activation_min, raw_output));
193 output_data[i] = clamped_output;
194 }
195 }
196
197 template <typename T>
198 inline typename std::enable_if<!is_small_integer<T>::value, void>::type
BroadcastAdd4DSlow(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)199 BroadcastAdd4DSlow(const ArithmeticParams& params,
200 const RuntimeShape& input1_shape, const T* input1_data,
201 const RuntimeShape& input2_shape, const T* input2_data,
202 const RuntimeShape& output_shape, T* output_data) {
203 NdArrayDesc<4> desc1;
204 NdArrayDesc<4> desc2;
205 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
206 &desc2);
207 const RuntimeShape extended_output_shape =
208 RuntimeShape::ExtendedShape(4, output_shape);
209
210 T activation_min, activation_max;
211 GetActivationParams(params, &activation_min, &activation_max);
212
213 // In Tensorflow, the dimensions are canonically named (batch_number, row,
214 // col, channel), with extents (batches, height, width, depth), with the
215 // trailing dimension changing most rapidly (channels has the smallest stride,
216 // typically 1 element).
217 //
218 // In generated C code, we store arrays with the dimensions reversed. The
219 // first dimension has smallest stride.
220 //
221 // We name our variables by their Tensorflow convention, but generate C code
222 // nesting loops such that the innermost loop has the smallest stride for the
223 // best cache behavior.
224 for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
225 for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
226 for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
227 for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
228 output_data[Offset(extended_output_shape, b, y, x, c)] =
229 ActivationFunctionWithMinMax<T>(
230 input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
231 input2_data[SubscriptToIndex(desc2, b, y, x, c)],
232 activation_min, activation_max);
233 }
234 }
235 }
236 }
237 }
238
239 // This function is used for 8-bit as well as for 16-bit, but the accumulator
240 // is 32-bit for both cases. The overflow does not happen due to the
241 // choice of the shift (20 or 15, accordingly - see add.cc for more comments).
242 template <typename T>
243 inline typename std::enable_if<is_small_integer<T>::value, void>::type
BroadcastAdd4DSlow(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)244 BroadcastAdd4DSlow(const ArithmeticParams& params,
245 const RuntimeShape& input1_shape, const T* input1_data,
246 const RuntimeShape& input2_shape, const T* input2_data,
247 const RuntimeShape& output_shape, T* output_data) {
248 NdArrayDesc<4> desc1;
249 NdArrayDesc<4> desc2;
250 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
251 &desc2);
252 const RuntimeShape extended_output_shape =
253 RuntimeShape::ExtendedShape(4, output_shape);
254
255 // In Tensorflow, the dimensions are canonically named (batch_number, row,
256 // col, channel), with extents (batches, height, width, depth), with the
257 // trailing dimension changing most rapidly (channels has the smallest stride,
258 // typically 1 element).
259 //
260 // In generated C code, we store arrays with the dimensions reversed. The
261 // first dimension has smallest stride.
262 //
263 // We name our variables by their Tensorflow convention, but generate C code
264 // nesting loops such that the innermost loop has the smallest stride for the
265 // best cache behavior.
266 for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
267 for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
268 for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
269 for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
270 const int32_t input1_val =
271 params.input1_offset +
272 input1_data[SubscriptToIndex(desc1, b, y, x, c)];
273 const int32_t input2_val =
274 params.input2_offset +
275 input2_data[SubscriptToIndex(desc2, b, y, x, c)];
276 const int32_t shifted_input1_val =
277 input1_val * (1 << params.left_shift);
278 const int32_t shifted_input2_val =
279 input2_val * (1 << params.left_shift);
280 const int32_t scaled_input1_val =
281 MultiplyByQuantizedMultiplierSmallerThanOneExp(
282 shifted_input1_val, params.input1_multiplier,
283 params.input1_shift);
284 const int32_t scaled_input2_val =
285 MultiplyByQuantizedMultiplierSmallerThanOneExp(
286 shifted_input2_val, params.input2_multiplier,
287 params.input2_shift);
288 const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
289 const int32_t raw_output =
290 MultiplyByQuantizedMultiplierSmallerThanOneExp(
291 raw_sum, params.output_multiplier, params.output_shift) +
292 params.output_offset;
293 const int32_t clamped_output =
294 std::min(params.quantized_activation_max,
295 std::max(params.quantized_activation_min, raw_output));
296 output_data[Offset(extended_output_shape, b, y, x, c)] =
297 static_cast<T>(clamped_output);
298 }
299 }
300 }
301 }
302 }
303
BroadcastAddFivefold(const ArithmeticParams & unswitched_params,const RuntimeShape & unswitched_input1_shape,const uint8_t * unswitched_input1_data,const RuntimeShape & unswitched_input2_shape,const uint8_t * unswitched_input2_data,const RuntimeShape & output_shape,uint8_t * output_data)304 inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
305 const RuntimeShape& unswitched_input1_shape,
306 const uint8_t* unswitched_input1_data,
307 const RuntimeShape& unswitched_input2_shape,
308 const uint8_t* unswitched_input2_data,
309 const RuntimeShape& output_shape,
310 uint8_t* output_data) {
311 ArithmeticParams switched_params = unswitched_params;
312 switched_params.input1_offset = unswitched_params.input2_offset;
313 switched_params.input1_multiplier = unswitched_params.input2_multiplier;
314 switched_params.input1_shift = unswitched_params.input2_shift;
315 switched_params.input2_offset = unswitched_params.input1_offset;
316 switched_params.input2_multiplier = unswitched_params.input1_multiplier;
317 switched_params.input2_shift = unswitched_params.input1_shift;
318
319 const bool use_unswitched =
320 unswitched_params.broadcast_category ==
321 tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
322
323 const ArithmeticParams& params =
324 use_unswitched ? unswitched_params : switched_params;
325 const uint8_t* input1_data =
326 use_unswitched ? unswitched_input1_data : unswitched_input2_data;
327 const uint8_t* input2_data =
328 use_unswitched ? unswitched_input2_data : unswitched_input1_data;
329
330 // Fivefold nested loops. The second input resets its position for each
331 // iteration of the second loop. The first input resets its position at the
332 // beginning of the fourth loop. The innermost loop is an elementwise add of
333 // sections of the arrays.
334 uint8_t* output_data_ptr = output_data;
335 const uint8_t* input1_data_ptr = input1_data;
336 const uint8_t* input2_data_reset = input2_data;
337 // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
338 // between input shapes. y3 for input 1 is always broadcast, and so the
339 // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
340 // Put another way,
341 // input1.shape.FlatSize = y0 * y1 * y2 * y4,
342 // input2.shape.FlatSize = y0 * y2 * y3 * y4.
343 int y0 = params.broadcast_shape[0];
344 int y1 = params.broadcast_shape[1];
345 int y2 = params.broadcast_shape[2];
346 int y3 = params.broadcast_shape[3];
347 int y4 = params.broadcast_shape[4];
348 if (y4 > 1) {
349 // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
350 // dimension.
351 for (int i0 = 0; i0 < y0; ++i0) {
352 const uint8_t* input2_data_ptr;
353 for (int i1 = 0; i1 < y1; ++i1) {
354 input2_data_ptr = input2_data_reset;
355 for (int i2 = 0; i2 < y2; ++i2) {
356 for (int i3 = 0; i3 < y3; ++i3) {
357 AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
358 output_data_ptr);
359 input2_data_ptr += y4;
360 output_data_ptr += y4;
361 }
362 // We have broadcast y4 of input1 data y3 times, and now move on.
363 input1_data_ptr += y4;
364 }
365 }
366 // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
367 input2_data_reset = input2_data_ptr;
368 }
369 } else {
370 // Special case of y4 == 1, in which the innermost loop is a single element
371 // and can be combined with the next (y3) as an inner broadcast.
372 //
373 // Note that this handles the case of pure scalar broadcast when
374 // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
375 // broadcast with batch (as y2 > 1).
376 //
377 // NOTE The process is the same as the above general case except simplified
378 // for y4 == 1 and the loop over y3 is contained within the
379 // AddScalarBroadcast function.
380 for (int i0 = 0; i0 < y0; ++i0) {
381 const uint8_t* input2_data_ptr;
382 for (int i1 = 0; i1 < y1; ++i1) {
383 input2_data_ptr = input2_data_reset;
384 for (int i2 = 0; i2 < y2; ++i2) {
385 AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
386 output_data_ptr);
387 input2_data_ptr += y3;
388 output_data_ptr += y3;
389 input1_data_ptr += 1;
390 }
391 }
392 input2_data_reset = input2_data_ptr;
393 }
394 }
395 }
396
397 } // namespace reference_ops
398 } // namespace tflite
399
400 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
401