/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <type_traits>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/max.h"
#include "tensorflow/lite/kernels/internal/min.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"

// Check if the reduction at index is the first one along the dimensions given
// in axis.
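// For example, with axis = {0, 2}, index {0, 5, 0, 7} is a first reduction
// (every reduced coordinate is zero), while index {0, 5, 1, 7} is not.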
inline bool IsFirstReduction(const int* index, const int num_axis,
                             const int* axis) {
  if (num_axis == 0) {
    return true;
  }

  TFLITE_DCHECK(index != nullptr);
  TFLITE_DCHECK(axis != nullptr);
  for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
    if (index[axis[axis_idx]] != 0) {
      return false;
    }
  }

  return true;
}

namespace tflite {

namespace reference_ops {

// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
// This method iterates through the input data and reduces elements along the
// dimensions given in axis.
template <typename In, typename Out>
inline bool Reduce(const In* input_data, const int* input_dims,
                   const int* output_dims, const int input_num_dims,
                   const int output_num_dims, const int* axis,
                   const int num_axis, int* input_iter,
                   Out reducer(Out current, const In in), Out* output_data) {
  // Reset input iterator.
  for (int idx = 0; idx < input_num_dims; ++idx) {
    input_iter[idx] = 0;
  }
  // Iterate through input_data.
  do {
    size_t input_offset =
        ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
    size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
                                               input_iter, num_axis, axis);
    output_data[output_offset] =
        reducer(output_data[output_offset], input_data[input_offset]);
  } while (NextIndex(input_num_dims, input_dims, input_iter));
  return true;
}
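
// Example (illustrative sketch, not part of the API): summing a 2x3 input
// over axis 1. Reduce() accumulates into output_data, so the caller must
// pre-initialize the output (here to zero); all names below are local to the
// example.
//
//   const int input_dims[] = {2, 3};
//   const int output_dims[] = {2};
//   const float input[] = {1, 2, 3, 4, 5, 6};
//   float output[] = {0.f, 0.f};
//   const int axis[] = {1};
//   int iter[2];
//   Reduce<float, float>(
//       input, input_dims, output_dims, /*input_num_dims=*/2,
//       /*output_num_dims=*/1, axis, /*num_axis=*/1, iter,
//       [](float current, const float in) -> float { return current + in; },
//       output);
//   // output is now {6.f, 15.f}.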

// Similar to the Reduce function above, but takes two reducer functions:
// 'reducer_first' is called with the first value of the reduction;
// 'reducer_next' is then called for all the others.
template <typename In, typename Out>
inline bool Reduce(const In* input_data, const int* input_dims,
                   const int* output_dims, const int input_num_dims,
                   const int output_num_dims, const int* axis,
                   const int num_axis, int* input_iter,
                   const std::function<Out(In in)>& reducer_first,
                   const std::function<Out(Out current, In in)>& reducer_next,
                   Out* output_data) {
  // Reset input iterator.
  for (int idx = 0; idx < input_num_dims; ++idx) {
    input_iter[idx] = 0;
  }
  // Iterate through input_data.
  do {
    size_t input_offset =
        ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
    size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
                                               input_iter, num_axis, axis);
    if (IsFirstReduction(input_iter, num_axis, axis)) {
      output_data[output_offset] = reducer_first(input_data[input_offset]);
    } else {
      output_data[output_offset] =
          reducer_next(output_data[output_offset], input_data[input_offset]);
    }
  } while (NextIndex(input_num_dims, input_dims, input_iter));
  return true;
}
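
// Example (illustrative sketch): a plain integer product, where the first
// element seeds the accumulator and later elements are multiplied in:
//
//   const std::function<int32_t(int8_t)> reducer_first =
//       [](int8_t in) -> int32_t { return in; };
//   const std::function<int32_t(int32_t, int8_t)> reducer_next =
//       [](int32_t current, int8_t in) -> int32_t { return current * in; };
//
// QuantizedReduceProd() below uses this hook pair to subtract the zero point
// from the first element and to rescale every later multiplication.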

// This method parses the input 'axis' to remove duplicates and handle
// negative values, and returns a valid 'out_axis'.
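// For example, with num_dims = 4 and axis = {-1, 3}, both entries resolve to
// dimension 3, so out_axis becomes {3} and *out_num_axis is 1.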
inline bool ResolveAxis(const int num_dims, const int* axis,
                        const int64_t num_axis, int* out_axis,
                        int* out_num_axis) {
  *out_num_axis = 0;  // Just in case.
  // Short-circuit axis resolution for scalars; the axis will go unused.
  if (num_dims == 0) {
    return true;
  }
  // O(n^2) is fine since out_num_axis should be really small, mostly <= 4.
  for (int64_t idx = 0; idx < num_axis; ++idx) {
    // Handle negative index. A positive index 'p_idx' can be represented as a
    // negative index 'n_idx' as: n_idx = p_idx - num_dims.
    // E.g., for num_dims = 3, [0, 1, 2] is the same as [-3, -2, -1].
    int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
    TFLITE_DCHECK(current >= 0 && current < num_dims);
    if (current < 0 || current >= num_dims) {
      return false;
    }
    bool is_dup = false;
    for (int j = 0; j < *out_num_axis; ++j) {
      if (out_axis[j] == current) {
        is_dup = true;
        break;
      }
    }
    if (!is_dup) {
      out_axis[*out_num_axis] = current;
      *out_num_axis += 1;
    }
  }
  return true;
}

// This method expects that output_data has been initialized.
template <typename In, typename Out>
inline bool ReduceSumImpl(const In* input_data, const int* input_dims,
                          const int* output_dims, const int input_num_dims,
                          const int output_num_dims, const int* axis,
                          const int num_axis, int* input_iter,
                          Out* output_data) {
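  // Widen each element to the accumulator type Out before adding, so that
  // narrow input types (e.g. uint8_t summed into an int32_t Out) do not
  // overflow during accumulation.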
  auto reducer = [](const Out current, const In in) -> Out {
    const Out actual_in = static_cast<Out>(in);
    return current + actual_in;
  };
  return Reduce<In, Out>(input_data, input_dims, output_dims, input_num_dims,
                         output_num_dims, axis, num_axis, input_iter, reducer,
                         output_data);
}

template <typename T>
inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
                                    const T init_value, T* data) {
  size_t num_elements = 1;
  for (int idx = 0; idx < num_dims; ++idx) {
    size_t current = static_cast<size_t>(dims[idx]);
    // Overflow prevention.
    if (current > 0 &&
        num_elements > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_elements *= current;
  }
  for (size_t idx = 0; idx < num_elements; ++idx) {
    data[idx] = init_value;
  }
  return true;
}

// Computes a generic reduction (e.g. sum/max/min/prod) of elements across the
// dimensions given in axis. The caller supplies the init_value and reducer.
template <typename T>
inline bool ReduceGeneric(const T* input_data, const int* input_dims,
                          const int input_num_dims, T* output_data,
                          const int* output_dims, const int output_num_dims,
                          const int* axis, const int64_t num_axis_dimensions,
                          bool keep_dims, int* temp_index, int* resolved_axis,
                          T init_value,
                          T reducer(const T current, const T in)) {
  // Reset output data.
  if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
                               output_data)) {
    return false;
  }

  // Return early when the input shape has a zero dimension. This is done
  // after initializing the output tensor, because there are cases where the
  // input tensor is empty but the output tensor is not; the output should
  // then be filled with init_value.
  for (int i = 0; i < input_num_dims; ++i) {
    if (input_dims[i] == 0) return true;
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }

  return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
                      output_num_dims, resolved_axis, num_resolved_axis,
                      temp_index, reducer, output_data);
}
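
// Example (illustrative sketch): a reduce-max over axis 1 of a 2x3 input,
// seeded with the lowest float so that any element replaces it. The buffer
// names are local to the example:
//
//   const int input_dims[] = {2, 3};
//   const int output_dims[] = {2};
//   const float input[] = {1, 5, 3, 4, 2, 6};
//   float output[2];
//   const int axis[] = {1};
//   int temp_index[2];
//   int resolved_axis[1];
//   auto max_fn = [](const float current, const float in) -> float {
//     return in > current ? in : current;
//   };
//   ReduceGeneric<float>(input, input_dims, /*input_num_dims=*/2, output,
//                        output_dims, /*output_num_dims=*/1, axis,
//                        /*num_axis_dimensions=*/1, /*keep_dims=*/false,
//                        temp_index, resolved_axis,
//                        std::numeric_limits<float>::lowest(), max_fn);
//   // output is now {5.f, 6.f}.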

// Computes the mean of elements across dimensions given in axis.
// It does so in two stages: first it computes the sum of elements along the
// axis, then it divides that sum by the number of elements in the axis.
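// The accumulator type U must be wide enough to hold the running sum without
// overflow; e.g., callers typically pair a uint8_t input with an int32_t or
// float temp_sum buffer.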
template <typename T, typename U>
inline bool Mean(const T* input_data, const int* input_dims,
                 const int input_num_dims, T* output_data,
                 const int* output_dims, const int output_num_dims,
                 const int* axis, const int num_axis_dimensions, bool keep_dims,
                 int* temp_index, int* resolved_axis, U* temp_sum) {
  ruy::profiler::ScopeLabel label("Mean");
  // Reset output data.
  size_t num_outputs = 1;
  for (int idx = 0; idx < output_num_dims; ++idx) {
    size_t current = static_cast<size_t>(output_dims[idx]);
    // Overflow prevention; also guards the division against a zero dim.
    if (current > 0 &&
        num_outputs > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_outputs *= current;
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = T();
    temp_sum[idx] = U();
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }

  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
                           output_num_dims, resolved_axis, num_resolved_axis,
                           temp_index, temp_sum)) {
    return false;
  }

  // Calculate the mean by dividing output_data by the number of aggregated
  // elements.
  size_t num_elements_in_axis = 1;
  for (int idx = 0; idx < num_resolved_axis; ++idx) {
    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
    // Overflow prevention; also guards the division against a zero dim.
    if (current > 0 &&
        num_elements_in_axis > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_elements_in_axis *= current;
  }

  if (num_elements_in_axis > 0) {
    for (size_t idx = 0; idx < num_outputs; ++idx) {
      output_data[idx] =
          static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
    }
  }
  return true;
}

template <typename T>
inline void Mean(const tflite::MeanParams& op_params,
                 const RuntimeShape& unextended_input_shape,
                 const T* input_data,
                 const RuntimeShape& unextended_output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("Mean4D");

  // The current implementation only supports 4-D inputs with simultaneous
  // reduction over width and height.
  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int output_batch = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);

  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      float value = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
        }
      }
      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
          value / (input_width * input_height);
    }
  }
}

inline void Mean(const tflite::MeanParams& op_params,
                 const RuntimeShape& unextended_input_shape,
                 const uint8_t* input_data, int32_t input_zero_point,
                 float input_scale, const RuntimeShape& unextended_output_shape,
                 uint8_t* output_data, int32_t output_zero_point,
                 float output_scale) {
  ruy::profiler::ScopeLabel label("Mean4D/Uint8");

  // The current implementation only supports 4-D inputs with simultaneous
  // reduction over width and height.
  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);
  const int output_batch = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const float num_elements_in_axis = input_width * input_height;

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

  constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
  constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();

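  // How the requantization below is derived: with N = input_width *
  // input_height, the real mean is input_scale * (acc - N * input_zero_point)
  // / N, and the quantized output is mean / output_scale + output_zero_point.
  // This factors into acc * real_scale plus a constant bias, where
  // real_scale = input_scale / (N * output_scale) and
  // bias = output_zero_point -
  //        round(input_zero_point * input_scale / output_scale).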
  float temp = input_zero_point * input_scale / output_scale;
  temp = temp > 0 ? temp + 0.5f : temp - 0.5f;
  int32_t bias = output_zero_point - static_cast<int32_t>(temp);
  double real_scale =
      static_cast<double>(input_scale / (num_elements_in_axis * output_scale));

  int32_t multiplier;
  int shift;
  QuantizeMultiplier(real_scale, &multiplier, &shift);
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      int32_t acc = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
        }
      }
      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
      acc += bias;
      acc = std::min(std::max(acc, kMinValue), kMaxValue);
      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
          static_cast<uint8_t>(acc);
    }
  }
}

// Computes the mean or sum of elements across dimensions given in axis, for
// quantized values. It does so in two stages: first it computes the sum of
// elements along the axis, then (for the mean) it divides that sum by the
// number of elements in the axis.
template <typename T, typename U>
inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
                               float input_scale, const int* input_dims,
                               const int input_num_dims, T* output_data,
                               int32_t output_zero_point, float output_scale,
                               const int* output_dims,
                               const int output_num_dims, const int* axis,
                               const int num_axis_dimensions, bool keep_dims,
                               int* temp_index, int* resolved_axis, U* temp_sum,
                               bool compute_sum) {
  // Pick the profiling label up front: a ScopeLabel constructed inside an if
  // branch would be destroyed as soon as the branch ends and would not cover
  // the actual computation.
  const char* label_text =
      std::is_same<T, uint8_t>::value
          ? (compute_sum ? "Sum/Uint8" : "Mean/Uint8")
          : (std::is_same<T, int16_t>::value
                 ? (compute_sum ? "Sum/Int16" : "Mean/Int16")
                 : (compute_sum ? "Sum/Int8" : "Mean/Int8"));
  ruy::profiler::ScopeLabel label(label_text);
  // Reset output data.
  size_t num_outputs = 1;
  for (int idx = 0; idx < output_num_dims; ++idx) {
    size_t current = static_cast<size_t>(output_dims[idx]);
    // Overflow prevention; also guards the division against a zero dim.
    if (current > 0 &&
        num_outputs > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_outputs *= current;
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = T();
    temp_sum[idx] = U();
  }

  // Return early when the input shape has a zero dimension. This is done
  // after initializing the output tensor, because there are cases where the
  // input tensor is empty but the output tensor is not; the output should
  // then be left zero-initialized.
  for (int i = 0; i < input_num_dims; ++i) {
    if (input_dims[i] == 0) return true;
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }

  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
                           output_num_dims, resolved_axis, num_resolved_axis,
                           temp_index, temp_sum)) {
    return false;
  }

  // Calculate the mean by dividing output_data by the number of aggregated
  // elements.
  size_t num_elements_in_axis = 1;
  for (int idx = 0; idx < num_resolved_axis; ++idx) {
    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
    // Overflow prevention; also guards the division against a zero dim.
    if (current > 0 &&
        num_elements_in_axis > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_elements_in_axis *= current;
  }

  if (num_elements_in_axis > 0) {
    const float scale = input_scale / output_scale;
    if (compute_sum) {
      // TODO(b/116341117): Eliminate float and do this completely in 8bit.
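      // The dequantized sum is (temp_sum - N * input_zero_point) * input_scale
      // with N = num_elements_in_axis; requantizing by output_scale and
      // output_zero_point gives temp_sum * scale + bias, with the bias below.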
      const float bias = -input_zero_point * scale * num_elements_in_axis;
      for (size_t idx = 0; idx < num_outputs; ++idx) {
        const U value =
            static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) +
            output_zero_point;
        output_data[idx] = static_cast<T>(value);
      }
    } else {
      const float bias = -input_zero_point * scale;
      for (size_t idx = 0; idx < num_outputs; ++idx) {
        float float_mean = static_cast<float>(temp_sum[idx]) /
                           static_cast<float>(num_elements_in_axis);
        float result = TfLiteMin(
            TfLiteRound(float_mean * scale + bias) + output_zero_point,
            static_cast<float>(std::numeric_limits<T>::max()));
        result = TfLiteMax(result,
                           static_cast<float>(std::numeric_limits<T>::min()));
        output_data[idx] = static_cast<T>(result);
      }
    }
  }
  return true;
}

template <typename T>
inline bool QuantizedReduceProd(const T* input_data, int32_t input_zero_point,
                                const RuntimeShape& input_shape, T* output_data,
                                int32_t output_zero_point,
                                const RuntimeShape& output_shape,
                                const int* axis,
                                const int64_t num_axis_dimensions,
                                bool keep_dims, int* temp_index,
                                int* resolved_axis, int32_t* temp_prod,
                                int32_t scaling_multiplier, int scaling_shift) {
  const int32_t kMinValue = std::numeric_limits<T>::min();
  const int32_t kMaxValue = std::numeric_limits<T>::max();

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_shape.DimensionsCount(), axis, num_axis_dimensions,
                   resolved_axis, &num_resolved_axis)) {
    return false;
  }

  // Calculate the reduced product by rescaling each multiplication step to
  // avoid an overflow.
  auto reducer_first = [&](T in) -> int32_t { return in - input_zero_point; };

  auto reducer_next = [&](int32_t current, T in) -> int32_t {
    const int64_t result =
        static_cast<int64_t>(current) * (in - input_zero_point);
    return MultiplyByQuantizedMultiplier(result, scaling_multiplier,
                                         scaling_shift);
  };

  if (!Reduce<T, int32_t>(
          input_data, input_shape.DimsData(), output_shape.DimsData(),
          input_shape.DimensionsCount(), output_shape.DimensionsCount(),
          resolved_axis, num_resolved_axis, temp_index, reducer_first,
          reducer_next, temp_prod)) {
    return false;
  }

  for (int i = 0; i < output_shape.FlatSize(); i++) {
    int32_t result =
        MultiplyByQuantizedMultiplier(static_cast<int64_t>(temp_prod[i]),
                                      scaling_multiplier, scaling_shift) +
        output_zero_point;
    result = std::min(std::max(result, kMinValue), kMaxValue);
    output_data[i] = static_cast<T>(result);
  }

  return true;
}

}  // namespace reference_ops

}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_