/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_

#include <algorithm>
#include <cstdint>
#include <functional>
#include <limits>
#include <type_traits>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/max.h"
#include "tensorflow/lite/kernels/internal/min.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"
// Check if the reduction at index is the first one along the dimensions given
// in axis.
inline bool IsFirstReduction(const int* index, const int num_axis,
                             const int* axis) {
  if (num_axis == 0) {
    return true;
  }

  TFLITE_DCHECK(index != nullptr);
  TFLITE_DCHECK(axis != nullptr);
  for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
    if (index[axis[axis_idx]] != 0) {
      return false;
    }
  }

  return true;
}
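
// Illustrative sketch (assumed values, not part of the original file): for
// axis = {0, 2}, the reduction is the "first" one only while every reduced
// coordinate of the index is still zero.
//
//   const int axis[] = {0, 2};
//   const int index_a[] = {0, 3, 0};
//   const int index_b[] = {0, 3, 1};
//   IsFirstReduction(index_a, 2, axis);  // true:  index[0] == index[2] == 0
//   IsFirstReduction(index_b, 2, axis);  // false: index[2] != 0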

namespace tflite {

namespace reference_ops {

// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
// This method iterates through input data and reduces elements along the
// dimensions given in axis.
template <typename In, typename Out>
inline bool Reduce(const In* input_data, const int* input_dims,
                   const int* output_dims, const int input_num_dims,
                   const int output_num_dims, const int* axis,
                   const int num_axis, int* input_iter,
                   Out reducer(Out current, const In in), Out* output_data) {
  // Reset input iterator.
  for (int idx = 0; idx < input_num_dims; ++idx) {
    input_iter[idx] = 0;
  }
  // Iterate through input_data.
  do {
    size_t input_offset =
        ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
    size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
                                               input_iter, num_axis, axis);
    output_data[output_offset] =
        reducer(output_data[output_offset], input_data[input_offset]);
  } while (NextIndex(input_num_dims, input_dims, input_iter));
  return true;
}
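
// Illustrative sketch (assumed names, not part of the original file): a
// captureless lambda converts to the function-pointer reducer, e.g. to sum a
// 2x3 input over axis 1 into a length-2 output that the caller has already
// zero-initialized.
//
//   const int input_dims[] = {2, 3};
//   const int output_dims[] = {2};
//   const int axis[] = {1};
//   int input_iter[2];
//   auto sum = [](int current, const int in) -> int { return current + in; };
//   Reduce<int, int>(input_data, input_dims, output_dims,
//                    /*input_num_dims=*/2, /*output_num_dims=*/1, axis,
//                    /*num_axis=*/1, input_iter, sum, output_data);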

// Similar to the above Reduce function, but takes two reducer functions:
// 'reducer_first' is called with the first value of the reduction, and
// 'reducer_next' is then called for all the others.
template <typename In, typename Out>
inline bool Reduce(const In* input_data, const int* input_dims,
                   const int* output_dims, const int input_num_dims,
                   const int output_num_dims, const int* axis,
                   const int num_axis, int* input_iter,
                   const std::function<Out(In in)>& reducer_first,
                   const std::function<Out(Out current, In in)>& reducer_next,
                   Out* output_data) {
  // Reset input iterator.
  for (int idx = 0; idx < input_num_dims; ++idx) {
    input_iter[idx] = 0;
  }
  // Iterate through input_data.
  do {
    size_t input_offset =
        ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
    size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
                                               input_iter, num_axis, axis);
    if (IsFirstReduction(input_iter, num_axis, axis)) {
      output_data[output_offset] = reducer_first(input_data[input_offset]);
    } else {
      output_data[output_offset] =
          reducer_next(output_data[output_offset], input_data[input_offset]);
    }
  } while (NextIndex(input_num_dims, input_dims, input_iter));
  return true;
}
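
// Illustrative note (assumed values, not part of the original file): with
// axis = {1} over a row {a, b, c}, the output is computed as
// reducer_next(reducer_next(reducer_first(a), b), c), so this overload does
// not require the caller to pre-initialize output_data.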

// This method parses the input 'axis' to remove duplicates and handle
// negative values, and returns a valid 'out_axis'.
inline bool ResolveAxis(const int num_dims, const int* axis,
                        const int64_t num_axis, int* out_axis,
                        int* out_num_axis) {
  *out_num_axis = 0;  // Just in case.
  // Short-circuit axis resolution for scalars; the axis will go unused.
  if (num_dims == 0) {
    return true;
  }
  // O(n^2) is fine since out_num_axis should be really small, mostly <= 4.
  for (int64_t idx = 0; idx < num_axis; ++idx) {
    // Handle negative index. A positive index 'p_idx' can be represented as a
    // negative index 'n_idx' as: n_idx = p_idx - num_dims.
    // E.g., for num_dims = 3, [0, 1, 2] is the same as [-3, -2, -1].
    int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
    TFLITE_DCHECK(current >= 0 && current < num_dims);
    if (current < 0 || current >= num_dims) {
      return false;
    }
    bool is_dup = false;
    for (int j = 0; j < *out_num_axis; ++j) {
      if (out_axis[j] == current) {
        is_dup = true;
        break;
      }
    }
    if (!is_dup) {
      out_axis[*out_num_axis] = current;
      *out_num_axis += 1;
    }
  }
  return true;
}
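
// Illustrative sketch (assumed values, not part of the original file):
// negative axes wrap around and duplicates collapse, so {-1, 2} on a rank-3
// tensor resolves to the single axis 2.
//
//   const int axis[] = {-1, 2};
//   int out_axis[2];
//   int out_num_axis = 0;
//   ResolveAxis(/*num_dims=*/3, axis, /*num_axis=*/2, out_axis,
//               &out_num_axis);
//   // out_num_axis == 1, out_axis[0] == 2.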

// This method expects that output_data has been initialized.
template <typename In, typename Out>
inline bool ReduceSumImpl(const In* input_data, const int* input_dims,
                          const int* output_dims, const int input_num_dims,
                          const int output_num_dims, const int* axis,
                          const int num_axis, int* input_iter,
                          Out* output_data) {
  auto reducer = [](const Out current, const In in) -> Out {
    const Out actual_in = static_cast<Out>(in);
    return current + actual_in;
  };
  return Reduce<In, Out>(input_data, input_dims, output_dims, input_num_dims,
                         output_num_dims, axis, num_axis, input_iter, reducer,
                         output_data);
}

template <typename T>
inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
                                    const T init_value, T* data) {
  size_t num_elements = 1;
  for (int idx = 0; idx < num_dims; ++idx) {
    size_t current = static_cast<size_t>(dims[idx]);
    // Overflow prevention.
    if (current > 0 &&
        num_elements > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_elements *= current;
  }
  for (size_t idx = 0; idx < num_elements; ++idx) {
    data[idx] = init_value;
  }
  return true;
}
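
// Illustrative sketch (assumed values, not part of the original file): the
// guard rejects element counts that would overflow size_t, while a zero-sized
// dimension simply yields zero elements to initialize.
//
//   const int dims[] = {2, 3};
//   float data[6];
//   InitTensorDataForReduce(dims, /*num_dims=*/2, 0.0f, data);  // all zeros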

// Computes the generic value (i.e., sum/max/min/prod) of elements across
// dimensions given in axis. The caller needs to pass in init_value and
// reducer.
template <typename T>
inline bool ReduceGeneric(const T* input_data, const int* input_dims,
                          const int input_num_dims, T* output_data,
                          const int* output_dims, const int output_num_dims,
                          const int* axis, const int64_t num_axis_dimensions,
                          bool keep_dims, int* temp_index, int* resolved_axis,
                          T init_value,
                          T reducer(const T current, const T in)) {
  // Reset output data.
  if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
                               output_data)) {
    return false;
  }

  // Return early when the input shape has a zero dim. This is done after
  // initializing data for the output tensor because there are cases where the
  // input tensor is empty but the output tensor is not. In that case, the
  // output tensor should be filled with init_value.
  for (int i = 0; i < input_num_dims; ++i) {
    if (input_dims[i] == 0) return true;
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }

  return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
                      output_num_dims, resolved_axis, num_resolved_axis,
                      temp_index, reducer, output_data);
}
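
// Illustrative sketch (assumed names, not part of the original file): a
// reduce-max over axis 1 of a 2x3 float input, seeded with the lowest
// representable float so that every element can win the comparison.
//
//   const int input_dims[] = {2, 3};
//   const int output_dims[] = {2};
//   const int axis[] = {1};
//   int temp_index[2], resolved_axis[1];
//   auto max_fn = [](const float current, const float in) -> float {
//     return in > current ? in : current;
//   };
//   ReduceGeneric<float>(input_data, input_dims, /*input_num_dims=*/2,
//                        output_data, output_dims, /*output_num_dims=*/1,
//                        axis, /*num_axis_dimensions=*/1, /*keep_dims=*/false,
//                        temp_index, resolved_axis,
//                        std::numeric_limits<float>::lowest(), max_fn);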

// Computes the mean of elements across dimensions given in axis.
// It does so in two stages: first it calculates the sum of elements along
// the axis, then it divides the sum by the number of elements in the axis.
template <typename T, typename U>
inline bool Mean(const T* input_data, const int* input_dims,
                 const int input_num_dims, T* output_data,
                 const int* output_dims, const int output_num_dims,
                 const int* axis, const int num_axis_dimensions, bool keep_dims,
                 int* temp_index, int* resolved_axis, U* temp_sum) {
  ruy::profiler::ScopeLabel label("Mean");
  // Reset output data.
  size_t num_outputs = 1;
  for (int idx = 0; idx < output_num_dims; ++idx) {
    size_t current = static_cast<size_t>(output_dims[idx]);
    // Overflow prevention. The current > 0 check also avoids a division by
    // zero for zero-sized dimensions.
    if (current > 0 &&
        num_outputs > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_outputs *= current;
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = T();
    temp_sum[idx] = U();
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }

  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
                           output_num_dims, resolved_axis, num_resolved_axis,
                           temp_index, temp_sum)) {
    return false;
  }

  // Calculate the mean by dividing output_data by the number of aggregated
  // elements.
  size_t num_elements_in_axis = 1;
  for (int idx = 0; idx < num_resolved_axis; ++idx) {
    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
    // Overflow prevention.
    if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
      return false;
    }
    num_elements_in_axis *= current;
  }

  if (num_elements_in_axis > 0) {
    for (size_t idx = 0; idx < num_outputs; ++idx) {
      output_data[idx] =
          static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
    }
  }
  return true;
}
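
// Illustrative note (assumed values, not part of the original file): with an
// integral accumulator the final division truncates, e.g. for T = uint8_t,
// U = int32_t and inputs {1, 2} reduced over one axis, temp_sum is 3 and the
// mean 3 / 2 is stored as 1.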

template <typename T>
inline void Mean(const tflite::MeanParams& op_params,
                 const RuntimeShape& unextended_input_shape,
                 const T* input_data,
                 const RuntimeShape& unextended_output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("Mean4D");

  // The current implementation only supports 4 dimensions with simultaneous
  // reduction over width and height.
  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int output_batch = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);

  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      float value = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
        }
      }
      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
          value / (input_width * input_height);
    }
  }
}
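
// Illustrative sketch (assumed values, not part of the original file):
// averaging over H and W of an NHWC tensor, e.g. 1x4x4x3 -> 1x1x1x3.
//
//   tflite::MeanParams params;
//   params.axis_count = 2;
//   params.axis[0] = 1;  // height
//   params.axis[1] = 2;  // width
//   Mean(params, RuntimeShape({1, 4, 4, 3}), input_data,
//        RuntimeShape({1, 1, 1, 3}), output_data);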

inline void Mean(const tflite::MeanParams& op_params,
                 const RuntimeShape& unextended_input_shape,
                 const uint8_t* input_data, int32_t input_zero_point,
                 float input_scale, const RuntimeShape& unextended_output_shape,
                 uint8_t* output_data, int32_t output_zero_point,
                 float output_scale) {
  ruy::profiler::ScopeLabel label("Mean4D/Uint8");

  // The current implementation only supports 4 dimensions with simultaneous
  // reduction over width and height.
  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);
  const int output_batch = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const float num_elements_in_axis = input_width * input_height;

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

  constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
  constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();

  float temp = input_zero_point * input_scale / output_scale;
  temp = temp > 0 ? temp + 0.5f : temp - 0.5f;
  int32_t bias = output_zero_point - static_cast<int32_t>(temp);
  double real_scale =
      static_cast<double>(input_scale / (num_elements_in_axis * output_scale));

  int32_t multiplier;
  int shift;
  QuantizeMultiplier(real_scale, &multiplier, &shift);
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      int32_t acc = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
        }
      }
      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
      acc += bias;
      acc = std::min(std::max(acc, kMinValue), kMaxValue);
      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
          static_cast<uint8_t>(acc);
    }
  }
}
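
// Illustrative arithmetic (assumed values, not part of the original file):
// with input_scale = 0.5, output_scale = 1.0, input_zero_point = 128,
// output_zero_point = 0 and a 2x2 reduction window (4 elements):
//   bias       = 0 - round(128 * 0.5 / 1.0) = -64
//   real_scale = 0.5 / (4 * 1.0)            = 0.125
// For four inputs all equal to 136, acc = 544; scaling by 0.125 gives 68, and
// adding the bias yields 4, which matches the real mean (136 - 128) * 0.5 = 4
// requantized with scale 1.0 and zero point 0.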

// Computes the mean or sum of elements across dimensions given in axis.
// The mean is computed in two stages: first the sum of elements along the
// axis is calculated, then the sum is divided by the number of elements in
// the axis, with the arithmetic done on quantized values.
template <typename T, typename U>
inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
                               float input_scale, const int* input_dims,
                               const int input_num_dims, T* output_data,
                               int32_t output_zero_point, float output_scale,
                               const int* output_dims,
                               const int output_num_dims, const int* axis,
                               const int num_axis_dimensions, bool keep_dims,
                               int* temp_index, int* resolved_axis, U* temp_sum,
                               bool compute_sum) {
  const bool uint8_case = std::is_same<T, uint8_t>::value;
  const bool int16_case = std::is_same<T, int16_t>::value;
  if (uint8_case) {
    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Uint8" : "Mean/Uint8");
  } else if (int16_case) {
    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int16" : "Mean/Int16");
  } else {
    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int8" : "Mean/Int8");
  }
  // Reset output data.
  size_t num_outputs = 1;
  for (int idx = 0; idx < output_num_dims; ++idx) {
    size_t current = static_cast<size_t>(output_dims[idx]);
    // Overflow prevention. The current > 0 check also avoids a division by
    // zero for zero-sized dimensions.
    if (current > 0 &&
        num_outputs > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_outputs *= current;
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = T();
    temp_sum[idx] = U();
  }

  // Return early when the input shape has a zero dim. This is done after
  // initializing data for the output tensor because there are cases where the
  // input tensor is empty but the output tensor is not. In that case, the
  // output tensor should be filled with init_value.
  for (int i = 0; i < input_num_dims; ++i) {
    if (input_dims[i] == 0) return true;
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }

  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
                           output_num_dims, resolved_axis, num_resolved_axis,
                           temp_index, temp_sum)) {
    return false;
  }

  // Calculate the mean by dividing output_data by the number of aggregated
  // elements.
  size_t num_elements_in_axis = 1;
  for (int idx = 0; idx < num_resolved_axis; ++idx) {
    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
    // Overflow prevention.
    if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
      return false;
    }
    num_elements_in_axis *= current;
  }

  if (num_elements_in_axis > 0) {
    const float scale = input_scale / output_scale;
    if (compute_sum) {
      // TODO(b/116341117): Eliminate float and do this completely in 8bit.
      const float bias = -input_zero_point * scale * num_elements_in_axis;
      for (size_t idx = 0; idx < num_outputs; ++idx) {
        const U value =
            static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) +
            output_zero_point;
        output_data[idx] = static_cast<T>(value);
      }
    } else {
      const float bias = -input_zero_point * scale;
      for (size_t idx = 0; idx < num_outputs; ++idx) {
        float float_mean = static_cast<float>(temp_sum[idx]) /
                           static_cast<float>(num_elements_in_axis);
        float result = TfLiteMin(
            TfLiteRound(float_mean * scale + bias) + output_zero_point,
            static_cast<float>(std::numeric_limits<T>::max()));
        result = TfLiteMax(result,
                           static_cast<float>(std::numeric_limits<T>::min()));
        output_data[idx] = static_cast<T>(result);
      }
    }
  }
  return true;
}
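
// Illustrative arithmetic (assumed values, not part of the original file):
// for the mean path with T = int8_t, input_scale = output_scale = 0.5 and
// input_zero_point = output_zero_point = 10, scale = 1 and bias = -10. A
// temp_sum of 30 over 2 elements gives float_mean = 15 and an output of
// round(15 * 1 - 10) + 10 = 15, i.e. a real-valued mean of 2.5 requantized
// back to the same parameters.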

template <typename T>
inline bool QuantizedReduceProd(const T* input_data, int32_t input_zero_point,
                                const RuntimeShape& input_shape, T* output_data,
                                int32_t output_zero_point,
                                const RuntimeShape& output_shape,
                                const int* axis,
                                const int64_t num_axis_dimensions,
                                bool keep_dims, int* temp_index,
                                int* resolved_axis, int32_t* temp_prod,
                                int32_t scaling_multiplier, int scaling_shift) {
  const int32_t kMinValue = std::numeric_limits<T>::min();
  const int32_t kMaxValue = std::numeric_limits<T>::max();

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_shape.DimensionsCount(), axis, num_axis_dimensions,
                   resolved_axis, &num_resolved_axis)) {
    return false;
  }

  // Calculate the reduced product by rescaling each multiplication step to
  // avoid an overflow.
  auto reducer_first = [&](T in) -> int32_t { return in - input_zero_point; };

  auto reducer_next = [&](int32_t current, T in) -> int32_t {
    const int64_t result =
        static_cast<int64_t>(current) * (in - input_zero_point);
    return MultiplyByQuantizedMultiplier(result, scaling_multiplier,
                                         scaling_shift);
  };

  if (!Reduce<T, int32_t>(
          input_data, input_shape.DimsData(), output_shape.DimsData(),
          input_shape.DimensionsCount(), output_shape.DimensionsCount(),
          resolved_axis, num_resolved_axis, temp_index, reducer_first,
          reducer_next, temp_prod)) {
    return false;
  }

  for (int i = 0; i < output_shape.FlatSize(); i++) {
    int32_t result =
        MultiplyByQuantizedMultiplier(static_cast<int64_t>(temp_prod[i]),
                                      scaling_multiplier, scaling_shift) +
        output_zero_point;
    result = std::min(std::max(result, kMinValue), kMaxValue);
    output_data[i] = static_cast<T>(result);
  }

  return true;
}
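
// Illustrative dataflow (assumed values, not part of the original file): for
// zero-point-corrected values a, b, c along one reduced axis, temp_prod holds
// rescale(rescale(a * b) * c), where rescale applies scaling_multiplier and
// scaling_shift via MultiplyByQuantizedMultiplier. The final loop applies one
// more rescale step, adds output_zero_point, and clamps to the range of T.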

}  // namespace reference_ops

}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_