/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_

#include <algorithm>
#include <cstdint>
#include <limits>
#include <type_traits>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/max.h"
#include "tensorflow/lite/kernels/internal/min.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {
// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
// This method iterates through the input data and reduces elements along the
// dimensions given in axis.
template <typename In, typename Out>
inline bool Reduce(const In* input_data, const int* input_dims,
                   const int* output_dims, const int input_num_dims,
                   const int output_num_dims, const int* axis,
                   const int num_axis, int* input_iter,
                   Out reducer(const Out current, const In in),
                   Out* output_data) {
  // Reset input iterator.
  for (int idx = 0; idx < input_num_dims; ++idx) {
    input_iter[idx] = 0;
  }
  // Iterate through input_data.
  do {
    size_t input_offset =
        ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
    size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
                                               input_iter, num_axis, axis);
    output_data[output_offset] =
        reducer(output_data[output_offset], input_data[input_offset]);
  } while (NextIndex(input_num_dims, input_dims, input_iter));
  return true;
}

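// Example (an illustrative sketch, not part of this header): a reduce_sum
// over axis 1 of a 2x3 float tensor. The buffer names are hypothetical
// caller-provided storage; 'output' must be pre-initialized because Reduce
// only accumulates into it.
//
//   const float input[] = {1, 2, 3, 4, 5, 6};  // Shape [2, 3].
//   const int input_dims[] = {2, 3};
//   const int output_dims[] = {2};
//   const int axis[] = {1};
//   int input_iter[2];
//   float output[2] = {0.f, 0.f};
//   auto sum = [](const float current, const float in) -> float {
//     return current + in;
//   };
//   Reduce<float, float>(input, input_dims, output_dims,
//                        /*input_num_dims=*/2, /*output_num_dims=*/1, axis,
//                        /*num_axis=*/1, input_iter, sum, output);
//   // output is now {6.f, 15.f}.
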
// This method parses the input 'axis' to remove duplicates and handle negative
// values, and returns a valid 'out_axis'.
inline bool ResolveAxis(const int num_dims, const int* axis,
                        const int64_t num_axis, int* out_axis,
                        int* out_num_axis) {
  *out_num_axis = 0;  // Just in case.
  // Short-circuit axis resolution for scalars; the axis will go unused.
  if (num_dims == 0) {
    return true;
  }
  // O(n^2) is fine since out_num_axis should be really small, mostly <= 4.
  for (int64_t idx = 0; idx < num_axis; ++idx) {
    // Handle negative index. A positive index 'p_idx' can be represented as a
    // negative index 'n_idx' as: n_idx = p_idx - num_dims.
    // e.g. for num_dims=3, [0, 1, 2] is the same as [-3, -2, -1].
    int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
    TFLITE_DCHECK(current >= 0 && current < num_dims);
    if (current < 0 || current >= num_dims) {
      return false;
    }
    bool is_dup = false;
    for (int j = 0; j < *out_num_axis; ++j) {
      if (out_axis[j] == current) {
        is_dup = true;
        break;
      }
    }
    if (!is_dup) {
      out_axis[*out_num_axis] = current;
      *out_num_axis += 1;
    }
  }
  return true;
}

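// Example: for a rank-3 input, axis = {-1, 2, 0} resolves to out_axis =
// {2, 0}; the negative index wraps to 2 and the resulting duplicate is
// dropped.
//
//   const int axis[] = {-1, 2, 0};
//   int out_axis[3];
//   int out_num_axis = 0;
//   ResolveAxis(/*num_dims=*/3, axis, /*num_axis=*/3, out_axis,
//               &out_num_axis);
//   // out_num_axis == 2; out_axis[0] == 2, out_axis[1] == 0.
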
// This method expects that output_data has been initialized.
template <typename In, typename Out>
inline bool ReduceSumImpl(const In* input_data, const int* input_dims,
                          const int* output_dims, const int input_num_dims,
                          const int output_num_dims, const int* axis,
                          const int num_axis, int* input_iter,
                          Out* output_data) {
  auto reducer = [](const Out current, const In in) -> Out {
    const Out actual_in = static_cast<Out>(in);
    return current + actual_in;
  };
  return Reduce<In, Out>(input_data, input_dims, output_dims, input_num_dims,
                         output_num_dims, axis, num_axis, input_iter, reducer,
                         output_data);
}

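// ReduceSumImpl is typically instantiated with an accumulator type 'Out'
// wider than the element type 'In' so the running sum cannot overflow. A
// small sketch (hypothetical buffers):
//
//   const int8_t input[] = {100, 100, 100};  // Shape [3].
//   const int input_dims[] = {3};
//   const int output_dims[] = {1};
//   const int axis[] = {0};
//   int input_iter[1];
//   int32_t output[1] = {0};
//   ReduceSumImpl<int8_t, int32_t>(input, input_dims, output_dims,
//                                  /*input_num_dims=*/1,
//                                  /*output_num_dims=*/1, axis,
//                                  /*num_axis=*/1, input_iter, output);
//   // output[0] == 300, which would not fit in an int8_t accumulator.
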
// Fills 'data' (holding the elements of a 'num_dims'-dimensional tensor of
// shape 'dims') with 'init_value'. Returns false if the element count would
// overflow size_t.
template <typename T>
inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
                                    const T init_value, T* data) {
  size_t num_elements = 1;
  for (int idx = 0; idx < num_dims; ++idx) {
    size_t current = static_cast<size_t>(dims[idx]);
    // Overflow prevention (guarding current > 0 also avoids dividing by a
    // zero-sized dimension).
    if (current > 0 &&
        num_elements > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_elements *= current;
  }
  for (size_t idx = 0; idx < num_elements; ++idx) {
    data[idx] = init_value;
  }
  return true;
}

// Computes the generic value (i.e., sum/max/min/prod) of elements across
// dimensions given in axis. The caller needs to pass in 'init_value' and a
// 'reducer' function.
template <typename T>
inline bool ReduceGeneric(const T* input_data, const int* input_dims,
                          const int input_num_dims, T* output_data,
                          const int* output_dims, const int output_num_dims,
                          const int* axis, const int64_t num_axis_dimensions,
                          bool keep_dims, int* temp_index, int* resolved_axis,
                          T init_value,
                          T reducer(const T current, const T in)) {
  // Return early when the input shape has a zero dim.
  for (int i = 0; i < input_num_dims; ++i) {
    if (input_dims[i] == 0) return true;
  }

  // Reset output data.
  if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
                               output_data)) {
    return false;
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }

  return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
                      output_num_dims, resolved_axis, num_resolved_axis,
                      temp_index, reducer, output_data);
}

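// Example: a reduce_max over both axes of a 2x2 float tensor, sketched with
// hypothetical caller-provided scratch buffers. The reducer is a plain
// function so that it matches the function-pointer parameter.
//
//   inline float MaxReducer(const float current, const float in) {
//     return in > current ? in : current;
//   }
//   ...
//   const float input[] = {1.f, 7.f, 3.f, 5.f};  // Shape [2, 2].
//   const int input_dims[] = {2, 2};
//   const int output_dims[] = {1};
//   const int axis[] = {0, 1};
//   int temp_index[2];
//   int resolved_axis[2];
//   float output[1];
//   ReduceGeneric<float>(input, input_dims, 2, output, output_dims, 1, axis,
//                        /*num_axis_dimensions=*/2, /*keep_dims=*/false,
//                        temp_index, resolved_axis,
//                        std::numeric_limits<float>::lowest(), MaxReducer);
//   // output[0] == 7.f.
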
// Computes the mean of elements across dimensions given in axis.
// It does so in two stages: first it calculates the sum of elements along the
// axis, then it divides the sum by the number of elements in the axis.
template <typename T, typename U>
inline bool Mean(const T* input_data, const int* input_dims,
                 const int input_num_dims, T* output_data,
                 const int* output_dims, const int output_num_dims,
                 const int* axis, const int num_axis_dimensions, bool keep_dims,
                 int* temp_index, int* resolved_axis, U* temp_sum) {
  ruy::profiler::ScopeLabel label("Mean");
  // Reset output data.
  size_t num_outputs = 1;
  for (int idx = 0; idx < output_num_dims; ++idx) {
    size_t current = static_cast<size_t>(output_dims[idx]);
    // Overflow prevention.
    if (num_outputs > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_outputs *= current;
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = T();
    temp_sum[idx] = U();
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }

  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
                           output_num_dims, resolved_axis, num_resolved_axis,
                           temp_index, temp_sum)) {
    return false;
  }

  // Calculate the mean by dividing output_data by the number of aggregated
  // elements.
  size_t num_elements_in_axis = 1;
  for (int idx = 0; idx < num_resolved_axis; ++idx) {
    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
    // Overflow prevention.
    if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
      return false;
    }
    num_elements_in_axis *= current;
  }

  if (num_elements_in_axis > 0) {
    for (size_t idx = 0; idx < num_outputs; ++idx) {
      output_data[idx] =
          static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
    }
  }
  return true;
}

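// Example: the mean over axis 1 of a 2x3 float tensor, with 'temp_sum' as a
// hypothetical caller-provided scratch buffer of one element per output.
//
//   const float input[] = {1, 2, 3, 4, 5, 6};  // Shape [2, 3].
//   const int input_dims[] = {2, 3};
//   const int output_dims[] = {2};
//   const int axis[] = {1};
//   int temp_index[2];
//   int resolved_axis[1];
//   float temp_sum[2];
//   float output[2];
//   Mean<float, float>(input, input_dims, 2, output, output_dims, 1, axis,
//                      /*num_axis_dimensions=*/1, /*keep_dims=*/false,
//                      temp_index, resolved_axis, temp_sum);
//   // output == {2.f, 5.f}.
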
template <typename T>
inline void Mean(const tflite::MeanParams& op_params,
                 const RuntimeShape& unextended_input_shape,
                 const T* input_data,
                 const RuntimeShape& unextended_output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("Mean4D");

  // The current implementation only supports rank-4 inputs with simultaneous
  // reduction over width and height.
  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int output_batch = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);

  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      float value = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
        }
      }
      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
          value / (input_width * input_height);
    }
  }
}

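// Example shapes for the specialization above: an NHWC input of shape
// [2, 4, 4, 8] reduced over axes {1, 2} yields an output of shape
// [2, 1, 1, 8], where each output element averages the 16 spatial positions
// of its (batch, channel) slice.
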
inline void Mean(const tflite::MeanParams& op_params,
                 const RuntimeShape& unextended_input_shape,
                 const uint8_t* input_data, int32_t input_zero_point,
                 float input_scale, const RuntimeShape& unextended_output_shape,
                 uint8_t* output_data, int32_t output_zero_point,
                 float output_scale) {
  ruy::profiler::ScopeLabel label("Mean4D/Uint8");

  // The current implementation only supports rank-4 inputs with simultaneous
  // reduction over width and height.
  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);
  const int output_batch = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const float num_elements_in_axis = input_width * input_height;

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

  constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
  constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();

  int32_t bias =
      output_zero_point -
      static_cast<int32_t>(input_zero_point * input_scale / output_scale);
  double real_scale =
      static_cast<double>(input_scale / (num_elements_in_axis * output_scale));

  int32_t multiplier;
  int shift;
  QuantizeMultiplier(real_scale, &multiplier, &shift);
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      int32_t acc = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
        }
      }
      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
      acc += bias;
      acc = std::min(std::max(acc, kMinValue), kMaxValue);
      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
          static_cast<uint8_t>(acc);
    }
  }
}

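// The requantization above can be derived from the real-valued mean. With
// real_in = input_scale * (q_in - input_zero_point) and
// real_out = output_scale * (q_out - output_zero_point), averaging
// N = input_width * input_height elements gives:
//
//   q_out = sum(q_in) * input_scale / (N * output_scale)
//           + output_zero_point
//           - input_zero_point * input_scale / output_scale
//
// The first term is computed in fixed point via
// MultiplyByQuantizedMultiplier with real_scale =
// input_scale / (N * output_scale); the remaining two terms are the 'bias'
// added afterwards.
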
// Computes the mean or sum of quantized elements across dimensions given in
// axis. It does so in two stages: first it calculates the sum of elements
// along the axis, then (for the mean) it divides the sum by the number of
// elements in the axis and requantizes the result.
template <typename T, typename U>
inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
                               float input_scale, const int* input_dims,
                               const int input_num_dims, T* output_data,
                               int32_t output_zero_point, float output_scale,
                               const int* output_dims,
                               const int output_num_dims, const int* axis,
                               const int num_axis_dimensions, bool keep_dims,
                               int* temp_index, int* resolved_axis, U* temp_sum,
                               bool compute_sum) {
  const bool uint8_case = std::is_same<T, uint8_t>::value;
  const bool int16_case = std::is_same<T, int16_t>::value;
  if (uint8_case) {
    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Uint8" : "Mean/Uint8");
  } else if (int16_case) {
    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int16" : "Mean/Int16");
  } else {
    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int8" : "Mean/Int8");
  }
  // Reset output data.
  size_t num_outputs = 1;
  for (int idx = 0; idx < output_num_dims; ++idx) {
    size_t current = static_cast<size_t>(output_dims[idx]);
    // Overflow prevention.
    if (num_outputs > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_outputs *= current;
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = T();
    temp_sum[idx] = U();
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }

  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
                           output_num_dims, resolved_axis, num_resolved_axis,
                           temp_index, temp_sum)) {
    return false;
  }

  // Calculate the mean by dividing output_data by the number of aggregated
  // elements.
  size_t num_elements_in_axis = 1;
  for (int idx = 0; idx < num_resolved_axis; ++idx) {
    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
    // Overflow prevention.
    if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
      return false;
    }
    num_elements_in_axis *= current;
  }

  if (num_elements_in_axis > 0) {
    const float scale = input_scale / output_scale;
    if (compute_sum) {
      // TODO(b/116341117): Eliminate float and do this completely in 8bit.
      const float bias = -input_zero_point * scale * num_elements_in_axis;
      for (size_t idx = 0; idx < num_outputs; ++idx) {
        const U value =
            static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) +
            output_zero_point;
        output_data[idx] = static_cast<T>(value);
      }
    } else {
      const float bias = -input_zero_point * scale;
      for (size_t idx = 0; idx < num_outputs; ++idx) {
        float float_mean = static_cast<float>(temp_sum[idx]) /
                           static_cast<float>(num_elements_in_axis);
        float result = TfLiteMin(
            TfLiteRound(float_mean * scale + bias) + output_zero_point,
            static_cast<float>(std::numeric_limits<T>::max()));
        result = TfLiteMax(result,
                           static_cast<float>(std::numeric_limits<T>::min()));
        output_data[idx] = static_cast<T>(result);
      }
    }
  }
  return true;
}

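// Example: a quantized mean over axis 1 of a 1x4 int8 tensor, accumulating
// into an int32 'temp_sum' scratch buffer (hypothetical caller storage).
// With identical input and output quantization parameters the result is
// simply the rounded integer mean.
//
//   const int8_t input[] = {4, 8, 12, 16};  // Shape [1, 4].
//   const int input_dims[] = {1, 4};
//   const int output_dims[] = {1};
//   const int axis[] = {1};
//   int temp_index[2];
//   int resolved_axis[1];
//   int32_t temp_sum[1];
//   int8_t output[1];
//   QuantizedMeanOrSum<int8_t, int32_t>(
//       input, /*input_zero_point=*/0, /*input_scale=*/1.f, input_dims,
//       /*input_num_dims=*/2, output, /*output_zero_point=*/0,
//       /*output_scale=*/1.f, output_dims, /*output_num_dims=*/1, axis,
//       /*num_axis_dimensions=*/1, /*keep_dims=*/false, temp_index,
//       resolved_axis, temp_sum, /*compute_sum=*/false);
//   // output[0] == 10.
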
}  // namespace reference_ops

}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_