/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_

#include <algorithm>
#include <cstdint>
#include <limits>
#include <type_traits>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/max.h"
#include "tensorflow/lite/kernels/internal/min.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {

// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
// This method iterates through the input data and reduces elements along the
// dimensions given in axis. The caller must initialize 'output_data' before
// calling this method.
template <typename In, typename Out>
inline bool Reduce(const In* input_data, const int* input_dims,
                   const int* output_dims, const int input_num_dims,
                   const int output_num_dims, const int* axis,
                   const int num_axis, int* input_iter,
                   Out reducer(const Out current, const In in),
                   Out* output_data) {
  // Reset input iterator.
  for (int idx = 0; idx < input_num_dims; ++idx) {
    input_iter[idx] = 0;
  }
  // Iterate through input_data.
  do {
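    // 'input_offset' is the flat index of the current multi-index over the
    // full input shape; 'output_offset' maps the same multi-index onto the
    // output by skipping the reduced dimensions listed in 'axis'.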
    size_t input_offset =
        ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
    size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
                                               input_iter, num_axis, axis);
    output_data[output_offset] =
        reducer(output_data[output_offset], input_data[input_offset]);
  } while (NextIndex(input_num_dims, input_dims, input_iter));
  return true;
}

// This method parses the input 'axis' to remove duplicates and handle negative
// values, and returns a valid 'out_axis'.
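// For example, with num_dims=4 and axis={-1, 3, 1}, the negative index -1
// resolves to 3, the explicit 3 is then dropped as a duplicate, and the
// result is out_axis={3, 1} with *out_num_axis=2.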
inline bool ResolveAxis(const int num_dims, const int* axis,
                        const int64_t num_axis, int* out_axis,
                        int* out_num_axis) {
  *out_num_axis = 0;  // Just in case.
  // Short-circuit axis resolution for scalars; the axis will go unused.
  if (num_dims == 0) {
    return true;
  }
  // O(n^2) is fine since out_num_axis should be really small, mostly <= 4.
  for (int64_t idx = 0; idx < num_axis; ++idx) {
    // Handle negative index. A positive index 'p_idx' can be represented as a
    // negative index 'n_idx' as: n_idx = p_idx - num_dims.
    // e.g. for num_dims=3, [0, 1, 2] is the same as [-3, -2, -1].
    int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
    TFLITE_DCHECK(current >= 0 && current < num_dims);
    if (current < 0 || current >= num_dims) {
      return false;
    }
    bool is_dup = false;
    for (int j = 0; j < *out_num_axis; ++j) {
      if (out_axis[j] == current) {
        is_dup = true;
        break;
      }
    }
    if (!is_dup) {
      out_axis[*out_num_axis] = current;
      *out_num_axis += 1;
    }
  }
  return true;
}

// This method expects that output_data has been initialized.
template <typename In, typename Out>
inline bool ReduceSumImpl(const In* input_data, const int* input_dims,
                          const int* output_dims, const int input_num_dims,
                          const int output_num_dims, const int* axis,
                          const int num_axis, int* input_iter,
                          Out* output_data) {
  auto reducer = [](const Out current, const In in) -> Out {
    const Out actual_in = static_cast<Out>(in);
    return current + actual_in;
  };
  return Reduce<In, Out>(input_data, input_dims, output_dims, input_num_dims,
                         output_num_dims, axis, num_axis, input_iter, reducer,
                         output_data);
}

template <typename T>
inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
                                    const T init_value, T* data) {
  size_t num_elements = 1;
  for (int idx = 0; idx < num_dims; ++idx) {
    size_t current = static_cast<size_t>(dims[idx]);
    // Overflow prevention. Skip the divide when 'current' is zero: the
    // product is then zero and there is nothing to initialize.
    if (current > 0 &&
        num_elements > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_elements *= current;
  }
  for (size_t idx = 0; idx < num_elements; ++idx) {
    data[idx] = init_value;
  }
  return true;
}

// Computes the generic value (i.e., sum/max/min/prod) of elements across
// dimensions given in axis. The caller must provide 'init_value' and
// 'reducer'.
template <typename T>
inline bool ReduceGeneric(const T* input_data, const int* input_dims,
                          const int input_num_dims, T* output_data,
                          const int* output_dims, const int output_num_dims,
                          const int* axis, const int64_t num_axis_dimensions,
                          bool keep_dims, int* temp_index, int* resolved_axis,
                          T init_value,
                          T reducer(const T current, const T in)) {
  // Return early when input shape has zero dim.
  for (int i = 0; i < input_num_dims; ++i) {
    if (input_dims[i] == 0) return true;
  }

  // Reset output data.
  if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
                               output_data)) {
    return false;
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }

  return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
                      output_num_dims, resolved_axis, num_resolved_axis,
                      temp_index, reducer, output_data);
}
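
// Illustrative usage sketch (hypothetical shapes and buffers, not part of
// this header's contract): a max-reduction over the last axis of a {2, 3}
// float tensor. A capture-less lambda converts to the required function
// pointer.
//
//   const int input_dims[] = {2, 3};
//   const int output_dims[] = {2};
//   const int axis[] = {1};
//   int temp_index[2];
//   int resolved_axis[1];
//   ReduceGeneric<float>(input_data, input_dims, /*input_num_dims=*/2,
//                        output_data, output_dims, /*output_num_dims=*/1,
//                        axis, /*num_axis_dimensions=*/1, /*keep_dims=*/false,
//                        temp_index, resolved_axis,
//                        std::numeric_limits<float>::lowest(),
//                        [](const float current, const float in) -> float {
//                          return in > current ? in : current;
//                        });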

// Computes the mean of elements across dimensions given in axis.
// It does so in two stages: first it calculates the sum of elements along the
// axis, then it divides that sum by the number of elements in the axis.
template <typename T, typename U>
inline bool Mean(const T* input_data, const int* input_dims,
                 const int input_num_dims, T* output_data,
                 const int* output_dims, const int output_num_dims,
                 const int* axis, const int num_axis_dimensions, bool keep_dims,
                 int* temp_index, int* resolved_axis, U* temp_sum) {
  ruy::profiler::ScopeLabel label("Mean");
  // Reset output data.
  size_t num_outputs = 1;
  for (int idx = 0; idx < output_num_dims; ++idx) {
    size_t current = static_cast<size_t>(output_dims[idx]);
    // Overflow prevention.
    if (num_outputs > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_outputs *= current;
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = T();
    temp_sum[idx] = U();
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }

  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
                           output_num_dims, resolved_axis, num_resolved_axis,
                           temp_index, temp_sum)) {
    return false;
  }

  // Calculate the mean by dividing output_data by the number of aggregated
  // elements.
  size_t num_elements_in_axis = 1;
  for (int idx = 0; idx < num_resolved_axis; ++idx) {
    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
    // Overflow prevention.
    if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
      return false;
    }
    num_elements_in_axis *= current;
  }

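  // Note: when U is an integral type, the division below is integer division
  // and truncates toward zero.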
  if (num_elements_in_axis > 0) {
    for (size_t idx = 0; idx < num_outputs; ++idx) {
      output_data[idx] =
          static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
    }
  }
  return true;
}

template <typename T>
inline void Mean(const tflite::MeanParams& op_params,
                 const RuntimeShape& unextended_input_shape,
                 const T* input_data,
                 const RuntimeShape& unextended_output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("Mean4D");

  // The current implementation only supports 4-D tensors and simultaneous
  // reduction over width and height.
  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int output_batch = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);

  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

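  // Note: the accumulation below happens in float even when T is an integral
  // type; the quotient is converted back to T on assignment to 'output_data'.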
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      float value = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
        }
      }
      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
          value / (input_width * input_height);
    }
  }
}

inline void Mean(const tflite::MeanParams& op_params,
                 const RuntimeShape& unextended_input_shape,
                 const uint8_t* input_data, int32_t input_zero_point,
                 float input_scale, const RuntimeShape& unextended_output_shape,
                 uint8_t* output_data, int32_t output_zero_point,
                 float output_scale) {
  ruy::profiler::ScopeLabel label("Mean4D/Uint8");

  // The current implementation only supports 4-D tensors and simultaneous
  // reduction over width and height.
  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);
  const int output_batch = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const float num_elements_in_axis = input_width * input_height;

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

  constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
  constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();

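  // Requantization algebra (with n = num_elements_in_axis): the real-valued
  // mean is (acc / n - input_zero_point) * input_scale, so the quantized
  // output is
  //   acc * input_scale / (n * output_scale)
  //     + output_zero_point - input_zero_point * input_scale / output_scale,
  // i.e. 'acc * real_scale + bias' with the definitions below.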
  int32_t bias =
      output_zero_point -
      static_cast<int32_t>(input_zero_point * input_scale / output_scale);
  double real_scale =
      static_cast<double>(input_scale / (num_elements_in_axis * output_scale));

  int32_t multiplier;
  int shift;
  QuantizeMultiplier(real_scale, &multiplier, &shift);
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      int32_t acc = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
        }
      }
      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
      acc += bias;
      acc = std::min(std::max(acc, kMinValue), kMaxValue);
      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
          static_cast<uint8_t>(acc);
    }
  }
}

// Computes the mean or sum of elements across dimensions given in axis, for
// quantized values. For the mean it does so in two stages: first it
// calculates the sum of elements along the axis, then it divides that sum by
// the number of elements in the axis.
template <typename T, typename U>
inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
                               float input_scale, const int* input_dims,
                               const int input_num_dims, T* output_data,
                               int32_t output_zero_point, float output_scale,
                               const int* output_dims,
                               const int output_num_dims, const int* axis,
                               const int num_axis_dimensions, bool keep_dims,
                               int* temp_index, int* resolved_axis, U* temp_sum,
                               bool compute_sum) {
  const bool uint8_case = std::is_same<T, uint8_t>::value;
  const bool int16_case = std::is_same<T, int16_t>::value;
  if (uint8_case) {
    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Uint8" : "Mean/Uint8");
  } else if (int16_case) {
    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int16" : "Mean/Int16");
  } else {
    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int8" : "Mean/Int8");
  }
  // Reset output data.
  size_t num_outputs = 1;
  for (int idx = 0; idx < output_num_dims; ++idx) {
    size_t current = static_cast<size_t>(output_dims[idx]);
    // Overflow prevention.
    if (num_outputs > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_outputs *= current;
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = T();
    temp_sum[idx] = U();
  }

  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }

  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
                           output_num_dims, resolved_axis, num_resolved_axis,
                           temp_index, temp_sum)) {
    return false;
  }

  // Calculate the mean by dividing output_data by the number of aggregated
  // elements.
  size_t num_elements_in_axis = 1;
  for (int idx = 0; idx < num_resolved_axis; ++idx) {
    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
    // Overflow prevention.
    if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
      return false;
    }
    num_elements_in_axis *= current;
  }

  if (num_elements_in_axis > 0) {
    const float scale = input_scale / output_scale;
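    // For the sum, each of the num_elements_in_axis aggregated inputs carries
    // one 'input_zero_point' offset, so the zero-point correction is scaled
    // by num_elements_in_axis; for the mean, the offsets average out to a
    // single 'input_zero_point' applied once per output.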
    if (compute_sum) {
      // TODO(b/116341117): Eliminate float and do this completely in 8bit.
      const float bias = -input_zero_point * scale * num_elements_in_axis;
      for (size_t idx = 0; idx < num_outputs; ++idx) {
        const U value =
            static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) +
            output_zero_point;
        output_data[idx] = static_cast<T>(value);
      }
    } else {
      const float bias = -input_zero_point * scale;
      for (size_t idx = 0; idx < num_outputs; ++idx) {
        float float_mean = static_cast<float>(temp_sum[idx]) /
                           static_cast<float>(num_elements_in_axis);
        float result = TfLiteMin(
            TfLiteRound(float_mean * scale + bias) + output_zero_point,
            static_cast<float>(std::numeric_limits<T>::max()));
        result = TfLiteMax(result,
                           static_cast<float>(std::numeric_limits<T>::min()));
        output_data[idx] = static_cast<T>(result);
      }
    }
  }
  return true;
}

}  // namespace reference_ops

}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_