/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MEAN_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MEAN_H_

#include <algorithm>
#include <limits>
#include <vector>

#include "ruy/profiler/instrumentation.h"
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"

namespace tflite {
namespace optimized_integer_ops {

inline void MeanImpl(const tflite::MeanParams& op_params,
                     const RuntimeShape& input_shape, const int8_t* input_data,
                     int32 multiplier, int32 shift, int32 bias,
                     const RuntimeShape& output_shape, int8_t* output_data,
                     int start_depth, int end_depth) {
  ruy::profiler::ScopeLabel label("Mean4D/Int8/MeanImpl");

  // The current implementation only supports 4-D tensors and simultaneous
  // reduction over width and height.
  const int output_batch = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

  constexpr static int32_t kMinValue = std::numeric_limits<int8_t>::min();
  constexpr static int32_t kMaxValue = std::numeric_limits<int8_t>::max();

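  // Fast path: with NEON, 16 output channels are processed per iteration by
  // widening the int8 inputs into four int32x4 accumulators before summing.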
#ifdef USE_NEON
  const int32x4_t bias_dup = vdupq_n_s32(bias);
  const int32x4_t min_dup = vdupq_n_s32(kMinValue);
  const int32x4_t max_dup = vdupq_n_s32(kMaxValue);
#endif  // USE_NEON
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    int out_d = start_depth;
#ifdef USE_NEON

    for (; out_d <= end_depth - 16; out_d += 16) {
      int32x4x4_t temp_sum;
      temp_sum.val[0] = vdupq_n_s32(0);
      temp_sum.val[1] = vdupq_n_s32(0);
      temp_sum.val[2] = vdupq_n_s32(0);
      temp_sum.val[3] = vdupq_n_s32(0);
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          const int8_t* input_data_ptr =
              input_data + Offset(input_shape, out_b, in_h, in_w, out_d);
          int8x16_t input_data_val = vld1q_s8(input_data_ptr);

          int16x8_t input_data_low_shift =
              vmovl_s8(vget_low_s8(input_data_val));
          int16x8_t input_data_high_shift =
              vmovl_s8(vget_high_s8(input_data_val));

          int32x4_t input_low_low =
              vmovl_s16(vget_low_s16(input_data_low_shift));
          int32x4_t input_high_low =
              vmovl_s16(vget_high_s16(input_data_low_shift));
          int32x4_t input_low_high =
              vmovl_s16(vget_low_s16(input_data_high_shift));
          int32x4_t input_high_high =
              vmovl_s16(vget_high_s16(input_data_high_shift));

          temp_sum.val[0] = vaddq_s32(temp_sum.val[0], input_low_low);
          temp_sum.val[1] = vaddq_s32(temp_sum.val[1], input_high_low);
          temp_sum.val[2] = vaddq_s32(temp_sum.val[2], input_low_high);
          temp_sum.val[3] = vaddq_s32(temp_sum.val[3], input_high_high);
        }
      }

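      // Requantize: apply the fixed-point multiplier/shift, add the combined
      // zero-point bias, and clamp each lane to the int8 range.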
      temp_sum =
          MultiplyByQuantizedMultiplier4Rows(temp_sum, multiplier, shift);

      temp_sum.val[0] = vaddq_s32(temp_sum.val[0], bias_dup);
      temp_sum.val[1] = vaddq_s32(temp_sum.val[1], bias_dup);
      temp_sum.val[2] = vaddq_s32(temp_sum.val[2], bias_dup);
      temp_sum.val[3] = vaddq_s32(temp_sum.val[3], bias_dup);

      temp_sum.val[0] = vminq_s32(vmaxq_s32(temp_sum.val[0], min_dup), max_dup);
      temp_sum.val[1] = vminq_s32(vmaxq_s32(temp_sum.val[1], min_dup), max_dup);
      temp_sum.val[2] = vminq_s32(vmaxq_s32(temp_sum.val[2], min_dup), max_dup);
      temp_sum.val[3] = vminq_s32(vmaxq_s32(temp_sum.val[3], min_dup), max_dup);

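      // Narrow the clamped int32 lanes back down to int8 and store 16 results
      // with a single vector store.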
      int16x4_t narrowed_low_low = vmovn_s32(temp_sum.val[0]);
      int16x4_t narrowed_high_low = vmovn_s32(temp_sum.val[1]);
      int16x4_t narrowed_low_high = vmovn_s32(temp_sum.val[2]);
      int16x4_t narrowed_high_high = vmovn_s32(temp_sum.val[3]);

      int16x8_t combined_low =
          vcombine_s16(narrowed_low_low, narrowed_high_low);
      int16x8_t combined_high =
          vcombine_s16(narrowed_low_high, narrowed_high_high);

      int8x8_t narrowed_low = vmovn_s16(combined_low);
      int8x8_t narrowed_high = vmovn_s16(combined_high);

      int8x16_t combined_output = vcombine_s8(narrowed_low, narrowed_high);

      int8_t* output_data_ptr =
          output_data + Offset(output_shape, out_b, 0, 0, out_d);
      vst1q_s8(output_data_ptr, combined_output);
    }
#endif  // USE_NEON

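    // Scalar tail: handles the channels not covered by the NEON path (and the
    // whole range when NEON is unavailable).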
    for (; out_d < end_depth; ++out_d) {
      int acc = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
        }
      }

      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
      acc += bias;
      acc = std::min(std::max(acc, kMinValue), kMaxValue);
      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
          static_cast<int8_t>(acc);
    }
  }
}

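// A threadpool task that runs MeanImpl over a contiguous slice of output
// channels; the slices are assigned by Mean() below.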
struct MeanWorkerTask : cpu_backend_threadpool::Task {
  MeanWorkerTask(const tflite::MeanParams& op_params,
                 const RuntimeShape& input_shape, const int8_t* input_data,
                 int32 multiplier, int32 shift, int32 bias,
                 const RuntimeShape& output_shape, int8_t* output_data,
                 int start_height, int end_height)
      : op_params(op_params),
        input_shape(input_shape),
        input_data(input_data),
        multiplier(multiplier),
        shift(shift),
        bias(bias),
        output_shape(output_shape),
        output_data(output_data),
        start_height(start_height),
        end_height(end_height) {}

  void Run() override {
    MeanImpl(op_params, input_shape, input_data, multiplier, shift, bias,
             output_shape, output_data, start_height, end_height);
  }

 private:
  const tflite::MeanParams& op_params;
  const RuntimeShape& input_shape;
  const int8_t* input_data;
  int32 multiplier;
  int32 shift;
  int32 bias;
  const RuntimeShape& output_shape;
  int8_t* output_data;
  int start_height;
  int end_height;
};

inline void Mean(const tflite::MeanParams& op_params,
                 const RuntimeShape& unextended_input_shape,
                 const int8_t* input_data, int32 input_zero_point,
                 float input_scale, const RuntimeShape& unextended_output_shape,
                 int8_t* output_data, int32 output_zero_point,
                 float output_scale, CpuBackendContext* cpu_backend_context) {
  ruy::profiler::ScopeLabel label("Mean4D/Int8");
  // The current implementation only supports 4-D tensors and simultaneous
  // reduction over width and height.
  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const float num_elements_in_axis = input_width * input_height;

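  // Fold the zero-point correction into a single bias, and fold the scale
  // ratio together with the 1 / (width * height) averaging factor into a
  // fixed-point multiplier/shift pair.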
  int32 bias =
      output_zero_point -
      static_cast<int32>(input_zero_point * input_scale / output_scale);
  float real_scale = input_scale / (num_elements_in_axis * output_scale);

  int32 multiplier, shift;
  QuantizeMultiplier(real_scale, &multiplier, &shift);

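  // Partition the output depth across threads, giving each thread at least
  // kMinDepthPerThread channels, capped by the backend's thread budget.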
  constexpr int kMinDepthPerThread = 8;
  int thread_count = output_depth / kMinDepthPerThread;
  thread_count = thread_count > 0 ? thread_count : 1;
  const int capped_thread_count =
      std::min(thread_count, cpu_backend_context->max_num_threads());

  if (capped_thread_count == 1) {
    MeanImpl(op_params, input_shape, input_data, multiplier, shift, bias,
             output_shape, output_data, 0, output_depth);
  } else {
    // Instead of parallelizing over the batch dimension, split the work over
    // output_depth, since the batch size is typically 1.
    std::vector<MeanWorkerTask> tasks;
    // TODO(b/131746020) don't create new heap allocations every time.
    // At least we make it a single heap allocation by using reserve().
    tasks.reserve(capped_thread_count);
    int depth_start = 0;
    for (int i = 0; i < capped_thread_count; ++i) {
      // Try to distribute the tasks as evenly as possible.
      int depth_end = depth_start +
                      (output_depth - depth_start) / (capped_thread_count - i);
      tasks.emplace_back(op_params, input_shape, input_data, multiplier, shift,
                         bias, output_shape, output_data, depth_start,
                         depth_end);
      depth_start = depth_end;
    }
    cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
                                    cpu_backend_context);
  }
}

}  // namespace optimized_integer_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MEAN_H_