/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MEAN_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MEAN_H_

#include <algorithm>
#include <limits>
#include <vector>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"

namespace tflite {
namespace optimized_integer_ops {

inline void MeanImpl(const tflite::MeanParams& op_params,
                     const RuntimeShape& input_shape, const int8_t* input_data,
                     int32 multiplier, int32 shift, int32 bias,
                     const RuntimeShape& output_shape, int8_t* output_data,
                     int start_depth, int end_depth) {
  ruy::profiler::ScopeLabel label("Mean4D/Int8/MeanImpl");

  // The current implementation only supports 4D tensors and simultaneous
  // reduction over width and height.
  const int output_batch = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

  constexpr static int32_t kMinValue = std::numeric_limits<int8_t>::min();
  constexpr static int32_t kMaxValue = std::numeric_limits<int8_t>::max();

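  // On NEON builds, the vectorized path below handles 16 depth channels per
  // iteration; leftover channels fall through to the scalar loop at the end.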
#ifdef USE_NEON
  const int32x4_t bias_dup = vdupq_n_s32(bias);
  const int32x4_t min_dup = vdupq_n_s32(kMinValue);
  const int32x4_t max_dup = vdupq_n_s32(kMaxValue);
#endif  // USE_NEON
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    int out_d = start_depth;
#ifdef USE_NEON

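    // Widen-and-accumulate scheme: each iteration loads 16 int8 values and
    // keeps their running spatial sums in four int32x4 accumulators.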
    for (; out_d <= end_depth - 16; out_d += 16) {
      int32x4x4_t temp_sum;
      temp_sum.val[0] = vdupq_n_s32(0);
      temp_sum.val[1] = vdupq_n_s32(0);
      temp_sum.val[2] = vdupq_n_s32(0);
      temp_sum.val[3] = vdupq_n_s32(0);
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          const int8_t* input_data_ptr =
              input_data + Offset(input_shape, out_b, in_h, in_w, out_d);
          int8x16_t input_data_val = vld1q_s8(input_data_ptr);

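          // vmovl sign-extends int8 -> int16 -> int32, so the sums below are
          // accumulated at 32-bit precision.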
          int16x8_t input_data_low_shift =
              vmovl_s8(vget_low_s8(input_data_val));
          int16x8_t input_data_high_shift =
              vmovl_s8(vget_high_s8(input_data_val));

          int32x4_t input_low_low =
              vmovl_s16(vget_low_s16(input_data_low_shift));
          int32x4_t input_high_low =
              vmovl_s16(vget_high_s16(input_data_low_shift));
          int32x4_t input_low_high =
              vmovl_s16(vget_low_s16(input_data_high_shift));
          int32x4_t input_high_high =
              vmovl_s16(vget_high_s16(input_data_high_shift));

          temp_sum.val[0] = vaddq_s32(temp_sum.val[0], input_low_low);
          temp_sum.val[1] = vaddq_s32(temp_sum.val[1], input_high_low);
          temp_sum.val[2] = vaddq_s32(temp_sum.val[2], input_low_high);
          temp_sum.val[3] = vaddq_s32(temp_sum.val[3], input_high_high);
        }
      }

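      // Rescale the accumulated sums to the output scale by applying the
      // fixed-point (multiplier, shift) pair to all four registers.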
      temp_sum =
          MultiplyByQuantizedMultiplier4Rows(temp_sum, multiplier, shift);

      temp_sum.val[0] = vaddq_s32(temp_sum.val[0], bias_dup);
      temp_sum.val[1] = vaddq_s32(temp_sum.val[1], bias_dup);
      temp_sum.val[2] = vaddq_s32(temp_sum.val[2], bias_dup);
      temp_sum.val[3] = vaddq_s32(temp_sum.val[3], bias_dup);

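      // Clamp to the representable int8 range before narrowing.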
      temp_sum.val[0] =
          vminq_s32(vmaxq_s32(temp_sum.val[0], min_dup), max_dup);
      temp_sum.val[1] =
          vminq_s32(vmaxq_s32(temp_sum.val[1], min_dup), max_dup);
      temp_sum.val[2] =
          vminq_s32(vmaxq_s32(temp_sum.val[2], min_dup), max_dup);
      temp_sum.val[3] =
          vminq_s32(vmaxq_s32(temp_sum.val[3], min_dup), max_dup);

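      // Narrow int32x4 -> int16x4 -> int8x8, then recombine into one int8x16
      // vector holding all 16 output channels, and store it.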
      int16x4_t narrowed_low_low = vmovn_s32(temp_sum.val[0]);
      int16x4_t narrowed_high_low = vmovn_s32(temp_sum.val[1]);
      int16x4_t narrowed_low_high = vmovn_s32(temp_sum.val[2]);
      int16x4_t narrowed_high_high = vmovn_s32(temp_sum.val[3]);

      int16x8_t combined_low =
          vcombine_s16(narrowed_low_low, narrowed_high_low);
      int16x8_t combined_high =
          vcombine_s16(narrowed_low_high, narrowed_high_high);

      int8x8_t narrowed_low = vmovn_s16(combined_low);
      int8x8_t narrowed_high = vmovn_s16(combined_high);

      int8x16_t combined_output = vcombine_s8(narrowed_low, narrowed_high);

      int8_t* output_data_ptr =
          output_data + Offset(output_shape, out_b, 0, 0, out_d);
      vst1q_s8(output_data_ptr, combined_output);
    }
#endif  // USE_NEON

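    // Scalar path: covers the depth tail left by the NEON loop, and all
    // channels when USE_NEON is not defined.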
    for (; out_d < end_depth; ++out_d) {
      int acc = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
        }
      }

      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
      acc += bias;
      acc = std::min(std::max(acc, kMinValue), kMaxValue);
      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
          static_cast<int8_t>(acc);
    }
  }
}

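// Threadpool task that runs MeanImpl over a contiguous range of output depth
// channels; Mean() below shards the depth dimension across these tasks.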
struct MeanWorkerTask : cpu_backend_threadpool::Task {
  MeanWorkerTask(const tflite::MeanParams& op_params,
                 const RuntimeShape& input_shape, const int8_t* input_data,
                 int32 multiplier, int32 shift, int32 bias,
                 const RuntimeShape& output_shape, int8_t* output_data,
                 int start_depth, int end_depth)
      : op_params(op_params),
        input_shape(input_shape),
        input_data(input_data),
        multiplier(multiplier),
        shift(shift),
        bias(bias),
        output_shape(output_shape),
        output_data(output_data),
        start_depth(start_depth),
        end_depth(end_depth) {}

  void Run() override {
    MeanImpl(op_params, input_shape, input_data, multiplier, shift, bias,
             output_shape, output_data, start_depth, end_depth);
  }

 private:
  const tflite::MeanParams& op_params;
  const RuntimeShape& input_shape;
  const int8_t* input_data;
  int32 multiplier;
  int32 shift;
  int32 bias;
  const RuntimeShape& output_shape;
  int8_t* output_data;
  int start_depth;
  int end_depth;
};

inline void Mean(const tflite::MeanParams& op_params,
                 const RuntimeShape& unextended_input_shape,
                 const int8_t* input_data, int32 input_zero_point,
                 float input_scale, const RuntimeShape& unextended_output_shape,
                 int8_t* output_data, int32 output_zero_point,
                 float output_scale, CpuBackendContext* cpu_backend_context) {
  ruy::profiler::ScopeLabel label("Mean4D/Int8");
  // The current implementation only supports 4D tensors and simultaneous
  // reduction over width and height.
  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const float num_elements_in_axis = input_width * input_height;

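  // Fold the requantization into integer constants. With N = width * height:
  //   output_q = (input_scale / (N * output_scale)) * sum(input_q)
  //              - input_zero_point * input_scale / output_scale
  //              + output_zero_point
  // The first factor is quantized into (multiplier, shift) below; the
  // remaining constant terms form `bias`.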
  int32 bias =
      output_zero_point -
      static_cast<int32>(input_zero_point * input_scale / output_scale);
  float real_scale = input_scale / (num_elements_in_axis * output_scale);

  int32 multiplier, shift;
  QuantizeMultiplier(real_scale, &multiplier, &shift);

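  // Shard the work over output depth, giving each thread at least
  // kMinDepthPerThread channels so that small tensors stay single-threaded.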
  constexpr int kMinDepthPerThread = 8;
  int thread_count = output_depth / kMinDepthPerThread;
  thread_count = thread_count > 0 ? thread_count : 1;
  const int capped_thread_count =
      std::min(thread_count, cpu_backend_context->max_num_threads());

  if (capped_thread_count == 1) {
    MeanImpl(op_params, input_shape, input_data, multiplier, shift, bias,
             output_shape, output_data, 0, output_depth);
  } else {
    // Instead of parallelizing over the batch, we parallelize over
    // output_depth, since the batch size is typically 1.
    std::vector<MeanWorkerTask> tasks;
    // TODO(b/131746020) don't create new heap allocations every time.
    // Reserving up front at least keeps it to a single heap allocation.
    tasks.reserve(capped_thread_count);
    int depth_start = 0;
    for (int i = 0; i < capped_thread_count; ++i) {
      // Distribute the remaining depth across threads as evenly as possible.
      int depth_end = depth_start +
                      (output_depth - depth_start) / (capped_thread_count - i);
      tasks.emplace_back(op_params, input_shape, input_data, multiplier, shift,
                         bias, output_shape, output_data, depth_start,
                         depth_end);
      depth_start = depth_end;
    }
    cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
                                    cpu_backend_context);
  }
}

}  // namespace optimized_integer_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MEAN_H_