/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_

#include <stdint.h>
#include <sys/types.h>
#include <algorithm>
#include <cmath>
#include <functional>
#include <limits>
#include <memory>
#include <type_traits>

#include "fixedpoint/fixedpoint.h"
#include "public/gemmlowp.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
#include "tensorflow/lite/kernels/internal/reference/softmax.h"
#include "tensorflow/lite/kernels/internal/round.h"
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {

// Return true for broadcast case, false otherwise.
inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
                                   const RuntimeShape& shape1,
                                   tflite::ArithmeticParams* params) {
  const int dims_count =
      std::max(shape0.DimensionsCount(), shape1.DimensionsCount());

  params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
  RuntimeShape scalar_shape(dims_count, 1);

  auto extended_shape0 = RuntimeShape::ExtendedShape(dims_count, shape0);
  auto extended_shape1 = RuntimeShape::ExtendedShape(dims_count, shape1);

  // Check for "exact" match, implicitly accepting any scalar shapes.
  if (extended_shape0 == extended_shape1) {
    params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
    return false;
  }

  for (int i = dims_count - 1; i >= 0; --i) {
    if (extended_shape0.Dims(i) == extended_shape1.Dims(i)) {
      continue;
    } else if (extended_shape0.Dims(i) == 1) {
      params->broadcast_category =
          BroadcastableOpCategory::kFirstInputBroadcastsFast;
      break;
    } else if (extended_shape1.Dims(i) == 1) {
      params->broadcast_category =
          BroadcastableOpCategory::kSecondInputBroadcastsFast;
      break;
    } else {
      params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
      break;
    }
  }

  if (params->broadcast_category !=
          BroadcastableOpCategory::kFirstInputBroadcastsFast &&
      params->broadcast_category !=
          BroadcastableOpCategory::kSecondInputBroadcastsFast) {
    return false;
  }

  // From this point it is assumed contractually that corresponding dimensions
  // in shape0 and shape1 are either (a) equal or (b) one or other equals 1.
  const bool swap_inputs = params->broadcast_category ==
                           BroadcastableOpCategory::kSecondInputBroadcastsFast;
  const RuntimeShape* shape_a =
      swap_inputs ? &extended_shape1 : &extended_shape0;
  const RuntimeShape* shape_b =
      swap_inputs ? &extended_shape0 : &extended_shape1;

  int i = dims_count - 1;
  params->broadcast_shape[0] = 1;
  params->broadcast_shape[1] = 1;
  params->broadcast_shape[2] = 1;
  params->broadcast_shape[3] = 1;
  params->broadcast_shape[4] = 1;
  // y_0 is greedy: include dims if both or neither equal 1: in other words,
  // test for equality rather than (shape_a->Dims(i) != 1).
  while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
    params->broadcast_shape[4] *= shape_b->Dims(i);
    --i;
  }
  // Here either input_a or input_b has dim of 1 (if i >= 0). If it is input_b
  // that has the unit dimension, the next two loops are not entered.
  while (i >= 0 && shape_a->Dims(i) == 1) {
    params->broadcast_shape[3] *= shape_b->Dims(i);
    --i;
  }
  while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
    params->broadcast_shape[2] *= shape_a->Dims(i);
    --i;
  }
  // Here either input_a or input_b has dim of 1 (if i >= 0).
  while (i >= 0 && shape_b->Dims(i) == 1) {
    params->broadcast_shape[1] *= shape_a->Dims(i);
    --i;
  }
  while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
    params->broadcast_shape[0] *= shape_b->Dims(i);
    --i;
  }

  // Rarer case is when the broadcast dimensions cannot be handled by a
  // fivefold loop.
  if (i >= 0) {
    params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
  }
  return true;
}
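
// For illustration only: with shape0 = {4, 1, 3} and shape1 = {4, 5, 3}, the
// first input broadcasts along the middle dimension, so the category becomes
// kFirstInputBroadcastsFast and the fivefold decomposition is
// broadcast_shape = {1, 1, 4, 5, 3}. As a sanity check,
//   shape0.FlatSize() == y0 * y1 * y2 * y4 == 1 * 1 * 4 * 3 == 12,
//   shape1.FlatSize() == y0 * y2 * y3 * y4 == 1 * 4 * 5 * 3 == 60,
// matching the convention documented above BroadcastAddFivefold below.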

inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                 const float* input_data, const RuntimeShape& filter_shape,
                 const float* filter_data, const RuntimeShape& bias_shape,
                 const float* bias_data, const RuntimeShape& output_shape,
                 float* output_data, const RuntimeShape& im2col_shape,
                 float* im2col_data) {
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  (void)im2col_data;   // only used in optimized code.
  (void)im2col_shape;  // only used in optimized code.
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
  if (bias_data) {
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
  }
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          float total = 0.f;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
                const int in_x = in_x_origin + dilation_width_factor * filter_x;
                const int in_y =
                    in_y_origin + dilation_height_factor * filter_y;
                // If the location is outside the bounds of the input image,
                // use zero as a default value.
                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height)) {
                  float input_value = input_data[Offset(
                      input_shape, batch, in_y, in_x, in_channel)];
                  float filter_value =
                      filter_data[Offset(filter_shape, out_channel, filter_y,
                                         filter_x, in_channel)];
                  total += (input_value * filter_value);
                }
              }
            }
          }
          float bias_value = 0.0f;
          if (bias_data) {
            bias_value = bias_data[out_channel];
          }
          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
              ActivationFunctionWithMinMax(total + bias_value,
                                           output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}
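
// Padding note (illustration only): output_shape is taken as given; the loops
// above implement zero padding implicitly by skipping any tap whose (in_x,
// in_y) falls outside the input. For example, with stride 1, pad 1, dilation 1
// and a 3x3 filter, out_x == 0 reads in_x in {-1, 0, 1}; the in_x == -1 tap
// contributes nothing, which is equivalent to convolving a zero-padded input.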

inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                 const uint8* input_data, const RuntimeShape& filter_shape,
                 const uint8* filter_data, const RuntimeShape& bias_shape,
                 const int32* bias_data, const RuntimeShape& output_shape,
                 uint8* output_data, const RuntimeShape& im2col_shape,
                 uint8* im2col_data, gemmlowp::GemmContext* gemm_context) {
  (void)im2col_data;   // only used in optimized code.
  (void)im2col_shape;  // only used in optimized code.
  (void)gemm_context;  // only used in optimized code.
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int32 input_offset = params.input_offset;
  const int32 filter_offset = params.weights_offset;
  const int32 output_offset = params.output_offset;
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);

  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
  if (bias_data) {
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
  }
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          int32 acc = 0;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
                const int in_x = in_x_origin + dilation_width_factor * filter_x;
                const int in_y =
                    in_y_origin + dilation_height_factor * filter_y;
                // If the location is outside the bounds of the input image,
                // use zero as a default value.
                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height)) {
                  int32 input_val = input_data[Offset(input_shape, batch, in_y,
                                                      in_x, in_channel)];
                  int32 filter_val =
                      filter_data[Offset(filter_shape, out_channel, filter_y,
                                         filter_x, in_channel)];
                  acc +=
                      (filter_val + filter_offset) * (input_val + input_offset);
                }
              }
            }
          }
          if (bias_data) {
            acc += bias_data[out_channel];
          }
          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
                                              output_shift);
          acc += output_offset;
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
              static_cast<uint8>(acc);
        }
      }
    }
  }
}
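
// Requantization sketch (illustration only, assuming the usual TFLite
// quantization scheme): the int32 accumulator is in the combined scale
// input_scale * filter_scale. MultiplyByQuantizedMultiplier rescales it by
// roughly output_multiplier * 2^output_shift / 2^31, which the caller
// typically prepares to equal input_scale * filter_scale / output_scale.
// Adding output_offset (the output zero point) then places the result on the
// uint8 grid before clamping.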

template <typename T>
inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params,
                         const RuntimeShape& unextended_input_shape,
                         const T* input_data,
                         const RuntimeShape& unextended_output_shape,
                         T* output_data) {
  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int input_depth = input_shape.Dims(3);
  const int input_width = input_shape.Dims(2);
  const int input_height = input_shape.Dims(1);
  const int input_batch = input_shape.Dims(0);

  const int output_depth = output_shape.Dims(3);
  const int output_width = output_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_batch = output_shape.Dims(0);

  const int32 block_size = op_params.block_size;

  TFLITE_DCHECK_EQ(input_width * block_size, output_width);
  TFLITE_DCHECK_EQ(input_height * block_size, output_height);
  TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        for (int out_d = 0; out_d < output_depth; ++out_d) {
          const int in_d =
              out_d + ((out_h % block_size) * block_size + out_w % block_size) *
                          output_depth;

          const int in_w = out_w / block_size;
          const int in_h = out_h / block_size;
          const int in_b = out_b;

          const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
          const int output_index =
              Offset(output_shape, out_b, out_h, out_w, out_d);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}
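
// Example (illustration only): with block_size = 2, a {1, 1, 1, 4} input maps
// to a {1, 2, 2, 1} output, and the index arithmetic above reduces to
//   output[0][h][w][0] = input[0][0][0][2 * h + w]   for h, w in {0, 1},
// i.e. the four input channels are rearranged into one 2x2 spatial block.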

template <typename T>
inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
                         const RuntimeShape& unextended_input_shape,
                         const T* input_data,
                         const RuntimeShape& unextended_output_shape,
                         T* output_data) {
  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int input_depth = input_shape.Dims(3);
  const int input_width = input_shape.Dims(2);
  const int input_height = input_shape.Dims(1);
  const int input_batch = input_shape.Dims(0);

  const int output_depth = output_shape.Dims(3);
  const int output_width = output_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_batch = output_shape.Dims(0);

  const int32 block_size = op_params.block_size;

  TFLITE_DCHECK_EQ(input_width, output_width * block_size);
  TFLITE_DCHECK_EQ(input_height, output_height * block_size);
  TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int in_b = 0; in_b < input_batch; ++in_b) {
    for (int in_h = 0; in_h < input_height; ++in_h) {
      for (int in_w = 0; in_w < input_width; ++in_w) {
        for (int in_d = 0; in_d < input_depth; ++in_d) {
          const int out_d =
              in_d + ((in_h % block_size) * block_size + in_w % block_size) *
                         input_depth;
          const int out_w = in_w / block_size;
          const int out_h = in_h / block_size;
          const int out_b = in_b;

          const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
          const int output_index =
              Offset(output_shape, out_b, out_h, out_w, out_d);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}
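
// Example (illustration only): SpaceToDepth is the inverse rearrangement of
// DepthToSpace. With block_size = 2, a {1, 2, 2, 1} input maps to a
// {1, 1, 1, 4} output with
//   output[0][0][0][2 * h + w] = input[0][h][w][0]   for h, w in {0, 1}.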

inline void Elu(const RuntimeShape& input_shape, const float* input_data,
                const RuntimeShape& output_shape, float* output_data) {
  const int flat_size = MatchingFlatSize(input_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    const float val = input_data[i];
    output_data[i] = val < 0.0 ? std::exp(val) - 1 : val;
  }
}

inline void Relu(const RuntimeShape& input_shape, const float* input_data,
                 const RuntimeShape& output_shape, float* output_data) {
  const int flat_size = MatchingFlatSize(input_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    const float val = input_data[i];
    const float lower = 0;
    const float clamped = val < lower ? lower : val;
    output_data[i] = clamped;
  }
}

inline void Relu1(const RuntimeShape& input_shape, const float* input_data,
                  const RuntimeShape& output_shape, float* output_data) {
  gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)");
  const int flat_size = MatchingFlatSize(input_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    const float val = input_data[i];
    const float upper = 1;
    const float lower = -1;
    const float clamped = val > upper ? upper : val < lower ? lower : val;
    output_data[i] = clamped;
  }
}

inline void Relu6(const RuntimeShape& input_shape, const float* input_data,
                  const RuntimeShape& output_shape, float* output_data) {
  gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)");
  const int flat_size = MatchingFlatSize(input_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    const float val = input_data[i];
    const float upper = 6;
    const float lower = 0;
    const float clamped = val > upper ? upper : val < lower ? lower : val;
    output_data[i] = clamped;
  }
}

template <typename T>
inline void ReluX(const tflite::ActivationParams& params,
                  const RuntimeShape& input_shape, const T* input_data,
                  const RuntimeShape& output_shape, T* output_data) {
  gemmlowp::ScopedProfilingLabel label("Quantized ReluX (not fused)");
  const int flat_size = MatchingFlatSize(input_shape, output_shape);
  const T max_value = params.quantized_activation_max;
  const T min_value = params.quantized_activation_min;
  for (int i = 0; i < flat_size; ++i) {
    const T val = input_data[i];
    const T clamped =
        val > max_value ? max_value : val < min_value ? min_value : val;
    output_data[i] = clamped;
  }
}

inline void LeakyRelu(const tflite::LeakyReluParams& params,
                      const RuntimeShape& input_shape, const float* input_data,
                      const RuntimeShape& output_shape, float* output_data) {
  gemmlowp::ScopedProfilingLabel label("LeakyRelu (not fused)");
  const int flat_size = MatchingFlatSize(input_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    const float val = input_data[i];
    // Note that this implementation matches that of TensorFlow, and
    // corresponds to the traditional LeakyRelu equation only for alpha <= 1.
    output_data[i] = std::max(val, val * params.alpha);
  }
}
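
// Example (illustration only): with alpha = 0.2f, an input of -5.0f yields
// std::max(-5.0f, -1.0f) == -1.0f, i.e. alpha * x for negative x, while
// positive inputs pass through unchanged. As noted above, the max() form only
// matches the textbook LeakyRelu definition for alpha <= 1.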

inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
                            const RuntimeShape& input_shape,
                            const float* input_data,
                            const RuntimeShape& output_shape,
                            float* output_data) {
  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int outer_size =
      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
  for (int i = 0; i < outer_size; ++i) {
    float squared_l2_norm = 0;
    for (int c = 0; c < depth; ++c) {
      const float val = input_data[depth * i + c];
      squared_l2_norm += val * val;
    }
    const float l2_norm = std::sqrt(squared_l2_norm);
    for (int c = 0; c < depth; ++c) {
      output_data[depth * i + c] = input_data[depth * i + c] / l2_norm;
    }
  }
}

inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
                            const RuntimeShape& input_shape,
                            const uint8* input_data,
                            const RuntimeShape& output_shape,
                            uint8* output_data) {
  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
  const int outer_size =
      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int32 input_zero_point = op_params.input_zero_point;
  for (int i = 0; i < outer_size; ++i) {
    int32 square_l2_norm = 0;
    for (int c = 0; c < depth; c++) {
      int32 diff = input_data[depth * i + c] - input_zero_point;
      square_l2_norm += diff * diff;
    }
    int32 inv_l2norm_multiplier;
    int inv_l2norm_shift;
    GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift,
                                     &inv_l2norm_multiplier, &inv_l2norm_shift);
    for (int c = 0; c < depth; c++) {
      int32 diff = input_data[depth * i + c] - input_zero_point;
      int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
          128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
      int32 unclamped_output_val = 128 + rescaled_diff;
      int32 output_val = std::min(255, std::max(0, unclamped_output_val));
      output_data[depth * i + c] = static_cast<uint8>(output_val);
    }
  }
}
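
// Note on the constants above (an assumption about the expected quantization,
// not enforced here): the uint8 output is taken to have zero point 128 and
// scale 1/128, so a normalized real value v in [-1, 1) is stored as
// 128 + 128 * v. That is why the rescaled term is computed from 128 * diff and
// then offset by 128 before clamping to [0, 255].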

template <typename T>
inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const T* input1_data,
                const RuntimeShape& input2_shape, const T* input2_data,
                const RuntimeShape& output_shape, T* output_data) {
  const int flat_size =
      MatchingFlatSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] + input2_data[i], params.quantized_activation_min,
        params.quantized_activation_max);
  }
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const float* input1_data,
                const RuntimeShape& input2_shape, const float* input2_data,
                const RuntimeShape& output_shape, float* output_data) {
  const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < size; i++) {
    auto x = input1_data[i] + input2_data[i];
    output_data[i] = ActivationFunctionWithMinMax(
        x, params.float_activation_min, params.float_activation_max);
  }
}

// T is expected to be either float or int.
template <typename T>
inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
                 T* const* input_data, T* output_data) {
  // All inputs and the output should have the same shape; this is checked
  // during the Prepare stage.
  const size_t size = input_shape.FlatSize();
  for (int i = 0; i < size; ++i) {
    T x = 0;
    for (int j = 0; j < num_inputs; ++j) {
      x += input_data[j][i];
    }
    output_data[i] = x;
  }
}

// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
inline void AddElementwise(int size, const ArithmeticParams& params,
                           const uint8* input1_data, const uint8* input2_data,
                           uint8* output_data) {
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);

  for (int i = 0; i < size; ++i) {
    const int32 input1_val = params.input1_offset + input1_data[i];
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32 scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32 scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
    const int32 raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sum, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}
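
// Arithmetic sketch (illustration only): each input is offset by its
// params.inputN_offset (typically the negated zero point), shifted left by
// params.left_shift to gain headroom, and rescaled onto a shared accumulator
// scale via its multiplier/shift pair. The raw sum is then rescaled to the
// output scale, offset by params.output_offset, and clamped to the quantized
// activation range.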

// Scalar-broadcast add that can be used for inner loop of more general
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
                               uint8 input1_data, const uint8* input2_data,
                               uint8* output_data) {
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);

  const int32 input1_val = params.input1_offset + input1_data;
  const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
  const int32 scaled_input1_val =
      MultiplyByQuantizedMultiplierSmallerThanOneExp(
          shifted_input1_val, params.input1_multiplier, params.input1_shift);
  for (int i = 0; i < size; ++i) {
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32 scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
    const int32 raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sum, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8* input1_data,
                const RuntimeShape& input2_shape, const uint8* input2_data,
                const RuntimeShape& output_shape, uint8* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingFlatSize(input1_shape, input2_shape, output_shape);

  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16* input1_data,
                const RuntimeShape& input2_shape, const int16* input2_data,
                const RuntimeShape& output_shape, int16* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  const int input1_shift = params.input1_shift;
  const int flat_size =
      MatchingFlatSize(output_shape, input1_shape, input2_shape);
  const int16 output_activation_min = params.quantized_activation_min;
  const int16 output_activation_max = params.quantized_activation_max;

  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
  TFLITE_DCHECK_LE(input1_shift, 0);
  TFLITE_DCHECK_LE(params.input2_shift, 0);
  const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
  const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
  const int input_right_shift =
      input1_shift == 0 ? -params.input2_shift : -input1_shift;

  for (int i = 0; i < flat_size; i++) {
    // F0 uses 0 integer bits, range [-1, 1].
    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;

    F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
    F0 scaled_input = F0::FromRaw(
        gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
    F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
    const int16 raw_output = result.raw();
    const int16 clamped_output = std::min(
        output_activation_max, std::max(output_activation_min, raw_output));
    output_data[i] = clamped_output;
  }
}

// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const float* input1_data,
                               const RuntimeShape& input2_shape,
                               const float* input2_data,
                               const RuntimeShape& output_shape,
                               float* output_data) {
  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/float");
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
                  params.float_activation_min, params.float_activation_max);
        }
      }
    }
  }
}

inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const int32* input1_data,
                               const RuntimeShape& input2_shape,
                               const int32* input2_data,
                               const RuntimeShape& output_shape,
                               int32* output_data) {
  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/int32");
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
                  params.quantized_activation_min,
                  params.quantized_activation_max);
        }
      }
    }
  }
}

inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const uint8* input1_data,
                               const RuntimeShape& input2_shape,
                               const uint8* input2_data,
                               const RuntimeShape& output_shape,
                               uint8* output_data) {
  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/uint8");
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          const int32 input1_val =
              params.input1_offset +
              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
          const int32 input2_val =
              params.input2_offset +
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
          const int32 shifted_input1_val =
              input1_val * (1 << params.left_shift);
          const int32 shifted_input2_val =
              input2_val * (1 << params.left_shift);
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input1_val, params.input1_multiplier,
                  params.input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input2_val, params.input2_multiplier,
                  params.input2_shift);
          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  raw_sum, params.output_multiplier, params.output_shift) +
              params.output_offset;
          const int32 clamped_output =
              std::min(params.quantized_activation_max,
                       std::max(params.quantized_activation_min, raw_output));
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}

inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8* output_data) {
  ArithmeticParams switched_params = unswitched_params;
  switched_params.input1_offset = unswitched_params.input2_offset;
  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
  switched_params.input1_shift = unswitched_params.input2_shift;
  switched_params.input2_offset = unswitched_params.input1_offset;
  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
  switched_params.input2_shift = unswitched_params.input1_shift;

  const bool use_unswitched =
      unswitched_params.broadcast_category ==
      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;

  const ArithmeticParams& params =
      use_unswitched ? unswitched_params : switched_params;
  const uint8* input1_data =
      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
  const uint8* input2_data =
      use_unswitched ? unswitched_input2_data : unswitched_input1_data;

  // Fivefold nested loops. The second input resets its position for each
  // iteration of the second loop. The first input resets its position at the
  // beginning of the fourth loop. The innermost loop is an elementwise add of
  // sections of the arrays.
  uint8* output_data_ptr = output_data;
  const uint8* input1_data_ptr = input1_data;
  const uint8* input2_data_reset = input2_data;
  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
  // between input shapes. y3 for input 1 is always broadcast, and so the
  // dimension there is 1, whereas optionally y1 might be broadcast for
  // input 2. Put another way,
  // input1.shape.FlatSize = y0 * y1 * y2 * y4,
  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
  int y0 = params.broadcast_shape[0];
  int y1 = params.broadcast_shape[1];
  int y2 = params.broadcast_shape[2];
  int y3 = params.broadcast_shape[3];
  int y4 = params.broadcast_shape[4];
  if (y4 > 1) {
    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
    // dimension.
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          for (int i3 = 0; i3 < y3; ++i3) {
            AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
                           output_data_ptr);
            input2_data_ptr += y4;
            output_data_ptr += y4;
          }
          // We have broadcast y4 of input1 data y3 times, and now move on.
          input1_data_ptr += y4;
        }
      }
      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
      input2_data_reset = input2_data_ptr;
    }
  } else {
    // Special case of y4 == 1, in which the innermost loop is a single element
    // and can be combined with the next (y3) as an inner broadcast.
    //
    // Note that this handles the case of pure scalar broadcast when
    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
    // broadcast with batch (as y2 > 1).
    //
    // NOTE The process is the same as the above general case except
    // simplified for y4 == 1 and the loop over y3 is contained within the
    // AddScalarBroadcast function.
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
                             output_data_ptr);
          input2_data_ptr += y3;
          output_data_ptr += y3;
          input1_data_ptr += 1;
        }
      }
      input2_data_reset = input2_data_ptr;
    }
  }
}
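
// Walkthrough (illustration only): continuing the ProcessBroadcastShapes
// example with input1 shape {4, 1, 3} and input2 shape {4, 5, 3}, we get
// (y0, y1, y2, y3, y4) = (1, 1, 4, 5, 3). Since y4 > 1, the general path runs
// AddElementwise on y4 = 3 elements at a time, advancing input2 after every
// inner call and input1 only once per y3 loop, for a total of
// y0 * y1 * y2 * y3 = 20 inner calls covering the 60-element output.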

template <typename T>
inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const T* input1_data,
                const RuntimeShape& input2_shape, const T* input2_data,
                const RuntimeShape& output_shape, T* output_data) {
  T output_activation_min;
  T output_activation_max;
  GetActivationParams(params, &output_activation_min, &output_activation_max);

  const int flat_size =
      MatchingFlatSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] * input2_data[i], output_activation_min,
        output_activation_max);
  }
}

// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(benoitjacob): BroadcastMul is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
template <typename T>
void BroadcastMul4DSlow(const ArithmeticParams& params,
                        const RuntimeShape& unextended_input1_shape,
                        const T* input1_data,
                        const RuntimeShape& unextended_input2_shape,
                        const T* input2_data,
                        const RuntimeShape& unextended_output_shape,
                        T* output_data) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow");
  T output_activation_min;
  T output_activation_max;
  GetActivationParams(params, &output_activation_min, &output_activation_max);

  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
                                      unextended_input2_shape, &desc1, &desc2);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < output_shape.Dims(0); ++b) {
    for (int y = 0; y < output_shape.Dims(1); ++y) {
      for (int x = 0; x < output_shape.Dims(2); ++x) {
        for (int c = 0; c < output_shape.Dims(3); ++c) {
          output_data[Offset(output_shape, b, y, x, c)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] *
                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// Element-wise mul that can often be used for inner loop of broadcast Mul as
// well as the non-broadcast Mul.
inline void MulElementwise(int size, const ArithmeticParams& params,
                           const uint8* input1_data, const uint8* input2_data,
                           uint8* output_data) {
  for (int i = 0; i < size; ++i) {
    const int32 input1_val = params.input1_offset + input1_data[i];
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
                                                       params.output_multiplier,
                                                       params.output_shift);
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}
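
// Arithmetic sketch (illustration only): unlike quantized add there is no
// left_shift stage; the product (in1 + input1_offset) * (in2 + input2_offset)
// is rescaled directly by the output multiplier/shift, which the caller
// typically prepares to encode input1_scale * input2_scale / output_scale,
// then offset by params.output_offset and clamped.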

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8* input1_data,
                const RuntimeShape& input2_shape, const uint8* input2_data,
                const RuntimeShape& output_shape, uint8* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  gemmlowp::ScopedProfilingLabel label("Mul/8bit");
  const int flat_size =
      MatchingFlatSize(input1_shape, input2_shape, output_shape);

  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8* output_data) {
  ArithmeticParams switched_params = unswitched_params;
  switched_params.input1_offset = unswitched_params.input2_offset;
  switched_params.input2_offset = unswitched_params.input1_offset;

  const bool use_unswitched =
      unswitched_params.broadcast_category ==
      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;

  const ArithmeticParams& params =
      use_unswitched ? unswitched_params : switched_params;
  const uint8* input1_data =
      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
  const uint8* input2_data =
      use_unswitched ? unswitched_input2_data : unswitched_input1_data;

  // Fivefold nested loops. The second input resets its position for each
  // iteration of the second loop. The first input resets its position at the
  // beginning of the fourth loop. The innermost loop is an elementwise Mul of
  // sections of the arrays.
  uint8* output_data_ptr = output_data;
  const uint8* input1_data_ptr = input1_data;
  const uint8* input2_data_reset = input2_data;
  int y0 = params.broadcast_shape[0];
  int y1 = params.broadcast_shape[1];
  int y2 = params.broadcast_shape[2];
  int y3 = params.broadcast_shape[3];
  int y4 = params.broadcast_shape[4];
  for (int i0 = 0; i0 < y0; ++i0) {
    const uint8* input2_data_ptr;
    for (int i1 = 0; i1 < y1; ++i1) {
      input2_data_ptr = input2_data_reset;
      for (int i2 = 0; i2 < y2; ++i2) {
        for (int i3 = 0; i3 < y3; ++i3) {
          MulElementwise(y4, params, input1_data_ptr, input2_data_ptr,
                         output_data_ptr);
          input2_data_ptr += y4;
          output_data_ptr += y4;
        }
        input1_data_ptr += y4;
      }
    }
    input2_data_reset = input2_data_ptr;
  }
}

inline void BroadcastMul4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const uint8* input1_data,
                               const RuntimeShape& input2_shape,
                               const uint8* input2_data,
                               const RuntimeShape& output_shape,
                               uint8* output_data) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow/8bit");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  // The input shapes are extended as part of NdArrayDesc initialization.
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          const int32 input1_val =
              params.input1_offset +
              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
          const int32 input2_val =
              params.input2_offset +
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
          const int32 unclamped_result =
              params.output_offset +
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  input1_val * input2_val, params.output_multiplier,
                  params.output_shift);
          const int32 clamped_output = std::min(
              params.quantized_activation_max,
              std::max(params.quantized_activation_min, unclamped_result));
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16* input1_data,
                const RuntimeShape& input2_shape, const int16* input2_data,
                const RuntimeShape& output_shape, int16* output_data) {
  gemmlowp::ScopedProfilingLabel label("Mul/Int16");

  const int flat_size =
      MatchingFlatSize(input1_shape, input2_shape, output_shape);

  for (int i = 0; i < flat_size; i++) {
    // F0 uses 0 integer bits, range [-1, 1].
    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;

    F0 unclamped_result =
        F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
    output_data[i] = unclamped_result.raw();
  }
}

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16* input1_data,
                const RuntimeShape& input2_shape, const int16* input2_data,
                const RuntimeShape& output_shape, uint8* output_data) {
  gemmlowp::ScopedProfilingLabel label("Mul/Int16Uint8");
  int32 output_offset = params.output_offset;
  int32 output_activation_min = params.quantized_activation_min;
  int32 output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);

  const int flat_size =
      MatchingFlatSize(input1_shape, input2_shape, output_shape);

  for (int i = 0; i < flat_size; i++) {
    // F0 uses 0 integer bits, range [-1, 1].
    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;

    F0 unclamped_result =
        F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
    int16 rescaled_result =
        gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
    int16 clamped_result =
        std::min<int16>(output_activation_max - output_offset, rescaled_result);
    clamped_result =
        std::max<int16>(output_activation_min - output_offset, clamped_result);
    output_data[i] = output_offset + clamped_result;
  }
}
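
// Note (illustration only): the Q0.15 product above has raw range
// [-32768, 32767]; RoundingDivideByPOT(..., 8) divides by 2^8, mapping it into
// roughly [-128, 128) so that adding output_offset re-centers the value on the
// uint8 grid before the activation clamp.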
1184
1185 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
1186 // dimensionality if the runtime code does a single loop over one dimension
1187 // that handles broadcasting as the base case. The code generator would then
1188 // generate max(D1, D2) nested for loops.
1189 template <typename T>
BroadcastDiv4DSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)1190 void BroadcastDiv4DSlow(const ArithmeticParams& params,
1191 const RuntimeShape& unextended_input1_shape,
1192 const T* input1_data,
1193 const RuntimeShape& unextended_input2_shape,
1194 const T* input2_data,
1195 const RuntimeShape& unextended_output_shape,
1196 T* output_data) {
1197 T output_activation_min;
1198 T output_activation_max;
1199 GetActivationParams(params, &output_activation_min, &output_activation_max);
1200
1201 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
1202 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
1203 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
1204 const RuntimeShape output_shape =
1205 RuntimeShape::ExtendedShape(4, unextended_output_shape);
1206
1207 NdArrayDesc<4> desc1;
1208 NdArrayDesc<4> desc2;
1209 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
1210 unextended_input2_shape, &desc1, &desc2);
1211
1212 // In Tensorflow, the dimensions are canonically named (batch_number, row,
1213 // col, channel), with extents (batches, height, width, depth), with the
1214 // trailing dimension changing most rapidly (channels has the smallest
1215 // stride, typically 1 element).
1216 //
1217 // In generated C code, we store arrays with the dimensions reversed. The
1218 // first dimension has smallest stride.
1219 //
1220 // We name our variables by their Tensorflow convention, but generate C code
1221 // nesting loops such that the innermost loop has the smallest stride for
1222 // the best cache behavior.
1223 for (int b = 0; b < output_shape.Dims(0); ++b) {
1224 for (int y = 0; y < output_shape.Dims(1); ++y) {
1225 for (int x = 0; x < output_shape.Dims(2); ++x) {
1226 for (int c = 0; c < output_shape.Dims(3); ++c) {
1227 output_data[Offset(output_shape, b, y, x, c)] =
1228 ActivationFunctionWithMinMax(
1229 input1_data[SubscriptToIndex(desc1, b, y, x, c)] /
1230 input2_data[SubscriptToIndex(desc2, b, y, x, c)],
1231 output_activation_min, output_activation_max);
1232 }
1233 }
1234 }
1235 }
1236 }
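// Illustrative usage sketch for BroadcastDiv4DSlow (not part of the library
// API; assumes RuntimeShape's initializer_list constructor and only sets the
// ArithmeticParams fields read above):
//
//   float input[4] = {2.f, 4.f, 6.f, 8.f};
//   float divisor[1] = {2.f};
//   float output[4];
//   tflite::ArithmeticParams div_params;
//   div_params.float_activation_min = std::numeric_limits<float>::lowest();
//   div_params.float_activation_max = std::numeric_limits<float>::max();
//   BroadcastDiv4DSlow(div_params, RuntimeShape({1, 2, 2, 1}), input,
//                      RuntimeShape({1, 1, 1, 1}), divisor,
//                      RuntimeShape({1, 2, 2, 1}), output);
//   // The scalar divisor is broadcast over all elements: output is
//   // {1.f, 2.f, 3.f, 4.f}.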
1237
1238 template <typename T>
1239 inline void Div(const ArithmeticParams& params,
1240 const RuntimeShape& input1_shape, const T* input1_data,
1241 const RuntimeShape& input2_shape, const T* input2_data,
1242 const RuntimeShape& output_shape, T* output_data) {
1243 T output_activation_min;
1244 T output_activation_max;
1245 GetActivationParams(params, &output_activation_min, &output_activation_max);
1246
1247 const int flat_size =
1248 MatchingFlatSize(input1_shape, input2_shape, output_shape);
1249 for (int i = 0; i < flat_size; ++i) {
1250 output_data[i] = ActivationFunctionWithMinMax(
1251 input1_data[i] / input2_data[i], output_activation_min,
1252 output_activation_max);
1253 }
1254 }
1255
1256 inline void SubNonBroadcast(const ArithmeticParams& params,
1257 const RuntimeShape& input1_shape,
1258 const float* input1_data,
1259 const RuntimeShape& input2_shape,
1260 const float* input2_data,
1261 const RuntimeShape& output_shape,
1262 float* output_data) {
1263 const int flat_size =
1264 MatchingFlatSize(input1_shape, input2_shape, output_shape);
1265 for (int i = 0; i < flat_size; ++i) {
1266 output_data[i] = ActivationFunctionWithMinMax(
1267 input1_data[i] - input2_data[i], params.float_activation_min,
1268 params.float_activation_max);
1269 }
1270 }
1271
1272 inline void SubNonBroadcast(const ArithmeticParams& params,
1273 const RuntimeShape& input1_shape,
1274 const int32* input1_data,
1275 const RuntimeShape& input2_shape,
1276 const int32* input2_data,
1277 const RuntimeShape& output_shape,
1278 int32* output_data) {
1279 const int flat_size =
1280 MatchingFlatSize(input1_shape, input2_shape, output_shape);
1281 for (int i = 0; i < flat_size; ++i) {
1282 output_data[i] = ActivationFunctionWithMinMax(
1283 input1_data[i] - input2_data[i], params.quantized_activation_min,
1284 params.quantized_activation_max);
1285 }
1286 }
1287
1288 // TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
1289 // dimensionality if the runtime code does a single loop over one dimension
1290 // that handles broadcasting as the base case. The code generator would then
1291 // generate max(D1, D2) nested for loops.
1292 // TODO(benoitjacob): BroadcastSub is intentionally duplicated from
1293 // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
1294 // is no longer referenced in this file, move NdArrayDesc<T> from types.h to
1295 // reference_ops.h.
1296 inline void BroadcastSub4DSlow(const ArithmeticParams& params,
1297 const RuntimeShape& input1_shape,
1298 const float* input1_data,
1299 const RuntimeShape& input2_shape,
1300 const float* input2_data,
1301 const RuntimeShape& output_shape,
1302 float* output_data) {
1303 gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/float");
1304 NdArrayDesc<4> desc1;
1305 NdArrayDesc<4> desc2;
1306 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
1307 &desc2);
1308 const RuntimeShape extended_output_shape =
1309 RuntimeShape::ExtendedShape(4, output_shape);
1310
1311 // In Tensorflow, the dimensions are canonically named (batch_number, row,
1312 // col, channel), with extents (batches, height, width, depth), with the
1313 // trailing dimension changing most rapidly (channels has the smallest stride,
1314 // typically 1 element).
1315 //
1316 // In generated C code, we store arrays with the dimensions reversed. The
1317 // first dimension has smallest stride.
1318 //
1319 // We name our variables by their Tensorflow convention, but generate C code
1320 // nesting loops such that the innermost loop has the smallest stride for the
1321 // best cache behavior.
1322 for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
1323 for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
1324 for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
1325 for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
1326 output_data[Offset(extended_output_shape, b, y, x, c)] =
1327 ActivationFunctionWithMinMax(
1328 input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
1329 input2_data[SubscriptToIndex(desc2, b, y, x, c)],
1330 params.float_activation_min, params.float_activation_max);
1331 }
1332 }
1333 }
1334 }
1335 }
1336
1337 inline void BroadcastSub4DSlow(const ArithmeticParams& params,
1338 const RuntimeShape& input1_shape,
1339 const uint8* input1_data,
1340 const RuntimeShape& input2_shape,
1341 const uint8* input2_data,
1342 const RuntimeShape& output_shape,
1343 uint8* output_data) {
1344 gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/uint8");
1345 NdArrayDesc<4> desc1;
1346 NdArrayDesc<4> desc2;
1347 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
1348 &desc2);
1349 const RuntimeShape extended_output_shape =
1350 RuntimeShape::ExtendedShape(4, output_shape);
1351
1352 // In Tensorflow, the dimensions are canonically named (batch_number, row,
1353 // col, channel), with extents (batches, height, width, depth), with the
1354 // trailing dimension changing most rapidly (channels has the smallest stride,
1355 // typically 1 element).
1356 //
1357 // In generated C code, we store arrays with the dimensions reversed. The
1358 // first dimension has smallest stride.
1359 //
1360 // We name our variables by their Tensorflow convention, but generate C code
1361 // nesting loops such that the innermost loop has the smallest stride for the
1362 // best cache behavior.
1363 for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
1364 for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
1365 for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
1366 for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
1367 const int32 input1_val =
1368 params.input1_offset +
1369 input1_data[SubscriptToIndex(desc1, b, y, x, c)];
1370 const int32 input2_val =
1371 params.input2_offset +
1372 input2_data[SubscriptToIndex(desc2, b, y, x, c)];
1373 const int32 shifted_input1_val =
1374 input1_val * (1 << params.left_shift);
1375 const int32 shifted_input2_val =
1376 input2_val * (1 << params.left_shift);
1377 const int32 scaled_input1_val =
1378 MultiplyByQuantizedMultiplierSmallerThanOneExp(
1379 shifted_input1_val, params.input1_multiplier,
1380 params.input1_shift);
1381 const int32 scaled_input2_val =
1382 MultiplyByQuantizedMultiplierSmallerThanOneExp(
1383 shifted_input2_val, params.input2_multiplier,
1384 params.input2_shift);
1385 const int32 raw_sub = scaled_input1_val - scaled_input2_val;
1386 const int32 raw_output =
1387 MultiplyByQuantizedMultiplierSmallerThanOneExp(
1388 raw_sub, params.output_multiplier, params.output_shift) +
1389 params.output_offset;
1390 const int32 clamped_output =
1391 std::min(params.quantized_activation_max,
1392 std::max(params.quantized_activation_min, raw_output));
1393 output_data[Offset(extended_output_shape, b, y, x, c)] =
1394 static_cast<uint8>(clamped_output);
1395 }
1396 }
1397 }
1398 }
1399 }
1400
1401 inline void BroadcastSub4DSlow(const ArithmeticParams& params,
1402 const RuntimeShape& input1_shape,
1403 const int32* input1_data,
1404 const RuntimeShape& input2_shape,
1405 const int32* input2_data,
1406 const RuntimeShape& output_shape,
1407 int32* output_data) {
1408 gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/int32");
1409 NdArrayDesc<4> desc1;
1410 NdArrayDesc<4> desc2;
1411 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
1412 &desc2);
1413 const RuntimeShape extended_output_shape =
1414 RuntimeShape::ExtendedShape(4, output_shape);
1415
1416 // In Tensorflow, the dimensions are canonically named (batch_number, row,
1417 // col, channel), with extents (batches, height, width, depth), with the
1418 // trailing dimension changing most rapidly (channels has the smallest stride,
1419 // typically 1 element).
1420 //
1421 // In generated C code, we store arrays with the dimensions reversed. The
1422 // first dimension has smallest stride.
1423 //
1424 // We name our variables by their Tensorflow convention, but generate C code
1425 // nesting loops such that the innermost loop has the smallest stride for the
1426 // best cache behavior.
1427 for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
1428 for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
1429 for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
1430 for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
1431 output_data[Offset(extended_output_shape, b, y, x, c)] =
1432 ActivationFunctionWithMinMax(
1433 input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
1434 input2_data[SubscriptToIndex(desc2, b, y, x, c)],
1435 params.quantized_activation_min,
1436 params.quantized_activation_max);
1437 }
1438 }
1439 }
1440 }
1441 }
1442
1443 template <typename T>
1444 void BroadcastSub4DSlow(const ArithmeticParams& params,
1445 const RuntimeShape& input1_shape, const T* input1_data,
1446 const RuntimeShape& input2_shape, const T* input2_data,
1447 const RuntimeShape& output_shape, T* output_data) {
1448 gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/templated");
1449 NdArrayDesc<4> desc1;
1450 NdArrayDesc<4> desc2;
1451 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
1452 &desc2);
1453 const RuntimeShape extended_output_shape =
1454 RuntimeShape::ExtendedShape(4, output_shape);
1455
1456 // In Tensorflow, the dimensions are canonically named (batch_number, row,
1457 // col, channel), with extents (batches, height, width, depth), with the
1458 // trailing dimension changing most rapidly (channels has the smallest stride,
1459 // typically 1 element).
1460 //
1461 // In generated C code, we store arrays with the dimensions reversed. The
1462 // first dimension has smallest stride.
1463 //
1464 // We name our variables by their Tensorflow convention, but generate C code
1465 // nesting loops such that the innermost loop has the smallest stride for the
1466 // best cache behavior.
1467 for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
1468 for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
1469 for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
1470 for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
1471 output_data[Offset(extended_output_shape, b, y, x, c)] =
1472 ActivationFunctionWithMinMax(
1473 input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
1474 input2_data[SubscriptToIndex(desc2, b, y, x, c)],
1475 params.quantized_activation_min,
1476 params.quantized_activation_max);
1477 }
1478 }
1479 }
1480 }
1481 }
1482
1483 template <typename T>
1484 void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
1485 const T* input1_data, const RuntimeShape& input2_shape,
1486 const T* input2_data, const RuntimeShape& output_shape,
1487 T* output_data) {
1488 NdArrayDesc<4> desc1;
1489 NdArrayDesc<4> desc2;
1490 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
1491 &desc2);
1492 const RuntimeShape extended_output_shape =
1493 RuntimeShape::ExtendedShape(4, output_shape);
1494
1495 // In Tensorflow, the dimensions are canonically named (batch_number, row,
1496 // col, channel), with extents (batches, height, width, depth), with the
1497 // trailing dimension changing most rapidly (channels has the smallest stride,
1498 // typically 1 element).
1499 //
1500 // In generated C code, we store arrays with the dimensions reversed. The
1501 // first dimension has smallest stride.
1502 //
1503 // We name our variables by their Tensorflow convention, but generate C code
1504 // nesting loops such that the innermost loop has the smallest stride for the
1505 // best cache behavior.
1506 for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
1507 for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
1508 for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
1509 for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
1510 output_data[Offset(extended_output_shape, b, y, x, c)] =
1511 input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
1512 input2_data[SubscriptToIndex(desc2, b, y, x, c)];
1513 }
1514 }
1515 }
1516 }
1517 }
1518
1519 inline void SubWithActivation(const ArithmeticParams& params,
1520 const RuntimeShape& input1_shape,
1521 const int32* input1_data,
1522 const RuntimeShape& input2_shape,
1523 const int32* input2_data,
1524 const RuntimeShape& output_shape,
1525 int32* output_data) {
1526 gemmlowp::ScopedProfilingLabel label("SubWithActivation");
1527 const int flat_size =
1528 MatchingFlatSize(input1_shape, input2_shape, output_shape);
1529 for (int i = 0; i < flat_size; ++i) {
1530 output_data[i] = ActivationFunctionWithMinMax(
1531 input1_data[i] - input2_data[i], params.quantized_activation_min,
1532 params.quantized_activation_max);
1533 }
1534 }
1535
1536 inline void SubWithActivation(const ArithmeticParams& params,
1537 const RuntimeShape& input1_shape,
1538 const float* input1_data,
1539 const RuntimeShape& input2_shape,
1540 const float* input2_data,
1541 const RuntimeShape& output_shape,
1542 float* output_data) {
1543 const int flat_size =
1544 MatchingFlatSize(input1_shape, input2_shape, output_shape);
1545 for (int i = 0; i < flat_size; ++i) {
1546 output_data[i] = ActivationFunctionWithMinMax(
1547 input1_data[i] - input2_data[i], params.float_activation_min,
1548 params.float_activation_max);
1549 }
1550 }
1551
1552 inline void Sub16(const ArithmeticParams& params,
1553 const RuntimeShape& input1_shape, const int16_t* input1_data,
1554 const RuntimeShape& input2_shape, const int16_t* input2_data,
1555 const RuntimeShape& output_shape, int16_t* output_data) {
1556 gemmlowp::ScopedProfilingLabel label("Sub/Int16");
1557 const int input1_shift = params.input1_shift;
1558 const int flat_size =
1559 MatchingFlatSize(output_shape, input1_shape, input2_shape);
1560 const int16 output_activation_min = params.quantized_activation_min;
1561 const int16 output_activation_max = params.quantized_activation_max;
1562
1563 TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
1564 TFLITE_DCHECK_LE(input1_shift, 0);
1565 TFLITE_DCHECK_LE(params.input2_shift, 0);
1566 const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
1567 const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
1568 const int input_right_shift =
1569 input1_shift == 0 ? -params.input2_shift : -input1_shift;
1570
1571 if (input1_shift == 0) {
1572 // F0 uses 0 integer bits, range [-1, 1].
1573 using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
1574 for (int i = 0; i < flat_size; ++i) {
1575 F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
1576 F0 scaled_input = F0::FromRaw(
1577 gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
1578 F0 result = SaturatingSub(input_ready_scaled, scaled_input);
1579 const int16 raw_output = result.raw();
1580 const int16 clamped_output = std::min(
1581 output_activation_max, std::max(output_activation_min, raw_output));
1582 output_data[i] = clamped_output;
1583 }
1584 } else {
1585 // F0 uses 0 integer bits, range [-1, 1].
1586 using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
1587 for (int i = 0; i < flat_size; ++i) {
1588 F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
1589 F0 scaled_input = F0::FromRaw(
1590 gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
1591 F0 result = SaturatingSub(scaled_input, input_ready_scaled);
1592 const int16 raw_output = result.raw();
1593 const int16 clamped_output = std::min(
1594 output_activation_max, std::max(output_activation_min, raw_output));
1595 output_data[i] = clamped_output;
1596 }
1597 }
1598 }
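// Worked example of the input alignment above (illustrative): with
// input1_shift == 0 and params.input2_shift == -3, input_right_shift is 3,
// so an input2 raw value of 8000 is first rescaled to
// RoundingDivideByPOT(8000, 3) == 1000 before the saturating subtraction
// against the un-shifted input1 value.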
1599
1600 template <typename Scalar>
1601 inline void Concatenation(const ConcatenationParams& params,
1602 const RuntimeShape* const* input_shapes,
1603 const Scalar* const* input_data,
1604 const RuntimeShape& output_shape,
1605 Scalar* output_data) {
1606 gemmlowp::ScopedProfilingLabel label("Concatenation");
1607 int axis = params.axis;
1608 int inputs_count = params.inputs_count;
1609 const int concat_dimensions = output_shape.DimensionsCount();
1610 TFLITE_DCHECK_LT(axis, concat_dimensions);
1611
1612 int64_t concat_size = 0;
1613 for (int i = 0; i < inputs_count; i++) {
1614 TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions);
1615 for (int j = 0; j < concat_dimensions; j++) {
1616 if (j != axis) {
1617 MatchingDim(*input_shapes[i], j, output_shape, j);
1618 }
1619 }
1620 concat_size += input_shapes[i]->Dims(axis);
1621 }
1622 TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
1623 int64_t outer_size = 1;
1624 for (int i = 0; i < axis; ++i) {
1625 outer_size *= output_shape.Dims(i);
1626 }
1627 // For all input arrays,
1628 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
1629 int64_t base_inner_size = 1;
1630 for (int i = axis + 1; i < concat_dimensions; ++i) {
1631 base_inner_size *= output_shape.Dims(i);
1632 }
1633
1634 Scalar* output_ptr = output_data;
1635 for (int k = 0; k < outer_size; k++) {
1636 for (int i = 0; i < inputs_count; ++i) {
1637 const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
1638 memcpy(output_ptr, input_data[i] + k * copy_size,
1639 copy_size * sizeof(Scalar));
1640 output_ptr += copy_size;
1641 }
1642 }
1643 }
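// Illustrative usage sketch for Concatenation (not part of the library API;
// assumes RuntimeShape's initializer_list constructor): concatenating two
// 1-D tensors of sizes 2 and 3 along axis 0.
//
//   float in0[2] = {1.f, 2.f};
//   float in1[3] = {3.f, 4.f, 5.f};
//   float out[5];
//   const RuntimeShape shape0({2});
//   const RuntimeShape shape1({3});
//   const RuntimeShape* input_shapes[2] = {&shape0, &shape1};
//   const float* input_data[2] = {in0, in1};
//   tflite::ConcatenationParams concat_params;
//   concat_params.axis = 0;
//   concat_params.inputs_count = 2;
//   Concatenation(concat_params, input_shapes, input_data,
//                 RuntimeShape({5}), out);
//   // out is now {1, 2, 3, 4, 5}.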
1644
1645 // TODO(prabhumk): This is the same as the optimized implementation.
1646 // TODO(prabhumk): The quantized implementation of concatenation isn't fully
1647 // quantized as it takes scale as a floating point value. This should be fixed
1648 // when optimizing this routine further.
1649 inline void ConcatenationWithScaling(const ConcatenationParams& params,
1650 const RuntimeShape* const* input_shapes,
1651 const uint8* const* input_data,
1652 const RuntimeShape& output_shape,
1653 uint8* output_data) {
1654 gemmlowp::ScopedProfilingLabel label("ConcatenationWithScaling/Uint8");
1655 int axis = params.axis;
1656 const int32* input_zeropoint = params.input_zeropoint;
1657 const float* input_scale = params.input_scale;
1658 int inputs_count = params.inputs_count;
1659 const int32 output_zeropoint = params.output_zeropoint;
1660 const float output_scale = params.output_scale;
1661
1662 const int concat_dimensions = output_shape.DimensionsCount();
1663 TFLITE_DCHECK_LT(axis, concat_dimensions);
1664
1665 int64_t concat_size = 0;
1666 for (int i = 0; i < inputs_count; i++) {
1667 TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions);
1668 for (int j = 0; j < concat_dimensions; j++) {
1669 if (j != axis) {
1670 MatchingDim(*input_shapes[i], j, output_shape, j);
1671 }
1672 }
1673 concat_size += input_shapes[i]->Dims(axis);
1674 }
1675 TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis));
1676 int64_t outer_size = 1;
1677 for (int i = 0; i < axis; ++i) {
1678 outer_size *= output_shape.Dims(i);
1679 }
1680 // For all input arrays,
1681 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
1682 int64_t base_inner_size = 1;
1683 for (int i = axis + 1; i < concat_dimensions; ++i) {
1684 base_inner_size *= output_shape.Dims(i);
1685 }
1686
1687 const float inverse_output_scale = 1.f / output_scale;
1688 uint8* output_ptr = output_data;
1689 for (int k = 0; k < outer_size; k++) {
1690 for (int i = 0; i < inputs_count; ++i) {
1691 const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
1692 const uint8* input_ptr = input_data[i] + k * copy_size;
1693 if (input_zeropoint[i] == output_zeropoint &&
1694 input_scale[i] == output_scale) {
1695 memcpy(output_ptr, input_ptr, copy_size);
1696 } else {
1697 const float scale = input_scale[i] * inverse_output_scale;
1698 const float bias = -input_zeropoint[i] * scale;
1699 for (int j = 0; j < copy_size; ++j) {
1700 const int32_t value =
1701 static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
1702 output_zeropoint;
1703 output_ptr[j] =
1704 static_cast<uint8_t>(std::max(std::min(255, value), 0));
1705 }
1706 }
1707 output_ptr += copy_size;
1708 }
1709 }
1710 }
1711
1712 template <typename Scalar>
1713 void Pack(const PackParams& params, const RuntimeShape* const* input_shapes,
1714 const Scalar* const* input_data, const RuntimeShape& output_shape,
1715 Scalar* output_data) {
1716 gemmlowp::ScopedProfilingLabel label("Pack");
1717 const int dimensions = output_shape.DimensionsCount();
1718 int axis = params.axis;
1719 int inputs_count = params.inputs_count;
1720
1721 int outer_size = 1;
1722 for (int i = 0; i < axis; i++) {
1723 outer_size *= output_shape.Dims(i);
1724 }
1725 int copy_size = 1;
1726 for (int i = params.axis + 1; i < dimensions; i++) {
1727 copy_size *= output_shape.Dims(i);
1728 }
1729 TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
1730
1731 for (int i = 0; i < inputs_count; ++i) {
1732 for (int k = 0; k < outer_size; k++) {
1733 const Scalar* input_ptr = input_data[i] + copy_size * k;
1734 int loc = k * inputs_count * copy_size + i * copy_size;
1735 memcpy(output_data + loc, input_ptr, copy_size * sizeof(Scalar));
1736 }
1737 }
1738 }
1739
1740 template <typename Scalar>
1741 void Unpack(const UnpackParams& params, const RuntimeShape& input_shape,
1742 const Scalar* input_data, const RuntimeShape& output_shape,
1743 Scalar* const* output_datas) {
1744 gemmlowp::ScopedProfilingLabel label("Unpack");
1745 const int dimensions = input_shape.DimensionsCount();
1746 const int outputs_count = params.num_split;
1747
1748 int outer_size = 1;
1749 int axis = params.axis;
1750 if (axis < 0) {
1751 axis += dimensions;
1752 }
1753 TFLITE_DCHECK_GE(axis, 0);
1754 TFLITE_DCHECK_LT(axis, dimensions);
1755 for (int i = 0; i < axis; ++i) {
1756 outer_size *= input_shape.Dims(i);
1757 }
1758 int copy_size = 1;
1759 for (int i = axis + 1; i < dimensions; ++i) {
1760 copy_size *= input_shape.Dims(i);
1761 }
1762 TFLITE_DCHECK_EQ(output_shape.FlatSize(), copy_size * outer_size);
1763
1764 for (int i = 0; i < outputs_count; ++i) {
1765 for (int k = 0; k < outer_size; k++) {
1766 Scalar* output_ptr = output_datas[i] + copy_size * k;
1767 int loc = k * outputs_count * copy_size + i * copy_size;
1768 memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
1769 }
1770 }
1771 }
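// Illustrative round-trip sketch for Pack/Unpack (not part of the library
// API; assumes RuntimeShape's initializer_list constructor): two length-3
// vectors are packed along a new leading axis and then unpacked again.
//
//   float a[3] = {1.f, 2.f, 3.f};
//   float b[3] = {4.f, 5.f, 6.f};
//   float packed[6];
//   const RuntimeShape vec_shape({3});
//   const RuntimeShape* in_shapes[2] = {&vec_shape, &vec_shape};
//   const float* in_data[2] = {a, b};
//   tflite::PackParams pack_params;
//   pack_params.axis = 0;
//   pack_params.inputs_count = 2;
//   Pack(pack_params, in_shapes, in_data, RuntimeShape({2, 3}), packed);
//   // packed is now {1, 2, 3, 4, 5, 6}.
//
//   float a_out[3];
//   float b_out[3];
//   float* outs[2] = {a_out, b_out};
//   tflite::UnpackParams unpack_params;
//   unpack_params.axis = 0;
//   unpack_params.num_split = 2;
//   Unpack(unpack_params, RuntimeShape({2, 3}), packed, vec_shape, outs);
//   // a_out and b_out hold the original vectors again.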
1772
1773 template <typename Scalar>
1774 void PackWithScaling(const PackParams& params,
1775 const RuntimeShape* const* input_shapes,
1776 const uint8* const* input_data,
1777 const RuntimeShape& output_shape, uint8* output_data) {
1778 gemmlowp::ScopedProfilingLabel label("PackWithScaling");
1779 const int dimensions = output_shape.DimensionsCount();
1780 int axis = params.axis;
1781 const int32* input_zeropoint = params.input_zeropoint;
1782 const float* input_scale = params.input_scale;
1783 int inputs_count = params.inputs_count;
1784 const int32 output_zeropoint = params.output_zeropoint;
1785 const float output_scale = params.output_scale;
1786
1787 int outer_size = 1;
1788 for (int i = 0; i < axis; i++) {
1789 outer_size *= output_shape.Dims(i);
1790 }
1791 int copy_size = 1;
1792 for (int i = axis + 1; i < dimensions; i++) {
1793 copy_size *= output_shape.Dims(i);
1794 }
1795 TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
1796
1797 uint8* output_ptr = output_data;
1798 const float inverse_output_scale = 1.f / output_scale;
1799 for (int k = 0; k < outer_size; k++) {
1800 for (int i = 0; i < inputs_count; ++i) {
1801 if (input_zeropoint[i] == output_zeropoint &&
1802 input_scale[i] == output_scale) {
1803 memcpy(output_ptr, input_data[i] + k * copy_size,
1804 copy_size * sizeof(Scalar));
1805 } else {
1806 assert(false);
1807 const float scale = input_scale[i] * inverse_output_scale;
1808 const float bias = -input_zeropoint[i] * scale;
1809 auto input_ptr = input_data[i];
1810 for (int j = 0; j < copy_size; ++j) {
1811 const int32_t value =
1812 static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
1813 output_zeropoint;
1814 output_ptr[j] =
1815 static_cast<uint8_t>(std::max(std::min(255, value), 0));
1816 }
1817 }
1818 output_ptr += copy_size;
1819 }
1820 }
1821 }
1822
1823 template <typename Scalar>
1824 void DepthConcatenation(const ConcatenationParams& params,
1825 const RuntimeShape* const* input_shapes,
1826 const Scalar* const* input_data,
1827 const RuntimeShape& output_shape, Scalar* output_data) {
1828 gemmlowp::ScopedProfilingLabel label("DepthConcatenation");
1829 auto params_copy = params;
1830 params_copy.axis = 3;
1831 Concatenation(params_copy, input_shapes, input_data, output_shape,
1832 output_data);
1833 }
1834
1835 inline void LstmCell(
1836 const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
1837 const float* input_data, const RuntimeShape& unextended_prev_activ_shape,
1838 const float* prev_activ_data, const RuntimeShape& weights_shape,
1839 const float* weights_data, const RuntimeShape& unextended_bias_shape,
1840 const float* bias_data, const RuntimeShape& unextended_prev_state_shape,
1841 const float* prev_state_data,
1842 const RuntimeShape& unextended_output_state_shape, float* output_state_data,
1843 const RuntimeShape& unextended_output_activ_shape, float* output_activ_data,
1844 const RuntimeShape& unextended_concat_temp_shape, float* concat_temp_data,
1845 const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data) {
1846 TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
1847 TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
1848 TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
1849 TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
1850 TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
1851 TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
1852 TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
1853 TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
1854 const RuntimeShape input_shape =
1855 RuntimeShape::ExtendedShape(4, unextended_input_shape);
1856 const RuntimeShape prev_activ_shape =
1857 RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
1858 const RuntimeShape bias_shape =
1859 RuntimeShape::ExtendedShape(4, unextended_bias_shape);
1860 const RuntimeShape prev_state_shape =
1861 RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
1862 const RuntimeShape output_state_shape =
1863 RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
1864 const RuntimeShape output_activ_shape =
1865 RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
1866 const RuntimeShape concat_temp_shape =
1867 RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
1868 const RuntimeShape activ_temp_shape =
1869 RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
1870 TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
1871
1872 const int weights_dim_count = weights_shape.DimensionsCount();
1873 const int batches =
1874 MatchingDim(input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0,
1875 output_state_shape, 0, output_activ_shape, 0);
1876 const int height =
1877 MatchingDim(input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1,
1878 output_state_shape, 1, output_activ_shape, 1);
1879 const int width =
1880 MatchingDim(input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2,
1881 output_state_shape, 2, output_activ_shape, 2);
1882 const int input_depth = input_shape.Dims(3);
1883 const int prev_activ_depth = prev_activ_shape.Dims(3);
1884 const int total_input_depth = prev_activ_depth + input_depth;
1885 TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1),
1886 total_input_depth);
1887 TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
1888 const int intern_activ_depth =
1889 MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
1890 TFLITE_DCHECK_EQ(weights_shape.FlatSize(),
1891 intern_activ_depth * total_input_depth);
1892 TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
1893 const int output_depth =
1894 MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
1895 3, output_activ_shape, 3);
1896 TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
1897
1898 // Concatenate prev_activ and input data together
1899 std::vector<float const*> concat_input_arrays_data;
1900 std::vector<RuntimeShape const*> concat_input_arrays_shapes;
1901 concat_input_arrays_data.push_back(input_data);
1902 concat_input_arrays_data.push_back(prev_activ_data);
1903 concat_input_arrays_shapes.push_back(&input_shape);
1904 concat_input_arrays_shapes.push_back(&prev_activ_shape);
1905 tflite::ConcatenationParams concat_params;
1906 concat_params.axis = 3;
1907 concat_params.inputs_count = concat_input_arrays_data.size();
1908 Concatenation(concat_params, &(concat_input_arrays_shapes[0]),
1909 &(concat_input_arrays_data[0]), concat_temp_shape,
1910 concat_temp_data);
1911
1912 // Fully connected
1913 tflite::FullyConnectedParams fc_params;
1914 fc_params.float_activation_min = std::numeric_limits<float>::lowest();
1915 fc_params.float_activation_max = std::numeric_limits<float>::max();
1916 FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape,
1917 weights_data, bias_shape, bias_data, activ_temp_shape,
1918 activ_temp_data);
1919
1920 // Memory state update (the LSTM "guts")
1921 for (int b = 0; b < batches; ++b) {
1922 for (int w = 0; w < width; ++w) {
1923 for (int h = 0; h < height; ++h) {
1924 for (int c = 0; c < output_depth; ++c) {
1925 const float input_gate =
1926 1.f /
1927 (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w,
1928 0 * output_depth + c)]));
1929 const float new_input = std::tanh(activ_temp_data[Offset(
1930 activ_temp_shape, b, h, w, 1 * output_depth + c)]);
1931 const float forget_gate =
1932 1.f /
1933 (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w,
1934 2 * output_depth + c)]));
1935 const float output_gate =
1936 1.f /
1937 (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w,
1938 3 * output_depth + c)]));
1939 const float new_state =
1940 input_gate * new_input +
1941 forget_gate *
1942 prev_state_data[Offset(prev_state_shape, b, h, w, c)];
1943 output_state_data[Offset(output_state_shape, b, h, w, c)] = new_state;
1944 output_activ_data[Offset(output_activ_shape, b, h, w, c)] =
1945 output_gate * std::tanh(new_state);
1946 }
1947 }
1948 }
1949 }
1950 }
1951
1952 // Quantized LSTM cell implementation.
1953 // The quantization of the input, output arrays is as follows:
1954 // - The input activations are quantized as uint8 on the interval
1955 // [-1, 127/128].
1956 // The rationale for that is that it is the natural interval for output
1957 // activations (see next point) and these need to be concatenated together.
1958 // We could accommodate different ranges by re-scaling, but we empirically
1959 // found that setting the input activations range to be [-1, 127/128] in the
1960 // first place, removing the need for re-scaling, greatly improves accuracy.
1961 // - The output activations are quantized as uint8 on the interval
1962 // [-1, 127/128].
1963 // The rationale for that is that the definition of an LSTM cell makes them
1964 // intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128]
1965 // makes for simpler, more accurate fixed-point arithmetic.
1966 // - The output-at-previous-timestep state array is obviously quantized in
1967 // the same way as the output activations.
1968 // - The internal LSTM memory (not the output-at-previous-timestep, the other
1969 // internal state array) is int16-quantized and may use any power-of-two,
1970 // symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call
1971 // StateIntegerBits below, see the below discussion of that template
1972 // parameter ("The StateIntegerBits template parameter").
1973 // - The output of the internal fully-connected node is int16-quantized
1974 // on the interval [-8, 8 * 32767/32768], the rationale for which is
1975 // explained just below ("Why [-8, 8] for fully-connected output?").
1976 //
1977 //
1978 // === The StateIntegerBits template parameter ===
1979 //
1980 // The StateIntegerBits template parameter controls the fixed-point format used
1981 // to represent the internal memory of the LSTM cell (not the
1982 // output-at-previous-timestep, the other internal state array). It's currently
1983 // a template parameter so that the model can control that. The most typical
1984 // value for StateIntegerBits is 4. Other plausible values are anywhere between
1985 // 3 and 5. We might eventually standardize on a single supported value, e.g. 4,
1986 // and drop that template parameter. The reason why it can't be a runtime
1987 // parameter is that this controls the fixed-point format used, i.e. we need to
1988 // generate actually different code based on it. In particular, we generate code
1989 // for a fixed-point tanh() implementation for that format, which internally
1990 // uses a fixed-point exp() implementation, which internally uses a
1991 // barrel-shifter with a number of steps that depends on StateIntegerBits.
1992 // Another consequence of that is that a higher value of StateIntegerBits
1993 // results in a more expensive implementation (more barrel shifter steps
1994 // needed).
1995 //
1996 //
1997 // === Why [-8, 8] for fully-connected output? ===
1998 //
1999 // This array is only fed to Logistic and Tanh functions, for which
2000 // the quantized implementation will want to use fixed-point arithmetic,
2001 // requiring a power-of-two representation interval. Thus, we should right
2002 // away quantize this array to a power-of-two interval; otherwise,
2003 // implementation will need to rescale that, losing any benefit that a tighter
2004 // representation interval might otherwise yield, while introducing some
2005 // numerical error and computational overhead.
2006 //
2007 // Now, Logistic and Tanh
2008 // are nearly constant (nearly equal to their horizontal asymptotes)
2009 // outside of a small bounded interval around 0:
2010 //
2011 // Logistic(4) = 1 - 1.8e-2 Tanh(4) = 1 - 6.7e-4
2012 // Logistic(8) = 1 - 3.4e-4 Tanh(8) = 1 - 2.3e-7
2013 // Logistic(16) = 1 - 1.1e-7 Tanh(16) = 1 - 2.5e-14
2014 //
2015 // From this, we see that clamping to [-4, 4] would be too inaccurate
2016 // (the error of 1.8e-2 on Logistic would be felt even in 8bit precision)
2017 // while clamping to [-16, 16] would make no difference even in float32.
2018 // However, for a fixed-point implementation in 16-bit integers, using 5
2019 // integer bits to represent the [-16, 16] range would leave only 11
2020 // fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
2021 // representable values. Notice that this is higher than the
2022 // worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
2023 // Using [-8, 8] thus seems like the better compromise overall, enjoying
2024 // an increment of 2.4e-4 between representable values and a worst-case
2025 // clamping error of 3.4e-4, both better than the increment of 4.9e-4 with
2026 // [-16, 16].
2027 //
2028 // Moreover, all other things being equal, it is nice to choose the narrower
2029 // representation range, as that makes the implementation of fixed-point
2030 // math functions a little cheaper (each integer bit requires an additional
2031 // barrel-shifter step in the implementation of exp(-x)). That is further
2032 // reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
2033 // sense for 32-bit float or 32-bit fixed-point quantization, but we are
2034 // aiming for 16-bit fixed-point quantization of these internal nodes here.
2035 //
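// As a concrete illustration of the formats above (not part of the
// implementation): with 3 integer bits, the int16 raw value 4096 represents
// 4096 / 2^12 = 1.0 and the representable increment is 2^-12 ~= 2.4e-4,
// matching the figures quoted above; with StateIntegerBits = 4, the internal
// state raw value 2048 represents 2048 / 2^11 = 1.0.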
2036 template <int StateIntegerBits>
2037 inline void LstmCell(
2038 const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
2039 const uint8* input_data_uint8,
2040 const RuntimeShape& unextended_prev_activ_shape,
2041 const uint8* prev_activ_data_uint8, const RuntimeShape& weights_shape,
2042 const uint8* weights_data_uint8, const RuntimeShape& unextended_bias_shape,
2043 const int32* bias_data_int32,
2044 const RuntimeShape& unextended_prev_state_shape,
2045 const int16* prev_state_data_int16,
2046 const RuntimeShape& unextended_output_state_shape,
2047 int16* output_state_data_int16,
2048 const RuntimeShape& unextended_output_activ_shape,
2049 uint8* output_activ_data_uint8,
2050 const RuntimeShape& unextended_concat_temp_shape,
2051 uint8* concat_temp_data_uint8,
2052 const RuntimeShape& unextended_activ_temp_shape,
2053 int16* activ_temp_data_int16, gemmlowp::GemmContext* gemm_context) {
2054 (void)gemm_context; // only used in optimized code.
2055 int32 weights_zero_point = params.weights_zero_point;
2056 int32 accum_multiplier = params.accum_multiplier;
2057 int accum_shift = params.accum_shift;
2058 TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
2059 TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
2060 TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
2061 TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
2062 TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
2063 TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
2064 TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
2065 TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
2066 const RuntimeShape input_shape =
2067 RuntimeShape::ExtendedShape(4, unextended_input_shape);
2068 const RuntimeShape prev_activ_shape =
2069 RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
2070 const RuntimeShape bias_shape =
2071 RuntimeShape::ExtendedShape(4, unextended_bias_shape);
2072 const RuntimeShape prev_state_shape =
2073 RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
2074 const RuntimeShape output_state_shape =
2075 RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
2076 const RuntimeShape output_activ_shape =
2077 RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
2078 const RuntimeShape concat_temp_shape =
2079 RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
2080 const RuntimeShape activ_temp_shape =
2081 RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
2082 TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
2083
2084 // Gather dimensions information, and perform consistency checks.
2085 const int weights_dim_count = weights_shape.DimensionsCount();
2086 const int outer_size = MatchingFlatSizeSkipDim(
2087 input_shape, 3, prev_activ_shape, prev_state_shape, output_state_shape,
2088 output_activ_shape);
2089 const int input_depth = input_shape.Dims(3);
2090 const int prev_activ_depth = prev_activ_shape.Dims(3);
2091 const int total_input_depth = prev_activ_depth + input_depth;
2092 TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1),
2093 total_input_depth);
2094 const int intern_activ_depth =
2095 MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
2096 TFLITE_DCHECK_EQ(weights_shape.FlatSize(),
2097 intern_activ_depth * total_input_depth);
2098 TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
2099 TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
2100 const int output_depth =
2101 MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
2102 3, output_activ_shape, 3);
2103 TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
2104 const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3);
2105 const int fc_output_depth =
2106 MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3);
2107 const int fc_accum_depth = total_input_depth;
2108 TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth);
2109
2110 // Depth-concatenate prev_activ and input data together.
2111 uint8 const* concat_input_arrays_data[2] = {input_data_uint8,
2112 prev_activ_data_uint8};
2113 const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape,
2114 &prev_activ_shape};
2115 tflite::ConcatenationParams concat_params;
2116 concat_params.axis = 3;
2117 concat_params.inputs_count = 2;
2118 Concatenation(concat_params, concat_input_arrays_shapes,
2119 concat_input_arrays_data, concat_temp_shape,
2120 concat_temp_data_uint8);
2121
2122 // Implementation of the fully connected node inside the LSTM cell.
2123 // The operands are 8-bit integers, the accumulators are internally 32bit
2124 // integers, and the output is 16-bit fixed-point with 3 integer bits so
2125 // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
2126 // is explained in the function comment above.
2127 for (int b = 0; b < fc_batches; ++b) {
2128 for (int out_c = 0; out_c < fc_output_depth; ++out_c) {
2129 // Internal accumulation.
2130 // Initialize accumulator with the bias-value.
2131 int32 accum = bias_data_int32[out_c];
2132 // Accumulation loop.
2133 for (int d = 0; d < fc_accum_depth; ++d) {
2134 int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
2135 int16 weights_val =
2136 weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
2137 accum += input_val * weights_val;
2138 }
2139 // Down-scale the final int32 accumulator to the scale used by our
2140 // (16-bit, using 3 integer bits) fixed-point format. The quantized
2141 // multiplier and shift here have been pre-computed offline
2142 // (e.g. by toco).
2143 accum =
2144 MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
2145 // Saturate, cast to int16, and store to the temporary activations array.
2146 accum = std::max(-32768, std::min(32767, accum));
2147 activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
2148 }
2149 }
2150
2151 // Rest of the LSTM cell: tanh and logistic math functions, and some adds
2152 // and muls, all done in 16-bit fixed-point.
2153 for (int b = 0; b < outer_size; ++b) {
2154 for (int c = 0; c < output_depth; ++c) {
2155 // Define the fixed-point data types that we will use here. All use
2156 // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
2157 // They only differ by the number of integral vs. fractional bits,
2158 // determining the range of values that they can represent.
2159 //
2160 // F0 uses 0 integer bits, range [-1, 1].
2161 // This is the return type of math functions such as tanh, logistic,
2162 // whose range is in [-1, 1].
2163 using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
2164 // F3 uses 3 integer bits, range [-8, 8].
2165 // This is the range of the previous fully-connected node's output,
2166 // which is our input here.
2167 using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
2168 // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
2169 // 2^StateIntegerBits]. It's used to represent the internal state, whose
2170 // number of integer bits is currently dictated by the model. See comment
2171 // on the StateIntegerBits template parameter above.
2172 using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
2173 // Implementation of input gate, using fixed-point logistic function.
2174 F3 input_gate_input = F3::FromRaw(
2175 activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
2176 F0 input_gate_output = gemmlowp::logistic(input_gate_input);
2177 // Implementation of input modulation gate, using fixed-point tanh
2178 // function.
2179 F3 input_modulation_gate_input = F3::FromRaw(
2180 activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
2181 F0 input_modulation_gate_output =
2182 gemmlowp::tanh(input_modulation_gate_input);
2183 // Implementation of forget gate, using fixed-point logistic function.
2184 F3 forget_gate_input = F3::FromRaw(
2185 activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
2186 F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
2187 // Implementation of output gate, using fixed-point logistic function.
2188 F3 output_gate_input = F3::FromRaw(
2189 activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
2190 F0 output_gate_output = gemmlowp::logistic(output_gate_input);
2191 // Implementation of internal multiplication nodes, still in fixed-point.
2192 F0 input_times_input_modulation =
2193 input_gate_output * input_modulation_gate_output;
2194 FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
2195 FS prev_state_times_forget_state = forget_gate_output * prev_state;
2196 // Implementation of internal addition node, saturating.
2197 FS new_state = gemmlowp::SaturatingAdd(
2198 gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
2199 prev_state_times_forget_state);
2200 // Implementation of last internal Tanh node, still in fixed-point.
2201 // Since a Tanh fixed-point implementation is specialized for a given
2202 // number of integer bits, and each specialization can have a substantial
2203 // code size, and we already used above a Tanh on an input with 3 integer
2204 // bits, and per the table in the above function comment there is no
2205 // significant accuracy to be lost by clamping to [-8, +8] for a
2206 // 3-integer-bits representation, let us just do that. This helps people
2207 // porting this to targets where code footprint must be minimized.
2208 F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
2209 F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
2210 // Store the new internal state back to memory, as 16-bit integers.
2211 // Note: here we store the original value with StateIntegerBits, not
2212 // the rescaled 3-integer-bits value fed to tanh.
2213 output_state_data_int16[b * output_depth + c] = new_state.raw();
2214 // Down-scale the output activations to 8-bit integers, saturating,
2215 // and store back to memory.
2216 int16 rescaled_output_activ =
2217 gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
2218 int16 clamped_output_activ =
2219 std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
2220 output_activ_data_uint8[b * output_depth + c] =
2221 128 + clamped_output_activ;
2222 }
2223 }
2224 }
2225
2226 template <typename Scalar>
2227 void Split(const SplitParams& params, const RuntimeShape& input_shape,
2228 const Scalar* input_data, const RuntimeShape* const* output_shapes,
2229 Scalar* const* output_data) {
2230 gemmlowp::ScopedProfilingLabel label("Split");
2231 const int concat_dimensions = input_shape.DimensionsCount();
2232 int axis = params.axis < 0 ? params.axis + concat_dimensions : params.axis;
2233 int outputs_count = params.num_split;
2234 TFLITE_DCHECK_LT(axis, concat_dimensions);
2235
2236 int64_t concat_size = 0;
2237 for (int i = 0; i < outputs_count; i++) {
2238 TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), concat_dimensions);
2239 for (int j = 0; j < concat_dimensions; j++) {
2240 if (j != axis) {
2241 MatchingDim(*output_shapes[i], j, input_shape, j);
2242 }
2243 }
2244 concat_size += output_shapes[i]->Dims(axis);
2245 }
2246 TFLITE_DCHECK_EQ(concat_size, input_shape.Dims(axis));
2247 int64_t outer_size = 1;
2248 for (int i = 0; i < axis; ++i) {
2249 outer_size *= input_shape.Dims(i);
2250 }
2251 // For all output arrays,
2252 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
2253 int64_t base_inner_size = 1;
2254 for (int i = axis + 1; i < concat_dimensions; ++i) {
2255 base_inner_size *= input_shape.Dims(i);
2256 }
2257
2258 const Scalar* input_ptr = input_data;
2259 for (int k = 0; k < outer_size; k++) {
2260 for (int i = 0; i < outputs_count; ++i) {
2261 const int copy_size = output_shapes[i]->Dims(axis) * base_inner_size;
2262 memcpy(output_data[i] + k * copy_size, input_ptr,
2263 copy_size * sizeof(Scalar));
2264 input_ptr += copy_size;
2265 }
2266 }
2267 }
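// Illustrative usage sketch for Split (not part of the library API; assumes
// RuntimeShape's initializer_list constructor): splitting a length-5 vector
// into pieces of sizes 2 and 3 along axis 0.
//
//   float input[5] = {1.f, 2.f, 3.f, 4.f, 5.f};
//   float out0[2];
//   float out1[3];
//   const RuntimeShape out_shape0({2});
//   const RuntimeShape out_shape1({3});
//   const RuntimeShape* output_shapes[2] = {&out_shape0, &out_shape1};
//   float* output_data[2] = {out0, out1};
//   tflite::SplitParams split_params;
//   split_params.axis = 0;
//   split_params.num_split = 2;
//   Split(split_params, RuntimeShape({5}), input, output_shapes, output_data);
//   // out0 is {1, 2}; out1 is {3, 4, 5}.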
2268
2269 inline int NodeOffset(int b, int h, int w, int height, int width) {
2270 return (b * height + h) * width + w;
2271 }
2272
2273 inline void AveragePool(const PoolParams& params,
2274 const RuntimeShape& input_shape,
2275 const float* input_data,
2276 const RuntimeShape& output_shape, float* output_data) {
2277 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
2278 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
2279 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
2280 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
2281 const int input_height = input_shape.Dims(1);
2282 const int input_width = input_shape.Dims(2);
2283 const int output_height = output_shape.Dims(1);
2284 const int output_width = output_shape.Dims(2);
2285 const int stride_height = params.stride_height;
2286 const int stride_width = params.stride_width;
2287 for (int batch = 0; batch < batches; ++batch) {
2288 for (int out_y = 0; out_y < output_height; ++out_y) {
2289 for (int out_x = 0; out_x < output_width; ++out_x) {
2290 for (int channel = 0; channel < depth; ++channel) {
2291 const int in_x_origin =
2292 (out_x * stride_width) - params.padding_values.width;
2293 const int in_y_origin =
2294 (out_y * stride_height) - params.padding_values.height;
2295 // Compute the boundaries of the filter region clamped so as to
2296 // ensure that the filter window fits in the input array.
2297 const int filter_x_start = std::max(0, -in_x_origin);
2298 const int filter_x_end =
2299 std::min(params.filter_width, input_width - in_x_origin);
2300 const int filter_y_start = std::max(0, -in_y_origin);
2301 const int filter_y_end =
2302 std::min(params.filter_height, input_height - in_y_origin);
2303 float total = 0.f;
2304 float filter_count = 0;
2305 for (int filter_y = filter_y_start; filter_y < filter_y_end;
2306 ++filter_y) {
2307 for (int filter_x = filter_x_start; filter_x < filter_x_end;
2308 ++filter_x) {
2309 const int in_x = in_x_origin + filter_x;
2310 const int in_y = in_y_origin + filter_y;
2311 total +=
2312 input_data[Offset(input_shape, batch, in_y, in_x, channel)];
2313 filter_count++;
2314 }
2315 }
2316 const float average = total / filter_count;
2317 output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
2318 ActivationFunctionWithMinMax(average, params.float_activation_min,
2319 params.float_activation_max);
2320 }
2321 }
2322 }
2323 }
2324 }
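// Illustrative usage sketch for AveragePool (not part of the library API;
// assumes RuntimeShape's initializer_list constructor): a 2x2 window with
// stride 2 and no padding over a 1x2x2x1 input averages all four values.
//
//   float input[4] = {1.f, 2.f, 3.f, 4.f};
//   float output[1];
//   tflite::PoolParams pool_params;
//   pool_params.filter_height = 2;
//   pool_params.filter_width = 2;
//   pool_params.stride_height = 2;
//   pool_params.stride_width = 2;
//   pool_params.padding_values.height = 0;
//   pool_params.padding_values.width = 0;
//   pool_params.float_activation_min = std::numeric_limits<float>::lowest();
//   pool_params.float_activation_max = std::numeric_limits<float>::max();
//   AveragePool(pool_params, RuntimeShape({1, 2, 2, 1}), input,
//               RuntimeShape({1, 1, 1, 1}), output);
//   // output[0] is now (1 + 2 + 3 + 4) / 4 = 2.5f.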
2325
2326 inline void AveragePool(const PoolParams& params,
2327 const RuntimeShape& input_shape,
2328 const uint8* input_data,
2329 const RuntimeShape& output_shape, uint8* output_data) {
2330 TFLITE_DCHECK_LE(params.quantized_activation_min,
2331 params.quantized_activation_max);
2332 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
2333 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
2334 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
2335 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
2336 const int input_height = input_shape.Dims(1);
2337 const int input_width = input_shape.Dims(2);
2338 const int output_height = output_shape.Dims(1);
2339 const int output_width = output_shape.Dims(2);
2340 const int stride_height = params.stride_height;
2341 const int stride_width = params.stride_width;
2342 for (int batch = 0; batch < batches; ++batch) {
2343 for (int out_y = 0; out_y < output_height; ++out_y) {
2344 for (int out_x = 0; out_x < output_width; ++out_x) {
2345 for (int channel = 0; channel < depth; ++channel) {
2346 const int in_x_origin =
2347 (out_x * stride_width) - params.padding_values.width;
2348 const int in_y_origin =
2349 (out_y * stride_height) - params.padding_values.height;
2350 // Compute the boundaries of the filter region clamped so as to
2351 // ensure that the filter window fits in the input array.
2352 const int filter_x_start = std::max(0, -in_x_origin);
2353 const int filter_x_end =
2354 std::min(params.filter_width, input_width - in_x_origin);
2355 const int filter_y_start = std::max(0, -in_y_origin);
2356 const int filter_y_end =
2357 std::min(params.filter_height, input_height - in_y_origin);
2358 int32 acc = 0;
2359 int filter_count = 0;
2360 for (int filter_y = filter_y_start; filter_y < filter_y_end;
2361 ++filter_y) {
2362 for (int filter_x = filter_x_start; filter_x < filter_x_end;
2363 ++filter_x) {
2364 const int in_x = in_x_origin + filter_x;
2365 const int in_y = in_y_origin + filter_y;
2366 acc +=
2367 input_data[Offset(input_shape, batch, in_y, in_x, channel)];
2368 filter_count++;
2369 }
2370 }
2371 acc = (acc + filter_count / 2) / filter_count;
2372 acc = std::max(acc, params.quantized_activation_min);
2373 acc = std::min(acc, params.quantized_activation_max);
2374 output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
2375 static_cast<uint8>(acc);
2376 }
2377 }
2378 }
2379 }
2380 }
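
// Rounding note for the quantized AveragePool above: because uint8 inputs
// make acc non-negative, (acc + filter_count / 2) / filter_count rounds the
// average to nearest rather than truncating. For example, filter_count == 4
// and acc == 10 gives (10 + 2) / 4 == 3, whereas plain truncation would
// give 2.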
2381
2382 inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape,
2383 const float* input_data, const RuntimeShape& output_shape,
2384 float* output_data) {
2385 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
2386 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
2387 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
2388 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
2389 const int input_height = input_shape.Dims(1);
2390 const int input_width = input_shape.Dims(2);
2391 const int output_height = output_shape.Dims(1);
2392 const int output_width = output_shape.Dims(2);
2393 const int stride_height = params.stride_height;
2394 const int stride_width = params.stride_width;
2395 for (int batch = 0; batch < batches; ++batch) {
2396 for (int out_y = 0; out_y < output_height; ++out_y) {
2397 for (int out_x = 0; out_x < output_width; ++out_x) {
2398 for (int channel = 0; channel < depth; ++channel) {
2399 const int in_x_origin =
2400 (out_x * stride_width) - params.padding_values.width;
2401 const int in_y_origin =
2402 (out_y * stride_height) - params.padding_values.height;
2403 // Compute the boundaries of the filter region clamped so as to
2404 // ensure that the filter window fits in the input array.
2405 const int filter_x_start = std::max(0, -in_x_origin);
2406 const int filter_x_end =
2407 std::min(params.filter_width, input_width - in_x_origin);
2408 const int filter_y_start = std::max(0, -in_y_origin);
2409 const int filter_y_end =
2410 std::min(params.filter_height, input_height - in_y_origin);
2411 float sum_squares = 0.f;
2412 int filter_count = 0;
2413 for (int filter_y = filter_y_start; filter_y < filter_y_end;
2414 ++filter_y) {
2415 for (int filter_x = filter_x_start; filter_x < filter_x_end;
2416 ++filter_x) {
2417 const int in_x = in_x_origin + filter_x;
2418 const int in_y = in_y_origin + filter_y;
2419 const float val =
2420 input_data[Offset(input_shape, batch, in_y, in_x, channel)];
2421 sum_squares += val * val;
2422 filter_count++;
2423 }
2424 }
2425 const float l2pool_result = std::sqrt(sum_squares / filter_count);
2426 output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
2427 ActivationFunctionWithMinMax(l2pool_result,
2428 params.float_activation_min,
2429 params.float_activation_max);
2430 }
2431 }
2432 }
2433 }
2434 }
2435
2436 inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
2437 const float* input_data, const RuntimeShape& output_shape,
2438 float* output_data) {
2439 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
2440 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
2441 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
2442 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
2443 const int input_height = input_shape.Dims(1);
2444 const int input_width = input_shape.Dims(2);
2445 const int output_height = output_shape.Dims(1);
2446 const int output_width = output_shape.Dims(2);
2447 const int stride_height = params.stride_height;
2448 const int stride_width = params.stride_width;
2449 for (int batch = 0; batch < batches; ++batch) {
2450 for (int out_y = 0; out_y < output_height; ++out_y) {
2451 for (int out_x = 0; out_x < output_width; ++out_x) {
2452 for (int channel = 0; channel < depth; ++channel) {
2453 const int in_x_origin =
2454 (out_x * stride_width) - params.padding_values.width;
2455 const int in_y_origin =
2456 (out_y * stride_height) - params.padding_values.height;
2457 // Compute the boundaries of the filter region clamped so as to
2458 // ensure that the filter window fits in the input array.
2459 const int filter_x_start = std::max(0, -in_x_origin);
2460 const int filter_x_end =
2461 std::min(params.filter_width, input_width - in_x_origin);
2462 const int filter_y_start = std::max(0, -in_y_origin);
2463 const int filter_y_end =
2464 std::min(params.filter_height, input_height - in_y_origin);
2465 float max = std::numeric_limits<float>::lowest();
2466 for (int filter_y = filter_y_start; filter_y < filter_y_end;
2467 ++filter_y) {
2468 for (int filter_x = filter_x_start; filter_x < filter_x_end;
2469 ++filter_x) {
2470 const int in_x = in_x_origin + filter_x;
2471 const int in_y = in_y_origin + filter_y;
2472 max = std::max(
2473 max,
2474 input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
2475 }
2476 }
2477 output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
2478 ActivationFunctionWithMinMax(max, params.float_activation_min,
2479 params.float_activation_max);
2480 }
2481 }
2482 }
2483 }
2484 }
2485
2486 inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
2487 const uint8* input_data, const RuntimeShape& output_shape,
2488 uint8* output_data) {
2489 TFLITE_DCHECK_LE(params.quantized_activation_min,
2490 params.quantized_activation_max);
2491 TFLITE_DCHECK_GE(params.quantized_activation_min, 0);
2492 TFLITE_DCHECK_LE(params.quantized_activation_max, 255);
2493 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
2494 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
2495 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
2496 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
2497 const int input_height = input_shape.Dims(1);
2498 const int input_width = input_shape.Dims(2);
2499 const int output_height = output_shape.Dims(1);
2500 const int output_width = output_shape.Dims(2);
2501 const int stride_height = params.stride_height;
2502 const int stride_width = params.stride_width;
2503 for (int batch = 0; batch < batches; ++batch) {
2504 for (int out_y = 0; out_y < output_height; ++out_y) {
2505 for (int out_x = 0; out_x < output_width; ++out_x) {
2506 for (int channel = 0; channel < depth; ++channel) {
2507 const int in_x_origin =
2508 (out_x * stride_width) - params.padding_values.width;
2509 const int in_y_origin =
2510 (out_y * stride_height) - params.padding_values.height;
2511 // Compute the boundaries of the filter region clamped so as to
2512 // ensure that the filter window fits in the input array.
2513 const int filter_x_start = std::max(0, -in_x_origin);
2514 const int filter_x_end =
2515 std::min(params.filter_width, input_width - in_x_origin);
2516 const int filter_y_start = std::max(0, -in_y_origin);
2517 const int filter_y_end =
2518 std::min(params.filter_height, input_height - in_y_origin);
2519 uint8 max = 0;
2520 for (int filter_y = filter_y_start; filter_y < filter_y_end;
2521 ++filter_y) {
2522 for (int filter_x = filter_x_start; filter_x < filter_x_end;
2523 ++filter_x) {
2524 const int in_x = in_x_origin + filter_x;
2525 const int in_y = in_y_origin + filter_y;
2526 max = std::max(
2527 max,
2528 input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
2529 }
2530 }
2531 max = std::max<uint8>(max, params.quantized_activation_min);
2532 max = std::min<uint8>(max, params.quantized_activation_max);
2533 output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
2534 static_cast<uint8>(max);
2535 }
2536 }
2537 }
2538 }
2539 }
2540
2541 inline void LocalResponseNormalization(
2542 const tflite::LocalResponseNormalizationParams& op_params,
2543 const RuntimeShape& input_shape, const float* input_data,
2544 const RuntimeShape& output_shape, float* output_data) {
2545 const int trailing_dim = input_shape.DimensionsCount() - 1;
2546 const int outer_size =
2547 MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
2548 const int depth =
2549 MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
2550
2551 for (int i = 0; i < outer_size; ++i) {
2552 for (int c = 0; c < depth; ++c) {
2553 const int begin_input_c = std::max(0, c - op_params.range);
2554 const int end_input_c = std::min(depth, c + op_params.range);
2555 float accum = 0.f;
2556 for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
2557 const float input_val = input_data[i * depth + input_c];
2558 accum += input_val * input_val;
2559 }
2560 const float multiplier =
2561 std::pow(op_params.bias + op_params.alpha * accum, -op_params.beta);
2562 output_data[i * depth + c] = input_data[i * depth + c] * multiplier;
2563 }
2564 }
2565 }
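
// Worked example for the normalization above: each channel c is scaled by
// (bias + alpha * accum)^(-beta), where accum sums the squares over the
// window [c - range, c + range) clamped to [0, depth). With bias = 1,
// alpha = 1, beta = 0.5, range = 1, depth = 2 and inputs {3, 4}, channel 1
// sees accum = 3*3 + 4*4 = 25 and outputs 4 * (1 + 25)^-0.5, roughly 0.78.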
2566
2567 inline void LogSoftmax(const SoftmaxParams& params,
2568 const RuntimeShape& input_shape, const float* input_data,
2569 const RuntimeShape& output_shape, float* output_data) {
2570 const int trailing_dim = input_shape.DimensionsCount() - 1;
2571 const int outer_size =
2572 MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
2573 const int depth =
2574 MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
2575
2576 for (int i = 0; i < outer_size; ++i) {
2577 // Find max element value which we'll use to ensure numerical stability
2578 // taking advantage of the following equality:
2579 // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
2580 float max = std::numeric_limits<float>::lowest();
2581 for (int c = 0; c < depth; ++c) {
2582 max = std::max(max, input_data[i * depth + c]);
2583 }
2584
2585 // Compute sum.
2586 float sum = 0.f;
2587 for (int c = 0; c < depth; ++c) {
2588 sum += std::exp(input_data[i * depth + c] - max);
2589 }
2590
2591 // Compute result.
2592 const float log_sum = std::log(sum);
2593 for (int c = 0; c < depth; ++c) {
2594 output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
2595 }
2596 }
2597 }
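
// Worked example for the float LogSoftmax above: for a row {1, 2, 3},
// max = 3, the shifted values are {-2, -1, 0}, the sum of exponentials is
// about 1.5032, log_sum is about 0.4076, and the outputs are roughly
// {-2.41, -1.41, -0.41}, whose exponentials sum to 1 as expected.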
2598
2599 inline void LogSoftmax(const SoftmaxParams& params,
2600 const RuntimeShape& input_shape, const uint8* input_data,
2601 const RuntimeShape& output_shape, uint8* output_data) {
2602 gemmlowp::ScopedProfilingLabel label("LogSoftmax/8bit");
2603 const int32 input_multiplier = params.input_multiplier;
2604 const int32 input_left_shift = params.input_left_shift;
2605 const int32 reverse_scaling_divisor = params.reverse_scaling_divisor;
2606 const int32 reverse_scaling_right_shift = params.reverse_scaling_right_shift;
2607 const int diff_min = params.diff_min;
2608 // The representation chosen for the input to the exp() function is Q5.26.
2609 // We need to leave extra space since values that we skip might be as large
2610 // as -32 before multiplying by input_beta_multiplier, and therefore as
2611 // large as -16 afterwards. Note that exp(-8) is definitely not
2612 // insignificant to accumulation, but exp(-16) definitely is.
2613 static constexpr int kScaledDiffIntegerBits = 5;
2614 static constexpr int kAccumulationIntegerBits = 12;
2615 static constexpr int kOutputIntegerBits = 4;
2616 using FixedPointScaledDiff =
2617 gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
2618 using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
2619
2620 const int trailing_dim = input_shape.DimensionsCount() - 1;
2621 const int outer_size =
2622 MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
2623 const int depth =
2624 MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
2625
2626 for (int i = 0; i < outer_size; ++i) {
2627 uint8 max_in_row = 0;
2628 for (int c = 0; c < depth; ++c) {
2629 max_in_row = std::max(max_in_row, input_data[i * depth + c]);
2630 }
2631
2632 FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
2633 for (int c = 0; c < depth; ++c) {
2634 int32 input_diff =
2635 static_cast<int32>(input_data[i * depth + c]) - max_in_row;
2636 if (input_diff >= diff_min) {
2637 const int32 input_diff_rescaled =
2638 MultiplyByQuantizedMultiplierGreaterThanOne(
2639 input_diff, input_multiplier, input_left_shift);
2640 const FixedPointScaledDiff scaled_diff_f8 =
2641 FixedPointScaledDiff::FromRaw(input_diff_rescaled);
2642 sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
2643 exp_on_negative_values(scaled_diff_f8));
2644 }
2645 }
2646
2647 const int32 fixed_log_sum_of_exps =
2648 log_x_for_x_greater_than_or_equal_to_1<kScaledDiffIntegerBits>(
2649 sum_of_exps)
2650 .raw();
2651
2652 // rescaled_diff_min is smallest representable in
2653 // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
2654 // log-sub-exps that will be subtracted in the loop.
2655 //
2656 // The thresholds diff_min, etc are negative.
2657 const int rescaled_diff_min =
2658 fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest();
2659 const int adjusted_diff_min =
2660 std::max(diff_min - 1, // Note use of > below instead of >= above.
2661 MultiplyByQuantizedMultiplierSmallerThanOneExp(
2662 rescaled_diff_min, reverse_scaling_divisor,
2663 -reverse_scaling_right_shift));
2664
2665 for (int c = 0; c < depth; ++c) {
2666 int32 input_diff =
2667 static_cast<int32>(input_data[i * depth + c]) - max_in_row;
2668 if (input_diff > adjusted_diff_min) {
2669 const int32 input_diff_rescaled =
2670 MultiplyByQuantizedMultiplierGreaterThanOne(
2671 input_diff, input_multiplier, input_left_shift);
2672 int32 unsat_output =
2673 gemmlowp::RoundingDivideByPOT(
2674 (input_diff_rescaled - fixed_log_sum_of_exps),
2675 31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
2676 255;
2677
2678 output_data[i * depth + c] = static_cast<uint8>(
2679 std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
2680 } else {
2681 // Set output to smallest value.
2682 output_data[i * depth + c] = 0;
2683 }
2684 }
2685 }
2686 }
2687
2688 inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
2689 const RuntimeShape& output_shape, float* output_data) {
2690 const int flat_size = MatchingFlatSize(input_shape, output_shape);
2691
2692 for (int i = 0; i < flat_size; i++) {
2693 float val = input_data[i];
2694 float result = 1.f / (1.f + std::exp(-val));
2695 output_data[i] = result;
2696 }
2697 }
2698
2699 // Convenience version that allows, for example, generated-code calls to be
2700 // uniform between data types.
2701 inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape,
2702 const float* input_data, const RuntimeShape& output_shape,
2703 float* output_data) {
2704 // Drop params: not needed.
2705 Logistic(input_shape, input_data, output_shape, output_data);
2706 }
2707
2708 inline void Logistic(const LogisticParams& params,
2709 const RuntimeShape& input_shape, const uint8* input_data,
2710 const RuntimeShape& output_shape, uint8* output_data) {
2711 const int32 input_zero_point = params.input_zero_point;
2712 const int32 input_range_radius = params.input_range_radius;
2713 const int32 input_multiplier = params.input_multiplier;
2714 const int input_left_shift = params.input_left_shift;
2715 const int flat_size = MatchingFlatSize(input_shape, output_shape);
2716
2717 for (int i = 0; i < flat_size; i++) {
2718 const uint8 input_val_u8 = input_data[i];
2719 const int32 input_val_centered =
2720 static_cast<int32>(input_val_u8) - input_zero_point;
2721 uint8 output_val;
2722 if (input_val_centered <= -input_range_radius) {
2723 output_val = 0;
2724 } else if (input_val_centered >= input_range_radius) {
2725 output_val = 255;
2726 } else {
2727 const int32 input_val_rescaled =
2728 MultiplyByQuantizedMultiplierGreaterThanOne(
2729 input_val_centered, input_multiplier, input_left_shift);
2730 using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
2731 using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
2732 const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
2733 const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
2734 // Convert from Q0.31 to Q23.8.
2735 using gemmlowp::RoundingDivideByPOT;
2736 int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
2737 if (output_val_s32 == 256) {
2738 output_val_s32 = 255;
2739 }
2740 // Reinterpret as U0.8.
2741 TFLITE_DCHECK_GE(output_val_s32, 0);
2742 TFLITE_DCHECK_LE(output_val_s32, 255);
2743 output_val = static_cast<uint8>(output_val_s32);
2744 }
2745 output_data[i] = output_val;
2746 }
2747 }
2748
2749 inline void Logistic(const LogisticParams& params,
2750 const RuntimeShape& input_shape, const int16* input_data,
2751 const RuntimeShape& output_shape, int16* output_data) {
2752 const int flat_size = MatchingFlatSize(input_shape, output_shape);
2753
2754 for (int i = 0; i < flat_size; i++) {
2755 // F0 uses 0 integer bits, range [-1, 1].
2756 // This is the return type of math functions such as tanh, logistic,
2757 // whose range is in [-1, 1].
2758 using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
2759 // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
2760 using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
2761
2762 const F3 input = F3::FromRaw(input_data[i]);
2763 F0 output = gemmlowp::logistic(input);
2764 output_data[i] = output.raw();
2765 }
2766 }
2767
2768 inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
2769 const RuntimeShape& output_shape, float* output_data) {
2770 const int flat_size = MatchingFlatSize(input_shape, output_shape);
2771
2772 for (int i = 0; i < flat_size; i++) {
2773 float val = input_data[i];
2774 float result = std::tanh(val);
2775 output_data[i] = result;
2776 }
2777 }
2778
2779 // Convenience version that allows, for example, generated-code calls to be
2780 // uniform between data types.
2781 inline void Tanh(const TanhParams&, const RuntimeShape& input_shape,
2782 const float* input_data, const RuntimeShape& output_shape,
2783 float* output_data) {
2784 // Drop params: not needed.
2785 Tanh(input_shape, input_data, output_shape, output_data);
2786 }
2787
2788 inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
2789 const uint8* input_data, const RuntimeShape& output_shape,
2790 uint8* output_data) {
2791 const int32 input_zero_point = params.input_zero_point;
2792 const int32 input_range_radius = params.input_range_radius;
2793 const int32 input_multiplier = params.input_multiplier;
2794 const int input_left_shift = params.input_left_shift;
2795 const int32 output_zero_point = 128;
2796 const int flat_size = MatchingFlatSize(input_shape, output_shape);
2797
2798 for (int i = 0; i < flat_size; i++) {
2799 const uint8 input_val_u8 = input_data[i];
2800 const int32 input_val_centered =
2801 static_cast<int32>(input_val_u8) - input_zero_point;
2802 uint8 output_val;
2803 if (input_val_centered <= -input_range_radius) {
2804 output_val = 0;
2805 } else if (input_val_centered >= input_range_radius) {
2806 output_val = 255;
2807 } else {
2808 const int32 input_val_rescaled =
2809 MultiplyByQuantizedMultiplierGreaterThanOne(
2810 input_val_centered, input_multiplier, input_left_shift);
2811 using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
2812 using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
2813 const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
2814 const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
2815 // Convert from Q0.31 to Q24.7.
2816 using gemmlowp::RoundingDivideByPOT;
2817 int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
2818 output_val_s32 += output_zero_point;
2819 if (output_val_s32 == 256) {
2820 output_val_s32 = 255;
2821 }
2822 // Reinterpret as Q0.7, encoded in uint8.
2823 TFLITE_DCHECK_GE(output_val_s32, 0);
2824 TFLITE_DCHECK_LE(output_val_s32, 255);
2825 output_val = static_cast<uint8>(output_val_s32);
2826 }
2827 output_data[i] = output_val;
2828 }
2829 }
2830
2831 inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
2832 const int16* input_data, const RuntimeShape& output_shape,
2833 int16* output_data) {
2834 const int input_left_shift = params.input_left_shift;
2835 // Support for shifts is limited until we have a parameterized version of
2836 // SaturatingRoundingMultiplyByPOT().
2837 TFLITE_DCHECK_GE(input_left_shift, 0);
2838 TFLITE_DCHECK_LE(input_left_shift, 1);
2839
2840 const int flat_size = MatchingFlatSize(input_shape, output_shape);
2841
2842 // F0 uses 0 integer bits, range [-1, 1].
2843 // This is the return type of math functions such as tanh, logistic,
2844 // whose range is in [-1, 1].
2845 using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
2846 // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
2847 using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
2848
2849 if (input_left_shift == 0) {
2850 for (int i = 0; i < flat_size; i++) {
2851 F3 input = F3::FromRaw(input_data[i]);
2852 F0 output = gemmlowp::tanh(input);
2853 output_data[i] = output.raw();
2854 }
2855 } else {
2856 for (int i = 0; i < flat_size; i++) {
2857 F3 input = F3::FromRaw(
2858 gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i]));
2859 F0 output = gemmlowp::tanh(input);
2860 output_data[i] = output.raw();
2861 }
2862 }
2863 }
2864
2865 inline void Dequantize(const tflite::DequantizationParams& op_params,
2866 const RuntimeShape& input_shape, const uint8* input_data,
2867 const RuntimeShape& output_shape, float* output_data) {
2868 gemmlowp::ScopedProfilingLabel label("Dequantize");
2869 int32 zero_point = op_params.zero_point;
2870 double scale = op_params.scale;
2871 const int flat_size = MatchingFlatSize(input_shape, output_shape);
2872
2873 for (int i = 0; i < flat_size; i++) {
2874 int32 val = input_data[i];
2875 float result = static_cast<float>(scale * (val - zero_point));
2876 output_data[i] = result;
2877 }
2878 }
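
// Worked example for Dequantize above: with scale = 0.5 and
// zero_point = 128, the quantized value 130 maps to 0.5 * (130 - 128) = 1.0f
// and 128 itself maps to 0.0f.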
2879
2880 inline void FakeQuant(const tflite::FakeQuantParams& op_params,
2881 const RuntimeShape& input_shape, const float* input_data,
2882 const RuntimeShape& output_shape, float* output_data) {
2883 gemmlowp::ScopedProfilingLabel label("FakeQuant");
2884 float rmin = op_params.minmax.min;
2885 float rmax = op_params.minmax.max;
2886 int num_bits = op_params.num_bits;
2887 // 0 should always be a representable value. Let's assume that the initial
2888 // min,max range contains 0.
2889 TFLITE_DCHECK_LE(rmin, 0.0f);
2890 TFLITE_DCHECK_GE(rmax, 0.0f);
2891 TFLITE_DCHECK_LT(rmin, rmax);
2892
2893 // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
2894 int quant_min = 0;
2895 int quant_max = (1 << num_bits) - 1;
2896 float nudged_min, nudged_max, nudged_scale;
2897 NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min,
2898 &nudged_max, &nudged_scale);
2899 const int flat_size = MatchingFlatSize(input_shape, output_shape);
2900 FakeQuantizeArray(nudged_scale, nudged_min, nudged_max, input_data,
2901 output_data, flat_size);
2902 }
2903
2904 template <typename SrcT, typename DstT>
2905 inline void Cast(const RuntimeShape& input_shape, const SrcT* input_data,
2906 const RuntimeShape& output_shape, DstT* output_data) {
2907 const int flat_size = MatchingFlatSize(input_shape, output_shape);
2908
2909 for (int i = 0; i < flat_size; i++) {
2910 int offset = i;
2911 output_data[offset] = static_cast<DstT>(input_data[offset]);
2912 }
2913 }
2914
2915 inline void Floor(const RuntimeShape& input_shape, const float* input_data,
2916 const RuntimeShape& output_shape, float* output_data) {
2917 const int flat_size = MatchingFlatSize(input_shape, output_shape);
2918
2919 for (int i = 0; i < flat_size; i++) {
2920 int offset = i;
2921 output_data[offset] = std::floor(input_data[offset]);
2922 }
2923 }
2924
2925 inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
2926 const RuntimeShape& output_shape, float* output_data) {
2927 const int flat_size = MatchingFlatSize(input_shape, output_shape);
2928
2929 for (int i = 0; i < flat_size; i++) {
2930 int offset = i;
2931 output_data[offset] = std::ceil(input_data[offset]);
2932 }
2933 }
2934
2935 template <typename T, typename CoordsT = int32>
2936 inline void Gather(const tflite::GatherParams& op_params,
2937 const RuntimeShape& input_shape, const T* input_data,
2938 const RuntimeShape& coords_shape, const CoordsT* coords_data,
2939 const RuntimeShape& output_shape, T* output_data) {
2940 gemmlowp::ScopedProfilingLabel label("Gather");
2941 int axis = op_params.axis;
2942 if (axis < 0) {
2943 axis += input_shape.DimensionsCount();
2944 }
2945 TFLITE_DCHECK_GE(axis, 0);
2946 TFLITE_DCHECK_LT(axis, input_shape.DimensionsCount());
2947 const int axis_size = input_shape.Dims(axis);
2948 const int coords_count = coords_shape.FlatSize();
2949
2950 int outer_size = 1;
2951 for (int i = 0; i < axis; ++i) {
2952 outer_size *= input_shape.Dims(i);
2953 }
2954
2955 int inner_size = 1;
2956 for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i) {
2957 inner_size *= input_shape.Dims(i);
2958 }
2959
2960 for (int outer = 0; outer < outer_size; ++outer) {
2961 for (int i = 0; i < coords_count; ++i) {
2962 TFLITE_DCHECK_GE(coords_data[i], 0);
2963 TFLITE_DCHECK_LT(coords_data[i], axis_size);
2964 std::memcpy(
2965 output_data + (outer * coords_count + i) * inner_size,
2966 input_data + (outer * axis_size + coords_data[i]) * inner_size,
2967 sizeof(T) * inner_size);
2968 }
2969 }
2970 }
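
// A minimal usage sketch for Gather above, selecting rows of a 2x3 matrix
// along axis 0; the buffer names are placeholders.
//
//   tflite::GatherParams gather_params;
//   gather_params.axis = 0;
//   const float input[] = {1, 2, 3, 4, 5, 6};  // shape [2, 3]
//   const int32 coords[] = {1, 1, 0};          // shape [3]
//   float output[9];                           // shape [3, 3]
//   Gather(gather_params, RuntimeShape({2, 3}), input, RuntimeShape({3}),
//          coords, RuntimeShape({3, 3}), output);
//   // output now holds {4, 5, 6, 4, 5, 6, 1, 2, 3}.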
2971
2972 template <typename ParamsT, typename IndicesT = int32>
2973 inline void GatherNd(const RuntimeShape& params_shape,
2974 const ParamsT* params_data,
2975 const RuntimeShape& indices_shape,
2976 const IndicesT* indices_data,
2977 const RuntimeShape& output_shape, ParamsT* output_data) {
2978 gemmlowp::ScopedProfilingLabel label("GatherNd");
2979
2980 int n_slices = 1;
2981 int slice_size = 1;
2982 const int indices_dims = indices_shape.DimensionsCount();
2983 const int indices_nd = indices_shape.Dims(indices_dims - 1);
2984 const int params_dims = params_shape.DimensionsCount();
2985 for (int i = 0; i < indices_dims - 1; ++i) {
2986 n_slices *= indices_shape.Dims(i);
2987 }
2988 for (int i = indices_nd; i < params_dims; ++i) {
2989 slice_size *= params_shape.Dims(i);
2990 }
2991
2992 int remain_flat_size = params_shape.FlatSize();
2993 std::vector<int> dims_to_count(indices_nd, 0);
2994 for (int i = 0; i < indices_nd; ++i) {
2995 dims_to_count[i] = remain_flat_size / params_shape.Dims(i);
2996 remain_flat_size = dims_to_count[i];
2997 }
2998
2999 for (int i = 0; i < n_slices; ++i) {
3000 int from_pos = 0;
3001 for (int j = 0; j < indices_nd; ++j) {
3002 from_pos += indices_data[i * indices_nd + j] * dims_to_count[j];
3003 }
3004 std::memcpy(output_data + i * slice_size, params_data + from_pos,
3005 sizeof(ParamsT) * slice_size);
3006 }
3007 }
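
// Worked example for GatherNd above: for a 2x2 params matrix {1, 2, 3, 4}
// and indices of shape [2, 2] holding {{0, 0}, {1, 1}}, indices_nd == 2,
// each slice is a single element, and the output is {1, 4}. With indices of
// shape [2, 1] holding {{1}, {0}}, each slice is a whole row and the output
// is {3, 4, 1, 2}.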
3008
3009 template <typename T>
3010 inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
3011 const RuntimeShape& unextended_input_shape,
3012 const T* input_data,
3013 const RuntimeShape& unextended_output_size_shape,
3014 const int32* output_size_data,
3015 const RuntimeShape& unextended_output_shape,
3016 T* output_data) {
3017 TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
3018 TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4);
3019 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3020 const RuntimeShape input_shape =
3021 RuntimeShape::ExtendedShape(4, unextended_input_shape);
3022 const RuntimeShape output_size_shape =
3023 RuntimeShape::ExtendedShape(4, unextended_output_size_shape);
3024 const RuntimeShape output_shape =
3025 RuntimeShape::ExtendedShape(4, unextended_output_shape);
3026
3027 int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
3028 int32 input_height = input_shape.Dims(1);
3029 int32 input_width = input_shape.Dims(2);
3030 int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
3031
3032 TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
3033 TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
3034 TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
3035 TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
3036 int32 output_height = output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
3037 int32 output_width = output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];
3038
3039 float height_scale = static_cast<float>(input_height) / output_height;
3040 float width_scale = static_cast<float>(input_width) / output_width;
3041 if (op_params.align_corners && output_height > 1) {
3042 height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
3043 }
3044 if (op_params.align_corners && output_width > 1) {
3045 width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
3046 }
3047
3048 for (int b = 0; b < batches; ++b) {
3049 for (int y = 0; y < output_height; ++y) {
3050 float input_y = y * height_scale;
3051 int32 y0 = static_cast<int32>(std::floor(input_y));
3052 int32 y1 = std::min(y0 + 1, input_height - 1);
3053 for (int x = 0; x < output_width; ++x) {
3054 float input_x = x * width_scale;
3055 int32 x0 = static_cast<int32>(std::floor(input_x));
3056 int32 x1 = std::min(x0 + 1, input_width - 1);
3057 for (int c = 0; c < depth; ++c) {
3058 T interpolation =
3059 static_cast<T>(input_data[Offset(input_shape, b, y0, x0, c)] *
3060 (1 - (input_y - y0)) * (1 - (input_x - x0)) +
3061 input_data[Offset(input_shape, b, y1, x0, c)] *
3062 (input_y - y0) * (1 - (input_x - x0)) +
3063 input_data[Offset(input_shape, b, y0, x1, c)] *
3064 (1 - (input_y - y0)) * (input_x - x0) +
3065 input_data[Offset(input_shape, b, y1, x1, c)] *
3066 (input_y - y0) * (input_x - x0));
3067 output_data[Offset(output_shape, b, y, x, c)] = interpolation;
3068 }
3069 }
3070 }
3071 }
3072 }
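
// Scaling note for ResizeBilinear above: without align_corners, resizing a
// 2x2 image to 4x4 uses height_scale = width_scale = 2 / 4 = 0.5, so output
// pixel (1, 1) samples input_y = input_x = 0.5 and blends its four
// neighbours with weight 0.25 each. With align_corners, the scales become
// (2 - 1) / (4 - 1) = 1/3, so corner output pixels land exactly on corner
// input pixels.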
3073
3074 template <typename T>
3075 inline void SpaceToBatchND(
3076 const SpaceToBatchParams& params,
3077 const RuntimeShape& unextended_input1_shape, const T* input1_data,
3078 const RuntimeShape& unextended_input2_shape, const int32* block_shape_data,
3079 const RuntimeShape& unextended_input3_shape, const int32* paddings_data,
3080 const RuntimeShape& unextended_output_shape, T* output_data) {
3081 gemmlowp::ScopedProfilingLabel label("SpaceToBatchND");
3082 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
3083 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3084 const RuntimeShape input1_shape =
3085 RuntimeShape::ExtendedShape(4, unextended_input1_shape);
3086 const RuntimeShape output_shape =
3087 RuntimeShape::ExtendedShape(4, unextended_output_shape);
3088
3089 const int depth = input1_shape.Dims(3);
3090 const int input_width = input1_shape.Dims(2);
3091 const int input_height = input1_shape.Dims(1);
3092 const int input_batch_size = input1_shape.Dims(0);
3093
3094 const int output_width = output_shape.Dims(2);
3095 const int output_height = output_shape.Dims(1);
3096 const int output_batch_size = output_shape.Dims(0);
3097
3098 const int block_shape_height = block_shape_data[0];
3099 const int block_shape_width = block_shape_data[1];
3100 const int padding_top = paddings_data[0];
3101 const int padding_left = paddings_data[2];
3102
3103 // For uint8 quantized, the correct padding "zero value" is the output offset.
3104 const int32_t pad_value = params.output_offset;
3105
3106 for (int out_b = 0; out_b < output_batch_size; ++out_b) {
3107 int input_batch = out_b % input_batch_size;
3108 int shift_w = (out_b / input_batch_size) % block_shape_width;
3109 int shift_h = (out_b / input_batch_size) / block_shape_width;
3110 for (int out_h = 0; out_h < output_height; ++out_h) {
3111 for (int out_w = 0; out_w < output_width; ++out_w) {
3112 T* out = output_data + Offset(output_shape, out_b, out_h, out_w, 0);
3113 if (out_h * block_shape_height + shift_h < padding_top ||
3114 out_h * block_shape_height + shift_h >=
3115 padding_top + input_height ||
3116 out_w * block_shape_width + shift_w < padding_left ||
3117 out_w * block_shape_width + shift_w >= padding_left + input_width) {
3118 // This may not execute correctly when pad_value != 0 and T != uint8.
3119 memset(out, pad_value, depth * sizeof(T));
3120 } else {
3121 const T* in =
3122 input1_data +
3123 Offset(input1_shape, input_batch,
3124 (out_h * block_shape_height + shift_h) - padding_top,
3125 (out_w * block_shape_width + shift_w) - padding_left, 0);
3126 memcpy(out, in, depth * sizeof(T));
3127 }
3128 }
3129 }
3130 }
3131 }
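
// Shape note for SpaceToBatchND above: with block_shape {2, 2} and zero
// paddings, a 1x4x4x1 input becomes a 4x2x2x1 output; output batch out_b
// reads from input batch out_b % input_batch_size, with the spatial shifts
// shift_w and shift_h derived from out_b / input_batch_size, so the four
// output batches hold the four interleaved 2x2 sub-grids of the input.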
3132
3133 template <typename T>
3134 inline void BatchToSpaceND(
3135 const RuntimeShape& unextended_input1_shape, const T* input1_data,
3136 const RuntimeShape& unextended_input2_shape, const int32* block_shape_data,
3137 const RuntimeShape& unextended_input3_shape, const int32* crops_data,
3138 const RuntimeShape& unextended_output_shape, T* output_data) {
3139 gemmlowp::ScopedProfilingLabel label("BatchToSpaceND");
3140 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
3141 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3142 const RuntimeShape input1_shape =
3143 RuntimeShape::ExtendedShape(4, unextended_input1_shape);
3144 const RuntimeShape output_shape =
3145 RuntimeShape::ExtendedShape(4, unextended_output_shape);
3146
3147 const int output_width = output_shape.Dims(2);
3148 const int output_height = output_shape.Dims(1);
3149 const int output_batch_size = output_shape.Dims(0);
3150
3151 const int depth = input1_shape.Dims(3);
3152 const int input_width = input1_shape.Dims(2);
3153 const int input_height = input1_shape.Dims(1);
3154 const int input_batch_size = input1_shape.Dims(0);
3155
3156 const int block_shape_width = block_shape_data[1];
3157 const int block_shape_height = block_shape_data[0];
3158 const int crops_top = crops_data[0];
3159 const int crops_left = crops_data[2];
3160
3161 for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
3162 const int out_batch = in_batch % output_batch_size;
3163 const int spatial_offset = in_batch / output_batch_size;
3164 for (int in_h = 0; in_h < input_height; ++in_h) {
3165 const int out_h = in_h * block_shape_height +
3166 spatial_offset / block_shape_width - crops_top;
3167 if (out_h < 0 || out_h >= output_height) {
3168 continue;
3169 }
3170 for (int in_w = 0; in_w < input_width; ++in_w) {
3171 const int out_w = in_w * block_shape_width +
3172 spatial_offset % block_shape_width - crops_left;
3173
3174 if (out_w < 0 || out_w >= output_width) {
3175 continue;
3176 }
3177 T* out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
3178 const T* in =
3179 input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
3180 memcpy(out, in, depth * sizeof(T));
3181 }
3182 }
3183 }
3184 }
3185
3186 // There are two versions of pad: Pad and PadV2. In PadV2 there is a second
3187 // scalar input that provides the padding value. Therefore pad_value_ptr can be
3188 // equivalent to a simple input1_data. For Pad, it should point to a zero
3189 // value.
3190 //
3191 // Note that two typenames are required, so that T=P=int32 is considered a
3192 // specialization distinct from P=int32.
3193 template <typename T, typename P>
3194 inline void PadImpl(const tflite::PadParams& op_params,
3195 const RuntimeShape& input_shape, const T* input_data,
3196 const P* pad_value_ptr, const RuntimeShape& output_shape,
3197 T* output_data) {
3198 const RuntimeShape ext_input_shape =
3199 RuntimeShape::ExtendedShape(4, input_shape);
3200 const RuntimeShape ext_output_shape =
3201 RuntimeShape::ExtendedShape(4, output_shape);
3202 TFLITE_DCHECK_LE(op_params.left_padding_count, 4);
3203 TFLITE_DCHECK_LE(op_params.right_padding_count, 4);
3204
3205 // Runtime calls are currently fixed at 4 dimensions. Copy inputs so
3206 // we can pad them to 4 dims (yes, we are "padding the padding").
3207 std::vector<int> left_padding_copy(4, 0);
3208 for (int i = 0; i < op_params.left_padding_count; ++i) {
3209 left_padding_copy[i] = op_params.left_padding[i];
3210 }
3211 std::vector<int> right_padding_copy(4, 0);
3212 for (int i = 0; i < op_params.right_padding_count; ++i) {
3213 right_padding_copy[i] = op_params.right_padding[i];
3214 }
3215
3216 const int output_batch = ext_output_shape.Dims(0);
3217 const int output_height = ext_output_shape.Dims(1);
3218 const int output_width = ext_output_shape.Dims(2);
3219 const int output_depth = ext_output_shape.Dims(3);
3220
3221 const int left_b_padding = left_padding_copy[0];
3222 const int left_h_padding = left_padding_copy[1];
3223 const int left_w_padding = left_padding_copy[2];
3224 const int left_d_padding = left_padding_copy[3];
3225
3226 const int right_b_padding = right_padding_copy[0];
3227 const int right_h_padding = right_padding_copy[1];
3228 const int right_w_padding = right_padding_copy[2];
3229 const int right_d_padding = right_padding_copy[3];
3230
3231 const T pad_value = *pad_value_ptr;
3232
3233 const T* in_ptr = input_data;
3234 T* out_ptr = output_data;
3235 for (int out_b = 0; out_b < output_batch; ++out_b) {
3236 for (int out_h = 0; out_h < output_height; ++out_h) {
3237 for (int out_w = 0; out_w < output_width; ++out_w) {
3238 for (int out_d = 0; out_d < output_depth; ++out_d) {
3239 if (out_b < left_b_padding ||
3240 out_b >= output_batch - right_b_padding ||
3241 out_h < left_h_padding ||
3242 out_h >= output_height - right_h_padding ||
3243 out_w < left_w_padding ||
3244 out_w >= output_width - right_w_padding ||
3245 out_d < left_d_padding ||
3246 out_d >= output_depth - right_d_padding) {
3247 *out_ptr++ = pad_value;
3248 } else {
3249 *out_ptr++ = *in_ptr++;
3250 }
3251 }
3252 }
3253 }
3254 }
3255 }
3256
3257 template <typename T, typename P>
3258 inline void Pad(const tflite::PadParams& op_params,
3259 const RuntimeShape& input_shape, const T* input_data,
3260 const P* pad_value_ptr, const RuntimeShape& output_shape,
3261 T* output_data) {
3262 PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
3263 output_data);
3264 }
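
// A minimal usage sketch for the Pad wrapper above, zero-padding a 1x2x2x1
// tensor by one pixel on each spatial side to produce 1x4x4x1; in_buf and
// out_buf are placeholder names.
//
//   tflite::PadParams pad_params;
//   pad_params.left_padding_count = 4;
//   pad_params.right_padding_count = 4;
//   // Per-dimension padding in NHWC order: batch, height, width, depth.
//   const int paddings[4] = {0, 1, 1, 0};
//   for (int i = 0; i < 4; ++i) {
//     pad_params.left_padding[i] = paddings[i];
//     pad_params.right_padding[i] = paddings[i];
//   }
//   const float zero = 0.0f;
//   Pad(pad_params, RuntimeShape({1, 2, 2, 1}), in_buf, &zero,
//       RuntimeShape({1, 4, 4, 1}), out_buf);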
3265
3266 // The second (pad-value) input can be int32 when, say, the first is uint8.
3267 template <typename T>
3268 inline void Pad(const tflite::PadParams& op_params,
3269 const RuntimeShape& input_shape, const T* input_data,
3270 const int32* pad_value_ptr, const RuntimeShape& output_shape,
3271 T* output_data) {
3272 const T converted_pad_value = static_cast<T>(*pad_value_ptr);
3273 PadImpl(op_params, input_shape, input_data, &converted_pad_value,
3274 output_shape, output_data);
3275 }
3276
3277 // This version avoids conflicting template matching.
3278 template <>
3279 inline void Pad(const tflite::PadParams& op_params,
3280 const RuntimeShape& input_shape, const int32* input_data,
3281 const int32* pad_value_ptr, const RuntimeShape& output_shape,
3282 int32* output_data) {
3283 PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
3284 output_data);
3285 }
3286
3287 // One could make all PadImageStyle calls simply delegate the work to the
3288 // ordinary Pad. However, it is better that the reference code asserts false in
3289 // similar cases.
3290 template <typename T, typename P>
3291 inline void PadImageStyle(const tflite::PadParams& op_params,
3292 const RuntimeShape& input_shape, const T* input_data,
3293 const P* pad_value_ptr,
3294 const RuntimeShape& output_shape, T* output_data) {
3295 TFLITE_ASSERT_FALSE;
3296 }
3297
3298 template <typename P>
3299 inline void PadImageStyle(const tflite::PadParams& op_params,
3300 const RuntimeShape& input_shape,
3301 const uint8* input_data, const P* pad_value_ptr,
3302 const RuntimeShape& output_shape,
3303 uint8* output_data) {
3304 Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
3305 output_data);
3306 }
3307
3308 template <typename P>
3309 inline void PadImageStyle(const tflite::PadParams& op_params,
3310 const RuntimeShape& input_shape,
3311 const int8_t* input_data, const P* pad_value_ptr,
3312 const RuntimeShape& output_shape,
3313 int8_t* output_data) {
3314 Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
3315 output_data);
3316 }
3317
3318 template <typename P>
3319 inline void PadImageStyle(const tflite::PadParams& op_params,
3320 const RuntimeShape& input_shape,
3321 const float* input_data, const P* pad_value_ptr,
3322 const RuntimeShape& output_shape,
3323 float* output_data) {
3324 Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
3325 output_data);
3326 }
3327 template <typename T>
3328 inline void StridedSlice(const tflite::StridedSliceParams& op_params,
3329 const RuntimeShape& unextended_input_shape,
3330 const T* input_data,
3331 const RuntimeShape& unextended_output_shape,
3332 T* output_data) {
3333 // Note that the output_shape is not used herein.
3334 tflite::StridedSliceParams params_copy = op_params;
3335
3336 TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
3337 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3338 const RuntimeShape input_shape =
3339 RuntimeShape::ExtendedShape(4, unextended_input_shape);
3340 const RuntimeShape output_shape =
3341 RuntimeShape::ExtendedShape(4, unextended_output_shape);
3342
3343 // Reverse and pad to 4 dimensions because that is what the runtime code
3344 // requires (ie. all shapes must be 4D and are given backwards).
3345   strided_slice::StridedSlicePadIndices(&params_copy, 4);
3346
3347 const int start_b = strided_slice::StartForAxis(params_copy, input_shape, 0);
3348 const int stop_b =
3349 strided_slice::StopForAxis(params_copy, input_shape, 0, start_b);
3350 const int start_h = strided_slice::StartForAxis(params_copy, input_shape, 1);
3351 const int stop_h =
3352 strided_slice::StopForAxis(params_copy, input_shape, 1, start_h);
3353 const int start_w = strided_slice::StartForAxis(params_copy, input_shape, 2);
3354 const int stop_w =
3355 strided_slice::StopForAxis(params_copy, input_shape, 2, start_w);
3356 const int start_d = strided_slice::StartForAxis(params_copy, input_shape, 3);
3357 const int stop_d =
3358 strided_slice::StopForAxis(params_copy, input_shape, 3, start_d);
3359
3360 T* out_ptr = output_data;
3361 for (int in_b = start_b;
3362 !strided_slice::LoopCondition(in_b, stop_b, params_copy.strides[0]);
3363 in_b += params_copy.strides[0]) {
3364 for (int in_h = start_h;
3365 !strided_slice::LoopCondition(in_h, stop_h, params_copy.strides[1]);
3366 in_h += params_copy.strides[1]) {
3367 for (int in_w = start_w;
3368 !strided_slice::LoopCondition(in_w, stop_w, params_copy.strides[2]);
3369 in_w += params_copy.strides[2]) {
3370 for (int in_d = start_d; !strided_slice::LoopCondition(
3371 in_d, stop_d, params_copy.strides[3]);
3372 in_d += params_copy.strides[3]) {
3373 *out_ptr++ = input_data[Offset(input_shape, in_b, in_h, in_w, in_d)];
3374 }
3375 }
3376 }
3377 }
3378 }
3379
3380 template <typename T>
3381 inline void Slice(const tflite::SliceParams& op_params,
3382 const RuntimeShape& input_shape, const T* input_data,
3383 const RuntimeShape& output_shape, T* output_data) {
3384 const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(4, input_shape);
3385 // TODO(dkalenichenko): This op only supports 4D tensors or smaller.
3386 TFLITE_DCHECK_LE(op_params.begin_count, 4);
3387 TFLITE_DCHECK_LE(op_params.size_count, 4);
3388 const int begin_count = op_params.begin_count;
3389 const int size_count = op_params.size_count;
3390 // We front-pad the begin and size vectors.
3391 const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
3392 const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1)
3393 ? ext_shape.Dims(0) - start_b
3394 : start_b + op_params.size[0];
3395 const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
3396 const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
3397 ? ext_shape.Dims(1) - start_h
3398 : start_h + op_params.size[size_count - 3];
3399 const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
3400 const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
3401 ? ext_shape.Dims(2) - start_w
3402 : start_w + op_params.size[size_count - 2];
3403 const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
3404 const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
3405 ? ext_shape.Dims(3) - start_d
3406 : start_d + op_params.size[size_count - 1];
3407
3408 T* out_ptr = output_data;
3409 for (int in_b = start_b; in_b < stop_b; ++in_b) {
3410 for (int in_h = start_h; in_h < stop_h; ++in_h) {
3411 for (int in_w = start_w; in_w < stop_w; ++in_w) {
3412 for (int in_d = start_d; in_d < stop_d; ++in_d) {
3413 *out_ptr++ = input_data[Offset(ext_shape, in_b, in_h, in_w, in_d)];
3414 }
3415 }
3416 }
3417 }
3418 }
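
// Front-padding note for Slice above: slicing a 3x4 (2-D) tensor with
// begin = {1, 0} and size = {2, -1} behaves as if begin were {0, 0, 1, 0}
// and size were {-1, -1, 2, -1} on the extended 1x1x3x4 shape: the batch
// and height dimensions are taken whole, rows 1 and 2 are kept, and the -1
// size keeps all four columns.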
3419
3420 template <typename T>
3421 inline void Exp(const T* input_data, const size_t num_elements,
3422 T* output_data) {
3423 gemmlowp::ScopedProfilingLabel label("Exp");
3424 for (size_t idx = 0; idx < num_elements; ++idx) {
3425 output_data[idx] = exp(input_data[idx]);
3426 }
3427 }
3428
3429 // A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
3430 // This method iterates through input data and reduce elements along the
3431 // dimensions given in axis.
3432 template <typename In, typename Out>
3433 inline bool Reduce(const In* input_data, const int* input_dims,
3434 const int* output_dims, const int input_num_dims,
3435 const int output_num_dims, const int* axis,
3436 const int num_axis, int* input_iter,
3437 Out reducer(const Out current, const In in),
3438 Out* output_data) {
3439 // Reset input iterator.
3440 for (int idx = 0; idx < input_num_dims; ++idx) {
3441 input_iter[idx] = 0;
3442 }
3443 // Iterate through input_data.
3444 do {
3445 size_t input_offset =
3446 ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
3447 size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
3448 input_iter, num_axis, axis);
3449 output_data[output_offset] =
3450 reducer(output_data[output_offset], input_data[input_offset]);
3451 } while (NextIndex(input_num_dims, input_dims, input_iter));
3452 return true;
3453 }
3454
3455 inline bool ResolveAxis(const int num_dims, const int* axis,
3456 const int64_t num_axis, int* out_axis,
3457 int* out_num_axis) {
3458 *out_num_axis = 0; // Just in case.
3459 // Short-circuit axis resolution for scalars; the axis will go unused.
3460 if (num_dims == 0) {
3461 return true;
3462 }
3463   // O(n^2) is fine since out_num_axis should be really small, mostly <= 4.
3464 for (int64_t idx = 0; idx < num_axis; ++idx) {
3465 // Handle negative index.
3466 int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
3467 TFLITE_DCHECK(current >= 0 && current < num_dims);
3468 bool is_dup = false;
3469 for (int j = 0; j < *out_num_axis; ++j) {
3470 if (out_axis[j] == current) {
3471 is_dup = true;
3472 break;
3473 }
3474 }
3475 if (!is_dup) {
3476 out_axis[*out_num_axis] = current;
3477 *out_num_axis += 1;
3478 }
3479 }
3480 return true;
3481 }
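
// Worked example for ResolveAxis above: with num_dims = 4 and
// axis = {-1, 3, 1}, the negative entry resolves to 3, the explicit 3 is
// dropped as a duplicate, and the result is out_axis = {3, 1} with
// *out_num_axis == 2.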
3482
3483 // This method expects that output_data has been initialized.
3484 template <typename In, typename Out>
3485 inline bool ReduceSumImpl(const In* input_data, const int* input_dims,
3486 const int* output_dims, const int input_num_dims,
3487 const int output_num_dims, const int* axis,
3488 const int num_axis, int* input_iter,
3489 Out* output_data) {
3490 auto reducer = [](const Out current, const In in) -> Out {
3491 const Out actual_in = static_cast<Out>(in);
3492 return current + actual_in;
3493 };
3494 return Reduce<In, Out>(input_data, input_dims, output_dims, input_num_dims,
3495 output_num_dims, axis, num_axis, input_iter, reducer,
3496 output_data);
3497 }
3498
3499 template <typename T>
3500 inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
3501 const T init_value, T* data) {
3502 size_t num_elements = 1;
3503 for (int idx = 0; idx < num_dims; ++idx) {
3504 size_t current = static_cast<size_t>(dims[idx]);
3505 // Overflow prevention.
3506 if (num_elements > std::numeric_limits<size_t>::max() / current) {
3507 return false;
3508 }
3509 num_elements *= current;
3510 }
3511 for (size_t idx = 0; idx < num_elements; ++idx) {
3512 data[idx] = init_value;
3513 }
3514 return true;
3515 }
3516
3517 // Computes the generic value (i.e., sum/max/min/prod) of elements across
3518 // dimensions given in axis. The caller must pass in init_value and a reducer.
3519 template <typename T>
3520 inline bool ReduceGeneric(const T* input_data, const int* input_dims,
3521 const int input_num_dims, T* output_data,
3522 const int* output_dims, const int output_num_dims,
3523 const int* axis, const int64_t num_axis_dimensions,
3524 bool keep_dims, int* temp_index, int* resolved_axis,
3525 T init_value,
3526 T reducer(const T current, const T in)) {
3527 // Reset output data.
3528 if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
3529 output_data)) {
3530 return false;
3531 }
3532
3533 // Resolve axis.
3534 int num_resolved_axis = 0;
3535 if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
3536 &num_resolved_axis)) {
3537 return false;
3538 }
3539
3540 return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
3541 output_num_dims, resolved_axis, num_resolved_axis,
3542 temp_index, reducer, output_data);
3543 }
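
// A minimal sketch of driving ReduceGeneric above with a max reducer over
// axis 1 of a {2, 3} input (shapes, values and buffer sizes are assumptions
// for this toy case). A capture-free lambda decays to the required function
// pointer.
//
//   const float input[] = {1.f, 5.f, 3.f, 4.f, 2.f, 6.f};  // shape {2, 3}
//   const int input_dims[] = {2, 3};
//   const int output_dims[] = {2};
//   const int axis[] = {1};
//   float output[2];
//   int temp_index[2];
//   int resolved_axis[1];
//   ReduceGeneric<float>(
//       input, input_dims, /*input_num_dims=*/2, output, output_dims,
//       /*output_num_dims=*/1, axis, /*num_axis_dimensions=*/1,
//       /*keep_dims=*/false, temp_index, resolved_axis,
//       /*init_value=*/std::numeric_limits<float>::lowest(),
//       [](const float current, const float in) {
//         return std::max(current, in);
//       });
//   // output == {5.f, 6.f}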
3544
3545 // Computes the mean of elements across dimensions given in axis.
3546 // It does so in two stages: first it computes the sum of elements along the
3547 // axis, then divides that sum by the number of elements in the axis.
3548 template <typename T, typename U>
3549 inline bool Mean(const T* input_data, const int* input_dims,
3550 const int input_num_dims, T* output_data,
3551 const int* output_dims, const int output_num_dims,
3552 const int* axis, const int num_axis_dimensions, bool keep_dims,
3553 int* temp_index, int* resolved_axis, U* temp_sum) {
3554 gemmlowp::ScopedProfilingLabel label("Mean");
3555 // Reset output data.
3556 size_t num_outputs = 1;
3557 for (int idx = 0; idx < output_num_dims; ++idx) {
3558 size_t current = static_cast<size_t>(output_dims[idx]);
3559 // Overflow prevention.
3560 if (num_outputs > std::numeric_limits<size_t>::max() / current) {
3561 return false;
3562 }
3563 num_outputs *= current;
3564 }
3565 for (size_t idx = 0; idx < num_outputs; ++idx) {
3566 output_data[idx] = T();
3567 temp_sum[idx] = U();
3568 }
3569
3570 // Resolve axis.
3571 int num_resolved_axis = 0;
3572 if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
3573 &num_resolved_axis)) {
3574 return false;
3575 }
3576
3577 if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
3578 output_num_dims, resolved_axis, num_resolved_axis,
3579 temp_index, temp_sum)) {
3580 return false;
3581 }
3582
3583 // Calculate the mean by dividing output_data by the number of aggregated elements.
3584 U num_elements_in_axis = 1;
3585 for (int idx = 0; idx < num_resolved_axis; ++idx) {
3586 size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
3587 // Overflow prevention.
3588 if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
3589 return false;
3590 }
3591 num_elements_in_axis *= current;
3592 }
3593
3594 if (num_elements_in_axis > 0) {
3595 for (size_t idx = 0; idx < num_outputs; ++idx) {
3596 output_data[idx] =
3597 static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
3598 }
3599 }
3600 return true;
3601 }
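
// A minimal sketch of the two stages of Mean above (shapes, values and
// buffer sizes are assumptions for this toy case): temp_sum receives the
// per-output sums, output_data the final means.
//
//   const float input[] = {1.f, 2.f, 3.f, 4.f};  // shape {1, 2, 2, 1}
//   const int input_dims[] = {1, 2, 2, 1};
//   const int output_dims[] = {1, 1};            // axes 1 and 2 reduced away.
//   const int axis[] = {1, 2};
//   float output[1];
//   float temp_sum[1];
//   int temp_index[4];
//   int resolved_axis[2];
//   Mean<float, float>(input, input_dims, 4, output, output_dims, 2, axis, 2,
//                      /*keep_dims=*/false, temp_index, resolved_axis,
//                      temp_sum);
//   // Stage one: temp_sum[0] == 10.f.  Stage two: output[0] == 2.5f.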
3602
3603 template <typename T>
3604 inline void Mean(const tflite::MeanParams& op_params,
3605 const RuntimeShape& unextended_input_shape,
3606 const T* input_data,
3607 const RuntimeShape& unextended_output_shape, T* output_data) {
3608 gemmlowp::ScopedProfilingLabel label("Mean4D");
3609
3610 // The current implementation only supports 4D tensors and simultaneous
3611 // reduction over the width and height dimensions.
3612 TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
3613 TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3614 const RuntimeShape input_shape =
3615 RuntimeShape::ExtendedShape(4, unextended_input_shape);
3616 const RuntimeShape output_shape =
3617 RuntimeShape::ExtendedShape(4, unextended_output_shape);
3618
3619 const int output_batch = output_shape.Dims(0);
3620 const int output_height = output_shape.Dims(1);
3621 const int output_width = output_shape.Dims(2);
3622 const int output_depth = output_shape.Dims(3);
3623
3624 const int input_height = input_shape.Dims(1);
3625 const int input_width = input_shape.Dims(2);
3626
3627 TFLITE_DCHECK_EQ(op_params.axis_count, 2);
3628 TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
3629 (op_params.axis[0] == 2 && op_params.axis[1] == 1));
3630 TFLITE_DCHECK_EQ(output_height, 1);
3631 TFLITE_DCHECK_EQ(output_width, 1);
3632
3633 for (int out_b = 0; out_b < output_batch; ++out_b) {
3634 for (int out_d = 0; out_d < output_depth; ++out_d) {
3635 float value = 0;
3636 for (int in_h = 0; in_h < input_height; ++in_h) {
3637 for (int in_w = 0; in_w < input_width; ++in_w) {
3638 value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
3639 }
3640 }
3641 output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
3642 value / (input_width * input_height);
3643 }
3644 }
3645 }
3646
3647 inline void Mean(const tflite::MeanParams& op_params,
3648 const RuntimeShape& unextended_input_shape,
3649 const uint8_t* input_data, int32 input_zero_point,
3650 float input_scale, const RuntimeShape& unextended_output_shape,
3651 uint8_t* output_data, int32 output_zero_point,
3652 float output_scale) {
3653 gemmlowp::ScopedProfilingLabel label("Mean4D/Uint8");
3654
3655 // The current implementation only supports 4D tensors and simultaneous
3656 // reduction over the width and height dimensions.
3657 TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
3658 TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3659 const RuntimeShape input_shape =
3660 RuntimeShape::ExtendedShape(4, unextended_input_shape);
3661 const RuntimeShape output_shape =
3662 RuntimeShape::ExtendedShape(4, unextended_output_shape);
3663 const int output_batch = output_shape.Dims(0);
3664 const int output_height = output_shape.Dims(1);
3665 const int output_width = output_shape.Dims(2);
3666 const int output_depth = output_shape.Dims(3);
3667 const int input_height = input_shape.Dims(1);
3668 const int input_width = input_shape.Dims(2);
3669 const float num_elements_in_axis = input_width * input_height;
3670
3671 TFLITE_DCHECK_EQ(op_params.axis_count, 2);
3672 TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
3673 (op_params.axis[0] == 2 && op_params.axis[1] == 1));
3674 TFLITE_DCHECK_EQ(output_height, 1);
3675 TFLITE_DCHECK_EQ(output_width, 1);
3676
3677 const bool ordinary_mean =
3678 (input_zero_point == output_zero_point && input_scale == output_scale);
3679 float scale, bias;
3680 if (!ordinary_mean) {
3681 scale = input_scale / output_scale;
3682 bias = -input_zero_point * scale + 0.5;
3683 }
3684 for (int out_b = 0; out_b < output_batch; ++out_b) {
3685 for (int out_d = 0; out_d < output_depth; ++out_d) {
3686 float temp_value = 0;
3687 for (int in_h = 0; in_h < input_height; ++in_h) {
3688 for (int in_w = 0; in_w < input_width; ++in_w) {
3689 temp_value +=
3690 input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
3691 }
3692 }
3693 temp_value = temp_value / num_elements_in_axis;
3694 if (ordinary_mean) {
3695 output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
3696 static_cast<uint8_t>(round(temp_value));
3697 } else {
3698 output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
3699 static_cast<uint8_t>(round(temp_value * scale + bias)) +
3700 output_zero_point;
3701 }
3702 }
3703 }
3704 }
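
// Tracing the requantizing branch above with assumed parameters: for
// input_scale = 0.25, input_zero_point = 128, output_scale = 0.5 and
// output_zero_point = 128, scale = 0.25 / 0.5 = 0.5 and
// bias = -128 * 0.5 + 0.5 = -63.5. A spatial mean of 200.0 over the raw
// uint8 inputs is then stored as
// round(200.0 * 0.5 - 63.5) + 128 = 37 + 128 = 165.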
3705
3706 // Computes the mean of elements across dimensions given in axis.
3707 // For quantized values it does so in two stages: first it computes the sum of
3708 // elements along the axis, then divides that sum by the number of elements in the axis.
3709 template <typename T, typename U>
3710 inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
3711 float input_scale, const int* input_dims,
3712 const int input_num_dims, T* output_data,
3713 int32 output_zero_point, float output_scale,
3714 const int* output_dims,
3715 const int output_num_dims, const int* axis,
3716 const int num_axis_dimensions, bool keep_dims,
3717 int* temp_index, int* resolved_axis, U* temp_sum,
3718 bool compute_sum) {
3719 gemmlowp::ScopedProfilingLabel label(compute_sum ? "Sum/Uint8"
3720 : "Mean/Uint8");
3721 // Reset output data.
3722 size_t num_outputs = 1;
3723 for (int idx = 0; idx < output_num_dims; ++idx) {
3724 size_t current = static_cast<size_t>(output_dims[idx]);
3725 // Overflow prevention.
3726 if (num_outputs > std::numeric_limits<size_t>::max() / current) {
3727 return false;
3728 }
3729 num_outputs *= current;
3730 }
3731 for (size_t idx = 0; idx < num_outputs; ++idx) {
3732 output_data[idx] = T();
3733 temp_sum[idx] = U();
3734 }
3735
3736 // Resolve axis.
3737 int num_resolved_axis = 0;
3738 if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
3739 &num_resolved_axis)) {
3740 return false;
3741 }
3742
3743 if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
3744 output_num_dims, resolved_axis, num_resolved_axis,
3745 temp_index, temp_sum)) {
3746 return false;
3747 }
3748
3749 // Calculate the mean by dividing output_data by the number of aggregated elements.
3750 U num_elements_in_axis = 1;
3751 for (int idx = 0; idx < num_resolved_axis; ++idx) {
3752 size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
3753 // Overflow prevention.
3754 if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
3755 return false;
3756 }
3757 num_elements_in_axis *= current;
3758 }
3759
3760 if (num_elements_in_axis > 0) {
3761 const float scale = input_scale / output_scale;
3762 if (compute_sum) {
3763 // TODO(b/116341117): Eliminate float and do this completely in 8bit.
3764 const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5;
3765 for (size_t idx = 0; idx < num_outputs; ++idx) {
3766 const U value = static_cast<U>(round(temp_sum[idx] * scale + bias)) +
3767 output_zero_point;
3768 output_data[idx] = static_cast<T>(value);
3769 }
3770 } else {
3771 const float bias = -input_zero_point * scale + 0.5;
3772 for (size_t idx = 0; idx < num_outputs; ++idx) {
3773 float float_mean = static_cast<float>(temp_sum[idx]) /
3774 static_cast<float>(num_elements_in_axis);
3775
3776 // Requantize the float mean into the quantized output type.
3777 output_data[idx] = static_cast<T>(round(float_mean * scale + bias)) +
3778 output_zero_point;
3779 }
3780 }
3781 }
3782 return true;
3783 }
3784
3785 template <typename T>
3786 void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
3787 const T* input2_data, const RuntimeShape& output_shape,
3788 T* output_data) {
3789 const int flat_size = MatchingFlatSize(input1_shape, output_shape);
3790
3791 auto min_value = input2_data[0];
3792 for (int i = 0; i < flat_size; i++) {
3793 output_data[i] = input1_data[i] > min_value ? min_value : input1_data[i];
3794 }
3795 }
3796
3797 // Convenience version that allows, for example, generated-code calls to be
3798 // the same as other binary ops.
3799 template <typename T>
3800 inline void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
3801 const RuntimeShape&, const T* input2_data,
3802 const RuntimeShape& output_shape, T* output_data) {
3803 // Drop shape of second input: not needed.
3804 Minimum(input1_shape, input1_data, input2_data, output_shape, output_data);
3805 }
3806
3807 template <typename T>
3808 void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
3809 const T* input2_data, const RuntimeShape& output_shape,
3810 T* output_data) {
3811 const int flat_size = MatchingFlatSize(input1_shape, output_shape);
3812
3813 auto max_value = input2_data[0];
3814 for (int i = 0; i < flat_size; i++) {
3815 output_data[i] = input1_data[i] < max_value ? max_value : input1_data[i];
3816 }
3817 }
3818
3819 // Convenience version that allows, for example, generated-code calls to be
3820 // the same as other binary ops.
3821 template <typename T>
3822 inline void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
3823 const RuntimeShape&, const T* input2_data,
3824 const RuntimeShape& output_shape, T* output_data) {
3825 // Drop shape of second input: not needed.
3826 Maximum(input1_shape, input1_data, input2_data, output_shape, output_data);
3827 }
3828
3829 template <typename T, typename Op>
3830 void MaximumMinimumBroadcast4DSlow(const RuntimeShape& unextended_input1_shape,
3831 const T* input1_data,
3832 const RuntimeShape& unextended_input2_shape,
3833 const T* input2_data,
3834 const RuntimeShape& unextended_output_shape,
3835 T* output_data, Op op) {
3836 gemmlowp::ScopedProfilingLabel label("MaximumMinimumBroadcast4DSlow");
3837 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
3838 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
3839 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
3840 const RuntimeShape output_shape =
3841 RuntimeShape::ExtendedShape(4, unextended_output_shape);
3842
3843 NdArrayDesc<4> desc1;
3844 NdArrayDesc<4> desc2;
3845 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
3846 unextended_input2_shape, &desc1, &desc2);
3847
3848 for (int b = 0; b < output_shape.Dims(0); ++b) {
3849 for (int y = 0; y < output_shape.Dims(1); ++y) {
3850 for (int x = 0; x < output_shape.Dims(2); ++x) {
3851 for (int c = 0; c < output_shape.Dims(3); ++c) {
3852 auto out_idx = Offset(output_shape, b, y, x, c);
3853 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
3854 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
3855 auto in1_val = input1_data[in1_idx];
3856 auto in2_val = input2_data[in2_idx];
3857 output_data[out_idx] = op(in1_val, in2_val);
3858 }
3859 }
3860 }
3861 }
3862 }
3863
3864 template <typename T1, typename T2, typename T3, typename Cmp>
3865 void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
3866 const T3* input2_data, const RuntimeShape& output_shape,
3867 T2* output_data, const Cmp& cmp) {
3868 gemmlowp::ScopedProfilingLabel label("ArgMinMax");
3869 TFLITE_DCHECK_GT(input1_shape.DimensionsCount(), 0);
3870 TFLITE_DCHECK_EQ(input1_shape.DimensionsCount() - 1,
3871 output_shape.DimensionsCount());
3872
3873 int axis = input2_data[0];
3874 if (axis < 0) {
3875 axis += input1_shape.DimensionsCount();
3876 }
3877
3878 const int axis_size = input1_shape.Dims(axis);
3879
3880 int outer_size = 1;
3881 for (int i = 0; i < axis; ++i) {
3882 TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i));
3883 outer_size *= input1_shape.Dims(i);
3884 }
3885
3886 int inner_size = 1;
3887 const int dims_count = input1_shape.DimensionsCount();
3888 for (int i = axis + 1; i < dims_count; ++i) {
3889 TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i - 1));
3890 inner_size *= input1_shape.Dims(i);
3891 }
3892
3893 for (int outer = 0; outer < outer_size; ++outer) {
3894 for (int inner = 0; inner < inner_size; ++inner) {
3895 auto min_max_value = input1_data[outer * axis_size * inner_size + inner];
3896 int min_max_index = 0;
3897 for (int i = 1; i < axis_size; ++i) {
3898 const auto& curr_value =
3899 input1_data[(outer * axis_size + i) * inner_size + inner];
3900 if (cmp(curr_value, min_max_value)) {
3901 min_max_value = curr_value;
3902 min_max_index = i;
3903 }
3904 }
3905 output_data[outer * inner_size + inner] = min_max_index;
3906 }
3907 }
3908 }
3909
3910 template <typename T1, typename T2, typename T3>
3911 void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data,
3912 const T3* input2_data, const RuntimeShape& output_shape,
3913 T2* output_data) {
3914 ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data,
3915 std::greater<T1>());
3916 }
3917
3918 // Convenience version that allows, for example, generated-code calls to be
3919 // the same as other binary ops.
3920 template <typename T1, typename T2, typename T3>
3921 inline void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data,
3922 const RuntimeShape& input2_shape, const T3* input2_data,
3923 const RuntimeShape& output_shape, T2* output_data) {
3924 // Drop shape of second input: not needed.
3925 ArgMax(input1_shape, input1_data, input2_data, output_shape, output_data);
3926 }
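
// A minimal usage sketch for ArgMax above (shapes and values are assumptions
// for this toy case): taking the arg-max along axis 1 of a {2, 3} input.
//
//   const float input[] = {1.f, 5.f, 3.f, 4.f, 2.f, 6.f};  // shape {2, 3}
//   const int32_t axis_data[] = {1};
//   int32_t output[2];                                      // shape {2}
//   ArgMax(RuntimeShape({2, 3}), input, axis_data, RuntimeShape({2}), output);
//   // output == {1, 2}: row 0 peaks at column 1, row 1 at column 2.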
3927
3928 template <typename T>
3929 void Transpose(const TransposeParams& params,
3930 const RuntimeShape& unextended_input_shape, const T* input_data,
3931 const RuntimeShape& unextended_output_shape, T* output_data) {
3932 const int unextended_output_size = unextended_output_shape.DimensionsCount();
3933 TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
3934 TFLITE_DCHECK_LE(unextended_output_size, 4);
3935 TFLITE_DCHECK_EQ(unextended_output_size, params.perm_count);
3936 const RuntimeShape input_shape =
3937 RuntimeShape::ExtendedShape(4, unextended_input_shape);
3938 const RuntimeShape output_shape =
3939 RuntimeShape::ExtendedShape(4, unextended_output_shape);
3940 const int input_ext_size = 4 - unextended_input_shape.DimensionsCount();
3941 const int output_ext_size = 4 - unextended_output_size;
3942
3943 // The perm data is extended to match the output rank; each index is
3944 // incremented by the amount of front padding added to the input shape.
3945 int extended_perm[4];
3946 for (int i = 0; i < output_ext_size; ++i) {
3947 extended_perm[i] = i;
3948 }
3949 for (int i = 0; i < unextended_output_size; ++i) {
3950 extended_perm[i + output_ext_size] = params.perm[i] + input_ext_size;
3951 }
3952
3953 int out_sizes[4];
3954 // Compute the inverse permutation array so we can do an output-centered
3955 // transpose. Also check that output_dims matches input_dims.
3956 for (int k = 0; k < 4; k++) {
3957 out_sizes[k] = MatchingDim(input_shape, extended_perm[k], output_shape, k);
3958 }
3959
3960 // Naive transpose loop (iterate on output index and compute input index).
3961 int o[4]; // loop index (on output).
3962 int i[4];
3963 for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) {
3964 i[extended_perm[3]] = o[3];
3965 for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) {
3966 i[extended_perm[2]] = o[2];
3967 for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) {
3968 i[extended_perm[1]] = o[1];
3969 for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) {
3970 i[extended_perm[0]] = o[0];
3971 output_data[Offset(output_shape, o)] =
3972 input_data[Offset(input_shape, i)];
3973 }
3974 }
3975 }
3976 }
3977 }
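
// A minimal usage sketch for Transpose above (shapes and values are
// assumptions for this toy case): swapping the axes of a {2, 3} matrix with
// perm = {1, 0}.
//
//   const float input[] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};  // shape {2, 3}
//   float output[6];                                        // shape {3, 2}
//   TransposeParams params;
//   params.perm_count = 2;
//   params.perm[0] = 1;
//   params.perm[1] = 0;
//   Transpose(params, RuntimeShape({2, 3}), input, RuntimeShape({3, 2}),
//             output);
//   // output == {1.f, 4.f, 2.f, 5.f, 3.f, 6.f}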
3978
3979 inline void TransposeConv(
3980 const ConvParams& params, const RuntimeShape& input_shape,
3981 const float* input_data, const RuntimeShape& filter_shape,
3982 const float* filter_data, const RuntimeShape& output_shape,
3983 float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) {
3984 const int stride_width = params.stride_width;
3985 const int stride_height = params.stride_height;
3986 const int pad_width = params.padding_values.width;
3987 const int pad_height = params.padding_values.height;
3988 TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
3989 TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
3990 TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
3991 (void)im2col_data; // only used in optimized code.
3992 (void)im2col_shape; // only used in optimized code.
3993
3994 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
3995 const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
3996 const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
3997 const int input_height = input_shape.Dims(1);
3998 const int input_width = input_shape.Dims(2);
3999 const int filter_height = filter_shape.Dims(1);
4000 const int filter_width = filter_shape.Dims(2);
4001 const int output_height = output_shape.Dims(1);
4002 const int output_width = output_shape.Dims(2);
4003
4004 // Although transpose convolution simplifies to convolution with transposed
4005 // weights for strides of 1, non-unitary striding complicates matters. To
4006 // keep this reference implementation as clear as possible, we use a
4007 // "scatter" access pattern, where we loop through all the input elements,
4008 // computing their influence on the output, rather than looping through the
4009 // output elements in the typical "gather" access pattern of a conv. We
4010 // therefore must initialize the output array to zero.
4011 const int num_elements = output_shape.FlatSize();
4012 for (int i = 0; i < num_elements; i++) {
4013 output_data[i] = 0.0f;
4014 }
4015
4016 // Loop through input elements one at a time.
4017 for (int batch = 0; batch < batches; ++batch) {
4018 for (int in_y = 0; in_y < input_height; ++in_y) {
4019 for (int in_x = 0; in_x < input_width; ++in_x) {
4020 for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
4021 // Loop through the output elements it will influence
4022 const int out_x_origin = (in_x * stride_width) - pad_width;
4023 const int out_y_origin = (in_y * stride_height) - pad_height;
4024 for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
4025 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
4026 for (int out_channel = 0; out_channel < output_depth;
4027 ++out_channel) {
4028 // Compute output element location
4029 const int out_x = out_x_origin + filter_x;
4030 const int out_y = out_y_origin + filter_y;
4031 // We cannot accumulate out of bounds
4032 if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
4033 (out_y < output_height)) {
4034 float input_value = input_data[Offset(
4035 input_shape, batch, in_y, in_x, in_channel)];
4036 float filter_value =
4037 filter_data[Offset(filter_shape, out_channel, filter_y,
4038 filter_x, in_channel)];
4039 output_data[Offset(output_shape, batch, out_y, out_x,
4040 out_channel)] +=
4041 input_value * filter_value;
4042 }
4043 }
4044 }
4045 }
4046 }
4047 }
4048 }
4049 }
4050 }
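
// To make the scatter pattern above concrete (assumed example sizes): with
// stride_width = 2, pad_width = 0 and filter_width = 3, input column in_x = 0
// contributes to output columns {0, 1, 2} and in_x = 1 to {2, 3, 4}, so
// output column 2 accumulates contributions from both. This overlap is why
// the output is zeroed first and then updated with "+=".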
4051
4052 template <typename T>
4053 inline bool EqualFn(T lhs, T rhs) {
4054 return lhs == rhs;
4055 }
4056
4057 template <typename T>
4058 inline bool NotEqualFn(T lhs, T rhs) {
4059 return lhs != rhs;
4060 }
4061
4062 template <typename T>
4063 inline bool GreaterFn(T lhs, T rhs) {
4064 return lhs > rhs;
4065 }
4066 template <typename T>
4067 inline bool GreaterEqualFn(T lhs, T rhs) {
4068 return lhs >= rhs;
4069 }
4070 template <typename T>
4071 inline bool LessFn(T lhs, T rhs) {
4072 return lhs < rhs;
4073 }
4074 template <typename T>
4075 inline bool LessEqualFn(T lhs, T rhs) {
4076 return lhs <= rhs;
4077 }
4078
4079 template <typename T>
4080 using ComparisonFn = bool (*)(T, T);
4081
4082 template <typename T, ComparisonFn<T> F>
4083 inline void ComparisonImpl(
4084 const ComparisonParams& op_params, const RuntimeShape& input1_shape,
4085 const T* input1_data, const RuntimeShape& input2_shape,
4086 const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
4087 const int64_t flatsize =
4088 MatchingFlatSize(input1_shape, input2_shape, output_shape);
4089 for (int64_t i = 0; i < flatsize; ++i) {
4090 output_data[i] = F(input1_data[i], input2_data[i]);
4091 }
4092 }
4093
4094 template <ComparisonFn<float> F>
4095 inline void Comparison(const ComparisonParams& op_params,
4096 const RuntimeShape& input1_shape,
4097 const float* input1_data,
4098 const RuntimeShape& input2_shape,
4099 const float* input2_data,
4100 const RuntimeShape& output_shape, bool* output_data) {
4101 ComparisonImpl<float, F>(op_params, input1_shape, input1_data, input2_shape,
4102 input2_data, output_shape, output_data);
4103 }
4104
4105 template <typename T, ComparisonFn<int32> F>
4106 inline void ComparisonWithScaling(
4107 const ComparisonParams& op_params, const RuntimeShape& input1_shape,
4108 const T* input1_data, const RuntimeShape& input2_shape,
4109 const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
4110 int left_shift = op_params.left_shift;
4111 int32 input1_offset = op_params.input1_offset;
4112 int32 input1_multiplier = op_params.input1_multiplier;
4113 int input1_shift = op_params.input1_shift;
4114 int32 input2_offset = op_params.input2_offset;
4115 int32 input2_multiplier = op_params.input2_multiplier;
4116 int input2_shift = op_params.input2_shift;
4117
4118 const int64_t flatsize =
4119 MatchingFlatSize(input1_shape, input2_shape, output_shape);
4120 for (int64_t i = 0; i < flatsize; ++i) {
4121 const int32 input1_val = input1_offset + input1_data[i];
4122 const int32 input2_val = input2_offset + input2_data[i];
4123 const int32 shifted_input1_val = input1_val * (1 << left_shift);
4124 const int32 shifted_input2_val = input2_val * (1 << left_shift);
4125 const int32 scaled_input1_val =
4126 MultiplyByQuantizedMultiplierSmallerThanOneExp(
4127 shifted_input1_val, input1_multiplier, input1_shift);
4128 const int32 scaled_input2_val =
4129 MultiplyByQuantizedMultiplierSmallerThanOneExp(
4130 shifted_input2_val, input2_multiplier, input2_shift);
4131 output_data[i] = F(scaled_input1_val, scaled_input2_val);
4132 }
4133 }
4134
4135 template <typename T, ComparisonFn<T> F>
4136 inline void BroadcastComparison4DSlowImpl(
4137 const ComparisonParams& op_params,
4138 const RuntimeShape& unextended_input1_shape, const T* input1_data,
4139 const RuntimeShape& unextended_input2_shape, const T* input2_data,
4140 const RuntimeShape& unextended_output_shape, bool* output_data) {
4141 gemmlowp::ScopedProfilingLabel label("BroadcastComparison4DSlow");
4142 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
4143 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
4144 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4145 const RuntimeShape output_shape =
4146 RuntimeShape::ExtendedShape(4, unextended_output_shape);
4147
4148 NdArrayDesc<4> desc1;
4149 NdArrayDesc<4> desc2;
4150 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
4151 unextended_input2_shape, &desc1, &desc2);
4152
4153 for (int b = 0; b < output_shape.Dims(0); ++b) {
4154 for (int y = 0; y < output_shape.Dims(1); ++y) {
4155 for (int x = 0; x < output_shape.Dims(2); ++x) {
4156 for (int c = 0; c < output_shape.Dims(3); ++c) {
4157 output_data[Offset(output_shape, b, y, x, c)] =
4158 F(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
4159 input2_data[SubscriptToIndex(desc2, b, y, x, c)]);
4160 }
4161 }
4162 }
4163 }
4164 }
4165 template <ComparisonFn<float> F>
4166 inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
4167 const RuntimeShape& input1_shape,
4168 const float* input1_data,
4169 const RuntimeShape& input2_shape,
4170 const float* input2_data,
4171 const RuntimeShape& output_shape,
4172 bool* output_data) {
4173 BroadcastComparison4DSlowImpl<float, F>(op_params, input1_shape, input1_data,
4174 input2_shape, input2_data,
4175 output_shape, output_data);
4176 }
4177
4178 template <typename T, ComparisonFn<int32> F>
4179 inline void BroadcastComparison4DSlowWithScaling(
4180 const ComparisonParams& op_params,
4181 const RuntimeShape& unextended_input1_shape, const T* input1_data,
4182 const RuntimeShape& unextended_input2_shape, const T* input2_data,
4183 const RuntimeShape& unextended_output_shape, bool* output_data) {
4184 gemmlowp::ScopedProfilingLabel label("BroadcastComparison4DSlowWithScaling");
4185 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
4186 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
4187 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4188 const RuntimeShape output_shape =
4189 RuntimeShape::ExtendedShape(4, unextended_output_shape);
4190
4191 NdArrayDesc<4> desc1;
4192 NdArrayDesc<4> desc2;
4193 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
4194 unextended_input2_shape, &desc1, &desc2);
4195
4196 int left_shift = op_params.left_shift;
4197 int32 input1_offset = op_params.input1_offset;
4198 int32 input1_multiplier = op_params.input1_multiplier;
4199 int input1_shift = op_params.input1_shift;
4200 int32 input2_offset = op_params.input2_offset;
4201 int32 input2_multiplier = op_params.input2_multiplier;
4202 int input2_shift = op_params.input2_shift;
4203
4204 for (int b = 0; b < output_shape.Dims(0); ++b) {
4205 for (int y = 0; y < output_shape.Dims(1); ++y) {
4206 for (int x = 0; x < output_shape.Dims(2); ++x) {
4207 for (int c = 0; c < output_shape.Dims(3); ++c) {
4208 const int32 input1_val =
4209 input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)];
4210 const int32 input2_val =
4211 input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)];
4212 const int32 shifted_input1_val = input1_val * (1 << left_shift);
4213 const int32 shifted_input2_val = input2_val * (1 << left_shift);
4214 const int32 scaled_input1_val =
4215 MultiplyByQuantizedMultiplierSmallerThanOneExp(
4216 shifted_input1_val, input1_multiplier, input1_shift);
4217 const int32 scaled_input2_val =
4218 MultiplyByQuantizedMultiplierSmallerThanOneExp(
4219 shifted_input2_val, input2_multiplier, input2_shift);
4220 output_data[Offset(output_shape, b, y, x, c)] =
4221 F(scaled_input1_val, scaled_input2_val);
4222 }
4223 }
4224 }
4225 }
4226 }
4227
4228 #define TFLITE_COMPARISON_OP(name) \
4229 inline void name(const ComparisonParams& op_params, \
4230 const RuntimeShape& input1_shape, const float* input1_data, \
4231 const RuntimeShape& input2_shape, const float* input2_data, \
4232 const RuntimeShape& output_shape, bool* output_data) { \
4233 gemmlowp::ScopedProfilingLabel label(#name); \
4234 Comparison<name##Fn>(op_params, input1_shape, input1_data, input2_shape, \
4235 input2_data, output_shape, output_data); \
4236 } \
4237 template <typename T> \
4238 inline void name##NoScaling( \
4239 const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
4240 const T* input1_data, const RuntimeShape& input2_shape, \
4241 const T* input2_data, const RuntimeShape& output_shape, \
4242 bool* output_data) { \
4243 gemmlowp::ScopedProfilingLabel label(#name "NoScaling"); \
4244 ComparisonImpl<T, name##Fn>(op_params, input1_shape, input1_data, \
4245 input2_shape, input2_data, output_shape, \
4246 output_data); \
4247 } \
4248 template <typename T> \
4249 inline void name##WithScaling( \
4250 const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
4251 const T* input1_data, const RuntimeShape& input2_shape, \
4252 const T* input2_data, const RuntimeShape& output_shape, \
4253 bool* output_data) { \
4254 gemmlowp::ScopedProfilingLabel label(#name "WithScaling/8bit"); \
4255 ComparisonWithScaling<T, name##Fn>(op_params, input1_shape, input1_data, \
4256 input2_shape, input2_data, \
4257 output_shape, output_data); \
4258 } \
4259 template <typename T> \
4260 inline void Broadcast4DSlow##name##NoScaling( \
4261 const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
4262 const T* input1_data, const RuntimeShape& input2_shape, \
4263 const T* input2_data, const RuntimeShape& output_shape, \
4264 bool* output_data) { \
4265 gemmlowp::ScopedProfilingLabel label("Broadcast4DSlow" #name "NoScaling"); \
4266 BroadcastComparison4DSlowImpl<T, name##Fn>( \
4267 op_params, input1_shape, input1_data, input2_shape, input2_data, \
4268 output_shape, output_data); \
4269 } \
4270 inline void Broadcast4DSlow##name( \
4271 const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
4272 const float* input1_data, const RuntimeShape& input2_shape, \
4273 const float* input2_data, const RuntimeShape& output_shape, \
4274 bool* output_data) { \
4275 gemmlowp::ScopedProfilingLabel label("Broadcast4DSlow" #name); \
4276 BroadcastComparison4DSlow<name##Fn>(op_params, input1_shape, input1_data, \
4277 input2_shape, input2_data, \
4278 output_shape, output_data); \
4279 } \
4280 template <typename T> \
4281 inline void Broadcast4DSlow##name##WithScaling( \
4282 const ComparisonParams& op_params, const RuntimeShape& input1_shape, \
4283 const T* input1_data, const RuntimeShape& input2_shape, \
4284 const T* input2_data, const RuntimeShape& output_shape, \
4285 bool* output_data) { \
4286 gemmlowp::ScopedProfilingLabel label("Broadcast4DSlow" #name "/8bit"); \
4287 BroadcastComparison4DSlowWithScaling<T, name##Fn>( \
4288 op_params, input1_shape, input1_data, input2_shape, input2_data, \
4289 output_shape, output_data); \
4290 }
4291 TFLITE_COMPARISON_OP(Equal);
4292 TFLITE_COMPARISON_OP(NotEqual);
4293 TFLITE_COMPARISON_OP(Greater);
4294 TFLITE_COMPARISON_OP(GreaterEqual);
4295 TFLITE_COMPARISON_OP(Less);
4296 TFLITE_COMPARISON_OP(LessEqual);
4297 #undef TFLITE_COMPARISON_OP
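
// For each op above the macro emits a float entry point (e.g. Greater),
// templated NoScaling / WithScaling variants and the matching Broadcast4DSlow
// forms. A minimal float sketch (shapes and values are assumptions for this
// toy case):
//
//   const float a[] = {1.f, 2.f};
//   const float b[] = {2.f, 1.f};
//   bool out[2];
//   ComparisonParams op_params;  // Not read by the unscaled float path.
//   Greater(op_params, RuntimeShape({2}), a, RuntimeShape({2}), b,
//           RuntimeShape({2}), out);
//   // out == {false, true}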
4298
4299 template <typename D, typename T>
4300 void Select(const RuntimeShape& input_condition_shape,
4301 const D* input_condition_data, const RuntimeShape& input_x_shape,
4302 const T* input_x_data, const RuntimeShape& input_y_shape,
4303 const T* input_y_data, const RuntimeShape& output_shape,
4304 T* output_data) {
4305 const int64_t flatsize = MatchingFlatSize(
4306 input_condition_shape, input_x_shape, input_y_shape, output_shape);
4307 for (int64_t i = 0; i < flatsize; ++i) {
4308 output_data[i] =
4309 input_condition_data[i] ? input_x_data[i] : input_y_data[i];
4310 }
4311 }
4312
4313 template <typename D, typename T>
4314 void RankOneSelect(const RuntimeShape& input_condition_shape,
4315 const D* input_condition_data,
4316 const RuntimeShape& input_x_shape, const T* input_x_data,
4317 const RuntimeShape& input_y_shape, const T* input_y_data,
4318 const RuntimeShape& output_shape, T* output_data) {
4319 const int64_t outer_size = input_condition_shape.FlatSize();
4320 TFLITE_DCHECK_EQ(
4321 MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0),
4322 outer_size);
4323 const int64_t inner_size =
4324 MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape);
4325
4326 int64_t offset = 0;
4327 for (int64_t i = 0; i < outer_size; i++) {
4328 const T* input_data = input_condition_data[i] ? input_x_data : input_y_data;
4329 memcpy(output_data + offset, input_data + offset, inner_size * sizeof(T));
4330 offset += inner_size;
4331 }
4332 }
4333
4334 template <typename D, typename T>
4335 void SelectTrueCoords(const RuntimeShape& input_condition_shape,
4336 const D* input_condition_data, T* output_data) {
4337 const size_t size = input_condition_shape.FlatSize();
4338 const size_t cond_rank = input_condition_shape.DimensionsCount();
4339
4340 std::vector<int> dims_to_count(cond_rank, 0);
4341 int cur_flat_size = size;
4342 for (int i = 0; i < cond_rank; ++i) {
4343 dims_to_count[i] = cur_flat_size / input_condition_shape.Dims(i);
4344 cur_flat_size = dims_to_count[i];
4345 }
4346
4347 int output_index = 0;
4348 for (int i = 0; i < size; ++i) {
4349 if (input_condition_data[i]) {
4350 // Insert the coordinate of the current item (row major) into output.
4351 int flat_index = i;
4352 for (int j = 0; j < cond_rank; ++j) {
4353 int coord_j = flat_index / dims_to_count[j];
4354 output_data[output_index * cond_rank + j] = coord_j;
4355 flat_index %= dims_to_count[j];
4356 }
4357 output_index++;
4358 }
4359 }
4360 }
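
// A minimal usage sketch for SelectTrueCoords above (shape and values are
// assumptions for this toy case): emitting the row-major coordinates of the
// true entries of a {2, 2} mask.
//
//   const bool condition[] = {true, false, false, true};  // shape {2, 2}
//   int32_t coords[8];  // Worst case: all 4 entries true, 2 coordinates each.
//   SelectTrueCoords(RuntimeShape({2, 2}), condition, coords);
//   // Two entries are true, so only the first two rows are written:
//   // coords[0..3] == {0, 0, 1, 1}.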
4361
4362 // For ease of implementation, indices is always a vector of size-4 vectors.
4363 template <typename T, typename TI>
4364 inline void SparseToDense(const std::vector<std::vector<TI>>& indices,
4365 const T* values, T default_value,
4366 bool value_is_scalar,
4367 const RuntimeShape& unextended_output_shape,
4368 T* output_data) {
4369 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4370 const RuntimeShape output_shape =
4371 RuntimeShape::ExtendedShape(4, unextended_output_shape);
4372 const int value_count = indices.size();
4373
4374 // First fill output_data with the default value.
4375 const int num_elements = output_shape.FlatSize();
4376 for (int i = 0; i < num_elements; ++i) {
4377 output_data[i] = default_value;
4378 }
4379
4380 // Handle the scalar-value case separately to avoid checking the boolean
4381 // condition inside the loop on every iteration.
4382 if (value_is_scalar) {
4383 for (int i = 0; i < value_count; ++i) {
4384 const std::vector<TI>& index = indices[i];
4385 TFLITE_DCHECK_EQ(index.size(), 4);
4386 const T value = *values; // just use the first value.
4387 output_data[Offset(output_shape, index[0], index[1], index[2],
4388 index[3])] = value;
4389 }
4390 return;
4391 }
4392
4393 // Go through the values and indices to fill the sparse values.
4394 for (int i = 0; i < value_count; ++i) {
4395 const std::vector<TI>& index = indices[i];
4396 TFLITE_DCHECK_EQ(index.size(), 4);
4397 const T value = values[i];
4398 output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] =
4399 value;
4400 }
4401 }
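
// A minimal usage sketch for SparseToDense above (indices, values and shape
// are assumptions for this toy case): scattering two values into a
// {1, 2, 1, 3} dense output.
//
//   const std::vector<std::vector<int32_t>> indices = {{0, 0, 0, 0},
//                                                      {0, 1, 0, 2}};
//   const int32_t values[] = {5, 7};
//   int32_t dense[6];  // shape {1, 2, 1, 3}
//   SparseToDense(indices, values, /*default_value=*/0,
//                 /*value_is_scalar=*/false, RuntimeShape({1, 2, 1, 3}),
//                 dense);
//   // dense == {5, 0, 0, 0, 0, 7}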
4402
4403 template <typename T>
4404 inline void Pow(const RuntimeShape& input1_shape, const T* input1_data,
4405 const RuntimeShape& input2_shape, const T* input2_data,
4406 const RuntimeShape& output_shape, T* output_data) {
4407 const int flat_size =
4408 MatchingFlatSize(input1_shape, input2_shape, output_shape);
4409 for (int i = 0; i < flat_size; ++i) {
4410 output_data[i] = std::pow(input1_data[i], input2_data[i]);
4411 }
4412 }
4413
4414 template <typename T>
4415 inline void BroadcastPow4DSlow(const RuntimeShape& unextended_input1_shape,
4416 const T* input1_data,
4417 const RuntimeShape& unextended_input2_shape,
4418 const T* input2_data,
4419 const RuntimeShape& unextended_output_shape,
4420 T* output_data) {
4421 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
4422 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
4423 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4424 const RuntimeShape output_shape =
4425 RuntimeShape::ExtendedShape(4, unextended_output_shape);
4426
4427 NdArrayDesc<4> desc1;
4428 NdArrayDesc<4> desc2;
4429 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
4430 unextended_input2_shape, &desc1, &desc2);
4431
4432 for (int b = 0; b < output_shape.Dims(0); ++b) {
4433 for (int y = 0; y < output_shape.Dims(1); ++y) {
4434 for (int x = 0; x < output_shape.Dims(2); ++x) {
4435 for (int c = 0; c < output_shape.Dims(3); ++c) {
4436 auto out_idx = Offset(output_shape, b, y, x, c);
4437 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
4438 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
4439 auto in1_val = input1_data[in1_idx];
4440 auto in2_val = input2_data[in2_idx];
4441 output_data[out_idx] = std::pow(in1_val, in2_val);
4442 }
4443 }
4444 }
4445 }
4446 }
4447
4448 inline void Logical(const RuntimeShape& input1_shape, const bool* input1_data,
4449 const RuntimeShape& input2_shape, const bool* input2_data,
4450 const RuntimeShape& output_shape, bool* output_data,
4451 const std::function<bool(bool, bool)>& func) {
4452 const int flat_size =
4453 MatchingFlatSize(input1_shape, input2_shape, output_shape);
4454 for (int i = 0; i < flat_size; ++i) {
4455 output_data[i] = func(input1_data[i], input2_data[i]);
4456 }
4457 }
4458
4459 inline void BroadcastLogical4DSlow(
4460 const RuntimeShape& unextended_input1_shape, const bool* input1_data,
4461 const RuntimeShape& unextended_input2_shape, const bool* input2_data,
4462 const RuntimeShape& unextended_output_shape, bool* output_data,
4463 const std::function<bool(bool, bool)>& func) {
4464 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
4465 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
4466 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4467 const RuntimeShape output_shape =
4468 RuntimeShape::ExtendedShape(4, unextended_output_shape);
4469
4470 NdArrayDesc<4> desc1;
4471 NdArrayDesc<4> desc2;
4472 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
4473 unextended_input2_shape, &desc1, &desc2);
4474
4475 for (int b = 0; b < output_shape.Dims(0); ++b) {
4476 for (int y = 0; y < output_shape.Dims(1); ++y) {
4477 for (int x = 0; x < output_shape.Dims(2); ++x) {
4478 for (int c = 0; c < output_shape.Dims(3); ++c) {
4479 auto out_idx = Offset(output_shape, b, y, x, c);
4480 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
4481 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
4482 auto in1_val = input1_data[in1_idx];
4483 auto in2_val = input2_data[in2_idx];
4484 output_data[out_idx] = func(in1_val, in2_val);
4485 }
4486 }
4487 }
4488 }
4489 }
4490
4491 // TODO(ycling): Refactoring. Remove BroadcastLogical and use the more
4492 // generalized and efficient BroadcastBinaryFunction.
4493 //
4494 // Also appears to duplicate MinimumMaximum.
4495 //
4496 // R: Result type. T1: Input 1 type. T2: Input 2 type.
4497 template <typename R, typename T1, typename T2>
4498 inline void BroadcastBinaryFunction4DSlow(
4499 const RuntimeShape& unextended_input1_shape, const T1* input1_data,
4500 const RuntimeShape& unextended_input2_shape, const T2* input2_data,
4501 const RuntimeShape& unextended_output_shape, R* output_data,
4502 R (*func)(T1, T2)) {
4503 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
4504 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
4505 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4506 const RuntimeShape output_shape =
4507 RuntimeShape::ExtendedShape(4, unextended_output_shape);
4508
4509 NdArrayDesc<4> desc1;
4510 NdArrayDesc<4> desc2;
4511 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
4512 unextended_input2_shape, &desc1, &desc2);
4513
4514 for (int b = 0; b < output_shape.Dims(0); ++b) {
4515 for (int y = 0; y < output_shape.Dims(1); ++y) {
4516 for (int x = 0; x < output_shape.Dims(2); ++x) {
4517 for (int c = 0; c < output_shape.Dims(3); ++c) {
4518 auto out_idx = Offset(output_shape, b, y, x, c);
4519 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
4520 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
4521 auto in1_val = input1_data[in1_idx];
4522 auto in2_val = input2_data[in2_idx];
4523 output_data[out_idx] = func(in1_val, in2_val);
4524 }
4525 }
4526 }
4527 }
4528 }
4529
4530 // R: Result type. T1: Input 1 type. T2: Input 2 type.
4531 // TODO(renjieliu): Refactor other binary functions to use this one.
4532 template <typename R, typename T1, typename T2>
4533 inline void BinaryFunction(const RuntimeShape& input1_shape,
4534 const T1* input1_data,
4535 const RuntimeShape& input2_shape,
4536 const T2* input2_data,
4537 const RuntimeShape& output_shape, R* output_data,
4538 R (*func)(T1, T2)) {
4539 const int flat_size =
4540 MatchingFlatSize(input1_shape, input2_shape, output_shape);
4541 for (int i = 0; i < flat_size; ++i) {
4542 output_data[i] = func(input1_data[i], input2_data[i]);
4543 }
4544 }
4545
4546 template <typename T>
4547 inline void ResizeNearestNeighbor(
4548 const tflite::ResizeNearestNeighborParams& op_params,
4549 const RuntimeShape& unextended_input_shape, const T* input_data,
4550 const RuntimeShape& output_size_shape, const int32* output_size_data,
4551 const RuntimeShape& unextended_output_shape, T* output_data) {
4552 // Align corners = true is not supported.
4553 TFLITE_DCHECK(!op_params.align_corners);
4554 TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
4555 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
4556
4557 const RuntimeShape input_shape =
4558 RuntimeShape::ExtendedShape(4, unextended_input_shape);
4559 const RuntimeShape output_shape =
4560 RuntimeShape::ExtendedShape(4, unextended_output_shape);
4561
4562 int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
4563 int32 input_height = input_shape.Dims(1);
4564 int32 input_width = input_shape.Dims(2);
4565 int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
4566
4567 // The TensorFlow version of this op allows resizing along the width and
4568 // height axes only.
4569 TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
4570 int32 output_height = output_size_data[0];
4571 int32 output_width = output_size_data[1];
4572
4573 // We use float to ensure agreement with the TensorFlow implementation.
4574 const float height_scale = static_cast<float>(input_height) / output_height;
4575 const float width_scale = static_cast<float>(input_width) / output_width;
4576
4577 const int col_offset = input_shape.Dims(3);
4578 const int row_offset = input_shape.Dims(2) * col_offset;
4579 const int batch_offset = input_shape.Dims(1) * row_offset;
4580
4581 const T* input_ptr = input_data;
4582 T* output_ptr = output_data;
4583 for (int b = 0; b < batches; ++b) {
4584 for (int y = 0; y < output_height; ++y) {
4585 int32 in_y = std::min(static_cast<int32>(std::floor(y * height_scale)),
4586 input_height - 1);
4587 const T* y_input_ptr = input_ptr + in_y * row_offset;
4588 for (int x = 0; x < output_width; ++x) {
4589 int32 in_x = std::min(static_cast<int32>(std::floor(x * width_scale)),
4590 input_width - 1);
4591 const T* x_input_ptr = y_input_ptr + in_x * col_offset;
4592 memcpy(output_ptr, x_input_ptr, depth * sizeof(T));
4593 output_ptr += depth;
4594 }
4595 }
4596 input_ptr += batch_offset;
4597 }
4598 }
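
// Tracing the index mapping above with assumed sizes: resizing height 2 to
// height 4 gives height_scale = 2 / 4 = 0.5, so output rows {0, 1, 2, 3} read
// input rows floor({0.0, 0.5, 1.0, 1.5}) = {0, 0, 1, 1}. Columns follow the
// same rule, and each selected (row, column) is copied depth elements at a
// time with memcpy.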
4599
4600 inline void BroadcastPrelu4DSlow(const PreluParams& params,
4601 const RuntimeShape& input_shape,
4602 const uint8* input_data,
4603 const RuntimeShape& alpha_shape,
4604 const uint8* alpha_data,
4605 const RuntimeShape& output_shape,
4606 uint8* output_data) {
4607 TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4);
4608 TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4);
4609 TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
4610 const RuntimeShape extended_output_shape =
4611 RuntimeShape::ExtendedShape(4, output_shape);
4612 NdArrayDesc<4> desc1;
4613 NdArrayDesc<4> desc2;
4614 NdArrayDescsForElementwiseBroadcast(input_shape, alpha_shape, &desc1, &desc2);
4615
4616 for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
4617 for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
4618 for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
4619 for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
4620 int output_index = Offset(extended_output_shape, b, y, x, c);
4621 int input_index = SubscriptToIndex(desc1, b, y, x, c);
4622 const int32 input_value =
4623 params.input_offset + input_data[input_index];
4624 if (input_value >= 0) {
4625 output_data[output_index] = input_data[input_index];
4626 } else {
4627 auto alpha_index = SubscriptToIndex(desc2, b, y, x, c);
4628 const int32 alpha_value =
4629 params.alpha_offset + alpha_data[alpha_index];
4630 const int32 unclamped_output =
4631 params.output_offset +
4632 MultiplyByQuantizedMultiplierSmallerThanOneExp(
4633 input_value * alpha_value, params.output_multiplier,
4634 params.output_shift);
4635 const int32 quantized_min = std::numeric_limits<uint8_t>::min();
4636 const int32 quantized_max = std::numeric_limits<uint8_t>::max();
4637 const int32 clamped_output = std::min(
4638 quantized_max, std::max(quantized_min, unclamped_output));
4639 output_data[output_index] = static_cast<uint8>(clamped_output);
4640 }
4641 }
4642 }
4643 }
4644 }
4645 }
4646
4647 template <typename T>
4648 void Fill(const RuntimeShape& value_shape, const T* value_data,
4649 const RuntimeShape& output_shape, T* output_data) {
4650 TFLITE_DCHECK_EQ(value_shape.DimensionsCount(), 0);
4651 const int flat_size = output_shape.FlatSize();
4652 for (int i = 0; i < flat_size; ++i) {
4653 output_data[i] = *value_data;
4654 }
4655 }
4656
4657 template <typename Scalar>
4658 void Reverse(int axis, const RuntimeShape& input_shape,
4659 const Scalar* input_data, const RuntimeShape& output_shape,
4660 Scalar* output_data) {
4661 gemmlowp::ScopedProfilingLabel label("Reverse");
4662
4663 int outer_size = 1;
4664 for (int i = 0; i < axis; ++i) {
4665 outer_size *= input_shape.Dims(i);
4666 }
4667
4668 int copy_size = 1;
4669 for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i) {
4670 copy_size *= input_shape.Dims(i);
4671 }
4672
4673 const int dims_at_axis = input_shape.Dims(axis);
4674 for (int i = 0; i < outer_size; ++i) {
4675 for (int j = 0; j < dims_at_axis; ++j) {
4676 const int start_pos = (i * dims_at_axis + j) * copy_size;
4677 Scalar* output_ptr = output_data + start_pos;
4678 int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size;
4679 memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
4680 }
4681 }
4682 }
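
// A minimal usage sketch for Reverse above (shape and values are assumptions
// for this toy case): reversing axis 0 of a {3, 2} tensor.
//
//   const int32_t input[] = {1, 2, 3, 4, 5, 6};  // shape {3, 2}
//   int32_t output[6];
//   Reverse(/*axis=*/0, RuntimeShape({3, 2}), input, RuntimeShape({3, 2}),
//           output);
//   // output == {5, 6, 3, 4, 1, 2}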
4683
4684 template <typename Scalar, typename TS>
4685 void ReverseSequence(const TS* seq_lengths, const int seq_dim,
4686 const int batch_dim, const RuntimeShape& input_shape,
4687 const Scalar* input_data, const RuntimeShape& output_shape,
4688 Scalar* output_data) {
4689 gemmlowp::ScopedProfilingLabel label("ReverseSequence");
4690
4691 int outer_size = 1;
4692 int outer_dim = std::min(batch_dim, seq_dim);
4693 int medium_dim = std::max(batch_dim, seq_dim);
4694 for (int i = 0; i < outer_dim; ++i) {
4695 outer_size *= input_shape.Dims(i);
4696 }
4697
4698 int medium_size = 1;
4699 for (int i = outer_dim + 1; i < medium_dim; ++i) {
4700 medium_size *= input_shape.Dims(i);
4701 }
4702
4703 int copy_size = 1;
4704 for (int i = medium_dim + 1; i < input_shape.DimensionsCount(); ++i) {
4705 copy_size *= input_shape.Dims(i);
4706 }
4707
4708 const int dims_at_outer_dim = input_shape.Dims(outer_dim);
4709 const int dims_at_medium_dim = input_shape.Dims(medium_dim);
4710
4711 Scalar* output_ptr;
4712 if (batch_dim > seq_dim) {
4713 for (int i = 0; i < outer_size; ++i) {
4714 for (int j = 0; j < dims_at_outer_dim; ++j) {
4715 const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
4716 for (int p = 0; p < medium_size; ++p) {
4717 for (int q = 0; q < dims_at_medium_dim; ++q) {
4718 const int in_pos =
4719 ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
4720 const Scalar* in_ptr = input_data + in_pos;
4721 int sl = seq_lengths[q] - 1;
4722 if (j > sl) {
4723 output_ptr = output_data + in_pos;
4724 } else {
4725 const int out_pos_base =
4726 (i * dims_at_outer_dim + sl - j) * medium_size;
4727 const int out_pos =
4728 ((out_pos_base + p) * dims_at_medium_dim + q) * copy_size;
4729 output_ptr = output_data + out_pos;
4730 }
4731 memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
4732 }
4733 }
4734 }
4735 }
4736 } else if (batch_dim < seq_dim) {
4737 for (int i = 0; i < outer_size; ++i) {
4738 for (int j = 0; j < dims_at_outer_dim; ++j) {
4739 const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
4740 int sl = seq_lengths[j] - 1;
4741 const int out_pos_base = (i * dims_at_outer_dim + j) * medium_size;
4742 for (int p = 0; p < medium_size; ++p) {
4743 for (int q = 0; q < dims_at_medium_dim; ++q) {
4744 const int in_pos =
4745 ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
4746 const Scalar* in_ptr = input_data + in_pos;
4747 if (q > sl) {
4748 output_ptr = output_data + in_pos;
4749 } else {
4750 const int out_pos =
4751 ((out_pos_base + p) * dims_at_medium_dim + sl - q) *
4752 copy_size;
4753 output_ptr = output_data + out_pos;
4754 }
4755 memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
4756 }
4757 }
4758 }
4759 }
4760 }
4761 }
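
// A minimal usage sketch for ReverseSequence above (shapes, lengths and
// values are assumptions for this toy case): each batch row is reversed along
// the time axis up to its own length.
//
//   const int32_t seq_lengths[] = {2, 3};
//   const int32_t input[] = {1, 2, 3, 4, 5, 6};  // shape {2, 3, 1}
//   int32_t output[6];
//   ReverseSequence(seq_lengths, /*seq_dim=*/1, /*batch_dim=*/0,
//                   RuntimeShape({2, 3, 1}), input, RuntimeShape({2, 3, 1}),
//                   output);
//   // Row 0 reverses its first two entries, row 1 all three:
//   // output == {2, 1, 3, 6, 5, 4}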
4762
4763 } // namespace reference_ops
4764 } // namespace tflite
4765
4766 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
4767