/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_

#include <stdint.h>

#include <algorithm>
#include <limits>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {

inline void SubNonBroadcast(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const float* input1_data,
                            const RuntimeShape& input2_shape,
                            const float* input2_data,
                            const RuntimeShape& output_shape,
                            float* output_data) {
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] - input2_data[i], params.float_activation_min,
        params.float_activation_max);
  }
}
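
// Example (illustrative sketch, not part of the library API): a hypothetical
// call to the float SubNonBroadcast above on two 1-D tensors with no
// activation clamp. Shapes and values are made up for illustration.
//
//   ArithmeticParams params;
//   params.float_activation_min = std::numeric_limits<float>::lowest();
//   params.float_activation_max = std::numeric_limits<float>::max();
//   const RuntimeShape shape({4});
//   const float in1[] = {1.f, 2.f, 3.f, 4.f};
//   const float in2[] = {0.5f, 0.5f, 0.5f, 0.5f};
//   float out[4];
//   tflite::reference_ops::SubNonBroadcast(params, shape, in1, shape, in2,
//                                          shape, out);
//   // out == {0.5f, 1.5f, 2.5f, 3.5f}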

inline void SubNonBroadcast(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const int32_t* input1_data,
                            const RuntimeShape& input2_shape,
                            const int32_t* input2_data,
                            const RuntimeShape& output_shape,
                            int32_t* output_data) {
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] - input2_data[i], params.quantized_activation_min,
        params.quantized_activation_max);
  }
}

// TODO(b/151345304): We can implement BroadcastSub on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const float* input1_data,
                             const RuntimeShape& input2_shape,
                             const float* input2_data,
                             const RuntimeShape& output_shape,
                             float* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/float");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.float_activation_min, params.float_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}
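
// Example (illustrative sketch with hypothetical shapes): broadcasting {2, 1}
// against {1, 3} yields a {2, 3} output. NdArrayDescsForElementwiseBroadcast
// gives each input a stride of 0 along its broadcast dimension, so
// SubscriptToIndex reuses the same element across that dimension.
//
//   const RuntimeShape shape1({2, 1});
//   const RuntimeShape shape2({1, 3});
//   const RuntimeShape out_shape({2, 3});
//   const float a[] = {10.f, 20.f};
//   const float b[] = {1.f, 2.f, 3.f};
//   float out[6];
//   tflite::reference_ops::BroadcastSubSlow(params, shape1, a, shape2, b,
//                                           out_shape, out);
//   // out == {9, 8, 7, 19, 18, 17}  (params as in the earlier float example)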

template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const uint8_t* input1_data,
                             const RuntimeShape& input2_shape,
                             const uint8_t* input2_data,
                             const RuntimeShape& output_shape,
                             uint8_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8_t");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    const int32_t input1_val =
        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t input2_val =
        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sub, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<uint8_t>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}
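
// The fixed-point pipeline above is a sketch of the usual affine quantization
// scheme: each uint8_t value q represents the real number
// scale * (q - zero_point), and the offsets in ArithmeticParams are typically
// the negated zero points. Conceptually the kernel computes
//
//   q_out = clamp(round((real1 - real2) / output_scale) + output_offset)
//
// where both inputs are first rescaled to a common intermediate scale using
// left_shift and the per-input multiplier/shift pairs, so the subtraction
// itself can be carried out exactly in int32_t.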

template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const int32_t* input1_data,
                             const RuntimeShape& input2_shape,
                             const int32_t* input2_data,
                             const RuntimeShape& output_shape,
                             int32_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.quantized_activation_min, params.quantized_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <int N = 5>
inline void BroadcastSubSlow(const ArithmeticParams& params,
                             const RuntimeShape& input1_shape,
                             const int8_t* input1_data,
                             const RuntimeShape& input2_shape,
                             const int8_t* input2_data,
                             const RuntimeShape& output_shape,
                             int8_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8_t");
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    const int32_t input1_val =
        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t input2_val =
        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sub, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<int8_t>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <int N = 5>
void BroadcastSubSlow(const ArithmeticParams& params,
                      const RuntimeShape& input1_shape,
                      const int64_t* input1_data,
                      const RuntimeShape& input2_shape,
                      const int64_t* input2_data,
                      const RuntimeShape& output_shape, int64_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.int64_activation_min, params.int64_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <typename T, int N = 5>
void BroadcastSubSlow(const ArithmeticParams& params,
                      const RuntimeShape& input1_shape, const T* input1_data,
                      const RuntimeShape& input2_shape, const T* input2_data,
                      const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSubSlow/templated");
  TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    output_data[SubscriptToIndex(output_desc, indexes)] =
        ActivationFunctionWithMinMax(
            input1_data[SubscriptToIndex(desc1, indexes)] -
                input2_data[SubscriptToIndex(desc2, indexes)],
            params.quantized_activation_min, params.quantized_activation_max);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}

template <int N = 5>
inline void BroadcastSub16POTSlow(const ArithmeticParams& params,
                                  const RuntimeShape& input1_shape,
                                  const int16_t* input1_data,
                                  const RuntimeShape& input2_shape,
                                  const int16_t* input2_data,
                                  const RuntimeShape& output_shape,
                                  int16_t* output_data) {
  ruy::profiler::ScopeLabel label("BroadcastSub16POTSlow/int16_t");
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  auto sub_func = [&](int indexes[N]) {
    const int32_t input1_val = input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t input2_val = input2_data[SubscriptToIndex(desc2, indexes)];
    const int32_t scaled_input1_val =
        gemmlowp::RoundingDivideByPOT(input1_val, -params.input1_shift);
    const int32_t scaled_input2_val =
        gemmlowp::RoundingDivideByPOT(input2_val, -params.input2_shift);
    const int32_t raw_output = scaled_input1_val - scaled_input2_val;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<int16_t>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, sub_func);
}
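
// In the power-of-two (POT) variant above, the per-input rescaling factors
// are assumed to be exact powers of two, so each input is brought to the
// output scale with a single rounding right shift instead of a full
// fixed-point multiply. For example (hypothetical parameter value), with
// input1_shift == -3 the first input is divided by 2^3 = 8, rounding to
// nearest, before the subtraction.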

// Element-wise Sub that can often be used for inner loop of broadcast sub as
// well as the non-broadcast sub.
inline void SubElementwise(int size, const ArithmeticParams& params,
                           const uint8_t* input1_data,
                           const uint8_t* input2_data, uint8_t* output_data) {
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);

  for (int i = 0; i < size; ++i) {
    const int32_t input1_val = params.input1_offset + input1_data[i];
    const int32_t input2_val = params.input2_offset + input2_data[i];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sub, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<uint8_t>(clamped_output);
  }
}

// Element-wise Sub that can often be used for inner loop of broadcast sub as
// well as the non-broadcast sub.
inline void SubElementwise(int size, const ArithmeticParams& params,
                           const int8_t* input1_data, const int8_t* input2_data,
                           int8_t* output_data) {
  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);

  for (int i = 0; i < size; ++i) {
    const int32_t input1_val = params.input1_offset + input1_data[i];
    const int32_t input2_val = params.input2_offset + input2_data[i];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sub, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<int8_t>(clamped_output);
  }
}

inline void Sub(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8_t* input1_data,
                const RuntimeShape& input2_shape, const uint8_t* input2_data,
                const RuntimeShape& output_shape, uint8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  SubElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void Sub(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int8_t* input1_data,
                const RuntimeShape& input2_shape, const int8_t* input2_data,
                const RuntimeShape& output_shape, int8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
  SubElementwise(flat_size, params, input1_data, input2_data, output_data);
}

template <typename T>
void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
         const T* input1_data, const RuntimeShape& input2_shape,
         const T* input2_data, const RuntimeShape& output_shape,
         T* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
        }
      }
    }
  }
}
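
// Example (illustrative sketch with hypothetical shapes): the generic Sub
// above broadcasts over at most four dimensions and applies no activation
// clamp; params is accepted for interface uniformity but not read here.
//
//   ArithmeticParams params;  // unused by this overload
//   const RuntimeShape s1({1, 2}), s2({2, 2}), so({2, 2});
//   const int32_t a[] = {5, 5};
//   const int32_t b[] = {1, 2, 3, 4};
//   int32_t out[4];
//   tflite::reference_ops::Sub(params, s1, a, s2, b, so, out);
//   // out == {4, 3, 2, 1}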

inline void SetActivationMinMax(const ArithmeticParams& params,
                                int32_t* activation_min,
                                int32_t* activation_max) {
  *activation_min = params.quantized_activation_min;
  *activation_max = params.quantized_activation_max;
}

inline void SetActivationMinMax(const ArithmeticParams& params,
                                float* activation_min, float* activation_max) {
  *activation_min = params.float_activation_min;
  *activation_max = params.float_activation_max;
}

inline void SetActivationMinMax(const ArithmeticParams& params,
                                int64_t* activation_min,
                                int64_t* activation_max) {
  *activation_min = params.int64_activation_min;
  *activation_max = params.int64_activation_max;
}

template <typename T>
inline void SubWithActivation(
    const ArithmeticParams& params, const RuntimeShape& input1_shape,
    const T* input1_data, const RuntimeShape& input2_shape,
    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("SubWithActivation");
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  T activation_min, activation_max;
  SetActivationMinMax(params, &activation_min, &activation_max);

  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] - input2_data[i], activation_min, activation_max);
  }
}
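
// Example (illustrative sketch): SubWithActivation selects the activation
// bounds that match T through the SetActivationMinMax overloads above, so the
// same template serves float, int32_t and int64_t. A hypothetical
// ReLU-style float call, reusing the shapes and buffers from the first
// SubNonBroadcast example:
//
//   ArithmeticParams params;
//   params.float_activation_min = 0.f;  // clamp negative differences to zero
//   params.float_activation_max = std::numeric_limits<float>::max();
//   tflite::reference_ops::SubWithActivation(params, shape, in1, shape, in2,
//                                            shape, out);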

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_