1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
17
#include <algorithm>
#include <cstdint>
#include <limits>

#include "tensorflow/lite/kernels/internal/common.h"
21
22 namespace tflite {
23
24 namespace reference_ops {
25
26 template <typename T>
DivCheckArithmeticParams(const ArithmeticParams & params)27 inline void DivCheckArithmeticParams(const ArithmeticParams& params) {
28 TFLITE_DCHECK_LE(params.quantized_activation_min,
29 params.quantized_activation_max);
30 // Input offset is negative input zero point. Activation tensors are
31 // asymmetric quantized so they span the full int8 range.
32 constexpr int32_t max_value =
33 static_cast<int32_t>(std::numeric_limits<T>::max());
34 TFLITE_DCHECK_GE(params.input1_offset, -max_value);
35 TFLITE_DCHECK_LE(params.input1_offset, max_value);
36 TFLITE_DCHECK_GE(params.input2_offset, -max_value);
37 TFLITE_DCHECK_LE(params.input2_offset, max_value);
38 TFLITE_DCHECK_GE(params.output_offset, -max_value);
39 TFLITE_DCHECK_LE(params.output_offset, max_value);
40 }
41
42 // Element-wise div that can often be used for inner loop of broadcast Div as
43 // well as the non-broadcast Div.
44 template <typename T>
DivElementwise(int size,const ArithmeticParams & params,const T * input1_data,const T * input2_data,T * output_data)45 inline void DivElementwise(int size, const ArithmeticParams& params,
46 const T* input1_data, const T* input2_data,
47 T* output_data) {
48 DivCheckArithmeticParams<T>(params);
49
50 for (int i = 0; i < size; ++i) {
51 const int32_t input1_val = params.input1_offset + input1_data[i];
52 const int32_t input2_val = params.input2_offset + input2_data[i];
53 TFLITE_DCHECK_NE(input2_val, 0);
54 int recip_shift;
55 const int32_t input2_inv =
56 (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
57 : -GetReciprocal(-input2_val, 31, &recip_shift);
58 const int headroom = CountLeadingSignBits(input1_val);
59 const int32_t unscaled_quotient =
60 MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
61 headroom);
62 const int total_shift = params.output_shift - recip_shift - headroom;
63 const int32_t unclamped_result =
64 params.output_offset +
65 MultiplyByQuantizedMultiplierSmallerThanOneExp(
66 unscaled_quotient, params.output_multiplier, total_shift);
67 const int32_t clamped_output =
68 std::min(params.quantized_activation_max,
69 std::max(params.quantized_activation_min, unclamped_result));
70 output_data[i] = static_cast<T>(clamped_output);
71 }
72 }
73
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8_t * input1_data,const RuntimeShape & input2_shape,const uint8_t * input2_data,const RuntimeShape & output_shape,uint8_t * output_data)74 inline void Div(const ArithmeticParams& params,
75 const RuntimeShape& input1_shape, const uint8_t* input1_data,
76 const RuntimeShape& input2_shape, const uint8_t* input2_data,
77 const RuntimeShape& output_shape, uint8_t* output_data) {
78 TFLITE_DCHECK_LE(params.quantized_activation_min,
79 params.quantized_activation_max);
80 const int flat_size =
81 MatchingElementsSize(input1_shape, input2_shape, output_shape);
82
83 DivElementwise(flat_size, params, input1_data, input2_data, output_data);
84 }
85
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int8_t * input1_data,const RuntimeShape & input2_shape,const int8_t * input2_data,const RuntimeShape & output_shape,int8_t * output_data)86 inline void Div(const ArithmeticParams& params,
87 const RuntimeShape& input1_shape, const int8_t* input1_data,
88 const RuntimeShape& input2_shape, const int8_t* input2_data,
89 const RuntimeShape& output_shape, int8_t* output_data) {
90 TFLITE_DCHECK_LE(params.quantized_activation_min,
91 params.quantized_activation_max);
92 const int flat_size =
93 MatchingElementsSize(input1_shape, input2_shape, output_shape);
94
95 DivElementwise(flat_size, params, input1_data, input2_data, output_data);
96 }
97
98 template <typename T, int N = 5>
BroadcastDivSlowQuantized(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)99 inline void BroadcastDivSlowQuantized(
100 const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape,
101 const T* input1_data, const RuntimeShape& unextended_input2_shape,
102 const T* input2_data, const RuntimeShape& unextended_output_shape,
103 T* output_data) {
104 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
105 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
106 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
107
108 NdArrayDesc<N> desc1;
109 NdArrayDesc<N> desc2;
110 NdArrayDesc<N> output_desc;
111 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
112 unextended_input2_shape, &desc1, &desc2);
113 CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
114 &output_desc);
115
116 DivCheckArithmeticParams<T>(params);
117
118 auto div_func = [&](int indexes[N]) {
119 const int32_t input1_val =
120 params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
121 const int32_t input2_val =
122 params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
123 TFLITE_DCHECK_NE(input2_val, 0);
124 int recip_shift;
125 const int32_t input2_inv =
126 (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
127 : -GetReciprocal(-input2_val, 31, &recip_shift);
128 const int headroom = CountLeadingSignBits(input1_val);
129 const int32_t unscaled_quotient =
130 MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
131 headroom);
132 const int total_shift = params.output_shift - recip_shift - headroom;
133 const int32_t unclamped_result =
134 params.output_offset +
135 MultiplyByQuantizedMultiplierSmallerThanOneExp(
136 unscaled_quotient, params.output_multiplier, total_shift);
137 const int32_t clamped_output =
138 std::min(params.quantized_activation_max,
139 std::max(params.quantized_activation_min, unclamped_result));
140 output_data[SubscriptToIndex(output_desc, indexes)] =
141 static_cast<T>(clamped_output);
142 };
143 NDOpsHelper<N>(output_desc, div_func);
144 }
145
146 template <int N = 5>
BroadcastDivSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const uint8_t * input1_data,const RuntimeShape & unextended_input2_shape,const uint8_t * input2_data,const RuntimeShape & unextended_output_shape,uint8_t * output_data)147 inline void BroadcastDivSlow(const ArithmeticParams& params,
148 const RuntimeShape& unextended_input1_shape,
149 const uint8_t* input1_data,
150 const RuntimeShape& unextended_input2_shape,
151 const uint8_t* input2_data,
152 const RuntimeShape& unextended_output_shape,
153 uint8_t* output_data) {
154 BroadcastDivSlowQuantized<uint8_t, N>(
155 params, unextended_input1_shape, input1_data, unextended_input2_shape,
156 input2_data, unextended_output_shape, output_data);
157 }
158
159 template <int N = 5>
BroadcastDivSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const int8_t * input1_data,const RuntimeShape & unextended_input2_shape,const int8_t * input2_data,const RuntimeShape & unextended_output_shape,int8_t * output_data)160 inline void BroadcastDivSlow(const ArithmeticParams& params,
161 const RuntimeShape& unextended_input1_shape,
162 const int8_t* input1_data,
163 const RuntimeShape& unextended_input2_shape,
164 const int8_t* input2_data,
165 const RuntimeShape& unextended_output_shape,
166 int8_t* output_data) {
167 BroadcastDivSlowQuantized<int8_t, N>(
168 params, unextended_input1_shape, input1_data, unextended_input2_shape,
169 input2_data, unextended_output_shape, output_data);
170 }
171
172 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
173 // dimensionality if the runtime code does a single loop over one dimension
174 // that handles broadcasting as the base case. The code generator would then
175 // generate max(D1, D2) nested for loops.
176 template <typename T, int N = 5>
BroadcastDivSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)177 void BroadcastDivSlow(const ArithmeticParams& params,
178 const RuntimeShape& unextended_input1_shape,
179 const T* input1_data,
180 const RuntimeShape& unextended_input2_shape,
181 const T* input2_data,
182 const RuntimeShape& unextended_output_shape,
183 T* output_data) {
184 T output_activation_min;
185 T output_activation_max;
186 GetActivationParams(params, &output_activation_min, &output_activation_max);
187
188 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
189 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
190 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
191
192 NdArrayDesc<N> desc1;
193 NdArrayDesc<N> desc2;
194 NdArrayDesc<N> output_desc;
195 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
196 unextended_input2_shape, &desc1, &desc2);
197 CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
198 &output_desc);
199
200 // In Tensorflow, the dimensions are canonically named (batch_number, row,
201 // col, channel), with extents (batches, height, width, depth), with the
202 // trailing dimension changing most rapidly (channels has the smallest
203 // stride, typically 1 element).
204 //
205 // In generated C code, we store arrays with the dimensions reversed. The
206 // first dimension has smallest stride.
207
208 auto div_func = [&](int indexes[N]) {
209 output_data[SubscriptToIndex(output_desc, indexes)] =
210 ActivationFunctionWithMinMax(
211 input1_data[SubscriptToIndex(desc1, indexes)] /
212 input2_data[SubscriptToIndex(desc2, indexes)],
213 output_activation_min, output_activation_max);
214 };
215 NDOpsHelper<N>(output_desc, div_func);
216 }
217
218 template <typename T>
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)219 inline void Div(const ArithmeticParams& params,
220 const RuntimeShape& input1_shape, const T* input1_data,
221 const RuntimeShape& input2_shape, const T* input2_data,
222 const RuntimeShape& output_shape, T* output_data) {
223 T output_activation_min;
224 T output_activation_max;
225 GetActivationParams(params, &output_activation_min, &output_activation_max);
226
227 const int flat_size =
228 MatchingElementsSize(input1_shape, input2_shape, output_shape);
229 for (int i = 0; i < flat_size; ++i) {
230 output_data[i] = ActivationFunctionWithMinMax(
231 input1_data[i] / input2_data[i], output_activation_min,
232 output_activation_max);
233 }
234 }
235
236 } // namespace reference_ops
237 } // namespace tflite
238
239 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
240