/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <algorithm>
#include <cmath>
#include <limits>

#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/round.h"

namespace tflite {

namespace {
// These constants are used to manipulate the binary representation of doubles.
// Double-precision binary64 floating point format is:
// Bit |  63  |  62-52   |   51-0   |
//     | Sign | Exponent | Fraction |
// To avoid 64-bit integers as much as possible, I break this into high and
// low 32-bit chunks. High is:
// Bit |  31  |  30-20   |      19-0     |
//     | Sign | Exponent | High Fraction |
// Low is:
// Bit |     31-0     |
//     | Low Fraction |
// We then access the components through logical bit-wise operations to
// extract the parts needed, with the positions and masks derived from the
// layout shown above.
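//
// For example, the double 1.0 is stored as 0x3ff0000000000000: sign 0,
// biased exponent 0x3ff (1023, i.e. an unbiased exponent of 0), and an
// all-zero fraction, since the leading 1 bit of the significand is implicit
// in the format.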
constexpr uint64_t kSignMask = 0x8000000000000000LL;
constexpr uint64_t kExponentMask = 0x7ff0000000000000LL;
constexpr int32_t kExponentShift = 52;
constexpr int32_t kExponentBias = 1023;
constexpr uint32_t kExponentIsBadNum = 0x7ff;
constexpr uint64_t kFractionMask = 0x000fffffffc00000LL;
constexpr uint32_t kFractionShift = 22;
constexpr uint32_t kFractionRoundingMask = 0x003fffff;
constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
}  // namespace

void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
                        int* shift) {
  if (double_multiplier == 0.) {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }
#ifdef TFLITE_EMULATE_FLOAT
  // If we're trying to avoid the use of floating-point instructions (for
  // example on microcontrollers) then use an alternative implementation
  // that only requires integer and bitwise operations. To enable this, you
  // need to set the define during the build process for your platform.
  int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
#else   // TFLITE_EMULATE_FLOAT
  const double q = std::frexp(double_multiplier, shift);
  auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
#endif  // TFLITE_EMULATE_FLOAT
  TFLITE_CHECK(q_fixed <= (1ll << 31));
  if (q_fixed == (1ll << 31)) {
    q_fixed /= 2;
    ++*shift;
  }
  TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}
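
// Illustrative examples of the contract above (worked by hand, assuming
// standard frexp behavior): QuantizeMultiplier(1.0, &m, &s) yields m = 1 << 30
// and s = 1, since frexp(1.0) returns 0.5 with an exponent of 1 and
// round(0.5 * 2^31) == 1 << 30; the represented value is then
// m * 2^(s - 31) = 2^30 * 2^-30 = 1.0. Similarly, 0.5 yields m = 1 << 30 with
// s = 0.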

void QuantizeMultiplierGreaterThanOne(double double_multiplier,
                                      int32_t* quantized_multiplier,
                                      int* left_shift) {
  TFLITE_CHECK_GT(double_multiplier, 1.);
  QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
  TFLITE_CHECK_GE(*left_shift, 0);
}

void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
                                         int32_t* quantized_multiplier,
                                         int* left_shift) {
  TFLITE_CHECK_LT(double_multiplier, 1.);
  TFLITE_CHECK_GT(double_multiplier, 0.);
  int shift;
  QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
  TFLITE_CHECK_LE(shift, 0);
  *left_shift = shift;
}
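
// For instance (an illustrative case, not from the original source):
// QuantizeMultiplierSmallerThanOneExp(0.25, &m, &s) produces m = 1 << 30 and
// s = -1, because frexp(0.25) returns 0.5 with an exponent of -1. The shift
// written to *left_shift is zero or negative by construction here.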

int64_t IntegerFrExp(double input, int* shift) {
  // Make sure our assumptions about the double layout hold.
  TFLITE_CHECK_EQ(8, sizeof(double));

  // We want to access the bits of the input double value directly, which is
  // tricky to do safely, so use a union to handle the casting.
  union {
    double double_value;
    uint64_t double_as_uint;
  } cast_union;
  cast_union.double_value = input;
  const uint64_t u = cast_union.double_as_uint;

  // If the bitfield is all zeros apart from the sign bit, the input is a
  // (positive or negative) zero, so return standard values for this special
  // case.
  if ((u & ~kSignMask) == 0) {
    *shift = 0;
    return 0;
  }

  // Deal with NaNs and Infs, which are always indicated with a fixed pattern
  // in the exponent, and distinguished by whether the fraction is zero or
  // non-zero.
  const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
  if (exponent_part == kExponentIsBadNum) {
    *shift = std::numeric_limits<int>::max();
    if (u & kFractionMask) {
      // NaN, so just return zero (with the exponent set to INT_MAX).
      return 0;
    } else {
      // Infinity, so return +/- INT_MAX.
      if (u & kSignMask) {
        return std::numeric_limits<int64_t>::min();
      } else {
        return std::numeric_limits<int64_t>::max();
      }
    }
  }

  // The shift is fairly easy to extract from the high bits of the double
  // value, just by masking it out and applying a bias. The std::frexp()
  // implementation always returns values between 0.5 and 1.0 though, whereas
  // the exponent assumes 1.0 to 2.0 is the standard range, so I add on one to
  // match that interface.
  *shift = (exponent_part - kExponentBias) + 1;

  // There's an implicit high bit in the double format definition, so make
  // sure we include that at the top, and then reconstruct the rest of the
  // fractional value from the remaining fragments.
  int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);

  // We're cutting off some bits at the bottom, so to exactly match the
  // standard frexp implementation here we'll apply rounding by adding one to
  // the least significant bit of the result if the discarded portion is over
  // half of the maximum.
  if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
    fraction += 1;
  }
  // Negate the fraction if the sign bit was set.
  if (u & kSignMask) {
    fraction *= -1;
  }

  return fraction;
}
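
// A worked example of the result format (illustrative): for input 1.0 the
// exponent field is 1023, so *shift becomes (1023 - 1023) + 1 = 1, and the
// returned fraction is 0x40000000 (0.5 under the fraction * 2^-31 scaling),
// giving 0.5 * 2^1 = 1.0, consistent with std::frexp(1.0, ...).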

double DoubleFromFractionAndShift(int64_t fraction, int shift) {
  union {
    double double_value;
    uint64_t double_as_uint;
  } result;

  // Detect NaNs and infinities.
  if (shift == std::numeric_limits<int>::max()) {
    if (fraction == 0) {
      return NAN;
    } else if (fraction > 0) {
      return INFINITY;
    } else {
      return -INFINITY;
    }
  }

  // Return a (positive) zero for a zero fraction.
  if (fraction == 0) {
    result.double_as_uint = 0;
    return result.double_value;
  }

  bool is_negative = (fraction < 0);
  int64_t encoded_fraction = is_negative ? -fraction : fraction;
  int64_t encoded_shift = (shift - 1);
  while (encoded_fraction < 0x40000000) {
    encoded_fraction *= 2;
    encoded_shift -= 1;
  }
  while (encoded_fraction > 0x80000000) {
    encoded_fraction /= 2;
    encoded_shift += 1;
  }
  encoded_fraction -= 0x40000000;
  if (encoded_shift < -1022) {
    encoded_shift = -1023;
  } else if (encoded_shift > 1022) {
    encoded_shift = 1023;
  }
  encoded_shift += kExponentBias;
  uint64_t encoded_sign = is_negative ? kSignMask : 0;
  result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
                          (encoded_fraction << kFractionShift);
  return result.double_value;
}
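
// Round-trip sanity check (illustrative): IntegerFrExp(1.0, &shift) returns
// fraction 0x40000000 with shift 1, and DoubleFromFractionAndShift(0x40000000,
// 1) re-encodes exactly 1.0, since the normalization loops leave the fraction
// at 0x40000000 and the biased exponent lands at 1023.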

double IntegerDoubleMultiply(double a, double b) {
  int a_shift;
  const int64_t a_fraction = IntegerFrExp(a, &a_shift);
  int b_shift;
  const int64_t b_fraction = IntegerFrExp(b, &b_shift);
  // Detect NaNs and infinities.
  if (a_shift == std::numeric_limits<int>::max() ||
      (b_shift == std::numeric_limits<int>::max())) {
    return NAN;
  }
  const int result_shift = a_shift + b_shift + 1;
  const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
  return DoubleFromFractionAndShift(result_fraction, result_shift);
}
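
// As a check on the fixed-point arithmetic (illustrative): for a == b == 1.0,
// both fractions are 0x40000000 with shift 1, so result_fraction =
// (2^30 * 2^30) >> 32 = 2^28 and result_shift = 3; DoubleFromFractionAndShift
// renormalizes 2^28 back up to 0x40000000, reducing the shift from 3 to 1 and
// recovering 1.0. The "+ 1" in result_shift compensates for the product being
// shifted right by 32 rather than 31 bits.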

int IntegerDoubleCompare(double a, double b) {
  int a_shift;
  const int64_t a_fraction = IntegerFrExp(a, &a_shift);
  int b_shift;
  const int64_t b_fraction = IntegerFrExp(b, &b_shift);

  // Detect NaNs and infinities.
  if (a_shift == std::numeric_limits<int>::max() ||
      (b_shift == std::numeric_limits<int>::max())) {
    return 1;
  }

  if ((a_fraction == 0) && (b_fraction < 0)) {
    return 1;
  } else if ((a_fraction < 0) && (b_fraction == 0)) {
    return -1;
  } else if (a_shift < b_shift) {
    return -1;
  } else if (a_shift > b_shift) {
    return 1;
  } else if (a_fraction < b_fraction) {
    return -1;
  } else if (a_fraction > b_fraction) {
    return 1;
  } else {
    return 0;
  }
}

void PreprocessSoftmaxScaling(double beta, double input_scale,
                              int input_integer_bits,
                              int32_t* quantized_multiplier, int* left_shift) {
  // If the overall multiplier (input and beta) is large, then exp() of an
  // input difference of 1 scaled by this will be large. In other words, we
  // can cap the multiplier and know that, when it is used, the output will
  // round to zero wherever the input is not at the maximum value.

  // If the overall scale is less than one, and input_integer_bits=0, then the
  // result is the double equivalent of Q0.31 (actually with more precision).
  // Thus this generates a Q(input_integer_bits).(31-input_integer_bits)
  // representation.
#ifdef TFLITE_EMULATE_FLOAT
  const double input_beta = IntegerDoubleMultiply(beta, input_scale);
  int shift;
  int64_t fraction = IntegerFrExp(input_beta, &shift);
  shift += (31 - input_integer_bits);
  double input_beta_real_multiplier =
      DoubleFromFractionAndShift(fraction, shift);
  if (IntegerDoubleCompare(input_beta_real_multiplier, (1ll << 31) - 1.0) > 0) {
    input_beta_real_multiplier = (1ll << 31) - 1.0;
  }
#else   // TFLITE_EMULATE_FLOAT
  const double input_beta_real_multiplier = std::min(
      beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
#endif  // TFLITE_EMULATE_FLOAT

  QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
                                   quantized_multiplier, left_shift);
}
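
// A worked example (illustrative values, not from the original source): with
// beta = 1.0, input_scale = 1.0 / 256, and input_integer_bits = 5, the real
// multiplier is (1 / 256) * 2^26 = 2^18, so QuantizeMultiplierGreaterThanOne
// produces quantized_multiplier = 1 << 30 with left_shift = 19, since
// frexp(2^18) returns 0.5 with an exponent of 19.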

void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
                                    int input_integer_bits,
                                    int32_t* quantized_multiplier,
                                    int* left_shift,
                                    int32_t* reverse_scaling_divisor,
                                    int* reverse_scaling_left_shift) {
  PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
                           quantized_multiplier, left_shift);

  // Also calculate what amounts to the inverse scaling factor for the input.
  const double real_reverse_scaling_divisor =
      (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
  tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
                                              reverse_scaling_divisor,
                                              reverse_scaling_left_shift);
}

int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
#ifdef TFLITE_EMULATE_FLOAT
  int64_t result = (1 << input_integer_bits) - 1;
  result <<= (31 - input_integer_bits);
  result >>= input_left_shift;
  return result;
#else   // TFLITE_EMULATE_FLOAT
  const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) *
                                    (1ll << (31 - input_integer_bits)) /
                                    (1ll << input_left_shift);
  // Tighten bound using floor. Suppose that we could use the exact value.
  // After scaling the difference, the result would be at the maximum. Thus we
  // must ensure that our value has lower magnitude.
  return static_cast<int>(std::floor(max_input_rescaled));
#endif  // TFLITE_EMULATE_FLOAT
}
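
// For example (an illustrative calculation): with input_integer_bits = 4 and
// input_left_shift = 20, the radius is floor(15 * 2^27 / 2^20) = 15 * 128 =
// 1920; the integer-only branch computes the same value with shifts alone.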

void NudgeQuantizationRange(const float min, const float max,
                            const int quant_min, const int quant_max,
                            float* nudged_min, float* nudged_max,
                            float* nudged_scale) {
  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
  const float quant_min_float = static_cast<float>(quant_min);
  const float quant_max_float = static_cast<float>(quant_max);
  *nudged_scale = (max - min) / (quant_max_float - quant_min_float);
  const float zero_point_from_min = quant_min_float - min / *nudged_scale;
  uint16 nudged_zero_point;
  if (zero_point_from_min < quant_min_float) {
    nudged_zero_point = static_cast<uint16>(quant_min);
  } else if (zero_point_from_min > quant_max_float) {
    nudged_zero_point = static_cast<uint16>(quant_max);
  } else {
    nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
  }
  *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
  *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
}
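
// Worked example (illustrative, assuming TfLiteRound rounds halves away from
// zero): for min = -1.0f, max = 1.0f, quant_min = 0, quant_max = 255, the
// scale is 2 / 255 and zero_point_from_min is 127.5, which rounds to 128; the
// nudged range becomes roughly [-1.0039, 0.9961] so that zero is exactly
// representable.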

void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
                       const float nudged_max, const float* input_data,
                       float* output_data, const float size) {
  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
  const float inv_nudged_scale = 1.0f / nudged_scale;

  for (int i = 0; i < size; i++) {
    const float src_val = input_data[i];
    const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
    const float clamped_shifted = clamped - nudged_min;
    const float dst_val =
        TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
        nudged_min;
    output_data[i] = dst_val;
  }
}
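
// For a single element (illustrative): with nudged_scale = 0.5f,
// nudged_min = -1.0f, nudged_max = 1.0f, an input of 0.3f is clamped (a
// no-op here), shifted to 1.3f, rounded to 3 steps of 0.5f, and mapped back
// to 3 * 0.5f - 1.0f = 0.5f, i.e. the nearest representable quantized level.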

bool CheckedLog2(const float x, int* log2_result) {
  // Using TfLiteRound instead of std::round and std::log instead of
  // std::log2 to work around these functions being missing in a toolchain
  // used in some TensorFlow tests as of May 2018.
  const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
  const float x_log2_rounded = TfLiteRound(x_log2);
  const float x_log2_fracpart = x_log2 - x_log2_rounded;

  *log2_result = static_cast<int>(x_log2_rounded);
  return std::abs(x_log2_fracpart) < 1e-3;
}
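
// Example behavior (illustrative): CheckedLog2(8.0f, &result) sets result to
// 3 and returns true, while CheckedLog2(10.0f, &result) returns false because
// log2(10) is not within 1e-3 of an integer.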

void QuantizeMultiplierArray(const double* effective_scales, size_t size,
                             int32_t* effective_scale_significand,
                             int* effective_shift) {
  for (size_t i = 0; i < size; ++i) {
    QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
                       &effective_shift[i]);
  }
}

}  // namespace tflite