1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
#include "tensorflow/lite/kernels/internal/quantization_util.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>

#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
24
25 namespace tflite {
26
namespace {
// These constants are used to manipulate the binary representation of doubles.
// Double-precision binary64 floating point format is:
// Bit |  63  |  62-52   |   51-0   |
//     | Sign | Exponent | Fraction |
// To avoid 64-bit integers as much as possible, I break this into high and
// low 32-bit chunks. High is:
// Bit |  31  |  30-20   |      19-0     |
//     | Sign | Exponent | High Fraction |
// Low is:
// Bit |     31-0     |
//     | Low Fraction |
// We then access the components through logical bit-wise operations to
// extract the parts needed, with the positions and masks derived from the
// layout shown above.
// The ULL suffix is used because 0x8000000000000000 does not fit in a signed
// long long; with LL the type silently became unsigned long long anyway via
// the hexadecimal-literal promotion rules, so make it explicit.
constexpr uint64_t kSignMask = 0x8000000000000000ULL;
constexpr uint64_t kExponentMask = 0x7ff0000000000000ULL;
constexpr int32_t kExponentShift = 52;
constexpr int32_t kExponentBias = 1023;
constexpr uint32_t kExponentIsBadNum = 0x7ff;
// Top 30 explicit fraction bits (51-22); the bottom 22 bits are only
// consulted for rounding via the two constants below.
constexpr uint64_t kFractionMask = 0x000fffffffc00000ULL;
constexpr uint32_t kFractionShift = 22;
constexpr uint32_t kFractionRoundingMask = 0x003fffff;
constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
}  // namespace
52
QuantizeMultiplier(double double_multiplier,int32_t * quantized_multiplier,int * shift)53 void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
54 int* shift) {
55 if (double_multiplier == 0.) {
56 *quantized_multiplier = 0;
57 *shift = 0;
58 return;
59 }
60 #ifdef TFLITE_EMULATE_FLOAT
61 // If we're trying to avoid the use of floating-point instructions (for
62 // example on microcontrollers) then use an alternative implementation
63 // that only requires integer and bitwise operations. To enable this, you
64 // need to set the define during the build process for your platform.
65 int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
66 #else // TFLITE_EMULATE_FLOAT
67 const double q = std::frexp(double_multiplier, shift);
68 auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
69 #endif // TFLITE_EMULATE_FLOAT
70 TFLITE_CHECK(q_fixed <= (1ll << 31));
71 if (q_fixed == (1ll << 31)) {
72 q_fixed /= 2;
73 ++*shift;
74 }
75 TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
76 // A shift amount smaller than -31 would cause all bits to be shifted out
77 // and thus all results would be zero. We implement that instead with
78 // q_fixed==0, so as to avoid hitting issues with right-shift
79 // operations with shift amounts greater than 31. Note that this happens
80 // roughly when abs(double_multiplier) < 2^-31 and the present handling means
81 // that we're effectively flushing tiny double_multiplier's to zero.
82 // We could conceivably handle values in the range (roughly) [32, 63]
83 // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
84 // the present handling is just doing 'flush denormals to zero'. We could
85 // reconsider and actually generate nonzero denormals if a need arises.
86 if (*shift < -31) {
87 *shift = 0;
88 q_fixed = 0;
89 }
90 *quantized_multiplier = static_cast<int32_t>(q_fixed);
91 }
92
QuantizeMultiplierGreaterThanOne(double double_multiplier,int32_t * quantized_multiplier,int * left_shift)93 void QuantizeMultiplierGreaterThanOne(double double_multiplier,
94 int32_t* quantized_multiplier,
95 int* left_shift) {
96 TFLITE_CHECK_GT(double_multiplier, 1.);
97 QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
98 TFLITE_CHECK_GE(*left_shift, 0);
99 }
100
QuantizeMultiplierSmallerThanOneExp(double double_multiplier,int32_t * quantized_multiplier,int * left_shift)101 void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
102 int32_t* quantized_multiplier,
103 int* left_shift) {
104 TFLITE_CHECK_LT(double_multiplier, 1.);
105 TFLITE_CHECK_GT(double_multiplier, 0.);
106 int shift;
107 QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
108 TFLITE_CHECK_LE(shift, 0);
109 *left_shift = shift;
110 }
111
IntegerFrExp(double input,int * shift)112 int64_t IntegerFrExp(double input, int* shift) {
113 // Make sure our assumptions about the double layout hold.
114 TFLITE_CHECK_EQ(8, sizeof(double));
115
116 // We want to access the bits of the input double value directly, which is
117 // tricky to do safely, so use a union to handle the casting.
118 union {
119 double double_value;
120 uint64_t double_as_uint;
121 } cast_union;
122 cast_union.double_value = input;
123 const uint64_t u = cast_union.double_as_uint;
124
125 // If the bitfield is all zeros apart from the sign bit, this is a normalized
126 // zero value, so return standard values for this special case.
127 if ((u & ~kSignMask) == 0) {
128 *shift = 0;
129 return 0;
130 }
131
132 // Deal with NaNs and Infs, which are always indicated with a fixed pattern in
133 // the exponent, and distinguished by whether the fractions are zero or
134 // non-zero.
135 const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
136 if (exponent_part == kExponentIsBadNum) {
137 *shift = std::numeric_limits<int>::max();
138 if (u & kFractionMask) {
139 // NaN, so just return zero (with the exponent set to INT_MAX).
140 return 0;
141 } else {
142 // Infinity, so return +/- INT_MAX.
143 if (u & kSignMask) {
144 return std::numeric_limits<int64_t>::min();
145 } else {
146 return std::numeric_limits<int64_t>::max();
147 }
148 }
149 }
150
151 // The shift is fairly easy to extract from the high bits of the double value,
152 // just by masking it out and applying a bias. The std::frexp() implementation
153 // always returns values between 0.5 and 1.0 though, whereas the exponent
154 // assumes 1.0 to 2.0 is the standard range, so I add on one to match that
155 // interface.
156 *shift = (exponent_part - kExponentBias) + 1;
157
158 // There's an implicit high bit in the double format definition, so make sure
159 // we include that at the top, and then reconstruct the rest of the fractional
160 // value from the remaining fragments.
161 int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);
162
163 // We're cutting off some bits at the bottom, so to exactly match the standard
164 // frexp implementation here we'll apply rounding by adding one to the least
165 // significant bit of the result if the discarded portion is over half of the
166 // maximum.
167 if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
168 fraction += 1;
169 }
170 // Negate the fraction if the sign bit was set.
171 if (u & kSignMask) {
172 fraction *= -1;
173 }
174
175 return fraction;
176 }
177
DoubleFromFractionAndShift(int64_t fraction,int shift)178 double DoubleFromFractionAndShift(int64_t fraction, int shift) {
179 union {
180 double double_value;
181 uint64_t double_as_uint;
182 } result;
183
184 // Detect NaNs and infinities.
185 if (shift == std::numeric_limits<int>::max()) {
186 if (fraction == 0) {
187 return std::numeric_limits<double>::quiet_NaN();
188 } else if (fraction > 0) {
189 return std::numeric_limits<double>::infinity();
190 } else {
191 return -std::numeric_limits<double>::infinity();
192 }
193 }
194
195 // Return a normalized zero for a zero fraction.
196 if (fraction == 0) {
197 result.double_as_uint = 0;
198 return result.double_value;
199 }
200
201 bool is_negative = (fraction < 0);
202 int64_t encoded_fraction = is_negative ? -fraction : fraction;
203 int64_t encoded_shift = (shift - 1);
204 while (encoded_fraction < 0x40000000) {
205 encoded_fraction *= 2;
206 encoded_shift -= 1;
207 }
208 while (encoded_fraction > 0x80000000) {
209 encoded_fraction /= 2;
210 encoded_shift += 1;
211 }
212 encoded_fraction -= 0x40000000;
213 if (encoded_shift < -1022) {
214 encoded_shift = -1023;
215 } else if (encoded_shift > 1022) {
216 encoded_shift = 1023;
217 }
218 encoded_shift += kExponentBias;
219 uint64_t encoded_sign = is_negative ? kSignMask : 0;
220 result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
221 (encoded_fraction << kFractionShift);
222 return result.double_value;
223 }
224
IntegerDoubleMultiply(double a,double b)225 double IntegerDoubleMultiply(double a, double b) {
226 int a_shift;
227 const int64_t a_fraction = IntegerFrExp(a, &a_shift);
228 int b_shift;
229 const int64_t b_fraction = IntegerFrExp(b, &b_shift);
230 // Detect NaNs and infinities.
231 if (a_shift == std::numeric_limits<int>::max() ||
232 (b_shift == std::numeric_limits<int>::max())) {
233 return std::numeric_limits<double>::quiet_NaN();
234 }
235 const int result_shift = a_shift + b_shift + 1;
236 const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
237 return DoubleFromFractionAndShift(result_fraction, result_shift);
238 }
239
IntegerDoubleCompare(double a,double b)240 int IntegerDoubleCompare(double a, double b) {
241 int a_shift;
242 const int64_t a_fraction = IntegerFrExp(a, &a_shift);
243 int b_shift;
244 const int64_t b_fraction = IntegerFrExp(b, &b_shift);
245
246 // Detect NaNs and infinities.
247 if (a_shift == std::numeric_limits<int>::max() ||
248 (b_shift == std::numeric_limits<int>::max())) {
249 return 1;
250 }
251
252 if ((a_fraction == 0) && (b_fraction < 0)) {
253 return 1;
254 } else if ((a_fraction < 0) && (b_fraction == 0)) {
255 return -1;
256 } else if (a_shift < b_shift) {
257 return -1;
258 } else if (a_shift > b_shift) {
259 return 1;
260 } else if (a_fraction < b_fraction) {
261 return -1;
262 } else if (a_fraction > b_fraction) {
263 return 1;
264 } else {
265 return 0;
266 }
267 }
268
PreprocessSoftmaxScaling(double beta,double input_scale,int input_integer_bits,int32_t * quantized_multiplier,int * left_shift)269 void PreprocessSoftmaxScaling(double beta, double input_scale,
270 int input_integer_bits,
271 int32_t* quantized_multiplier, int* left_shift) {
272 // If the overall multiplier (input and beta) is large, then exp() of an
273 // input difference of 1 scaled by this will be large. In other words, we
274 // can cap the multiplier and know that, when it is used, the output will be
275 // (round to) zero wherever the input is not at the maximum value.
276
277 // If the overall scale is less than one, and input_integer_bits=0, then the
278 // result is double equivalent of Q0.31 (actually with more precision). Thus
279 // this generates a Q(input_integer_bits).(31-input_integer_bits)
280 // representation.
281 #ifdef TFLITE_EMULATE_FLOAT
282 const double input_beta = IntegerDoubleMultiply(beta, input_scale);
283 int shift;
284 int64_t fraction = IntegerFrExp(input_beta, &shift);
285 shift += (31 - input_integer_bits);
286 double input_beta_real_multiplier =
287 DoubleFromFractionAndShift(fraction, shift);
288 if (IntegerDoubleCompare(input_beta_real_multiplier, (1ll << 31) - 1.0) > 0) {
289 input_beta_real_multiplier = (1ll << 31) - 1.0;
290 }
291 #else // TFLITE_EMULATE_FLOAT
292 const double input_beta_real_multiplier = std::min<double>(
293 beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
294 #endif // TFLITE_EMULATE_FLOAT
295
296 QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
297 quantized_multiplier, left_shift);
298 }
299
PreprocessLogSoftmaxScalingExp(double beta,double input_scale,int input_integer_bits,int32_t * quantized_multiplier,int * left_shift,int32_t * reverse_scaling_divisor,int * reverse_scaling_left_shift)300 void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
301 int input_integer_bits,
302 int32_t* quantized_multiplier,
303 int* left_shift,
304 int32_t* reverse_scaling_divisor,
305 int* reverse_scaling_left_shift) {
306 PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
307 quantized_multiplier, left_shift);
308
309 // Also calculate what amounts to the inverse scaling factor for the input.
310 const double real_reverse_scaling_divisor =
311 (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
312 tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
313 reverse_scaling_divisor,
314 reverse_scaling_left_shift);
315 }
316
// Returns the largest input magnitude (in pre-shift quantized units) that the
// fixed-point representation can hold: (2^input_integer_bits - 1) scaled up
// to total_signed_bits and shifted back down by input_left_shift.
int CalculateInputRadius(int input_integer_bits, int input_left_shift,
                         int total_signed_bits) {
#ifdef TFLITE_EMULATE_FLOAT
  // Integer-only path. 1ll avoids signed 32-bit overflow (undefined
  // behavior) if input_integer_bits ever reaches 31.
  int64_t result = (1ll << input_integer_bits) - 1;
  result <<= (total_signed_bits - input_integer_bits);
  result >>= input_left_shift;
  return result;
#else   // TFLITE_EMULATE_FLOAT
  // The math here is exact in double precision; 1ll keeps the first shift
  // well-defined for large input_integer_bits as well.
  const double max_input_rescaled =
      1.0 * ((1ll << input_integer_bits) - 1) *
      (1ll << (total_signed_bits - input_integer_bits)) /
      (1ll << input_left_shift);
  // Tighten bound using floor. Suppose that we could use the exact value.
  // After scaling the difference, the result would be at the maximum. Thus we
  // must ensure that our value has lower magnitude.
  return static_cast<int>(std::floor(max_input_rescaled));
#endif  // TFLITE_EMULATE_FLOAT
}
335
// Nudges the requested float range [min, max] so that a quantized zero point
// lands exactly on an integer in [quant_min, quant_max], writing the adjusted
// range and the scale. The nudged range always contains the nudged zero.
void NudgeQuantizationRange(const float min, const float max,
                            const int quant_min, const int quant_max,
                            float* nudged_min, float* nudged_max,
                            float* nudged_scale) {
  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
  const float quant_min_float = static_cast<float>(quant_min);
  const float quant_max_float = static_cast<float>(quant_max);
  // One quantization step in float units.
  *nudged_scale = (max - min) / (quant_max_float - quant_min_float);
  // Where zero would fall in quantized units if min mapped to quant_min.
  const float zero_point_from_min = quant_min_float - min / *nudged_scale;
  // NOTE(review): uint16_t presumes quant_min/quant_max are non-negative and
  // fit 16 bits (e.g. 0..255) — confirm against callers.
  uint16_t nudged_zero_point;
  // Clamp the (rounded) zero point into the representable quantized range.
  if (zero_point_from_min < quant_min_float) {
    nudged_zero_point = static_cast<uint16_t>(quant_min);
  } else if (zero_point_from_min > quant_max_float) {
    nudged_zero_point = static_cast<uint16_t>(quant_max);
  } else {
    nudged_zero_point = static_cast<uint16_t>(TfLiteRound(zero_point_from_min));
  }
  // Recompute the float range from the integer zero point so that zero is
  // exactly representable.
  *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
  *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
}
356
FakeQuantizeArray(const float nudged_scale,const float nudged_min,const float nudged_max,const float * input_data,float * output_data,const float size)357 void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
358 const float nudged_max, const float* input_data,
359 float* output_data, const float size) {
360 // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
361 const float inv_nudged_scale = 1.0f / nudged_scale;
362
363 for (int i = 0; i < size; i++) {
364 const float src_val = input_data[i];
365 const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
366 const float clamped_shifted = clamped - nudged_min;
367 const float dst_val =
368 TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
369 nudged_min;
370 output_data[i] = dst_val;
371 }
372 }
373
CheckedLog2(const float x,int * log2_result)374 bool CheckedLog2(const float x, int* log2_result) {
375 // Using TfLiteRound instead of std::round and std::log instead of
376 // std::log2 to work around these functions being missing in a toolchain
377 // used in some TensorFlow tests as of May 2018.
378 const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
379 const float x_log2_rounded = TfLiteRound(x_log2);
380 const float x_log2_fracpart = x_log2 - x_log2_rounded;
381
382 *log2_result = static_cast<int>(x_log2_rounded);
383 return std::abs(x_log2_fracpart) < 1e-3f;
384 }
385
QuantizeMultiplierArray(const double * effective_scales,size_t size,int32_t * effective_scale_significand,int * effective_shift)386 void QuantizeMultiplierArray(const double* effective_scales, size_t size,
387 int32_t* effective_scale_significand,
388 int* effective_shift) {
389 for (size_t i = 0; i < size; ++i) {
390 QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
391 &effective_shift[i]);
392 }
393 }
394
395 } // namespace tflite
396