• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include <algorithm>
17 #include <cmath>
18 #include <limits>
19 
20 #include "tensorflow/lite/kernels/internal/compatibility.h"
21 #include "tensorflow/lite/kernels/internal/quantization_util.h"
22 #include "tensorflow/lite/kernels/internal/round.h"
23 
24 namespace tflite {
25 
namespace {
// These constants are used to manipulate the binary representation of doubles.
// Double-precision binary64 floating point format is:
// Bit |  63  |  62-52   |   51-0   |
//     | Sign | Exponent | Fraction |
// To avoid 64-bit integers as much as possible, I break this into high and
// low 32-bit chunks. High is:
// Bit |  31  |  30-20   |      19-0     |
//     | Sign | Exponent | High Fraction |
// Low is:
// Bit |     31-0     |
//     | Low Fraction |
// We then access the components through logical bit-wise operations to
// extract the parts needed, with the positions and masks derived from the
// layout shown above.
constexpr uint64_t kSignMask = 0x8000000000000000LL;       // Bit 63.
constexpr uint64_t kExponentMask = 0x7ff0000000000000LL;   // Bits 62-52.
constexpr int32_t kExponentShift = 52;                     // Exponent LSB position.
constexpr int32_t kExponentBias = 1023;                    // binary64 exponent bias.
constexpr uint32_t kExponentIsBadNum = 0x7ff;              // All-ones exponent: NaN/Inf.
// Top 30 fraction bits (bits 51-22); these are kept when converting the
// 52-bit fraction to the 31-bit fixed-point form used by IntegerFrExp().
constexpr uint64_t kFractionMask = 0x000fffffffc00000LL;
constexpr uint32_t kFractionShift = 22;
// The 22 discarded low fraction bits, and the halfway point used to decide
// round-up when truncating them.
constexpr uint32_t kFractionRoundingMask = 0x003fffff;
constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
}  // namespace
51 
QuantizeMultiplier(double double_multiplier,int32_t * quantized_multiplier,int * shift)52 void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
53                         int* shift) {
54   if (double_multiplier == 0.) {
55     *quantized_multiplier = 0;
56     *shift = 0;
57     return;
58   }
59 #ifdef TFLITE_EMULATE_FLOAT
60   // If we're trying to avoid the use of floating-point instructions (for
61   // example on microcontrollers) then use an alternative implementation
62   // that only requires integer and bitwise operations. To enable this, you
63   // need to set the define during the build process for your platform.
64   int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
65 #else   // TFLITE_EMULATE_FLOAT
66   const double q = std::frexp(double_multiplier, shift);
67   auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
68 #endif  // TFLITE_EMULATE_FLOAT
69   TFLITE_CHECK(q_fixed <= (1ll << 31));
70   if (q_fixed == (1ll << 31)) {
71     q_fixed /= 2;
72     ++*shift;
73   }
74   TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
75   *quantized_multiplier = static_cast<int32_t>(q_fixed);
76 }
77 
// Specialization of QuantizeMultiplier() for multipliers known to be greater
// than one. The resulting exponent is verified to be non-negative, i.e. it
// can be applied as a left shift.
void QuantizeMultiplierGreaterThanOne(double double_multiplier,
                                      int32_t* quantized_multiplier,
                                      int* left_shift) {
  TFLITE_CHECK_GT(double_multiplier, 1.);
  QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
  TFLITE_CHECK_GE(*left_shift, 0);
}
85 
QuantizeMultiplierSmallerThanOneExp(double double_multiplier,int32_t * quantized_multiplier,int * left_shift)86 void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
87                                          int32_t* quantized_multiplier,
88                                          int* left_shift) {
89   TFLITE_CHECK_LT(double_multiplier, 1.);
90   TFLITE_CHECK_GT(double_multiplier, 0.);
91   int shift;
92   QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
93   TFLITE_CHECK_LE(shift, 0);
94   *left_shift = shift;
95 }
96 
// Integer-only replacement for std::frexp(): decomposes |input| into a
// fraction (the frexp mantissa scaled by 2^31, nominally in
// [0x40000000, 0x80000000]) and a base-2 exponent written to *shift, so that
// input == (fraction / 2^31) * 2^(*shift) up to rounding of the discarded
// low fraction bits. Special cases: zero -> (0, 0); NaN -> (0, INT_MAX);
// +/-infinity -> (INT64_MAX / INT64_MIN, INT_MAX).
int64_t IntegerFrExp(double input, int* shift) {
  // Make sure our assumptions about the double layout hold.
  TFLITE_CHECK_EQ(8, sizeof(double));

  // We want to access the bits of the input double value directly, which is
  // tricky to do safely, so use a union to handle the casting.
  // NOTE(review): union type-punning is technically UB in standard C++
  // (well-defined in C); supported compilers implement it as expected.
  union {
    double double_value;
    uint64_t double_as_uint;
  } cast_union;
  cast_union.double_value = input;
  const uint64_t u = cast_union.double_as_uint;

  // If the bitfield is all zeros apart from the sign bit, this is a normalized
  // zero value, so return standard values for this special case.
  if ((u & ~kSignMask) == 0) {
    *shift = 0;
    return 0;
  }

  // Deal with NaNs and Infs, which are always indicated with a fixed pattern in
  // the exponent, and distinguished by whether the fractions are zero or
  // non-zero.
  const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
  if (exponent_part == kExponentIsBadNum) {
    *shift = std::numeric_limits<int>::max();
    if (u & kFractionMask) {
      // NaN, so just return zero (with the exponent set to INT_MAX).
      return 0;
    } else {
      // Infinity, so return +/- INT_MAX.
      if (u & kSignMask) {
        return std::numeric_limits<int64_t>::min();
      } else {
        return std::numeric_limits<int64_t>::max();
      }
    }
  }

  // The shift is fairly easy to extract from the high bits of the double value,
  // just by masking it out and applying a bias. The std::frexp() implementation
  // always returns values between 0.5 and 1.0 though, whereas the exponent
  // assumes 1.0 to 2.0 is the standard range, so I add on one to match that
  // interface.
  *shift = (exponent_part - kExponentBias) + 1;

  // There's an implicit high bit in the double format definition, so make sure
  // we include that at the top, and then reconstruct the rest of the fractional
  // value from the remaining fragments. 0x40000000 is the implicit leading
  // 1-bit at position 30; the explicit top 30 fraction bits sit below it.
  int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);

  // We're cutting off some bits at the bottom, so to exactly match the standard
  // frexp implementation here we'll apply rounding by adding one to the least
  // significant bit of the result if the discarded portion is over half of the
  // maximum.
  if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
    fraction += 1;
  }
  // Negate the fraction if the sign bit was set.
  if (u & kSignMask) {
    fraction *= -1;
  }

  return fraction;
}
162 
// Inverse of IntegerFrExp(): reconstructs a double from a (fraction, shift)
// pair using the same convention, i.e. result == (fraction / 2^31) * 2^shift.
// A shift of INT_MAX encodes NaN (fraction == 0) or +/-infinity; a zero
// fraction yields +0.0.
double DoubleFromFractionAndShift(int64_t fraction, int shift) {
  // Build the result bit pattern directly; see the layout constants above.
  union {
    double double_value;
    uint64_t double_as_uint;
  } result;

  // Detect NaNs and infinities.
  if (shift == std::numeric_limits<int>::max()) {
    if (fraction == 0) {
      return NAN;
    } else if (fraction > 0) {
      return INFINITY;
    } else {
      return -INFINITY;
    }
  }

  // Return a normalized zero for a zero fraction.
  if (fraction == 0) {
    result.double_as_uint = 0;
    return result.double_value;
  }

  bool is_negative = (fraction < 0);
  int64_t encoded_fraction = is_negative ? -fraction : fraction;
  // The stored exponent assumes a [1.0, 2.0) mantissa, one below the frexp
  // style [0.5, 1.0) convention used by `shift`.
  int64_t encoded_shift = (shift - 1);
  // Normalize the fraction so its leading bit is at position 30
  // (0x40000000), adjusting the exponent to compensate.
  while (encoded_fraction < 0x40000000) {
    encoded_fraction *= 2;
    encoded_shift -= 1;
  }
  while (encoded_fraction > 0x80000000) {
    encoded_fraction /= 2;
    encoded_shift += 1;
  }
  // Drop the implicit leading bit; it is not stored in the double format.
  encoded_fraction -= 0x40000000;
  // Clamp exponents that fall outside the normal binary64 range.
  if (encoded_shift < -1022) {
    encoded_shift = -1023;
  } else if (encoded_shift > 1022) {
    encoded_shift = 1023;
  }
  encoded_shift += kExponentBias;
  uint64_t encoded_sign = is_negative ? kSignMask : 0;
  result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
                          (encoded_fraction << kFractionShift);
  return result.double_value;
}
209 
IntegerDoubleMultiply(double a,double b)210 double IntegerDoubleMultiply(double a, double b) {
211   int a_shift;
212   const int64_t a_fraction = IntegerFrExp(a, &a_shift);
213   int b_shift;
214   const int64_t b_fraction = IntegerFrExp(b, &b_shift);
215   // Detect NaNs and infinities.
216   if (a_shift == std::numeric_limits<int>::max() ||
217       (b_shift == std::numeric_limits<int>::max())) {
218     return NAN;
219   }
220   const int result_shift = a_shift + b_shift + 1;
221   const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
222   return DoubleFromFractionAndShift(result_fraction, result_shift);
223 }
224 
IntegerDoubleCompare(double a,double b)225 int IntegerDoubleCompare(double a, double b) {
226   int a_shift;
227   const int64_t a_fraction = IntegerFrExp(a, &a_shift);
228   int b_shift;
229   const int64_t b_fraction = IntegerFrExp(b, &b_shift);
230 
231   // Detect NaNs and infinities.
232   if (a_shift == std::numeric_limits<int>::max() ||
233       (b_shift == std::numeric_limits<int>::max())) {
234     return 1;
235   }
236 
237   if ((a_fraction == 0) && (b_fraction < 0)) {
238     return 1;
239   } else if ((a_fraction < 0) && (b_fraction == 0)) {
240     return -1;
241   } else if (a_shift < b_shift) {
242     return -1;
243   } else if (a_shift > b_shift) {
244     return 1;
245   } else if (a_fraction < b_fraction) {
246     return -1;
247   } else if (a_fraction > b_fraction) {
248     return 1;
249   } else {
250     return 0;
251   }
252 }
253 
// Computes the quantized multiplier and left shift for the combined
// beta * input_scale factor used by quantized softmax, capping the real
// multiplier at (2^31 - 1) so it fits the Q(input_integer_bits) fixed-point
// representation expected downstream.
void PreprocessSoftmaxScaling(double beta, double input_scale,
                              int input_integer_bits,
                              int32_t* quantized_multiplier, int* left_shift) {
  // If the overall multiplier (input and beta) is large, then exp() of an
  // input difference of 1 scaled by this will be large.  In other words, we
  // can cap the multiplier and know that, when it is used, the output will be
  // (round to) zero wherever the input is not at the maximum value.

  // If the overall scale is less than one, and input_integer_bits=0, then the
  // result is double equivalent of Q0.31 (actually with more precision). Thus
  // this generates a Q(input_integer_bits).(31-input_integer_bits)
  // representation.
#ifdef TFLITE_EMULATE_FLOAT
  // Integer-only path: reproduce beta * input_scale * 2^(31 - bits) and the
  // (2^31 - 1) cap using the IntegerFrExp-based helpers above.
  const double input_beta = IntegerDoubleMultiply(beta, input_scale);
  int shift;
  int64_t fraction = IntegerFrExp(input_beta, &shift);
  shift += (31 - input_integer_bits);
  double input_beta_real_multiplier =
      DoubleFromFractionAndShift(fraction, shift);
  if (IntegerDoubleCompare(input_beta_real_multiplier, (1ll << 31) - 1.0) > 0) {
    input_beta_real_multiplier = (1ll << 31) - 1.0;
  }
#else   // TFLITE_EMULATE_FLOAT
  const double input_beta_real_multiplier = std::min(
      beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
#endif  // TFLITE_EMULATE_FLOAT

  QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
                                   quantized_multiplier, left_shift);
}
284 
PreprocessLogSoftmaxScalingExp(double beta,double input_scale,int input_integer_bits,int32_t * quantized_multiplier,int * left_shift,int32_t * reverse_scaling_divisor,int * reverse_scaling_left_shift)285 void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
286                                     int input_integer_bits,
287                                     int32_t* quantized_multiplier,
288                                     int* left_shift,
289                                     int32_t* reverse_scaling_divisor,
290                                     int* reverse_scaling_left_shift) {
291   PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
292                            quantized_multiplier, left_shift);
293 
294   // Also calculate what amounts to the inverse scaling factor for the input.
295   const double real_reverse_scaling_divisor =
296       (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
297   tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
298                                               reverse_scaling_divisor,
299                                               reverse_scaling_left_shift);
300 }
301 
// Returns the largest representable input magnitude (in pre-shift units) for
// an activation with `input_integer_bits` integer bits after the value has
// been left-shifted by `input_left_shift`.
int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
#ifdef TFLITE_EMULATE_FLOAT
  // Integer-only path: ((2^bits - 1) * 2^(31 - bits)) >> shift.
  int64_t radius = (1 << input_integer_bits) - 1;
  radius <<= (31 - input_integer_bits);
  radius >>= input_left_shift;
  return radius;
#else   // TFLITE_EMULATE_FLOAT
  const double max_integer_value =
      static_cast<double>((1 << input_integer_bits) - 1);
  const double max_input_rescaled = max_integer_value *
                                    (1ll << (31 - input_integer_bits)) /
                                    (1ll << input_left_shift);
  // Tighten bound using floor.  Suppose that we could use the exact value.
  // After scaling the difference, the result would be at the maximum.  Thus we
  // must ensure that our value has lower magnitude.
  return static_cast<int>(std::floor(max_input_rescaled));
#endif  // TFLITE_EMULATE_FLOAT
}
318 
// Adjusts [min, max] so that a quantized value maps exactly onto a real zero,
// producing the nudged range and scale used by fake-quantization. The scale
// is (max - min) / (quant_max - quant_min), and the zero point is clamped to
// [quant_min, quant_max] before the range endpoints are recomputed from it.
void NudgeQuantizationRange(const float min, const float max,
                            const int quant_min, const int quant_max,
                            float* nudged_min, float* nudged_max,
                            float* nudged_scale) {
  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
  const float quant_min_float = static_cast<float>(quant_min);
  const float quant_max_float = static_cast<float>(quant_max);
  *nudged_scale = (max - min) / (quant_max_float - quant_min_float);
  // Ideal (unclamped, unrounded) zero point implied by min and the scale.
  const float zero_point_from_min = quant_min_float - min / *nudged_scale;
  uint16 nudged_zero_point;
  if (zero_point_from_min < quant_min_float) {
    nudged_zero_point = static_cast<uint16>(quant_min);
  } else if (zero_point_from_min > quant_max_float) {
    nudged_zero_point = static_cast<uint16>(quant_max);
  } else {
    nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
  }
  // Recompute the range endpoints from the integer zero point so that zero is
  // exactly representable.
  *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
  *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
}
339 
FakeQuantizeArray(const float nudged_scale,const float nudged_min,const float nudged_max,const float * input_data,float * output_data,const float size)340 void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
341                        const float nudged_max, const float* input_data,
342                        float* output_data, const float size) {
343   // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
344   const float inv_nudged_scale = 1.0f / nudged_scale;
345 
346   for (int i = 0; i < size; i++) {
347     const float src_val = input_data[i];
348     const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
349     const float clamped_shifted = clamped - nudged_min;
350     const float dst_val =
351         TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
352         nudged_min;
353     output_data[i] = dst_val;
354   }
355 }
356 
CheckedLog2(const float x,int * log2_result)357 bool CheckedLog2(const float x, int* log2_result) {
358   // Using TfLiteRound instead of std::round and std::log instead of
359   // std::log2 to work around these fuctions being missing in a toolchain
360   // used in some TensorFlow tests as of May 2018.
361   const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
362   const float x_log2_rounded = TfLiteRound(x_log2);
363   const float x_log2_fracpart = x_log2 - x_log2_rounded;
364 
365   *log2_result = static_cast<int>(x_log2_rounded);
366   return std::abs(x_log2_fracpart) < 1e-3;
367 }
368 
QuantizeMultiplierArray(const double * effective_scales,size_t size,int32_t * effective_scale_significand,int * effective_shift)369 void QuantizeMultiplierArray(const double* effective_scales, size_t size,
370                              int32_t* effective_scale_significand,
371                              int* effective_shift) {
372   for (size_t i = 0; i < size; ++i) {
373     QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
374                        &effective_shift[i]);
375   }
376 }
377 
378 }  // namespace tflite
379