• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //===-- A class to store a normalized floating point number -----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_LIBC_UTILS_FPUTIL_NORMAL_FLOAT_H
10 #define LLVM_LIBC_UTILS_FPUTIL_NORMAL_FLOAT_H
11 
12 #include "FPBits.h"
13 
14 #include "utils/CPP/TypeTraits.h"
15 
16 #include <stdint.h>
17 
18 namespace __llvm_libc {
19 namespace fputil {
20 
// A class which stores the normalized form of a floating point value.
// The special IEEE-754 bit patterns of zero, infinity and NaNs are
// not handled by this class.
//
// A normalized floating point number is of this form:
//    (-1)^sign * 2^exponent * <mantissa>
// where <mantissa> is of the form 1.<...>.
28 template <typename T> struct NormalFloat {
29   static_assert(
30       cpp::IsFloatingPointType<T>::Value,
31       "NormalFloat template parameter has to be a floating point type.");
32 
33   using UIntType = typename FPBits<T>::UIntType;
34   static constexpr UIntType one = (UIntType(1) << MantissaWidth<T>::value);
35 
36   // Unbiased exponent value.
37   int32_t exponent;
38 
39   UIntType mantissa;
40   // We want |UIntType| to have atleast one bit more than the actual mantissa
41   // bit width to accommodate the implicit 1 value.
42   static_assert(sizeof(UIntType) * 8 >= MantissaWidth<T>::value + 1,
43                 "Bad type for mantissa in NormalFloat.");
44 
45   bool sign;
46 
NormalFloatNormalFloat47   NormalFloat(int32_t e, UIntType m, bool s)
48       : exponent(e), mantissa(m), sign(s) {
49     if (mantissa >= one)
50       return;
51 
52     unsigned normalizationShift = evaluateNormalizationShift(mantissa);
53     mantissa = mantissa << normalizationShift;
54     exponent -= normalizationShift;
55   }
56 
NormalFloatNormalFloat57   explicit NormalFloat(T x) { initFromBits(FPBits<T>(x)); }
58 
NormalFloatNormalFloat59   explicit NormalFloat(FPBits<T> bits) { initFromBits(bits); }
60 
61   // Compares this normalized number with another normalized number.
62   // Returns -1 is this number is less than |other|, 0 if this number is equal
63   // to |other|, and 1 if this number is greater than |other|.
cmpNormalFloat64   int cmp(const NormalFloat<T> &other) const {
65     if (sign != other.sign)
66       return sign ? -1 : 1;
67 
68     if (exponent > other.exponent) {
69       return sign ? -1 : 1;
70     } else if (exponent == other.exponent) {
71       if (mantissa > other.mantissa)
72         return sign ? -1 : 1;
73       else if (mantissa == other.mantissa)
74         return 0;
75       else
76         return sign ? 1 : -1;
77     } else {
78       return sign ? 1 : -1;
79     }
80   }
81 
82   // Returns a new normalized floating point number which is equal in value
83   // to this number multiplied by 2^e. That is:
84   //     new = this *  2^e
mul2NormalFloat85   NormalFloat<T> mul2(int e) const {
86     NormalFloat<T> result = *this;
87     result.exponent += e;
88     return result;
89   }
90 
TNormalFloat91   operator T() const {
92     int biasedExponent = exponent + FPBits<T>::exponentBias;
93     // Max exponent is of the form 0xFF...E. That is why -2 and not -1.
94     constexpr int maxExponentValue = (1 << ExponentWidth<T>::value) - 2;
95     if (biasedExponent > maxExponentValue) {
96       return sign ? FPBits<T>::negInf() : FPBits<T>::inf();
97     }
98 
99     FPBits<T> result(T(0.0));
100     result.sign = sign;
101 
102     constexpr int subnormalExponent = -FPBits<T>::exponentBias + 1;
103     if (exponent < subnormalExponent) {
104       unsigned shift = subnormalExponent - exponent;
105       // Since exponent > subnormalExponent, shift is strictly greater than
106       // zero.
107       if (shift <= MantissaWidth<T>::value + 1) {
108         // Generate a subnormal number. Might lead to loss of precision.
109         // We round to nearest and round halfway cases to even.
110         const UIntType shiftOutMask = (UIntType(1) << shift) - 1;
111         const UIntType shiftOutValue = mantissa & shiftOutMask;
112         const UIntType halfwayValue = UIntType(1) << (shift - 1);
113         result.exponent = 0;
114         result.mantissa = mantissa >> shift;
115         UIntType newMantissa = result.mantissa;
116         if (shiftOutValue > halfwayValue) {
117           newMantissa += 1;
118         } else if (shiftOutValue == halfwayValue) {
119           // Round to even.
120           if (result.mantissa & 0x1)
121             newMantissa += 1;
122         }
123         result.mantissa = newMantissa;
124         // Adding 1 to mantissa can lead to overflow. This can only happen if
125         // mantissa was all ones (0b111..11). For such a case, we will carry
126         // the overflow into the exponent.
127         if (newMantissa == one)
128           result.exponent = 1;
129         return result;
130       } else {
131         return result;
132       }
133     }
134 
135     result.exponent = exponent + FPBits<T>::exponentBias;
136     result.mantissa = mantissa;
137     return result;
138   }
139 
140 private:
initFromBitsNormalFloat141   void initFromBits(FPBits<T> bits) {
142     sign = bits.sign;
143 
144     if (bits.isInfOrNaN() || bits.isZero()) {
145       // Ignore special bit patterns. Implementations deal with them separately
146       // anyway so this should not be a problem.
147       exponent = 0;
148       mantissa = 0;
149       return;
150     }
151 
152     // Normalize subnormal numbers.
153     if (bits.exponent == 0) {
154       unsigned shift = evaluateNormalizationShift(bits.mantissa);
155       mantissa = UIntType(bits.mantissa) << shift;
156       exponent = 1 - FPBits<T>::exponentBias - shift;
157     } else {
158       exponent = bits.exponent - FPBits<T>::exponentBias;
159       mantissa = one | bits.mantissa;
160     }
161   }
162 
evaluateNormalizationShiftNormalFloat163   unsigned evaluateNormalizationShift(UIntType m) {
164     unsigned shift = 0;
165     for (; (one & m) == 0 && (shift < MantissaWidth<T>::value);
166          m <<= 1, ++shift)
167       ;
168     return shift;
169   }
170 };
171 
172 #if defined(__x86_64__) || defined(__i386__)
173 template <>
initFromBits(FPBits<long double> bits)174 inline void NormalFloat<long double>::initFromBits(FPBits<long double> bits) {
175   sign = bits.sign;
176 
177   if (bits.isInfOrNaN() || bits.isZero()) {
178     // Ignore special bit patterns. Implementations deal with them separately
179     // anyway so this should not be a problem.
180     exponent = 0;
181     mantissa = 0;
182     return;
183   }
184 
185   if (bits.exponent == 0) {
186     if (bits.implicitBit == 0) {
187       // Since we ignore zero value, the mantissa in this case is non-zero.
188       int normalizationShift = evaluateNormalizationShift(bits.mantissa);
189       exponent = -16382 - normalizationShift;
190       mantissa = (bits.mantissa << normalizationShift);
191     } else {
192       exponent = -16382;
193       mantissa = one | bits.mantissa;
194     }
195   } else {
196     if (bits.implicitBit == 0) {
197       // Invalid number so just store 0 similar to a NaN.
198       exponent = 0;
199       mantissa = 0;
200     } else {
201       exponent = bits.exponent - 16383;
202       mantissa = one | bits.mantissa;
203     }
204   }
205 }
206 
207 template <> inline NormalFloat<long double>::operator long double() const {
208   int biasedExponent = exponent + FPBits<long double>::exponentBias;
209   // Max exponent is of the form 0xFF...E. That is why -2 and not -1.
210   constexpr int maxExponentValue = (1 << ExponentWidth<long double>::value) - 2;
211   if (biasedExponent > maxExponentValue) {
212     return sign ? FPBits<long double>::negInf() : FPBits<long double>::inf();
213   }
214 
215   FPBits<long double> result(0.0l);
216   result.sign = sign;
217 
218   constexpr int subnormalExponent = -FPBits<long double>::exponentBias + 1;
219   if (exponent < subnormalExponent) {
220     unsigned shift = subnormalExponent - exponent;
221     if (shift <= MantissaWidth<long double>::value + 1) {
222       // Generate a subnormal number. Might lead to loss of precision.
223       // We round to nearest and round halfway cases to even.
224       const UIntType shiftOutMask = (UIntType(1) << shift) - 1;
225       const UIntType shiftOutValue = mantissa & shiftOutMask;
226       const UIntType halfwayValue = UIntType(1) << (shift - 1);
227       result.exponent = 0;
228       result.mantissa = mantissa >> shift;
229       UIntType newMantissa = result.mantissa;
230       if (shiftOutValue > halfwayValue) {
231         newMantissa += 1;
232       } else if (shiftOutValue == halfwayValue) {
233         // Round to even.
234         if (result.mantissa & 0x1)
235           newMantissa += 1;
236       }
237       result.mantissa = newMantissa;
238       // Adding 1 to mantissa can lead to overflow. This can only happen if
239       // mantissa was all ones (0b111..11). For such a case, we will carry
240       // the overflow into the exponent and set the implicit bit to 1.
241       if (newMantissa == one) {
242         result.exponent = 1;
243         result.implicitBit = 1;
244       } else {
245         result.implicitBit = 0;
246       }
247       return result;
248     } else {
249       return result;
250     }
251   }
252 
253   result.exponent = biasedExponent;
254   result.mantissa = mantissa;
255   result.implicitBit = 1;
256   return result;
257 }
258 #endif
259 
260 } // namespace fputil
261 } // namespace __llvm_libc
262 
263 #endif // LLVM_LIBC_UTILS_FPUTIL_NORMAL_FLOAT_H
264