• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Adapted from https://github.com/Alexhuszagh/rust-lexical.
2 
3 //! Estimate the error in an 80-bit approximation of a float.
4 //!
5 //! This estimates the error in a floating-point representation.
6 //!
7 //! This implementation is loosely based off the Golang implementation,
8 //! found here:
9 //!     https://golang.org/src/strconv/atof.go
10 
11 use super::float::*;
12 use super::num::*;
13 use super::rounding::*;
14 
15 pub(crate) trait FloatErrors {
16     /// Get the full error scale.
error_scale() -> u3217     fn error_scale() -> u32;
18     /// Get the half error scale.
error_halfscale() -> u3219     fn error_halfscale() -> u32;
20     /// Determine if the number of errors is tolerable for float precision.
error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool21     fn error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool;
22 }
23 
24 /// Check if the error is accurate with a round-nearest rounding scheme.
25 #[inline]
nearest_error_is_accurate(errors: u64, fp: &ExtendedFloat, extrabits: u64) -> bool26 fn nearest_error_is_accurate(errors: u64, fp: &ExtendedFloat, extrabits: u64) -> bool {
27     // Round-to-nearest, need to use the halfway point.
28     if extrabits == 65 {
29         // Underflow, we have a shift larger than the mantissa.
30         // Representation is valid **only** if the value is close enough
31         // overflow to the next bit within errors. If it overflows,
32         // the representation is **not** valid.
33         !fp.mant.overflowing_add(errors).1
34     } else {
35         let mask: u64 = lower_n_mask(extrabits);
36         let extra: u64 = fp.mant & mask;
37 
38         // Round-to-nearest, need to check if we're close to halfway.
39         // IE, b10100 | 100000, where `|` signifies the truncation point.
40         let halfway: u64 = lower_n_halfway(extrabits);
41         let cmp1 = halfway.wrapping_sub(errors) < extra;
42         let cmp2 = extra < halfway.wrapping_add(errors);
43 
44         // If both comparisons are true, we have significant rounding error,
45         // and the value cannot be exactly represented. Otherwise, the
46         // representation is valid.
47         !(cmp1 && cmp2)
48     }
49 }
50 
51 impl FloatErrors for u64 {
52     #[inline]
error_scale() -> u3253     fn error_scale() -> u32 {
54         8
55     }
56 
57     #[inline]
error_halfscale() -> u3258     fn error_halfscale() -> u32 {
59         u64::error_scale() / 2
60     }
61 
62     #[inline]
error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool63     fn error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool {
64         // Determine if extended-precision float is a good approximation.
65         // If the error has affected too many units, the float will be
66         // inaccurate, or if the representation is too close to halfway
67         // that any operations could affect this halfway representation.
68         // See the documentation for dtoa for more information.
69         let bias = -(F::EXPONENT_BIAS - F::MANTISSA_SIZE);
70         let denormal_exp = bias - 63;
71         // This is always a valid u32, since (denormal_exp - fp.exp)
72         // will always be positive and the significand size is {23, 52}.
73         let extrabits = if fp.exp <= denormal_exp {
74             64 - F::MANTISSA_SIZE + denormal_exp - fp.exp
75         } else {
76             63 - F::MANTISSA_SIZE
77         };
78 
79         // Our logic is as follows: we want to determine if the actual
80         // mantissa and the errors during calculation differ significantly
81         // from the rounding point. The rounding point for round-nearest
82         // is the halfway point, IE, this when the truncated bits start
83         // with b1000..., while the rounding point for the round-toward
84         // is when the truncated bits are equal to 0.
85         // To do so, we can check whether the rounding point +/- the error
86         // are >/< the actual lower n bits.
87         //
88         // For whether we need to use signed or unsigned types for this
89         // analysis, see this example, using u8 rather than u64 to simplify
90         // things.
91         //
92         // # Comparisons
93         //      cmp1 = (halfway - errors) < extra
94         //      cmp1 = extra < (halfway + errors)
95         //
96         // # Large Extrabits, Low Errors
97         //
98         //      extrabits = 8
99         //      halfway          =  0b10000000
100         //      extra            =  0b10000010
101         //      errors           =  0b00000100
102         //      halfway - errors =  0b01111100
103         //      halfway + errors =  0b10000100
104         //
105         //      Unsigned:
106         //          halfway - errors = 124
107         //          halfway + errors = 132
108         //          extra            = 130
109         //          cmp1             = true
110         //          cmp2             = true
111         //      Signed:
112         //          halfway - errors = 124
113         //          halfway + errors = -124
114         //          extra            = -126
115         //          cmp1             = false
116         //          cmp2             = true
117         //
118         // # Conclusion
119         //
120         // Since errors will always be small, and since we want to detect
121         // if the representation is accurate, we need to use an **unsigned**
122         // type for comparisons.
123 
124         let extrabits = extrabits as u64;
125         let errors = count as u64;
126         if extrabits > 65 {
127             // Underflow, we have a literal 0.
128             return true;
129         }
130 
131         nearest_error_is_accurate(errors, fp, extrabits)
132     }
133 }
134