1 // Adapted from https://github.com/Alexhuszagh/rust-lexical.
2
3 //! Estimate the error in an 80-bit approximation of a float.
4 //!
5 //! This estimates the error in a floating-point representation.
6 //!
7 //! This implementation is loosely based off the Golang implementation,
8 //! found here:
9 //! https://golang.org/src/strconv/atof.go
10
11 use super::float::*;
12 use super::num::*;
13 use super::rounding::*;
14
15 pub(crate) trait FloatErrors {
16 /// Get the full error scale.
error_scale() -> u3217 fn error_scale() -> u32;
18 /// Get the half error scale.
error_halfscale() -> u3219 fn error_halfscale() -> u32;
20 /// Determine if the number of errors is tolerable for float precision.
error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool21 fn error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool;
22 }
23
24 /// Check if the error is accurate with a round-nearest rounding scheme.
25 #[inline]
nearest_error_is_accurate(errors: u64, fp: &ExtendedFloat, extrabits: u64) -> bool26 fn nearest_error_is_accurate(errors: u64, fp: &ExtendedFloat, extrabits: u64) -> bool {
27 // Round-to-nearest, need to use the halfway point.
28 if extrabits == 65 {
29 // Underflow, we have a shift larger than the mantissa.
30 // Representation is valid **only** if the value is close enough
31 // overflow to the next bit within errors. If it overflows,
32 // the representation is **not** valid.
33 !fp.mant.overflowing_add(errors).1
34 } else {
35 let mask: u64 = lower_n_mask(extrabits);
36 let extra: u64 = fp.mant & mask;
37
38 // Round-to-nearest, need to check if we're close to halfway.
39 // IE, b10100 | 100000, where `|` signifies the truncation point.
40 let halfway: u64 = lower_n_halfway(extrabits);
41 let cmp1 = halfway.wrapping_sub(errors) < extra;
42 let cmp2 = extra < halfway.wrapping_add(errors);
43
44 // If both comparisons are true, we have significant rounding error,
45 // and the value cannot be exactly represented. Otherwise, the
46 // representation is valid.
47 !(cmp1 && cmp2)
48 }
49 }
50
51 impl FloatErrors for u64 {
52 #[inline]
error_scale() -> u3253 fn error_scale() -> u32 {
54 8
55 }
56
57 #[inline]
error_halfscale() -> u3258 fn error_halfscale() -> u32 {
59 u64::error_scale() / 2
60 }
61
62 #[inline]
error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool63 fn error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool {
64 // Determine if extended-precision float is a good approximation.
65 // If the error has affected too many units, the float will be
66 // inaccurate, or if the representation is too close to halfway
67 // that any operations could affect this halfway representation.
68 // See the documentation for dtoa for more information.
69 let bias = -(F::EXPONENT_BIAS - F::MANTISSA_SIZE);
70 let denormal_exp = bias - 63;
71 // This is always a valid u32, since (denormal_exp - fp.exp)
72 // will always be positive and the significand size is {23, 52}.
73 let extrabits = if fp.exp <= denormal_exp {
74 64 - F::MANTISSA_SIZE + denormal_exp - fp.exp
75 } else {
76 63 - F::MANTISSA_SIZE
77 };
78
79 // Our logic is as follows: we want to determine if the actual
80 // mantissa and the errors during calculation differ significantly
81 // from the rounding point. The rounding point for round-nearest
82 // is the halfway point, IE, this when the truncated bits start
83 // with b1000..., while the rounding point for the round-toward
84 // is when the truncated bits are equal to 0.
85 // To do so, we can check whether the rounding point +/- the error
86 // are >/< the actual lower n bits.
87 //
88 // For whether we need to use signed or unsigned types for this
89 // analysis, see this example, using u8 rather than u64 to simplify
90 // things.
91 //
92 // # Comparisons
93 // cmp1 = (halfway - errors) < extra
94 // cmp1 = extra < (halfway + errors)
95 //
96 // # Large Extrabits, Low Errors
97 //
98 // extrabits = 8
99 // halfway = 0b10000000
100 // extra = 0b10000010
101 // errors = 0b00000100
102 // halfway - errors = 0b01111100
103 // halfway + errors = 0b10000100
104 //
105 // Unsigned:
106 // halfway - errors = 124
107 // halfway + errors = 132
108 // extra = 130
109 // cmp1 = true
110 // cmp2 = true
111 // Signed:
112 // halfway - errors = 124
113 // halfway + errors = -124
114 // extra = -126
115 // cmp1 = false
116 // cmp2 = true
117 //
118 // # Conclusion
119 //
120 // Since errors will always be small, and since we want to detect
121 // if the representation is accurate, we need to use an **unsigned**
122 // type for comparisons.
123
124 let extrabits = extrabits as u64;
125 let errors = count as u64;
126 if extrabits > 65 {
127 // Underflow, we have a literal 0.
128 return true;
129 }
130
131 nearest_error_is_accurate(errors, fp, extrabits)
132 }
133 }
134