• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2018 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/internal/charconv_parse.h"
16 #include "absl/strings/charconv.h"
17 
18 #include <cassert>
19 #include <cstdint>
20 #include <limits>
21 
22 #include "absl/strings/internal/memutil.h"
23 
24 namespace absl {
25 ABSL_NAMESPACE_BEGIN
26 namespace {
27 
28 // ParseFloat<10> will read the first 19 significant digits of the mantissa.
29 // This number was chosen for multiple reasons.
30 //
31 // (a) First, for whatever integer type we choose to represent the mantissa, we
32 // want to choose the largest possible number of decimal digits for that integer
33 // type.  We are using uint64_t, which can express any 19-digit unsigned
34 // integer.
35 //
36 // (b) Second, we need to parse enough digits that the binary value of any
37 // mantissa we capture has more bits of resolution than the mantissa
38 // representation in the target float.  Our algorithm requires at least 3 bits
39 // of headway, but 19 decimal digits give a little more than that.
40 //
41 // The following static assertions verify the above comments:
42 constexpr int kDecimalMantissaDigitsMax = 19;
43 
44 static_assert(std::numeric_limits<uint64_t>::digits10 ==
45                   kDecimalMantissaDigitsMax,
46               "(a) above");
47 
48 // IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa.
49 static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed");
50 static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact");
51 static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact");
52 
53 // The lowest valued 19-digit decimal mantissa we can read still contains
54 // sufficient information to reconstruct a binary mantissa.
55 static_assert(1000000000000000000u > (uint64_t{1} << (53 + 3)), "(b) above");
56 
57 // ParseFloat<16> will read the first 15 significant digits of the mantissa.
58 //
59 // Because a base-16-to-base-2 conversion can be done exactly, we do not need
60 // to maximize the number of scanned hex digits to improve our conversion.  What
61 // is required is to scan two more bits than the mantissa can represent, so that
62 // we always round correctly.
63 //
64 // (One extra bit does not suffice to perform correct rounding, since a number
65 // exactly halfway between two representable floats has unique rounding rules,
66 // so we need to differentiate between a "halfway between" number and a "closer
67 // to the larger value" number.)
68 constexpr int kHexadecimalMantissaDigitsMax = 15;
69 
70 // The minimum number of significant bits that will be read from
71 // kHexadecimalMantissaDigitsMax hex digits.  We must subtract by three, since
72 // the most significant digit can be a "1", which only contributes a single
73 // significant bit.
74 constexpr int kGuaranteedHexadecimalMantissaBitPrecision =
75     4 * kHexadecimalMantissaDigitsMax - 3;
76 
77 static_assert(kGuaranteedHexadecimalMantissaBitPrecision >
78                   std::numeric_limits<double>::digits + 2,
79               "kHexadecimalMantissaDigitsMax too small");
80 
81 // We also impose a limit on the number of significant digits we will read from
82 // an exponent, to avoid having to deal with integer overflow.  We use 9 for
83 // this purpose.
84 //
85 // If we read a 9 digit exponent, the end result of the conversion will
86 // necessarily be infinity or zero, depending on the sign of the exponent.
87 // Therefore we can just drop extra digits on the floor without any extra
88 // logic.
89 constexpr int kDecimalExponentDigitsMax = 9;
90 static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax,
91               "int type too small");
92 
93 // To avoid incredibly large inputs causing integer overflow for our exponent,
94 // we impose an arbitrary but very large limit on the number of significant
95 // digits we will accept.  The implementation refuses to match a string with
96 // more consecutive significant mantissa digits than this.
97 constexpr int kDecimalDigitLimit = 50000000;
98 
99 // Corresponding limit for hexadecimal digit inputs.  This is one fourth the
100 // amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires
101 // a binary exponent adjustment of 4.
102 constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4;
103 
104 // The largest exponent we can read is 999999999 (per
105 // kDecimalExponentDigitsMax), and the largest exponent adjustment we can get
106 // from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these
107 // comfortably fits in an integer.
108 //
109 // We count kDecimalDigitLimit twice because there are independent limits for
110 // numbers before and after the decimal point.  (In the case where there are no
111 // significant digits before the decimal point, there are independent limits for
112 // post-decimal-point leading zeroes and for significant digits.)
113 static_assert(999999999 + 2 * kDecimalDigitLimit <
114                   std::numeric_limits<int>::max(),
115               "int type too small");
116 static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) <
117                   std::numeric_limits<int>::max(),
118               "int type too small");
119 
120 // Returns true if the provided bitfield allows parsing an exponent value
121 // (e.g., "1.5e100").
AllowExponent(chars_format flags)122 bool AllowExponent(chars_format flags) {
123   bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
124   bool scientific =
125       (flags & chars_format::scientific) == chars_format::scientific;
126   return scientific || !fixed;
127 }
128 
129 // Returns true if the provided bitfield requires an exponent value be present.
RequireExponent(chars_format flags)130 bool RequireExponent(chars_format flags) {
131   bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
132   bool scientific =
133       (flags & chars_format::scientific) == chars_format::scientific;
134   return scientific && !fixed;
135 }
136 
137 const int8_t kAsciiToInt[256] = {
138     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
139     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
140     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  1,  2,  3,  4,  5,  6,  7,  8,
141     9,  -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,
142     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
143     -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
144     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
145     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
146     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
147     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
148     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
149     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
150     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
151     -1, -1, -1, -1, -1, -1, -1, -1, -1};
152 
153 // Returns true if `ch` is a digit in the given base
154 template <int base>
155 bool IsDigit(char ch);
156 
157 // Converts a valid `ch` to its digit value in the given base.
158 template <int base>
159 unsigned ToDigit(char ch);
160 
161 // Returns true if `ch` is the exponent delimiter for the given base.
162 template <int base>
163 bool IsExponentCharacter(char ch);
164 
165 // Returns the maximum number of significant digits we will read for a float
166 // in the given base.
167 template <int base>
168 constexpr int MantissaDigitsMax();
169 
170 // Returns the largest consecutive run of digits we will accept when parsing a
171 // number in the given base.
172 template <int base>
173 constexpr int DigitLimit();
174 
175 // Returns the amount the exponent must be adjusted by for each dropped digit.
176 // (For decimal this is 1, since the digits are in base 10 and the exponent base
177 // is also 10, but for hexadecimal this is 4, since the digits are base 16 but
178 // the exponent base is 2.)
179 template <int base>
180 constexpr int DigitMagnitude();
181 
182 template <>
IsDigit(char ch)183 bool IsDigit<10>(char ch) {
184   return ch >= '0' && ch <= '9';
185 }
186 template <>
IsDigit(char ch)187 bool IsDigit<16>(char ch) {
188   return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0;
189 }
190 
191 template <>
ToDigit(char ch)192 unsigned ToDigit<10>(char ch) {
193   return static_cast<unsigned>(ch - '0');
194 }
195 template <>
ToDigit(char ch)196 unsigned ToDigit<16>(char ch) {
197   return static_cast<unsigned>(kAsciiToInt[static_cast<unsigned char>(ch)]);
198 }
199 
200 template <>
IsExponentCharacter(char ch)201 bool IsExponentCharacter<10>(char ch) {
202   return ch == 'e' || ch == 'E';
203 }
204 
205 template <>
IsExponentCharacter(char ch)206 bool IsExponentCharacter<16>(char ch) {
207   return ch == 'p' || ch == 'P';
208 }
209 
210 template <>
MantissaDigitsMax()211 constexpr int MantissaDigitsMax<10>() {
212   return kDecimalMantissaDigitsMax;
213 }
214 template <>
MantissaDigitsMax()215 constexpr int MantissaDigitsMax<16>() {
216   return kHexadecimalMantissaDigitsMax;
217 }
218 
219 template <>
DigitLimit()220 constexpr int DigitLimit<10>() {
221   return kDecimalDigitLimit;
222 }
223 template <>
DigitLimit()224 constexpr int DigitLimit<16>() {
225   return kHexadecimalDigitLimit;
226 }
227 
228 template <>
DigitMagnitude()229 constexpr int DigitMagnitude<10>() {
230   return 1;
231 }
232 template <>
DigitMagnitude()233 constexpr int DigitMagnitude<16>() {
234   return 4;
235 }
236 
237 // Reads decimal digits from [begin, end) into *out.  Returns the number of
238 // digits consumed.
239 //
240 // After max_digits has been read, keeps consuming characters, but no longer
241 // adjusts *out.  If a nonzero digit is dropped this way, *dropped_nonzero_digit
242 // is set; otherwise, it is left unmodified.
243 //
244 // If no digits are matched, returns 0 and leaves *out unchanged.
245 //
246 // ConsumeDigits does not protect against overflow on *out; max_digits must
247 // be chosen with respect to type T to avoid the possibility of overflow.
248 template <int base, typename T>
ConsumeDigits(const char * begin,const char * end,int max_digits,T * out,bool * dropped_nonzero_digit)249 int ConsumeDigits(const char* begin, const char* end, int max_digits, T* out,
250                   bool* dropped_nonzero_digit) {
251   if (base == 10) {
252     assert(max_digits <= std::numeric_limits<T>::digits10);
253   } else if (base == 16) {
254     assert(max_digits * 4 <= std::numeric_limits<T>::digits);
255   }
256   const char* const original_begin = begin;
257 
258   // Skip leading zeros, but only if *out is zero.
259   // They don't cause an overflow so we don't have to count them for
260   // `max_digits`.
261   while (!*out && end != begin && *begin == '0') ++begin;
262 
263   T accumulator = *out;
264   const char* significant_digits_end =
265       (end - begin > max_digits) ? begin + max_digits : end;
266   while (begin < significant_digits_end && IsDigit<base>(*begin)) {
267     // Do not guard against *out overflow; max_digits was chosen to avoid this.
268     // Do assert against it, to detect problems in debug builds.
269     auto digit = static_cast<T>(ToDigit<base>(*begin));
270     assert(accumulator * base >= accumulator);
271     accumulator *= base;
272     assert(accumulator + digit >= accumulator);
273     accumulator += digit;
274     ++begin;
275   }
276   bool dropped_nonzero = false;
277   while (begin < end && IsDigit<base>(*begin)) {
278     dropped_nonzero = dropped_nonzero || (*begin != '0');
279     ++begin;
280   }
281   if (dropped_nonzero && dropped_nonzero_digit != nullptr) {
282     *dropped_nonzero_digit = true;
283   }
284   *out = accumulator;
285   return static_cast<int>(begin - original_begin);
286 }
287 
288 // Returns true if `v` is one of the chars allowed inside parentheses following
289 // a NaN.
IsNanChar(char v)290 bool IsNanChar(char v) {
291   return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') ||
292          (v >= 'A' && v <= 'Z');
293 }
294 
295 // Checks the range [begin, end) for a strtod()-formatted infinity or NaN.  If
296 // one is found, sets `out` appropriately and returns true.
ParseInfinityOrNan(const char * begin,const char * end,strings_internal::ParsedFloat * out)297 bool ParseInfinityOrNan(const char* begin, const char* end,
298                         strings_internal::ParsedFloat* out) {
299   if (end - begin < 3) {
300     return false;
301   }
302   switch (*begin) {
303     case 'i':
304     case 'I': {
305       // An infinity string consists of the characters "inf" or "infinity",
306       // case insensitive.
307       if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) {
308         return false;
309       }
310       out->type = strings_internal::FloatType::kInfinity;
311       if (end - begin >= 8 &&
312           strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) {
313         out->end = begin + 8;
314       } else {
315         out->end = begin + 3;
316       }
317       return true;
318     }
319     case 'n':
320     case 'N': {
321       // A NaN consists of the characters "nan", case insensitive, optionally
322       // followed by a parenthesized sequence of zero or more alphanumeric
323       // characters and/or underscores.
324       if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) {
325         return false;
326       }
327       out->type = strings_internal::FloatType::kNan;
328       out->end = begin + 3;
329       // NaN is allowed to be followed by a parenthesized string, consisting of
330       // only the characters [a-zA-Z0-9_].  Match that if it's present.
331       begin += 3;
332       if (begin < end && *begin == '(') {
333         const char* nan_begin = begin + 1;
334         while (nan_begin < end && IsNanChar(*nan_begin)) {
335           ++nan_begin;
336         }
337         if (nan_begin < end && *nan_begin == ')') {
338           // We found an extra NaN specifier range
339           out->subrange_begin = begin + 1;
340           out->subrange_end = nan_begin;
341           out->end = nan_begin + 1;
342         }
343       }
344       return true;
345     }
346     default:
347       return false;
348   }
349 }
350 }  // namespace
351 
352 namespace strings_internal {
353 
354 template <int base>
ParseFloat(const char * begin,const char * end,chars_format format_flags)355 strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end,
356                                          chars_format format_flags) {
357   strings_internal::ParsedFloat result;
358 
359   // Exit early if we're given an empty range.
360   if (begin == end) return result;
361 
362   // Handle the infinity and NaN cases.
363   if (ParseInfinityOrNan(begin, end, &result)) {
364     return result;
365   }
366 
367   const char* const mantissa_begin = begin;
368   while (begin < end && *begin == '0') {
369     ++begin;  // skip leading zeros
370   }
371   uint64_t mantissa = 0;
372 
373   int exponent_adjustment = 0;
374   bool mantissa_is_inexact = false;
375   int pre_decimal_digits = ConsumeDigits<base>(
376       begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact);
377   begin += pre_decimal_digits;
378   int digits_left;
379   if (pre_decimal_digits >= DigitLimit<base>()) {
380     // refuse to parse pathological inputs
381     return result;
382   } else if (pre_decimal_digits > MantissaDigitsMax<base>()) {
383     // We dropped some non-fraction digits on the floor.  Adjust our exponent
384     // to compensate.
385     exponent_adjustment =
386         static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>());
387     digits_left = 0;
388   } else {
389     digits_left =
390         static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits);
391   }
392   if (begin < end && *begin == '.') {
393     ++begin;
394     if (mantissa == 0) {
395       // If we haven't seen any nonzero digits yet, keep skipping zeros.  We
396       // have to adjust the exponent to reflect the changed place value.
397       const char* begin_zeros = begin;
398       while (begin < end && *begin == '0') {
399         ++begin;
400       }
401       int zeros_skipped = static_cast<int>(begin - begin_zeros);
402       if (zeros_skipped >= DigitLimit<base>()) {
403         // refuse to parse pathological inputs
404         return result;
405       }
406       exponent_adjustment -= static_cast<int>(zeros_skipped);
407     }
408     int post_decimal_digits = ConsumeDigits<base>(
409         begin, end, digits_left, &mantissa, &mantissa_is_inexact);
410     begin += post_decimal_digits;
411 
412     // Since `mantissa` is an integer, each significant digit we read after
413     // the decimal point requires an adjustment to the exponent. "1.23e0" will
414     // be stored as `mantissa` == 123 and `exponent` == -2 (that is,
415     // "123e-2").
416     if (post_decimal_digits >= DigitLimit<base>()) {
417       // refuse to parse pathological inputs
418       return result;
419     } else if (post_decimal_digits > digits_left) {
420       exponent_adjustment -= digits_left;
421     } else {
422       exponent_adjustment -= post_decimal_digits;
423     }
424   }
425   // If we've found no mantissa whatsoever, this isn't a number.
426   if (mantissa_begin == begin) {
427     return result;
428   }
429   // A bare "." doesn't count as a mantissa either.
430   if (begin - mantissa_begin == 1 && *mantissa_begin == '.') {
431     return result;
432   }
433 
434   if (mantissa_is_inexact) {
435     // We dropped significant digits on the floor.  Handle this appropriately.
436     if (base == 10) {
437       // If we truncated significant decimal digits, store the full range of the
438       // mantissa for future big integer math for exact rounding.
439       result.subrange_begin = mantissa_begin;
440       result.subrange_end = begin;
441     } else if (base == 16) {
442       // If we truncated hex digits, reflect this fact by setting the low
443       // ("sticky") bit.  This allows for correct rounding in all cases.
444       mantissa |= 1;
445     }
446   }
447   result.mantissa = mantissa;
448 
449   const char* const exponent_begin = begin;
450   result.literal_exponent = 0;
451   bool found_exponent = false;
452   if (AllowExponent(format_flags) && begin < end &&
453       IsExponentCharacter<base>(*begin)) {
454     bool negative_exponent = false;
455     ++begin;
456     if (begin < end && *begin == '-') {
457       negative_exponent = true;
458       ++begin;
459     } else if (begin < end && *begin == '+') {
460       ++begin;
461     }
462     const char* const exponent_digits_begin = begin;
463     // Exponent is always expressed in decimal, even for hexadecimal floats.
464     begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax,
465                                &result.literal_exponent, nullptr);
466     if (begin == exponent_digits_begin) {
467       // there were no digits where we expected an exponent.  We failed to read
468       // an exponent and should not consume the 'e' after all.  Rewind 'begin'.
469       found_exponent = false;
470       begin = exponent_begin;
471     } else {
472       found_exponent = true;
473       if (negative_exponent) {
474         result.literal_exponent = -result.literal_exponent;
475       }
476     }
477   }
478 
479   if (!found_exponent && RequireExponent(format_flags)) {
480     // Provided flags required an exponent, but none was found.  This results
481     // in a failure to scan.
482     return result;
483   }
484 
485   // Success!
486   result.type = strings_internal::FloatType::kNumber;
487   if (result.mantissa > 0) {
488     result.exponent = result.literal_exponent +
489                       (DigitMagnitude<base>() * exponent_adjustment);
490   } else {
491     result.exponent = 0;
492   }
493   result.end = begin;
494   return result;
495 }
496 
497 template ParsedFloat ParseFloat<10>(const char* begin, const char* end,
498                                     chars_format format_flags);
499 template ParsedFloat ParseFloat<16>(const char* begin, const char* end,
500                                     chars_format format_flags);
501 
502 }  // namespace strings_internal
503 ABSL_NAMESPACE_END
504 }  // namespace absl
505