• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //===-- Int type specifier converters for scanf -----------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "src/stdio/scanf_core/int_converter.h"
10 
11 #include "src/__support/CPP/limits.h"
12 #include "src/__support/ctype_utils.h"
13 #include "src/stdio/scanf_core/converter_utils.h"
14 #include "src/stdio/scanf_core/core_structs.h"
15 #include "src/stdio/scanf_core/reader.h"
16 
17 #include <stddef.h>
18 
19 namespace LIBC_NAMESPACE {
20 namespace scanf_core {
21 
22 // This code is very similar to the code in __support/str_to_integer.h but is
23 // not quite the same. Here is the list of differences and why they exist:
24 //  1) This takes a reader and a format section instead of a char* and the base.
25 //      This should be fairly self explanatory. While the char* could be adapted
26 //      to a reader and the base could be calculated ahead of time, the
27 //      semantics are slightly different, specifically a char* can be indexed
28 //      freely (I can read str[2] and then str[0]) whereas a File (which the
29 //      reader may contain) cannot.
30 //  2) Because this uses a Reader, this function can only unget once.
31 //      This is relevant because scanf specifies it reads the "longest sequence
32 //      of input characters which does not exceed any specified field width and
33 //      which is, or is a prefix of, a matching input sequence." Whereas the
34 //      strtol function accepts "the longest initial subsequence of the input
35 //      string (...) that is of the expected form." This is demonstrated by the
36 //      differences in how they deal with the string "0xZZZ" when parsing as
37 //      hexadecimal. Scanf will read the "0x" as a valid prefix and return 0,
38 //      since it reads the first 'Z', sees that it's not a valid hex digit, and
39 //      reverses one character. The strtol function on the other hand only
40 //      accepts the "0" since that's the longest valid hexadecimal sequence. It
41 //      sees the 'Z' after the "0x" and determines that this is not the prefix
42 //      to a valid hex string.
43 //  3) This conversion may have a maximum width.
44 //      If a maximum width is specified, this conversion is only allowed to
45 //      accept a certain number of characters. Strtol doesn't have any such
46 //      limitation.
convert_int(Reader * reader,const FormatSection & to_conv)47 int convert_int(Reader *reader, const FormatSection &to_conv) {
48   // %d "Matches an optionally signed decimal integer [...] with the value 10
49   // for the base argument. The corresponding argument shall be a pointer to
50   // signed integer."
51 
52   // %i "Matches an optionally signed integer [...] with the value 0 for the
53   // base argument. The corresponding argument shall be a pointer to signed
54   // integer."
55 
56   // %u "Matches an optionally signed decimal integer [...] with the value 10
57   // for the base argument. The corresponding argument shall be a pointer to
58   // unsigned integer"
59 
60   // %o "Matches an optionally signed octal integer [...] with the value 8 for
61   // the base argument. The corresponding argument shall be a pointer to
62   // unsigned integer"
63 
64   // %x/X "Matches an optionally signed hexadecimal integer [...] with the value
65   // 16 for the base argument. The corresponding argument shall be a pointer to
66   // unsigned integer"
67 
68   size_t max_width = cpp::numeric_limits<size_t>::max();
69   if (to_conv.max_width > 0) {
70     max_width = to_conv.max_width;
71   }
72 
73   uintmax_t result = 0;
74   bool is_number = false;
75   bool is_signed = false;
76   int base = 0;
77   if (to_conv.conv_name == 'i') {
78     base = 0;
79     is_signed = true;
80   } else if (to_conv.conv_name == 'o') {
81     base = 8;
82   } else if (to_lower(to_conv.conv_name) == 'x' || to_conv.conv_name == 'p') {
83     base = 16;
84   } else if (to_conv.conv_name == 'd') {
85     base = 10;
86     is_signed = true;
87   } else { // conv_name must be 'u'
88     base = 10;
89   }
90 
91   char cur_char = reader->getc();
92 
93   char result_sign = '+';
94   if (cur_char == '+' || cur_char == '-') {
95     result_sign = cur_char;
96     if (max_width > 1) {
97       --max_width;
98       cur_char = reader->getc();
99     } else {
100       // If the max width has been hit already, then the return value must be 0
101       // since no actual digits of the number have been parsed yet.
102       write_int_with_length(0, to_conv);
103       return MATCHING_FAILURE;
104     }
105   }
106   const bool is_negative = result_sign == '-';
107 
108   // Base of 0 means automatically determine the base. Base of 16 may have a
109   // prefix of "0x"
110   if (base == 0 || base == 16) {
111     // If the first character is 0, then it could be octal or hex.
112     if (cur_char == '0') {
113       is_number = true;
114 
115       // Read the next character to check.
116       if (max_width > 1) {
117         --max_width;
118         cur_char = reader->getc();
119       } else {
120         write_int_with_length(0, to_conv);
121         return READ_OK;
122       }
123 
124       if (to_lower(cur_char) == 'x') {
125         // This is a valid hex prefix.
126         base = 16;
127         if (max_width > 1) {
128           --max_width;
129           cur_char = reader->getc();
130         } else {
131           write_int_with_length(0, to_conv);
132           return READ_OK;
133         }
134 
135       } else {
136         if (base == 0) {
137           base = 8;
138         }
139       }
140     } else if (base == 0) {
141       if (internal::isdigit(cur_char)) {
142         // If the first character is a different number, then it's 10.
143         base = 10;
144       } else {
145         // If the first character isn't a valid digit, then there are no valid
146         // digits at all. The number is 0.
147         reader->ungetc(cur_char);
148         write_int_with_length(0, to_conv);
149         return MATCHING_FAILURE;
150       }
151     }
152   }
153 
154   constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits<uintmax_t>::max();
155   constexpr uintmax_t SIGNED_MAX =
156       static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max());
157   constexpr uintmax_t NEGATIVE_SIGNED_MAX =
158       static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()) + 1;
159 
160   const uintmax_t MAX =
161       (is_signed ? (is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX)
162                  : UNSIGNED_MAX);
163 
164   const uintmax_t max_div_by_base = MAX / base;
165 
166   if (internal::isalnum(cur_char) && b36_char_to_int(cur_char) < base) {
167     is_number = true;
168   }
169 
170   bool has_overflow = false;
171   size_t i = 0;
172   for (; i < max_width && internal::isalnum(cur_char) &&
173          b36_char_to_int(cur_char) < base;
174        ++i, cur_char = reader->getc()) {
175 
176     uintmax_t cur_digit = b36_char_to_int(cur_char);
177 
178     if (result == MAX) {
179       has_overflow = true;
180       continue;
181     } else if (result > max_div_by_base) {
182       result = MAX;
183       has_overflow = true;
184     } else {
185       result = result * base;
186     }
187 
188     if (result > MAX - cur_digit) {
189       result = MAX;
190       has_overflow = true;
191     } else {
192       result = result + cur_digit;
193     }
194   }
195 
196   // We always read one more character than will be used, so we have to put the
197   // last one back.
198   reader->ungetc(cur_char);
199 
200   if (has_overflow) {
201     write_int_with_length(MAX, to_conv);
202   } else {
203     if (is_negative)
204       result = -result;
205 
206     write_int_with_length(result, to_conv);
207   }
208 
209   if (!is_number)
210     return MATCHING_FAILURE;
211   return READ_OK;
212 }
213 
214 } // namespace scanf_core
215 } // namespace LIBC_NAMESPACE
216