• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //===-- Format string parser for scanf -------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
10 #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
11 
12 #include "src/__support/arg_list.h"
13 #include "src/__support/ctype_utils.h"
14 #include "src/__support/str_to_integer.h"
15 #include "src/stdio/scanf_core/core_structs.h"
16 #include "src/stdio/scanf_core/scanf_config.h"
17 
18 #include <stddef.h>
19 
20 namespace LIBC_NAMESPACE {
21 namespace scanf_core {
22 
// GET_ARG_VAL_SIMPLEST(arg_type, index) expands to the Parser member call that
// fetches an argument of type arg_type. When index mode is compiled out the
// index operand is discarded and the next sequential argument is read;
// otherwise the index is forwarded to get_arg_value.
#ifdef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
#define GET_ARG_VAL_SIMPLEST(arg_type, unused_index) get_next_arg_value<arg_type>()
#else
#define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index)
#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
28 
29 template <typename ArgProvider> class Parser {
30   const char *__restrict str;
31 
32   size_t cur_pos = 0;
33   ArgProvider args_cur;
34 
35 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
36   // args_start stores the start of the va_args, which is used when a previous
37   // argument is needed. In that case, we have to read the arguments from the
38   // beginning since they don't support reading backwards.
39   ArgProvider args_start;
40   size_t args_index = 1;
41 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
42 
43 public:
44 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
Parser(const char * __restrict new_str,internal::ArgList & args)45   LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args)
46       : str(new_str), args_cur(args), args_start(args) {}
47 #else
48   LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args)
49       : str(new_str), args_cur(args) {}
50 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
51 
52   // get_next_section will parse the format string until it has a fully
53   // specified format section. This can either be a raw format section with no
54   // conversion, or a format section with a conversion that has all of its
55   // variables stored in the format section.
get_next_section()56   LIBC_INLINE FormatSection get_next_section() {
57     FormatSection section;
58     size_t starting_pos = cur_pos;
59     if (str[cur_pos] == '%') {
60       // format section
61       section.has_conv = true;
62 
63       ++cur_pos;
64       [[maybe_unused]] size_t conv_index = 0;
65 
66 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
67       conv_index = parse_index(&cur_pos);
68 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
69 
70       if (str[cur_pos] == '*') {
71         ++cur_pos;
72         section.flags = FormatFlags::NO_WRITE;
73       }
74 
75       // handle width
76       section.max_width = -1;
77       if (internal::isdigit(str[cur_pos])) {
78         auto result = internal::strtointeger<int>(str + cur_pos, 10);
79         section.max_width = result.value;
80         cur_pos = cur_pos + result.parsed_len;
81       }
82 
83       // TODO(michaelrj): add posix allocate flag support.
84       // if (str[cur_pos] == 'm') {
85       //   ++cur_pos;
86       //   section.flags = FormatFlags::ALLOCATE;
87       // }
88 
89       LengthModifier lm = parse_length_modifier(&cur_pos);
90       section.length_modifier = lm;
91 
92       section.conv_name = str[cur_pos];
93 
94       // If NO_WRITE is not set, then read the next arg as the output pointer.
95       if ((section.flags & FormatFlags::NO_WRITE) == 0) {
96         // Since all outputs are pointers, there's no need to distinguish when
97         // reading from va_args. They're all the same size and stored the same.
98         section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index);
99       }
100 
101       // If the end of the format section is on the '\0'. This means we need to
102       // not advance the cur_pos and we should not count this has having a
103       // conversion.
104       if (str[cur_pos] != '\0') {
105         ++cur_pos;
106       } else {
107         section.has_conv = false;
108       }
109 
110       // If the format is a bracketed one, then we need to parse out the insides
111       // of the brackets.
112       if (section.conv_name == '[') {
113         constexpr char CLOSING_BRACKET = ']';
114         constexpr char INVERT_FLAG = '^';
115         constexpr char RANGE_OPERATOR = '-';
116 
117         cpp::bitset<256> scan_set;
118         bool invert = false;
119 
120         // The circumflex in the first position represents the inversion flag,
121         // but it's easier to apply that at the end so we just store it for now.
122         if (str[cur_pos] == INVERT_FLAG) {
123           invert = true;
124           ++cur_pos;
125         }
126 
127         // This is used to determine if a hyphen is being used as a literal or
128         // as a range operator.
129         size_t set_start_pos = cur_pos;
130 
131         // Normally the right bracket closes the set, but if it's the first
132         // character (possibly after the inversion flag) then it's instead
133         // included as a character in the set and the second right bracket
134         // closes the set.
135         if (str[cur_pos] == CLOSING_BRACKET) {
136           scan_set.set(CLOSING_BRACKET);
137           ++cur_pos;
138         }
139 
140         while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) {
141           // If a hyphen is being used as a range operator, since it's neither
142           // at the beginning nor end of the set.
143           if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos &&
144               str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') {
145             // Technically there is no requirement to correct the ordering of
146             // the range, but since the range operator is entirely
147             // implementation defined it seems like a good convenience.
148             char a = str[cur_pos - 1];
149             char b = str[cur_pos + 1];
150             char start = (a < b ? a : b);
151             char end = (a < b ? b : a);
152             scan_set.set_range(start, end);
153             cur_pos += 2;
154           } else {
155             scan_set.set(str[cur_pos]);
156             ++cur_pos;
157           }
158         }
159         if (invert)
160           scan_set.flip();
161 
162         if (str[cur_pos] == CLOSING_BRACKET) {
163           ++cur_pos;
164           section.scan_set = scan_set;
165         } else {
166           // if the end of the string was encountered, this is not a valid set.
167           section.has_conv = false;
168         }
169       }
170     } else {
171       // raw section
172       section.has_conv = false;
173       while (str[cur_pos] != '%' && str[cur_pos] != '\0')
174         ++cur_pos;
175     }
176     section.raw_string = {str + starting_pos, cur_pos - starting_pos};
177     return section;
178   }
179 
180 private:
181   // parse_length_modifier parses the length modifier inside a format string. It
182   // assumes that str[*local_pos] is inside a format specifier. It returns a
183   // LengthModifier with the length modifier it found. It will advance local_pos
184   // after the format specifier if one is found.
parse_length_modifier(size_t * local_pos)185   LIBC_INLINE LengthModifier parse_length_modifier(size_t *local_pos) {
186     switch (str[*local_pos]) {
187     case ('l'):
188       if (str[*local_pos + 1] == 'l') {
189         *local_pos += 2;
190         return LengthModifier::ll;
191       } else {
192         ++*local_pos;
193         return LengthModifier::l;
194       }
195     case ('h'):
196       if (str[*local_pos + 1] == 'h') {
197         *local_pos += 2;
198         return LengthModifier::hh;
199       } else {
200         ++*local_pos;
201         return LengthModifier::h;
202       }
203     case ('L'):
204       ++*local_pos;
205       return LengthModifier::L;
206     case ('j'):
207       ++*local_pos;
208       return LengthModifier::j;
209     case ('z'):
210       ++*local_pos;
211       return LengthModifier::z;
212     case ('t'):
213       ++*local_pos;
214       return LengthModifier::t;
215     default:
216       return LengthModifier::NONE;
217     }
218   }
219 
220   // get_next_arg_value gets the next value from the arg list as type T.
get_next_arg_value()221   template <class T> LIBC_INLINE T get_next_arg_value() {
222     return args_cur.template next_var<T>();
223   }
224 
225   //----------------------------------------------------
226   // INDEX MODE ONLY FUNCTIONS AFTER HERE:
227   //----------------------------------------------------
228 
229 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
230 
231   // parse_index parses the index of a value inside a format string. It
232   // assumes that str[*local_pos] points to character after a '%' or '*', and
233   // returns 0 if there is no closing $, or if it finds no number. If it finds a
234   // number, it will move local_pos past the end of the $, else it will not move
235   // local_pos.
parse_index(size_t * local_pos)236   LIBC_INLINE size_t parse_index(size_t *local_pos) {
237     if (internal::isdigit(str[*local_pos])) {
238       auto result = internal::strtointeger<int>(str + *local_pos, 10);
239       size_t index = result.value;
240       if (str[*local_pos + result.parsed_len] != '$')
241         return 0;
242       *local_pos = 1 + result.parsed_len + *local_pos;
243       return index;
244     }
245     return 0;
246   }
247 
248   // get_arg_value gets the value from the arg list at index (starting at 1).
249   // This may require parsing the format string. An index of 0 is interpreted as
250   // the next value.
get_arg_value(size_t index)251   template <class T> LIBC_INLINE T get_arg_value(size_t index) {
252     if (!(index == 0 || index == args_index))
253       args_to_index(index);
254 
255     ++args_index;
256     return get_next_arg_value<T>();
257   }
258 
259   // the ArgList can only return the next item in the list. This function is
260   // used in index mode when the item that needs to be read is not the next one.
261   // It moves cur_args to the index requested so the appropriate value may
262   // be read. This may involve parsing the format string, and is in the worst
263   // case an O(n^2) operation.
args_to_index(size_t index)264   LIBC_INLINE void args_to_index(size_t index) {
265     if (args_index > index) {
266       args_index = 1;
267       args_cur = args_start;
268     }
269 
270     while (args_index < index) {
271       // Since all arguments must be pointers, we can just read all of them as
272       // void * and not worry about type issues.
273       args_cur.template next_var<void *>();
274       ++args_index;
275     }
276   }
277 
278 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
279 };
280 
281 } // namespace scanf_core
282 } // namespace LIBC_NAMESPACE
283 
284 #endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
285