1 //===-- Format string parser for scanf -------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H 10 #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H 11 12 #include "src/__support/arg_list.h" 13 #include "src/__support/ctype_utils.h" 14 #include "src/__support/str_to_integer.h" 15 #include "src/stdio/scanf_core/core_structs.h" 16 #include "src/stdio/scanf_core/scanf_config.h" 17 18 #include <stddef.h> 19 20 namespace LIBC_NAMESPACE { 21 namespace scanf_core { 22 23 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE 24 #define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index) 25 #else 26 #define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value<arg_type>() 27 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 28 29 template <typename ArgProvider> class Parser { 30 const char *__restrict str; 31 32 size_t cur_pos = 0; 33 ArgProvider args_cur; 34 35 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE 36 // args_start stores the start of the va_args, which is used when a previous 37 // argument is needed. In that case, we have to read the arguments from the 38 // beginning since they don't support reading backwards. 39 ArgProvider args_start; 40 size_t args_index = 1; 41 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 42 43 public: 44 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE Parser(const char * __restrict new_str,internal::ArgList & args)45 LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) 46 : str(new_str), args_cur(args), args_start(args) {} 47 #else 48 LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) 49 : str(new_str), args_cur(args) {} 50 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 51 52 // get_next_section will parse the format string until it has a fully 53 // specified format section. This can either be a raw format section with no 54 // conversion, or a format section with a conversion that has all of its 55 // variables stored in the format section. get_next_section()56 LIBC_INLINE FormatSection get_next_section() { 57 FormatSection section; 58 size_t starting_pos = cur_pos; 59 if (str[cur_pos] == '%') { 60 // format section 61 section.has_conv = true; 62 63 ++cur_pos; 64 [[maybe_unused]] size_t conv_index = 0; 65 66 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE 67 conv_index = parse_index(&cur_pos); 68 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 69 70 if (str[cur_pos] == '*') { 71 ++cur_pos; 72 section.flags = FormatFlags::NO_WRITE; 73 } 74 75 // handle width 76 section.max_width = -1; 77 if (internal::isdigit(str[cur_pos])) { 78 auto result = internal::strtointeger<int>(str + cur_pos, 10); 79 section.max_width = result.value; 80 cur_pos = cur_pos + result.parsed_len; 81 } 82 83 // TODO(michaelrj): add posix allocate flag support. 84 // if (str[cur_pos] == 'm') { 85 // ++cur_pos; 86 // section.flags = FormatFlags::ALLOCATE; 87 // } 88 89 LengthModifier lm = parse_length_modifier(&cur_pos); 90 section.length_modifier = lm; 91 92 section.conv_name = str[cur_pos]; 93 94 // If NO_WRITE is not set, then read the next arg as the output pointer. 95 if ((section.flags & FormatFlags::NO_WRITE) == 0) { 96 // Since all outputs are pointers, there's no need to distinguish when 97 // reading from va_args. They're all the same size and stored the same. 98 section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index); 99 } 100 101 // If the end of the format section is on the '\0'. This means we need to 102 // not advance the cur_pos and we should not count this has having a 103 // conversion. 104 if (str[cur_pos] != '\0') { 105 ++cur_pos; 106 } else { 107 section.has_conv = false; 108 } 109 110 // If the format is a bracketed one, then we need to parse out the insides 111 // of the brackets. 112 if (section.conv_name == '[') { 113 constexpr char CLOSING_BRACKET = ']'; 114 constexpr char INVERT_FLAG = '^'; 115 constexpr char RANGE_OPERATOR = '-'; 116 117 cpp::bitset<256> scan_set; 118 bool invert = false; 119 120 // The circumflex in the first position represents the inversion flag, 121 // but it's easier to apply that at the end so we just store it for now. 122 if (str[cur_pos] == INVERT_FLAG) { 123 invert = true; 124 ++cur_pos; 125 } 126 127 // This is used to determine if a hyphen is being used as a literal or 128 // as a range operator. 129 size_t set_start_pos = cur_pos; 130 131 // Normally the right bracket closes the set, but if it's the first 132 // character (possibly after the inversion flag) then it's instead 133 // included as a character in the set and the second right bracket 134 // closes the set. 135 if (str[cur_pos] == CLOSING_BRACKET) { 136 scan_set.set(CLOSING_BRACKET); 137 ++cur_pos; 138 } 139 140 while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) { 141 // If a hyphen is being used as a range operator, since it's neither 142 // at the beginning nor end of the set. 143 if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos && 144 str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') { 145 // Technically there is no requirement to correct the ordering of 146 // the range, but since the range operator is entirely 147 // implementation defined it seems like a good convenience. 148 char a = str[cur_pos - 1]; 149 char b = str[cur_pos + 1]; 150 char start = (a < b ? a : b); 151 char end = (a < b ? b : a); 152 scan_set.set_range(start, end); 153 cur_pos += 2; 154 } else { 155 scan_set.set(str[cur_pos]); 156 ++cur_pos; 157 } 158 } 159 if (invert) 160 scan_set.flip(); 161 162 if (str[cur_pos] == CLOSING_BRACKET) { 163 ++cur_pos; 164 section.scan_set = scan_set; 165 } else { 166 // if the end of the string was encountered, this is not a valid set. 167 section.has_conv = false; 168 } 169 } 170 } else { 171 // raw section 172 section.has_conv = false; 173 while (str[cur_pos] != '%' && str[cur_pos] != '\0') 174 ++cur_pos; 175 } 176 section.raw_string = {str + starting_pos, cur_pos - starting_pos}; 177 return section; 178 } 179 180 private: 181 // parse_length_modifier parses the length modifier inside a format string. It 182 // assumes that str[*local_pos] is inside a format specifier. It returns a 183 // LengthModifier with the length modifier it found. It will advance local_pos 184 // after the format specifier if one is found. parse_length_modifier(size_t * local_pos)185 LIBC_INLINE LengthModifier parse_length_modifier(size_t *local_pos) { 186 switch (str[*local_pos]) { 187 case ('l'): 188 if (str[*local_pos + 1] == 'l') { 189 *local_pos += 2; 190 return LengthModifier::ll; 191 } else { 192 ++*local_pos; 193 return LengthModifier::l; 194 } 195 case ('h'): 196 if (str[*local_pos + 1] == 'h') { 197 *local_pos += 2; 198 return LengthModifier::hh; 199 } else { 200 ++*local_pos; 201 return LengthModifier::h; 202 } 203 case ('L'): 204 ++*local_pos; 205 return LengthModifier::L; 206 case ('j'): 207 ++*local_pos; 208 return LengthModifier::j; 209 case ('z'): 210 ++*local_pos; 211 return LengthModifier::z; 212 case ('t'): 213 ++*local_pos; 214 return LengthModifier::t; 215 default: 216 return LengthModifier::NONE; 217 } 218 } 219 220 // get_next_arg_value gets the next value from the arg list as type T. get_next_arg_value()221 template <class T> LIBC_INLINE T get_next_arg_value() { 222 return args_cur.template next_var<T>(); 223 } 224 225 //---------------------------------------------------- 226 // INDEX MODE ONLY FUNCTIONS AFTER HERE: 227 //---------------------------------------------------- 228 229 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE 230 231 // parse_index parses the index of a value inside a format string. It 232 // assumes that str[*local_pos] points to character after a '%' or '*', and 233 // returns 0 if there is no closing $, or if it finds no number. If it finds a 234 // number, it will move local_pos past the end of the $, else it will not move 235 // local_pos. parse_index(size_t * local_pos)236 LIBC_INLINE size_t parse_index(size_t *local_pos) { 237 if (internal::isdigit(str[*local_pos])) { 238 auto result = internal::strtointeger<int>(str + *local_pos, 10); 239 size_t index = result.value; 240 if (str[*local_pos + result.parsed_len] != '$') 241 return 0; 242 *local_pos = 1 + result.parsed_len + *local_pos; 243 return index; 244 } 245 return 0; 246 } 247 248 // get_arg_value gets the value from the arg list at index (starting at 1). 249 // This may require parsing the format string. An index of 0 is interpreted as 250 // the next value. get_arg_value(size_t index)251 template <class T> LIBC_INLINE T get_arg_value(size_t index) { 252 if (!(index == 0 || index == args_index)) 253 args_to_index(index); 254 255 ++args_index; 256 return get_next_arg_value<T>(); 257 } 258 259 // the ArgList can only return the next item in the list. This function is 260 // used in index mode when the item that needs to be read is not the next one. 261 // It moves cur_args to the index requested so the appropriate value may 262 // be read. This may involve parsing the format string, and is in the worst 263 // case an O(n^2) operation. args_to_index(size_t index)264 LIBC_INLINE void args_to_index(size_t index) { 265 if (args_index > index) { 266 args_index = 1; 267 args_cur = args_start; 268 } 269 270 while (args_index < index) { 271 // Since all arguments must be pointers, we can just read all of them as 272 // void * and not worry about type issues. 273 args_cur.template next_var<void *>(); 274 ++args_index; 275 } 276 } 277 278 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 279 }; 280 281 } // namespace scanf_core 282 } // namespace LIBC_NAMESPACE 283 284 #endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H 285