1 #pragma once 2 3 #include <array> // array 4 #include <clocale> // localeconv 5 #include <cstddef> // size_t 6 #include <cstdio> // snprintf 7 #include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull 8 #include <initializer_list> // initializer_list 9 #include <string> // char_traits, string 10 #include <utility> // move 11 #include <vector> // vector 12 13 #include <nlohmann/detail/input/input_adapters.hpp> 14 #include <nlohmann/detail/input/position_t.hpp> 15 #include <nlohmann/detail/macro_scope.hpp> 16 17 namespace nlohmann 18 { 19 namespace detail 20 { 21 /////////// 22 // lexer // 23 /////////// 24 25 template<typename BasicJsonType> 26 class lexer_base 27 { 28 public: 29 /// token types for the parser 30 enum class token_type 31 { 32 uninitialized, ///< indicating the scanner is uninitialized 33 literal_true, ///< the `true` literal 34 literal_false, ///< the `false` literal 35 literal_null, ///< the `null` literal 36 value_string, ///< a string -- use get_string() for actual value 37 value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value 38 value_integer, ///< a signed integer -- use get_number_integer() for actual value 39 value_float, ///< an floating point number -- use get_number_float() for actual value 40 begin_array, ///< the character for array begin `[` 41 begin_object, ///< the character for object begin `{` 42 end_array, ///< the character for array end `]` 43 end_object, ///< the character for object end `}` 44 name_separator, ///< the name separator `:` 45 value_separator, ///< the value separator `,` 46 parse_error, ///< indicating a parse error 47 end_of_input, ///< indicating the end of the input buffer 48 literal_or_value ///< a literal or the begin of a value (only for diagnostics) 49 }; 50 51 /// return name of values of type token_type (only used for errors) 52 JSON_HEDLEY_RETURNS_NON_NULL 53 JSON_HEDLEY_CONST token_type_name(const token_type t)54 static const char* token_type_name(const 
token_type t) noexcept 55 { 56 switch (t) 57 { 58 case token_type::uninitialized: 59 return "<uninitialized>"; 60 case token_type::literal_true: 61 return "true literal"; 62 case token_type::literal_false: 63 return "false literal"; 64 case token_type::literal_null: 65 return "null literal"; 66 case token_type::value_string: 67 return "string literal"; 68 case token_type::value_unsigned: 69 case token_type::value_integer: 70 case token_type::value_float: 71 return "number literal"; 72 case token_type::begin_array: 73 return "'['"; 74 case token_type::begin_object: 75 return "'{'"; 76 case token_type::end_array: 77 return "']'"; 78 case token_type::end_object: 79 return "'}'"; 80 case token_type::name_separator: 81 return "':'"; 82 case token_type::value_separator: 83 return "','"; 84 case token_type::parse_error: 85 return "<parse error>"; 86 case token_type::end_of_input: 87 return "end of input"; 88 case token_type::literal_or_value: 89 return "'[', '{', or a literal"; 90 // LCOV_EXCL_START 91 default: // catch non-enum values 92 return "unknown token"; 93 // LCOV_EXCL_STOP 94 } 95 } 96 }; 97 /*! 98 @brief lexical analysis 99 100 This class organizes the lexical analysis during JSON deserialization. 
101 */ 102 template<typename BasicJsonType, typename InputAdapterType> 103 class lexer : public lexer_base<BasicJsonType> 104 { 105 using number_integer_t = typename BasicJsonType::number_integer_t; 106 using number_unsigned_t = typename BasicJsonType::number_unsigned_t; 107 using number_float_t = typename BasicJsonType::number_float_t; 108 using string_t = typename BasicJsonType::string_t; 109 using char_type = typename InputAdapterType::char_type; 110 using char_int_type = typename std::char_traits<char_type>::int_type; 111 112 public: 113 using token_type = typename lexer_base<BasicJsonType>::token_type; 114 lexer(InputAdapterType && adapter,bool ignore_comments_=false)115 explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) 116 : ia(std::move(adapter)) 117 , ignore_comments(ignore_comments_) 118 , decimal_point_char(static_cast<char_int_type>(get_decimal_point())) 119 {} 120 121 // delete because of pointer members 122 lexer(const lexer&) = delete; 123 lexer(lexer&&) = default; 124 lexer& operator=(lexer&) = delete; 125 lexer& operator=(lexer&&) = default; 126 ~lexer() = default; 127 128 private: 129 ///////////////////// 130 // locales 131 ///////////////////// 132 133 /// return the locale-dependent decimal point 134 JSON_HEDLEY_PURE get_decimal_point()135 static char get_decimal_point() noexcept 136 { 137 const auto* loc = localeconv(); 138 JSON_ASSERT(loc != nullptr); 139 return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point); 140 } 141 142 ///////////////////// 143 // scan functions 144 ///////////////////// 145 146 /*! 147 @brief get codepoint from 4 hex characters following `\u` 148 149 For input "\u c1 c2 c3 c4" the codepoint is: 150 (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 151 = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) 152 153 Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' 154 must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. 
The 155 conversion is done by subtracting the offset (0x30, 0x37, and 0x57) 156 between the ASCII value of the character and the desired integer value. 157 158 @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or 159 non-hex character) 160 */ get_codepoint()161 int get_codepoint() 162 { 163 // this function only makes sense after reading `\u` 164 JSON_ASSERT(current == 'u'); 165 int codepoint = 0; 166 167 const auto factors = { 12u, 8u, 4u, 0u }; 168 for (const auto factor : factors) 169 { 170 get(); 171 172 if (current >= '0' && current <= '9') 173 { 174 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor); 175 } 176 else if (current >= 'A' && current <= 'F') 177 { 178 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor); 179 } 180 else if (current >= 'a' && current <= 'f') 181 { 182 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor); 183 } 184 else 185 { 186 return -1; 187 } 188 } 189 190 JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF); 191 return codepoint; 192 } 193 194 /*! 195 @brief check if the next byte(s) are inside a given range 196 197 Adds the current byte and, for each passed range, reads a new byte and 198 checks if it is inside the range. If a violation was detected, set up an 199 error message and return false. Otherwise, return true. 200 201 @param[in] ranges list of integers; interpreted as list of pairs of 202 inclusive lower and upper bound, respectively 203 204 @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, 205 1, 2, or 3 pairs. This precondition is enforced by an assertion. 
206 207 @return true if and only if no range violation was detected 208 */ next_byte_in_range(std::initializer_list<char_int_type> ranges)209 bool next_byte_in_range(std::initializer_list<char_int_type> ranges) 210 { 211 JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6); 212 add(current); 213 214 for (auto range = ranges.begin(); range != ranges.end(); ++range) 215 { 216 get(); 217 if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) 218 { 219 add(current); 220 } 221 else 222 { 223 error_message = "invalid string: ill-formed UTF-8 byte"; 224 return false; 225 } 226 } 227 228 return true; 229 } 230 231 /*! 232 @brief scan a string literal 233 234 This function scans a string according to Sect. 7 of RFC 7159. While 235 scanning, bytes are escaped and copied into buffer token_buffer. Then the 236 function returns successfully, token_buffer is *not* null-terminated (as it 237 may contain \0 bytes), and token_buffer.size() is the number of bytes in the 238 string. 239 240 @return token_type::value_string if string could be successfully scanned, 241 token_type::parse_error otherwise 242 243 @note In case of errors, variable error_message contains a textual 244 description. 
245 */ scan_string()246 token_type scan_string() 247 { 248 // reset token_buffer (ignore opening quote) 249 reset(); 250 251 // we entered the function by reading an open quote 252 JSON_ASSERT(current == '\"'); 253 254 while (true) 255 { 256 // get next character 257 switch (get()) 258 { 259 // end of file while parsing string 260 case std::char_traits<char_type>::eof(): 261 { 262 error_message = "invalid string: missing closing quote"; 263 return token_type::parse_error; 264 } 265 266 // closing quote 267 case '\"': 268 { 269 return token_type::value_string; 270 } 271 272 // escapes 273 case '\\': 274 { 275 switch (get()) 276 { 277 // quotation mark 278 case '\"': 279 add('\"'); 280 break; 281 // reverse solidus 282 case '\\': 283 add('\\'); 284 break; 285 // solidus 286 case '/': 287 add('/'); 288 break; 289 // backspace 290 case 'b': 291 add('\b'); 292 break; 293 // form feed 294 case 'f': 295 add('\f'); 296 break; 297 // line feed 298 case 'n': 299 add('\n'); 300 break; 301 // carriage return 302 case 'r': 303 add('\r'); 304 break; 305 // tab 306 case 't': 307 add('\t'); 308 break; 309 310 // unicode escapes 311 case 'u': 312 { 313 const int codepoint1 = get_codepoint(); 314 int codepoint = codepoint1; // start with codepoint1 315 316 if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) 317 { 318 error_message = "invalid string: '\\u' must be followed by 4 hex digits"; 319 return token_type::parse_error; 320 } 321 322 // check if code point is a high surrogate 323 if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) 324 { 325 // expect next \uxxxx entry 326 if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) 327 { 328 const int codepoint2 = get_codepoint(); 329 330 if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) 331 { 332 error_message = "invalid string: '\\u' must be followed by 4 hex digits"; 333 return token_type::parse_error; 334 } 335 336 // check if codepoint2 is a low surrogate 337 if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) 338 { 339 // 
overwrite codepoint 340 codepoint = static_cast<int>( 341 // high surrogate occupies the most significant 22 bits 342 (static_cast<unsigned int>(codepoint1) << 10u) 343 // low surrogate occupies the least significant 15 bits 344 + static_cast<unsigned int>(codepoint2) 345 // there is still the 0xD800, 0xDC00 and 0x10000 noise 346 // in the result so we have to subtract with: 347 // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 348 - 0x35FDC00u); 349 } 350 else 351 { 352 error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; 353 return token_type::parse_error; 354 } 355 } 356 else 357 { 358 error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; 359 return token_type::parse_error; 360 } 361 } 362 else 363 { 364 if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) 365 { 366 error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"; 367 return token_type::parse_error; 368 } 369 } 370 371 // result of the above calculation yields a proper codepoint 372 JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); 373 374 // translate codepoint into bytes 375 if (codepoint < 0x80) 376 { 377 // 1-byte characters: 0xxxxxxx (ASCII) 378 add(static_cast<char_int_type>(codepoint)); 379 } 380 else if (codepoint <= 0x7FF) 381 { 382 // 2-byte characters: 110xxxxx 10xxxxxx 383 add(static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u))); 384 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu))); 385 } 386 else if (codepoint <= 0xFFFF) 387 { 388 // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx 389 add(static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u))); 390 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu))); 391 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu))); 392 } 393 else 394 { 395 // 
4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 396 add(static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u))); 397 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu))); 398 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu))); 399 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu))); 400 } 401 402 break; 403 } 404 405 // other characters after escape 406 default: 407 error_message = "invalid string: forbidden character after backslash"; 408 return token_type::parse_error; 409 } 410 411 break; 412 } 413 414 // invalid control characters 415 case 0x00: 416 { 417 error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; 418 return token_type::parse_error; 419 } 420 421 case 0x01: 422 { 423 error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; 424 return token_type::parse_error; 425 } 426 427 case 0x02: 428 { 429 error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; 430 return token_type::parse_error; 431 } 432 433 case 0x03: 434 { 435 error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; 436 return token_type::parse_error; 437 } 438 439 case 0x04: 440 { 441 error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; 442 return token_type::parse_error; 443 } 444 445 case 0x05: 446 { 447 error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; 448 return token_type::parse_error; 449 } 450 451 case 0x06: 452 { 453 error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; 454 return token_type::parse_error; 455 } 456 457 case 0x07: 458 { 459 error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; 460 return 
token_type::parse_error; 461 } 462 463 case 0x08: 464 { 465 error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; 466 return token_type::parse_error; 467 } 468 469 case 0x09: 470 { 471 error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; 472 return token_type::parse_error; 473 } 474 475 case 0x0A: 476 { 477 error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n"; 478 return token_type::parse_error; 479 } 480 481 case 0x0B: 482 { 483 error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; 484 return token_type::parse_error; 485 } 486 487 case 0x0C: 488 { 489 error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; 490 return token_type::parse_error; 491 } 492 493 case 0x0D: 494 { 495 error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; 496 return token_type::parse_error; 497 } 498 499 case 0x0E: 500 { 501 error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; 502 return token_type::parse_error; 503 } 504 505 case 0x0F: 506 { 507 error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; 508 return token_type::parse_error; 509 } 510 511 case 0x10: 512 { 513 error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; 514 return token_type::parse_error; 515 } 516 517 case 0x11: 518 { 519 error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; 520 return token_type::parse_error; 521 } 522 523 case 0x12: 524 { 525 error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; 526 return token_type::parse_error; 527 } 528 529 case 0x13: 530 { 531 error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; 532 return 
token_type::parse_error; 533 } 534 535 case 0x14: 536 { 537 error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; 538 return token_type::parse_error; 539 } 540 541 case 0x15: 542 { 543 error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; 544 return token_type::parse_error; 545 } 546 547 case 0x16: 548 { 549 error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; 550 return token_type::parse_error; 551 } 552 553 case 0x17: 554 { 555 error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017"; 556 return token_type::parse_error; 557 } 558 559 case 0x18: 560 { 561 error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; 562 return token_type::parse_error; 563 } 564 565 case 0x19: 566 { 567 error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; 568 return token_type::parse_error; 569 } 570 571 case 0x1A: 572 { 573 error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; 574 return token_type::parse_error; 575 } 576 577 case 0x1B: 578 { 579 error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; 580 return token_type::parse_error; 581 } 582 583 case 0x1C: 584 { 585 error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; 586 return token_type::parse_error; 587 } 588 589 case 0x1D: 590 { 591 error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; 592 return token_type::parse_error; 593 } 594 595 case 0x1E: 596 { 597 error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; 598 return token_type::parse_error; 599 } 600 601 case 0x1F: 602 { 603 error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; 604 return token_type::parse_error; 605 } 606 
607 // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) 608 case 0x20: 609 case 0x21: 610 case 0x23: 611 case 0x24: 612 case 0x25: 613 case 0x26: 614 case 0x27: 615 case 0x28: 616 case 0x29: 617 case 0x2A: 618 case 0x2B: 619 case 0x2C: 620 case 0x2D: 621 case 0x2E: 622 case 0x2F: 623 case 0x30: 624 case 0x31: 625 case 0x32: 626 case 0x33: 627 case 0x34: 628 case 0x35: 629 case 0x36: 630 case 0x37: 631 case 0x38: 632 case 0x39: 633 case 0x3A: 634 case 0x3B: 635 case 0x3C: 636 case 0x3D: 637 case 0x3E: 638 case 0x3F: 639 case 0x40: 640 case 0x41: 641 case 0x42: 642 case 0x43: 643 case 0x44: 644 case 0x45: 645 case 0x46: 646 case 0x47: 647 case 0x48: 648 case 0x49: 649 case 0x4A: 650 case 0x4B: 651 case 0x4C: 652 case 0x4D: 653 case 0x4E: 654 case 0x4F: 655 case 0x50: 656 case 0x51: 657 case 0x52: 658 case 0x53: 659 case 0x54: 660 case 0x55: 661 case 0x56: 662 case 0x57: 663 case 0x58: 664 case 0x59: 665 case 0x5A: 666 case 0x5B: 667 case 0x5D: 668 case 0x5E: 669 case 0x5F: 670 case 0x60: 671 case 0x61: 672 case 0x62: 673 case 0x63: 674 case 0x64: 675 case 0x65: 676 case 0x66: 677 case 0x67: 678 case 0x68: 679 case 0x69: 680 case 0x6A: 681 case 0x6B: 682 case 0x6C: 683 case 0x6D: 684 case 0x6E: 685 case 0x6F: 686 case 0x70: 687 case 0x71: 688 case 0x72: 689 case 0x73: 690 case 0x74: 691 case 0x75: 692 case 0x76: 693 case 0x77: 694 case 0x78: 695 case 0x79: 696 case 0x7A: 697 case 0x7B: 698 case 0x7C: 699 case 0x7D: 700 case 0x7E: 701 case 0x7F: 702 { 703 add(current); 704 break; 705 } 706 707 // U+0080..U+07FF: bytes C2..DF 80..BF 708 case 0xC2: 709 case 0xC3: 710 case 0xC4: 711 case 0xC5: 712 case 0xC6: 713 case 0xC7: 714 case 0xC8: 715 case 0xC9: 716 case 0xCA: 717 case 0xCB: 718 case 0xCC: 719 case 0xCD: 720 case 0xCE: 721 case 0xCF: 722 case 0xD0: 723 case 0xD1: 724 case 0xD2: 725 case 0xD3: 726 case 0xD4: 727 case 0xD5: 728 case 0xD6: 729 case 0xD7: 730 case 0xD8: 731 case 0xD9: 732 case 0xDA: 733 case 0xDB: 734 case 0xDC: 735 case 0xDD: 736 case 
0xDE: 737 case 0xDF: 738 { 739 if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) 740 { 741 return token_type::parse_error; 742 } 743 break; 744 } 745 746 // U+0800..U+0FFF: bytes E0 A0..BF 80..BF 747 case 0xE0: 748 { 749 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) 750 { 751 return token_type::parse_error; 752 } 753 break; 754 } 755 756 // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF 757 // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF 758 case 0xE1: 759 case 0xE2: 760 case 0xE3: 761 case 0xE4: 762 case 0xE5: 763 case 0xE6: 764 case 0xE7: 765 case 0xE8: 766 case 0xE9: 767 case 0xEA: 768 case 0xEB: 769 case 0xEC: 770 case 0xEE: 771 case 0xEF: 772 { 773 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) 774 { 775 return token_type::parse_error; 776 } 777 break; 778 } 779 780 // U+D000..U+D7FF: bytes ED 80..9F 80..BF 781 case 0xED: 782 { 783 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) 784 { 785 return token_type::parse_error; 786 } 787 break; 788 } 789 790 // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF 791 case 0xF0: 792 { 793 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) 794 { 795 return token_type::parse_error; 796 } 797 break; 798 } 799 800 // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF 801 case 0xF1: 802 case 0xF2: 803 case 0xF3: 804 { 805 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) 806 { 807 return token_type::parse_error; 808 } 809 break; 810 } 811 812 // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 813 case 0xF4: 814 { 815 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) 816 { 817 return token_type::parse_error; 818 } 819 break; 820 } 821 822 // remaining bytes (80..C1 and F5..FF) are ill-formed 823 default: 824 { 825 error_message = "invalid string: ill-formed UTF-8 byte"; 826 return token_type::parse_error; 827 } 828 } 829 } 830 } 831 832 /*! 
833 * @brief scan a comment 834 * @return whether comment could be scanned successfully 835 */ scan_comment()836 bool scan_comment() 837 { 838 switch (get()) 839 { 840 // single-line comments skip input until a newline or EOF is read 841 case '/': 842 { 843 while (true) 844 { 845 switch (get()) 846 { 847 case '\n': 848 case '\r': 849 case std::char_traits<char_type>::eof(): 850 case '\0': 851 return true; 852 853 default: 854 break; 855 } 856 } 857 } 858 859 // multi-line comments skip input until */ is read 860 case '*': 861 { 862 while (true) 863 { 864 switch (get()) 865 { 866 case std::char_traits<char_type>::eof(): 867 case '\0': 868 { 869 error_message = "invalid comment; missing closing '*/'"; 870 return false; 871 } 872 873 case '*': 874 { 875 switch (get()) 876 { 877 case '/': 878 return true; 879 880 default: 881 { 882 unget(); 883 continue; 884 } 885 } 886 } 887 888 default: 889 continue; 890 } 891 } 892 } 893 894 // unexpected character after reading '/' 895 default: 896 { 897 error_message = "invalid comment; expecting '/' or '*' after '/'"; 898 return false; 899 } 900 } 901 } 902 903 JSON_HEDLEY_NON_NULL(2) strtof(float & f,const char * str,char ** endptr)904 static void strtof(float& f, const char* str, char** endptr) noexcept 905 { 906 f = std::strtof(str, endptr); 907 } 908 909 JSON_HEDLEY_NON_NULL(2) strtof(double & f,const char * str,char ** endptr)910 static void strtof(double& f, const char* str, char** endptr) noexcept 911 { 912 f = std::strtod(str, endptr); 913 } 914 915 JSON_HEDLEY_NON_NULL(2) strtof(long double & f,const char * str,char ** endptr)916 static void strtof(long double& f, const char* str, char** endptr) noexcept 917 { 918 f = std::strtold(str, endptr); 919 } 920 921 /*! 922 @brief scan a number literal 923 924 This function scans a string according to Sect. 6 of RFC 7159. 925 926 The function is realized with a deterministic finite state machine derived 927 from the grammar described in RFC 7159. 
Starting in state "init", the
    input is read and used to determine the next state. Only state "done"
    accepts the number. State "error" is a trap state to model errors. In the
    table below, "anything" means any character but the ones listed before.

    state    | 0        | 1-9      | e E      | +       | -       | .        | anything
    ---------|----------|----------|----------|---------|---------|----------|-----------
    init     | zero     | any1     | [error]  | [error] | minus   | [error]  | [error]
    minus    | zero     | any1     | [error]  | [error] | [error] | [error]  | [error]
    zero     | done     | done     | exponent | done    | done    | decimal1 | done
    any1     | any1     | any1     | exponent | done    | done    | decimal1 | done
    decimal1 | decimal2 | decimal2 | [error]  | [error] | [error] | [error]  | [error]
    decimal2 | decimal2 | decimal2 | exponent | done    | done    | done     | done
    exponent | any2     | any2     | [error]  | sign    | sign    | [error]  | [error]
    sign     | any2     | any2     | [error]  | [error] | [error] | [error]  | [error]
    any2     | any2     | any2     | done     | done    | done    | done     | done

    The state machine is realized with one label per state (prefixed with
    "scan_number_") and `goto` statements between them. The state machine
    contains cycles, but any cycle can be left when EOF is read. Therefore,
    the function is guaranteed to terminate.

    During scanning, the read bytes are stored in token_buffer. This string is
    then converted to a signed integer, an unsigned integer, or a
    floating-point number.

    @return token_type::value_unsigned, token_type::value_integer, or
            token_type::value_float if number could be successfully scanned,
            token_type::parse_error otherwise

    @note The scanner is independent of the current locale. Internally, the
          locale's decimal point is used instead of `.` to work with the
          locale-dependent converters.
960 */ scan_number()961 token_type scan_number() // lgtm [cpp/use-of-goto] 962 { 963 // reset token_buffer to store the number's bytes 964 reset(); 965 966 // the type of the parsed number; initially set to unsigned; will be 967 // changed if minus sign, decimal point or exponent is read 968 token_type number_type = token_type::value_unsigned; 969 970 // state (init): we just found out we need to scan a number 971 switch (current) 972 { 973 case '-': 974 { 975 add(current); 976 goto scan_number_minus; 977 } 978 979 case '0': 980 { 981 add(current); 982 goto scan_number_zero; 983 } 984 985 case '1': 986 case '2': 987 case '3': 988 case '4': 989 case '5': 990 case '6': 991 case '7': 992 case '8': 993 case '9': 994 { 995 add(current); 996 goto scan_number_any1; 997 } 998 999 // all other characters are rejected outside scan_number() 1000 default: // LCOV_EXCL_LINE 1001 JSON_ASSERT(false); // LCOV_EXCL_LINE 1002 } 1003 1004 scan_number_minus: 1005 // state: we just parsed a leading minus sign 1006 number_type = token_type::value_integer; 1007 switch (get()) 1008 { 1009 case '0': 1010 { 1011 add(current); 1012 goto scan_number_zero; 1013 } 1014 1015 case '1': 1016 case '2': 1017 case '3': 1018 case '4': 1019 case '5': 1020 case '6': 1021 case '7': 1022 case '8': 1023 case '9': 1024 { 1025 add(current); 1026 goto scan_number_any1; 1027 } 1028 1029 default: 1030 { 1031 error_message = "invalid number; expected digit after '-'"; 1032 return token_type::parse_error; 1033 } 1034 } 1035 1036 scan_number_zero: 1037 // state: we just parse a zero (maybe with a leading minus sign) 1038 switch (get()) 1039 { 1040 case '.': 1041 { 1042 add(decimal_point_char); 1043 goto scan_number_decimal1; 1044 } 1045 1046 case 'e': 1047 case 'E': 1048 { 1049 add(current); 1050 goto scan_number_exponent; 1051 } 1052 1053 default: 1054 goto scan_number_done; 1055 } 1056 1057 scan_number_any1: 1058 // state: we just parsed a number 0-9 (maybe with a leading minus sign) 1059 switch (get()) 1060 { 
1061 case '0': 1062 case '1': 1063 case '2': 1064 case '3': 1065 case '4': 1066 case '5': 1067 case '6': 1068 case '7': 1069 case '8': 1070 case '9': 1071 { 1072 add(current); 1073 goto scan_number_any1; 1074 } 1075 1076 case '.': 1077 { 1078 add(decimal_point_char); 1079 goto scan_number_decimal1; 1080 } 1081 1082 case 'e': 1083 case 'E': 1084 { 1085 add(current); 1086 goto scan_number_exponent; 1087 } 1088 1089 default: 1090 goto scan_number_done; 1091 } 1092 1093 scan_number_decimal1: 1094 // state: we just parsed a decimal point 1095 number_type = token_type::value_float; 1096 switch (get()) 1097 { 1098 case '0': 1099 case '1': 1100 case '2': 1101 case '3': 1102 case '4': 1103 case '5': 1104 case '6': 1105 case '7': 1106 case '8': 1107 case '9': 1108 { 1109 add(current); 1110 goto scan_number_decimal2; 1111 } 1112 1113 default: 1114 { 1115 error_message = "invalid number; expected digit after '.'"; 1116 return token_type::parse_error; 1117 } 1118 } 1119 1120 scan_number_decimal2: 1121 // we just parsed at least one number after a decimal point 1122 switch (get()) 1123 { 1124 case '0': 1125 case '1': 1126 case '2': 1127 case '3': 1128 case '4': 1129 case '5': 1130 case '6': 1131 case '7': 1132 case '8': 1133 case '9': 1134 { 1135 add(current); 1136 goto scan_number_decimal2; 1137 } 1138 1139 case 'e': 1140 case 'E': 1141 { 1142 add(current); 1143 goto scan_number_exponent; 1144 } 1145 1146 default: 1147 goto scan_number_done; 1148 } 1149 1150 scan_number_exponent: 1151 // we just parsed an exponent 1152 number_type = token_type::value_float; 1153 switch (get()) 1154 { 1155 case '+': 1156 case '-': 1157 { 1158 add(current); 1159 goto scan_number_sign; 1160 } 1161 1162 case '0': 1163 case '1': 1164 case '2': 1165 case '3': 1166 case '4': 1167 case '5': 1168 case '6': 1169 case '7': 1170 case '8': 1171 case '9': 1172 { 1173 add(current); 1174 goto scan_number_any2; 1175 } 1176 1177 default: 1178 { 1179 error_message = 1180 "invalid number; expected '+', '-', or 
digit after exponent"; 1181 return token_type::parse_error; 1182 } 1183 } 1184 1185 scan_number_sign: 1186 // we just parsed an exponent sign 1187 switch (get()) 1188 { 1189 case '0': 1190 case '1': 1191 case '2': 1192 case '3': 1193 case '4': 1194 case '5': 1195 case '6': 1196 case '7': 1197 case '8': 1198 case '9': 1199 { 1200 add(current); 1201 goto scan_number_any2; 1202 } 1203 1204 default: 1205 { 1206 error_message = "invalid number; expected digit after exponent sign"; 1207 return token_type::parse_error; 1208 } 1209 } 1210 1211 scan_number_any2: 1212 // we just parsed a number after the exponent or exponent sign 1213 switch (get()) 1214 { 1215 case '0': 1216 case '1': 1217 case '2': 1218 case '3': 1219 case '4': 1220 case '5': 1221 case '6': 1222 case '7': 1223 case '8': 1224 case '9': 1225 { 1226 add(current); 1227 goto scan_number_any2; 1228 } 1229 1230 default: 1231 goto scan_number_done; 1232 } 1233 1234 scan_number_done: 1235 // unget the character after the number (we only read it to know that 1236 // we are done scanning a number) 1237 unget(); 1238 1239 char* endptr = nullptr; 1240 errno = 0; 1241 1242 // try to parse integers first and fall back to floats 1243 if (number_type == token_type::value_unsigned) 1244 { 1245 const auto x = std::strtoull(token_buffer.data(), &endptr, 10); 1246 1247 // we checked the number format before 1248 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); 1249 1250 if (errno == 0) 1251 { 1252 value_unsigned = static_cast<number_unsigned_t>(x); 1253 if (value_unsigned == x) 1254 { 1255 return token_type::value_unsigned; 1256 } 1257 } 1258 } 1259 else if (number_type == token_type::value_integer) 1260 { 1261 const auto x = std::strtoll(token_buffer.data(), &endptr, 10); 1262 1263 // we checked the number format before 1264 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); 1265 1266 if (errno == 0) 1267 { 1268 value_integer = static_cast<number_integer_t>(x); 1269 if (value_integer == x) 1270 
{ 1271 return token_type::value_integer; 1272 } 1273 } 1274 } 1275 1276 // this code is reached if we parse a floating-point number or if an 1277 // integer conversion above failed 1278 strtof(value_float, token_buffer.data(), &endptr); 1279 1280 // we checked the number format before 1281 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); 1282 1283 return token_type::value_float; 1284 } 1285 1286 /*! 1287 @param[in] literal_text the literal text to expect 1288 @param[in] length the length of the passed literal text 1289 @param[in] return_type the token type to return on success 1290 */ 1291 JSON_HEDLEY_NON_NULL(2) scan_literal(const char_type * literal_text,const std::size_t length,token_type return_type)1292 token_type scan_literal(const char_type* literal_text, const std::size_t length, 1293 token_type return_type) 1294 { 1295 JSON_ASSERT(std::char_traits<char_type>::to_char_type(current) == literal_text[0]); 1296 for (std::size_t i = 1; i < length; ++i) 1297 { 1298 if (JSON_HEDLEY_UNLIKELY(std::char_traits<char_type>::to_char_type(get()) != literal_text[i])) 1299 { 1300 error_message = "invalid literal"; 1301 return token_type::parse_error; 1302 } 1303 } 1304 return return_type; 1305 } 1306 1307 ///////////////////// 1308 // input management 1309 ///////////////////// 1310 1311 /// reset token_buffer; current character is beginning of token reset()1312 void reset() noexcept 1313 { 1314 token_buffer.clear(); 1315 token_string.clear(); 1316 token_string.push_back(std::char_traits<char_type>::to_char_type(current)); 1317 } 1318 1319 /* 1320 @brief get next character from the input 1321 1322 This function provides the interface to the used input adapter. It does 1323 not throw in case the input reached EOF, but returns a 1324 `std::char_traits<char>::eof()` in that case. Stores the scanned characters 1325 for use in error messages. 
1326 1327 @return character read from the input 1328 */ get()1329 char_int_type get() 1330 { 1331 ++position.chars_read_total; 1332 ++position.chars_read_current_line; 1333 1334 if (next_unget) 1335 { 1336 // just reset the next_unget variable and work with current 1337 next_unget = false; 1338 } 1339 else 1340 { 1341 current = ia.get_character(); 1342 } 1343 1344 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof())) 1345 { 1346 token_string.push_back(std::char_traits<char_type>::to_char_type(current)); 1347 } 1348 1349 if (current == '\n') 1350 { 1351 ++position.lines_read; 1352 position.chars_read_current_line = 0; 1353 } 1354 1355 return current; 1356 } 1357 1358 /*! 1359 @brief unget current character (read it again on next get) 1360 1361 We implement unget by setting variable next_unget to true. The input is not 1362 changed - we just simulate ungetting by modifying chars_read_total, 1363 chars_read_current_line, and token_string. The next call to get() will 1364 behave as if the unget character is read again. 
1365 */ unget()1366 void unget() 1367 { 1368 next_unget = true; 1369 1370 --position.chars_read_total; 1371 1372 // in case we "unget" a newline, we have to also decrement the lines_read 1373 if (position.chars_read_current_line == 0) 1374 { 1375 if (position.lines_read > 0) 1376 { 1377 --position.lines_read; 1378 } 1379 } 1380 else 1381 { 1382 --position.chars_read_current_line; 1383 } 1384 1385 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof())) 1386 { 1387 JSON_ASSERT(!token_string.empty()); 1388 token_string.pop_back(); 1389 } 1390 } 1391 1392 /// add a character to token_buffer add(char_int_type c)1393 void add(char_int_type c) 1394 { 1395 token_buffer.push_back(static_cast<typename string_t::value_type>(c)); 1396 } 1397 1398 public: 1399 ///////////////////// 1400 // value getters 1401 ///////////////////// 1402 1403 /// return integer value get_number_integer() const1404 constexpr number_integer_t get_number_integer() const noexcept 1405 { 1406 return value_integer; 1407 } 1408 1409 /// return unsigned integer value get_number_unsigned() const1410 constexpr number_unsigned_t get_number_unsigned() const noexcept 1411 { 1412 return value_unsigned; 1413 } 1414 1415 /// return floating-point value get_number_float() const1416 constexpr number_float_t get_number_float() const noexcept 1417 { 1418 return value_float; 1419 } 1420 1421 /// return current string value (implicitly resets the token; useful only once) get_string()1422 string_t& get_string() 1423 { 1424 return token_buffer; 1425 } 1426 1427 ///////////////////// 1428 // diagnostics 1429 ///////////////////// 1430 1431 /// return position of last read token get_position() const1432 constexpr position_t get_position() const noexcept 1433 { 1434 return position; 1435 } 1436 1437 /// return the last read token (for errors only). Will never contain EOF 1438 /// (an arbitrary value that is not a valid char value, often -1), because 1439 /// 255 may legitimately occur. 
May contain NUL, which should be escaped. get_token_string() const1440 std::string get_token_string() const 1441 { 1442 // escape control characters 1443 std::string result; 1444 for (const auto c : token_string) 1445 { 1446 if (static_cast<unsigned char>(c) <= '\x1F') 1447 { 1448 // escape control characters 1449 std::array<char, 9> cs{{}}; 1450 (std::snprintf)(cs.data(), cs.size(), "<U+%.4X>", static_cast<unsigned char>(c)); 1451 result += cs.data(); 1452 } 1453 else 1454 { 1455 // add character as is 1456 result.push_back(static_cast<std::string::value_type>(c)); 1457 } 1458 } 1459 1460 return result; 1461 } 1462 1463 /// return syntax error message 1464 JSON_HEDLEY_RETURNS_NON_NULL get_error_message() const1465 constexpr const char* get_error_message() const noexcept 1466 { 1467 return error_message; 1468 } 1469 1470 ///////////////////// 1471 // actual scanner 1472 ///////////////////// 1473 1474 /*! 1475 @brief skip the UTF-8 byte order mark 1476 @return true iff there is no BOM or the correct BOM has been skipped 1477 */ skip_bom()1478 bool skip_bom() 1479 { 1480 if (get() == 0xEF) 1481 { 1482 // check if we completely parse the BOM 1483 return get() == 0xBB && get() == 0xBF; 1484 } 1485 1486 // the first character is not the beginning of the BOM; unget it to 1487 // process is later 1488 unget(); 1489 return true; 1490 } 1491 skip_whitespace()1492 void skip_whitespace() 1493 { 1494 do 1495 { 1496 get(); 1497 } 1498 while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); 1499 } 1500 scan()1501 token_type scan() 1502 { 1503 // initially, skip the BOM 1504 if (position.chars_read_total == 0 && !skip_bom()) 1505 { 1506 error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; 1507 return token_type::parse_error; 1508 } 1509 1510 // read next character and ignore whitespace 1511 skip_whitespace(); 1512 1513 // ignore comments 1514 while (ignore_comments && current == '/') 1515 { 1516 if (!scan_comment()) 1517 { 1518 return 
token_type::parse_error; 1519 } 1520 1521 // skip following whitespace 1522 skip_whitespace(); 1523 } 1524 1525 switch (current) 1526 { 1527 // structural characters 1528 case '[': 1529 return token_type::begin_array; 1530 case ']': 1531 return token_type::end_array; 1532 case '{': 1533 return token_type::begin_object; 1534 case '}': 1535 return token_type::end_object; 1536 case ':': 1537 return token_type::name_separator; 1538 case ',': 1539 return token_type::value_separator; 1540 1541 // literals 1542 case 't': 1543 { 1544 std::array<char_type, 4> true_literal = {{'t', 'r', 'u', 'e'}}; 1545 return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true); 1546 } 1547 case 'f': 1548 { 1549 std::array<char_type, 5> false_literal = {{'f', 'a', 'l', 's', 'e'}}; 1550 return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false); 1551 } 1552 case 'n': 1553 { 1554 std::array<char_type, 4> null_literal = {{'n', 'u', 'l', 'l'}}; 1555 return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null); 1556 } 1557 1558 // string 1559 case '\"': 1560 return scan_string(); 1561 1562 // number 1563 case '-': 1564 case '0': 1565 case '1': 1566 case '2': 1567 case '3': 1568 case '4': 1569 case '5': 1570 case '6': 1571 case '7': 1572 case '8': 1573 case '9': 1574 return scan_number(); 1575 1576 // end of input (the null byte is needed when parsing from 1577 // string literals) 1578 case '\0': 1579 case std::char_traits<char_type>::eof(): 1580 return token_type::end_of_input; 1581 1582 // error 1583 default: 1584 error_message = "invalid literal"; 1585 return token_type::parse_error; 1586 } 1587 } 1588 1589 private: 1590 /// input adapter 1591 InputAdapterType ia; 1592 1593 /// whether comments should be ignored (true) or signaled as errors (false) 1594 const bool ignore_comments = false; 1595 1596 /// the current character 1597 char_int_type current = std::char_traits<char_type>::eof(); 1598 1599 /// 
whether the next get() call should just return current 1600 bool next_unget = false; 1601 1602 /// the start position of the current token 1603 position_t position {}; 1604 1605 /// raw input token string (for error messages) 1606 std::vector<char_type> token_string {}; 1607 1608 /// buffer for variable-length tokens (numbers, strings) 1609 string_t token_buffer {}; 1610 1611 /// a description of occurred lexer errors 1612 const char* error_message = ""; 1613 1614 // number values 1615 number_integer_t value_integer = 0; 1616 number_unsigned_t value_unsigned = 0; 1617 number_float_t value_float = 0; 1618 1619 /// the decimal point 1620 const char_int_type decimal_point_char = '.'; 1621 }; 1622 } // namespace detail 1623 } // namespace nlohmann 1624