• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #pragma once
2 
3 #include <array> // array
4 #include <clocale> // localeconv
5 #include <cstddef> // size_t
6 #include <cstdio> // snprintf
7 #include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
8 #include <initializer_list> // initializer_list
9 #include <string> // char_traits, string
10 #include <utility> // move
11 #include <vector> // vector
12 
13 #include <nlohmann/detail/input/input_adapters.hpp>
14 #include <nlohmann/detail/input/position_t.hpp>
15 #include <nlohmann/detail/macro_scope.hpp>
16 
17 namespace nlohmann
18 {
19 namespace detail
20 {
21 ///////////
22 // lexer //
23 ///////////
24 
template<typename BasicJsonType>
class lexer_base
{
  public:
    /// kinds of tokens the lexer can hand to the parser
    enum class token_type
    {
        uninitialized,    ///< the scanner has not been initialized yet
        literal_true,     ///< the literal `true`
        literal_false,    ///< the literal `false`
        literal_null,     ///< the literal `null`
        value_string,     ///< a string -- the actual value is available via get_string()
        value_unsigned,   ///< an unsigned integer -- the actual value is available via get_number_unsigned()
        value_integer,    ///< a signed integer -- the actual value is available via get_number_integer()
        value_float,      ///< a floating-point number -- the actual value is available via get_number_float()
        begin_array,      ///< the array begin character `[`
        begin_object,     ///< the object begin character `{`
        end_array,        ///< the array end character `]`
        end_object,       ///< the object end character `}`
        name_separator,   ///< the name separator `:`
        value_separator,  ///< the value separator `,`
        parse_error,      ///< a parse error occurred
        end_of_input,     ///< the end of the input buffer was reached
        literal_or_value  ///< a literal or the begin of a value (only for diagnostics)
    };

    /*!
    @brief translate a token type into a human-readable name

    The returned text is only used to build error messages.

    @param[in] t  token type to describe

    @return pointer to a string literal naming @a t; "unknown token" is
            returned for values that are not enumerators of token_type
    */
    JSON_HEDLEY_RETURNS_NON_NULL
    JSON_HEDLEY_CONST
    static const char* token_type_name(const token_type t) noexcept
    {
        // fallback that stays in place for values outside the enumeration
        const char* name = "unknown token";

        switch (t)
        {
            case token_type::uninitialized:
                name = "<uninitialized>";
                break;
            case token_type::literal_true:
                name = "true literal";
                break;
            case token_type::literal_false:
                name = "false literal";
                break;
            case token_type::literal_null:
                name = "null literal";
                break;
            case token_type::value_string:
                name = "string literal";
                break;
            // all three number flavors share one description
            case token_type::value_unsigned:
            case token_type::value_integer:
            case token_type::value_float:
                name = "number literal";
                break;
            case token_type::begin_array:
                name = "'['";
                break;
            case token_type::begin_object:
                name = "'{'";
                break;
            case token_type::end_array:
                name = "']'";
                break;
            case token_type::end_object:
                name = "'}'";
                break;
            case token_type::name_separator:
                name = "':'";
                break;
            case token_type::value_separator:
                name = "','";
                break;
            case token_type::parse_error:
                name = "<parse error>";
                break;
            case token_type::end_of_input:
                name = "end of input";
                break;
            case token_type::literal_or_value:
                name = "'[', '{', or a literal";
                break;
            // LCOV_EXCL_START
            default: // non-enum value: keep the fallback name
                break;
                // LCOV_EXCL_STOP
        }

        return name;
    }
};
97 /*!
98 @brief lexical analysis
99 
100 This class organizes the lexical analysis during JSON deserialization.
101 */
102 template<typename BasicJsonType, typename InputAdapterType>
103 class lexer : public lexer_base<BasicJsonType>
104 {
105     using number_integer_t = typename BasicJsonType::number_integer_t;
106     using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
107     using number_float_t = typename BasicJsonType::number_float_t;
108     using string_t = typename BasicJsonType::string_t;
109     using char_type = typename InputAdapterType::char_type;
110     using char_int_type = typename std::char_traits<char_type>::int_type;
111 
112   public:
113     using token_type = typename lexer_base<BasicJsonType>::token_type;
114 
lexer(InputAdapterType && adapter,bool ignore_comments_=false)115     explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false)
116         : ia(std::move(adapter))
117         , ignore_comments(ignore_comments_)
118         , decimal_point_char(static_cast<char_int_type>(get_decimal_point()))
119     {}
120 
121     // delete because of pointer members
122     lexer(const lexer&) = delete;
123     lexer(lexer&&) = default;
124     lexer& operator=(lexer&) = delete;
125     lexer& operator=(lexer&&) = default;
126     ~lexer() = default;
127 
128   private:
129     /////////////////////
130     // locales
131     /////////////////////
132 
133     /// return the locale-dependent decimal point
134     JSON_HEDLEY_PURE
get_decimal_point()135     static char get_decimal_point() noexcept
136     {
137         const auto* loc = localeconv();
138         JSON_ASSERT(loc != nullptr);
139         return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point);
140     }
141 
142     /////////////////////
143     // scan functions
144     /////////////////////
145 
146     /*!
147     @brief get codepoint from 4 hex characters following `\u`
148 
149     For input "\u c1 c2 c3 c4" the codepoint is:
150       (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
151     = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
152 
153     Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
154     must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
155     conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
156     between the ASCII value of the character and the desired integer value.
157 
158     @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
159             non-hex character)
160     */
get_codepoint()161     int get_codepoint()
162     {
163         // this function only makes sense after reading `\u`
164         JSON_ASSERT(current == 'u');
165         int codepoint = 0;
166 
167         const auto factors = { 12u, 8u, 4u, 0u };
168         for (const auto factor : factors)
169         {
170             get();
171 
172             if (current >= '0' && current <= '9')
173             {
174                 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor);
175             }
176             else if (current >= 'A' && current <= 'F')
177             {
178                 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor);
179             }
180             else if (current >= 'a' && current <= 'f')
181             {
182                 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor);
183             }
184             else
185             {
186                 return -1;
187             }
188         }
189 
190         JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF);
191         return codepoint;
192     }
193 
194     /*!
195     @brief check if the next byte(s) are inside a given range
196 
197     Adds the current byte and, for each passed range, reads a new byte and
198     checks if it is inside the range. If a violation was detected, set up an
199     error message and return false. Otherwise, return true.
200 
201     @param[in] ranges  list of integers; interpreted as list of pairs of
202                        inclusive lower and upper bound, respectively
203 
204     @pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
205          1, 2, or 3 pairs. This precondition is enforced by an assertion.
206 
207     @return true if and only if no range violation was detected
208     */
next_byte_in_range(std::initializer_list<char_int_type> ranges)209     bool next_byte_in_range(std::initializer_list<char_int_type> ranges)
210     {
211         JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6);
212         add(current);
213 
214         for (auto range = ranges.begin(); range != ranges.end(); ++range)
215         {
216             get();
217             if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range)))
218             {
219                 add(current);
220             }
221             else
222             {
223                 error_message = "invalid string: ill-formed UTF-8 byte";
224                 return false;
225             }
226         }
227 
228         return true;
229     }
230 
231     /*!
232     @brief scan a string literal
233 
234     This function scans a string according to Sect. 7 of RFC 7159. While
235     scanning, bytes are unescaped and copied into buffer token_buffer. When the
236     function returns successfully, token_buffer is *not* null-terminated (as it
237     may contain \0 bytes), and token_buffer.size() is the number of bytes in the
238     string.
239 
240     @return token_type::value_string if string could be successfully scanned,
241             token_type::parse_error otherwise
242 
243     @note In case of errors, variable error_message contains a textual
244           description.
245     */
scan_string()246     token_type scan_string()
247     {
248         // reset token_buffer (ignore opening quote)
249         reset();
250 
251         // we entered the function by reading an open quote
252         JSON_ASSERT(current == '\"');
253 
254         while (true)
255         {
256             // get next character
257             switch (get())
258             {
259                 // end of file while parsing string
260                 case std::char_traits<char_type>::eof():
261                 {
262                     error_message = "invalid string: missing closing quote";
263                     return token_type::parse_error;
264                 }
265 
266                 // closing quote
267                 case '\"':
268                 {
269                     return token_type::value_string;
270                 }
271 
272                 // escapes
273                 case '\\':
274                 {
275                     switch (get())
276                     {
277                         // quotation mark
278                         case '\"':
279                             add('\"');
280                             break;
281                         // reverse solidus
282                         case '\\':
283                             add('\\');
284                             break;
285                         // solidus
286                         case '/':
287                             add('/');
288                             break;
289                         // backspace
290                         case 'b':
291                             add('\b');
292                             break;
293                         // form feed
294                         case 'f':
295                             add('\f');
296                             break;
297                         // line feed
298                         case 'n':
299                             add('\n');
300                             break;
301                         // carriage return
302                         case 'r':
303                             add('\r');
304                             break;
305                         // tab
306                         case 't':
307                             add('\t');
308                             break;
309 
310                         // unicode escapes
311                         case 'u':
312                         {
313                             const int codepoint1 = get_codepoint();
314                             int codepoint = codepoint1; // start with codepoint1
315 
316                             if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
317                             {
318                                 error_message = "invalid string: '\\u' must be followed by 4 hex digits";
319                                 return token_type::parse_error;
320                             }
321 
322                             // check if code point is a high surrogate
323                             if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
324                             {
325                                 // expect next \uxxxx entry
326                                 if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u'))
327                                 {
328                                     const int codepoint2 = get_codepoint();
329 
330                                     if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
331                                     {
332                                         error_message = "invalid string: '\\u' must be followed by 4 hex digits";
333                                         return token_type::parse_error;
334                                     }
335 
336                                     // check if codepoint2 is a low surrogate
337                                     if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
338                                     {
339                                         // overwrite codepoint
340                                         codepoint = static_cast<int>(
341                                                         // high surrogate occupies the most significant 22 bits
342                                                         (static_cast<unsigned int>(codepoint1) << 10u)
343                                                         // low surrogate occupies the least significant 15 bits
344                                                         + static_cast<unsigned int>(codepoint2)
345                                                         // there is still the 0xD800, 0xDC00 and 0x10000 noise
346                                                         // in the result so we have to subtract with:
347                                                         // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
348                                                         - 0x35FDC00u);
349                                     }
350                                     else
351                                     {
352                                         error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
353                                         return token_type::parse_error;
354                                     }
355                                 }
356                                 else
357                                 {
358                                     error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
359                                     return token_type::parse_error;
360                                 }
361                             }
362                             else
363                             {
364                                 if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))
365                                 {
366                                     error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
367                                     return token_type::parse_error;
368                                 }
369                             }
370 
371                             // result of the above calculation yields a proper codepoint
372                             JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);
373 
374                             // translate codepoint into bytes
375                             if (codepoint < 0x80)
376                             {
377                                 // 1-byte characters: 0xxxxxxx (ASCII)
378                                 add(static_cast<char_int_type>(codepoint));
379                             }
380                             else if (codepoint <= 0x7FF)
381                             {
382                                 // 2-byte characters: 110xxxxx 10xxxxxx
383                                 add(static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
384                                 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
385                             }
386                             else if (codepoint <= 0xFFFF)
387                             {
388                                 // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
389                                 add(static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
390                                 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
391                                 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
392                             }
393                             else
394                             {
395                                 // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
396                                 add(static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
397                                 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
398                                 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
399                                 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
400                             }
401 
402                             break;
403                         }
404 
405                         // other characters after escape
406                         default:
407                             error_message = "invalid string: forbidden character after backslash";
408                             return token_type::parse_error;
409                     }
410 
411                     break;
412                 }
413 
414                 // invalid control characters
415                 case 0x00:
416                 {
417                     error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
418                     return token_type::parse_error;
419                 }
420 
421                 case 0x01:
422                 {
423                     error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
424                     return token_type::parse_error;
425                 }
426 
427                 case 0x02:
428                 {
429                     error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
430                     return token_type::parse_error;
431                 }
432 
433                 case 0x03:
434                 {
435                     error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
436                     return token_type::parse_error;
437                 }
438 
439                 case 0x04:
440                 {
441                     error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
442                     return token_type::parse_error;
443                 }
444 
445                 case 0x05:
446                 {
447                     error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
448                     return token_type::parse_error;
449                 }
450 
451                 case 0x06:
452                 {
453                     error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
454                     return token_type::parse_error;
455                 }
456 
457                 case 0x07:
458                 {
459                     error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
460                     return token_type::parse_error;
461                 }
462 
463                 case 0x08:
464                 {
465                     error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
466                     return token_type::parse_error;
467                 }
468 
469                 case 0x09:
470                 {
471                     error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
472                     return token_type::parse_error;
473                 }
474 
475                 case 0x0A:
476                 {
477                     error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
478                     return token_type::parse_error;
479                 }
480 
481                 case 0x0B:
482                 {
483                     error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
484                     return token_type::parse_error;
485                 }
486 
487                 case 0x0C:
488                 {
489                     error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
490                     return token_type::parse_error;
491                 }
492 
493                 case 0x0D:
494                 {
495                     error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
496                     return token_type::parse_error;
497                 }
498 
499                 case 0x0E:
500                 {
501                     error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
502                     return token_type::parse_error;
503                 }
504 
505                 case 0x0F:
506                 {
507                     error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
508                     return token_type::parse_error;
509                 }
510 
511                 case 0x10:
512                 {
513                     error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
514                     return token_type::parse_error;
515                 }
516 
517                 case 0x11:
518                 {
519                     error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
520                     return token_type::parse_error;
521                 }
522 
523                 case 0x12:
524                 {
525                     error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
526                     return token_type::parse_error;
527                 }
528 
529                 case 0x13:
530                 {
531                     error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
532                     return token_type::parse_error;
533                 }
534 
535                 case 0x14:
536                 {
537                     error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
538                     return token_type::parse_error;
539                 }
540 
541                 case 0x15:
542                 {
543                     error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
544                     return token_type::parse_error;
545                 }
546 
547                 case 0x16:
548                 {
549                     error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
550                     return token_type::parse_error;
551                 }
552 
553                 case 0x17:
554                 {
555                     error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
556                     return token_type::parse_error;
557                 }
558 
559                 case 0x18:
560                 {
561                     error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
562                     return token_type::parse_error;
563                 }
564 
565                 case 0x19:
566                 {
567                     error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
568                     return token_type::parse_error;
569                 }
570 
571                 case 0x1A:
572                 {
573                     error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
574                     return token_type::parse_error;
575                 }
576 
577                 case 0x1B:
578                 {
579                     error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
580                     return token_type::parse_error;
581                 }
582 
583                 case 0x1C:
584                 {
585                     error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
586                     return token_type::parse_error;
587                 }
588 
589                 case 0x1D:
590                 {
591                     error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
592                     return token_type::parse_error;
593                 }
594 
595                 case 0x1E:
596                 {
597                     error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
598                     return token_type::parse_error;
599                 }
600 
601                 case 0x1F:
602                 {
603                     error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
604                     return token_type::parse_error;
605                 }
606 
607                 // U+0020..U+007F (except U+0022 (quote) and U+005C (backslash))
608                 case 0x20:
609                 case 0x21:
610                 case 0x23:
611                 case 0x24:
612                 case 0x25:
613                 case 0x26:
614                 case 0x27:
615                 case 0x28:
616                 case 0x29:
617                 case 0x2A:
618                 case 0x2B:
619                 case 0x2C:
620                 case 0x2D:
621                 case 0x2E:
622                 case 0x2F:
623                 case 0x30:
624                 case 0x31:
625                 case 0x32:
626                 case 0x33:
627                 case 0x34:
628                 case 0x35:
629                 case 0x36:
630                 case 0x37:
631                 case 0x38:
632                 case 0x39:
633                 case 0x3A:
634                 case 0x3B:
635                 case 0x3C:
636                 case 0x3D:
637                 case 0x3E:
638                 case 0x3F:
639                 case 0x40:
640                 case 0x41:
641                 case 0x42:
642                 case 0x43:
643                 case 0x44:
644                 case 0x45:
645                 case 0x46:
646                 case 0x47:
647                 case 0x48:
648                 case 0x49:
649                 case 0x4A:
650                 case 0x4B:
651                 case 0x4C:
652                 case 0x4D:
653                 case 0x4E:
654                 case 0x4F:
655                 case 0x50:
656                 case 0x51:
657                 case 0x52:
658                 case 0x53:
659                 case 0x54:
660                 case 0x55:
661                 case 0x56:
662                 case 0x57:
663                 case 0x58:
664                 case 0x59:
665                 case 0x5A:
666                 case 0x5B:
667                 case 0x5D:
668                 case 0x5E:
669                 case 0x5F:
670                 case 0x60:
671                 case 0x61:
672                 case 0x62:
673                 case 0x63:
674                 case 0x64:
675                 case 0x65:
676                 case 0x66:
677                 case 0x67:
678                 case 0x68:
679                 case 0x69:
680                 case 0x6A:
681                 case 0x6B:
682                 case 0x6C:
683                 case 0x6D:
684                 case 0x6E:
685                 case 0x6F:
686                 case 0x70:
687                 case 0x71:
688                 case 0x72:
689                 case 0x73:
690                 case 0x74:
691                 case 0x75:
692                 case 0x76:
693                 case 0x77:
694                 case 0x78:
695                 case 0x79:
696                 case 0x7A:
697                 case 0x7B:
698                 case 0x7C:
699                 case 0x7D:
700                 case 0x7E:
701                 case 0x7F:
702                 {
703                     add(current);
704                     break;
705                 }
706 
707                 // U+0080..U+07FF: bytes C2..DF 80..BF
708                 case 0xC2:
709                 case 0xC3:
710                 case 0xC4:
711                 case 0xC5:
712                 case 0xC6:
713                 case 0xC7:
714                 case 0xC8:
715                 case 0xC9:
716                 case 0xCA:
717                 case 0xCB:
718                 case 0xCC:
719                 case 0xCD:
720                 case 0xCE:
721                 case 0xCF:
722                 case 0xD0:
723                 case 0xD1:
724                 case 0xD2:
725                 case 0xD3:
726                 case 0xD4:
727                 case 0xD5:
728                 case 0xD6:
729                 case 0xD7:
730                 case 0xD8:
731                 case 0xD9:
732                 case 0xDA:
733                 case 0xDB:
734                 case 0xDC:
735                 case 0xDD:
736                 case 0xDE:
737                 case 0xDF:
738                 {
739                     if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
740                     {
741                         return token_type::parse_error;
742                     }
743                     break;
744                 }
745 
746                 // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
747                 case 0xE0:
748                 {
749                     if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
750                     {
751                         return token_type::parse_error;
752                     }
753                     break;
754                 }
755 
756                 // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
757                 // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
758                 case 0xE1:
759                 case 0xE2:
760                 case 0xE3:
761                 case 0xE4:
762                 case 0xE5:
763                 case 0xE6:
764                 case 0xE7:
765                 case 0xE8:
766                 case 0xE9:
767                 case 0xEA:
768                 case 0xEB:
769                 case 0xEC:
770                 case 0xEE:
771                 case 0xEF:
772                 {
773                     if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
774                     {
775                         return token_type::parse_error;
776                     }
777                     break;
778                 }
779 
780                 // U+D000..U+D7FF: bytes ED 80..9F 80..BF
781                 case 0xED:
782                 {
783                     if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
784                     {
785                         return token_type::parse_error;
786                     }
787                     break;
788                 }
789 
790                 // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
791                 case 0xF0:
792                 {
793                     if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
794                     {
795                         return token_type::parse_error;
796                     }
797                     break;
798                 }
799 
800                 // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
801                 case 0xF1:
802                 case 0xF2:
803                 case 0xF3:
804                 {
805                     if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
806                     {
807                         return token_type::parse_error;
808                     }
809                     break;
810                 }
811 
812                 // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
813                 case 0xF4:
814                 {
815                     if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
816                     {
817                         return token_type::parse_error;
818                     }
819                     break;
820                 }
821 
822                 // remaining bytes (80..C1 and F5..FF) are ill-formed
823                 default:
824                 {
825                     error_message = "invalid string: ill-formed UTF-8 byte";
826                     return token_type::parse_error;
827                 }
828             }
829         }
830     }
831 
832     /*!
833      * @brief scan a comment
834      * @return whether comment could be scanned successfully
835      */
scan_comment()836     bool scan_comment()
837     {
838         switch (get())
839         {
840             // single-line comments skip input until a newline or EOF is read
841             case '/':
842             {
843                 while (true)
844                 {
845                     switch (get())
846                     {
847                         case '\n':
848                         case '\r':
849                         case std::char_traits<char_type>::eof():
850                         case '\0':
851                             return true;
852 
853                         default:
854                             break;
855                     }
856                 }
857             }
858 
859             // multi-line comments skip input until */ is read
860             case '*':
861             {
862                 while (true)
863                 {
864                     switch (get())
865                     {
866                         case std::char_traits<char_type>::eof():
867                         case '\0':
868                         {
869                             error_message = "invalid comment; missing closing '*/'";
870                             return false;
871                         }
872 
873                         case '*':
874                         {
875                             switch (get())
876                             {
877                                 case '/':
878                                     return true;
879 
880                                 default:
881                                 {
882                                     unget();
883                                     continue;
884                                 }
885                             }
886                         }
887 
888                         default:
889                             continue;
890                     }
891                 }
892             }
893 
894             // unexpected character after reading '/'
895             default:
896             {
897                 error_message = "invalid comment; expecting '/' or '*' after '/'";
898                 return false;
899             }
900         }
901     }
902 
    /*!
    @brief convert a C string to `float` (overload selected by the type of @a f)

    @param[out] f       receives the result of std::strtof
    @param[in]  str     the string to convert
    @param[out] endptr  forwarded unchanged to std::strtof
    */
    JSON_HEDLEY_NON_NULL(2)
    static void strtof(float& f, const char* str, char** endptr) noexcept
    {
        f = std::strtof(str, endptr);
    }
908 
    /*!
    @brief convert a C string to `double` (overload selected by the type of @a f)

    @param[out] f       receives the result of std::strtod
    @param[in]  str     the string to convert
    @param[out] endptr  forwarded unchanged to std::strtod
    */
    JSON_HEDLEY_NON_NULL(2)
    static void strtof(double& f, const char* str, char** endptr) noexcept
    {
        f = std::strtod(str, endptr);
    }
914 
    /*!
    @brief convert a C string to `long double` (overload selected by the type of @a f)

    @param[out] f       receives the result of std::strtold
    @param[in]  str     the string to convert
    @param[out] endptr  forwarded unchanged to std::strtold
    */
    JSON_HEDLEY_NON_NULL(2)
    static void strtof(long double& f, const char* str, char** endptr) noexcept
    {
        f = std::strtold(str, endptr);
    }
920 
    /*!
    @brief scan a number literal

    This function scans a string according to Sect. 6 of RFC 7159.

    The function is realized with a deterministic finite state machine derived
    from the grammar described in RFC 7159. Starting in state "init", the
    input is read and used to determine the next state. Only state "done"
    accepts the number. State "error" is a trap state to model errors. In the
    table below, "anything" means any character but the ones listed before.

    state    | 0        | 1-9      | e E      | +       | -       | .        | anything
    ---------|----------|----------|----------|---------|---------|----------|-----------
    init     | zero     | any1     | [error]  | [error] | minus   | [error]  | [error]
    minus    | zero     | any1     | [error]  | [error] | [error] | [error]  | [error]
    zero     | done     | done     | exponent | done    | done    | decimal1 | done
    any1     | any1     | any1     | exponent | done    | done    | decimal1 | done
    decimal1 | decimal2 | decimal2 | [error]  | [error] | [error] | [error]  | [error]
    decimal2 | decimal2 | decimal2 | exponent | done    | done    | done     | done
    exponent | any2     | any2     | [error]  | sign    | sign    | [error]  | [error]
    sign     | any2     | any2     | [error]  | [error] | [error] | [error]  | [error]
    any2     | any2     | any2     | done     | done    | done    | done     | done

    The state machine is realized with one label per state (prefixed with
    "scan_number_") and `goto` statements between them. The state machine
    contains cycles, but any cycle can be left when EOF is read. Therefore,
    the function is guaranteed to terminate.

    During scanning, the read bytes are stored in token_buffer. This string is
    then converted to a signed integer, an unsigned integer, or a
    floating-point number.

    @return token_type::value_unsigned, token_type::value_integer, or
            token_type::value_float if number could be successfully scanned,
            token_type::parse_error otherwise

    @note The scanner is independent of the current locale. Internally, the
          locale's decimal point is used instead of `.` to work with the
          locale-dependent converters.

    @note An integer literal whose conversion fails (errno is set, or the
          value does not survive the cast to the configured integer type) is
          converted as a floating-point number instead and reported as
          token_type::value_float.
    */
    token_type scan_number()  // lgtm [cpp/use-of-goto]
    {
        // reset token_buffer to store the number's bytes
        reset();

        // the type of the parsed number; initially set to unsigned; will be
        // changed if minus sign, decimal point or exponent is read
        token_type number_type = token_type::value_unsigned;

        // state (init): we just found out we need to scan a number
        switch (current)
        {
            case '-':
            {
                add(current);
                goto scan_number_minus;
            }

            case '0':
            {
                add(current);
                goto scan_number_zero;
            }

            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any1;
            }

            // all other characters are rejected outside scan_number()
            default:            // LCOV_EXCL_LINE
                JSON_ASSERT(false);  // LCOV_EXCL_LINE
        }

scan_number_minus:
        // state: we just parsed a leading minus sign
        number_type = token_type::value_integer;
        switch (get())
        {
            case '0':
            {
                add(current);
                goto scan_number_zero;
            }

            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any1;
            }

            default:
            {
                error_message = "invalid number; expected digit after '-'";
                return token_type::parse_error;
            }
        }

scan_number_zero:
        // state: we just parsed a zero (maybe with a leading minus sign)
        switch (get())
        {
            case '.':
            {
                // store decimal_point_char rather than the literal '.' so the
                // buffer can be handed to the locale-dependent converters
                add(decimal_point_char);
                goto scan_number_decimal1;
            }

            case 'e':
            case 'E':
            {
                add(current);
                goto scan_number_exponent;
            }

            default:
                goto scan_number_done;
        }

scan_number_any1:
        // state: we just parsed a number 0-9 (maybe with a leading minus sign)
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any1;
            }

            case '.':
            {
                // store decimal_point_char rather than the literal '.' so the
                // buffer can be handed to the locale-dependent converters
                add(decimal_point_char);
                goto scan_number_decimal1;
            }

            case 'e':
            case 'E':
            {
                add(current);
                goto scan_number_exponent;
            }

            default:
                goto scan_number_done;
        }

scan_number_decimal1:
        // state: we just parsed a decimal point
        number_type = token_type::value_float;
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_decimal2;
            }

            default:
            {
                error_message = "invalid number; expected digit after '.'";
                return token_type::parse_error;
            }
        }

scan_number_decimal2:
        // we just parsed at least one number after a decimal point
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_decimal2;
            }

            case 'e':
            case 'E':
            {
                add(current);
                goto scan_number_exponent;
            }

            default:
                goto scan_number_done;
        }

scan_number_exponent:
        // we just parsed an exponent
        number_type = token_type::value_float;
        switch (get())
        {
            case '+':
            case '-':
            {
                add(current);
                goto scan_number_sign;
            }

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any2;
            }

            default:
            {
                error_message =
                    "invalid number; expected '+', '-', or digit after exponent";
                return token_type::parse_error;
            }
        }

scan_number_sign:
        // we just parsed an exponent sign
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any2;
            }

            default:
            {
                error_message = "invalid number; expected digit after exponent sign";
                return token_type::parse_error;
            }
        }

scan_number_any2:
        // we just parsed a number after the exponent or exponent sign
        switch (get())
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                add(current);
                goto scan_number_any2;
            }

            default:
                goto scan_number_done;
        }

scan_number_done:
        // unget the character after the number (we only read it to know that
        // we are done scanning a number)
        unget();

        char* endptr = nullptr;
        // clear errno so that a failure reported by the integer conversions
        // below can be told apart from a stale value
        errno = 0;

        // try to parse integers first and fall back to floats
        if (number_type == token_type::value_unsigned)
        {
            const auto x = std::strtoull(token_buffer.data(), &endptr, 10);

            // we checked the number format before
            JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());

            if (errno == 0)
            {
                // accept the value only if the cast to number_unsigned_t did
                // not change it; otherwise fall through to the float path
                value_unsigned = static_cast<number_unsigned_t>(x);
                if (value_unsigned == x)
                {
                    return token_type::value_unsigned;
                }
            }
        }
        else if (number_type == token_type::value_integer)
        {
            const auto x = std::strtoll(token_buffer.data(), &endptr, 10);

            // we checked the number format before
            JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());

            if (errno == 0)
            {
                // accept the value only if the cast to number_integer_t did
                // not change it; otherwise fall through to the float path
                value_integer = static_cast<number_integer_t>(x);
                if (value_integer == x)
                {
                    return token_type::value_integer;
                }
            }
        }

        // this code is reached if we parse a floating-point number or if an
        // integer conversion above failed
        strtof(value_float, token_buffer.data(), &endptr);

        // we checked the number format before
        JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());

        return token_type::value_float;
    }
1285 
1286     /*!
1287     @param[in] literal_text  the literal text to expect
1288     @param[in] length        the length of the passed literal text
1289     @param[in] return_type   the token type to return on success
1290     */
1291     JSON_HEDLEY_NON_NULL(2)
scan_literal(const char_type * literal_text,const std::size_t length,token_type return_type)1292     token_type scan_literal(const char_type* literal_text, const std::size_t length,
1293                             token_type return_type)
1294     {
1295         JSON_ASSERT(std::char_traits<char_type>::to_char_type(current) == literal_text[0]);
1296         for (std::size_t i = 1; i < length; ++i)
1297         {
1298             if (JSON_HEDLEY_UNLIKELY(std::char_traits<char_type>::to_char_type(get()) != literal_text[i]))
1299             {
1300                 error_message = "invalid literal";
1301                 return token_type::parse_error;
1302             }
1303         }
1304         return return_type;
1305     }
1306 
1307     /////////////////////
1308     // input management
1309     /////////////////////
1310 
1311     /// reset token_buffer; current character is beginning of token
reset()1312     void reset() noexcept
1313     {
1314         token_buffer.clear();
1315         token_string.clear();
1316         token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1317     }
1318 
1319     /*
1320     @brief get next character from the input
1321 
1322     This function provides the interface to the used input adapter. It does
1323     not throw in case the input reached EOF, but returns a
1324     `std::char_traits<char>::eof()` in that case.  Stores the scanned characters
1325     for use in error messages.
1326 
1327     @return character read from the input
1328     */
get()1329     char_int_type get()
1330     {
1331         ++position.chars_read_total;
1332         ++position.chars_read_current_line;
1333 
1334         if (next_unget)
1335         {
1336             // just reset the next_unget variable and work with current
1337             next_unget = false;
1338         }
1339         else
1340         {
1341             current = ia.get_character();
1342         }
1343 
1344         if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1345         {
1346             token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1347         }
1348 
1349         if (current == '\n')
1350         {
1351             ++position.lines_read;
1352             position.chars_read_current_line = 0;
1353         }
1354 
1355         return current;
1356     }
1357 
1358     /*!
1359     @brief unget current character (read it again on next get)
1360 
1361     We implement unget by setting variable next_unget to true. The input is not
1362     changed - we just simulate ungetting by modifying chars_read_total,
1363     chars_read_current_line, and token_string. The next call to get() will
1364     behave as if the unget character is read again.
1365     */
unget()1366     void unget()
1367     {
1368         next_unget = true;
1369 
1370         --position.chars_read_total;
1371 
1372         // in case we "unget" a newline, we have to also decrement the lines_read
1373         if (position.chars_read_current_line == 0)
1374         {
1375             if (position.lines_read > 0)
1376             {
1377                 --position.lines_read;
1378             }
1379         }
1380         else
1381         {
1382             --position.chars_read_current_line;
1383         }
1384 
1385         if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1386         {
1387             JSON_ASSERT(!token_string.empty());
1388             token_string.pop_back();
1389         }
1390     }
1391 
1392     /// add a character to token_buffer
add(char_int_type c)1393     void add(char_int_type c)
1394     {
1395         token_buffer.push_back(static_cast<typename string_t::value_type>(c));
1396     }
1397 
1398   public:
1399     /////////////////////
1400     // value getters
1401     /////////////////////
1402 
    /// return the stored signed integer value (value_integer); scan_number()
    /// writes it when it reports token_type::value_integer
    constexpr number_integer_t get_number_integer() const noexcept
    {
        return value_integer;
    }
1408 
    /// return the stored unsigned integer value (value_unsigned); scan_number()
    /// writes it when it reports token_type::value_unsigned
    constexpr number_unsigned_t get_number_unsigned() const noexcept
    {
        return value_unsigned;
    }
1414 
    /// return the stored floating-point value (value_float); scan_number()
    /// writes it when it reports token_type::value_float
    constexpr number_float_t get_number_float() const noexcept
    {
        return value_float;
    }
1420 
    /// return a mutable reference to token_buffer, the buffer holding the
    /// current string value (no copy is made and nothing is reset here)
    /// NOTE(review): the original remark "implicitly resets the token; useful
    /// only once" presumably refers to callers moving out of the returned
    /// buffer -- confirm against the callers
    string_t& get_string()
    {
        return token_buffer;
    }
1426 
1427     /////////////////////
1428     // diagnostics
1429     /////////////////////
1430 
    /// return a copy of the current read position (the counters maintained by
    /// get() and unget()), i.e. the position of the last read token
    constexpr position_t get_position() const noexcept
    {
        return position;
    }
1436 
1437     /// return the last read token (for errors only).  Will never contain EOF
1438     /// (an arbitrary value that is not a valid char value, often -1), because
1439     /// 255 may legitimately occur.  May contain NUL, which should be escaped.
get_token_string() const1440     std::string get_token_string() const
1441     {
1442         // escape control characters
1443         std::string result;
1444         for (const auto c : token_string)
1445         {
1446             if (static_cast<unsigned char>(c) <= '\x1F')
1447             {
1448                 // escape control characters
1449                 std::array<char, 9> cs{{}};
1450                 (std::snprintf)(cs.data(), cs.size(), "<U+%.4X>", static_cast<unsigned char>(c));
1451                 result += cs.data();
1452             }
1453             else
1454             {
1455                 // add character as is
1456                 result.push_back(static_cast<std::string::value_type>(c));
1457             }
1458         }
1459 
1460         return result;
1461     }
1462 
    /// return the message describing the most recent lexer error; this is the
    /// empty string "" as long as no error message has been stored
    JSON_HEDLEY_RETURNS_NON_NULL
    constexpr const char* get_error_message() const noexcept
    {
        return error_message;
    }
1469 
1470     /////////////////////
1471     // actual scanner
1472     /////////////////////
1473 
1474     /*!
1475     @brief skip the UTF-8 byte order mark
1476     @return true iff there is no BOM or the correct BOM has been skipped
1477     */
skip_bom()1478     bool skip_bom()
1479     {
1480         if (get() == 0xEF)
1481         {
1482             // check if we completely parse the BOM
1483             return get() == 0xBB && get() == 0xBF;
1484         }
1485 
1486         // the first character is not the beginning of the BOM; unget it to
1487         // process is later
1488         unget();
1489         return true;
1490     }
1491 
skip_whitespace()1492     void skip_whitespace()
1493     {
1494         do
1495         {
1496             get();
1497         }
1498         while (current == ' ' || current == '\t' || current == '\n' || current == '\r');
1499     }
1500 
scan()1501     token_type scan()
1502     {
1503         // initially, skip the BOM
1504         if (position.chars_read_total == 0 && !skip_bom())
1505         {
1506             error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
1507             return token_type::parse_error;
1508         }
1509 
1510         // read next character and ignore whitespace
1511         skip_whitespace();
1512 
1513         // ignore comments
1514         while (ignore_comments && current == '/')
1515         {
1516             if (!scan_comment())
1517             {
1518                 return token_type::parse_error;
1519             }
1520 
1521             // skip following whitespace
1522             skip_whitespace();
1523         }
1524 
1525         switch (current)
1526         {
1527             // structural characters
1528             case '[':
1529                 return token_type::begin_array;
1530             case ']':
1531                 return token_type::end_array;
1532             case '{':
1533                 return token_type::begin_object;
1534             case '}':
1535                 return token_type::end_object;
1536             case ':':
1537                 return token_type::name_separator;
1538             case ',':
1539                 return token_type::value_separator;
1540 
1541             // literals
1542             case 't':
1543             {
1544                 std::array<char_type, 4> true_literal = {{'t', 'r', 'u', 'e'}};
1545                 return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true);
1546             }
1547             case 'f':
1548             {
1549                 std::array<char_type, 5> false_literal = {{'f', 'a', 'l', 's', 'e'}};
1550                 return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false);
1551             }
1552             case 'n':
1553             {
1554                 std::array<char_type, 4> null_literal = {{'n', 'u', 'l', 'l'}};
1555                 return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null);
1556             }
1557 
1558             // string
1559             case '\"':
1560                 return scan_string();
1561 
1562             // number
1563             case '-':
1564             case '0':
1565             case '1':
1566             case '2':
1567             case '3':
1568             case '4':
1569             case '5':
1570             case '6':
1571             case '7':
1572             case '8':
1573             case '9':
1574                 return scan_number();
1575 
1576             // end of input (the null byte is needed when parsing from
1577             // string literals)
1578             case '\0':
1579             case std::char_traits<char_type>::eof():
1580                 return token_type::end_of_input;
1581 
1582             // error
1583             default:
1584                 error_message = "invalid literal";
1585                 return token_type::parse_error;
1586         }
1587     }
1588 
  private:
    /// input adapter; get() reads from it via get_character()
    InputAdapterType ia;

    /// whether comments should be ignored (true) or signaled as errors (false)
    const bool ignore_comments = false;

    /// the current character (EOF until the first character has been read)
    char_int_type current = std::char_traits<char_type>::eof();

    /// whether the next get() call should just return current
    /// (set by unget(), cleared by get())
    bool next_unget = false;

    /// the read position: total characters read, characters read in the
    /// current line, and lines read; advanced by get() and reverted by unget()
    position_t position {};

    /// raw input token string (for error messages); never contains EOF
    std::vector<char_type> token_string {};

    /// buffer for variable-length tokens (numbers, strings)
    string_t token_buffer {};

    /// a description of occurred lexer errors (empty string if none was set)
    const char* error_message = "";

    // number values written by scan_number(); which one is valid depends on
    // the token type that scan_number() returned
    number_integer_t value_integer = 0;
    number_unsigned_t value_unsigned = 0;
    number_float_t value_float = 0;

    /// the decimal point that scan_number() stores in token_buffer in place
    /// of '.'
    /// NOTE(review): defaults to '.' here; presumably the constructor replaces
    /// it with the locale's decimal point -- confirm
    const char_int_type decimal_point_char = '.';
1621 };
1622 }  // namespace detail
1623 }  // namespace nlohmann
1624