1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/json/json_parser.h"
6
7 #include <cmath>
8 #include <string_view>
9 #include <utility>
10 #include <vector>
11
12 #include "base/logging.h"
13 #include "base/numerics/safe_conversions.h"
14 #include "base/strings/string_number_conversions.h"
15 #include "base/strings/string_util.h"
16 #include "base/strings/stringprintf.h"
17 #include "base/strings/utf_string_conversion_utils.h"
18 #include "base/strings/utf_string_conversions.h"
19 #include "base/third_party/icu/icu_utf.h"
20 #include "base/values.h"
21
22 namespace base {
23 namespace internal {
24
25 namespace {
26
// First code point outside 7-bit ASCII. StringBuilder::Append() only keeps
// referencing the input buffer in place for code points below this value.
constexpr int32_t kExtendedASCIIStart = 0x80;
28
29 // Simple class that checks for maximum recursion/"stack overflow."
30 class StackMarker {
31 public:
StackMarker(int max_depth,int * depth)32 StackMarker(int max_depth, int* depth)
33 : max_depth_(max_depth), depth_(depth) {
34 ++(*depth_);
35 DCHECK_LE(*depth_, max_depth_);
36 }
~StackMarker()37 ~StackMarker() { --(*depth_); }
38
IsTooDeep() const39 bool IsTooDeep() const { return *depth_ >= max_depth_; }
40
41 private:
42 const int max_depth_;
43 int* const depth_;
44
45 StackMarker(const StackMarker&) = delete;
46 StackMarker& operator=(const StackMarker&) = delete;
47 };
48
// Code point U+FFFD, appended in place of input that cannot be decoded when
// JSON_REPLACE_INVALID_CHARACTERS is set.
constexpr uint32_t kUnicodeReplacementPoint{0xFFFD};
50
51 } // namespace
52
// UTF-8 encoding of U+FFFD. StringBuilder::Append() writes these bytes when
// asked to append kUnicodeReplacementPoint.
const char kUnicodeReplacementString[] = "\xEF\xBF\xBD";
55
// Creates a parser configured with the JSON_* |options| bit flags and a
// nesting limit of |max_depth|. |max_depth| must not exceed
// JSONReader::kStackMaxDepth (enforced with CHECK). The position and error
// fields are set again at the start of every Parse() call.
JSONParser::JSONParser(int options, int max_depth)
    : options_(options),
      max_depth_(max_depth),
      index_(0),
      stack_depth_(0),
      line_number_(0),
      index_last_line_(0),
      error_code_(JSONReader::JSON_NO_ERROR),
      error_line_(0),
      error_column_(0) {
  CHECK_LE(max_depth, JSONReader::kStackMaxDepth);
}
68
// Defaulted destructor, defined out of line in this translation unit.
JSONParser::~JSONParser() = default;
70
Parse(std::string_view input)71 std::optional<Value> JSONParser::Parse(std::string_view input) {
72 input_ = input;
73 index_ = 0;
74 line_number_ = 1;
75 index_last_line_ = 0;
76
77 error_code_ = JSONReader::JSON_NO_ERROR;
78 error_line_ = 0;
79 error_column_ = 0;
80
81 // ICU and ReadUnicodeCharacter() use int32_t for lengths, so ensure
82 // that the index_ will not overflow when parsing.
83 if (!base::IsValueInRangeForNumericType<int32_t>(input.length())) {
84 ReportError(JSONReader::JSON_TOO_LARGE, 0);
85 return std::nullopt;
86 }
87
88 // When the input JSON string starts with a UTF-8 Byte-Order-Mark,
89 // advance the start position to avoid the ParseNextToken function mis-
90 // treating a Unicode BOM as an invalid character and returning NULL.
91 ConsumeIfMatch("\xEF\xBB\xBF");
92
93 // Parse the first and any nested tokens.
94 std::optional<Value> root(ParseNextToken());
95 if (!root)
96 return std::nullopt;
97
98 // Make sure the input stream is at an end.
99 if (GetNextToken() != T_END_OF_INPUT) {
100 ReportError(JSONReader::JSON_UNEXPECTED_DATA_AFTER_ROOT, 1);
101 return std::nullopt;
102 }
103
104 return root;
105 }
106
// Returns the error code recorded by the most recent ReportError() call, or
// JSON_NO_ERROR if none has been recorded since construction / Parse() start.
JSONReader::JsonParseError JSONParser::error_code() const {
  return error_code_;
}
110
GetErrorMessage() const111 std::string JSONParser::GetErrorMessage() const {
112 return FormatErrorMessage(error_line_, error_column_,
113 JSONReader::ErrorCodeToString(error_code_));
114 }
115
// Returns the line number captured by the most recent ReportError() call
// (0 if no error has been recorded).
int JSONParser::error_line() const {
  return error_line_;
}

// Returns the column captured by the most recent ReportError() call
// (0 if no error has been recorded).
int JSONParser::error_column() const {
  return error_column_;
}
123
124 // StringBuilder ///////////////////////////////////////////////////////////////
125
// Default construction delegates to the pointer constructor with a null start
// position, yielding an empty builder.
JSONParser::StringBuilder::StringBuilder() : StringBuilder(nullptr) {}

// Starts an empty builder whose in-place character data begins at |pos|.
JSONParser::StringBuilder::StringBuilder(const char* pos)
    : pos_(pos), length_(0) {}

JSONParser::StringBuilder::~StringBuilder() = default;

// Defaulted move assignment; ConsumeStringRaw() uses it to hand the finished
// builder to its caller.
JSONParser::StringBuilder& JSONParser::StringBuilder::operator=(
    StringBuilder&& other) = default;
135
Append(uint32_t point)136 void JSONParser::StringBuilder::Append(uint32_t point) {
137 DCHECK(IsValidCharacter(point));
138
139 if (point < kExtendedASCIIStart && !string_) {
140 DCHECK_EQ(static_cast<char>(point), pos_[length_]);
141 ++length_;
142 } else {
143 Convert();
144 if (UNLIKELY(point == kUnicodeReplacementPoint)) {
145 string_->append(kUnicodeReplacementString);
146 } else {
147 WriteUnicodeCharacter(point, &*string_);
148 }
149 }
150 }
151
Convert()152 void JSONParser::StringBuilder::Convert() {
153 if (string_)
154 return;
155 string_.emplace(pos_, length_);
156 }
157
DestructiveAsString()158 std::string JSONParser::StringBuilder::DestructiveAsString() {
159 if (string_)
160 return std::move(*string_);
161 return std::string(pos_, length_);
162 }
163
164 // JSONParser private //////////////////////////////////////////////////////////
165
PeekChars(int count)166 std::optional<std::string_view> JSONParser::PeekChars(int count) {
167 if (static_cast<size_t>(index_) + count > input_.length())
168 return std::nullopt;
169 // Using std::string_view::substr() is significantly slower (according to
170 // base_perftests) than constructing a substring manually.
171 return std::string_view(input_.data() + index_, count);
172 }
173
PeekChar()174 std::optional<char> JSONParser::PeekChar() {
175 std::optional<std::string_view> chars = PeekChars(1);
176 if (chars)
177 return (*chars)[0];
178 return std::nullopt;
179 }
180
ConsumeChars(int count)181 std::optional<std::string_view> JSONParser::ConsumeChars(int count) {
182 std::optional<std::string_view> chars = PeekChars(count);
183 if (chars)
184 index_ += count;
185 return chars;
186 }
187
ConsumeChar()188 std::optional<char> JSONParser::ConsumeChar() {
189 std::optional<std::string_view> chars = ConsumeChars(1);
190 if (chars)
191 return (*chars)[0];
192 return std::nullopt;
193 }
194
// Returns a pointer to the current position in the input. The position may be
// one past the last character (end of input) but never beyond that; this is
// enforced with CHECK.
const char* JSONParser::pos() {
  CHECK_LE(static_cast<size_t>(index_), input_.length());
  return input_.data() + index_;
}
199
GetNextToken()200 JSONParser::Token JSONParser::GetNextToken() {
201 EatWhitespaceAndComments();
202
203 std::optional<char> c = PeekChar();
204 if (!c)
205 return T_END_OF_INPUT;
206
207 switch (*c) {
208 case '{':
209 return T_OBJECT_BEGIN;
210 case '}':
211 return T_OBJECT_END;
212 case '[':
213 return T_ARRAY_BEGIN;
214 case ']':
215 return T_ARRAY_END;
216 case '"':
217 return T_STRING;
218 case '0':
219 case '1':
220 case '2':
221 case '3':
222 case '4':
223 case '5':
224 case '6':
225 case '7':
226 case '8':
227 case '9':
228 case '-':
229 return T_NUMBER;
230 case 't':
231 return T_BOOL_TRUE;
232 case 'f':
233 return T_BOOL_FALSE;
234 case 'n':
235 return T_NULL;
236 case ',':
237 return T_LIST_SEPARATOR;
238 case ':':
239 return T_OBJECT_PAIR_SEPARATOR;
240 default:
241 return T_INVALID_TOKEN;
242 }
243 }
244
EatWhitespaceAndComments()245 void JSONParser::EatWhitespaceAndComments() {
246 while (std::optional<char> c = PeekChar()) {
247 switch (*c) {
248 case '\r':
249 case '\n':
250 index_last_line_ = index_;
251 // Don't increment line_number_ twice for "\r\n".
252 if (!(c == '\n' && index_ > 0 && input_[index_ - 1] == '\r')) {
253 ++line_number_;
254 }
255 FALLTHROUGH;
256 case ' ':
257 case '\t':
258 ConsumeChar();
259 break;
260 case '/':
261 if (!EatComment())
262 return;
263 break;
264 default:
265 return;
266 }
267 }
268 }
269
EatComment()270 bool JSONParser::EatComment() {
271 std::optional<std::string_view> comment_start = ConsumeChars(2);
272 if (!comment_start)
273 return false;
274
275 if (comment_start == "//") {
276 // Single line comment, read to newline.
277 while (std::optional<char> c = PeekChar()) {
278 if (c == '\n' || c == '\r')
279 return true;
280 ConsumeChar();
281 }
282 } else if (comment_start == "/*") {
283 char previous_char = '\0';
284 // Block comment, read until end marker.
285 while (std::optional<char> c = PeekChar()) {
286 if (previous_char == '*' && c == '/') {
287 // EatWhitespaceAndComments will inspect pos(), which will still be on
288 // the last / of the comment, so advance once more (which may also be
289 // end of input).
290 ConsumeChar();
291 return true;
292 }
293 previous_char = *ConsumeChar();
294 }
295
296 // If the comment is unterminated, GetNextToken will report T_END_OF_INPUT.
297 }
298
299 return false;
300 }
301
ParseNextToken()302 std::optional<Value> JSONParser::ParseNextToken() {
303 return ParseToken(GetNextToken());
304 }
305
ParseToken(Token token)306 std::optional<Value> JSONParser::ParseToken(Token token) {
307 switch (token) {
308 case T_OBJECT_BEGIN:
309 return ConsumeDictionary();
310 case T_ARRAY_BEGIN:
311 return ConsumeList();
312 case T_STRING:
313 return ConsumeString();
314 case T_NUMBER:
315 return ConsumeNumber();
316 case T_BOOL_TRUE:
317 case T_BOOL_FALSE:
318 case T_NULL:
319 return ConsumeLiteral();
320 default:
321 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
322 return std::nullopt;
323 }
324 }
325
ConsumeDictionary()326 std::optional<Value> JSONParser::ConsumeDictionary() {
327 if (ConsumeChar() != '{') {
328 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
329 return std::nullopt;
330 }
331
332 StackMarker depth_check(max_depth_, &stack_depth_);
333 if (depth_check.IsTooDeep()) {
334 ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0);
335 return std::nullopt;
336 }
337
338 std::vector<Value::DictStorage::value_type> dict_storage;
339
340 Token token = GetNextToken();
341 while (token != T_OBJECT_END) {
342 if (token != T_STRING) {
343 ReportError(JSONReader::JSON_UNQUOTED_DICTIONARY_KEY, 1);
344 return std::nullopt;
345 }
346
347 // First consume the key.
348 StringBuilder key;
349 if (!ConsumeStringRaw(&key)) {
350 return std::nullopt;
351 }
352
353 // Read the separator.
354 token = GetNextToken();
355 if (token != T_OBJECT_PAIR_SEPARATOR) {
356 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
357 return std::nullopt;
358 }
359
360 // The next token is the value. Ownership transfers to |dict|.
361 ConsumeChar();
362 std::optional<Value> value = ParseNextToken();
363 if (!value) {
364 // ReportError from deeper level.
365 return std::nullopt;
366 }
367
368 dict_storage.emplace_back(key.DestructiveAsString(),
369 std::make_unique<Value>(std::move(*value)));
370
371 token = GetNextToken();
372 if (token == T_LIST_SEPARATOR) {
373 ConsumeChar();
374 token = GetNextToken();
375 if (token == T_OBJECT_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) {
376 ReportError(JSONReader::JSON_TRAILING_COMMA, 1);
377 return std::nullopt;
378 }
379 } else if (token != T_OBJECT_END) {
380 ReportError(JSONReader::JSON_SYNTAX_ERROR, 0);
381 return std::nullopt;
382 }
383 }
384
385 ConsumeChar(); // Closing '}'.
386
387 return Value(Value::DictStorage(std::move(dict_storage), KEEP_LAST_OF_DUPES));
388 }
389
ConsumeList()390 std::optional<Value> JSONParser::ConsumeList() {
391 if (ConsumeChar() != '[') {
392 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
393 return std::nullopt;
394 }
395
396 StackMarker depth_check(max_depth_, &stack_depth_);
397 if (depth_check.IsTooDeep()) {
398 ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0);
399 return std::nullopt;
400 }
401
402 Value::ListStorage list_storage;
403
404 Token token = GetNextToken();
405 while (token != T_ARRAY_END) {
406 std::optional<Value> item = ParseToken(token);
407 if (!item) {
408 // ReportError from deeper level.
409 return std::nullopt;
410 }
411
412 list_storage.push_back(std::move(*item));
413
414 token = GetNextToken();
415 if (token == T_LIST_SEPARATOR) {
416 ConsumeChar();
417 token = GetNextToken();
418 if (token == T_ARRAY_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) {
419 ReportError(JSONReader::JSON_TRAILING_COMMA, 1);
420 return std::nullopt;
421 }
422 } else if (token != T_ARRAY_END) {
423 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
424 return std::nullopt;
425 }
426 }
427
428 ConsumeChar(); // Closing ']'.
429
430 return Value(std::move(list_storage));
431 }
432
ConsumeString()433 std::optional<Value> JSONParser::ConsumeString() {
434 StringBuilder string;
435 if (!ConsumeStringRaw(&string))
436 return std::nullopt;
437
438 return Value(string.DestructiveAsString());
439 }
440
// Parses a quoted JSON string starting at the opening '"' and moves the
// decoded contents into |out|. Returns true on success. On failure an error
// is reported via ReportError(), false is returned and |out| is not modified.
bool JSONParser::ConsumeStringRaw(StringBuilder* out) {
  if (ConsumeChar() != '"') {
    ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
    return false;
  }

  // StringBuilder references the input in place for as long as it only sees
  // plain ASCII characters; an escape sequence or a non-ASCII character makes
  // it switch to an owned std::string copy.
  StringBuilder string(pos());

  while (PeekChar()) {
    uint32_t next_char = 0;
    // NOTE(review): ReadUnicodeCharacter() presumably leaves |index_| on the
    // last byte of the character it decoded, which is why each branch below
    // still calls ConsumeChar() -- confirm against its declaration.
    if (!ReadUnicodeCharacter(input_.data(),
                              static_cast<int32_t>(input_.length()), &index_,
                              &next_char) ||
        !IsValidCharacter(next_char)) {
      // Undecodable or invalid input is either a hard error or, when the
      // caller opted in, replaced by U+FFFD.
      if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) {
        ReportError(JSONReader::JSON_UNSUPPORTED_ENCODING, 1);
        return false;
      }
      ConsumeChar();
      string.Append(kUnicodeReplacementPoint);
      continue;
    }

    if (next_char == '"') {
      // Closing quote: consume it and hand the result to the caller.
      ConsumeChar();
      *out = std::move(string);
      return true;
    } else if (next_char != '\\') {
      // If this character is not an escape sequence...
      ConsumeChar();
      string.Append(next_char);
    } else {
      // And if it is an escape sequence, the input string will be adjusted
      // (either by combining the two characters of an encoded escape sequence,
      // or with a UTF conversion), so using std::string_view isn't possible --
      // force a conversion.
      string.Convert();

      // Read past the escape '\' and ensure there's a character following.
      std::optional<std::string_view> escape_sequence = ConsumeChars(2);
      if (!escape_sequence) {
        ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
        return false;
      }

      switch ((*escape_sequence)[1]) {
        // Allowed escape sequences:
        case 'x': {  // UTF-8 sequence.
          // UTF-8 \x escape sequences are not allowed in the spec, but they
          // are supported here for backwards-compatibility with the old
          // parser.
          escape_sequence = ConsumeChars(2);
          if (!escape_sequence) {
            ReportError(JSONReader::JSON_INVALID_ESCAPE, -2);
            return false;
          }

          int hex_digit = 0;
          if (!HexStringToInt(*escape_sequence, &hex_digit) ||
              !IsValidCharacter(hex_digit)) {
            ReportError(JSONReader::JSON_INVALID_ESCAPE, -2);
            return false;
          }

          string.Append(hex_digit);
          break;
        }
        case 'u': {  // UTF-16 sequence.
          // UTF units are of the form \uXXXX.
          uint32_t code_point;
          if (!DecodeUTF16(&code_point)) {
            ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
            return false;
          }
          string.Append(code_point);
          break;
        }
        case '"':
          string.Append('"');
          break;
        case '\\':
          string.Append('\\');
          break;
        case '/':
          string.Append('/');
          break;
        case 'b':
          string.Append('\b');
          break;
        case 'f':
          string.Append('\f');
          break;
        case 'n':
          string.Append('\n');
          break;
        case 'r':
          string.Append('\r');
          break;
        case 't':
          string.Append('\t');
          break;
        case 'v':  // Not listed as valid escape sequence in the RFC.
          string.Append('\v');
          break;
        // All other escape sequences are illegal.
        default:
          ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
          return false;
      }
    }
  }

  // The input ended before the closing quote was found.
  ReportError(JSONReader::JSON_SYNTAX_ERROR, 0);
  return false;
}
558
559 // Entry is at the first X in \uXXXX.
DecodeUTF16(uint32_t * out_code_point)560 bool JSONParser::DecodeUTF16(uint32_t* out_code_point) {
561 std::optional<std::string_view> escape_sequence = ConsumeChars(4);
562 if (!escape_sequence)
563 return false;
564
565 // Consume the UTF-16 code unit, which may be a high surrogate.
566 int code_unit16_high = 0;
567 if (!HexStringToInt(*escape_sequence, &code_unit16_high))
568 return false;
569
570 // If this is a high surrogate, consume the next code unit to get the
571 // low surrogate.
572 if (CBU16_IS_SURROGATE(code_unit16_high)) {
573 // Make sure this is the high surrogate. If not, it's an encoding
574 // error.
575 if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high))
576 return false;
577
578 // Make sure that the token has more characters to consume the
579 // lower surrogate.
580 if (!ConsumeIfMatch("\\u"))
581 return false;
582
583 escape_sequence = ConsumeChars(4);
584 if (!escape_sequence)
585 return false;
586
587 int code_unit16_low = 0;
588 if (!HexStringToInt(*escape_sequence, &code_unit16_low))
589 return false;
590
591 if (!CBU16_IS_TRAIL(code_unit16_low))
592 return false;
593
594 uint32_t code_point =
595 CBU16_GET_SUPPLEMENTARY(code_unit16_high, code_unit16_low);
596 if (!IsValidCharacter(code_point))
597 return false;
598
599 *out_code_point = code_point;
600 } else {
601 // Not a surrogate.
602 DCHECK(CBU16_IS_SINGLE(code_unit16_high));
603 if (!IsValidCharacter(code_unit16_high)) {
604 if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) {
605 return false;
606 }
607 *out_code_point = kUnicodeReplacementPoint;
608 return true;
609 }
610
611 *out_code_point = code_unit16_high;
612 }
613
614 return true;
615 }
616
ConsumeNumber()617 std::optional<Value> JSONParser::ConsumeNumber() {
618 const char* num_start = pos();
619 const int start_index = index_;
620 int end_index = start_index;
621
622 if (PeekChar() == '-')
623 ConsumeChar();
624
625 if (!ReadInt(false)) {
626 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
627 return std::nullopt;
628 }
629 end_index = index_;
630
631 // The optional fraction part.
632 if (PeekChar() == '.') {
633 ConsumeChar();
634 if (!ReadInt(true)) {
635 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
636 return std::nullopt;
637 }
638 end_index = index_;
639 }
640
641 // Optional exponent part.
642 std::optional<char> c = PeekChar();
643 if (c == 'e' || c == 'E') {
644 ConsumeChar();
645 if (PeekChar() == '-' || PeekChar() == '+') {
646 ConsumeChar();
647 }
648 if (!ReadInt(true)) {
649 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
650 return std::nullopt;
651 }
652 end_index = index_;
653 }
654
655 // ReadInt is greedy because numbers have no easily detectable sentinel,
656 // so save off where the parser should be on exit (see Consume invariant at
657 // the top of the header), then make sure the next token is one which is
658 // valid.
659 int exit_index = index_;
660
661 switch (GetNextToken()) {
662 case T_OBJECT_END:
663 case T_ARRAY_END:
664 case T_LIST_SEPARATOR:
665 case T_END_OF_INPUT:
666 break;
667 default:
668 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
669 return std::nullopt;
670 }
671
672 index_ = exit_index;
673
674 std::string_view num_string(num_start, end_index - start_index);
675
676 int num_int;
677 if (StringToInt(num_string, &num_int))
678 return Value(num_int);
679
680 return std::nullopt;
681 }
682
ReadInt(bool allow_leading_zeros)683 bool JSONParser::ReadInt(bool allow_leading_zeros) {
684 size_t len = 0;
685 char first = 0;
686
687 while (std::optional<char> c = PeekChar()) {
688 if (!IsAsciiDigit(c))
689 break;
690
691 if (len == 0)
692 first = *c;
693
694 ++len;
695 ConsumeChar();
696 }
697
698 if (len == 0)
699 return false;
700
701 if (!allow_leading_zeros && len > 1 && first == '0')
702 return false;
703
704 return true;
705 }
706
ConsumeLiteral()707 std::optional<Value> JSONParser::ConsumeLiteral() {
708 if (ConsumeIfMatch("true")) {
709 return Value(true);
710 } else if (ConsumeIfMatch("false")) {
711 return Value(false);
712 } else if (ConsumeIfMatch("null")) {
713 return Value(Value::Type::NONE);
714 } else {
715 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
716 return std::nullopt;
717 }
718 }
719
ConsumeIfMatch(std::string_view match)720 bool JSONParser::ConsumeIfMatch(std::string_view match) {
721 if (match == PeekChars(match.size())) {
722 ConsumeChars(match.size());
723 return true;
724 }
725 return false;
726 }
727
ReportError(JSONReader::JsonParseError code,int column_adjust)728 void JSONParser::ReportError(JSONReader::JsonParseError code,
729 int column_adjust) {
730 error_code_ = code;
731 error_line_ = line_number_;
732 error_column_ = index_ - index_last_line_ + column_adjust;
733 }
734
735 // static
FormatErrorMessage(int line,int column,const std::string & description)736 std::string JSONParser::FormatErrorMessage(int line,
737 int column,
738 const std::string& description) {
739 if (line || column) {
740 return StringPrintf("Line: %i, column: %i, %s", line, column,
741 description.c_str());
742 }
743 return description;
744 }
745
746 } // namespace internal
747 } // namespace base
748