1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/json/json_parser.h"
6
7 #include <cmath>
8 #include <utility>
9 #include <vector>
10
11 #include "base/logging.h"
12 #include "base/macros.h"
13 #include "base/numerics/safe_conversions.h"
14 #include "base/strings/string_number_conversions.h"
15 #include "base/strings/string_piece.h"
16 #include "base/strings/string_util.h"
17 #include "base/strings/stringprintf.h"
18 #include "base/strings/utf_string_conversion_utils.h"
19 #include "base/strings/utf_string_conversions.h"
20 #include "base/third_party/icu/icu_utf.h"
21 #include "base/values.h"
22
23 namespace base {
24 namespace internal {
25
26 namespace {
27
// First code point that lies outside the 7-bit ASCII range. Code points
// below this value occupy exactly one byte in UTF-8.
constexpr int32_t kExtendedASCIIStart = 0x80;
29
30 // Simple class that checks for maximum recursion/"stack overflow."
31 class StackMarker {
32 public:
StackMarker(int max_depth,int * depth)33 StackMarker(int max_depth, int* depth)
34 : max_depth_(max_depth), depth_(depth) {
35 ++(*depth_);
36 DCHECK_LE(*depth_, max_depth_);
37 }
~StackMarker()38 ~StackMarker() {
39 --(*depth_);
40 }
41
IsTooDeep() const42 bool IsTooDeep() const { return *depth_ >= max_depth_; }
43
44 private:
45 const int max_depth_;
46 int* const depth_;
47
48 DISALLOW_COPY_AND_ASSIGN(StackMarker);
49 };
50
// U+FFFD REPLACEMENT CHARACTER, as a code point.
constexpr uint32_t kUnicodeReplacementPoint{0xFFFD};
52
53 } // namespace
54
// UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER): the three bytes
// EF BF BD followed by the terminating NUL.
const char kUnicodeReplacementString[] = "\xEF\xBF\xBD";
57
JSONParser(int options,int max_depth)58 JSONParser::JSONParser(int options, int max_depth)
59 : options_(options),
60 max_depth_(max_depth),
61 index_(0),
62 stack_depth_(0),
63 line_number_(0),
64 index_last_line_(0),
65 error_code_(JSONReader::JSON_NO_ERROR),
66 error_line_(0),
67 error_column_(0) {
68 CHECK_LE(max_depth, JSONReader::kStackMaxDepth);
69 }
70
71 JSONParser::~JSONParser() = default;
72
Parse(StringPiece input)73 Optional<Value> JSONParser::Parse(StringPiece input) {
74 input_ = input;
75 index_ = 0;
76 line_number_ = 1;
77 index_last_line_ = 0;
78
79 error_code_ = JSONReader::JSON_NO_ERROR;
80 error_line_ = 0;
81 error_column_ = 0;
82
83 // ICU and ReadUnicodeCharacter() use int32_t for lengths, so ensure
84 // that the index_ will not overflow when parsing.
85 if (!base::IsValueInRangeForNumericType<int32_t>(input.length())) {
86 ReportError(JSONReader::JSON_TOO_LARGE, 0);
87 return nullopt;
88 }
89
90 // When the input JSON string starts with a UTF-8 Byte-Order-Mark,
91 // advance the start position to avoid the ParseNextToken function mis-
92 // treating a Unicode BOM as an invalid character and returning NULL.
93 ConsumeIfMatch("\xEF\xBB\xBF");
94
95 // Parse the first and any nested tokens.
96 Optional<Value> root(ParseNextToken());
97 if (!root)
98 return nullopt;
99
100 // Make sure the input stream is at an end.
101 if (GetNextToken() != T_END_OF_INPUT) {
102 ReportError(JSONReader::JSON_UNEXPECTED_DATA_AFTER_ROOT, 1);
103 return nullopt;
104 }
105
106 return root;
107 }
108
error_code() const109 JSONReader::JsonParseError JSONParser::error_code() const {
110 return error_code_;
111 }
112
GetErrorMessage() const113 std::string JSONParser::GetErrorMessage() const {
114 return FormatErrorMessage(error_line_, error_column_,
115 JSONReader::ErrorCodeToString(error_code_));
116 }
117
error_line() const118 int JSONParser::error_line() const {
119 return error_line_;
120 }
121
error_column() const122 int JSONParser::error_column() const {
123 return error_column_;
124 }
125
126 // StringBuilder ///////////////////////////////////////////////////////////////
127
StringBuilder()128 JSONParser::StringBuilder::StringBuilder() : StringBuilder(nullptr) {}
129
StringBuilder(const char * pos)130 JSONParser::StringBuilder::StringBuilder(const char* pos)
131 : pos_(pos), length_(0) {}
132
133 JSONParser::StringBuilder::~StringBuilder() = default;
134
135 JSONParser::StringBuilder& JSONParser::StringBuilder::operator=(
136 StringBuilder&& other) = default;
137
Append(uint32_t point)138 void JSONParser::StringBuilder::Append(uint32_t point) {
139 DCHECK(IsValidCharacter(point));
140
141 if (point < kExtendedASCIIStart && !string_) {
142 DCHECK_EQ(static_cast<char>(point), pos_[length_]);
143 ++length_;
144 } else {
145 Convert();
146 if (UNLIKELY(point == kUnicodeReplacementPoint)) {
147 string_->append(kUnicodeReplacementString);
148 } else {
149 WriteUnicodeCharacter(point, &*string_);
150 }
151 }
152 }
153
Convert()154 void JSONParser::StringBuilder::Convert() {
155 if (string_)
156 return;
157 string_.emplace(pos_, length_);
158 }
159
DestructiveAsString()160 std::string JSONParser::StringBuilder::DestructiveAsString() {
161 if (string_)
162 return std::move(*string_);
163 return std::string(pos_, length_);
164 }
165
166 // JSONParser private //////////////////////////////////////////////////////////
167
PeekChars(int count)168 Optional<StringPiece> JSONParser::PeekChars(int count) {
169 if (static_cast<size_t>(index_) + count > input_.length())
170 return nullopt;
171 // Using StringPiece::substr() is significantly slower (according to
172 // base_perftests) than constructing a substring manually.
173 return StringPiece(input_.data() + index_, count);
174 }
175
PeekChar()176 Optional<char> JSONParser::PeekChar() {
177 Optional<StringPiece> chars = PeekChars(1);
178 if (chars)
179 return (*chars)[0];
180 return nullopt;
181 }
182
ConsumeChars(int count)183 Optional<StringPiece> JSONParser::ConsumeChars(int count) {
184 Optional<StringPiece> chars = PeekChars(count);
185 if (chars)
186 index_ += count;
187 return chars;
188 }
189
ConsumeChar()190 Optional<char> JSONParser::ConsumeChar() {
191 Optional<StringPiece> chars = ConsumeChars(1);
192 if (chars)
193 return (*chars)[0];
194 return nullopt;
195 }
196
pos()197 const char* JSONParser::pos() {
198 CHECK_LE(static_cast<size_t>(index_), input_.length());
199 return input_.data() + index_;
200 }
201
GetNextToken()202 JSONParser::Token JSONParser::GetNextToken() {
203 EatWhitespaceAndComments();
204
205 Optional<char> c = PeekChar();
206 if (!c)
207 return T_END_OF_INPUT;
208
209 switch (*c) {
210 case '{':
211 return T_OBJECT_BEGIN;
212 case '}':
213 return T_OBJECT_END;
214 case '[':
215 return T_ARRAY_BEGIN;
216 case ']':
217 return T_ARRAY_END;
218 case '"':
219 return T_STRING;
220 case '0':
221 case '1':
222 case '2':
223 case '3':
224 case '4':
225 case '5':
226 case '6':
227 case '7':
228 case '8':
229 case '9':
230 case '-':
231 return T_NUMBER;
232 case 't':
233 return T_BOOL_TRUE;
234 case 'f':
235 return T_BOOL_FALSE;
236 case 'n':
237 return T_NULL;
238 case ',':
239 return T_LIST_SEPARATOR;
240 case ':':
241 return T_OBJECT_PAIR_SEPARATOR;
242 default:
243 return T_INVALID_TOKEN;
244 }
245 }
246
EatWhitespaceAndComments()247 void JSONParser::EatWhitespaceAndComments() {
248 while (Optional<char> c = PeekChar()) {
249 switch (*c) {
250 case '\r':
251 case '\n':
252 index_last_line_ = index_;
253 // Don't increment line_number_ twice for "\r\n".
254 if (!(c == '\n' && index_ > 0 && input_[index_ - 1] == '\r')) {
255 ++line_number_;
256 }
257 FALLTHROUGH;
258 case ' ':
259 case '\t':
260 ConsumeChar();
261 break;
262 case '/':
263 if (!EatComment())
264 return;
265 break;
266 default:
267 return;
268 }
269 }
270 }
271
EatComment()272 bool JSONParser::EatComment() {
273 Optional<StringPiece> comment_start = ConsumeChars(2);
274 if (!comment_start)
275 return false;
276
277 if (comment_start == "//") {
278 // Single line comment, read to newline.
279 while (Optional<char> c = PeekChar()) {
280 if (c == '\n' || c == '\r')
281 return true;
282 ConsumeChar();
283 }
284 } else if (comment_start == "/*") {
285 char previous_char = '\0';
286 // Block comment, read until end marker.
287 while (Optional<char> c = PeekChar()) {
288 if (previous_char == '*' && c == '/') {
289 // EatWhitespaceAndComments will inspect pos(), which will still be on
290 // the last / of the comment, so advance once more (which may also be
291 // end of input).
292 ConsumeChar();
293 return true;
294 }
295 previous_char = *ConsumeChar();
296 }
297
298 // If the comment is unterminated, GetNextToken will report T_END_OF_INPUT.
299 }
300
301 return false;
302 }
303
ParseNextToken()304 Optional<Value> JSONParser::ParseNextToken() {
305 return ParseToken(GetNextToken());
306 }
307
ParseToken(Token token)308 Optional<Value> JSONParser::ParseToken(Token token) {
309 switch (token) {
310 case T_OBJECT_BEGIN:
311 return ConsumeDictionary();
312 case T_ARRAY_BEGIN:
313 return ConsumeList();
314 case T_STRING:
315 return ConsumeString();
316 case T_NUMBER:
317 return ConsumeNumber();
318 case T_BOOL_TRUE:
319 case T_BOOL_FALSE:
320 case T_NULL:
321 return ConsumeLiteral();
322 default:
323 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
324 return nullopt;
325 }
326 }
327
ConsumeDictionary()328 Optional<Value> JSONParser::ConsumeDictionary() {
329 if (ConsumeChar() != '{') {
330 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
331 return nullopt;
332 }
333
334 StackMarker depth_check(max_depth_, &stack_depth_);
335 if (depth_check.IsTooDeep()) {
336 ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0);
337 return nullopt;
338 }
339
340 std::vector<Value::DictStorage::value_type> dict_storage;
341
342 Token token = GetNextToken();
343 while (token != T_OBJECT_END) {
344 if (token != T_STRING) {
345 ReportError(JSONReader::JSON_UNQUOTED_DICTIONARY_KEY, 1);
346 return nullopt;
347 }
348
349 // First consume the key.
350 StringBuilder key;
351 if (!ConsumeStringRaw(&key)) {
352 return nullopt;
353 }
354
355 // Read the separator.
356 token = GetNextToken();
357 if (token != T_OBJECT_PAIR_SEPARATOR) {
358 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
359 return nullopt;
360 }
361
362 // The next token is the value. Ownership transfers to |dict|.
363 ConsumeChar();
364 Optional<Value> value = ParseNextToken();
365 if (!value) {
366 // ReportError from deeper level.
367 return nullopt;
368 }
369
370 dict_storage.emplace_back(key.DestructiveAsString(),
371 std::make_unique<Value>(std::move(*value)));
372
373 token = GetNextToken();
374 if (token == T_LIST_SEPARATOR) {
375 ConsumeChar();
376 token = GetNextToken();
377 if (token == T_OBJECT_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) {
378 ReportError(JSONReader::JSON_TRAILING_COMMA, 1);
379 return nullopt;
380 }
381 } else if (token != T_OBJECT_END) {
382 ReportError(JSONReader::JSON_SYNTAX_ERROR, 0);
383 return nullopt;
384 }
385 }
386
387 ConsumeChar(); // Closing '}'.
388
389 return Value(Value::DictStorage(std::move(dict_storage), KEEP_LAST_OF_DUPES));
390 }
391
ConsumeList()392 Optional<Value> JSONParser::ConsumeList() {
393 if (ConsumeChar() != '[') {
394 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
395 return nullopt;
396 }
397
398 StackMarker depth_check(max_depth_, &stack_depth_);
399 if (depth_check.IsTooDeep()) {
400 ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0);
401 return nullopt;
402 }
403
404 Value::ListStorage list_storage;
405
406 Token token = GetNextToken();
407 while (token != T_ARRAY_END) {
408 Optional<Value> item = ParseToken(token);
409 if (!item) {
410 // ReportError from deeper level.
411 return nullopt;
412 }
413
414 list_storage.push_back(std::move(*item));
415
416 token = GetNextToken();
417 if (token == T_LIST_SEPARATOR) {
418 ConsumeChar();
419 token = GetNextToken();
420 if (token == T_ARRAY_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) {
421 ReportError(JSONReader::JSON_TRAILING_COMMA, 1);
422 return nullopt;
423 }
424 } else if (token != T_ARRAY_END) {
425 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
426 return nullopt;
427 }
428 }
429
430 ConsumeChar(); // Closing ']'.
431
432 return Value(std::move(list_storage));
433 }
434
ConsumeString()435 Optional<Value> JSONParser::ConsumeString() {
436 StringBuilder string;
437 if (!ConsumeStringRaw(&string))
438 return nullopt;
439
440 return Value(string.DestructiveAsString());
441 }
442
ConsumeStringRaw(StringBuilder * out)443 bool JSONParser::ConsumeStringRaw(StringBuilder* out) {
444 if (ConsumeChar() != '"') {
445 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
446 return false;
447 }
448
449 // StringBuilder will internally build a StringPiece unless a UTF-16
450 // conversion occurs, at which point it will perform a copy into a
451 // std::string.
452 StringBuilder string(pos());
453
454 while (PeekChar()) {
455 uint32_t next_char = 0;
456 if (!ReadUnicodeCharacter(input_.data(),
457 static_cast<int32_t>(input_.length()),
458 &index_,
459 &next_char) ||
460 !IsValidCharacter(next_char)) {
461 if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) {
462 ReportError(JSONReader::JSON_UNSUPPORTED_ENCODING, 1);
463 return false;
464 }
465 ConsumeChar();
466 string.Append(kUnicodeReplacementPoint);
467 continue;
468 }
469
470 if (next_char == '"') {
471 ConsumeChar();
472 *out = std::move(string);
473 return true;
474 } else if (next_char != '\\') {
475 // If this character is not an escape sequence...
476 ConsumeChar();
477 string.Append(next_char);
478 } else {
479 // And if it is an escape sequence, the input string will be adjusted
480 // (either by combining the two characters of an encoded escape sequence,
481 // or with a UTF conversion), so using StringPiece isn't possible -- force
482 // a conversion.
483 string.Convert();
484
485 // Read past the escape '\' and ensure there's a character following.
486 Optional<StringPiece> escape_sequence = ConsumeChars(2);
487 if (!escape_sequence) {
488 ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
489 return false;
490 }
491
492 switch ((*escape_sequence)[1]) {
493 // Allowed esape sequences:
494 case 'x': { // UTF-8 sequence.
495 // UTF-8 \x escape sequences are not allowed in the spec, but they
496 // are supported here for backwards-compatiblity with the old parser.
497 escape_sequence = ConsumeChars(2);
498 if (!escape_sequence) {
499 ReportError(JSONReader::JSON_INVALID_ESCAPE, -2);
500 return false;
501 }
502
503 int hex_digit = 0;
504 if (!HexStringToInt(*escape_sequence, &hex_digit) ||
505 !IsValidCharacter(hex_digit)) {
506 ReportError(JSONReader::JSON_INVALID_ESCAPE, -2);
507 return false;
508 }
509
510 string.Append(hex_digit);
511 break;
512 }
513 case 'u': { // UTF-16 sequence.
514 // UTF units are of the form \uXXXX.
515 uint32_t code_point;
516 if (!DecodeUTF16(&code_point)) {
517 ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
518 return false;
519 }
520 string.Append(code_point);
521 break;
522 }
523 case '"':
524 string.Append('"');
525 break;
526 case '\\':
527 string.Append('\\');
528 break;
529 case '/':
530 string.Append('/');
531 break;
532 case 'b':
533 string.Append('\b');
534 break;
535 case 'f':
536 string.Append('\f');
537 break;
538 case 'n':
539 string.Append('\n');
540 break;
541 case 'r':
542 string.Append('\r');
543 break;
544 case 't':
545 string.Append('\t');
546 break;
547 case 'v': // Not listed as valid escape sequence in the RFC.
548 string.Append('\v');
549 break;
550 // All other escape squences are illegal.
551 default:
552 ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
553 return false;
554 }
555 }
556 }
557
558 ReportError(JSONReader::JSON_SYNTAX_ERROR, 0);
559 return false;
560 }
561
562 // Entry is at the first X in \uXXXX.
DecodeUTF16(uint32_t * out_code_point)563 bool JSONParser::DecodeUTF16(uint32_t* out_code_point) {
564 Optional<StringPiece> escape_sequence = ConsumeChars(4);
565 if (!escape_sequence)
566 return false;
567
568 // Consume the UTF-16 code unit, which may be a high surrogate.
569 int code_unit16_high = 0;
570 if (!HexStringToInt(*escape_sequence, &code_unit16_high))
571 return false;
572
573 // If this is a high surrogate, consume the next code unit to get the
574 // low surrogate.
575 if (CBU16_IS_SURROGATE(code_unit16_high)) {
576 // Make sure this is the high surrogate. If not, it's an encoding
577 // error.
578 if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high))
579 return false;
580
581 // Make sure that the token has more characters to consume the
582 // lower surrogate.
583 if (!ConsumeIfMatch("\\u"))
584 return false;
585
586 escape_sequence = ConsumeChars(4);
587 if (!escape_sequence)
588 return false;
589
590 int code_unit16_low = 0;
591 if (!HexStringToInt(*escape_sequence, &code_unit16_low))
592 return false;
593
594 if (!CBU16_IS_TRAIL(code_unit16_low))
595 return false;
596
597 uint32_t code_point =
598 CBU16_GET_SUPPLEMENTARY(code_unit16_high, code_unit16_low);
599 if (!IsValidCharacter(code_point))
600 return false;
601
602 *out_code_point = code_point;
603 } else {
604 // Not a surrogate.
605 DCHECK(CBU16_IS_SINGLE(code_unit16_high));
606 if (!IsValidCharacter(code_unit16_high)) {
607 if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) {
608 return false;
609 }
610 *out_code_point = kUnicodeReplacementPoint;
611 return true;
612 }
613
614 *out_code_point = code_unit16_high;
615 }
616
617 return true;
618 }
619
ConsumeNumber()620 Optional<Value> JSONParser::ConsumeNumber() {
621 const char* num_start = pos();
622 const int start_index = index_;
623 int end_index = start_index;
624
625 if (PeekChar() == '-')
626 ConsumeChar();
627
628 if (!ReadInt(false)) {
629 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
630 return nullopt;
631 }
632 end_index = index_;
633
634 // The optional fraction part.
635 if (PeekChar() == '.') {
636 ConsumeChar();
637 if (!ReadInt(true)) {
638 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
639 return nullopt;
640 }
641 end_index = index_;
642 }
643
644 // Optional exponent part.
645 Optional<char> c = PeekChar();
646 if (c == 'e' || c == 'E') {
647 ConsumeChar();
648 if (PeekChar() == '-' || PeekChar() == '+') {
649 ConsumeChar();
650 }
651 if (!ReadInt(true)) {
652 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
653 return nullopt;
654 }
655 end_index = index_;
656 }
657
658 // ReadInt is greedy because numbers have no easily detectable sentinel,
659 // so save off where the parser should be on exit (see Consume invariant at
660 // the top of the header), then make sure the next token is one which is
661 // valid.
662 int exit_index = index_;
663
664 switch (GetNextToken()) {
665 case T_OBJECT_END:
666 case T_ARRAY_END:
667 case T_LIST_SEPARATOR:
668 case T_END_OF_INPUT:
669 break;
670 default:
671 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
672 return nullopt;
673 }
674
675 index_ = exit_index;
676
677 StringPiece num_string(num_start, end_index - start_index);
678
679 int num_int;
680 if (StringToInt(num_string, &num_int))
681 return Value(num_int);
682
683 double num_double;
684 if (StringToDouble(num_string.as_string(), &num_double) &&
685 std::isfinite(num_double)) {
686 return Value(num_double);
687 }
688
689 return nullopt;
690 }
691
ReadInt(bool allow_leading_zeros)692 bool JSONParser::ReadInt(bool allow_leading_zeros) {
693 size_t len = 0;
694 char first = 0;
695
696 while (Optional<char> c = PeekChar()) {
697 if (!IsAsciiDigit(c))
698 break;
699
700 if (len == 0)
701 first = *c;
702
703 ++len;
704 ConsumeChar();
705 }
706
707 if (len == 0)
708 return false;
709
710 if (!allow_leading_zeros && len > 1 && first == '0')
711 return false;
712
713 return true;
714 }
715
ConsumeLiteral()716 Optional<Value> JSONParser::ConsumeLiteral() {
717 if (ConsumeIfMatch("true")) {
718 return Value(true);
719 } else if (ConsumeIfMatch("false")) {
720 return Value(false);
721 } else if (ConsumeIfMatch("null")) {
722 return Value(Value::Type::NONE);
723 } else {
724 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
725 return nullopt;
726 }
727 }
728
ConsumeIfMatch(StringPiece match)729 bool JSONParser::ConsumeIfMatch(StringPiece match) {
730 if (match == PeekChars(match.size())) {
731 ConsumeChars(match.size());
732 return true;
733 }
734 return false;
735 }
736
ReportError(JSONReader::JsonParseError code,int column_adjust)737 void JSONParser::ReportError(JSONReader::JsonParseError code,
738 int column_adjust) {
739 error_code_ = code;
740 error_line_ = line_number_;
741 error_column_ = index_ - index_last_line_ + column_adjust;
742 }
743
744 // static
FormatErrorMessage(int line,int column,const std::string & description)745 std::string JSONParser::FormatErrorMessage(int line, int column,
746 const std::string& description) {
747 if (line || column) {
748 return StringPrintf("Line: %i, column: %i, %s",
749 line, column, description.c_str());
750 }
751 return description;
752 }
753
754 } // namespace internal
755 } // namespace base
756