1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/json/json_reader.h"
6
7 #include "base/float_util.h"
8 #include "base/logging.h"
9 #include "base/memory/scoped_ptr.h"
10 #include "base/string_number_conversions.h"
11 #include "base/string_util.h"
12 #include "base/utf_string_conversions.h"
13 #include "base/values.h"
14
15 namespace base {
16
17 static const JSONReader::Token kInvalidToken(JSONReader::Token::INVALID_TOKEN,
18 0, 0);
19 static const int kStackLimit = 100;
20
21 namespace {
22
23 // A helper method for ParseNumberToken. It reads an int from the end of
24 // token. The method returns false if there is no valid integer at the end of
25 // the token.
ReadInt(JSONReader::Token & token,bool can_have_leading_zeros)26 bool ReadInt(JSONReader::Token& token, bool can_have_leading_zeros) {
27 wchar_t first = token.NextChar();
28 int len = 0;
29
30 // Read in more digits
31 wchar_t c = first;
32 while ('\0' != c && '0' <= c && c <= '9') {
33 ++token.length;
34 ++len;
35 c = token.NextChar();
36 }
37 // We need at least 1 digit.
38 if (len == 0)
39 return false;
40
41 if (!can_have_leading_zeros && len > 1 && '0' == first)
42 return false;
43
44 return true;
45 }
46
47 // A helper method for ParseStringToken. It reads |digits| hex digits from the
48 // token. If the sequence if digits is not valid (contains other characters),
49 // the method returns false.
ReadHexDigits(JSONReader::Token & token,int digits)50 bool ReadHexDigits(JSONReader::Token& token, int digits) {
51 for (int i = 1; i <= digits; ++i) {
52 wchar_t c = *(token.begin + token.length + i);
53 if ('\0' == c)
54 return false;
55 if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
56 ('A' <= c && c <= 'F'))) {
57 return false;
58 }
59 }
60
61 token.length += digits;
62 return true;
63 }
64
65 } // anonymous namespace
66
67 const char* JSONReader::kBadRootElementType =
68 "Root value must be an array or object.";
69 const char* JSONReader::kInvalidEscape =
70 "Invalid escape sequence.";
71 const char* JSONReader::kSyntaxError =
72 "Syntax error.";
73 const char* JSONReader::kTrailingComma =
74 "Trailing comma not allowed.";
75 const char* JSONReader::kTooMuchNesting =
76 "Too much nesting.";
77 const char* JSONReader::kUnexpectedDataAfterRoot =
78 "Unexpected data after root element.";
79 const char* JSONReader::kUnsupportedEncoding =
80 "Unsupported encoding. JSON must be UTF-8.";
81 const char* JSONReader::kUnquotedDictionaryKey =
82 "Dictionary keys must be quoted.";
83
JSONReader()84 JSONReader::JSONReader()
85 : start_pos_(NULL), json_pos_(NULL), stack_depth_(0),
86 allow_trailing_comma_(false),
87 error_code_(JSON_NO_ERROR), error_line_(0), error_col_(0) {}
88
89 /* static */
Read(const std::string & json,bool allow_trailing_comma)90 Value* JSONReader::Read(const std::string& json,
91 bool allow_trailing_comma) {
92 return ReadAndReturnError(json, allow_trailing_comma, NULL, NULL);
93 }
94
95 /* static */
ReadAndReturnError(const std::string & json,bool allow_trailing_comma,int * error_code_out,std::string * error_msg_out)96 Value* JSONReader::ReadAndReturnError(const std::string& json,
97 bool allow_trailing_comma,
98 int* error_code_out,
99 std::string* error_msg_out) {
100 JSONReader reader = JSONReader();
101 Value* root = reader.JsonToValue(json, true, allow_trailing_comma);
102 if (root)
103 return root;
104
105 if (error_code_out)
106 *error_code_out = reader.error_code();
107 if (error_msg_out)
108 *error_msg_out = reader.GetErrorMessage();
109
110 return NULL;
111 }
112
113 /* static */
ErrorCodeToString(JsonParseError error_code)114 std::string JSONReader::ErrorCodeToString(JsonParseError error_code) {
115 switch (error_code) {
116 case JSON_NO_ERROR:
117 return std::string();
118 case JSON_BAD_ROOT_ELEMENT_TYPE:
119 return kBadRootElementType;
120 case JSON_INVALID_ESCAPE:
121 return kInvalidEscape;
122 case JSON_SYNTAX_ERROR:
123 return kSyntaxError;
124 case JSON_TRAILING_COMMA:
125 return kTrailingComma;
126 case JSON_TOO_MUCH_NESTING:
127 return kTooMuchNesting;
128 case JSON_UNEXPECTED_DATA_AFTER_ROOT:
129 return kUnexpectedDataAfterRoot;
130 case JSON_UNSUPPORTED_ENCODING:
131 return kUnsupportedEncoding;
132 case JSON_UNQUOTED_DICTIONARY_KEY:
133 return kUnquotedDictionaryKey;
134 default:
135 NOTREACHED();
136 return std::string();
137 }
138 }
139
GetErrorMessage() const140 std::string JSONReader::GetErrorMessage() const {
141 return FormatErrorMessage(error_line_, error_col_,
142 ErrorCodeToString(error_code_));
143 }
144
JsonToValue(const std::string & json,bool check_root,bool allow_trailing_comma)145 Value* JSONReader::JsonToValue(const std::string& json, bool check_root,
146 bool allow_trailing_comma) {
147 // The input must be in UTF-8.
148 if (!IsStringUTF8(json.c_str())) {
149 error_code_ = JSON_UNSUPPORTED_ENCODING;
150 return NULL;
151 }
152
153 // The conversion from UTF8 to wstring removes null bytes for us
154 // (a good thing).
155 std::wstring json_wide(UTF8ToWide(json));
156 start_pos_ = json_wide.c_str();
157
158 // When the input JSON string starts with a UTF-8 Byte-Order-Mark
159 // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode
160 // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from
161 // mis-treating a Unicode BOM as an invalid character and returning NULL,
162 // skip a converted Unicode BOM if it exists.
163 if (!json_wide.empty() && start_pos_[0] == 0xFEFF) {
164 ++start_pos_;
165 }
166
167 json_pos_ = start_pos_;
168 allow_trailing_comma_ = allow_trailing_comma;
169 stack_depth_ = 0;
170 error_code_ = JSON_NO_ERROR;
171
172 scoped_ptr<Value> root(BuildValue(check_root));
173 if (root.get()) {
174 if (ParseToken().type == Token::END_OF_INPUT) {
175 return root.release();
176 } else {
177 SetErrorCode(JSON_UNEXPECTED_DATA_AFTER_ROOT, json_pos_);
178 }
179 }
180
181 // Default to calling errors "syntax errors".
182 if (error_code_ == 0)
183 SetErrorCode(JSON_SYNTAX_ERROR, json_pos_);
184
185 return NULL;
186 }
187
188 /* static */
FormatErrorMessage(int line,int column,const std::string & description)189 std::string JSONReader::FormatErrorMessage(int line, int column,
190 const std::string& description) {
191 if (line || column) {
192 return StringPrintf("Line: %i, column: %i, %s",
193 line, column, description.c_str());
194 }
195 return description;
196 }
197
BuildValue(bool is_root)198 Value* JSONReader::BuildValue(bool is_root) {
199 ++stack_depth_;
200 if (stack_depth_ > kStackLimit) {
201 SetErrorCode(JSON_TOO_MUCH_NESTING, json_pos_);
202 return NULL;
203 }
204
205 Token token = ParseToken();
206 // The root token must be an array or an object.
207 if (is_root && token.type != Token::OBJECT_BEGIN &&
208 token.type != Token::ARRAY_BEGIN) {
209 SetErrorCode(JSON_BAD_ROOT_ELEMENT_TYPE, json_pos_);
210 return NULL;
211 }
212
213 scoped_ptr<Value> node;
214
215 switch (token.type) {
216 case Token::END_OF_INPUT:
217 case Token::INVALID_TOKEN:
218 return NULL;
219
220 case Token::NULL_TOKEN:
221 node.reset(Value::CreateNullValue());
222 break;
223
224 case Token::BOOL_TRUE:
225 node.reset(Value::CreateBooleanValue(true));
226 break;
227
228 case Token::BOOL_FALSE:
229 node.reset(Value::CreateBooleanValue(false));
230 break;
231
232 case Token::NUMBER:
233 node.reset(DecodeNumber(token));
234 if (!node.get())
235 return NULL;
236 break;
237
238 case Token::STRING:
239 node.reset(DecodeString(token));
240 if (!node.get())
241 return NULL;
242 break;
243
244 case Token::ARRAY_BEGIN:
245 {
246 json_pos_ += token.length;
247 token = ParseToken();
248
249 node.reset(new ListValue());
250 while (token.type != Token::ARRAY_END) {
251 Value* array_node = BuildValue(false);
252 if (!array_node)
253 return NULL;
254 static_cast<ListValue*>(node.get())->Append(array_node);
255
256 // After a list value, we expect a comma or the end of the list.
257 token = ParseToken();
258 if (token.type == Token::LIST_SEPARATOR) {
259 json_pos_ += token.length;
260 token = ParseToken();
261 // Trailing commas are invalid according to the JSON RFC, but some
262 // consumers need the parsing leniency, so handle accordingly.
263 if (token.type == Token::ARRAY_END) {
264 if (!allow_trailing_comma_) {
265 SetErrorCode(JSON_TRAILING_COMMA, json_pos_);
266 return NULL;
267 }
268 // Trailing comma OK, stop parsing the Array.
269 break;
270 }
271 } else if (token.type != Token::ARRAY_END) {
272 // Unexpected value after list value. Bail out.
273 return NULL;
274 }
275 }
276 if (token.type != Token::ARRAY_END) {
277 return NULL;
278 }
279 break;
280 }
281
282 case Token::OBJECT_BEGIN:
283 {
284 json_pos_ += token.length;
285 token = ParseToken();
286
287 node.reset(new DictionaryValue);
288 while (token.type != Token::OBJECT_END) {
289 if (token.type != Token::STRING) {
290 SetErrorCode(JSON_UNQUOTED_DICTIONARY_KEY, json_pos_);
291 return NULL;
292 }
293 scoped_ptr<Value> dict_key_value(DecodeString(token));
294 if (!dict_key_value.get())
295 return NULL;
296
297 // Convert the key into a wstring.
298 std::string dict_key;
299 bool success = dict_key_value->GetAsString(&dict_key);
300 DCHECK(success);
301
302 json_pos_ += token.length;
303 token = ParseToken();
304 if (token.type != Token::OBJECT_PAIR_SEPARATOR)
305 return NULL;
306
307 json_pos_ += token.length;
308 token = ParseToken();
309 Value* dict_value = BuildValue(false);
310 if (!dict_value)
311 return NULL;
312 static_cast<DictionaryValue*>(node.get())->SetWithoutPathExpansion(
313 dict_key, dict_value);
314
315 // After a key/value pair, we expect a comma or the end of the
316 // object.
317 token = ParseToken();
318 if (token.type == Token::LIST_SEPARATOR) {
319 json_pos_ += token.length;
320 token = ParseToken();
321 // Trailing commas are invalid according to the JSON RFC, but some
322 // consumers need the parsing leniency, so handle accordingly.
323 if (token.type == Token::OBJECT_END) {
324 if (!allow_trailing_comma_) {
325 SetErrorCode(JSON_TRAILING_COMMA, json_pos_);
326 return NULL;
327 }
328 // Trailing comma OK, stop parsing the Object.
329 break;
330 }
331 } else if (token.type != Token::OBJECT_END) {
332 // Unexpected value after last object value. Bail out.
333 return NULL;
334 }
335 }
336 if (token.type != Token::OBJECT_END)
337 return NULL;
338
339 break;
340 }
341
342 default:
343 // We got a token that's not a value.
344 return NULL;
345 }
346 json_pos_ += token.length;
347
348 --stack_depth_;
349 return node.release();
350 }
351
ParseNumberToken()352 JSONReader::Token JSONReader::ParseNumberToken() {
353 // We just grab the number here. We validate the size in DecodeNumber.
354 // According to RFC4627, a valid number is: [minus] int [frac] [exp]
355 Token token(Token::NUMBER, json_pos_, 0);
356 wchar_t c = *json_pos_;
357 if ('-' == c) {
358 ++token.length;
359 c = token.NextChar();
360 }
361
362 if (!ReadInt(token, false))
363 return kInvalidToken;
364
365 // Optional fraction part
366 c = token.NextChar();
367 if ('.' == c) {
368 ++token.length;
369 if (!ReadInt(token, true))
370 return kInvalidToken;
371 c = token.NextChar();
372 }
373
374 // Optional exponent part
375 if ('e' == c || 'E' == c) {
376 ++token.length;
377 c = token.NextChar();
378 if ('-' == c || '+' == c) {
379 ++token.length;
380 c = token.NextChar();
381 }
382 if (!ReadInt(token, true))
383 return kInvalidToken;
384 }
385
386 return token;
387 }
388
DecodeNumber(const Token & token)389 Value* JSONReader::DecodeNumber(const Token& token) {
390 const std::wstring num_string(token.begin, token.length);
391
392 int num_int;
393 if (StringToInt(WideToUTF8(num_string), &num_int))
394 return Value::CreateIntegerValue(num_int);
395
396 double num_double;
397 if (StringToDouble(WideToUTF8(num_string), &num_double) &&
398 base::IsFinite(num_double))
399 return Value::CreateDoubleValue(num_double);
400
401 return NULL;
402 }
403
ParseStringToken()404 JSONReader::Token JSONReader::ParseStringToken() {
405 Token token(Token::STRING, json_pos_, 1);
406 wchar_t c = token.NextChar();
407 while ('\0' != c) {
408 if ('\\' == c) {
409 ++token.length;
410 c = token.NextChar();
411 // Make sure the escaped char is valid.
412 switch (c) {
413 case 'x':
414 if (!ReadHexDigits(token, 2)) {
415 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);
416 return kInvalidToken;
417 }
418 break;
419 case 'u':
420 if (!ReadHexDigits(token, 4)) {
421 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);
422 return kInvalidToken;
423 }
424 break;
425 case '\\':
426 case '/':
427 case 'b':
428 case 'f':
429 case 'n':
430 case 'r':
431 case 't':
432 case 'v':
433 case '"':
434 break;
435 default:
436 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);
437 return kInvalidToken;
438 }
439 } else if ('"' == c) {
440 ++token.length;
441 return token;
442 }
443 ++token.length;
444 c = token.NextChar();
445 }
446 return kInvalidToken;
447 }
448
DecodeString(const Token & token)449 Value* JSONReader::DecodeString(const Token& token) {
450 std::wstring decoded_str;
451 decoded_str.reserve(token.length - 2);
452
453 for (int i = 1; i < token.length - 1; ++i) {
454 wchar_t c = *(token.begin + i);
455 if ('\\' == c) {
456 ++i;
457 c = *(token.begin + i);
458 switch (c) {
459 case '"':
460 case '/':
461 case '\\':
462 decoded_str.push_back(c);
463 break;
464 case 'b':
465 decoded_str.push_back('\b');
466 break;
467 case 'f':
468 decoded_str.push_back('\f');
469 break;
470 case 'n':
471 decoded_str.push_back('\n');
472 break;
473 case 'r':
474 decoded_str.push_back('\r');
475 break;
476 case 't':
477 decoded_str.push_back('\t');
478 break;
479 case 'v':
480 decoded_str.push_back('\v');
481 break;
482
483 case 'x':
484 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 4) +
485 HexDigitToInt(*(token.begin + i + 2)));
486 i += 2;
487 break;
488 case 'u':
489 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 12 ) +
490 (HexDigitToInt(*(token.begin + i + 2)) << 8) +
491 (HexDigitToInt(*(token.begin + i + 3)) << 4) +
492 HexDigitToInt(*(token.begin + i + 4)));
493 i += 4;
494 break;
495
496 default:
497 // We should only have valid strings at this point. If not,
498 // ParseStringToken didn't do it's job.
499 NOTREACHED();
500 return NULL;
501 }
502 } else {
503 // Not escaped
504 decoded_str.push_back(c);
505 }
506 }
507 return Value::CreateStringValue(WideToUTF16Hack(decoded_str));
508 }
509
ParseToken()510 JSONReader::Token JSONReader::ParseToken() {
511 static const std::wstring kNullString(L"null");
512 static const std::wstring kTrueString(L"true");
513 static const std::wstring kFalseString(L"false");
514
515 EatWhitespaceAndComments();
516
517 Token token(Token::INVALID_TOKEN, 0, 0);
518 switch (*json_pos_) {
519 case '\0':
520 token.type = Token::END_OF_INPUT;
521 break;
522
523 case 'n':
524 if (NextStringMatch(kNullString))
525 token = Token(Token::NULL_TOKEN, json_pos_, 4);
526 break;
527
528 case 't':
529 if (NextStringMatch(kTrueString))
530 token = Token(Token::BOOL_TRUE, json_pos_, 4);
531 break;
532
533 case 'f':
534 if (NextStringMatch(kFalseString))
535 token = Token(Token::BOOL_FALSE, json_pos_, 5);
536 break;
537
538 case '[':
539 token = Token(Token::ARRAY_BEGIN, json_pos_, 1);
540 break;
541
542 case ']':
543 token = Token(Token::ARRAY_END, json_pos_, 1);
544 break;
545
546 case ',':
547 token = Token(Token::LIST_SEPARATOR, json_pos_, 1);
548 break;
549
550 case '{':
551 token = Token(Token::OBJECT_BEGIN, json_pos_, 1);
552 break;
553
554 case '}':
555 token = Token(Token::OBJECT_END, json_pos_, 1);
556 break;
557
558 case ':':
559 token = Token(Token::OBJECT_PAIR_SEPARATOR, json_pos_, 1);
560 break;
561
562 case '0':
563 case '1':
564 case '2':
565 case '3':
566 case '4':
567 case '5':
568 case '6':
569 case '7':
570 case '8':
571 case '9':
572 case '-':
573 token = ParseNumberToken();
574 break;
575
576 case '"':
577 token = ParseStringToken();
578 break;
579 }
580 return token;
581 }
582
EatWhitespaceAndComments()583 void JSONReader::EatWhitespaceAndComments() {
584 while ('\0' != *json_pos_) {
585 switch (*json_pos_) {
586 case ' ':
587 case '\n':
588 case '\r':
589 case '\t':
590 ++json_pos_;
591 break;
592 case '/':
593 // TODO(tc): This isn't in the RFC so it should be a parser flag.
594 if (!EatComment())
595 return;
596 break;
597 default:
598 // Not a whitespace char, just exit.
599 return;
600 }
601 }
602 }
603
EatComment()604 bool JSONReader::EatComment() {
605 if ('/' != *json_pos_)
606 return false;
607
608 wchar_t next_char = *(json_pos_ + 1);
609 if ('/' == next_char) {
610 // Line comment, read until \n or \r
611 json_pos_ += 2;
612 while ('\0' != *json_pos_) {
613 switch (*json_pos_) {
614 case '\n':
615 case '\r':
616 ++json_pos_;
617 return true;
618 default:
619 ++json_pos_;
620 }
621 }
622 } else if ('*' == next_char) {
623 // Block comment, read until */
624 json_pos_ += 2;
625 while ('\0' != *json_pos_) {
626 if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) {
627 json_pos_ += 2;
628 return true;
629 }
630 ++json_pos_;
631 }
632 } else {
633 return false;
634 }
635 return true;
636 }
637
NextStringMatch(const std::wstring & str)638 bool JSONReader::NextStringMatch(const std::wstring& str) {
639 for (size_t i = 0; i < str.length(); ++i) {
640 if ('\0' == *json_pos_)
641 return false;
642 if (*(json_pos_ + i) != str[i])
643 return false;
644 }
645 return true;
646 }
647
SetErrorCode(JsonParseError error,const wchar_t * error_pos)648 void JSONReader::SetErrorCode(JsonParseError error,
649 const wchar_t* error_pos) {
650 int line_number = 1;
651 int column_number = 1;
652
653 // Figure out the line and column the error occured at.
654 for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) {
655 if (*pos == '\0') {
656 NOTREACHED();
657 return;
658 }
659
660 if (*pos == '\n') {
661 ++line_number;
662 column_number = 1;
663 } else {
664 ++column_number;
665 }
666 }
667
668 error_line_ = line_number;
669 error_col_ = column_number;
670 error_code_ = error;
671 }
672
673 } // namespace base
674