1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/json/json_reader.h"
6
7 #include "base/float_util.h"
8 #include "base/logging.h"
9 #include "base/scoped_ptr.h"
10 #include "base/string_util.h"
11 #include "base/utf_string_conversions.h"
12 #include "base/values.h"
13
14 namespace base {
15
16 static const JSONReader::Token kInvalidToken(JSONReader::Token::INVALID_TOKEN,
17 0, 0);
18 static const int kStackLimit = 100;
19
20 namespace {
21
HexToInt(wchar_t c)22 inline int HexToInt(wchar_t c) {
23 if ('0' <= c && c <= '9') {
24 return c - '0';
25 } else if ('A' <= c && c <= 'F') {
26 return c - 'A' + 10;
27 } else if ('a' <= c && c <= 'f') {
28 return c - 'a' + 10;
29 }
30 NOTREACHED();
31 return 0;
32 }
33
34 // A helper method for ParseNumberToken. It reads an int from the end of
35 // token. The method returns false if there is no valid integer at the end of
36 // the token.
ReadInt(JSONReader::Token & token,bool can_have_leading_zeros)37 bool ReadInt(JSONReader::Token& token, bool can_have_leading_zeros) {
38 wchar_t first = token.NextChar();
39 int len = 0;
40
41 // Read in more digits
42 wchar_t c = first;
43 while ('\0' != c && '0' <= c && c <= '9') {
44 ++token.length;
45 ++len;
46 c = token.NextChar();
47 }
48 // We need at least 1 digit.
49 if (len == 0)
50 return false;
51
52 if (!can_have_leading_zeros && len > 1 && '0' == first)
53 return false;
54
55 return true;
56 }
57
58 // A helper method for ParseStringToken. It reads |digits| hex digits from the
59 // token. If the sequence if digits is not valid (contains other characters),
60 // the method returns false.
ReadHexDigits(JSONReader::Token & token,int digits)61 bool ReadHexDigits(JSONReader::Token& token, int digits) {
62 for (int i = 1; i <= digits; ++i) {
63 wchar_t c = *(token.begin + token.length + i);
64 if ('\0' == c)
65 return false;
66 if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
67 ('A' <= c && c <= 'F'))) {
68 return false;
69 }
70 }
71
72 token.length += digits;
73 return true;
74 }
75
76 } // anonymous namespace
77
78 const char* JSONReader::kBadRootElementType =
79 "Root value must be an array or object.";
80 const char* JSONReader::kInvalidEscape =
81 "Invalid escape sequence.";
82 const char* JSONReader::kSyntaxError =
83 "Syntax error.";
84 const char* JSONReader::kTrailingComma =
85 "Trailing comma not allowed.";
86 const char* JSONReader::kTooMuchNesting =
87 "Too much nesting.";
88 const char* JSONReader::kUnexpectedDataAfterRoot =
89 "Unexpected data after root element.";
90 const char* JSONReader::kUnsupportedEncoding =
91 "Unsupported encoding. JSON must be UTF-8.";
92 const char* JSONReader::kUnquotedDictionaryKey =
93 "Dictionary keys must be quoted.";
94
95 /* static */
Read(const std::string & json,bool allow_trailing_comma)96 Value* JSONReader::Read(const std::string& json,
97 bool allow_trailing_comma) {
98 return ReadAndReturnError(json, allow_trailing_comma, NULL);
99 }
100
101 /* static */
ReadAndReturnError(const std::string & json,bool allow_trailing_comma,std::string * error_message_out)102 Value* JSONReader::ReadAndReturnError(const std::string& json,
103 bool allow_trailing_comma,
104 std::string *error_message_out) {
105 JSONReader reader = JSONReader();
106 Value* root = reader.JsonToValue(json, true, allow_trailing_comma);
107 if (root)
108 return root;
109
110 if (error_message_out)
111 *error_message_out = reader.error_message();
112
113 return NULL;
114 }
115
116 /* static */
FormatErrorMessage(int line,int column,const char * description)117 std::string JSONReader::FormatErrorMessage(int line, int column,
118 const char* description) {
119 return StringPrintf("Line: %i, column: %i, %s",
120 line, column, description);
121 }
122
JSONReader()123 JSONReader::JSONReader()
124 : start_pos_(NULL), json_pos_(NULL), stack_depth_(0),
125 allow_trailing_comma_(false) {}
126
JsonToValue(const std::string & json,bool check_root,bool allow_trailing_comma)127 Value* JSONReader::JsonToValue(const std::string& json, bool check_root,
128 bool allow_trailing_comma) {
129 // The input must be in UTF-8.
130 if (!IsStringUTF8(json.c_str())) {
131 error_message_ = kUnsupportedEncoding;
132 return NULL;
133 }
134
135 // The conversion from UTF8 to wstring removes null bytes for us
136 // (a good thing).
137 std::wstring json_wide(UTF8ToWide(json));
138 start_pos_ = json_wide.c_str();
139
140 // When the input JSON string starts with a UTF-8 Byte-Order-Mark
141 // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode
142 // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from
143 // mis-treating a Unicode BOM as an invalid character and returning NULL,
144 // skip a converted Unicode BOM if it exists.
145 if (!json_wide.empty() && start_pos_[0] == 0xFEFF) {
146 ++start_pos_;
147 }
148
149 json_pos_ = start_pos_;
150 allow_trailing_comma_ = allow_trailing_comma;
151 stack_depth_ = 0;
152 error_message_.clear();
153
154 scoped_ptr<Value> root(BuildValue(check_root));
155 if (root.get()) {
156 if (ParseToken().type == Token::END_OF_INPUT) {
157 return root.release();
158 } else {
159 SetErrorMessage(kUnexpectedDataAfterRoot, json_pos_);
160 }
161 }
162
163 // Default to calling errors "syntax errors".
164 if (error_message_.empty())
165 SetErrorMessage(kSyntaxError, json_pos_);
166
167 return NULL;
168 }
169
BuildValue(bool is_root)170 Value* JSONReader::BuildValue(bool is_root) {
171 ++stack_depth_;
172 if (stack_depth_ > kStackLimit) {
173 SetErrorMessage(kTooMuchNesting, json_pos_);
174 return NULL;
175 }
176
177 Token token = ParseToken();
178 // The root token must be an array or an object.
179 if (is_root && token.type != Token::OBJECT_BEGIN &&
180 token.type != Token::ARRAY_BEGIN) {
181 SetErrorMessage(kBadRootElementType, json_pos_);
182 return NULL;
183 }
184
185 scoped_ptr<Value> node;
186
187 switch (token.type) {
188 case Token::END_OF_INPUT:
189 case Token::INVALID_TOKEN:
190 return NULL;
191
192 case Token::NULL_TOKEN:
193 node.reset(Value::CreateNullValue());
194 break;
195
196 case Token::BOOL_TRUE:
197 node.reset(Value::CreateBooleanValue(true));
198 break;
199
200 case Token::BOOL_FALSE:
201 node.reset(Value::CreateBooleanValue(false));
202 break;
203
204 case Token::NUMBER:
205 node.reset(DecodeNumber(token));
206 if (!node.get())
207 return NULL;
208 break;
209
210 case Token::STRING:
211 node.reset(DecodeString(token));
212 if (!node.get())
213 return NULL;
214 break;
215
216 case Token::ARRAY_BEGIN:
217 {
218 json_pos_ += token.length;
219 token = ParseToken();
220
221 node.reset(new ListValue());
222 while (token.type != Token::ARRAY_END) {
223 Value* array_node = BuildValue(false);
224 if (!array_node)
225 return NULL;
226 static_cast<ListValue*>(node.get())->Append(array_node);
227
228 // After a list value, we expect a comma or the end of the list.
229 token = ParseToken();
230 if (token.type == Token::LIST_SEPARATOR) {
231 json_pos_ += token.length;
232 token = ParseToken();
233 // Trailing commas are invalid according to the JSON RFC, but some
234 // consumers need the parsing leniency, so handle accordingly.
235 if (token.type == Token::ARRAY_END) {
236 if (!allow_trailing_comma_) {
237 SetErrorMessage(kTrailingComma, json_pos_);
238 return NULL;
239 }
240 // Trailing comma OK, stop parsing the Array.
241 break;
242 }
243 } else if (token.type != Token::ARRAY_END) {
244 // Unexpected value after list value. Bail out.
245 return NULL;
246 }
247 }
248 if (token.type != Token::ARRAY_END) {
249 return NULL;
250 }
251 break;
252 }
253
254 case Token::OBJECT_BEGIN:
255 {
256 json_pos_ += token.length;
257 token = ParseToken();
258
259 node.reset(new DictionaryValue);
260 while (token.type != Token::OBJECT_END) {
261 if (token.type != Token::STRING) {
262 SetErrorMessage(kUnquotedDictionaryKey, json_pos_);
263 return NULL;
264 }
265 scoped_ptr<Value> dict_key_value(DecodeString(token));
266 if (!dict_key_value.get())
267 return NULL;
268
269 // Convert the key into a wstring.
270 std::wstring dict_key;
271 bool success = dict_key_value->GetAsString(&dict_key);
272 DCHECK(success);
273
274 json_pos_ += token.length;
275 token = ParseToken();
276 if (token.type != Token::OBJECT_PAIR_SEPARATOR)
277 return NULL;
278
279 json_pos_ += token.length;
280 token = ParseToken();
281 Value* dict_value = BuildValue(false);
282 if (!dict_value)
283 return NULL;
284 static_cast<DictionaryValue*>(node.get())->SetWithoutPathExpansion(
285 dict_key, dict_value);
286
287 // After a key/value pair, we expect a comma or the end of the
288 // object.
289 token = ParseToken();
290 if (token.type == Token::LIST_SEPARATOR) {
291 json_pos_ += token.length;
292 token = ParseToken();
293 // Trailing commas are invalid according to the JSON RFC, but some
294 // consumers need the parsing leniency, so handle accordingly.
295 if (token.type == Token::OBJECT_END) {
296 if (!allow_trailing_comma_) {
297 SetErrorMessage(kTrailingComma, json_pos_);
298 return NULL;
299 }
300 // Trailing comma OK, stop parsing the Object.
301 break;
302 }
303 } else if (token.type != Token::OBJECT_END) {
304 // Unexpected value after last object value. Bail out.
305 return NULL;
306 }
307 }
308 if (token.type != Token::OBJECT_END)
309 return NULL;
310
311 break;
312 }
313
314 default:
315 // We got a token that's not a value.
316 return NULL;
317 }
318 json_pos_ += token.length;
319
320 --stack_depth_;
321 return node.release();
322 }
323
ParseNumberToken()324 JSONReader::Token JSONReader::ParseNumberToken() {
325 // We just grab the number here. We validate the size in DecodeNumber.
326 // According to RFC4627, a valid number is: [minus] int [frac] [exp]
327 Token token(Token::NUMBER, json_pos_, 0);
328 wchar_t c = *json_pos_;
329 if ('-' == c) {
330 ++token.length;
331 c = token.NextChar();
332 }
333
334 if (!ReadInt(token, false))
335 return kInvalidToken;
336
337 // Optional fraction part
338 c = token.NextChar();
339 if ('.' == c) {
340 ++token.length;
341 if (!ReadInt(token, true))
342 return kInvalidToken;
343 c = token.NextChar();
344 }
345
346 // Optional exponent part
347 if ('e' == c || 'E' == c) {
348 ++token.length;
349 c = token.NextChar();
350 if ('-' == c || '+' == c) {
351 ++token.length;
352 c = token.NextChar();
353 }
354 if (!ReadInt(token, true))
355 return kInvalidToken;
356 }
357
358 return token;
359 }
360
DecodeNumber(const Token & token)361 Value* JSONReader::DecodeNumber(const Token& token) {
362 const std::wstring num_string(token.begin, token.length);
363
364 int num_int;
365 if (StringToInt(WideToUTF16Hack(num_string), &num_int))
366 return Value::CreateIntegerValue(num_int);
367
368 double num_double;
369 if (StringToDouble(WideToUTF16Hack(num_string), &num_double) &&
370 base::IsFinite(num_double))
371 return Value::CreateRealValue(num_double);
372
373 return NULL;
374 }
375
ParseStringToken()376 JSONReader::Token JSONReader::ParseStringToken() {
377 Token token(Token::STRING, json_pos_, 1);
378 wchar_t c = token.NextChar();
379 while ('\0' != c) {
380 if ('\\' == c) {
381 ++token.length;
382 c = token.NextChar();
383 // Make sure the escaped char is valid.
384 switch (c) {
385 case 'x':
386 if (!ReadHexDigits(token, 2)) {
387 SetErrorMessage(kInvalidEscape, json_pos_ + token.length);
388 return kInvalidToken;
389 }
390 break;
391 case 'u':
392 if (!ReadHexDigits(token, 4)) {
393 SetErrorMessage(kInvalidEscape, json_pos_ + token.length);
394 return kInvalidToken;
395 }
396 break;
397 case '\\':
398 case '/':
399 case 'b':
400 case 'f':
401 case 'n':
402 case 'r':
403 case 't':
404 case 'v':
405 case '"':
406 break;
407 default:
408 SetErrorMessage(kInvalidEscape, json_pos_ + token.length);
409 return kInvalidToken;
410 }
411 } else if ('"' == c) {
412 ++token.length;
413 return token;
414 }
415 ++token.length;
416 c = token.NextChar();
417 }
418 return kInvalidToken;
419 }
420
DecodeString(const Token & token)421 Value* JSONReader::DecodeString(const Token& token) {
422 std::wstring decoded_str;
423 decoded_str.reserve(token.length - 2);
424
425 for (int i = 1; i < token.length - 1; ++i) {
426 wchar_t c = *(token.begin + i);
427 if ('\\' == c) {
428 ++i;
429 c = *(token.begin + i);
430 switch (c) {
431 case '"':
432 case '/':
433 case '\\':
434 decoded_str.push_back(c);
435 break;
436 case 'b':
437 decoded_str.push_back('\b');
438 break;
439 case 'f':
440 decoded_str.push_back('\f');
441 break;
442 case 'n':
443 decoded_str.push_back('\n');
444 break;
445 case 'r':
446 decoded_str.push_back('\r');
447 break;
448 case 't':
449 decoded_str.push_back('\t');
450 break;
451 case 'v':
452 decoded_str.push_back('\v');
453 break;
454
455 case 'x':
456 decoded_str.push_back((HexToInt(*(token.begin + i + 1)) << 4) +
457 HexToInt(*(token.begin + i + 2)));
458 i += 2;
459 break;
460 case 'u':
461 decoded_str.push_back((HexToInt(*(token.begin + i + 1)) << 12 ) +
462 (HexToInt(*(token.begin + i + 2)) << 8) +
463 (HexToInt(*(token.begin + i + 3)) << 4) +
464 HexToInt(*(token.begin + i + 4)));
465 i += 4;
466 break;
467
468 default:
469 // We should only have valid strings at this point. If not,
470 // ParseStringToken didn't do it's job.
471 NOTREACHED();
472 return NULL;
473 }
474 } else {
475 // Not escaped
476 decoded_str.push_back(c);
477 }
478 }
479 return Value::CreateStringValue(decoded_str);
480 }
481
ParseToken()482 JSONReader::Token JSONReader::ParseToken() {
483 static const std::wstring kNullString(L"null");
484 static const std::wstring kTrueString(L"true");
485 static const std::wstring kFalseString(L"false");
486
487 EatWhitespaceAndComments();
488
489 Token token(Token::INVALID_TOKEN, 0, 0);
490 switch (*json_pos_) {
491 case '\0':
492 token.type = Token::END_OF_INPUT;
493 break;
494
495 case 'n':
496 if (NextStringMatch(kNullString))
497 token = Token(Token::NULL_TOKEN, json_pos_, 4);
498 break;
499
500 case 't':
501 if (NextStringMatch(kTrueString))
502 token = Token(Token::BOOL_TRUE, json_pos_, 4);
503 break;
504
505 case 'f':
506 if (NextStringMatch(kFalseString))
507 token = Token(Token::BOOL_FALSE, json_pos_, 5);
508 break;
509
510 case '[':
511 token = Token(Token::ARRAY_BEGIN, json_pos_, 1);
512 break;
513
514 case ']':
515 token = Token(Token::ARRAY_END, json_pos_, 1);
516 break;
517
518 case ',':
519 token = Token(Token::LIST_SEPARATOR, json_pos_, 1);
520 break;
521
522 case '{':
523 token = Token(Token::OBJECT_BEGIN, json_pos_, 1);
524 break;
525
526 case '}':
527 token = Token(Token::OBJECT_END, json_pos_, 1);
528 break;
529
530 case ':':
531 token = Token(Token::OBJECT_PAIR_SEPARATOR, json_pos_, 1);
532 break;
533
534 case '0':
535 case '1':
536 case '2':
537 case '3':
538 case '4':
539 case '5':
540 case '6':
541 case '7':
542 case '8':
543 case '9':
544 case '-':
545 token = ParseNumberToken();
546 break;
547
548 case '"':
549 token = ParseStringToken();
550 break;
551 }
552 return token;
553 }
554
NextStringMatch(const std::wstring & str)555 bool JSONReader::NextStringMatch(const std::wstring& str) {
556 for (size_t i = 0; i < str.length(); ++i) {
557 if ('\0' == *json_pos_)
558 return false;
559 if (*(json_pos_ + i) != str[i])
560 return false;
561 }
562 return true;
563 }
564
EatWhitespaceAndComments()565 void JSONReader::EatWhitespaceAndComments() {
566 while ('\0' != *json_pos_) {
567 switch (*json_pos_) {
568 case ' ':
569 case '\n':
570 case '\r':
571 case '\t':
572 ++json_pos_;
573 break;
574 case '/':
575 // TODO(tc): This isn't in the RFC so it should be a parser flag.
576 if (!EatComment())
577 return;
578 break;
579 default:
580 // Not a whitespace char, just exit.
581 return;
582 }
583 }
584 }
585
EatComment()586 bool JSONReader::EatComment() {
587 if ('/' != *json_pos_)
588 return false;
589
590 wchar_t next_char = *(json_pos_ + 1);
591 if ('/' == next_char) {
592 // Line comment, read until \n or \r
593 json_pos_ += 2;
594 while ('\0' != *json_pos_) {
595 switch (*json_pos_) {
596 case '\n':
597 case '\r':
598 ++json_pos_;
599 return true;
600 default:
601 ++json_pos_;
602 }
603 }
604 } else if ('*' == next_char) {
605 // Block comment, read until */
606 json_pos_ += 2;
607 while ('\0' != *json_pos_) {
608 if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) {
609 json_pos_ += 2;
610 return true;
611 }
612 ++json_pos_;
613 }
614 } else {
615 return false;
616 }
617 return true;
618 }
619
SetErrorMessage(const char * description,const wchar_t * error_pos)620 void JSONReader::SetErrorMessage(const char* description,
621 const wchar_t* error_pos) {
622 int line_number = 1;
623 int column_number = 1;
624
625 // Figure out the line and column the error occured at.
626 for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) {
627 if (*pos == '\0') {
628 NOTREACHED();
629 return;
630 }
631
632 if (*pos == '\n') {
633 ++line_number;
634 column_number = 1;
635 } else {
636 ++column_number;
637 }
638 }
639
640 error_message_ = FormatErrorMessage(line_number, column_number, description);
641 }
642
643 } // namespace base
644