1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 #include <google/protobuf/util/internal/json_stream_parser.h>
32
33 #include <algorithm>
34 #include <cctype>
35 #include <cerrno>
36 #include <cmath>
37 #include <cstdlib>
38 #include <cstring>
39 #include <limits>
40 #include <memory>
41
42 #include <google/protobuf/stubs/logging.h>
43 #include <google/protobuf/stubs/common.h>
44 #include <google/protobuf/stubs/strutil.h>
45 #include <google/protobuf/util/internal/object_writer.h>
46 #include <google/protobuf/util/internal/json_escaping.h>
47
48
49 namespace google {
50 namespace protobuf {
51 namespace util {
52
53 // Allow these symbols to be referenced as util::Status, util::error::* in
54 // this file.
55 using util::Status;
56 namespace error {
57 using util::error::CANCELLED;
58 using util::error::INTERNAL;
59 using util::error::INVALID_ARGUMENT;
60 } // namespace error
61
62 namespace converter {
63
64 // Number of digits in an escaped UTF-16 code unit ('\\' 'u' X X X X)
65 static const int kUnicodeEscapedLength = 6;
66
67 static const int kDefaultMaxRecursionDepth = 100;
68
69 // These cannot be constexpr for portability with VS2015.
70 static const StringPiece kKeywordTrue = "true";
71 static const StringPiece kKeywordFalse = "false";
72 static const StringPiece kKeywordNull = "null";
73
// Returns true if c is an ASCII letter, an underscore, or a dollar sign.
inline bool IsLetter(char c) {
  if (c == '_' || c == '$') return true;
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
78
IsAlphanumeric(char c)79 inline bool IsAlphanumeric(char c) {
80 return IsLetter(c) || ('0' <= c && c <= '9');
81 }
82
83 // Indicates a character may not be part of an unquoted key.
IsKeySeparator(char c)84 inline bool IsKeySeparator(char c) {
85 return (ascii_isspace(c) || c == '"' || c == '\'' || c == '{' ||
86 c == '}' || c == '[' || c == ']' || c == ':' || c == ',');
87 }
88
ReplaceInvalidCodePoints(StringPiece str,const std::string & replacement,std::string * dst)89 inline void ReplaceInvalidCodePoints(StringPiece str,
90 const std::string& replacement,
91 std::string* dst) {
92 while (!str.empty()) {
93 int n_valid_bytes = internal::UTF8SpnStructurallyValid(str);
94 StringPiece valid_part = str.substr(0, n_valid_bytes);
95 StrAppend(dst, valid_part);
96
97 if (n_valid_bytes == str.size()) {
98 break;
99 }
100
101 // Append replacement value.
102 StrAppend(dst, replacement);
103
104 // Move past valid bytes + one invalid byte.
105 str.remove_prefix(n_valid_bytes + 1);
106 }
107 }
108
ConsumeKey(StringPiece * input,StringPiece * key)109 static bool ConsumeKey(StringPiece* input, StringPiece* key) {
110 if (input->empty() || !IsLetter((*input)[0])) return false;
111 int len = 1;
112 for (; len < input->size(); ++len) {
113 if (!IsAlphanumeric((*input)[len])) {
114 break;
115 }
116 }
117 *key = StringPiece(input->data(), len);
118 *input = StringPiece(input->data() + len, input->size() - len);
119 return true;
120 }
121
122 // Same as 'ConsumeKey', but allows a widened set of key characters.
ConsumeKeyPermissive(StringPiece * input,StringPiece * key)123 static bool ConsumeKeyPermissive(StringPiece* input,
124 StringPiece* key) {
125 if (input->empty() || !IsLetter((*input)[0])) return false;
126 int len = 1;
127 for (; len < input->size(); ++len) {
128 if (IsKeySeparator((*input)[len])) {
129 break;
130 }
131 }
132 *key = StringPiece(input->data(), len);
133 *input = StringPiece(input->data() + len, input->size() - len);
134 return true;
135 }
136
MatchKey(StringPiece input)137 static bool MatchKey(StringPiece input) {
138 return !input.empty() && IsLetter(input[0]);
139 }
140
// Constructs a parser that reports everything it parses to `ow`.
//
// The pointer is stored as-is; nothing in this file deletes it, so the caller
// presumably retains ownership and must keep it alive while parsing (confirm
// against the header). All option flags start out disabled, the UTF-8
// replacement string is a single space, and the recursion limit is
// kDefaultMaxRecursionDepth.
JsonStreamParser::JsonStreamParser(ObjectWriter* ow)
    : ow_(ow),
      stack_(),
      leftover_(),
      json_(),
      p_(),
      key_(),
      key_storage_(),
      finishing_(false),
      parsed_(),
      parsed_storage_(),
      // 0 means "not currently inside a string"; otherwise it holds the quote
      // character that opened the string being parsed.
      string_open_(0),
      chunk_storage_(),
      coerce_to_utf8_(false),
      utf8_replacement_character_(" "),
      allow_empty_null_(false),
      allow_permissive_key_naming_(false),
      loose_float_number_conversion_(false),
      recursion_depth_(0),
      max_recursion_depth_(kDefaultMaxRecursionDepth) {
  // Initialize the stack with a single value to be parsed.
  stack_.push(VALUE);
}
164
~JsonStreamParser()165 JsonStreamParser::~JsonStreamParser() {}
166
167
Parse(StringPiece json)168 util::Status JsonStreamParser::Parse(StringPiece json) {
169 StringPiece chunk = json;
170 // If we have leftovers from a previous chunk, append the new chunk to it
171 // and create a new StringPiece pointing at the string's data. This could
172 // be large but we rely on the chunks to be small, assuming they are
173 // fragments of a Cord.
174 if (!leftover_.empty()) {
175 // Don't point chunk to leftover_ because leftover_ will be updated in
176 // ParseChunk(chunk).
177 chunk_storage_.swap(leftover_);
178 StrAppend(&chunk_storage_, json);
179 chunk = StringPiece(chunk_storage_);
180 }
181
182 // Find the structurally valid UTF8 prefix and parse only that.
183 int n = internal::UTF8SpnStructurallyValid(chunk);
184 if (n > 0) {
185 util::Status status = ParseChunk(chunk.substr(0, n));
186
187 // Any leftover characters are stashed in leftover_ for later parsing when
188 // there is more data available.
189 StrAppend(&leftover_, chunk.substr(n));
190 return status;
191 } else {
192 leftover_.assign(chunk.data(), chunk.size());
193 return util::Status();
194 }
195 }
196
FinishParse()197 util::Status JsonStreamParser::FinishParse() {
198 // If we do not expect anything and there is nothing left to parse we're all
199 // done.
200 if (stack_.empty() && leftover_.empty()) {
201 return util::Status();
202 }
203
204 // Lifetime needs to last until RunParser returns, so keep this variable
205 // outside of the coerce_to_utf8 block.
206 std::unique_ptr<std::string> scratch;
207
208 bool is_valid_utf8 = internal::IsStructurallyValidUTF8(leftover_);
209 if (coerce_to_utf8_ && !is_valid_utf8) {
210 scratch.reset(new std::string);
211 scratch->reserve(leftover_.size() * utf8_replacement_character_.size());
212 ReplaceInvalidCodePoints(leftover_, utf8_replacement_character_,
213 scratch.get());
214 p_ = json_ = *scratch;
215 } else {
216 p_ = json_ = leftover_;
217 if (!is_valid_utf8) {
218 return ReportFailure("Encountered non UTF-8 code points.");
219 }
220 }
221
222 // Parse the remainder in finishing mode, which reports errors for things like
223 // unterminated strings or unknown tokens that would normally be retried.
224 finishing_ = true;
225 util::Status result = RunParser();
226 if (result.ok()) {
227 SkipWhitespace();
228 if (!p_.empty()) {
229 result = ReportFailure("Parsing terminated before end of input.");
230 }
231 }
232 return result;
233 }
234
ParseChunk(StringPiece chunk)235 util::Status JsonStreamParser::ParseChunk(StringPiece chunk) {
236 // Do not do any work if the chunk is empty.
237 if (chunk.empty()) return util::Status();
238
239 p_ = json_ = chunk;
240
241 finishing_ = false;
242 util::Status result = RunParser();
243 if (!result.ok()) return result;
244
245 SkipWhitespace();
246 if (p_.empty()) {
247 // If we parsed everything we had, clear the leftover.
248 leftover_.clear();
249 } else {
250 // If we do not expect anything i.e. stack is empty, and we have non-empty
251 // string left to parse, we report an error.
252 if (stack_.empty()) {
253 return ReportFailure("Parsing terminated before end of input.");
254 }
255 // If we expect future data i.e. stack is non-empty, and we have some
256 // unparsed data left, we save it for later parse.
257 leftover_ = std::string(p_);
258 }
259 return util::Status();
260 }
261
RunParser()262 util::Status JsonStreamParser::RunParser() {
263 while (!stack_.empty()) {
264 ParseType type = stack_.top();
265 TokenType t = (string_open_ == 0) ? GetNextTokenType() : BEGIN_STRING;
266 stack_.pop();
267 util::Status result;
268 switch (type) {
269 case VALUE:
270 result = ParseValue(t);
271 break;
272
273 case OBJ_MID:
274 result = ParseObjectMid(t);
275 break;
276
277 case ENTRY:
278 result = ParseEntry(t);
279 break;
280
281 case ENTRY_MID:
282 result = ParseEntryMid(t);
283 break;
284
285 case ARRAY_VALUE:
286 result = ParseArrayValue(t);
287 break;
288
289 case ARRAY_MID:
290 result = ParseArrayMid(t);
291 break;
292
293 default:
294 result = util::Status(util::error::INTERNAL,
295 StrCat("Unknown parse type: ", type));
296 break;
297 }
298 if (!result.ok()) {
299 // If we were cancelled, save our state and try again later.
300 if (!finishing_ &&
301 result == util::Status(util::error::CANCELLED, "")) {
302 stack_.push(type);
303 // If we have a key we still need to render, make sure to save off the
304 // contents in our own storage.
305 if (!key_.empty() && key_storage_.empty()) {
306 StrAppend(&key_storage_, key_);
307 key_ = StringPiece(key_storage_);
308 }
309 result = util::Status();
310 }
311 return result;
312 }
313 }
314 return util::Status();
315 }
316
ParseValue(TokenType type)317 util::Status JsonStreamParser::ParseValue(TokenType type) {
318 switch (type) {
319 case BEGIN_OBJECT:
320 return HandleBeginObject();
321 case BEGIN_ARRAY:
322 return HandleBeginArray();
323 case BEGIN_STRING:
324 return ParseString();
325 case BEGIN_NUMBER:
326 return ParseNumber();
327 case BEGIN_TRUE:
328 return ParseTrue();
329 case BEGIN_FALSE:
330 return ParseFalse();
331 case BEGIN_NULL:
332 return ParseNull();
333 case UNKNOWN:
334 return ReportUnknown("Expected a value.");
335 default: {
336 if (allow_empty_null_ && IsEmptyNullAllowed(type)) {
337 return ParseEmptyNull();
338 }
339
340 // Special case for having been cut off while parsing, wait for more data.
341 // This handles things like 'fals' being at the end of the string, we
342 // don't know if the next char would be e, completing it, or something
343 // else, making it invalid.
344 if (!finishing_ && p_.length() < kKeywordFalse.length()) {
345 return util::Status(util::error::CANCELLED, "");
346 }
347 return ReportFailure("Unexpected token.");
348 }
349 }
350 }
351
ParseString()352 util::Status JsonStreamParser::ParseString() {
353 util::Status result = ParseStringHelper();
354 if (result.ok()) {
355 ow_->RenderString(key_, parsed_);
356 key_ = StringPiece();
357 parsed_ = StringPiece();
358 parsed_storage_.clear();
359 }
360 return result;
361 }
362
// Parses a quoted string starting at p_ and leaves the decoded contents in
// parsed_.
//
// The function is resumable: string_open_ remembers the opening quote and
// parsed_storage_ accumulates the text decoded so far, so when the input runs
// out mid-string (CANCELLED, non-finishing mode only) a later call continues
// where this one stopped. If the string contains no escapes and was not split
// across calls, parsed_ points directly into the input buffer; otherwise it
// points at parsed_storage_.
//
// Returns OK once the closing quote has been consumed, CANCELLED if more
// input is needed, or an error for an unterminated string or bad escape.
util::Status JsonStreamParser::ParseStringHelper() {
  // If we haven't seen the start quote, grab it and remember it for later.
  if (string_open_ == 0) {
    string_open_ = *p_.data();
    GOOGLE_DCHECK(string_open_ == '\"' || string_open_ == '\'');
    Advance();
  }
  // Track where we last copied data from so we can minimize copying.
  const char* last = p_.data();
  while (!p_.empty()) {
    const char* data = p_.data();
    if (*data == '\\') {
      // We're about to handle an escape, copy all bytes from last to data.
      if (last < data) {
        parsed_storage_.append(last, data - last);
      }
      // If we ran out of string after the \, cancel or report an error
      // depending on if we expect more data later.
      if (p_.length() == 1) {
        if (!finishing_) {
          return util::Status(util::error::CANCELLED, "");
        }
        return ReportFailure("Closing quote expected in string.");
      }
      // Parse a unicode escape if we found \u in the string.
      if (data[1] == 'u') {
        util::Status result = ParseUnicodeEscape();
        if (!result.ok()) {
          return result;
        }
        // Move last pointer past the unicode escape and continue.
        last = p_.data();
        continue;
      }
      // Handle the standard set of backslash-escaped characters.
      switch (data[1]) {
        case 'b':
          parsed_storage_.push_back('\b');
          break;
        case 'f':
          parsed_storage_.push_back('\f');
          break;
        case 'n':
          parsed_storage_.push_back('\n');
          break;
        case 'r':
          parsed_storage_.push_back('\r');
          break;
        case 't':
          parsed_storage_.push_back('\t');
          break;
        case 'v':
          parsed_storage_.push_back('\v');
          break;
        default:
          // Any other escaped character (e.g. \" \' \\ \/) stands for itself.
          parsed_storage_.push_back(data[1]);
      }
      // We handled two characters, so advance past them and continue.
      p_.remove_prefix(2);
      last = p_.data();
      continue;
    }
    // If we found the closing quote note it, advance past it, and return.
    if (*data == string_open_) {
      // If we didn't copy anything, reuse the input buffer.
      if (parsed_storage_.empty()) {
        parsed_ = StringPiece(last, data - last);
      } else {
        // Flush the unescaped tail that has not been copied yet.
        if (last < data) {
          parsed_storage_.append(last, data - last);
        }
        parsed_ = StringPiece(parsed_storage_);
      }
      // Clear the quote char so next time we try to parse a string we'll
      // start fresh.
      string_open_ = 0;
      Advance();
      return util::Status();
    }
    // Normal character, just advance past it.
    Advance();
  }
  // If we ran out of characters, copy over what we have so far.
  if (last < p_.data()) {
    parsed_storage_.append(last, p_.data() - last);
  }
  // If we didn't find the closing quote but we expect more data, cancel for now
  if (!finishing_) {
    return util::Status(util::error::CANCELLED, "");
  }
  // End of string reached without a closing quote, report an error.
  string_open_ = 0;
  return ReportFailure("Closing quote expected in string.");
}
457
458 // Converts a unicode escaped character to a decimal value stored in a char32
459 // for use in UTF8 encoding utility. We assume that str begins with \uhhhh and
460 // convert that from the hex number to a decimal value.
461 //
462 // There are some security exploits with UTF-8 that we should be careful of:
463 // - http://www.unicode.org/reports/tr36/#UTF-8_Exploit
464 // - http://sites/intl-eng/design-guide/core-application
ParseUnicodeEscape()465 util::Status JsonStreamParser::ParseUnicodeEscape() {
466 if (p_.length() < kUnicodeEscapedLength) {
467 if (!finishing_) {
468 return util::Status(util::error::CANCELLED, "");
469 }
470 return ReportFailure("Illegal hex string.");
471 }
472 GOOGLE_DCHECK_EQ('\\', p_.data()[0]);
473 GOOGLE_DCHECK_EQ('u', p_.data()[1]);
474 uint32 code = 0;
475 for (int i = 2; i < kUnicodeEscapedLength; ++i) {
476 if (!isxdigit(p_.data()[i])) {
477 return ReportFailure("Invalid escape sequence.");
478 }
479 code = (code << 4) + hex_digit_to_int(p_.data()[i]);
480 }
481 if (code >= JsonEscaping::kMinHighSurrogate &&
482 code <= JsonEscaping::kMaxHighSurrogate) {
483 if (p_.length() < 2 * kUnicodeEscapedLength) {
484 if (!finishing_) {
485 return util::Status(util::error::CANCELLED, "");
486 }
487 if (!coerce_to_utf8_) {
488 return ReportFailure("Missing low surrogate.");
489 }
490 } else if (p_.data()[kUnicodeEscapedLength] == '\\' &&
491 p_.data()[kUnicodeEscapedLength + 1] == 'u') {
492 uint32 low_code = 0;
493 for (int i = kUnicodeEscapedLength + 2; i < 2 * kUnicodeEscapedLength;
494 ++i) {
495 if (!isxdigit(p_.data()[i])) {
496 return ReportFailure("Invalid escape sequence.");
497 }
498 low_code = (low_code << 4) + hex_digit_to_int(p_.data()[i]);
499 }
500 if (low_code >= JsonEscaping::kMinLowSurrogate &&
501 low_code <= JsonEscaping::kMaxLowSurrogate) {
502 // Convert UTF-16 surrogate pair to 21-bit Unicode codepoint.
503 code = (((code & 0x3FF) << 10) | (low_code & 0x3FF)) +
504 JsonEscaping::kMinSupplementaryCodePoint;
505 // Advance past the first code unit escape.
506 p_.remove_prefix(kUnicodeEscapedLength);
507 } else if (!coerce_to_utf8_) {
508 return ReportFailure("Invalid low surrogate.");
509 }
510 } else if (!coerce_to_utf8_) {
511 return ReportFailure("Missing low surrogate.");
512 }
513 }
514 if (!coerce_to_utf8_ && !IsValidCodePoint(code)) {
515 return ReportFailure("Invalid unicode code point.");
516 }
517 char buf[UTFmax];
518 int len = EncodeAsUTF8Char(code, buf);
519 // Advance past the [final] code unit escape.
520 p_.remove_prefix(kUnicodeEscapedLength);
521 parsed_storage_.append(buf, len);
522 return util::Status();
523 }
524
ParseNumber()525 util::Status JsonStreamParser::ParseNumber() {
526 NumberResult number;
527 util::Status result = ParseNumberHelper(&number);
528 if (result.ok()) {
529 switch (number.type) {
530 case NumberResult::DOUBLE:
531 ow_->RenderDouble(key_, number.double_val);
532 key_ = StringPiece();
533 break;
534
535 case NumberResult::INT:
536 ow_->RenderInt64(key_, number.int_val);
537 key_ = StringPiece();
538 break;
539
540 case NumberResult::UINT:
541 ow_->RenderUint64(key_, number.uint_val);
542 key_ = StringPiece();
543 break;
544
545 default:
546 return ReportFailure("Unable to parse number.");
547 }
548 }
549 return result;
550 }
551
ParseDoubleHelper(const std::string & number,NumberResult * result)552 util::Status JsonStreamParser::ParseDoubleHelper(const std::string& number,
553 NumberResult* result) {
554 if (!safe_strtod(number, &result->double_val)) {
555 return ReportFailure("Unable to parse number.");
556 }
557 if (!loose_float_number_conversion_ && !std::isfinite(result->double_val)) {
558 return ReportFailure("Number exceeds the range of double.");
559 }
560 result->type = NumberResult::DOUBLE;
561 return util::Status();
562 }
563
ParseNumberHelper(NumberResult * result)564 util::Status JsonStreamParser::ParseNumberHelper(NumberResult* result) {
565 const char* data = p_.data();
566 int length = p_.length();
567
568 // Look for the first non-numeric character, or the end of the string.
569 int index = 0;
570 bool floating = false;
571 bool negative = data[index] == '-';
572 // Find the first character that cannot be part of the number. Along the way
573 // detect if the number needs to be parsed as a double.
574 // Note that this restricts numbers to the JSON specification, so for example
575 // we do not support hex or octal notations.
576 for (; index < length; ++index) {
577 char c = data[index];
578 if (isdigit(c)) continue;
579 if (c == '.' || c == 'e' || c == 'E') {
580 floating = true;
581 continue;
582 }
583 if (c == '+' || c == '-' || c == 'x') continue;
584 // Not a valid number character, break out.
585 break;
586 }
587
588 // If the entire input is a valid number, and we may have more content in the
589 // future, we abort for now and resume when we know more.
590 if (index == length && !finishing_) {
591 return util::Status(util::error::CANCELLED, "");
592 }
593
594 // Create a string containing just the number, so we can use safe_strtoX
595 std::string number = std::string(p_.substr(0, index));
596
597 // Floating point number, parse as a double.
598 if (floating) {
599 util::Status status = ParseDoubleHelper(number, result);
600 if (status.ok()) {
601 p_.remove_prefix(index);
602 }
603 return status;
604 }
605
606 // Positive non-floating point number, parse as a uint64.
607 if (!negative) {
608 // Octal/Hex numbers are not valid JSON values.
609 if (number.length() >= 2 && number[0] == '0') {
610 return ReportFailure("Octal/hex numbers are not valid JSON values.");
611 }
612 if (safe_strtou64(number, &result->uint_val)) {
613 result->type = NumberResult::UINT;
614 p_.remove_prefix(index);
615 return util::Status();
616 } else {
617 // If the value is too large, parse it as double.
618 util::Status status = ParseDoubleHelper(number, result);
619 if (status.ok()) {
620 p_.remove_prefix(index);
621 }
622 return status;
623 }
624 }
625
626 // Octal/Hex numbers are not valid JSON values.
627 if (number.length() >= 3 && number[1] == '0') {
628 return ReportFailure("Octal/hex numbers are not valid JSON values.");
629 }
630 // Negative non-floating point number, parse as an int64.
631 if (safe_strto64(number, &result->int_val)) {
632 result->type = NumberResult::INT;
633 p_.remove_prefix(index);
634 return util::Status();
635 } else {
636 // If the value is too large, parse it as double.
637 util::Status status = ParseDoubleHelper(number, result);
638 if (status.ok()) {
639 p_.remove_prefix(index);
640 }
641 return status;
642 }
643 }
644
HandleBeginObject()645 util::Status JsonStreamParser::HandleBeginObject() {
646 GOOGLE_DCHECK_EQ('{', *p_.data());
647 Advance();
648 ow_->StartObject(key_);
649 auto status = IncrementRecursionDepth(key_);
650 if (!status.ok()) {
651 return status;
652 }
653 key_ = StringPiece();
654 stack_.push(ENTRY);
655 return util::Status();
656 }
657
ParseObjectMid(TokenType type)658 util::Status JsonStreamParser::ParseObjectMid(TokenType type) {
659 if (type == UNKNOWN) {
660 return ReportUnknown("Expected , or } after key:value pair.");
661 }
662
663 // Object is complete, advance past the comma and render the EndObject.
664 if (type == END_OBJECT) {
665 Advance();
666 ow_->EndObject();
667 --recursion_depth_;
668 return util::Status();
669 }
670 // Found a comma, advance past it and get ready for an entry.
671 if (type == VALUE_SEPARATOR) {
672 Advance();
673 stack_.push(ENTRY);
674 return util::Status();
675 }
676 // Illegal token after key:value pair.
677 return ReportFailure("Expected , or } after key:value pair.");
678 }
679
ParseEntry(TokenType type)680 util::Status JsonStreamParser::ParseEntry(TokenType type) {
681 if (type == UNKNOWN) {
682 return ReportUnknown("Expected an object key or }.");
683 }
684
685 // Close the object and return. This allows for trailing commas.
686 if (type == END_OBJECT) {
687 ow_->EndObject();
688 Advance();
689 --recursion_depth_;
690 return util::Status();
691 }
692
693 util::Status result;
694 if (type == BEGIN_STRING) {
695 // Key is a string (standard JSON), parse it and store the string.
696 result = ParseStringHelper();
697 if (result.ok()) {
698 key_storage_.clear();
699 if (!parsed_storage_.empty()) {
700 parsed_storage_.swap(key_storage_);
701 key_ = StringPiece(key_storage_);
702 } else {
703 key_ = parsed_;
704 }
705 parsed_ = StringPiece();
706 }
707 } else if (type == BEGIN_KEY) {
708 // Key is a bare key (back compat), create a StringPiece pointing to it.
709 result = ParseKey();
710 } else if (type == BEGIN_NULL || type == BEGIN_TRUE || type == BEGIN_FALSE) {
711 // Key may be a bare key that begins with a reserved word.
712 result = ParseKey();
713 if (result.ok() && (key_ == kKeywordNull || key_ == kKeywordTrue ||
714 key_ == kKeywordFalse)) {
715 result = ReportFailure("Expected an object key or }.");
716 }
717 } else {
718 // Unknown key type, report an error.
719 result = ReportFailure("Expected an object key or }.");
720 }
721 // On success we next expect an entry mid ':' then an object mid ',' or '}'
722 if (result.ok()) {
723 stack_.push(OBJ_MID);
724 stack_.push(ENTRY_MID);
725 }
726 return result;
727 }
728
ParseEntryMid(TokenType type)729 util::Status JsonStreamParser::ParseEntryMid(TokenType type) {
730 if (type == UNKNOWN) {
731 return ReportUnknown("Expected : between key:value pair.");
732 }
733 if (type == ENTRY_SEPARATOR) {
734 Advance();
735 stack_.push(VALUE);
736 return util::Status();
737 }
738 return ReportFailure("Expected : between key:value pair.");
739 }
740
HandleBeginArray()741 util::Status JsonStreamParser::HandleBeginArray() {
742 GOOGLE_DCHECK_EQ('[', *p_.data());
743 Advance();
744 ow_->StartList(key_);
745 key_ = StringPiece();
746 stack_.push(ARRAY_VALUE);
747 return util::Status();
748 }
749
ParseArrayValue(TokenType type)750 util::Status JsonStreamParser::ParseArrayValue(TokenType type) {
751 if (type == UNKNOWN) {
752 return ReportUnknown("Expected a value or ] within an array.");
753 }
754
755 if (type == END_ARRAY) {
756 ow_->EndList();
757 Advance();
758 return util::Status();
759 }
760
761 // The ParseValue call may push something onto the stack so we need to make
762 // sure an ARRAY_MID is after it, so we push it on now. Also, the parsing of
763 // empty-null array value is relying on this ARRAY_MID token.
764 stack_.push(ARRAY_MID);
765 util::Status result = ParseValue(type);
766 if (result == util::Status(util::error::CANCELLED, "")) {
767 // If we were cancelled, pop back off the ARRAY_MID so we don't try to
768 // push it on again when we try over.
769 stack_.pop();
770 }
771 return result;
772 }
773
ParseArrayMid(TokenType type)774 util::Status JsonStreamParser::ParseArrayMid(TokenType type) {
775 if (type == UNKNOWN) {
776 return ReportUnknown("Expected , or ] after array value.");
777 }
778
779 if (type == END_ARRAY) {
780 ow_->EndList();
781 Advance();
782 return util::Status();
783 }
784
785 // Found a comma, advance past it and expect an array value next.
786 if (type == VALUE_SEPARATOR) {
787 Advance();
788 stack_.push(ARRAY_VALUE);
789 return util::Status();
790 }
791 // Illegal token after array value.
792 return ReportFailure("Expected , or ] after array value.");
793 }
794
ParseTrue()795 util::Status JsonStreamParser::ParseTrue() {
796 ow_->RenderBool(key_, true);
797 key_ = StringPiece();
798 p_.remove_prefix(kKeywordTrue.length());
799 return util::Status();
800 }
801
ParseFalse()802 util::Status JsonStreamParser::ParseFalse() {
803 ow_->RenderBool(key_, false);
804 key_ = StringPiece();
805 p_.remove_prefix(kKeywordFalse.length());
806 return util::Status();
807 }
808
ParseNull()809 util::Status JsonStreamParser::ParseNull() {
810 ow_->RenderNull(key_);
811 key_ = StringPiece();
812 p_.remove_prefix(kKeywordNull.length());
813 return util::Status();
814 }
815
ParseEmptyNull()816 util::Status JsonStreamParser::ParseEmptyNull() {
817 ow_->RenderNull(key_);
818 key_ = StringPiece();
819 return util::Status();
820 }
821
IsEmptyNullAllowed(TokenType type)822 bool JsonStreamParser::IsEmptyNullAllowed(TokenType type) {
823 if (stack_.empty()) return false;
824 return (stack_.top() == ARRAY_MID && type == VALUE_SEPARATOR) ||
825 stack_.top() == OBJ_MID;
826 }
827
ReportFailure(StringPiece message)828 util::Status JsonStreamParser::ReportFailure(StringPiece message) {
829 static const int kContextLength = 20;
830 const char* p_start = p_.data();
831 const char* json_start = json_.data();
832 const char* begin = std::max(p_start - kContextLength, json_start);
833 const char* end =
834 std::min(p_start + kContextLength, json_start + json_.size());
835 StringPiece segment(begin, end - begin);
836 std::string location(p_start - begin, ' ');
837 location.push_back('^');
838 return util::Status(util::error::INVALID_ARGUMENT,
839 StrCat(message, "\n", segment, "\n", location));
840 }
841
ReportUnknown(StringPiece message)842 util::Status JsonStreamParser::ReportUnknown(StringPiece message) {
843 // If we aren't finishing the parse, cancel parsing and try later.
844 if (!finishing_) {
845 return util::Status(util::error::CANCELLED, "");
846 }
847 if (p_.empty()) {
848 return ReportFailure(StrCat("Unexpected end of string. ", message));
849 }
850 return ReportFailure(message);
851 }
852
IncrementRecursionDepth(StringPiece key) const853 util::Status JsonStreamParser::IncrementRecursionDepth(
854 StringPiece key) const {
855 if (++recursion_depth_ > max_recursion_depth_) {
856 return Status(
857 util::error::INVALID_ARGUMENT,
858 StrCat("Message too deep. Max recursion depth reached for key '",
859 key, "'"));
860 }
861 return util::Status();
862 }
863
SkipWhitespace()864 void JsonStreamParser::SkipWhitespace() {
865 while (!p_.empty() && ascii_isspace(*p_.data())) {
866 Advance();
867 }
868 }
869
Advance()870 void JsonStreamParser::Advance() {
871 // Advance by moving one UTF8 character while making sure we don't go beyond
872 // the length of StringPiece.
873 p_.remove_prefix(std::min<int>(
874 p_.length(), UTF8FirstLetterNumBytes(p_.data(), p_.length())));
875 }
876
ParseKey()877 util::Status JsonStreamParser::ParseKey() {
878 StringPiece original = p_;
879
880 if (allow_permissive_key_naming_) {
881 if (!ConsumeKeyPermissive(&p_, &key_)) {
882 return ReportFailure("Invalid key or variable name.");
883 }
884 } else {
885 if (!ConsumeKey(&p_, &key_)) {
886 return ReportFailure("Invalid key or variable name.");
887 }
888 }
889
890 // If we consumed everything but expect more data, reset p_ and cancel since
891 // we can't know if the key was complete or not.
892 if (!finishing_ && p_.empty()) {
893 p_ = original;
894 return util::Status(util::error::CANCELLED, "");
895 }
896 // Since we aren't using the key storage, clear it out.
897 key_storage_.clear();
898 return util::Status();
899 }
900
GetNextTokenType()901 JsonStreamParser::TokenType JsonStreamParser::GetNextTokenType() {
902 SkipWhitespace();
903
904 int size = p_.size();
905 if (size == 0) {
906 // If we ran out of data, report unknown and we'll place the previous parse
907 // type onto the stack and try again when we have more data.
908 return UNKNOWN;
909 }
910 // TODO(sven): Split this method based on context since different contexts
911 // support different tokens. Would slightly speed up processing?
912 const char* data = p_.data();
913 StringPiece data_view = StringPiece(data, size);
914 if (*data == '\"' || *data == '\'') return BEGIN_STRING;
915 if (*data == '-' || ('0' <= *data && *data <= '9')) {
916 return BEGIN_NUMBER;
917 }
918 if (size >= kKeywordTrue.length() &&
919 HasPrefixString(data_view, kKeywordTrue)) {
920 return BEGIN_TRUE;
921 }
922 if (size >= kKeywordFalse.length() &&
923 HasPrefixString(data_view, kKeywordFalse)) {
924 return BEGIN_FALSE;
925 }
926 if (size >= kKeywordNull.length() &&
927 HasPrefixString(data_view, kKeywordNull)) {
928 return BEGIN_NULL;
929 }
930 if (*data == '{') return BEGIN_OBJECT;
931 if (*data == '}') return END_OBJECT;
932 if (*data == '[') return BEGIN_ARRAY;
933 if (*data == ']') return END_ARRAY;
934 if (*data == ':') return ENTRY_SEPARATOR;
935 if (*data == ',') return VALUE_SEPARATOR;
936 if (MatchKey(p_)) {
937 return BEGIN_KEY;
938 }
939
940 // We don't know that we necessarily have an invalid token here, just that we
941 // can't parse what we have so far. So we don't report an error and just
942 // return UNKNOWN so we can try again later when we have more data, or if we
943 // finish and we have leftovers.
944 return UNKNOWN;
945 }
946
947 } // namespace converter
948 } // namespace util
949 } // namespace protobuf
950 } // namespace google
951