1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 #include <google/protobuf/util/internal/json_stream_parser.h>
32
33 #include <algorithm>
34 #include <cctype>
35 #include <cerrno>
36 #include <cstdlib>
37 #include <cstring>
38 #include <memory>
39
40 #include <google/protobuf/stubs/logging.h>
41 #include <google/protobuf/stubs/common.h>
42 #include <google/protobuf/stubs/strutil.h>
43
44 #include <google/protobuf/util/internal/object_writer.h>
45 #include <google/protobuf/util/internal/json_escaping.h>
46 #include <google/protobuf/stubs/mathlimits.h>
47
48
49 namespace google {
50 namespace protobuf {
51 namespace util {
52
53 // Allow these symbols to be referenced as util::Status, util::error::* in
54 // this file.
55 using util::Status;
56 namespace error {
57 using util::error::CANCELLED;
58 using util::error::INTERNAL;
59 using util::error::INVALID_ARGUMENT;
60 } // namespace error
61
62 namespace converter {
63
// Number of characters in an escaped UTF-16 code unit: '\\' 'u' X X X X.
static const int kUnicodeEscapedLength = 6;

// Value max_recursion_depth_ is initialized with by the constructor.
static const int kDefaultMaxRecursionDepth = 100;

// Character counts of the JSON literals true, false, and null. sizeof on a
// string literal includes the terminating NUL, hence the "- 1".
static const int true_len = sizeof("true") - 1;
static const int false_len = sizeof("false") - 1;
static const int null_len = sizeof("null") - 1;
73
// Returns true if |c| is an ASCII letter, an underscore, or a dollar sign.
// These are the characters allowed to start an unquoted object key.
inline bool IsLetter(char c) {
  if (c == '_' || c == '$') return true;
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
78
IsAlphanumeric(char c)79 inline bool IsAlphanumeric(char c) {
80 return IsLetter(c) || ('0' <= c && c <= '9');
81 }
82
ConsumeKey(StringPiece * input,StringPiece * key)83 static bool ConsumeKey(StringPiece* input, StringPiece* key) {
84 if (input->empty() || !IsLetter((*input)[0])) return false;
85 int len = 1;
86 for (; len < input->size(); ++len) {
87 if (!IsAlphanumeric((*input)[len])) {
88 break;
89 }
90 }
91 *key = StringPiece(input->data(), len);
92 *input = StringPiece(input->data() + len, input->size() - len);
93 return true;
94 }
95
MatchKey(StringPiece input)96 static bool MatchKey(StringPiece input) {
97 return !input.empty() && IsLetter(input[0]);
98 }
99
// Creates a parser that reports everything it parses to |ow|.
//
// All option flags start out false, the recursion limit starts at
// kDefaultMaxRecursionDepth, and the parse stack is seeded so that the first
// thing the parser expects is a single JSON value.
JsonStreamParser::JsonStreamParser(ObjectWriter* ow)
    : ow_(ow),
      stack_(),
      leftover_(),
      json_(),
      p_(),
      key_(),
      key_storage_(),
      finishing_(false),
      parsed_(),
      parsed_storage_(),
      // 0 means "not currently inside a string"; otherwise this holds the
      // quote character that opened the string being parsed.
      string_open_(0),
      chunk_storage_(),
      coerce_to_utf8_(false),
      allow_empty_null_(false),
      loose_float_number_conversion_(false),
      recursion_depth_(0),
      max_recursion_depth_(kDefaultMaxRecursionDepth) {
  // Initialize the stack with a single value to be parsed.
  stack_.push(VALUE);
}
121
// Empty body: no explicit cleanup is performed here.
JsonStreamParser::~JsonStreamParser() {}
123
124
Parse(StringPiece json)125 util::Status JsonStreamParser::Parse(StringPiece json) {
126 StringPiece chunk = json;
127 // If we have leftovers from a previous chunk, append the new chunk to it
128 // and create a new StringPiece pointing at the string's data. This could
129 // be large but we rely on the chunks to be small, assuming they are
130 // fragments of a Cord.
131 if (!leftover_.empty()) {
132 // Don't point chunk to leftover_ because leftover_ will be updated in
133 // ParseChunk(chunk).
134 chunk_storage_.swap(leftover_);
135 StrAppend(&chunk_storage_, json);
136 chunk = StringPiece(chunk_storage_);
137 }
138
139 // Find the structurally valid UTF8 prefix and parse only that.
140 int n = internal::UTF8SpnStructurallyValid(chunk);
141 if (n > 0) {
142 util::Status status = ParseChunk(chunk.substr(0, n));
143
144 // Any leftover characters are stashed in leftover_ for later parsing when
145 // there is more data available.
146 StrAppend(&leftover_, chunk.substr(n));
147 return status;
148 } else {
149 leftover_.assign(chunk.data(), chunk.size());
150 return util::Status();
151 }
152 }
153
FinishParse()154 util::Status JsonStreamParser::FinishParse() {
155 // If we do not expect anything and there is nothing left to parse we're all
156 // done.
157 if (stack_.empty() && leftover_.empty()) {
158 return util::Status();
159 }
160
161 // Storage for UTF8-coerced string.
162 std::unique_ptr<char[]> utf8;
163 if (coerce_to_utf8_) {
164 utf8.reset(new char[leftover_.size()]);
165 char* coerced = internal::UTF8CoerceToStructurallyValid(leftover_, utf8.get(), ' ');
166 p_ = json_ = StringPiece(coerced, leftover_.size());
167 } else {
168 p_ = json_ = leftover_;
169 if (!internal::IsStructurallyValidUTF8(leftover_)) {
170 return ReportFailure("Encountered non UTF-8 code points.");
171 }
172 }
173
174 // Parse the remainder in finishing mode, which reports errors for things like
175 // unterminated strings or unknown tokens that would normally be retried.
176 finishing_ = true;
177 util::Status result = RunParser();
178 if (result.ok()) {
179 SkipWhitespace();
180 if (!p_.empty()) {
181 result = ReportFailure("Parsing terminated before end of input.");
182 }
183 }
184 return result;
185 }
186
ParseChunk(StringPiece chunk)187 util::Status JsonStreamParser::ParseChunk(StringPiece chunk) {
188 // Do not do any work if the chunk is empty.
189 if (chunk.empty()) return util::Status();
190
191 p_ = json_ = chunk;
192
193 finishing_ = false;
194 util::Status result = RunParser();
195 if (!result.ok()) return result;
196
197 SkipWhitespace();
198 if (p_.empty()) {
199 // If we parsed everything we had, clear the leftover.
200 leftover_.clear();
201 } else {
202 // If we do not expect anything i.e. stack is empty, and we have non-empty
203 // string left to parse, we report an error.
204 if (stack_.empty()) {
205 return ReportFailure("Parsing terminated before end of input.");
206 }
207 // If we expect future data i.e. stack is non-empty, and we have some
208 // unparsed data left, we save it for later parse.
209 leftover_ = std::string(p_);
210 }
211 return util::Status();
212 }
213
RunParser()214 util::Status JsonStreamParser::RunParser() {
215 while (!stack_.empty()) {
216 ParseType type = stack_.top();
217 TokenType t = (string_open_ == 0) ? GetNextTokenType() : BEGIN_STRING;
218 stack_.pop();
219 util::Status result;
220 switch (type) {
221 case VALUE:
222 result = ParseValue(t);
223 break;
224
225 case OBJ_MID:
226 result = ParseObjectMid(t);
227 break;
228
229 case ENTRY:
230 result = ParseEntry(t);
231 break;
232
233 case ENTRY_MID:
234 result = ParseEntryMid(t);
235 break;
236
237 case ARRAY_VALUE:
238 result = ParseArrayValue(t);
239 break;
240
241 case ARRAY_MID:
242 result = ParseArrayMid(t);
243 break;
244
245 default:
246 result = util::Status(util::error::INTERNAL,
247 StrCat("Unknown parse type: ", type));
248 break;
249 }
250 if (!result.ok()) {
251 // If we were cancelled, save our state and try again later.
252 if (!finishing_ &&
253 result == util::Status(util::error::CANCELLED, "")) {
254 stack_.push(type);
255 // If we have a key we still need to render, make sure to save off the
256 // contents in our own storage.
257 if (!key_.empty() && key_storage_.empty()) {
258 StrAppend(&key_storage_, key_);
259 key_ = StringPiece(key_storage_);
260 }
261 result = util::Status();
262 }
263 return result;
264 }
265 }
266 return util::Status();
267 }
268
ParseValue(TokenType type)269 util::Status JsonStreamParser::ParseValue(TokenType type) {
270 switch (type) {
271 case BEGIN_OBJECT:
272 return HandleBeginObject();
273 case BEGIN_ARRAY:
274 return HandleBeginArray();
275 case BEGIN_STRING:
276 return ParseString();
277 case BEGIN_NUMBER:
278 return ParseNumber();
279 case BEGIN_TRUE:
280 return ParseTrue();
281 case BEGIN_FALSE:
282 return ParseFalse();
283 case BEGIN_NULL:
284 return ParseNull();
285 case UNKNOWN:
286 return ReportUnknown("Expected a value.");
287 default: {
288 if (allow_empty_null_ && IsEmptyNullAllowed(type)) {
289 return ParseEmptyNull();
290 }
291
292 // Special case for having been cut off while parsing, wait for more data.
293 // This handles things like 'fals' being at the end of the string, we
294 // don't know if the next char would be e, completing it, or something
295 // else, making it invalid.
296 if (!finishing_ && p_.length() < false_len) {
297 return util::Status(util::error::CANCELLED, "");
298 }
299 return ReportFailure("Unexpected token.");
300 }
301 }
302 }
303
ParseString()304 util::Status JsonStreamParser::ParseString() {
305 util::Status result = ParseStringHelper();
306 if (result.ok()) {
307 ow_->RenderString(key_, parsed_);
308 key_ = StringPiece();
309 parsed_ = StringPiece();
310 parsed_storage_.clear();
311 }
312 return result;
313 }
314
// Parses a quoted string starting at (or resuming inside) p_.
//
// The string may be delimited by either double or single quotes; the opening
// quote is remembered in string_open_ and only the same character closes the
// string. Backslash escapes are decoded, with \uXXXX handled by
// ParseUnicodeEscape().
//
// On success, parsed_ views the decoded contents (directly in the input if no
// copying was needed, otherwise in parsed_storage_), string_open_ is reset to
// 0, and p_ is positioned after the closing quote.
//
// If the input ends before the closing quote:
//   - outside finishing mode, the text scanned so far is kept in
//     parsed_storage_, string_open_ stays set so the string can be resumed,
//     and CANCELLED is returned;
//   - in finishing mode an error is reported.
util::Status JsonStreamParser::ParseStringHelper() {
  // If we haven't seen the start quote, grab it and remember it for later.
  if (string_open_ == 0) {
    string_open_ = *p_.data();
    GOOGLE_DCHECK(string_open_ == '\"' || string_open_ == '\'');
    Advance();
  }
  // Track where we last copied data from so we can minimize copying.
  const char* last = p_.data();
  while (!p_.empty()) {
    const char* data = p_.data();
    if (*data == '\\') {
      // We're about to handle an escape, copy all bytes from last to data.
      if (last < data) {
        parsed_storage_.append(last, data - last);
      }
      // If we ran out of string after the \, cancel or report an error
      // depending on if we expect more data later.
      if (p_.length() == 1) {
        if (!finishing_) {
          return util::Status(util::error::CANCELLED, "");
        }
        return ReportFailure("Closing quote expected in string.");
      }
      // Parse a unicode escape if we found \u in the string.
      if (data[1] == 'u') {
        util::Status result = ParseUnicodeEscape();
        if (!result.ok()) {
          return result;
        }
        // Move last pointer past the unicode escape and continue.
        last = p_.data();
        continue;
      }
      // Handle the standard set of backslash-escaped characters.
      switch (data[1]) {
        case 'b':
          parsed_storage_.push_back('\b');
          break;
        case 'f':
          parsed_storage_.push_back('\f');
          break;
        case 'n':
          parsed_storage_.push_back('\n');
          break;
        case 'r':
          parsed_storage_.push_back('\r');
          break;
        case 't':
          parsed_storage_.push_back('\t');
          break;
        case 'v':
          parsed_storage_.push_back('\v');
          break;
        default:
          // Any other escaped character (quotes, backslash, slash, ...) is
          // taken literally.
          parsed_storage_.push_back(data[1]);
      }
      // We handled two characters, so advance past them and continue.
      p_.remove_prefix(2);
      last = p_.data();
      continue;
    }
    // If we found the closing quote note it, advance past it, and return.
    if (*data == string_open_) {
      // If we didn't copy anything, reuse the input buffer.
      if (parsed_storage_.empty()) {
        parsed_ = StringPiece(last, data - last);
      } else {
        // Flush the tail that follows the last escape before publishing the
        // copied string.
        if (last < data) {
          parsed_storage_.append(last, data - last);
        }
        parsed_ = StringPiece(parsed_storage_);
      }
      // Clear the quote char so next time we try to parse a string we'll
      // start fresh.
      string_open_ = 0;
      Advance();
      return util::Status();
    }
    // Normal character, just advance past it.
    Advance();
  }
  // If we ran out of characters, copy over what we have so far.
  if (last < p_.data()) {
    parsed_storage_.append(last, p_.data() - last);
  }
  // If we didn't find the closing quote but we expect more data, cancel for now
  if (!finishing_) {
    return util::Status(util::error::CANCELLED, "");
  }
  // End of string reached without a closing quote, report an error.
  string_open_ = 0;
  return ReportFailure("Closing quote expected in string.");
}
409
410 // Converts a unicode escaped character to a decimal value stored in a char32
411 // for use in UTF8 encoding utility. We assume that str begins with \uhhhh and
412 // convert that from the hex number to a decimal value.
413 //
414 // There are some security exploits with UTF-8 that we should be careful of:
415 // - http://www.unicode.org/reports/tr36/#UTF-8_Exploit
416 // - http://sites/intl-eng/design-guide/core-application
ParseUnicodeEscape()417 util::Status JsonStreamParser::ParseUnicodeEscape() {
418 if (p_.length() < kUnicodeEscapedLength) {
419 if (!finishing_) {
420 return util::Status(util::error::CANCELLED, "");
421 }
422 return ReportFailure("Illegal hex string.");
423 }
424 GOOGLE_DCHECK_EQ('\\', p_.data()[0]);
425 GOOGLE_DCHECK_EQ('u', p_.data()[1]);
426 uint32 code = 0;
427 for (int i = 2; i < kUnicodeEscapedLength; ++i) {
428 if (!isxdigit(p_.data()[i])) {
429 return ReportFailure("Invalid escape sequence.");
430 }
431 code = (code << 4) + hex_digit_to_int(p_.data()[i]);
432 }
433 if (code >= JsonEscaping::kMinHighSurrogate &&
434 code <= JsonEscaping::kMaxHighSurrogate) {
435 if (p_.length() < 2 * kUnicodeEscapedLength) {
436 if (!finishing_) {
437 return util::Status(util::error::CANCELLED, "");
438 }
439 if (!coerce_to_utf8_) {
440 return ReportFailure("Missing low surrogate.");
441 }
442 } else if (p_.data()[kUnicodeEscapedLength] == '\\' &&
443 p_.data()[kUnicodeEscapedLength + 1] == 'u') {
444 uint32 low_code = 0;
445 for (int i = kUnicodeEscapedLength + 2; i < 2 * kUnicodeEscapedLength;
446 ++i) {
447 if (!isxdigit(p_.data()[i])) {
448 return ReportFailure("Invalid escape sequence.");
449 }
450 low_code = (low_code << 4) + hex_digit_to_int(p_.data()[i]);
451 }
452 if (low_code >= JsonEscaping::kMinLowSurrogate &&
453 low_code <= JsonEscaping::kMaxLowSurrogate) {
454 // Convert UTF-16 surrogate pair to 21-bit Unicode codepoint.
455 code = (((code & 0x3FF) << 10) | (low_code & 0x3FF)) +
456 JsonEscaping::kMinSupplementaryCodePoint;
457 // Advance past the first code unit escape.
458 p_.remove_prefix(kUnicodeEscapedLength);
459 } else if (!coerce_to_utf8_) {
460 return ReportFailure("Invalid low surrogate.");
461 }
462 } else if (!coerce_to_utf8_) {
463 return ReportFailure("Missing low surrogate.");
464 }
465 }
466 if (!coerce_to_utf8_ && !IsValidCodePoint(code)) {
467 return ReportFailure("Invalid unicode code point.");
468 }
469 char buf[UTFmax];
470 int len = EncodeAsUTF8Char(code, buf);
471 // Advance past the [final] code unit escape.
472 p_.remove_prefix(kUnicodeEscapedLength);
473 parsed_storage_.append(buf, len);
474 return util::Status();
475 }
476
ParseNumber()477 util::Status JsonStreamParser::ParseNumber() {
478 NumberResult number;
479 util::Status result = ParseNumberHelper(&number);
480 if (result.ok()) {
481 switch (number.type) {
482 case NumberResult::DOUBLE:
483 ow_->RenderDouble(key_, number.double_val);
484 key_ = StringPiece();
485 break;
486
487 case NumberResult::INT:
488 ow_->RenderInt64(key_, number.int_val);
489 key_ = StringPiece();
490 break;
491
492 case NumberResult::UINT:
493 ow_->RenderUint64(key_, number.uint_val);
494 key_ = StringPiece();
495 break;
496
497 default:
498 return ReportFailure("Unable to parse number.");
499 }
500 }
501 return result;
502 }
503
ParseDoubleHelper(const std::string & number,NumberResult * result)504 util::Status JsonStreamParser::ParseDoubleHelper(const std::string& number,
505 NumberResult* result) {
506 if (!safe_strtod(number, &result->double_val)) {
507 return ReportFailure("Unable to parse number.");
508 }
509 if (!loose_float_number_conversion_ &&
510 !MathLimits<double>::IsFinite(result->double_val)) {
511 return ReportFailure("Number exceeds the range of double.");
512 }
513 result->type = NumberResult::DOUBLE;
514 return util::Status();
515 }
516
ParseNumberHelper(NumberResult * result)517 util::Status JsonStreamParser::ParseNumberHelper(NumberResult* result) {
518 const char* data = p_.data();
519 int length = p_.length();
520
521 // Look for the first non-numeric character, or the end of the string.
522 int index = 0;
523 bool floating = false;
524 bool negative = data[index] == '-';
525 // Find the first character that cannot be part of the number. Along the way
526 // detect if the number needs to be parsed as a double.
527 // Note that this restricts numbers to the JSON specification, so for example
528 // we do not support hex or octal notations.
529 for (; index < length; ++index) {
530 char c = data[index];
531 if (isdigit(c)) continue;
532 if (c == '.' || c == 'e' || c == 'E') {
533 floating = true;
534 continue;
535 }
536 if (c == '+' || c == '-' || c == 'x') continue;
537 // Not a valid number character, break out.
538 break;
539 }
540
541 // If the entire input is a valid number, and we may have more content in the
542 // future, we abort for now and resume when we know more.
543 if (index == length && !finishing_) {
544 return util::Status(util::error::CANCELLED, "");
545 }
546
547 // Create a string containing just the number, so we can use safe_strtoX
548 std::string number = std::string(p_.substr(0, index));
549
550 // Floating point number, parse as a double.
551 if (floating) {
552 util::Status status = ParseDoubleHelper(number, result);
553 if (status.ok()) {
554 p_.remove_prefix(index);
555 }
556 return status;
557 }
558
559 // Positive non-floating point number, parse as a uint64.
560 if (!negative) {
561 // Octal/Hex numbers are not valid JSON values.
562 if (number.length() >= 2 && number[0] == '0') {
563 return ReportFailure("Octal/hex numbers are not valid JSON values.");
564 }
565 if (safe_strtou64(number, &result->uint_val)) {
566 result->type = NumberResult::UINT;
567 p_.remove_prefix(index);
568 return util::Status();
569 } else {
570 // If the value is too large, parse it as double.
571 util::Status status = ParseDoubleHelper(number, result);
572 if (status.ok()) {
573 p_.remove_prefix(index);
574 }
575 return status;
576 }
577 }
578
579 // Octal/Hex numbers are not valid JSON values.
580 if (number.length() >= 3 && number[1] == '0') {
581 return ReportFailure("Octal/hex numbers are not valid JSON values.");
582 }
583 // Negative non-floating point number, parse as an int64.
584 if (safe_strto64(number, &result->int_val)) {
585 result->type = NumberResult::INT;
586 p_.remove_prefix(index);
587 return util::Status();
588 } else {
589 // If the value is too large, parse it as double.
590 util::Status status = ParseDoubleHelper(number, result);
591 if (status.ok()) {
592 p_.remove_prefix(index);
593 }
594 return status;
595 }
596 }
597
HandleBeginObject()598 util::Status JsonStreamParser::HandleBeginObject() {
599 GOOGLE_DCHECK_EQ('{', *p_.data());
600 Advance();
601 ow_->StartObject(key_);
602 auto status = IncrementRecursionDepth(key_);
603 if (!status.ok()) {
604 return status;
605 }
606 key_ = StringPiece();
607 stack_.push(ENTRY);
608 return util::Status();
609 }
610
ParseObjectMid(TokenType type)611 util::Status JsonStreamParser::ParseObjectMid(TokenType type) {
612 if (type == UNKNOWN) {
613 return ReportUnknown("Expected , or } after key:value pair.");
614 }
615
616 // Object is complete, advance past the comma and render the EndObject.
617 if (type == END_OBJECT) {
618 Advance();
619 ow_->EndObject();
620 --recursion_depth_;
621 return util::Status();
622 }
623 // Found a comma, advance past it and get ready for an entry.
624 if (type == VALUE_SEPARATOR) {
625 Advance();
626 stack_.push(ENTRY);
627 return util::Status();
628 }
629 // Illegal token after key:value pair.
630 return ReportFailure("Expected , or } after key:value pair.");
631 }
632
ParseEntry(TokenType type)633 util::Status JsonStreamParser::ParseEntry(TokenType type) {
634 if (type == UNKNOWN) {
635 return ReportUnknown("Expected an object key or }.");
636 }
637
638 // Close the object and return. This allows for trailing commas.
639 if (type == END_OBJECT) {
640 ow_->EndObject();
641 Advance();
642 --recursion_depth_;
643 return util::Status();
644 }
645
646 util::Status result;
647 if (type == BEGIN_STRING) {
648 // Key is a string (standard JSON), parse it and store the string.
649 result = ParseStringHelper();
650 if (result.ok()) {
651 key_storage_.clear();
652 if (!parsed_storage_.empty()) {
653 parsed_storage_.swap(key_storage_);
654 key_ = StringPiece(key_storage_);
655 } else {
656 key_ = parsed_;
657 }
658 parsed_ = StringPiece();
659 }
660 } else if (type == BEGIN_KEY) {
661 // Key is a bare key (back compat), create a StringPiece pointing to it.
662 result = ParseKey();
663 } else {
664 // Unknown key type, report an error.
665 result = ReportFailure("Expected an object key or }.");
666 }
667 // On success we next expect an entry mid ':' then an object mid ',' or '}'
668 if (result.ok()) {
669 stack_.push(OBJ_MID);
670 stack_.push(ENTRY_MID);
671 }
672 return result;
673 }
674
ParseEntryMid(TokenType type)675 util::Status JsonStreamParser::ParseEntryMid(TokenType type) {
676 if (type == UNKNOWN) {
677 return ReportUnknown("Expected : between key:value pair.");
678 }
679 if (type == ENTRY_SEPARATOR) {
680 Advance();
681 stack_.push(VALUE);
682 return util::Status();
683 }
684 return ReportFailure("Expected : between key:value pair.");
685 }
686
HandleBeginArray()687 util::Status JsonStreamParser::HandleBeginArray() {
688 GOOGLE_DCHECK_EQ('[', *p_.data());
689 Advance();
690 ow_->StartList(key_);
691 key_ = StringPiece();
692 stack_.push(ARRAY_VALUE);
693 return util::Status();
694 }
695
ParseArrayValue(TokenType type)696 util::Status JsonStreamParser::ParseArrayValue(TokenType type) {
697 if (type == UNKNOWN) {
698 return ReportUnknown("Expected a value or ] within an array.");
699 }
700
701 if (type == END_ARRAY) {
702 ow_->EndList();
703 Advance();
704 return util::Status();
705 }
706
707 // The ParseValue call may push something onto the stack so we need to make
708 // sure an ARRAY_MID is after it, so we push it on now. Also, the parsing of
709 // empty-null array value is relying on this ARRAY_MID token.
710 stack_.push(ARRAY_MID);
711 util::Status result = ParseValue(type);
712 if (result == util::Status(util::error::CANCELLED, "")) {
713 // If we were cancelled, pop back off the ARRAY_MID so we don't try to
714 // push it on again when we try over.
715 stack_.pop();
716 }
717 return result;
718 }
719
ParseArrayMid(TokenType type)720 util::Status JsonStreamParser::ParseArrayMid(TokenType type) {
721 if (type == UNKNOWN) {
722 return ReportUnknown("Expected , or ] after array value.");
723 }
724
725 if (type == END_ARRAY) {
726 ow_->EndList();
727 Advance();
728 return util::Status();
729 }
730
731 // Found a comma, advance past it and expect an array value next.
732 if (type == VALUE_SEPARATOR) {
733 Advance();
734 stack_.push(ARRAY_VALUE);
735 return util::Status();
736 }
737 // Illegal token after array value.
738 return ReportFailure("Expected , or ] after array value.");
739 }
740
ParseTrue()741 util::Status JsonStreamParser::ParseTrue() {
742 ow_->RenderBool(key_, true);
743 key_ = StringPiece();
744 p_.remove_prefix(true_len);
745 return util::Status();
746 }
747
ParseFalse()748 util::Status JsonStreamParser::ParseFalse() {
749 ow_->RenderBool(key_, false);
750 key_ = StringPiece();
751 p_.remove_prefix(false_len);
752 return util::Status();
753 }
754
ParseNull()755 util::Status JsonStreamParser::ParseNull() {
756 ow_->RenderNull(key_);
757 key_ = StringPiece();
758 p_.remove_prefix(null_len);
759 return util::Status();
760 }
761
ParseEmptyNull()762 util::Status JsonStreamParser::ParseEmptyNull() {
763 ow_->RenderNull(key_);
764 key_ = StringPiece();
765 return util::Status();
766 }
767
IsEmptyNullAllowed(TokenType type)768 bool JsonStreamParser::IsEmptyNullAllowed(TokenType type) {
769 if (stack_.empty()) return false;
770 return (stack_.top() == ARRAY_MID && type == VALUE_SEPARATOR) ||
771 stack_.top() == OBJ_MID;
772 }
773
ReportFailure(StringPiece message)774 util::Status JsonStreamParser::ReportFailure(StringPiece message) {
775 static const int kContextLength = 20;
776 const char* p_start = p_.data();
777 const char* json_start = json_.data();
778 const char* begin = std::max(p_start - kContextLength, json_start);
779 const char* end =
780 std::min(p_start + kContextLength, json_start + json_.size());
781 StringPiece segment(begin, end - begin);
782 std::string location(p_start - begin, ' ');
783 location.push_back('^');
784 return util::Status(util::error::INVALID_ARGUMENT,
785 StrCat(message, "\n", segment, "\n", location));
786 }
787
ReportUnknown(StringPiece message)788 util::Status JsonStreamParser::ReportUnknown(StringPiece message) {
789 // If we aren't finishing the parse, cancel parsing and try later.
790 if (!finishing_) {
791 return util::Status(util::error::CANCELLED, "");
792 }
793 if (p_.empty()) {
794 return ReportFailure(StrCat("Unexpected end of string. ", message));
795 }
796 return ReportFailure(message);
797 }
798
IncrementRecursionDepth(StringPiece key) const799 util::Status JsonStreamParser::IncrementRecursionDepth(
800 StringPiece key) const {
801 if (++recursion_depth_ > max_recursion_depth_) {
802 return Status(
803 util::error::INVALID_ARGUMENT,
804 StrCat("Message too deep. Max recursion depth reached for key '",
805 key, "'"));
806 }
807 return util::Status();
808 }
809
SkipWhitespace()810 void JsonStreamParser::SkipWhitespace() {
811 while (!p_.empty() && ascii_isspace(*p_.data())) {
812 Advance();
813 }
814 }
815
Advance()816 void JsonStreamParser::Advance() {
817 // Advance by moving one UTF8 character while making sure we don't go beyond
818 // the length of StringPiece.
819 p_.remove_prefix(std::min<int>(
820 p_.length(), UTF8FirstLetterNumBytes(p_.data(), p_.length())));
821 }
822
ParseKey()823 util::Status JsonStreamParser::ParseKey() {
824 StringPiece original = p_;
825 if (!ConsumeKey(&p_, &key_)) {
826 return ReportFailure("Invalid key or variable name.");
827 }
828 // If we consumed everything but expect more data, reset p_ and cancel since
829 // we can't know if the key was complete or not.
830 if (!finishing_ && p_.empty()) {
831 p_ = original;
832 return util::Status(util::error::CANCELLED, "");
833 }
834 // Since we aren't using the key storage, clear it out.
835 key_storage_.clear();
836 return util::Status();
837 }
838
GetNextTokenType()839 JsonStreamParser::TokenType JsonStreamParser::GetNextTokenType() {
840 SkipWhitespace();
841
842 int size = p_.size();
843 if (size == 0) {
844 // If we ran out of data, report unknown and we'll place the previous parse
845 // type onto the stack and try again when we have more data.
846 return UNKNOWN;
847 }
848 // TODO(sven): Split this method based on context since different contexts
849 // support different tokens. Would slightly speed up processing?
850 const char* data = p_.data();
851 if (*data == '\"' || *data == '\'') return BEGIN_STRING;
852 if (*data == '-' || ('0' <= *data && *data <= '9')) {
853 return BEGIN_NUMBER;
854 }
855 if (size >= true_len && !strncmp(data, "true", true_len)) {
856 return BEGIN_TRUE;
857 }
858 if (size >= false_len && !strncmp(data, "false", false_len)) {
859 return BEGIN_FALSE;
860 }
861 if (size >= null_len && !strncmp(data, "null", null_len)) {
862 return BEGIN_NULL;
863 }
864 if (*data == '{') return BEGIN_OBJECT;
865 if (*data == '}') return END_OBJECT;
866 if (*data == '[') return BEGIN_ARRAY;
867 if (*data == ']') return END_ARRAY;
868 if (*data == ':') return ENTRY_SEPARATOR;
869 if (*data == ',') return VALUE_SEPARATOR;
870 if (MatchKey(p_)) {
871 return BEGIN_KEY;
872 }
873
874 // We don't know that we necessarily have an invalid token here, just that we
875 // can't parse what we have so far. So we don't report an error and just
876 // return UNKNOWN so we can try again later when we have more data, or if we
877 // finish and we have leftovers.
878 return UNKNOWN;
879 }
880
881 } // namespace converter
882 } // namespace util
883 } // namespace protobuf
884 } // namespace google
885