• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 #include <google/protobuf/util/internal/json_stream_parser.h>
32 
33 #include <algorithm>
34 #include <cctype>
35 #include <cerrno>
36 #include <cmath>
37 #include <cstdlib>
38 #include <cstring>
39 #include <limits>
40 #include <memory>
41 
42 #include <google/protobuf/stubs/logging.h>
43 #include <google/protobuf/stubs/common.h>
44 #include <google/protobuf/stubs/strutil.h>
45 #include <google/protobuf/util/internal/object_writer.h>
46 #include <google/protobuf/util/internal/json_escaping.h>
47 
48 
49 namespace google {
50 namespace protobuf {
51 namespace util {
52 
53 // Allow these symbols to be referenced as util::Status, util::error::* in
54 // this file.
55 using util::Status;
56 namespace error {
57 using util::error::CANCELLED;
58 using util::error::INTERNAL;
59 using util::error::INVALID_ARGUMENT;
60 }  // namespace error
61 
62 namespace converter {
63 
64 // Number of digits in an escaped UTF-16 code unit ('\\' 'u' X X X X)
65 static const int kUnicodeEscapedLength = 6;
66 
67 static const int kDefaultMaxRecursionDepth = 100;
68 
69 // These cannot be constexpr for portability with VS2015.
70 static const StringPiece kKeywordTrue = "true";
71 static const StringPiece kKeywordFalse = "false";
72 static const StringPiece kKeywordNull = "null";
73 
// Returns true when c may start a bare (unquoted) key: an ASCII letter,
// an underscore, or a dollar sign.
inline bool IsLetter(char c) {
  if (c == '_' || c == '$') return true;
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
78 
IsAlphanumeric(char c)79 inline bool IsAlphanumeric(char c) {
80   return IsLetter(c) || ('0' <= c && c <= '9');
81 }
82 
83 // Indicates a character may not be part of an unquoted key.
IsKeySeparator(char c)84 inline bool IsKeySeparator(char c) {
85   return (ascii_isspace(c) || c == '"' || c == '\'' || c == '{' ||
86           c == '}' || c == '[' || c == ']' || c == ':' || c == ',');
87 }
88 
ReplaceInvalidCodePoints(StringPiece str,const std::string & replacement,std::string * dst)89 inline void ReplaceInvalidCodePoints(StringPiece str,
90                                      const std::string& replacement,
91                                      std::string* dst) {
92   while (!str.empty()) {
93     int n_valid_bytes = internal::UTF8SpnStructurallyValid(str);
94     StringPiece valid_part = str.substr(0, n_valid_bytes);
95     StrAppend(dst, valid_part);
96 
97     if (n_valid_bytes == str.size()) {
98       break;
99     }
100 
101     // Append replacement value.
102     StrAppend(dst, replacement);
103 
104     // Move past valid bytes + one invalid byte.
105     str.remove_prefix(n_valid_bytes + 1);
106   }
107 }
108 
ConsumeKey(StringPiece * input,StringPiece * key)109 static bool ConsumeKey(StringPiece* input, StringPiece* key) {
110   if (input->empty() || !IsLetter((*input)[0])) return false;
111   int len = 1;
112   for (; len < input->size(); ++len) {
113     if (!IsAlphanumeric((*input)[len])) {
114       break;
115     }
116   }
117   *key = StringPiece(input->data(), len);
118   *input = StringPiece(input->data() + len, input->size() - len);
119   return true;
120 }
121 
122 // Same as 'ConsumeKey', but allows a widened set of key characters.
ConsumeKeyPermissive(StringPiece * input,StringPiece * key)123 static bool ConsumeKeyPermissive(StringPiece* input,
124                                  StringPiece* key) {
125   if (input->empty() || !IsLetter((*input)[0])) return false;
126   int len = 1;
127   for (; len < input->size(); ++len) {
128     if (IsKeySeparator((*input)[len])) {
129       break;
130     }
131   }
132   *key = StringPiece(input->data(), len);
133   *input = StringPiece(input->data() + len, input->size() - len);
134   return true;
135 }
136 
MatchKey(StringPiece input)137 static bool MatchKey(StringPiece input) {
138   return !input.empty() && IsLetter(input[0]);
139 }
140 
JsonStreamParser(ObjectWriter * ow)141 JsonStreamParser::JsonStreamParser(ObjectWriter* ow)
142     : ow_(ow),
143       stack_(),
144       leftover_(),
145       json_(),
146       p_(),
147       key_(),
148       key_storage_(),
149       finishing_(false),
150       parsed_(),
151       parsed_storage_(),
152       string_open_(0),
153       chunk_storage_(),
154       coerce_to_utf8_(false),
155       utf8_replacement_character_(" "),
156       allow_empty_null_(false),
157       allow_permissive_key_naming_(false),
158       loose_float_number_conversion_(false),
159       recursion_depth_(0),
160       max_recursion_depth_(kDefaultMaxRecursionDepth) {
161   // Initialize the stack with a single value to be parsed.
162   stack_.push(VALUE);
163 }
164 
~JsonStreamParser()165 JsonStreamParser::~JsonStreamParser() {}
166 
167 
Parse(StringPiece json)168 util::Status JsonStreamParser::Parse(StringPiece json) {
169   StringPiece chunk = json;
170   // If we have leftovers from a previous chunk, append the new chunk to it
171   // and create a new StringPiece pointing at the string's data. This could
172   // be large but we rely on the chunks to be small, assuming they are
173   // fragments of a Cord.
174   if (!leftover_.empty()) {
175     // Don't point chunk to leftover_ because leftover_ will be updated in
176     // ParseChunk(chunk).
177     chunk_storage_.swap(leftover_);
178     StrAppend(&chunk_storage_, json);
179     chunk = StringPiece(chunk_storage_);
180   }
181 
182   // Find the structurally valid UTF8 prefix and parse only that.
183   int n = internal::UTF8SpnStructurallyValid(chunk);
184   if (n > 0) {
185     util::Status status = ParseChunk(chunk.substr(0, n));
186 
187     // Any leftover characters are stashed in leftover_ for later parsing when
188     // there is more data available.
189     StrAppend(&leftover_, chunk.substr(n));
190     return status;
191   } else {
192     leftover_.assign(chunk.data(), chunk.size());
193     return util::Status();
194   }
195 }
196 
FinishParse()197 util::Status JsonStreamParser::FinishParse() {
198   // If we do not expect anything and there is nothing left to parse we're all
199   // done.
200   if (stack_.empty() && leftover_.empty()) {
201     return util::Status();
202   }
203 
204   // Lifetime needs to last until RunParser returns, so keep this variable
205   // outside of the coerce_to_utf8 block.
206   std::unique_ptr<std::string> scratch;
207 
208   bool is_valid_utf8 = internal::IsStructurallyValidUTF8(leftover_);
209   if (coerce_to_utf8_ && !is_valid_utf8) {
210     scratch.reset(new std::string);
211     scratch->reserve(leftover_.size() * utf8_replacement_character_.size());
212     ReplaceInvalidCodePoints(leftover_, utf8_replacement_character_,
213                              scratch.get());
214     p_ = json_ = *scratch;
215   } else {
216     p_ = json_ = leftover_;
217     if (!is_valid_utf8) {
218       return ReportFailure("Encountered non UTF-8 code points.");
219     }
220   }
221 
222   // Parse the remainder in finishing mode, which reports errors for things like
223   // unterminated strings or unknown tokens that would normally be retried.
224   finishing_ = true;
225   util::Status result = RunParser();
226   if (result.ok()) {
227     SkipWhitespace();
228     if (!p_.empty()) {
229       result = ReportFailure("Parsing terminated before end of input.");
230     }
231   }
232   return result;
233 }
234 
ParseChunk(StringPiece chunk)235 util::Status JsonStreamParser::ParseChunk(StringPiece chunk) {
236   // Do not do any work if the chunk is empty.
237   if (chunk.empty()) return util::Status();
238 
239   p_ = json_ = chunk;
240 
241   finishing_ = false;
242   util::Status result = RunParser();
243   if (!result.ok()) return result;
244 
245   SkipWhitespace();
246   if (p_.empty()) {
247     // If we parsed everything we had, clear the leftover.
248     leftover_.clear();
249   } else {
250     // If we do not expect anything i.e. stack is empty, and we have non-empty
251     // string left to parse, we report an error.
252     if (stack_.empty()) {
253       return ReportFailure("Parsing terminated before end of input.");
254     }
255     // If we expect future data i.e. stack is non-empty, and we have some
256     // unparsed data left, we save it for later parse.
257     leftover_ = std::string(p_);
258   }
259   return util::Status();
260 }
261 
RunParser()262 util::Status JsonStreamParser::RunParser() {
263   while (!stack_.empty()) {
264     ParseType type = stack_.top();
265     TokenType t = (string_open_ == 0) ? GetNextTokenType() : BEGIN_STRING;
266     stack_.pop();
267     util::Status result;
268     switch (type) {
269       case VALUE:
270         result = ParseValue(t);
271         break;
272 
273       case OBJ_MID:
274         result = ParseObjectMid(t);
275         break;
276 
277       case ENTRY:
278         result = ParseEntry(t);
279         break;
280 
281       case ENTRY_MID:
282         result = ParseEntryMid(t);
283         break;
284 
285       case ARRAY_VALUE:
286         result = ParseArrayValue(t);
287         break;
288 
289       case ARRAY_MID:
290         result = ParseArrayMid(t);
291         break;
292 
293       default:
294         result = util::Status(util::error::INTERNAL,
295                               StrCat("Unknown parse type: ", type));
296         break;
297     }
298     if (!result.ok()) {
299       // If we were cancelled, save our state and try again later.
300       if (!finishing_ &&
301           result == util::Status(util::error::CANCELLED, "")) {
302         stack_.push(type);
303         // If we have a key we still need to render, make sure to save off the
304         // contents in our own storage.
305         if (!key_.empty() && key_storage_.empty()) {
306           StrAppend(&key_storage_, key_);
307           key_ = StringPiece(key_storage_);
308         }
309         result = util::Status();
310       }
311       return result;
312     }
313   }
314   return util::Status();
315 }
316 
ParseValue(TokenType type)317 util::Status JsonStreamParser::ParseValue(TokenType type) {
318   switch (type) {
319     case BEGIN_OBJECT:
320       return HandleBeginObject();
321     case BEGIN_ARRAY:
322       return HandleBeginArray();
323     case BEGIN_STRING:
324       return ParseString();
325     case BEGIN_NUMBER:
326       return ParseNumber();
327     case BEGIN_TRUE:
328       return ParseTrue();
329     case BEGIN_FALSE:
330       return ParseFalse();
331     case BEGIN_NULL:
332       return ParseNull();
333     case UNKNOWN:
334       return ReportUnknown("Expected a value.");
335     default: {
336       if (allow_empty_null_ && IsEmptyNullAllowed(type)) {
337         return ParseEmptyNull();
338       }
339 
340       // Special case for having been cut off while parsing, wait for more data.
341       // This handles things like 'fals' being at the end of the string, we
342       // don't know if the next char would be e, completing it, or something
343       // else, making it invalid.
344       if (!finishing_ && p_.length() < kKeywordFalse.length()) {
345         return util::Status(util::error::CANCELLED, "");
346       }
347       return ReportFailure("Unexpected token.");
348     }
349   }
350 }
351 
ParseString()352 util::Status JsonStreamParser::ParseString() {
353   util::Status result = ParseStringHelper();
354   if (result.ok()) {
355     ow_->RenderString(key_, parsed_);
356     key_ = StringPiece();
357     parsed_ = StringPiece();
358     parsed_storage_.clear();
359   }
360   return result;
361 }
362 
ParseStringHelper()363 util::Status JsonStreamParser::ParseStringHelper() {
364   // If we haven't seen the start quote, grab it and remember it for later.
365   if (string_open_ == 0) {
366     string_open_ = *p_.data();
367     GOOGLE_DCHECK(string_open_ == '\"' || string_open_ == '\'');
368     Advance();
369   }
370   // Track where we last copied data from so we can minimize copying.
371   const char* last = p_.data();
372   while (!p_.empty()) {
373     const char* data = p_.data();
374     if (*data == '\\') {
375       // We're about to handle an escape, copy all bytes from last to data.
376       if (last < data) {
377         parsed_storage_.append(last, data - last);
378       }
379       // If we ran out of string after the \, cancel or report an error
380       // depending on if we expect more data later.
381       if (p_.length() == 1) {
382         if (!finishing_) {
383           return util::Status(util::error::CANCELLED, "");
384         }
385         return ReportFailure("Closing quote expected in string.");
386       }
387       // Parse a unicode escape if we found \u in the string.
388       if (data[1] == 'u') {
389         util::Status result = ParseUnicodeEscape();
390         if (!result.ok()) {
391           return result;
392         }
393         // Move last pointer past the unicode escape and continue.
394         last = p_.data();
395         continue;
396       }
397       // Handle the standard set of backslash-escaped characters.
398       switch (data[1]) {
399         case 'b':
400           parsed_storage_.push_back('\b');
401           break;
402         case 'f':
403           parsed_storage_.push_back('\f');
404           break;
405         case 'n':
406           parsed_storage_.push_back('\n');
407           break;
408         case 'r':
409           parsed_storage_.push_back('\r');
410           break;
411         case 't':
412           parsed_storage_.push_back('\t');
413           break;
414         case 'v':
415           parsed_storage_.push_back('\v');
416           break;
417         default:
418           parsed_storage_.push_back(data[1]);
419       }
420       // We handled two characters, so advance past them and continue.
421       p_.remove_prefix(2);
422       last = p_.data();
423       continue;
424     }
425     // If we found the closing quote note it, advance past it, and return.
426     if (*data == string_open_) {
427       // If we didn't copy anything, reuse the input buffer.
428       if (parsed_storage_.empty()) {
429         parsed_ = StringPiece(last, data - last);
430       } else {
431         if (last < data) {
432           parsed_storage_.append(last, data - last);
433         }
434         parsed_ = StringPiece(parsed_storage_);
435       }
436       // Clear the quote char so next time we try to parse a string we'll
437       // start fresh.
438       string_open_ = 0;
439       Advance();
440       return util::Status();
441     }
442     // Normal character, just advance past it.
443     Advance();
444   }
445   // If we ran out of characters, copy over what we have so far.
446   if (last < p_.data()) {
447     parsed_storage_.append(last, p_.data() - last);
448   }
449   // If we didn't find the closing quote but we expect more data, cancel for now
450   if (!finishing_) {
451     return util::Status(util::error::CANCELLED, "");
452   }
453   // End of string reached without a closing quote, report an error.
454   string_open_ = 0;
455   return ReportFailure("Closing quote expected in string.");
456 }
457 
458 // Converts a unicode escaped character to a decimal value stored in a char32
459 // for use in UTF8 encoding utility.  We assume that str begins with \uhhhh and
460 // convert that from the hex number to a decimal value.
461 //
462 // There are some security exploits with UTF-8 that we should be careful of:
463 //   - http://www.unicode.org/reports/tr36/#UTF-8_Exploit
464 //   - http://sites/intl-eng/design-guide/core-application
ParseUnicodeEscape()465 util::Status JsonStreamParser::ParseUnicodeEscape() {
466   if (p_.length() < kUnicodeEscapedLength) {
467     if (!finishing_) {
468       return util::Status(util::error::CANCELLED, "");
469     }
470     return ReportFailure("Illegal hex string.");
471   }
472   GOOGLE_DCHECK_EQ('\\', p_.data()[0]);
473   GOOGLE_DCHECK_EQ('u', p_.data()[1]);
474   uint32 code = 0;
475   for (int i = 2; i < kUnicodeEscapedLength; ++i) {
476     if (!isxdigit(p_.data()[i])) {
477       return ReportFailure("Invalid escape sequence.");
478     }
479     code = (code << 4) + hex_digit_to_int(p_.data()[i]);
480   }
481   if (code >= JsonEscaping::kMinHighSurrogate &&
482       code <= JsonEscaping::kMaxHighSurrogate) {
483     if (p_.length() < 2 * kUnicodeEscapedLength) {
484       if (!finishing_) {
485         return util::Status(util::error::CANCELLED, "");
486       }
487       if (!coerce_to_utf8_) {
488         return ReportFailure("Missing low surrogate.");
489       }
490     } else if (p_.data()[kUnicodeEscapedLength] == '\\' &&
491                p_.data()[kUnicodeEscapedLength + 1] == 'u') {
492       uint32 low_code = 0;
493       for (int i = kUnicodeEscapedLength + 2; i < 2 * kUnicodeEscapedLength;
494            ++i) {
495         if (!isxdigit(p_.data()[i])) {
496           return ReportFailure("Invalid escape sequence.");
497         }
498         low_code = (low_code << 4) + hex_digit_to_int(p_.data()[i]);
499       }
500       if (low_code >= JsonEscaping::kMinLowSurrogate &&
501           low_code <= JsonEscaping::kMaxLowSurrogate) {
502         // Convert UTF-16 surrogate pair to 21-bit Unicode codepoint.
503         code = (((code & 0x3FF) << 10) | (low_code & 0x3FF)) +
504                JsonEscaping::kMinSupplementaryCodePoint;
505         // Advance past the first code unit escape.
506         p_.remove_prefix(kUnicodeEscapedLength);
507       } else if (!coerce_to_utf8_) {
508         return ReportFailure("Invalid low surrogate.");
509       }
510     } else if (!coerce_to_utf8_) {
511       return ReportFailure("Missing low surrogate.");
512     }
513   }
514   if (!coerce_to_utf8_ && !IsValidCodePoint(code)) {
515     return ReportFailure("Invalid unicode code point.");
516   }
517   char buf[UTFmax];
518   int len = EncodeAsUTF8Char(code, buf);
519   // Advance past the [final] code unit escape.
520   p_.remove_prefix(kUnicodeEscapedLength);
521   parsed_storage_.append(buf, len);
522   return util::Status();
523 }
524 
ParseNumber()525 util::Status JsonStreamParser::ParseNumber() {
526   NumberResult number;
527   util::Status result = ParseNumberHelper(&number);
528   if (result.ok()) {
529     switch (number.type) {
530       case NumberResult::DOUBLE:
531         ow_->RenderDouble(key_, number.double_val);
532         key_ = StringPiece();
533         break;
534 
535       case NumberResult::INT:
536         ow_->RenderInt64(key_, number.int_val);
537         key_ = StringPiece();
538         break;
539 
540       case NumberResult::UINT:
541         ow_->RenderUint64(key_, number.uint_val);
542         key_ = StringPiece();
543         break;
544 
545       default:
546         return ReportFailure("Unable to parse number.");
547     }
548   }
549   return result;
550 }
551 
ParseDoubleHelper(const std::string & number,NumberResult * result)552 util::Status JsonStreamParser::ParseDoubleHelper(const std::string& number,
553                                                  NumberResult* result) {
554   if (!safe_strtod(number, &result->double_val)) {
555     return ReportFailure("Unable to parse number.");
556   }
557   if (!loose_float_number_conversion_ && !std::isfinite(result->double_val)) {
558     return ReportFailure("Number exceeds the range of double.");
559   }
560   result->type = NumberResult::DOUBLE;
561   return util::Status();
562 }
563 
ParseNumberHelper(NumberResult * result)564 util::Status JsonStreamParser::ParseNumberHelper(NumberResult* result) {
565   const char* data = p_.data();
566   int length = p_.length();
567 
568   // Look for the first non-numeric character, or the end of the string.
569   int index = 0;
570   bool floating = false;
571   bool negative = data[index] == '-';
572   // Find the first character that cannot be part of the number. Along the way
573   // detect if the number needs to be parsed as a double.
574   // Note that this restricts numbers to the JSON specification, so for example
575   // we do not support hex or octal notations.
576   for (; index < length; ++index) {
577     char c = data[index];
578     if (isdigit(c)) continue;
579     if (c == '.' || c == 'e' || c == 'E') {
580       floating = true;
581       continue;
582     }
583     if (c == '+' || c == '-' || c == 'x') continue;
584     // Not a valid number character, break out.
585     break;
586   }
587 
588   // If the entire input is a valid number, and we may have more content in the
589   // future, we abort for now and resume when we know more.
590   if (index == length && !finishing_) {
591     return util::Status(util::error::CANCELLED, "");
592   }
593 
594   // Create a string containing just the number, so we can use safe_strtoX
595   std::string number = std::string(p_.substr(0, index));
596 
597   // Floating point number, parse as a double.
598   if (floating) {
599     util::Status status = ParseDoubleHelper(number, result);
600     if (status.ok()) {
601       p_.remove_prefix(index);
602     }
603     return status;
604   }
605 
606   // Positive non-floating point number, parse as a uint64.
607   if (!negative) {
608     // Octal/Hex numbers are not valid JSON values.
609     if (number.length() >= 2 && number[0] == '0') {
610       return ReportFailure("Octal/hex numbers are not valid JSON values.");
611     }
612     if (safe_strtou64(number, &result->uint_val)) {
613       result->type = NumberResult::UINT;
614       p_.remove_prefix(index);
615       return util::Status();
616     } else {
617       // If the value is too large, parse it as double.
618       util::Status status = ParseDoubleHelper(number, result);
619       if (status.ok()) {
620         p_.remove_prefix(index);
621       }
622       return status;
623     }
624   }
625 
626   // Octal/Hex numbers are not valid JSON values.
627   if (number.length() >= 3 && number[1] == '0') {
628     return ReportFailure("Octal/hex numbers are not valid JSON values.");
629   }
630   // Negative non-floating point number, parse as an int64.
631   if (safe_strto64(number, &result->int_val)) {
632     result->type = NumberResult::INT;
633     p_.remove_prefix(index);
634     return util::Status();
635   } else {
636     // If the value is too large, parse it as double.
637     util::Status status = ParseDoubleHelper(number, result);
638     if (status.ok()) {
639       p_.remove_prefix(index);
640     }
641     return status;
642   }
643 }
644 
HandleBeginObject()645 util::Status JsonStreamParser::HandleBeginObject() {
646   GOOGLE_DCHECK_EQ('{', *p_.data());
647   Advance();
648   ow_->StartObject(key_);
649   auto status = IncrementRecursionDepth(key_);
650   if (!status.ok()) {
651     return status;
652   }
653   key_ = StringPiece();
654   stack_.push(ENTRY);
655   return util::Status();
656 }
657 
ParseObjectMid(TokenType type)658 util::Status JsonStreamParser::ParseObjectMid(TokenType type) {
659   if (type == UNKNOWN) {
660     return ReportUnknown("Expected , or } after key:value pair.");
661   }
662 
663   // Object is complete, advance past the comma and render the EndObject.
664   if (type == END_OBJECT) {
665     Advance();
666     ow_->EndObject();
667     --recursion_depth_;
668     return util::Status();
669   }
670   // Found a comma, advance past it and get ready for an entry.
671   if (type == VALUE_SEPARATOR) {
672     Advance();
673     stack_.push(ENTRY);
674     return util::Status();
675   }
676   // Illegal token after key:value pair.
677   return ReportFailure("Expected , or } after key:value pair.");
678 }
679 
ParseEntry(TokenType type)680 util::Status JsonStreamParser::ParseEntry(TokenType type) {
681   if (type == UNKNOWN) {
682     return ReportUnknown("Expected an object key or }.");
683   }
684 
685   // Close the object and return. This allows for trailing commas.
686   if (type == END_OBJECT) {
687     ow_->EndObject();
688     Advance();
689     --recursion_depth_;
690     return util::Status();
691   }
692 
693   util::Status result;
694   if (type == BEGIN_STRING) {
695     // Key is a string (standard JSON), parse it and store the string.
696     result = ParseStringHelper();
697     if (result.ok()) {
698       key_storage_.clear();
699       if (!parsed_storage_.empty()) {
700         parsed_storage_.swap(key_storage_);
701         key_ = StringPiece(key_storage_);
702       } else {
703         key_ = parsed_;
704       }
705       parsed_ = StringPiece();
706     }
707   } else if (type == BEGIN_KEY) {
708     // Key is a bare key (back compat), create a StringPiece pointing to it.
709     result = ParseKey();
710   } else if (type == BEGIN_NULL || type == BEGIN_TRUE || type == BEGIN_FALSE) {
711     // Key may be a bare key that begins with a reserved word.
712     result = ParseKey();
713     if (result.ok() && (key_ == kKeywordNull || key_ == kKeywordTrue ||
714                         key_ == kKeywordFalse)) {
715       result = ReportFailure("Expected an object key or }.");
716     }
717   } else {
718     // Unknown key type, report an error.
719     result = ReportFailure("Expected an object key or }.");
720   }
721   // On success we next expect an entry mid ':' then an object mid ',' or '}'
722   if (result.ok()) {
723     stack_.push(OBJ_MID);
724     stack_.push(ENTRY_MID);
725   }
726   return result;
727 }
728 
ParseEntryMid(TokenType type)729 util::Status JsonStreamParser::ParseEntryMid(TokenType type) {
730   if (type == UNKNOWN) {
731     return ReportUnknown("Expected : between key:value pair.");
732   }
733   if (type == ENTRY_SEPARATOR) {
734     Advance();
735     stack_.push(VALUE);
736     return util::Status();
737   }
738   return ReportFailure("Expected : between key:value pair.");
739 }
740 
HandleBeginArray()741 util::Status JsonStreamParser::HandleBeginArray() {
742   GOOGLE_DCHECK_EQ('[', *p_.data());
743   Advance();
744   ow_->StartList(key_);
745   key_ = StringPiece();
746   stack_.push(ARRAY_VALUE);
747   return util::Status();
748 }
749 
ParseArrayValue(TokenType type)750 util::Status JsonStreamParser::ParseArrayValue(TokenType type) {
751   if (type == UNKNOWN) {
752     return ReportUnknown("Expected a value or ] within an array.");
753   }
754 
755   if (type == END_ARRAY) {
756     ow_->EndList();
757     Advance();
758     return util::Status();
759   }
760 
761   // The ParseValue call may push something onto the stack so we need to make
762   // sure an ARRAY_MID is after it, so we push it on now. Also, the parsing of
763   // empty-null array value is relying on this ARRAY_MID token.
764   stack_.push(ARRAY_MID);
765   util::Status result = ParseValue(type);
766   if (result == util::Status(util::error::CANCELLED, "")) {
767     // If we were cancelled, pop back off the ARRAY_MID so we don't try to
768     // push it on again when we try over.
769     stack_.pop();
770   }
771   return result;
772 }
773 
ParseArrayMid(TokenType type)774 util::Status JsonStreamParser::ParseArrayMid(TokenType type) {
775   if (type == UNKNOWN) {
776     return ReportUnknown("Expected , or ] after array value.");
777   }
778 
779   if (type == END_ARRAY) {
780     ow_->EndList();
781     Advance();
782     return util::Status();
783   }
784 
785   // Found a comma, advance past it and expect an array value next.
786   if (type == VALUE_SEPARATOR) {
787     Advance();
788     stack_.push(ARRAY_VALUE);
789     return util::Status();
790   }
791   // Illegal token after array value.
792   return ReportFailure("Expected , or ] after array value.");
793 }
794 
ParseTrue()795 util::Status JsonStreamParser::ParseTrue() {
796   ow_->RenderBool(key_, true);
797   key_ = StringPiece();
798   p_.remove_prefix(kKeywordTrue.length());
799   return util::Status();
800 }
801 
ParseFalse()802 util::Status JsonStreamParser::ParseFalse() {
803   ow_->RenderBool(key_, false);
804   key_ = StringPiece();
805   p_.remove_prefix(kKeywordFalse.length());
806   return util::Status();
807 }
808 
ParseNull()809 util::Status JsonStreamParser::ParseNull() {
810   ow_->RenderNull(key_);
811   key_ = StringPiece();
812   p_.remove_prefix(kKeywordNull.length());
813   return util::Status();
814 }
815 
ParseEmptyNull()816 util::Status JsonStreamParser::ParseEmptyNull() {
817   ow_->RenderNull(key_);
818   key_ = StringPiece();
819   return util::Status();
820 }
821 
IsEmptyNullAllowed(TokenType type)822 bool JsonStreamParser::IsEmptyNullAllowed(TokenType type) {
823   if (stack_.empty()) return false;
824   return (stack_.top() == ARRAY_MID && type == VALUE_SEPARATOR) ||
825          stack_.top() == OBJ_MID;
826 }
827 
ReportFailure(StringPiece message)828 util::Status JsonStreamParser::ReportFailure(StringPiece message) {
829   static const int kContextLength = 20;
830   const char* p_start = p_.data();
831   const char* json_start = json_.data();
832   const char* begin = std::max(p_start - kContextLength, json_start);
833   const char* end =
834       std::min(p_start + kContextLength, json_start + json_.size());
835   StringPiece segment(begin, end - begin);
836   std::string location(p_start - begin, ' ');
837   location.push_back('^');
838   return util::Status(util::error::INVALID_ARGUMENT,
839                       StrCat(message, "\n", segment, "\n", location));
840 }
841 
ReportUnknown(StringPiece message)842 util::Status JsonStreamParser::ReportUnknown(StringPiece message) {
843   // If we aren't finishing the parse, cancel parsing and try later.
844   if (!finishing_) {
845     return util::Status(util::error::CANCELLED, "");
846   }
847   if (p_.empty()) {
848     return ReportFailure(StrCat("Unexpected end of string. ", message));
849   }
850   return ReportFailure(message);
851 }
852 
IncrementRecursionDepth(StringPiece key) const853 util::Status JsonStreamParser::IncrementRecursionDepth(
854     StringPiece key) const {
855   if (++recursion_depth_ > max_recursion_depth_) {
856     return Status(
857         util::error::INVALID_ARGUMENT,
858         StrCat("Message too deep. Max recursion depth reached for key '",
859                      key, "'"));
860   }
861   return util::Status();
862 }
863 
SkipWhitespace()864 void JsonStreamParser::SkipWhitespace() {
865   while (!p_.empty() && ascii_isspace(*p_.data())) {
866     Advance();
867   }
868 }
869 
Advance()870 void JsonStreamParser::Advance() {
871   // Advance by moving one UTF8 character while making sure we don't go beyond
872   // the length of StringPiece.
873   p_.remove_prefix(std::min<int>(
874       p_.length(), UTF8FirstLetterNumBytes(p_.data(), p_.length())));
875 }
876 
ParseKey()877 util::Status JsonStreamParser::ParseKey() {
878   StringPiece original = p_;
879 
880   if (allow_permissive_key_naming_) {
881     if (!ConsumeKeyPermissive(&p_, &key_)) {
882       return ReportFailure("Invalid key or variable name.");
883     }
884   } else {
885     if (!ConsumeKey(&p_, &key_)) {
886       return ReportFailure("Invalid key or variable name.");
887     }
888   }
889 
890   // If we consumed everything but expect more data, reset p_ and cancel since
891   // we can't know if the key was complete or not.
892   if (!finishing_ && p_.empty()) {
893     p_ = original;
894     return util::Status(util::error::CANCELLED, "");
895   }
896   // Since we aren't using the key storage, clear it out.
897   key_storage_.clear();
898   return util::Status();
899 }
900 
GetNextTokenType()901 JsonStreamParser::TokenType JsonStreamParser::GetNextTokenType() {
902   SkipWhitespace();
903 
904   int size = p_.size();
905   if (size == 0) {
906     // If we ran out of data, report unknown and we'll place the previous parse
907     // type onto the stack and try again when we have more data.
908     return UNKNOWN;
909   }
910   // TODO(sven): Split this method based on context since different contexts
911   // support different tokens. Would slightly speed up processing?
912   const char* data = p_.data();
913   StringPiece data_view = StringPiece(data, size);
914   if (*data == '\"' || *data == '\'') return BEGIN_STRING;
915   if (*data == '-' || ('0' <= *data && *data <= '9')) {
916     return BEGIN_NUMBER;
917   }
918   if (size >= kKeywordTrue.length() &&
919       HasPrefixString(data_view, kKeywordTrue)) {
920     return BEGIN_TRUE;
921   }
922   if (size >= kKeywordFalse.length() &&
923       HasPrefixString(data_view, kKeywordFalse)) {
924     return BEGIN_FALSE;
925   }
926   if (size >= kKeywordNull.length() &&
927       HasPrefixString(data_view, kKeywordNull)) {
928     return BEGIN_NULL;
929   }
930   if (*data == '{') return BEGIN_OBJECT;
931   if (*data == '}') return END_OBJECT;
932   if (*data == '[') return BEGIN_ARRAY;
933   if (*data == ']') return END_ARRAY;
934   if (*data == ':') return ENTRY_SEPARATOR;
935   if (*data == ',') return VALUE_SEPARATOR;
936   if (MatchKey(p_)) {
937     return BEGIN_KEY;
938   }
939 
940   // We don't know that we necessarily have an invalid token here, just that we
941   // can't parse what we have so far. So we don't report an error and just
942   // return UNKNOWN so we can try again later when we have more data, or if we
943   // finish and we have leftovers.
944   return UNKNOWN;
945 }
946 
947 }  // namespace converter
948 }  // namespace util
949 }  // namespace protobuf
950 }  // namespace google
951