1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 #include <google/protobuf/util/internal/json_stream_parser.h>
32
33 #include <algorithm>
34 #include <cctype>
35 #include <cerrno>
36 #include <cstdlib>
37 #include <cstring>
38 #include <memory>
39 #ifndef _SHARED_PTR_H
40 #include <google/protobuf/stubs/shared_ptr.h>
41 #endif
42
43 #include <google/protobuf/stubs/logging.h>
44 #include <google/protobuf/stubs/common.h>
45 #include <google/protobuf/util/internal/object_writer.h>
46 #include <google/protobuf/util/internal/json_escaping.h>
47 #include <google/protobuf/stubs/strutil.h>
48
49 namespace google {
50 namespace protobuf {
51 namespace util {
52
53 // Allow these symbols to be referenced as util::Status, util::error::* in
54 // this file.
55 using util::Status;
56 namespace error {
57 using util::error::INTERNAL;
58 using util::error::INVALID_ARGUMENT;
59 } // namespace error
60
61 namespace converter {
62
// Number of characters in an escaped UTF-16 code unit ('\\' 'u' X X X X).
static const int kUnicodeEscapedLength = 6;

// Lengths of the true, false, and null literals, not counting the terminating
// NUL. sizeof on the string literal yields a constant expression, so these are
// constant-initialized instead of being computed by strlen() calls during
// static initialization.
static const int true_len = sizeof("true") - 1;
static const int false_len = sizeof("false") - 1;
static const int null_len = sizeof("null") - 1;
70
// Returns true when c is an ASCII letter, an underscore or a dollar sign.
inline bool IsLetter(char c) {
  if (c == '_' || c == '$') return true;
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
75
IsAlphanumeric(char c)76 inline bool IsAlphanumeric(char c) {
77 return IsLetter(c) || ('0' <= c && c <= '9');
78 }
79
ConsumeKey(StringPiece * input,StringPiece * key)80 static bool ConsumeKey(StringPiece* input, StringPiece* key) {
81 if (input->empty() || !IsLetter((*input)[0])) return false;
82 int len = 1;
83 for (; len < input->size(); ++len) {
84 if (!IsAlphanumeric((*input)[len])) {
85 break;
86 }
87 }
88 *key = StringPiece(input->data(), len);
89 *input = StringPiece(input->data() + len, input->size() - len);
90 return true;
91 }
92
MatchKey(StringPiece input)93 static bool MatchKey(StringPiece input) {
94 return !input.empty() && IsLetter(input[0]);
95 }
96
JsonStreamParser(ObjectWriter * ow)97 JsonStreamParser::JsonStreamParser(ObjectWriter* ow)
98 : ow_(ow),
99 stack_(),
100 leftover_(),
101 json_(),
102 p_(),
103 key_(),
104 key_storage_(),
105 finishing_(false),
106 parsed_(),
107 parsed_storage_(),
108 string_open_(0),
109 chunk_storage_(),
110 coerce_to_utf8_(false) {
111 // Initialize the stack with a single value to be parsed.
112 stack_.push(VALUE);
113 }
114
~JsonStreamParser()115 JsonStreamParser::~JsonStreamParser() {}
116
117
Parse(StringPiece json)118 util::Status JsonStreamParser::Parse(StringPiece json) {
119 StringPiece chunk = json;
120 // If we have leftovers from a previous chunk, append the new chunk to it
121 // and create a new StringPiece pointing at the string's data. This could
122 // be large but we rely on the chunks to be small, assuming they are
123 // fragments of a Cord.
124 if (!leftover_.empty()) {
125 // Don't point chunk to leftover_ because leftover_ will be updated in
126 // ParseChunk(chunk).
127 chunk_storage_.swap(leftover_);
128 json.AppendToString(&chunk_storage_);
129 chunk = StringPiece(chunk_storage_);
130 }
131
132 // Find the structurally valid UTF8 prefix and parse only that.
133 int n = internal::UTF8SpnStructurallyValid(chunk);
134 if (n > 0) {
135 util::Status status = ParseChunk(chunk.substr(0, n));
136
137 // Any leftover characters are stashed in leftover_ for later parsing when
138 // there is more data available.
139 chunk.substr(n).AppendToString(&leftover_);
140 return status;
141 } else {
142 chunk.CopyToString(&leftover_);
143 return util::Status::OK;
144 }
145 }
146
FinishParse()147 util::Status JsonStreamParser::FinishParse() {
148 // If we do not expect anything and there is nothing left to parse we're all
149 // done.
150 if (stack_.empty() && leftover_.empty()) {
151 return util::Status::OK;
152 }
153
154 // Storage for UTF8-coerced string.
155 google::protobuf::scoped_array<char> utf8;
156 if (coerce_to_utf8_) {
157 utf8.reset(new char[leftover_.size()]);
158 char* coerced = internal::UTF8CoerceToStructurallyValid(leftover_, utf8.get(), ' ');
159 p_ = json_ = StringPiece(coerced, leftover_.size());
160 } else {
161 p_ = json_ = leftover_;
162 if (!internal::IsStructurallyValidUTF8(leftover_)) {
163 return ReportFailure("Encountered non UTF-8 code points.");
164 }
165 }
166
167 // Parse the remainder in finishing mode, which reports errors for things like
168 // unterminated strings or unknown tokens that would normally be retried.
169 finishing_ = true;
170 util::Status result = RunParser();
171 if (result.ok()) {
172 SkipWhitespace();
173 if (!p_.empty()) {
174 result = ReportFailure("Parsing terminated before end of input.");
175 }
176 }
177 return result;
178 }
179
ParseChunk(StringPiece chunk)180 util::Status JsonStreamParser::ParseChunk(StringPiece chunk) {
181 // Do not do any work if the chunk is empty.
182 if (chunk.empty()) return util::Status::OK;
183
184 p_ = json_ = chunk;
185
186 finishing_ = false;
187 util::Status result = RunParser();
188 if (!result.ok()) return result;
189
190 SkipWhitespace();
191 if (p_.empty()) {
192 // If we parsed everything we had, clear the leftover.
193 leftover_.clear();
194 } else {
195 // If we do not expect anything i.e. stack is empty, and we have non-empty
196 // string left to parse, we report an error.
197 if (stack_.empty()) {
198 return ReportFailure("Parsing terminated before end of input.");
199 }
200 // If we expect future data i.e. stack is non-empty, and we have some
201 // unparsed data left, we save it for later parse.
202 leftover_ = p_.ToString();
203 }
204 return util::Status::OK;
205 }
206
RunParser()207 util::Status JsonStreamParser::RunParser() {
208 while (!stack_.empty()) {
209 ParseType type = stack_.top();
210 TokenType t = (string_open_ == 0) ? GetNextTokenType() : BEGIN_STRING;
211 stack_.pop();
212 util::Status result;
213 switch (type) {
214 case VALUE:
215 result = ParseValue(t);
216 break;
217
218 case OBJ_MID:
219 result = ParseObjectMid(t);
220 break;
221
222 case ENTRY:
223 result = ParseEntry(t);
224 break;
225
226 case ENTRY_MID:
227 result = ParseEntryMid(t);
228 break;
229
230 case ARRAY_VALUE:
231 result = ParseArrayValue(t);
232 break;
233
234 case ARRAY_MID:
235 result = ParseArrayMid(t);
236 break;
237
238 default:
239 result = util::Status(util::error::INTERNAL,
240 StrCat("Unknown parse type: ", type));
241 break;
242 }
243 if (!result.ok()) {
244 // If we were cancelled, save our state and try again later.
245 if (!finishing_ && result == util::Status::CANCELLED) {
246 stack_.push(type);
247 // If we have a key we still need to render, make sure to save off the
248 // contents in our own storage.
249 if (!key_.empty() && key_storage_.empty()) {
250 key_.AppendToString(&key_storage_);
251 key_ = StringPiece(key_storage_);
252 }
253 result = util::Status::OK;
254 }
255 return result;
256 }
257 }
258 return util::Status::OK;
259 }
260
ParseValue(TokenType type)261 util::Status JsonStreamParser::ParseValue(TokenType type) {
262 switch (type) {
263 case BEGIN_OBJECT:
264 return HandleBeginObject();
265 case BEGIN_ARRAY:
266 return HandleBeginArray();
267 case BEGIN_STRING:
268 return ParseString();
269 case BEGIN_NUMBER:
270 return ParseNumber();
271 case BEGIN_TRUE:
272 return ParseTrue();
273 case BEGIN_FALSE:
274 return ParseFalse();
275 case BEGIN_NULL:
276 return ParseNull();
277 case UNKNOWN:
278 return ReportUnknown("Expected a value.");
279 default: {
280 // Special case for having been cut off while parsing, wait for more data.
281 // This handles things like 'fals' being at the end of the string, we
282 // don't know if the next char would be e, completing it, or something
283 // else, making it invalid.
284 if (!finishing_ && p_.length() < false_len) {
285 return util::Status::CANCELLED;
286 }
287 return ReportFailure("Unexpected token.");
288 }
289 }
290 }
291
ParseString()292 util::Status JsonStreamParser::ParseString() {
293 util::Status result = ParseStringHelper();
294 if (result.ok()) {
295 ow_->RenderString(key_, parsed_);
296 key_.clear();
297 parsed_.clear();
298 parsed_storage_.clear();
299 }
300 return result;
301 }
302
ParseStringHelper()303 util::Status JsonStreamParser::ParseStringHelper() {
304 // If we haven't seen the start quote, grab it and remember it for later.
305 if (string_open_ == 0) {
306 string_open_ = *p_.data();
307 GOOGLE_DCHECK(string_open_ == '\"' || string_open_ == '\'');
308 Advance();
309 }
310 // Track where we last copied data from so we can minimize copying.
311 const char* last = p_.data();
312 while (!p_.empty()) {
313 const char* data = p_.data();
314 if (*data == '\\') {
315 // We're about to handle an escape, copy all bytes from last to data.
316 if (last < data) {
317 parsed_storage_.append(last, data - last);
318 last = data;
319 }
320 // If we ran out of string after the \, cancel or report an error
321 // depending on if we expect more data later.
322 if (p_.length() == 1) {
323 if (!finishing_) {
324 return util::Status::CANCELLED;
325 }
326 return ReportFailure("Closing quote expected in string.");
327 }
328 // Parse a unicode escape if we found \u in the string.
329 if (data[1] == 'u') {
330 util::Status result = ParseUnicodeEscape();
331 if (!result.ok()) {
332 return result;
333 }
334 // Move last pointer past the unicode escape and continue.
335 last = p_.data();
336 continue;
337 }
338 // Handle the standard set of backslash-escaped characters.
339 switch (data[1]) {
340 case 'b':
341 parsed_storage_.push_back('\b');
342 break;
343 case 'f':
344 parsed_storage_.push_back('\f');
345 break;
346 case 'n':
347 parsed_storage_.push_back('\n');
348 break;
349 case 'r':
350 parsed_storage_.push_back('\r');
351 break;
352 case 't':
353 parsed_storage_.push_back('\t');
354 break;
355 case 'v':
356 parsed_storage_.push_back('\v');
357 break;
358 default:
359 parsed_storage_.push_back(data[1]);
360 }
361 // We handled two characters, so advance past them and continue.
362 p_.remove_prefix(2);
363 last = p_.data();
364 continue;
365 }
366 // If we found the closing quote note it, advance past it, and return.
367 if (*data == string_open_) {
368 // If we didn't copy anything, reuse the input buffer.
369 if (parsed_storage_.empty()) {
370 parsed_ = StringPiece(last, data - last);
371 } else {
372 if (last < data) {
373 parsed_storage_.append(last, data - last);
374 last = data;
375 }
376 parsed_ = StringPiece(parsed_storage_);
377 }
378 // Clear the quote char so next time we try to parse a string we'll
379 // start fresh.
380 string_open_ = 0;
381 Advance();
382 return util::Status::OK;
383 }
384 // Normal character, just advance past it.
385 Advance();
386 }
387 // If we ran out of characters, copy over what we have so far.
388 if (last < p_.data()) {
389 parsed_storage_.append(last, p_.data() - last);
390 }
391 // If we didn't find the closing quote but we expect more data, cancel for now
392 if (!finishing_) {
393 return util::Status::CANCELLED;
394 }
395 // End of string reached without a closing quote, report an error.
396 string_open_ = 0;
397 return ReportFailure("Closing quote expected in string.");
398 }
399
400 // Converts a unicode escaped character to a decimal value stored in a char32
401 // for use in UTF8 encoding utility. We assume that str begins with \uhhhh and
402 // convert that from the hex number to a decimal value.
403 //
404 // There are some security exploits with UTF-8 that we should be careful of:
405 // - http://www.unicode.org/reports/tr36/#UTF-8_Exploit
406 // - http://sites/intl-eng/design-guide/core-application
ParseUnicodeEscape()407 util::Status JsonStreamParser::ParseUnicodeEscape() {
408 if (p_.length() < kUnicodeEscapedLength) {
409 if (!finishing_) {
410 return util::Status::CANCELLED;
411 }
412 return ReportFailure("Illegal hex string.");
413 }
414 GOOGLE_DCHECK_EQ('\\', p_.data()[0]);
415 GOOGLE_DCHECK_EQ('u', p_.data()[1]);
416 uint32 code = 0;
417 for (int i = 2; i < kUnicodeEscapedLength; ++i) {
418 if (!isxdigit(p_.data()[i])) {
419 return ReportFailure("Invalid escape sequence.");
420 }
421 code = (code << 4) + hex_digit_to_int(p_.data()[i]);
422 }
423 if (code >= JsonEscaping::kMinHighSurrogate &&
424 code <= JsonEscaping::kMaxHighSurrogate) {
425 if (p_.length() < 2 * kUnicodeEscapedLength) {
426 if (!finishing_) {
427 return util::Status::CANCELLED;
428 }
429 if (!coerce_to_utf8_) {
430 return ReportFailure("Missing low surrogate.");
431 }
432 } else if (p_.data()[kUnicodeEscapedLength] == '\\' &&
433 p_.data()[kUnicodeEscapedLength + 1] == 'u') {
434 uint32 low_code = 0;
435 for (int i = kUnicodeEscapedLength + 2; i < 2 * kUnicodeEscapedLength;
436 ++i) {
437 if (!isxdigit(p_.data()[i])) {
438 return ReportFailure("Invalid escape sequence.");
439 }
440 low_code = (low_code << 4) + hex_digit_to_int(p_.data()[i]);
441 }
442 if (low_code >= JsonEscaping::kMinLowSurrogate &&
443 low_code <= JsonEscaping::kMaxLowSurrogate) {
444 // Convert UTF-16 surrogate pair to 21-bit Unicode codepoint.
445 code = (((code & 0x3FF) << 10) | (low_code & 0x3FF)) +
446 JsonEscaping::kMinSupplementaryCodePoint;
447 // Advance past the first code unit escape.
448 p_.remove_prefix(kUnicodeEscapedLength);
449 } else if (!coerce_to_utf8_) {
450 return ReportFailure("Invalid low surrogate.");
451 }
452 } else if (!coerce_to_utf8_) {
453 return ReportFailure("Missing low surrogate.");
454 }
455 }
456 if (!coerce_to_utf8_ && !IsValidCodePoint(code)) {
457 return ReportFailure("Invalid unicode code point.");
458 }
459 char buf[UTFmax];
460 int len = EncodeAsUTF8Char(code, buf);
461 // Advance past the [final] code unit escape.
462 p_.remove_prefix(kUnicodeEscapedLength);
463 parsed_storage_.append(buf, len);
464 return util::Status::OK;
465 }
466
ParseNumber()467 util::Status JsonStreamParser::ParseNumber() {
468 NumberResult number;
469 util::Status result = ParseNumberHelper(&number);
470 if (result.ok()) {
471 switch (number.type) {
472 case NumberResult::DOUBLE:
473 ow_->RenderDouble(key_, number.double_val);
474 key_.clear();
475 break;
476
477 case NumberResult::INT:
478 ow_->RenderInt64(key_, number.int_val);
479 key_.clear();
480 break;
481
482 case NumberResult::UINT:
483 ow_->RenderUint64(key_, number.uint_val);
484 key_.clear();
485 break;
486
487 default:
488 return ReportFailure("Unable to parse number.");
489 }
490 }
491 return result;
492 }
493
ParseNumberHelper(NumberResult * result)494 util::Status JsonStreamParser::ParseNumberHelper(NumberResult* result) {
495 const char* data = p_.data();
496 int length = p_.length();
497
498 // Look for the first non-numeric character, or the end of the string.
499 int index = 0;
500 bool floating = false;
501 bool negative = data[index] == '-';
502 // Find the first character that cannot be part of the number. Along the way
503 // detect if the number needs to be parsed as a double.
504 // Note that this restricts numbers to the JSON specification, so for example
505 // we do not support hex or octal notations.
506 for (; index < length; ++index) {
507 char c = data[index];
508 if (isdigit(c)) continue;
509 if (c == '.' || c == 'e' || c == 'E') {
510 floating = true;
511 continue;
512 }
513 if (c == '+' || c == '-' || c == 'x') continue;
514 // Not a valid number character, break out.
515 break;
516 }
517
518 // If the entire input is a valid number, and we may have more content in the
519 // future, we abort for now and resume when we know more.
520 if (index == length && !finishing_) {
521 return util::Status::CANCELLED;
522 }
523
524 // Create a string containing just the number, so we can use safe_strtoX
525 string number = p_.substr(0, index).ToString();
526
527 // Floating point number, parse as a double.
528 if (floating) {
529 if (!safe_strtod(number, &result->double_val)) {
530 return ReportFailure("Unable to parse number.");
531 }
532 result->type = NumberResult::DOUBLE;
533 p_.remove_prefix(index);
534 return util::Status::OK;
535 }
536
537 // Positive non-floating point number, parse as a uint64.
538 if (!negative) {
539 // Octal/Hex numbers are not valid JSON values.
540 if (number.length() >= 2 && number[0] == '0') {
541 return ReportFailure("Octal/hex numbers are not valid JSON values.");
542 }
543 if (!safe_strtou64(number, &result->uint_val)) {
544 return ReportFailure("Unable to parse number.");
545 }
546 result->type = NumberResult::UINT;
547 p_.remove_prefix(index);
548 return util::Status::OK;
549 }
550
551 // Octal/Hex numbers are not valid JSON values.
552 if (number.length() >= 3 && number[1] == '0') {
553 return ReportFailure("Octal/hex numbers are not valid JSON values.");
554 }
555 // Negative non-floating point number, parse as an int64.
556 if (!safe_strto64(number, &result->int_val)) {
557 return ReportFailure("Unable to parse number.");
558 }
559 result->type = NumberResult::INT;
560 p_.remove_prefix(index);
561 return util::Status::OK;
562 }
563
HandleBeginObject()564 util::Status JsonStreamParser::HandleBeginObject() {
565 GOOGLE_DCHECK_EQ('{', *p_.data());
566 Advance();
567 ow_->StartObject(key_);
568 key_.clear();
569 stack_.push(ENTRY);
570 return util::Status::OK;
571 }
572
ParseObjectMid(TokenType type)573 util::Status JsonStreamParser::ParseObjectMid(TokenType type) {
574 if (type == UNKNOWN) {
575 return ReportUnknown("Expected , or } after key:value pair.");
576 }
577
578 // Object is complete, advance past the comma and render the EndObject.
579 if (type == END_OBJECT) {
580 Advance();
581 ow_->EndObject();
582 return util::Status::OK;
583 }
584 // Found a comma, advance past it and get ready for an entry.
585 if (type == VALUE_SEPARATOR) {
586 Advance();
587 stack_.push(ENTRY);
588 return util::Status::OK;
589 }
590 // Illegal token after key:value pair.
591 return ReportFailure("Expected , or } after key:value pair.");
592 }
593
ParseEntry(TokenType type)594 util::Status JsonStreamParser::ParseEntry(TokenType type) {
595 if (type == UNKNOWN) {
596 return ReportUnknown("Expected an object key or }.");
597 }
598
599 // Close the object and return. This allows for trailing commas.
600 if (type == END_OBJECT) {
601 ow_->EndObject();
602 Advance();
603 return util::Status::OK;
604 }
605
606 util::Status result;
607 if (type == BEGIN_STRING) {
608 // Key is a string (standard JSON), parse it and store the string.
609 result = ParseStringHelper();
610 if (result.ok()) {
611 key_storage_.clear();
612 if (!parsed_storage_.empty()) {
613 parsed_storage_.swap(key_storage_);
614 key_ = StringPiece(key_storage_);
615 } else {
616 key_ = parsed_;
617 }
618 parsed_.clear();
619 }
620 } else if (type == BEGIN_KEY) {
621 // Key is a bare key (back compat), create a StringPiece pointing to it.
622 result = ParseKey();
623 } else {
624 // Unknown key type, report an error.
625 result = ReportFailure("Expected an object key or }.");
626 }
627 // On success we next expect an entry mid ':' then an object mid ',' or '}'
628 if (result.ok()) {
629 stack_.push(OBJ_MID);
630 stack_.push(ENTRY_MID);
631 }
632 return result;
633 }
634
ParseEntryMid(TokenType type)635 util::Status JsonStreamParser::ParseEntryMid(TokenType type) {
636 if (type == UNKNOWN) {
637 return ReportUnknown("Expected : between key:value pair.");
638 }
639 if (type == ENTRY_SEPARATOR) {
640 Advance();
641 stack_.push(VALUE);
642 return util::Status::OK;
643 }
644 return ReportFailure("Expected : between key:value pair.");
645 }
646
HandleBeginArray()647 util::Status JsonStreamParser::HandleBeginArray() {
648 GOOGLE_DCHECK_EQ('[', *p_.data());
649 Advance();
650 ow_->StartList(key_);
651 key_.clear();
652 stack_.push(ARRAY_VALUE);
653 return util::Status::OK;
654 }
655
ParseArrayValue(TokenType type)656 util::Status JsonStreamParser::ParseArrayValue(TokenType type) {
657 if (type == UNKNOWN) {
658 return ReportUnknown("Expected a value or ] within an array.");
659 }
660
661 if (type == END_ARRAY) {
662 ow_->EndList();
663 Advance();
664 return util::Status::OK;
665 }
666
667 // The ParseValue call may push something onto the stack so we need to make
668 // sure an ARRAY_MID is after it, so we push it on now.
669 stack_.push(ARRAY_MID);
670 util::Status result = ParseValue(type);
671 if (result == util::Status::CANCELLED) {
672 // If we were cancelled, pop back off the ARRAY_MID so we don't try to
673 // push it on again when we try over.
674 stack_.pop();
675 }
676 return result;
677 }
678
ParseArrayMid(TokenType type)679 util::Status JsonStreamParser::ParseArrayMid(TokenType type) {
680 if (type == UNKNOWN) {
681 return ReportUnknown("Expected , or ] after array value.");
682 }
683
684 if (type == END_ARRAY) {
685 ow_->EndList();
686 Advance();
687 return util::Status::OK;
688 }
689
690 // Found a comma, advance past it and expect an array value next.
691 if (type == VALUE_SEPARATOR) {
692 Advance();
693 stack_.push(ARRAY_VALUE);
694 return util::Status::OK;
695 }
696 // Illegal token after array value.
697 return ReportFailure("Expected , or ] after array value.");
698 }
699
ParseTrue()700 util::Status JsonStreamParser::ParseTrue() {
701 ow_->RenderBool(key_, true);
702 key_.clear();
703 p_.remove_prefix(true_len);
704 return util::Status::OK;
705 }
706
ParseFalse()707 util::Status JsonStreamParser::ParseFalse() {
708 ow_->RenderBool(key_, false);
709 key_.clear();
710 p_.remove_prefix(false_len);
711 return util::Status::OK;
712 }
713
ParseNull()714 util::Status JsonStreamParser::ParseNull() {
715 ow_->RenderNull(key_);
716 key_.clear();
717 p_.remove_prefix(null_len);
718 return util::Status::OK;
719 }
720
ReportFailure(StringPiece message)721 util::Status JsonStreamParser::ReportFailure(StringPiece message) {
722 static const int kContextLength = 20;
723 const char* p_start = p_.data();
724 const char* json_start = json_.data();
725 const char* begin = std::max(p_start - kContextLength, json_start);
726 const char* end =
727 std::min(p_start + kContextLength, json_start + json_.size());
728 StringPiece segment(begin, end - begin);
729 string location(p_start - begin, ' ');
730 location.push_back('^');
731 return util::Status(util::error::INVALID_ARGUMENT,
732 StrCat(message, "\n", segment, "\n", location));
733 }
734
ReportUnknown(StringPiece message)735 util::Status JsonStreamParser::ReportUnknown(StringPiece message) {
736 // If we aren't finishing the parse, cancel parsing and try later.
737 if (!finishing_) {
738 return util::Status::CANCELLED;
739 }
740 if (p_.empty()) {
741 return ReportFailure(StrCat("Unexpected end of string. ", message));
742 }
743 return ReportFailure(message);
744 }
745
SkipWhitespace()746 void JsonStreamParser::SkipWhitespace() {
747 while (!p_.empty() && ascii_isspace(*p_.data())) {
748 Advance();
749 }
750 }
751
Advance()752 void JsonStreamParser::Advance() {
753 // Advance by moving one UTF8 character while making sure we don't go beyond
754 // the length of StringPiece.
755 p_.remove_prefix(std::min<int>(
756 p_.length(), UTF8FirstLetterNumBytes(p_.data(), p_.length())));
757 }
758
ParseKey()759 util::Status JsonStreamParser::ParseKey() {
760 StringPiece original = p_;
761 if (!ConsumeKey(&p_, &key_)) {
762 return ReportFailure("Invalid key or variable name.");
763 }
764 // If we consumed everything but expect more data, reset p_ and cancel since
765 // we can't know if the key was complete or not.
766 if (!finishing_ && p_.empty()) {
767 p_ = original;
768 return util::Status::CANCELLED;
769 }
770 // Since we aren't using the key storage, clear it out.
771 key_storage_.clear();
772 return util::Status::OK;
773 }
774
GetNextTokenType()775 JsonStreamParser::TokenType JsonStreamParser::GetNextTokenType() {
776 SkipWhitespace();
777
778 int size = p_.size();
779 if (size == 0) {
780 // If we ran out of data, report unknown and we'll place the previous parse
781 // type onto the stack and try again when we have more data.
782 return UNKNOWN;
783 }
784 // TODO(sven): Split this method based on context since different contexts
785 // support different tokens. Would slightly speed up processing?
786 const char* data = p_.data();
787 if (*data == '\"' || *data == '\'') return BEGIN_STRING;
788 if (*data == '-' || ('0' <= *data && *data <= '9')) {
789 return BEGIN_NUMBER;
790 }
791 if (size >= true_len && !strncmp(data, "true", true_len)) {
792 return BEGIN_TRUE;
793 }
794 if (size >= false_len && !strncmp(data, "false", false_len)) {
795 return BEGIN_FALSE;
796 }
797 if (size >= null_len && !strncmp(data, "null", null_len)) {
798 return BEGIN_NULL;
799 }
800 if (*data == '{') return BEGIN_OBJECT;
801 if (*data == '}') return END_OBJECT;
802 if (*data == '[') return BEGIN_ARRAY;
803 if (*data == ']') return END_ARRAY;
804 if (*data == ':') return ENTRY_SEPARATOR;
805 if (*data == ',') return VALUE_SEPARATOR;
806 if (MatchKey(p_)) {
807 return BEGIN_KEY;
808 }
809
810 // We don't know that we necessarily have an invalid token here, just that we
811 // can't parse what we have so far. So we don't report an error and just
812 // return UNKNOWN so we can try again later when we have more data, or if we
813 // finish and we have leftovers.
814 return UNKNOWN;
815 }
816
817 } // namespace converter
818 } // namespace util
819 } // namespace protobuf
820 } // namespace google
821