// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
//
// Here we have a hand-written lexer. At first you might ask yourself,
// "Hand-written text processing? Is Kenton crazy?!" Well, first of all,
// yes I am crazy, but that's beside the point. There are actually reasons
// why I ended up writing this the way I did.
//
// The traditional approach to lexing is to use lex to generate a lexer for
// you. Unfortunately, lex's output is ridiculously ugly and difficult to
// integrate cleanly with C++ code, especially abstract code or code meant
// as a library. Better parser-generators exist but would add dependencies
// which most users won't already have, which we'd like to avoid. (GNU flex
// has a C++ output option, but it's still ridiculously ugly, non-abstract,
// and not library-friendly.)
//
// The next approach that any good software engineer should look at is to
// use regular expressions. And, indeed, I did. I have code which
// implements this same class using regular expressions. It's about 200
// lines shorter. However:
// - Rather than error messages telling you "This string has an invalid
//   escape sequence at line 5, column 45", you get error messages like
//   "Parse error on line 5". Giving more precise errors requires adding
//   a lot of code that ends up basically as complex as the hand-coded
//   version anyway.
// - The regular expression to match a string literal looks like this:
//     kString = new RE("(\"([^\"\\\\]|"              // non-escaped
//                      "\\\\[abfnrtv?\"'\\\\0-7]|"   // normal escape
//                      "\\\\x[0-9a-fA-F])*\"|"       // hex escape
//                      "\'([^\'\\\\]|"        // Also support single-quotes.
//                      "\\\\[abfnrtv?\"'\\\\0-7]|"
//                      "\\\\x[0-9a-fA-F])*\')");
//   Verifying the correctness of this line noise is actually harder than
//   verifying the correctness of ConsumeString(), defined below. I'm not
//   even confident that the above is correct, after staring at it for some
//   time.
// - PCRE is fast, but there's still more overhead involved than the code
//   below.
// - Sadly, regular expressions are not part of the C standard library, so
//   using them would require depending on some other library. For the
//   open source release, this could be really annoying. Nobody likes
//   downloading one piece of software just to find that they need to
//   download something else to make it work, and in all likelihood
//   people downloading Protocol Buffers will already be doing so just
//   to make something else work. We could include a copy of PCRE with
//   our code, but that obligates us to keep it up-to-date and just seems
//   like a big waste just to save 200 lines of code.
//
// On a similar but unrelated note, I'm even scared to use ctype.h.
// Apparently functions like isalpha() are locale-dependent. So, if we used
// that, then if this code is being called from some program that doesn't
// have its locale set to "C", it would behave strangely. We can't just set
// the locale to "C" ourselves since we might break the calling program that
// way, particularly if it is multi-threaded. WTF? Someone please let me
// (Kenton) know if I'm missing something here...
//
// I'd love to hear about other alternatives, though, as this code isn't
// exactly pretty.

#include <google/protobuf/io/tokenizer.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/logging.h>
#include <google/protobuf/stubs/stringprintf.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/io/strtod.h>
#include <google/protobuf/io/zero_copy_stream.h>
#include <google/protobuf/stubs/stl_util.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// As mentioned above, I don't trust ctype.h due to the presence of "locales".
// So, I have written replacement functions here. Someone please smack me if
// this is a bad idea or if there is some way around this.
//
// These "character classes" are designed to be used in template methods.
// For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
// whitespace.

// Note: No class is allowed to contain '\0', since this is used to mark end-
// of-input and is handled specially.

#define CHARACTER_CLASS(NAME, EXPRESSION)                     \
  class NAME {                                                \
   public:                                                    \
    static inline bool InClass(char c) { return EXPRESSION; } \
  }

CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' || c == '\r' ||
                                c == '\v' || c == '\f');
CHARACTER_CLASS(WhitespaceNoNewline,
                c == ' ' || c == '\t' || c == '\r' || c == '\v' || c == '\f');

CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');

CHARACTER_CLASS(Digit, '0' <= c && c <= '9');
CHARACTER_CLASS(OctalDigit, '0' <= c && c <= '7');
CHARACTER_CLASS(HexDigit, ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
                              ('A' <= c && c <= 'F'));

CHARACTER_CLASS(Letter,
                ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || (c == '_'));

CHARACTER_CLASS(Alphanumeric, ('a' <= c && c <= 'z') ||
                                  ('A' <= c && c <= 'Z') ||
                                  ('0' <= c && c <= '9') || (c == '_'));

CHARACTER_CLASS(Escape, c == 'a' || c == 'b' || c == 'f' || c == 'n' ||
                            c == 'r' || c == 't' || c == 'v' || c == '\\' ||
                            c == '?' || c == '\'' || c == '\"');

#undef CHARACTER_CLASS
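
// For illustration only (an editor's sketch, not part of the original file):
// a use such as CHARACTER_CLASS(Digit, '0' <= c && c <= '9') expands to
// roughly
//
//   class Digit {
//    public:
//     static inline bool InClass(char c) { return '0' <= c && c <= '9'; }
//   };
//
// which is what lets the template helpers below write calls like
// ConsumeZeroOrMore<Digit>() or LookingAt<Whitespace>().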

// Given a char, interpret it as a numeric digit and return its value.
// This supports any number base up to 36.
inline int DigitValue(char digit) {
  if ('0' <= digit && digit <= '9') return digit - '0';
  if ('a' <= digit && digit <= 'z') return digit - 'a' + 10;
  if ('A' <= digit && digit <= 'Z') return digit - 'A' + 10;
  return -1;
}
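
// A quick illustrative check: DigitValue('7') is 7, DigitValue('f') and
// DigitValue('F') are both 15, and DigitValue('$') is -1, so callers must
// compare the result against the base they expect.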

// Inline because it's only used in one place.
inline char TranslateEscape(char c) {
  switch (c) {
    case 'a':
      return '\a';
    case 'b':
      return '\b';
    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case 'v':
      return '\v';
    case '\\':
      return '\\';
    case '?':
      return '\?';  // Trigraphs = :(
    case '\'':
      return '\'';
    case '"':
      return '\"';

    // We expect escape sequences to have been validated separately.
    default:
      return '?';
  }
}

}  // anonymous namespace

ErrorCollector::~ErrorCollector() {}

// ===================================================================

Tokenizer::Tokenizer(ZeroCopyInputStream* input,
                     ErrorCollector* error_collector)
    : input_(input),
      error_collector_(error_collector),
      buffer_(NULL),
      buffer_size_(0),
      buffer_pos_(0),
      read_error_(false),
      line_(0),
      column_(0),
      record_target_(NULL),
      record_start_(-1),
      allow_f_after_float_(false),
      comment_style_(CPP_COMMENT_STYLE),
      require_space_after_number_(true),
      allow_multiline_strings_(false) {
  current_.line = 0;
  current_.column = 0;
  current_.end_column = 0;
  current_.type = TYPE_START;

  Refresh();
}

Tokenizer::~Tokenizer() {
  // If we had any buffer left unread, return it to the underlying stream
  // so that someone else can read it.
  if (buffer_size_ > buffer_pos_) {
    input_->BackUp(buffer_size_ - buffer_pos_);
  }
}

// -------------------------------------------------------------------
// Internal helpers.

void Tokenizer::NextChar() {
  // Update our line and column counters based on the character being
  // consumed.
  if (current_char_ == '\n') {
    ++line_;
    column_ = 0;
  } else if (current_char_ == '\t') {
    column_ += kTabWidth - column_ % kTabWidth;
  } else {
    ++column_;
  }

  // Advance to the next character.
  ++buffer_pos_;
  if (buffer_pos_ < buffer_size_) {
    current_char_ = buffer_[buffer_pos_];
  } else {
    Refresh();
  }
}
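
// Worked example of the tab accounting above (illustrative; assumes the
// kTabWidth constant declared in tokenizer.h is 8): if column_ is 3 when a
// '\t' is consumed, column_ advances by 8 - (3 % 8) = 5 and lands on column 8,
// i.e. the next multiple of the tab width rather than a fixed +1.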

void Tokenizer::Refresh() {
  if (read_error_) {
    current_char_ = '\0';
    return;
  }

  // If we're in a token, append the rest of the buffer to it.
  if (record_target_ != NULL && record_start_ < buffer_size_) {
    record_target_->append(buffer_ + record_start_,
                           buffer_size_ - record_start_);
    record_start_ = 0;
  }

  const void* data = NULL;
  buffer_ = NULL;
  buffer_pos_ = 0;
  do {
    if (!input_->Next(&data, &buffer_size_)) {
      // end of stream (or read error)
      buffer_size_ = 0;
      read_error_ = true;
      current_char_ = '\0';
      return;
    }
  } while (buffer_size_ == 0);

  buffer_ = static_cast<const char*>(data);

  current_char_ = buffer_[0];
}

inline void Tokenizer::RecordTo(std::string* target) {
  record_target_ = target;
  record_start_ = buffer_pos_;
}

inline void Tokenizer::StopRecording() {
  // Note: The if() is necessary because some STL implementations crash when
  // you call string::append(NULL, 0), presumably because they are trying to
  // be helpful by detecting the NULL pointer, even though there's nothing
  // wrong with reading zero bytes from NULL.
  if (buffer_pos_ != record_start_) {
    record_target_->append(buffer_ + record_start_,
                           buffer_pos_ - record_start_);
  }
  record_target_ = NULL;
  record_start_ = -1;
}

inline void Tokenizer::StartToken() {
  current_.type = TYPE_START;  // Just for the sake of initializing it.
  current_.text.clear();
  current_.line = line_;
  current_.column = column_;
  RecordTo(&current_.text);
}

inline void Tokenizer::EndToken() {
  StopRecording();
  current_.end_column = column_;
}

// -------------------------------------------------------------------
// Helper methods that consume characters.

template <typename CharacterClass>
inline bool Tokenizer::LookingAt() {
  return CharacterClass::InClass(current_char_);
}

template <typename CharacterClass>
inline bool Tokenizer::TryConsumeOne() {
  if (CharacterClass::InClass(current_char_)) {
    NextChar();
    return true;
  } else {
    return false;
  }
}

inline bool Tokenizer::TryConsume(char c) {
  if (current_char_ == c) {
    NextChar();
    return true;
  } else {
    return false;
  }
}

template <typename CharacterClass>
inline void Tokenizer::ConsumeZeroOrMore() {
  while (CharacterClass::InClass(current_char_)) {
    NextChar();
  }
}

template <typename CharacterClass>
inline void Tokenizer::ConsumeOneOrMore(const char* error) {
  if (!CharacterClass::InClass(current_char_)) {
    AddError(error);
  } else {
    do {
      NextChar();
    } while (CharacterClass::InClass(current_char_));
  }
}

// -------------------------------------------------------------------
// Methods that read whole patterns matching certain kinds of tokens
// or comments.

void Tokenizer::ConsumeString(char delimiter) {
  while (true) {
    switch (current_char_) {
      case '\0':
        AddError("Unexpected end of string.");
        return;

      case '\n': {
        if (!allow_multiline_strings_) {
          AddError("String literals cannot cross line boundaries.");
          return;
        }
        NextChar();
        break;
      }

      case '\\': {
        // An escape sequence.
        NextChar();
        if (TryConsumeOne<Escape>()) {
          // Valid escape sequence.
        } else if (TryConsumeOne<OctalDigit>()) {
          // Possibly followed by two more octal digits, but these will
          // just be consumed by the main loop anyway so we don't need
          // to do so explicitly here.
        } else if (TryConsume('x')) {
          if (!TryConsumeOne<HexDigit>()) {
            AddError("Expected hex digits for escape sequence.");
          }
          // Possibly followed by another hex digit, but again we don't care.
        } else if (TryConsume('u')) {
          if (!TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>()) {
            AddError("Expected four hex digits for \\u escape sequence.");
          }
        } else if (TryConsume('U')) {
          // We expect 8 hex digits; but only the range up to 0x10ffff is
          // legal.
          if (!TryConsume('0') || !TryConsume('0') ||
              !(TryConsume('0') || TryConsume('1')) ||
              !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError(
                "Expected eight hex digits up to 10ffff for \\U escape "
                "sequence");
          }
        } else {
          AddError("Invalid escape sequence in string literal.");
        }
        break;
      }

      default: {
        if (current_char_ == delimiter) {
          NextChar();
          return;
        }
        NextChar();
        break;
      }
    }
  }
}

Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
                                              bool started_with_dot) {
  bool is_float = false;

  if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
    // A hex number (started with "0x").
    ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");

  } else if (started_with_zero && LookingAt<Digit>()) {
    // An octal number (had a leading zero).
    ConsumeZeroOrMore<OctalDigit>();
    if (LookingAt<Digit>()) {
      AddError("Numbers starting with leading zero must be in octal.");
      ConsumeZeroOrMore<Digit>();
    }

  } else {
    // A decimal number.
    if (started_with_dot) {
      is_float = true;
      ConsumeZeroOrMore<Digit>();
    } else {
      ConsumeZeroOrMore<Digit>();

      if (TryConsume('.')) {
        is_float = true;
        ConsumeZeroOrMore<Digit>();
      }
    }

    if (TryConsume('e') || TryConsume('E')) {
      is_float = true;
      TryConsume('-') || TryConsume('+');
      ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
    }

    if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
      is_float = true;
    }
  }

  if (LookingAt<Letter>() && require_space_after_number_) {
    AddError("Need space between number and identifier.");
  } else if (current_char_ == '.') {
    if (is_float) {
      AddError(
          "Already saw decimal point or exponent; can't have another one.");
    } else {
      AddError("Hex and octal numbers must be integers.");
    }
  }

  return is_float ? TYPE_FLOAT : TYPE_INTEGER;
}
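
// A few illustrative classifications (informal, based on the logic above;
// recall that Next() has already consumed the leading digit or dot before
// calling this): "0x1f" and "0755" come back as TYPE_INTEGER, while ".5",
// "1.5", "1e10", and (with allow_f_after_float_ set) "2f" come back as
// TYPE_FLOAT. Malformed forms such as "0x" or "1e" are still returned as
// tokens, but with an error reported through AddError().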

void Tokenizer::ConsumeLineComment(std::string* content) {
  if (content != NULL) RecordTo(content);

  while (current_char_ != '\0' && current_char_ != '\n') {
    NextChar();
  }
  TryConsume('\n');

  if (content != NULL) StopRecording();
}

void Tokenizer::ConsumeBlockComment(std::string* content) {
  int start_line = line_;
  int start_column = column_ - 2;

  if (content != NULL) RecordTo(content);

  while (true) {
    while (current_char_ != '\0' && current_char_ != '*' &&
           current_char_ != '/' && current_char_ != '\n') {
      NextChar();
    }

    if (TryConsume('\n')) {
      if (content != NULL) StopRecording();

      // Consume leading whitespace and asterisk.
      ConsumeZeroOrMore<WhitespaceNoNewline>();
      if (TryConsume('*')) {
        if (TryConsume('/')) {
          // End of comment.
          break;
        }
      }

      if (content != NULL) RecordTo(content);
    } else if (TryConsume('*') && TryConsume('/')) {
      // End of comment.
      if (content != NULL) {
        StopRecording();
        // Strip trailing "*/".
        content->erase(content->size() - 2);
      }
      break;
    } else if (TryConsume('/') && current_char_ == '*') {
      // Note: We didn't consume the '*' because if there is a '/' after it
      // we want to interpret that as the end of the comment.
      AddError(
          "\"/*\" inside block comment. Block comments cannot be nested.");
    } else if (current_char_ == '\0') {
      AddError("End-of-file inside block comment.");
      error_collector_->AddError(start_line, start_column,
                                 " Comment started here.");
      if (content != NULL) StopRecording();
      break;
    }
  }
}

Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
  if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
    if (TryConsume('/')) {
      return LINE_COMMENT;
    } else if (TryConsume('*')) {
      return BLOCK_COMMENT;
    } else {
      // Oops, it was just a slash. Return it.
      current_.type = TYPE_SYMBOL;
      current_.text = "/";
      current_.line = line_;
      current_.column = column_ - 1;
      current_.end_column = column_;
      return SLASH_NOT_COMMENT;
    }
  } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
    return LINE_COMMENT;
  } else {
    return NO_COMMENT;
  }
}

// -------------------------------------------------------------------

bool Tokenizer::Next() {
  previous_ = current_;

  while (!read_error_) {
    ConsumeZeroOrMore<Whitespace>();

    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(NULL);
        continue;
      case BLOCK_COMMENT:
        ConsumeBlockComment(NULL);
        continue;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        break;
    }

    // Check for EOF before continuing.
    if (read_error_) break;

    if (LookingAt<Unprintable>() || current_char_ == '\0') {
      AddError("Invalid control characters encountered in text.");
      NextChar();
      // Skip more unprintable characters, too. But, remember that '\0' is
      // also what current_char_ is set to after EOF / read error. We have
      // to be careful not to go into an infinite loop of trying to consume
      // it, so make sure to check read_error_ explicitly before consuming
      // '\0'.
      while (TryConsumeOne<Unprintable>() ||
             (!read_error_ && TryConsume('\0'))) {
        // Ignore.
      }

    } else {
      // Reading some sort of token.
      StartToken();

      if (TryConsumeOne<Letter>()) {
        ConsumeZeroOrMore<Alphanumeric>();
        current_.type = TYPE_IDENTIFIER;
      } else if (TryConsume('0')) {
        current_.type = ConsumeNumber(true, false);
      } else if (TryConsume('.')) {
        // This could be the beginning of a floating-point number, or it could
        // just be a '.' symbol.

        if (TryConsumeOne<Digit>()) {
          // It's a floating-point number.
          if (previous_.type == TYPE_IDENTIFIER &&
              current_.line == previous_.line &&
              current_.column == previous_.end_column) {
            // We don't accept syntax like "blah.123".
            error_collector_->AddError(
                line_, column_ - 2,
                "Need space between identifier and decimal point.");
          }
          current_.type = ConsumeNumber(false, true);
        } else {
          current_.type = TYPE_SYMBOL;
        }
      } else if (TryConsumeOne<Digit>()) {
        current_.type = ConsumeNumber(false, false);
      } else if (TryConsume('\"')) {
        ConsumeString('\"');
        current_.type = TYPE_STRING;
      } else if (TryConsume('\'')) {
        ConsumeString('\'');
        current_.type = TYPE_STRING;
      } else {
        // Check if the high order bit is set.
        if (current_char_ & 0x80) {
          error_collector_->AddError(
              line_, column_,
              StringPrintf("Interpreting non ascii codepoint %d.",
                           static_cast<unsigned char>(current_char_)));
        }
        NextChar();
        current_.type = TYPE_SYMBOL;
      }

      EndToken();
      return true;
    }
  }

  // EOF
  current_.type = TYPE_END;
  current_.text.clear();
  current_.line = line_;
  current_.column = column_;
  current_.end_column = column_;
  return false;
}

namespace {

// Helper class for collecting comments and putting them in the right places.
//
// This basically just buffers the most recent comment until it can be decided
// exactly where that comment should be placed. When Flush() is called, the
// current comment goes into either prev_trailing_comments or detached_comments.
// When the CommentCollector is destroyed, the last buffered comment goes into
// next_leading_comments.
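//
// A rough illustration of where comments end up (hypothetical .proto input;
// the exact rules live in NextWithComments() below):
//
//   optional int32 foo = 1;  // This becomes foo's trailing comment.
//
//   // This comment, separated from both neighbors by blank lines,
//   // ends up in detached_comments.
//
//   // This becomes bar's leading comment.
//   optional int32 bar = 2;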
class CommentCollector {
 public:
  CommentCollector(std::string* prev_trailing_comments,
                   std::vector<std::string>* detached_comments,
                   std::string* next_leading_comments)
      : prev_trailing_comments_(prev_trailing_comments),
        detached_comments_(detached_comments),
        next_leading_comments_(next_leading_comments),
        has_comment_(false),
        is_line_comment_(false),
        can_attach_to_prev_(true) {
    if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
    if (detached_comments != NULL) detached_comments->clear();
    if (next_leading_comments != NULL) next_leading_comments->clear();
  }

  ~CommentCollector() {
    // Whatever is in the buffer is a leading comment.
    if (next_leading_comments_ != NULL && has_comment_) {
      comment_buffer_.swap(*next_leading_comments_);
    }
  }

  // About to read a line comment. Get the comment buffer pointer in order to
  // read into it.
  std::string* GetBufferForLineComment() {
    // We want to combine with previous line comments, but not block comments.
    if (has_comment_ && !is_line_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = true;
    return &comment_buffer_;
  }

  // About to read a block comment. Get the comment buffer pointer in order to
  // read into it.
  std::string* GetBufferForBlockComment() {
    if (has_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = false;
    return &comment_buffer_;
  }

  void ClearBuffer() {
    comment_buffer_.clear();
    has_comment_ = false;
  }

  // Called once we know that the comment buffer is complete and is *not*
  // connected to the next token.
  void Flush() {
    if (has_comment_) {
      if (can_attach_to_prev_) {
        if (prev_trailing_comments_ != NULL) {
          prev_trailing_comments_->append(comment_buffer_);
        }
        can_attach_to_prev_ = false;
      } else {
        if (detached_comments_ != NULL) {
          detached_comments_->push_back(comment_buffer_);
        }
      }
      ClearBuffer();
    }
  }

  void DetachFromPrev() { can_attach_to_prev_ = false; }

 private:
  std::string* prev_trailing_comments_;
  std::vector<std::string>* detached_comments_;
  std::string* next_leading_comments_;

  std::string comment_buffer_;

  // True if any comments were read into comment_buffer_. This can be true even
  // if comment_buffer_ is empty, namely if the comment was "/**/".
  bool has_comment_;

  // Is the comment in the comment buffer a line comment?
  bool is_line_comment_;

  // Is it still possible that we could be reading a comment attached to the
  // previous token?
  bool can_attach_to_prev_;
};

}  // namespace

bool Tokenizer::NextWithComments(std::string* prev_trailing_comments,
                                 std::vector<std::string>* detached_comments,
                                 std::string* next_leading_comments) {
  CommentCollector collector(prev_trailing_comments, detached_comments,
                             next_leading_comments);

  if (current_.type == TYPE_START) {
    // Ignore the Unicode byte order mark (BOM) if it appears at the file
    // beginning. Only the UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
    if (TryConsume((char)0xEF)) {
      if (!TryConsume((char)0xBB) || !TryConsume((char)0xBF)) {
        AddError(
            "Proto file starts with 0xEF but not UTF-8 BOM. "
            "Only UTF-8 is accepted for proto file.");
        return false;
      }
    }
    collector.DetachFromPrev();
  } else {
    // A comment appearing on the same line must be attached to the previous
    // declaration.
    ConsumeZeroOrMore<WhitespaceNoNewline>();
    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());

        // Don't allow comments on subsequent lines to be attached to a
        // trailing comment.
        collector.Flush();
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        ConsumeZeroOrMore<WhitespaceNoNewline>();
        if (!TryConsume('\n')) {
          // Oops, the next token is on the same line. If we recorded a comment
          // we really have no idea which token it should be attached to.
          collector.ClearBuffer();
          return Next();
        }

        // Don't allow comments on subsequent lines to be attached to a
        // trailing comment.
        collector.Flush();
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (!TryConsume('\n')) {
          // The next token is on the same line. There are no comments.
          return Next();
        }
        break;
    }
  }

  // OK, we are now on the line *after* the previous token.
  while (true) {
    ConsumeZeroOrMore<WhitespaceNoNewline>();

    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        // Consume the rest of the line so that we don't interpret it as a
        // blank line the next time around the loop.
        ConsumeZeroOrMore<WhitespaceNoNewline>();
        TryConsume('\n');
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (TryConsume('\n')) {
          // Completely blank line.
          collector.Flush();
          collector.DetachFromPrev();
        } else {
          bool result = Next();
          if (!result || current_.text == "}" || current_.text == "]" ||
              current_.text == ")") {
            // It looks like we're at the end of a scope. In this case it
            // makes no sense to attach a comment to the following token.
            collector.Flush();
          }
          return result;
        }
        break;
    }
  }
}

// -------------------------------------------------------------------
// Token-parsing helpers. Remember that these don't need to report
// errors since any errors should already have been reported while
// tokenizing. Also, these can assume that whatever text they
// are given is text that the tokenizer actually parsed as a token
// of the given type.

bool Tokenizer::ParseInteger(const std::string& text, uint64 max_value,
                             uint64* output) {
  // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull()
  // is non-standard. I hate the C standard library. :(

  // return strtoull(text.c_str(), NULL, 0);

  const char* ptr = text.c_str();
  int base = 10;
  if (ptr[0] == '0') {
    if (ptr[1] == 'x' || ptr[1] == 'X') {
      // This is hex.
      base = 16;
      ptr += 2;
    } else {
      // This is octal.
      base = 8;
    }
  }

  uint64 result = 0;
  for (; *ptr != '\0'; ptr++) {
    int digit = DigitValue(*ptr);
    if (digit < 0 || digit >= base) {
      // The token provided by the Tokenizer is invalid; e.g., 099 is an
      // invalid token, but the Tokenizer still thinks it is an integer.
      return false;
    }
    if (digit > max_value || result > (max_value - digit) / base) {
      // Overflow.
      return false;
    }
    result = result * base + digit;
  }

  *output = result;
  return true;
}
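
// Informal usage sketch (illustrative, not from the original file): for text
// the tokenizer already accepted as TYPE_INTEGER, ParseInteger("0x1f", max,
// &value) sets value to 31, ParseInteger("0755", max, &value) sets it to 493,
// and ParseInteger("099", max, &value) returns false because '9' is not an
// octal digit; here max is whatever upper bound the caller wants enforced.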

double Tokenizer::ParseFloat(const std::string& text) {
  const char* start = text.c_str();
  char* end;
  double result = NoLocaleStrtod(start, &end);

  // "1e" is not a valid float, but if the tokenizer reads it, it will
  // report an error but still return it as a valid token. We need to
  // accept anything the tokenizer could possibly return, error or not.
  if (*end == 'e' || *end == 'E') {
    ++end;
    if (*end == '-' || *end == '+') ++end;
  }

  // If the Tokenizer had allow_f_after_float_ enabled, the float may be
  // suffixed with the letter 'f'.
  if (*end == 'f' || *end == 'F') {
    ++end;
  }

  GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-')
      << " Tokenizer::ParseFloat() passed text that could not have been"
         " tokenized as a float: "
      << CEscape(text);
  return result;
}
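
// Informal examples (illustrative): ParseFloat("1.5e2") returns 150.0, and
// ParseFloat("2f") returns 2.0 because the trailing 'f' is skipped above.
// Passing text that the tokenizer would never have produced as a float token,
// such as "-1.5", trips the DFATAL check, since the tokenizer emits the minus
// sign as a separate symbol.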

// Helper to append a Unicode code point to a string as UTF8, without bringing
// in any external dependencies.
static void AppendUTF8(uint32 code_point, std::string* output) {
  uint32 tmp = 0;
  int len = 0;
  if (code_point <= 0x7f) {
    tmp = code_point;
    len = 1;
  } else if (code_point <= 0x07ff) {
    tmp = 0x0000c080 | ((code_point & 0x07c0) << 2) | (code_point & 0x003f);
    len = 2;
  } else if (code_point <= 0xffff) {
    tmp = 0x00e08080 | ((code_point & 0xf000) << 4) |
          ((code_point & 0x0fc0) << 2) | (code_point & 0x003f);
    len = 3;
  } else if (code_point <= 0x1fffff) {
    tmp = 0xf0808080 | ((code_point & 0x1c0000) << 6) |
          ((code_point & 0x03f000) << 4) | ((code_point & 0x000fc0) << 2) |
          (code_point & 0x003f);
    len = 4;
  } else {
    // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
    // normally only defined up to there as well.
    StringAppendF(output, "\\U%08x", code_point);
    return;
  }
  tmp = ghtonl(tmp);
  output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
}
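
// For example (an informal check of the bit-packing above): AppendUTF8(0x41,
// &s) appends the single byte 0x41 ('A'), AppendUTF8(0xE9, &s) appends the
// two bytes 0xC3 0xA9 (U+00E9), and AppendUTF8(0x1F600, &s) appends the four
// bytes 0xF0 0x9F 0x98 0x80.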

// Try to read <len> hex digits from ptr, and stuff the numeric result into
// *result. Returns true if that many digits were successfully consumed.
static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
  *result = 0;
  if (len == 0) return false;
  for (const char* end = ptr + len; ptr < end; ++ptr) {
    if (*ptr == '\0') return false;
    *result = (*result << 4) + DigitValue(*ptr);
  }
  return true;
}

// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
// surrogate. These numbers are in a reserved range of Unicode code points, so
// if we encounter such a pair we know how to parse it and convert it into a
// single code point.
static const uint32 kMinHeadSurrogate = 0xd800;
static const uint32 kMaxHeadSurrogate = 0xdc00;
static const uint32 kMinTrailSurrogate = 0xdc00;
static const uint32 kMaxTrailSurrogate = 0xe000;

static inline bool IsHeadSurrogate(uint32 code_point) {
  return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
}

static inline bool IsTrailSurrogate(uint32 code_point) {
  return (code_point >= kMinTrailSurrogate) &&
         (code_point < kMaxTrailSurrogate);
}

// Combine a head and trail surrogate into a single Unicode code point.
static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
  GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
  GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
  return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
                    (trail_surrogate - kMinTrailSurrogate));
}
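
// Worked example (informal): the emoji U+1F600 is written in UTF-16 as the
// pair 0xD83D 0xDE00. Plugging that into the formula above:
//   (0xD83D - 0xD800) << 10  ->  0x3D << 10  ->  0xF400
//   (0xDE00 - 0xDC00)        ->  0x200
//   0x10000 + (0xF400 | 0x200) == 0x1F600
// so AssembleUTF16(0xD83D, 0xDE00) recovers the original code point.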

// Convert the escape sequence parameter to a number of expected hex digits.
static inline int UnicodeLength(char key) {
  if (key == 'u') return 4;
  if (key == 'U') return 8;
  return 0;
}

// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
// to parse that sequence. On success, returns a pointer to the first char
// beyond that sequence, and fills in *code_point. On failure, returns ptr
// itself.
static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
  const char* p = ptr;
  // Fetch the code point.
  const int len = UnicodeLength(*p++);
  if (!ReadHexDigits(p, len, code_point)) return ptr;
  p += len;

  // Check if the code point we read is a "head surrogate." If so, then we
  // expect it to be immediately followed by another code point which is a valid
  // "trail surrogate," and together they form a UTF-16 pair which decodes into
  // a single Unicode point. Trail surrogates may only use \u, not \U.
  if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
    uint32 trail_surrogate;
    if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
        IsTrailSurrogate(trail_surrogate)) {
      *code_point = AssembleUTF16(*code_point, trail_surrogate);
      p += 6;
    }
    // If this failed, then we just emit the head surrogate as a code point.
    // It's bogus, but so is the string.
  }

  return p;
}

// The text string must begin and end with single or double quote
// characters.
void Tokenizer::ParseStringAppend(const std::string& text,
                                  std::string* output) {
  // Reminder: text[0] is always a quote character. (If text is
  // empty, it's invalid, so we'll just return).
  const size_t text_size = text.size();
  if (text_size == 0) {
    GOOGLE_LOG(DFATAL) << " Tokenizer::ParseStringAppend() passed text that could not"
                          " have been tokenized as a string: "
                       << CEscape(text);
    return;
  }

  // Reserve room for new string. The branch is necessary because if
  // there is already space available the reserve() call might
  // downsize the output.
  const size_t new_len = text_size + output->size();
  if (new_len > output->capacity()) {
    output->reserve(new_len);
  }

  // Loop through the string copying characters to "output" and
  // interpreting escape sequences. Note that any invalid escape
  // sequences or other errors were already reported while tokenizing.
  // In this case we do not need to produce valid results.
  for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
    if (*ptr == '\\' && ptr[1] != '\0') {
      // An escape sequence.
      ++ptr;

      if (OctalDigit::InClass(*ptr)) {
        // An octal escape. May be one, two, or three digits.
        int code = DigitValue(*ptr);
        if (OctalDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        if (OctalDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        output->push_back(static_cast<char>(code));

      } else if (*ptr == 'x') {
        // A hex escape. May have zero, one, or two digits. (The zero case
        // will have been caught as an error earlier.)
        int code = 0;
        if (HexDigit::InClass(ptr[1])) {
          ++ptr;
          code = DigitValue(*ptr);
        }
        if (HexDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 16 + DigitValue(*ptr);
        }
        output->push_back(static_cast<char>(code));

      } else if (*ptr == 'u' || *ptr == 'U') {
        uint32 unicode;
        const char* end = FetchUnicodePoint(ptr, &unicode);
        if (end == ptr) {
          // Failure: Just dump out what we saw, don't try to parse it.
          output->push_back(*ptr);
        } else {
          AppendUTF8(unicode, output);
          ptr = end - 1;  // Because we're about to ++ptr.
        }
      } else {
        // Some other escape code.
        output->push_back(TranslateEscape(*ptr));
      }

    } else if (*ptr == text[0] && ptr[1] == '\0') {
      // Ignore final quote matching the starting quote.
    } else {
      output->push_back(*ptr);
    }
  }
}
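
// Informal example of the loop above: for the token text "\"a\\x41\\101\""
// (i.e. the source literal "a\x41\101"), ParseStringAppend() appends "aAA" to
// *output: 'a' is copied verbatim, \x41 decodes as hex 0x41, \101 decodes as
// octal 65, and the surrounding quotes are dropped.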

template <typename CharacterClass>
static bool AllInClass(const std::string& s) {
  for (int i = 0; i < s.size(); ++i) {
    if (!CharacterClass::InClass(s[i])) return false;
  }
  return true;
}

bool Tokenizer::IsIdentifier(const std::string& text) {
  // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
  if (text.size() == 0) return false;
  if (!Letter::InClass(text.at(0))) return false;
  if (!AllInClass<Alphanumeric>(text.substr(1))) return false;
  return true;
}
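
// For instance (informal): IsIdentifier("foo_bar1") and IsIdentifier("_x")
// return true, while IsIdentifier(""), IsIdentifier("1foo"), and
// IsIdentifier("foo-bar") return false, matching the Letter-then-Alphanumeric
// rule Next() uses when it emits TYPE_IDENTIFIER.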

}  // namespace io
}  // namespace protobuf
}  // namespace google