• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7 
8 // Author: kenton@google.com (Kenton Varda)
9 //  Based on original Protocol Buffers design by
10 //  Sanjay Ghemawat, Jeff Dean, and others.
11 //
12 // Here we have a hand-written lexer.  At first you might ask yourself,
13 // "Hand-written text processing?  Is Kenton crazy?!"  Well, first of all,
14 // yes I am crazy, but that's beside the point.  There are actually reasons
15 // why I ended up writing this this way.
16 //
17 // The traditional approach to lexing is to use lex to generate a lexer for
18 // you.  Unfortunately, lex's output is ridiculously ugly and difficult to
19 // integrate cleanly with C++ code, especially abstract code or code meant
20 // as a library.  Better parser-generators exist but would add dependencies
21 // which most users won't already have, which we'd like to avoid.  (GNU flex
22 // has a C++ output option, but it's still ridiculously ugly, non-abstract,
23 // and not library-friendly.)
24 //
25 // The next approach that any good software engineer should look at is to
26 // use regular expressions.  And, indeed, I did.  I have code which
27 // implements this same class using regular expressions.  It's about 200
28 // lines shorter.  However:
29 // - Rather than error messages telling you "This string has an invalid
30 //   escape sequence at line 5, column 45", you get error messages like
31 //   "Parse error on line 5".  Giving more precise errors requires adding
32 //   a lot of code that ends up basically as complex as the hand-coded
33 //   version anyway.
34 // - The regular expression to match a string literal looks like this:
35 //     kString  = new RE("(\"([^\"\\\\]|"              // non-escaped
36 //                       "\\\\[abfnrtv?\"'\\\\0-7]|"   // normal escape
37 //                       "\\\\x[0-9a-fA-F])*\"|"       // hex escape
38 //                       "\'([^\'\\\\]|"        // Also support single-quotes.
39 //                       "\\\\[abfnrtv?\"'\\\\0-7]|"
40 //                       "\\\\x[0-9a-fA-F])*\')");
41 //   Verifying the correctness of this line noise is actually harder than
42 //   verifying the correctness of ConsumeString(), defined below.  I'm not
43 //   even confident that the above is correct, after staring at it for some
44 //   time.
45 // - PCRE is fast, but there's still more overhead involved than the code
46 //   below.
47 // - Sadly, regular expressions are not part of the C standard library, so
48 //   using them would require depending on some other library.  For the
49 //   open source release, this could be really annoying.  Nobody likes
50 //   downloading one piece of software just to find that they need to
51 //   download something else to make it work, and in all likelihood
52 //   people downloading Protocol Buffers will already be doing so just
53 //   to make something else work.  We could include a copy of PCRE with
54 //   our code, but that obligates us to keep it up-to-date and just seems
55 //   like a big waste just to save 200 lines of code.
56 //
57 // On a similar but unrelated note, I'm even scared to use ctype.h.
58 // Apparently functions like isalpha() are locale-dependent.  So, if we used
59 // that, then if this code is being called from some program that doesn't
60 // have its locale set to "C", it would behave strangely.  We can't just set
61 // the locale to "C" ourselves since we might break the calling program that
62 // way, particularly if it is multi-threaded.  WTF?  Someone please let me
63 // (Kenton) know if I'm missing something here...
64 //
65 // I'd love to hear about other alternatives, though, as this code isn't
66 // exactly pretty.
67 
68 #include "google/protobuf/io/tokenizer.h"
69 
70 #include "google/protobuf/stubs/common.h"
71 #include "absl/log/absl_check.h"
72 #include "absl/log/absl_log.h"
73 #include "absl/strings/escaping.h"
74 #include "absl/strings/str_format.h"
75 #include "google/protobuf/io/strtod.h"
76 #include "google/protobuf/io/zero_copy_stream.h"
77 
78 // Must be included last.
79 #include "google/protobuf/port_def.inc"
80 
81 namespace google {
82 namespace protobuf {
83 namespace io {
84 namespace {
85 
86 // As mentioned above, I don't trust ctype.h due to the presence of "locales".
87 // So, I have written replacement functions here.  Someone please smack me if
88 // this is a bad idea or if there is some way around this.
89 //
90 // These "character classes" are designed to be used in template methods.
91 // For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
92 // whitespace.
93 
94 // Note:  No class is allowed to contain '\0', since this is used to mark end-
95 //   of-input and is handled specially.
96 
// Each character class is a stateless type exposing one static predicate,
// InClass(char), so that it can be supplied as a template argument.

// Whitespace other than the line feed character.
class WhitespaceNoNewline {
 public:
  static inline bool InClass(char c) {
    switch (c) {
      case ' ':
      case '\t':
      case '\r':
      case '\v':
      case '\f':
        return true;
      default:
        return false;
    }
  }
};

// Any whitespace character, including '\n'.
class Whitespace {
 public:
  static inline bool InClass(char c) {
    return c == '\n' || WhitespaceNoNewline::InClass(c);
  }
};

// Characters strictly between '\0' and ' '.  '\0' itself is excluded.
class Unprintable {
 public:
  static inline bool InClass(char c) { return '\0' < c && c < ' '; }
};

// Decimal digits.
class Digit {
 public:
  static inline bool InClass(char c) { return c >= '0' && c <= '9'; }
};

// Octal digits.
class OctalDigit {
 public:
  static inline bool InClass(char c) { return c >= '0' && c <= '7'; }
};

// Hexadecimal digits, in either letter case.
class HexDigit {
 public:
  static inline bool InClass(char c) {
    return Digit::InClass(c) || (c >= 'a' && c <= 'f') ||
           (c >= 'A' && c <= 'F');
  }
};

// ASCII letters and the underscore.
class Letter {
 public:
  static inline bool InClass(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
  }
};

// ASCII letters, decimal digits and the underscore.
class Alphanumeric {
 public:
  static inline bool InClass(char c) {
    return Letter::InClass(c) || Digit::InClass(c);
  }
};

// Characters that form a single-character escape when preceded by a
// backslash.
class Escape {
 public:
  static inline bool InClass(char c) {
    switch (c) {
      case 'a':
      case 'b':
      case 'f':
      case 'n':
      case 'r':
      case 't':
      case 'v':
      case '\\':
      case '?':
      case '\'':
      case '\"':
        return true;
      default:
        return false;
    }
  }
};
127 
// Lookup table mapping every possible byte value to the numeric value of the
// digit it represents: '0'-'9' map to 0-9 and letters (in either case) map to
// 10-35, which covers any number base up to 36.  Every other byte maps to 36,
// which marks an invalid character since it is not a digit in any supported
// base.
static const int8_t kAsciiToInt[256] = {
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 00-0F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 10-1F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // ' '-'/'
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,                           // '0'-'9'
    36, 36, 36, 36, 36, 36, 36,                                      // ':'-'@'
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  // 'A'-'P'
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35,                          // 'Q'-'Z'
    36, 36, 36, 36, 36, 36,                                          // '['-'`'
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  // 'a'-'p'
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35,                          // 'q'-'z'
    36, 36, 36, 36, 36,                                              // '{'-DEL
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 80-8F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 90-9F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // A0-AF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // B0-BF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // C0-CF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // D0-DF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // E0-EF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // F0-FF
};
154 
DigitValue(char digit)155 inline int DigitValue(char digit) { return kAsciiToInt[digit & 0xFF]; }
156 
// Translates the character that follows a backslash in a single-character
// escape sequence into the character the sequence stands for.
// Inline because it's only used in one place.
inline char TranslateEscape(char c) {
  struct EscapeMapping {
    char letter;  // Character following the backslash.
    char value;   // Character the escape sequence denotes.
  };
  constexpr EscapeMapping kEscapeMappings[] = {
      {'a', '\a'}, {'b', '\b'},  {'f', '\f'},
      {'n', '\n'}, {'r', '\r'},  {'t', '\t'},
      {'v', '\v'}, {'\\', '\\'}, {'?', '\?'},  // Trigraphs = :(
      {'\'', '\''}, {'"', '\"'},
  };

  for (const EscapeMapping& mapping : kEscapeMappings) {
    if (mapping.letter == c) {
      return mapping.value;
    }
  }

  // We expect escape sequences to have been validated separately.
  return '?';
}
188 
189 }  // anonymous namespace
190 
~ErrorCollector()191 ErrorCollector::~ErrorCollector() {}
192 
193 // ===================================================================
194 
Tokenizer(ZeroCopyInputStream * input,ErrorCollector * error_collector)195 Tokenizer::Tokenizer(ZeroCopyInputStream* input,
196                      ErrorCollector* error_collector)
197     : input_(input),
198       error_collector_(error_collector),
199       buffer_(nullptr),
200       buffer_size_(0),
201       buffer_pos_(0),
202       read_error_(false),
203       line_(0),
204       column_(0),
205       record_target_(nullptr),
206       record_start_(-1),
207       allow_f_after_float_(false),
208       comment_style_(CPP_COMMENT_STYLE),
209       require_space_after_number_(true),
210       allow_multiline_strings_(false) {
211   current_.line = 0;
212   current_.column = 0;
213   current_.end_column = 0;
214   current_.type = TYPE_START;
215   previous_ = current_;
216 
217   Refresh();
218 }
219 
~Tokenizer()220 Tokenizer::~Tokenizer() {
221   // If we had any buffer left unread, return it to the underlying stream
222   // so that someone else can read it.
223   if (buffer_size_ > buffer_pos_) {
224     input_->BackUp(buffer_size_ - buffer_pos_);
225   }
226 }
227 
report_whitespace() const228 bool Tokenizer::report_whitespace() const { return report_whitespace_; }
229 // Note: `set_report_whitespace(false)` implies `set_report_newlines(false)`.
set_report_whitespace(bool report)230 void Tokenizer::set_report_whitespace(bool report) {
231   report_whitespace_ = report;
232   report_newlines_ &= report;
233 }
234 
235 // If true, newline tokens are reported by Next().
report_newlines() const236 bool Tokenizer::report_newlines() const { return report_newlines_; }
237 // Note: `set_report_newlines(true)` implies `set_report_whitespace(true)`.
set_report_newlines(bool report)238 void Tokenizer::set_report_newlines(bool report) {
239   report_newlines_ = report;
240   report_whitespace_ |= report;  // enable report_whitespace if necessary
241 }
242 
243 // -------------------------------------------------------------------
244 // Internal helpers.
245 
NextChar()246 void Tokenizer::NextChar() {
247   // Update our line and column counters based on the character being
248   // consumed.
249   if (current_char_ == '\n') {
250     ++line_;
251     column_ = 0;
252   } else if (current_char_ == '\t') {
253     column_ += kTabWidth - column_ % kTabWidth;
254   } else {
255     ++column_;
256   }
257 
258   // Advance to the next character.
259   ++buffer_pos_;
260   if (buffer_pos_ < buffer_size_) {
261     current_char_ = buffer_[buffer_pos_];
262   } else {
263     Refresh();
264   }
265 }
266 
Refresh()267 void Tokenizer::Refresh() {
268   if (read_error_) {
269     current_char_ = '\0';
270     return;
271   }
272 
273   // If we're in a token, append the rest of the buffer to it.
274   if (record_target_ != nullptr && record_start_ < buffer_size_) {
275     record_target_->append(buffer_ + record_start_,
276                            buffer_size_ - record_start_);
277     record_start_ = 0;
278   }
279 
280   const void* data = NULL;
281   buffer_ = NULL;
282   buffer_pos_ = 0;
283   do {
284     if (!input_->Next(&data, &buffer_size_)) {
285       // end of stream (or read error)
286       buffer_size_ = 0;
287       read_error_ = true;
288       current_char_ = '\0';
289       return;
290     }
291   } while (buffer_size_ == 0);
292 
293   buffer_ = static_cast<const char*>(data);
294 
295   current_char_ = buffer_[0];
296 }
297 
RecordTo(std::string * target)298 inline void Tokenizer::RecordTo(std::string* target) {
299   record_target_ = target;
300   record_start_ = buffer_pos_;
301 }
302 
StopRecording()303 inline void Tokenizer::StopRecording() {
304   // Note:  The if() is necessary because some STL implementations crash when
305   //   you call string::append(NULL, 0), presumably because they are trying to
306   //   be helpful by detecting the NULL pointer, even though there's nothing
307   //   wrong with reading zero bytes from NULL.
308   if (buffer_pos_ != record_start_) {
309     record_target_->append(buffer_ + record_start_,
310                            buffer_pos_ - record_start_);
311   }
312   record_target_ = NULL;
313   record_start_ = -1;
314 }
315 
StartToken()316 inline void Tokenizer::StartToken() {
317   current_.type = TYPE_START;  // Just for the sake of initializing it.
318   current_.text.clear();
319   current_.line = line_;
320   current_.column = column_;
321   RecordTo(&current_.text);
322 }
323 
EndToken()324 inline void Tokenizer::EndToken() {
325   StopRecording();
326   current_.end_column = column_;
327 }
328 
329 // -------------------------------------------------------------------
330 // Helper methods that consume characters.
331 
332 template <typename CharacterClass>
LookingAt()333 inline bool Tokenizer::LookingAt() {
334   return CharacterClass::InClass(current_char_);
335 }
336 
337 template <typename CharacterClass>
TryConsumeOne()338 inline bool Tokenizer::TryConsumeOne() {
339   if (CharacterClass::InClass(current_char_)) {
340     NextChar();
341     return true;
342   } else {
343     return false;
344   }
345 }
346 
TryConsume(char c)347 inline bool Tokenizer::TryConsume(char c) {
348   if (current_char_ == c) {
349     NextChar();
350     return true;
351   } else {
352     return false;
353   }
354 }
355 
356 template <typename CharacterClass>
ConsumeZeroOrMore()357 inline void Tokenizer::ConsumeZeroOrMore() {
358   while (CharacterClass::InClass(current_char_)) {
359     NextChar();
360   }
361 }
362 
363 template <typename CharacterClass>
ConsumeOneOrMore(const char * error)364 inline void Tokenizer::ConsumeOneOrMore(const char* error) {
365   if (!CharacterClass::InClass(current_char_)) {
366     AddError(error);
367   } else {
368     do {
369       NextChar();
370     } while (CharacterClass::InClass(current_char_));
371   }
372 }
373 
374 // -------------------------------------------------------------------
375 // Methods that read whole patterns matching certain kinds of tokens
376 // or comments.
377 
ConsumeString(char delimiter)378 void Tokenizer::ConsumeString(char delimiter) {
379   while (true) {
380     switch (current_char_) {
381       case '\0':
382         AddError("Unexpected end of string.");
383         return;
384 
385       case '\n': {
386         if (!allow_multiline_strings_) {
387           AddError("Multiline strings are not allowed. Did you miss a \"?.");
388           return;
389         }
390         NextChar();
391         break;
392       }
393 
394       case '\\': {
395         // An escape sequence.
396         NextChar();
397         if (TryConsumeOne<Escape>()) {
398           // Valid escape sequence.
399         } else if (TryConsumeOne<OctalDigit>()) {
400           // Possibly followed by two more octal digits, but these will
401           // just be consumed by the main loop anyway so we don't need
402           // to do so explicitly here.
403         } else if (TryConsume('x') || TryConsume('X')) {
404           if (!TryConsumeOne<HexDigit>()) {
405             AddError("Expected hex digits for escape sequence.");
406           }
407           // Possibly followed by another hex digit, but again we don't care.
408         } else if (TryConsume('u')) {
409           if (!TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
410               !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>()) {
411             AddError("Expected four hex digits for \\u escape sequence.");
412           }
413         } else if (TryConsume('U')) {
414           // We expect 8 hex digits; but only the range up to 0x10ffff is
415           // legal.
416           if (!TryConsume('0') || !TryConsume('0') ||
417               !(TryConsume('0') || TryConsume('1')) ||
418               !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
419               !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
420               !TryConsumeOne<HexDigit>()) {
421             AddError(
422                 "Expected eight hex digits up to 10ffff for \\U escape "
423                 "sequence");
424           }
425         } else {
426           AddError("Invalid escape sequence in string literal.");
427         }
428         break;
429       }
430 
431       default: {
432         if (current_char_ == delimiter) {
433           NextChar();
434           return;
435         }
436         NextChar();
437         break;
438       }
439     }
440   }
441 }
442 
ConsumeNumber(bool started_with_zero,bool started_with_dot)443 Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
444                                               bool started_with_dot) {
445   bool is_float = false;
446 
447   if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
448     // A hex number (started with "0x").
449     ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");
450 
451   } else if (started_with_zero && LookingAt<Digit>()) {
452     // An octal number (had a leading zero).
453     ConsumeZeroOrMore<OctalDigit>();
454     if (LookingAt<Digit>()) {
455       AddError("Numbers starting with leading zero must be in octal.");
456       ConsumeZeroOrMore<Digit>();
457     }
458 
459   } else {
460     // A decimal number.
461     if (started_with_dot) {
462       is_float = true;
463       ConsumeZeroOrMore<Digit>();
464     } else {
465       ConsumeZeroOrMore<Digit>();
466 
467       if (TryConsume('.')) {
468         is_float = true;
469         ConsumeZeroOrMore<Digit>();
470       }
471     }
472 
473     if (TryConsume('e') || TryConsume('E')) {
474       is_float = true;
475       TryConsume('-') || TryConsume('+');
476       ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
477     }
478 
479     if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
480       is_float = true;
481     }
482   }
483 
484   if (LookingAt<Letter>() && require_space_after_number_) {
485     AddError("Need space between number and identifier.");
486   } else if (current_char_ == '.') {
487     if (is_float) {
488       AddError(
489           "Already saw decimal point or exponent; can't have another one.");
490     } else {
491       AddError("Hex and octal numbers must be integers.");
492     }
493   }
494 
495   return is_float ? TYPE_FLOAT : TYPE_INTEGER;
496 }
497 
ConsumeLineComment(std::string * content)498 void Tokenizer::ConsumeLineComment(std::string* content) {
499   if (content != NULL) RecordTo(content);
500 
501   while (current_char_ != '\0' && current_char_ != '\n') {
502     NextChar();
503   }
504   TryConsume('\n');
505 
506   if (content != NULL) StopRecording();
507 }
508 
ConsumeBlockComment(std::string * content)509 void Tokenizer::ConsumeBlockComment(std::string* content) {
510   int start_line = line_;
511   int start_column = column_ - 2;
512 
513   if (content != NULL) RecordTo(content);
514 
515   while (true) {
516     while (current_char_ != '\0' && current_char_ != '*' &&
517            current_char_ != '/' && current_char_ != '\n') {
518       NextChar();
519     }
520 
521     if (TryConsume('\n')) {
522       if (content != NULL) StopRecording();
523 
524       // Consume leading whitespace and asterisk;
525       ConsumeZeroOrMore<WhitespaceNoNewline>();
526       if (TryConsume('*')) {
527         if (TryConsume('/')) {
528           // End of comment.
529           break;
530         }
531       }
532 
533       if (content != NULL) RecordTo(content);
534     } else if (TryConsume('*') && TryConsume('/')) {
535       // End of comment.
536       if (content != NULL) {
537         StopRecording();
538         // Strip trailing "*/".
539         content->erase(content->size() - 2);
540       }
541       break;
542     } else if (TryConsume('/') && current_char_ == '*') {
543       // Note:  We didn't consume the '*' because if there is a '/' after it
544       //   we want to interpret that as the end of the comment.
545       AddError(
546           "\"/*\" inside block comment.  Block comments cannot be nested.");
547     } else if (current_char_ == '\0') {
548       AddError("End-of-file inside block comment.");
549       error_collector_->RecordError(start_line, start_column,
550                                     "  Comment started here.");
551       if (content != NULL) StopRecording();
552       break;
553     }
554   }
555 }
556 
TryConsumeCommentStart()557 Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
558   if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
559     if (TryConsume('/')) {
560       return LINE_COMMENT;
561     } else if (TryConsume('*')) {
562       return BLOCK_COMMENT;
563     } else {
564       // Oops, it was just a slash.  Return it.
565       current_.type = TYPE_SYMBOL;
566       current_.text = "/";
567       current_.line = line_;
568       current_.column = column_ - 1;
569       current_.end_column = column_;
570       return SLASH_NOT_COMMENT;
571     }
572   } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
573     return LINE_COMMENT;
574   } else {
575     return NO_COMMENT;
576   }
577 }
578 
TryConsumeWhitespace()579 bool Tokenizer::TryConsumeWhitespace() {
580   if (report_newlines_) {
581     if (TryConsumeOne<WhitespaceNoNewline>()) {
582       ConsumeZeroOrMore<WhitespaceNoNewline>();
583       current_.type = TYPE_WHITESPACE;
584       return true;
585     }
586     return false;
587   }
588   if (TryConsumeOne<Whitespace>()) {
589     ConsumeZeroOrMore<Whitespace>();
590     current_.type = TYPE_WHITESPACE;
591     return report_whitespace_;
592   }
593   return false;
594 }
595 
TryConsumeNewline()596 bool Tokenizer::TryConsumeNewline() {
597   if (!report_whitespace_ || !report_newlines_) {
598     return false;
599   }
600   if (TryConsume('\n')) {
601     current_.type = TYPE_NEWLINE;
602     return true;
603   }
604   return false;
605 }
606 
607 // -------------------------------------------------------------------
608 
Next()609 bool Tokenizer::Next() {
610   previous_ = current_;
611 
612   while (!read_error_) {
613     StartToken();
614     bool report_token = TryConsumeWhitespace() || TryConsumeNewline();
615     EndToken();
616     if (report_token) {
617       return true;
618     }
619 
620     switch (TryConsumeCommentStart()) {
621       case LINE_COMMENT:
622         ConsumeLineComment(NULL);
623         continue;
624       case BLOCK_COMMENT:
625         ConsumeBlockComment(NULL);
626         continue;
627       case SLASH_NOT_COMMENT:
628         return true;
629       case NO_COMMENT:
630         break;
631     }
632 
633     // Check for EOF before continuing.
634     if (read_error_) break;
635 
636     if (LookingAt<Unprintable>() || current_char_ == '\0') {
637       AddError("Invalid control characters encountered in text.");
638       NextChar();
639       // Skip more unprintable characters, too.  But, remember that '\0' is
640       // also what current_char_ is set to after EOF / read error.  We have
641       // to be careful not to go into an infinite loop of trying to consume
642       // it, so make sure to check read_error_ explicitly before consuming
643       // '\0'.
644       while (TryConsumeOne<Unprintable>() ||
645              (!read_error_ && TryConsume('\0'))) {
646         // Ignore.
647       }
648 
649     } else {
650       // Reading some sort of token.
651       StartToken();
652 
653       if (TryConsumeOne<Letter>()) {
654         ConsumeZeroOrMore<Alphanumeric>();
655         current_.type = TYPE_IDENTIFIER;
656       } else if (TryConsume('0')) {
657         current_.type = ConsumeNumber(true, false);
658       } else if (TryConsume('.')) {
659         // This could be the beginning of a floating-point number, or it could
660         // just be a '.' symbol.
661 
662         if (TryConsumeOne<Digit>()) {
663           // It's a floating-point number.
664           if (previous_.type == TYPE_IDENTIFIER &&
665               current_.line == previous_.line &&
666               current_.column == previous_.end_column) {
667             // We don't accept syntax like "blah.123".
668             error_collector_->RecordError(
669                 line_, column_ - 2,
670                 "Need space between identifier and decimal point.");
671           }
672           current_.type = ConsumeNumber(false, true);
673         } else {
674           current_.type = TYPE_SYMBOL;
675         }
676       } else if (TryConsumeOne<Digit>()) {
677         current_.type = ConsumeNumber(false, false);
678       } else if (TryConsume('\"')) {
679         ConsumeString('\"');
680         current_.type = TYPE_STRING;
681       } else if (TryConsume('\'')) {
682         ConsumeString('\'');
683         current_.type = TYPE_STRING;
684       } else {
685         // Check if the high order bit is set.
686         if (current_char_ & 0x80) {
687           error_collector_->RecordError(
688               line_, column_,
689               absl::StrFormat("Interpreting non ascii codepoint %d.",
690                               static_cast<unsigned char>(current_char_)));
691         }
692         NextChar();
693         current_.type = TYPE_SYMBOL;
694       }
695 
696       EndToken();
697       return true;
698     }
699   }
700 
701   // EOF
702   current_.type = TYPE_END;
703   current_.text.clear();
704   current_.line = line_;
705   current_.column = column_;
706   current_.end_column = column_;
707   return false;
708 }
709 
710 namespace {
711 
// Helper class for collecting comments and putting them in the right places.
//
// This basically just buffers the most recent comment until it can be decided
// exactly where that comment should be placed.  When Flush() is called, the
// current comment goes into either prev_trailing_comments or detached_comments.
// When the CommentCollector is destroyed, the last buffered comment goes into
// next_leading_comments.
//
// Any of the three output pointers may be null, in which case comments
// destined for that output are dropped.
class CommentCollector {
 public:
  // Clears every non-null output and starts with an empty buffer.
  CommentCollector(std::string* prev_trailing_comments,
                   std::vector<std::string>* detached_comments,
                   std::string* next_leading_comments)
      : prev_trailing_comments_(prev_trailing_comments),
        detached_comments_(detached_comments),
        next_leading_comments_(next_leading_comments),
        num_comments_(0),
        has_trailing_comment_(false),
        has_comment_(false),
        is_line_comment_(false),
        can_attach_to_prev_(true) {
    if (prev_trailing_comments != nullptr) prev_trailing_comments->clear();
    if (detached_comments != nullptr) detached_comments->clear();
    if (next_leading_comments != nullptr) next_leading_comments->clear();
  }

  // The destructor writes to the caller's outputs, so a copy would publish
  // the buffered comment a second time.  Copying is therefore disallowed.
  CommentCollector(const CommentCollector&) = delete;
  CommentCollector& operator=(const CommentCollector&) = delete;

  ~CommentCollector() {
    // Whatever is in the buffer is a leading comment.
    if (next_leading_comments_ != nullptr && has_comment_) {
      comment_buffer_.swap(*next_leading_comments_);
    }
  }

  // About to read a line comment.  Get the comment buffer pointer in order to
  // read into it.
  std::string* GetBufferForLineComment() {
    // We want to combine with previous line comments, but not block comments.
    if (has_comment_ && !is_line_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = true;
    return &comment_buffer_;
  }

  // About to read a block comment.  Get the comment buffer pointer in order to
  // read into it.
  std::string* GetBufferForBlockComment() {
    // A block comment never merges with whatever was buffered before it.
    if (has_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = false;
    return &comment_buffer_;
  }

  // Discards the buffered comment, if any.
  void ClearBuffer() {
    comment_buffer_.clear();
    has_comment_ = false;
  }

  // Called once we know that the comment buffer is complete and is *not*
  // connected to the next token.  The first comment flushed while attaching
  // to the previous token is still possible becomes its trailing comment;
  // every other flushed comment is detached.
  void Flush() {
    if (has_comment_) {
      if (can_attach_to_prev_) {
        if (prev_trailing_comments_ != nullptr) {
          prev_trailing_comments_->append(comment_buffer_);
        }
        has_trailing_comment_ = true;
        can_attach_to_prev_ = false;
      } else {
        if (detached_comments_ != nullptr) {
          detached_comments_->push_back(comment_buffer_);
        }
      }
      ClearBuffer();
      num_comments_++;
    }
  }

  // Rules out attaching any later comment to the previous token.
  void DetachFromPrev() { can_attach_to_prev_ = false; }

  // If exactly one comment has been seen so far (flushed or still buffered),
  // forces it to be a detached comment rather than a trailing or leading one.
  void MaybeDetachComment() {
    int count = num_comments_;
    if (has_comment_) count++;

    // If there's one comment, make sure it is detached.
    if (count == 1) {
      if (has_trailing_comment_ && prev_trailing_comments_ != nullptr) {
        if (detached_comments_ != nullptr) {
          // push trailing comment to front of detached
          detached_comments_->insert(detached_comments_->begin(),
                                     *prev_trailing_comments_);
        }
        prev_trailing_comments_->clear();
      }
      // flush pending comment so it's detached instead of leading
      Flush();
    }
  }

 private:
  std::string* prev_trailing_comments_;
  std::vector<std::string>* detached_comments_;
  std::string* next_leading_comments_;

  // The comment currently being accumulated.
  std::string comment_buffer_;
  // Number of comments flushed so far.
  int num_comments_;
  // True once a comment has been flushed as the previous token's trailing
  // comment.
  bool has_trailing_comment_;

  // True if any comments were read into comment_buffer_.  This can be true even
  // if comment_buffer_ is empty, namely if the comment was "/**/".
  bool has_comment_;

  // Is the comment in the comment buffer a line comment?
  bool is_line_comment_;

  // Is it still possible that we could be reading a comment attached to the
  // previous token?
  bool can_attach_to_prev_;
};
833 
834 }  // namespace
835 
NextWithComments(std::string * prev_trailing_comments,std::vector<std::string> * detached_comments,std::string * next_leading_comments)836 bool Tokenizer::NextWithComments(std::string* prev_trailing_comments,
837                                  std::vector<std::string>* detached_comments,
838                                  std::string* next_leading_comments) {
839   CommentCollector collector(prev_trailing_comments, detached_comments,
840                              next_leading_comments);
841 
842   int prev_line = line_;
843   int trailing_comment_end_line = -1;
844 
845   if (current_.type == TYPE_START) {
846     // Ignore unicode byte order mark(BOM) if it appears at the file
847     // beginning. Only UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
848     if (TryConsume(static_cast<char>(0xEF))) {
849       if (!TryConsume(static_cast<char>(0xBB)) ||
850           !TryConsume(static_cast<char>(0xBF))) {
851         AddError(
852             "Proto file starts with 0xEF but not UTF-8 BOM. "
853             "Only UTF-8 is accepted for proto file.");
854         return false;
855       }
856     }
857     collector.DetachFromPrev();
858     prev_line = -1;
859   } else {
860     // A comment appearing on the same line must be attached to the previous
861     // declaration.
862     ConsumeZeroOrMore<WhitespaceNoNewline>();
863     switch (TryConsumeCommentStart()) {
864       case LINE_COMMENT:
865         trailing_comment_end_line = line_;
866         ConsumeLineComment(collector.GetBufferForLineComment());
867 
868         // Don't allow comments on subsequent lines to be attached to a trailing
869         // comment.
870         collector.Flush();
871         break;
872       case BLOCK_COMMENT:
873         ConsumeBlockComment(collector.GetBufferForBlockComment());
874         trailing_comment_end_line = line_;
875         ConsumeZeroOrMore<WhitespaceNoNewline>();
876 
877         // Don't allow comments on subsequent lines to be attached to a trailing
878         // comment.
879         collector.Flush();
880         break;
881       case SLASH_NOT_COMMENT:
882         return true;
883       case NO_COMMENT:
884         if (!TryConsume('\n')) {
885           // The next token is on the same line.  There are no comments.
886           return Next();
887         }
888         break;
889     }
890   }
891 
892   // OK, we are now on the line *after* the previous token.
893   while (true) {
894     ConsumeZeroOrMore<WhitespaceNoNewline>();
895 
896     switch (TryConsumeCommentStart()) {
897       case LINE_COMMENT:
898         ConsumeLineComment(collector.GetBufferForLineComment());
899         break;
900       case BLOCK_COMMENT:
901         ConsumeBlockComment(collector.GetBufferForBlockComment());
902 
903         // Consume the rest of the line so that we don't interpret it as a
904         // blank line the next time around the loop.
905         ConsumeZeroOrMore<WhitespaceNoNewline>();
906         TryConsume('\n');
907         break;
908       case SLASH_NOT_COMMENT:
909         return true;
910       case NO_COMMENT:
911         if (TryConsume('\n')) {
912           // Completely blank line.
913           collector.Flush();
914           collector.DetachFromPrev();
915         } else {
916           bool result = Next();
917           if (!result || current_.text == "}" || current_.text == "]" ||
918               current_.text == ")") {
919             // It looks like we're at the end of a scope.  In this case it
920             // makes no sense to attach a comment to the following token.
921             collector.Flush();
922           }
923           if (result &&
924               (prev_line == line_ || trailing_comment_end_line == line_)) {
925             // When previous token and this one are on the same line, or
926             // even if a multi-line trailing comment ends on the same line
927             // as this token, it's unclear to what token the comment
928             // should be attached. So we detach it.
929             collector.MaybeDetachComment();
930           }
931           return result;
932         }
933         break;
934     }
935   }
936 }
937 
938 // -------------------------------------------------------------------
939 // Token-parsing helpers.  Remember that these don't need to report
940 // errors since any errors should already have been reported while
941 // tokenizing.  Also, these can assume that whatever text they
942 // are given is text that the tokenizer actually parsed as a token
943 // of the given type.
944 
ParseInteger(const std::string & text,uint64_t max_value,uint64_t * output)945 bool Tokenizer::ParseInteger(const std::string& text, uint64_t max_value,
946                              uint64_t* output) {
947   // We can't just use strtoull() because (a) it accepts negative numbers,
948   // (b) We want additional range checks, (c) it reports overflows via errno.
949 
950 #if 0
951   const char *str_begin = text.c_str();
952   if (*str_begin == '-') return false;
953   char *str_end = nullptr;
954   errno = 0;
955   *output = std::strtoull(str_begin, &str_end, 0);
956   return (errno == 0 && str_end && *str_end == '\0' && *output <= max_value);
957 #endif
958 
959   const char* ptr = text.c_str();
960   int base = 10;
961   uint64_t overflow_if_mul_base = (kuint64max / 10) + 1;
962   if (ptr[0] == '0') {
963     if (ptr[1] == 'x' || ptr[1] == 'X') {
964       // This is hex.
965       base = 16;
966       overflow_if_mul_base = (kuint64max / 16) + 1;
967       ptr += 2;
968     } else {
969       // This is octal.
970       base = 8;
971       overflow_if_mul_base = (kuint64max / 8) + 1;
972     }
973   }
974 
975   uint64_t result = 0;
976   // For all the leading '0's, and also the first non-zero character, we
977   // don't need to multiply.
978   while (*ptr != '\0') {
979     int digit = DigitValue(*ptr++);
980     if (digit >= base) {
981       // The token provided by Tokenizer is invalid. i.e., 099 is an invalid
982       // token, but Tokenizer still think it's integer.
983       return false;
984     }
985     if (digit != 0) {
986       result = digit;
987       break;
988     }
989   }
990   for (; *ptr != '\0'; ptr++) {
991     int digit = DigitValue(*ptr);
992     if (digit < 0 || digit >= base) {
993       // The token provided by Tokenizer is invalid. i.e., 099 is an invalid
994       // token, but Tokenizer still think it's integer.
995       return false;
996     }
997     if (result >= overflow_if_mul_base) {
998       // We know the multiply we're about to do will overflow, so exit now.
999       return false;
1000     }
1001     // We know that result * base won't overflow, but adding digit might...
1002     result = result * base + digit;
1003     // C++ guarantees defined "wrap" semantics when unsigned integer
1004     // operations overflow, making this a fast way to check if adding
1005     // digit made result overflow, and thus, wrap around.
1006     if (result < static_cast<uint64_t>(base)) return false;
1007   }
1008   if (result > max_value) return false;
1009 
1010   *output = result;
1011   return true;
1012 }
1013 
ParseFloat(const std::string & text)1014 double Tokenizer::ParseFloat(const std::string& text) {
1015   double result = 0;
1016   if (!TryParseFloat(text, &result)) {
1017     ABSL_DLOG(FATAL)
1018         << " Tokenizer::ParseFloat() passed text that could not have been"
1019            " tokenized as a float: "
1020         << absl::CEscape(text);
1021   }
1022   return result;
1023 }
1024 
TryParseFloat(const std::string & text,double * result)1025 bool Tokenizer::TryParseFloat(const std::string& text, double* result) {
1026   const char* start = text.c_str();
1027   char* end;
1028   *result = NoLocaleStrtod(start, &end);
1029 
1030   // "1e" is not a valid float, but if the tokenizer reads it, it will
1031   // report an error but still return it as a valid token.  We need to
1032   // accept anything the tokenizer could possibly return, error or not.
1033   if (*end == 'e' || *end == 'E') {
1034     ++end;
1035     if (*end == '-' || *end == '+') ++end;
1036   }
1037 
1038   // If the Tokenizer had allow_f_after_float_ enabled, the float may be
1039   // suffixed with the letter 'f'.
1040   if (*end == 'f' || *end == 'F') {
1041     ++end;
1042   }
1043 
1044   return static_cast<size_t>(end - start) == text.size() && *start != '-';
1045 }
1046 
1047 // Helper to append a Unicode code point to a string as UTF8, without bringing
1048 // in any external dependencies.
AppendUTF8(uint32_t code_point,std::string * output)1049 static void AppendUTF8(uint32_t code_point, std::string* output) {
1050   uint32_t tmp = 0;
1051   int len = 0;
1052   if (code_point <= 0x7f) {
1053     tmp = code_point;
1054     len = 1;
1055   } else if (code_point <= 0x07ff) {
1056     tmp = 0x0000c080 | ((code_point & 0x07c0) << 2) | (code_point & 0x003f);
1057     len = 2;
1058   } else if (code_point <= 0xffff) {
1059     tmp = 0x00e08080 | ((code_point & 0xf000) << 4) |
1060           ((code_point & 0x0fc0) << 2) | (code_point & 0x003f);
1061     len = 3;
1062   } else if (code_point <= 0x10ffff) {
1063     tmp = 0xf0808080 | ((code_point & 0x1c0000) << 6) |
1064           ((code_point & 0x03f000) << 4) | ((code_point & 0x000fc0) << 2) |
1065           (code_point & 0x003f);
1066     len = 4;
1067   } else {
1068     // Unicode code points end at 0x10FFFF, so this is out-of-range.
1069     // ConsumeString permits hex values up to 0x1FFFFF, and FetchUnicodePoint
1070     // doesn't perform a range check.
1071     absl::StrAppendFormat(output, "\\U%08x", code_point);
1072     return;
1073   }
1074   tmp = ghtonl(tmp);
1075   output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
1076 }
1077 
1078 // Try to read <len> hex digits from ptr, and stuff the numeric result into
1079 // *result. Returns true if that many digits were successfully consumed.
ReadHexDigits(const char * ptr,int len,uint32_t * result)1080 static bool ReadHexDigits(const char* ptr, int len, uint32_t* result) {
1081   *result = 0;
1082   if (len == 0) return false;
1083   for (const char* end = ptr + len; ptr < end; ++ptr) {
1084     if (*ptr == '\0') return false;
1085     *result = (*result << 4) + DigitValue(*ptr);
1086   }
1087   return true;
1088 }
1089 
// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a
// trail surrogate. These numbers are in a reserved range of Unicode code
// points, so if we encounter such a pair we know how to parse it and convert
// it into a single code point.
//
// Each range is half-open: the "Min" value is the first member, the "Max"
// value is one past the last member.
static constexpr uint32_t kMinHeadSurrogate = 0xd800;
static constexpr uint32_t kMaxHeadSurrogate = 0xdc00;
static constexpr uint32_t kMinTrailSurrogate = 0xdc00;
static constexpr uint32_t kMaxTrailSurrogate = 0xe000;
1099 
IsHeadSurrogate(uint32_t code_point)1100 static inline bool IsHeadSurrogate(uint32_t code_point) {
1101   return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
1102 }
1103 
IsTrailSurrogate(uint32_t code_point)1104 static inline bool IsTrailSurrogate(uint32_t code_point) {
1105   return (code_point >= kMinTrailSurrogate) &&
1106          (code_point < kMaxTrailSurrogate);
1107 }
1108 
1109 // Combine a head and trail surrogate into a single Unicode code point.
AssembleUTF16(uint32_t head_surrogate,uint32_t trail_surrogate)1110 static uint32_t AssembleUTF16(uint32_t head_surrogate,
1111                               uint32_t trail_surrogate) {
1112   ABSL_DCHECK(IsHeadSurrogate(head_surrogate));
1113   ABSL_DCHECK(IsTrailSurrogate(trail_surrogate));
1114   return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
1115                     (trail_surrogate - kMinTrailSurrogate));
1116 }
1117 
// Maps the escape letter to the number of hex digits expected after it:
// 4 for 'u', 8 for 'U', and 0 for anything else.
static inline int UnicodeLength(char key) {
  switch (key) {
    case 'u':
      return 4;
    case 'U':
      return 8;
    default:
      return 0;
  }
}
1124 
1125 // Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
1126 // to parse that sequence. On success, returns a pointer to the first char
1127 // beyond that sequence, and fills in *code_point. On failure, returns ptr
1128 // itself.
FetchUnicodePoint(const char * ptr,uint32_t * code_point)1129 static const char* FetchUnicodePoint(const char* ptr, uint32_t* code_point) {
1130   const char* p = ptr;
1131   // Fetch the code point.
1132   const int len = UnicodeLength(*p++);
1133   if (!ReadHexDigits(p, len, code_point)) return ptr;
1134   p += len;
1135 
1136   // Check if the code point we read is a "head surrogate." If so, then we
1137   // expect it to be immediately followed by another code point which is a valid
1138   // "trail surrogate," and together they form a UTF-16 pair which decodes into
1139   // a single Unicode point. Trail surrogates may only use \u, not \U.
1140   if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
1141     uint32_t trail_surrogate;
1142     if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
1143         IsTrailSurrogate(trail_surrogate)) {
1144       *code_point = AssembleUTF16(*code_point, trail_surrogate);
1145       p += 6;
1146     }
1147     // If this failed, then we just emit the head surrogate as a code point.
1148     // It's bogus, but so is the string.
1149   }
1150 
1151   return p;
1152 }
1153 
1154 // The text string must begin and end with single or double quote
1155 // characters.
ParseStringAppend(const std::string & text,std::string * output)1156 void Tokenizer::ParseStringAppend(const std::string& text,
1157                                   std::string* output) {
1158   // Reminder: text[0] is always a quote character.  (If text is
1159   // empty, it's invalid, so we'll just return).
1160   const size_t text_size = text.size();
1161   if (text_size == 0) {
1162     ABSL_DLOG(FATAL)
1163         << " Tokenizer::ParseStringAppend() passed text that could not"
1164            " have been tokenized as a string: "
1165         << absl::CEscape(text);
1166     return;
1167   }
1168 
1169   // Reserve room for new string. The branch is necessary because if
1170   // there is already space available the reserve() call might
1171   // downsize the output.
1172   const size_t new_len = text_size + output->size();
1173   if (new_len > output->capacity()) {
1174     output->reserve(new_len);
1175   }
1176 
1177   // Loop through the string copying characters to "output" and
1178   // interpreting escape sequences.  Note that any invalid escape
1179   // sequences or other errors were already reported while tokenizing.
1180   // In this case we do not need to produce valid results.
1181   for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
1182     if (*ptr == '\\' && ptr[1] != '\0') {
1183       // An escape sequence.
1184       ++ptr;
1185 
1186       if (OctalDigit::InClass(*ptr)) {
1187         // An octal escape.  May one, two, or three digits.
1188         int code = DigitValue(*ptr);
1189         if (OctalDigit::InClass(ptr[1])) {
1190           ++ptr;
1191           code = code * 8 + DigitValue(*ptr);
1192         }
1193         if (OctalDigit::InClass(ptr[1])) {
1194           ++ptr;
1195           code = code * 8 + DigitValue(*ptr);
1196         }
1197         output->push_back(static_cast<char>(code));
1198 
1199       } else if (*ptr == 'x' || *ptr == 'X') {
1200         // A hex escape.  May zero, one, or two digits.  (The zero case
1201         // will have been caught as an error earlier.)
1202         int code = 0;
1203         if (HexDigit::InClass(ptr[1])) {
1204           ++ptr;
1205           code = DigitValue(*ptr);
1206         }
1207         if (HexDigit::InClass(ptr[1])) {
1208           ++ptr;
1209           code = code * 16 + DigitValue(*ptr);
1210         }
1211         output->push_back(static_cast<char>(code));
1212 
1213       } else if (*ptr == 'u' || *ptr == 'U') {
1214         uint32_t unicode;
1215         const char* end = FetchUnicodePoint(ptr, &unicode);
1216         if (end == ptr) {
1217           // Failure: Just dump out what we saw, don't try to parse it.
1218           output->push_back(*ptr);
1219         } else {
1220           AppendUTF8(unicode, output);
1221           ptr = end - 1;  // Because we're about to ++ptr.
1222         }
1223       } else {
1224         // Some other escape code.
1225         output->push_back(TranslateEscape(*ptr));
1226       }
1227 
1228     } else if (*ptr == text[0] && ptr[1] == '\0') {
1229       // Ignore final quote matching the starting quote.
1230     } else {
1231       output->push_back(*ptr);
1232     }
1233   }
1234 }
1235 
// Returns true if CharacterClass::InClass() accepts every character of `s`
// (trivially true for an empty string).
template <typename CharacterClass>
static bool AllInClass(const std::string& s) {
  for (std::string::size_type i = 0; i < s.size(); ++i) {
    if (!CharacterClass::InClass(s[i])) return false;
  }
  return true;
}
1243 
IsIdentifier(const std::string & text)1244 bool Tokenizer::IsIdentifier(const std::string& text) {
1245   // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
1246   if (text.size() == 0) return false;
1247   if (!Letter::InClass(text.at(0))) return false;
1248   if (!AllInClass<Alphanumeric>(text.substr(1))) return false;
1249   return true;
1250 }
1251 
1252 }  // namespace io
1253 }  // namespace protobuf
1254 }  // namespace google
1255 
1256 #include "google/protobuf/port_undef.inc"
1257