// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC.  All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd

#include "upb/io/tokenizer.h"

#include "upb/io/string.h"
#include "upb/lex/strtod.h"
#include "upb/lex/unicode.h"

// Must be included last.
#include "upb/port/def.inc"

typedef enum {
  // Started a line comment.
  kUpb_CommentType_Line,

  // Started a block comment.
  kUpb_CommentType_Block,

  // Consumed a slash, then realized it wasn't a comment.  The current token
  // has been filled in with a slash symbol.  The caller should return it.
  kUpb_CommentType_SlashNot,

  // We do not appear to be starting a comment here.
  kUpb_CommentType_None,
} upb_CommentType;

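// Returns true for control characters other than NUL (bytes 0x01 through
// 0x1F).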
static bool upb_Tokenizer_IsUnprintable(char c) { return '\0' < c && c < ' '; }

// Since we count columns we need to interpret tabs somehow.  We'll take
// the standard 8-character definition for lack of any way to do better.
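// For example, a tab read at column 3 advances the column to 8, and a tab
// read at column 8 advances it to 16 (see NextChar()).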
static const int kUpb_Tokenizer_TabWidth = 8;

// Given a char, interpret it as a numeric digit and return its value.
// This supports any number base up to 36.
// Uses 36 to indicate an invalid character since we support
// bases up to 36.
static const int8_t kUpb_Tokenizer_AsciiToInt[256] = {
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 00-0F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 10-1F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // ' '-'/'
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,                           // '0'-'9'
    36, 36, 36, 36, 36, 36, 36,                                      // ':'-'@'
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  // 'A'-'P'
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35,                          // 'Q'-'Z'
    36, 36, 36, 36, 36, 36,                                          // '['-'`'
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  // 'a'-'p'
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35,                          // 'q'-'z'
    36, 36, 36, 36, 36,                                              // '{'-DEL
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 80-8F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 90-9F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // A0-AF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // B0-BF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // C0-CF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // D0-DF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // E0-EF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // F0-FF
};

static int DigitValue(char digit) {
  return kUpb_Tokenizer_AsciiToInt[digit & 0xFF];
}

static bool upb_Tokenizer_IsLetter(char c) {
  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || (c == '_');
}

static bool upb_Tokenizer_IsDigit(char c) { return '0' <= c && c <= '9'; }

static bool upb_Tokenizer_IsOctalDigit(char c) { return '0' <= c && c <= '7'; }

static bool upb_Tokenizer_IsHexDigit(char c) {
  return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
         ('A' <= c && c <= 'F');
}

static bool upb_Tokenizer_IsAlphanumeric(char c) {
  return upb_Tokenizer_IsLetter(c) || upb_Tokenizer_IsDigit(c);
}

static bool upb_Tokenizer_IsWhitespaceNoNewline(char c) {
  return c == ' ' || c == '\t' || c == '\r' || c == '\v' || c == '\f';
}

static bool upb_Tokenizer_IsWhitespace(char c) {
  return c == '\n' || upb_Tokenizer_IsWhitespaceNoNewline(c);
}

static bool upb_Tokenizer_IsEscape(char c) {
  return c == 'a' || c == 'b' || c == 'f' || c == 'n' || c == 'r' || c == 't' ||
         c == 'v' || c == '\\' || c == '?' || c == '\'' || c == '\"';
}

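// Translates a single-character escape (one accepted by
// upb_Tokenizer_IsEscape) to the character it denotes.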
static char TranslateEscape(char c) {
  switch (c) {
    case 'a':
      return '\a';
    case 'b':
      return '\b';
    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case 'v':
      return '\v';
    case '\\':
      return '\\';
    case '?':
      return '\?';  // Trigraphs = :(
    case '\'':
      return '\'';
    case '"':
      return '\"';

    // We expect escape sequences to have been validated separately.
    default:
      return '?';
  }
}

// ===================================================================

struct upb_Tokenizer {
  upb_TokenType token_type;  // The type of the current token.

  // The exact text of the current token as it appeared in the input.
  // e.g. tokens of type kUpb_TokenType_String will still be escaped and
  // in quotes.
  upb_String token_text;

  // "line" and "column" specify the position of the first character of
  // the token within the input stream. They are zero-based.
  int token_line;
  int token_column;
  int token_end_column;

  upb_ZeroCopyInputStream* input;
  upb_Arena* arena;
  upb_Status* status;

  char current_char;   // == buffer[buffer_pos], updated by NextChar().
  const char* buffer;  // Current buffer returned from the input stream.
  size_t buffer_size;  // Size of buffer.
  size_t buffer_pos;   // Current position within the buffer.
  bool read_error;     // Did we previously encounter a read error?

  // Line and column number of current_char within the whole input stream.
  int line;

  // By "column number", the proto compiler refers to a count of the number
  // of bytes before a given byte, except that a tab character advances to
  // the next multiple of 8 bytes.  Note in particular that column numbers
  // are zero-based, while many user interfaces use one-based column numbers.
  int column;

  // Cached values from before the most recent call to upb_Tokenizer_Next().
  upb_TokenType previous_type;
  int previous_line;
  int previous_column;
  int previous_end_column;

  // String to which text should be appended as we advance through it.
  // Call RecordTo(&str) to start recording and StopRecording() to stop.
  // E.g. StartToken() calls RecordTo(&t->token_text).  record_start is the
  // position within the current buffer where recording started.
  upb_String* record_target;
  int record_start;
  int options;
  jmp_buf err;
};

// Convenience methods to return an error at the current line and column.

UPB_NORETURN static void ReportError(upb_Tokenizer* t, const char* msg) {
  upb_Status_SetErrorFormat(t->status, "%d:%d: %s", t->line, t->column, msg);
  UPB_LONGJMP(t->err, 1);
}

UPB_NORETURN UPB_PRINTF(2, 3) static void ReportErrorFormat(upb_Tokenizer* t,
                                                            const char* fmt,
                                                            ...) {
  va_list args;
  va_start(args, fmt);
  char msg[128];
  vsnprintf(msg, sizeof(msg), fmt, args);
  ReportError(t, msg);
}

// Read a new buffer from the input.
static void Refresh(upb_Tokenizer* t) {
  if (t->read_error) {
    t->current_char = '\0';
    return;
  }

  // If we're in a token, append the rest of the buffer to it.
  if (t->record_target != NULL && t->record_start < t->buffer_size) {
    upb_String_Append(t->record_target, t->buffer + t->record_start,
                      t->buffer_size - t->record_start);
    t->record_start = 0;
  }

  t->buffer = NULL;
  t->buffer_pos = 0;

  upb_Status status;
  const void* data =
      upb_ZeroCopyInputStream_Next(t->input, &t->buffer_size, &status);

  if (t->buffer_size > 0) {
    t->buffer = data;
    t->current_char = t->buffer[0];
  } else {
    // end of stream (or read error)
    t->buffer_size = 0;
    t->read_error = true;
    t->current_char = '\0';
  }
}

// Consume this character and advance to the next one.
static void NextChar(upb_Tokenizer* t) {
  // Update our line and column counters based on the character being
  // consumed.
  if (t->current_char == '\n') {
    t->line++;
    t->column = 0;
  } else if (t->current_char == '\t') {
    t->column += kUpb_Tokenizer_TabWidth - t->column % kUpb_Tokenizer_TabWidth;
  } else {
    t->column++;
  }

  // Advance to the next character.
  t->buffer_pos++;
  if (t->buffer_pos < t->buffer_size) {
    t->current_char = t->buffer[t->buffer_pos];
  } else {
    Refresh(t);
  }
}

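// Start copying consumed characters into `target`, beginning at the current
// position in the input buffer.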
static void RecordTo(upb_Tokenizer* t, upb_String* target) {
  t->record_target = target;
  t->record_start = t->buffer_pos;
}

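// Flush any bytes not yet copied to the record target and stop recording.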
static void StopRecording(upb_Tokenizer* t) {
  if (t->buffer_pos > t->record_start) {
    upb_String_Append(t->record_target, t->buffer + t->record_start,
                      t->buffer_pos - t->record_start);
  }
  t->record_target = NULL;
  t->record_start = -1;
}

// Called when the current character is the first character of a new
// token (not including whitespace or comments).
static void StartToken(upb_Tokenizer* t) {
  t->token_type = kUpb_TokenType_Start;
  upb_String_Clear(&t->token_text);
  t->token_line = t->line;
  t->token_column = t->column;
  RecordTo(t, &t->token_text);
}

// Called when the current character is the first character after the
// end of the last token.  After this returns, token_text will
// contain all text consumed since StartToken() was called.
static void EndToken(upb_Tokenizer* t) {
  StopRecording(t);
  t->token_end_column = t->column;
}

// -----------------------------------------------------------------
// These helper methods make the parsing code more readable.
// The "character classes" referred to are the predicate functions defined at
// the top of the file.  Each returns true if its argument is a member of that
// "class", like "Letter" or "Digit".

// Returns true if the current character is of the given character
// class, but does not consume anything.
static bool LookingAt(const upb_Tokenizer* t, bool (*f)(char)) {
  return f(t->current_char);
}

// If the current character is in the given class, consume it and return true.
// Otherwise return false.
static bool TryConsumeOne(upb_Tokenizer* t, bool (*f)(char)) {
  if (f(t->current_char)) {
    NextChar(t);
    return true;
  } else {
    return false;
  }
}

// Like above, but try to consume the specific character indicated.
static bool TryConsume(upb_Tokenizer* t, char c) {
  if (t->current_char == c) {
    NextChar(t);
    return true;
  } else {
    return false;
  }
}

// Consume zero or more of the given character class.
static void ConsumeZeroOrMore(upb_Tokenizer* t, bool (*f)(char)) {
  while (f(t->current_char)) {
    NextChar(t);
  }
}

// Consume one or more of the given character class or report the given
// error message.
static void ConsumeOneOrMore(upb_Tokenizer* t, bool (*f)(char),
                             const char* err_msg) {
  if (!f(t->current_char)) {
    ReportError(t, err_msg);
  }

  do {
    NextChar(t);
  } while (f(t->current_char));
}

// -----------------------------------------------------------------
// The following four methods are used to consume tokens of specific
// types.  They are actually used to consume all characters *after*
// the first, since the calling function consumes the first character
// in order to decide what kind of token is being read.

// Read and consume a string, ending when the given delimiter is consumed.
static void ConsumeString(upb_Tokenizer* t, char delimiter) {
  while (true) {
    switch (t->current_char) {
      case '\0':
        ReportError(t, "Unexpected end of string.");

      case '\n':
        ReportError(t, "String literals cannot cross line boundaries.");

      case '\\': {
        // An escape sequence.
        NextChar(t);
        if (TryConsumeOne(t, upb_Tokenizer_IsEscape)) {
          // Valid escape sequence.
        } else if (TryConsumeOne(t, upb_Tokenizer_IsOctalDigit)) {
          // Possibly followed by two more octal digits, but these will
          // just be consumed by the main loop anyway so we don't need
          // to do so explicitly here.
        } else if (TryConsume(t, 'x')) {
          if (!TryConsumeOne(t, upb_Tokenizer_IsHexDigit)) {
            ReportError(t, "Expected hex digits for escape sequence.");
          }
          // Possibly followed by another hex digit, but again we don't care.
        } else if (TryConsume(t, 'u')) {
          if (!TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
              !TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
              !TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
              !TryConsumeOne(t, upb_Tokenizer_IsHexDigit)) {
            ReportError(t, "Expected four hex digits for \\u escape sequence.");
          }
        } else if (TryConsume(t, 'U')) {
          // We expect 8 hex digits; but only the range up to 0x10ffff is
          // legal.
          if (!TryConsume(t, '0') || !TryConsume(t, '0') ||
              !(TryConsume(t, '0') || TryConsume(t, '1')) ||
              !TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
              !TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
              !TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
              !TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
              !TryConsumeOne(t, upb_Tokenizer_IsHexDigit)) {
            ReportError(t,
                        "Expected eight hex digits up to 10ffff for \\U escape "
                        "sequence");
          }
        } else {
          ReportError(t, "Invalid escape sequence in string literal.");
        }
        break;
      }

      default: {
        if (t->current_char == delimiter) {
          NextChar(t);
          return;
        }
        NextChar(t);
        break;
      }
    }
  }
}

// Read and consume a number, returning kUpb_TokenType_Float or
// kUpb_TokenType_Integer depending on what was read.  This needs to know if
// the first character was a zero in order to correctly recognize hex and
// octal numbers.  It also needs to know whether the first character was a '.'
// to parse floating point correctly.
static upb_TokenType ConsumeNumber(upb_Tokenizer* t, bool started_with_zero,
                                   bool started_with_dot) {
  bool is_float = false;

  if (started_with_zero && (TryConsume(t, 'x') || TryConsume(t, 'X'))) {
    // A hex number (started with "0x").
    ConsumeOneOrMore(t, upb_Tokenizer_IsHexDigit,
                     "\"0x\" must be followed by hex digits.");

  } else if (started_with_zero && LookingAt(t, upb_Tokenizer_IsDigit)) {
    // An octal number (had a leading zero).
    ConsumeZeroOrMore(t, upb_Tokenizer_IsOctalDigit);
    if (LookingAt(t, upb_Tokenizer_IsDigit)) {
      ReportError(t, "Numbers starting with leading zero must be in octal.");
    }

  } else {
    // A decimal number.
    if (started_with_dot) {
      is_float = true;
      ConsumeZeroOrMore(t, upb_Tokenizer_IsDigit);
    } else {
      ConsumeZeroOrMore(t, upb_Tokenizer_IsDigit);

      if (TryConsume(t, '.')) {
        is_float = true;
        ConsumeZeroOrMore(t, upb_Tokenizer_IsDigit);
      }
    }

    if (TryConsume(t, 'e') || TryConsume(t, 'E')) {
      is_float = true;
      if (!TryConsume(t, '-')) TryConsume(t, '+');
      ConsumeOneOrMore(t, upb_Tokenizer_IsDigit,
                       "\"e\" must be followed by exponent.");
    }

    if (t->options & kUpb_TokenizerOption_AllowFAfterFloat) {
      if (TryConsume(t, 'f') || TryConsume(t, 'F')) is_float = true;
    }
  }

  if (LookingAt(t, upb_Tokenizer_IsLetter)) {
    ReportError(t, "Need space between number and identifier.");
  }

  if (t->current_char == '.') {
    if (is_float) {
      ReportError(
          t, "Already saw decimal point or exponent; can't have another one.");
    } else {
      ReportError(t, "Hex and octal numbers must be integers.");
    }
  }

  return is_float ? kUpb_TokenType_Float : kUpb_TokenType_Integer;
}

// Consume the rest of a line.
static void ConsumeLineComment(upb_Tokenizer* t, upb_String* content) {
  if (content != NULL) RecordTo(t, content);

  while (t->current_char != '\0' && t->current_char != '\n') {
    NextChar(t);
  }
  TryConsume(t, '\n');

  if (content != NULL) StopRecording(t);
}

static void ConsumeBlockComment(upb_Tokenizer* t, upb_String* content) {
  const int start_line = t->line;
  const int start_column = t->column - 2;

  if (content != NULL) RecordTo(t, content);

  while (true) {
    while (t->current_char != '\0' && t->current_char != '*' &&
           t->current_char != '/' && t->current_char != '\n') {
      NextChar(t);
    }

    if (TryConsume(t, '\n')) {
      if (content != NULL) StopRecording(t);

      // Consume leading whitespace and asterisk.
      ConsumeZeroOrMore(t, upb_Tokenizer_IsWhitespaceNoNewline);
      if (TryConsume(t, '*')) {
        if (TryConsume(t, '/')) {
          // End of comment.
          break;
        }
      }

      if (content != NULL) RecordTo(t, content);
    } else if (TryConsume(t, '*') && TryConsume(t, '/')) {
      // End of comment.
      if (content != NULL) {
        StopRecording(t);
        // Strip trailing "*/".
        upb_String_Erase(content, upb_String_Size(content) - 2, 2);
      }
      break;
    } else if (TryConsume(t, '/') && t->current_char == '*') {
      // Note:  We didn't consume the '*' because if there is a '/' after it
      //   we want to interpret that as the end of the comment.
      ReportError(
          t, "\"/*\" inside block comment.  Block comments cannot be nested.");
    } else if (t->current_char == '\0') {
      ReportErrorFormat(
          t, "End-of-file inside block comment.\n%d:%d: Comment started here.",
          start_line, start_column);
    }
  }
}

// If we're at the start of a new comment, consume it and return what kind
// of comment it is.
static upb_CommentType TryConsumeCommentStart(upb_Tokenizer* t) {
  const bool style_sh = t->options & kUpb_TokenizerOption_CommentStyleShell;
  const bool style_cpp = !style_sh;

  if (style_cpp && TryConsume(t, '/')) {
    if (TryConsume(t, '/')) {
      return kUpb_CommentType_Line;
    } else if (TryConsume(t, '*')) {
      return kUpb_CommentType_Block;
    } else {
      // Oops, it was just a slash.  Return it.
      t->token_type = kUpb_TokenType_Symbol;
      upb_String_Assign(&t->token_text, "/", 1);
      t->token_line = t->line;
      t->token_column = t->column - 1;
      t->token_end_column = t->column;
      return kUpb_CommentType_SlashNot;
    }
  } else if (style_sh && TryConsume(t, '#')) {
    return kUpb_CommentType_Line;
  } else {
    return kUpb_CommentType_None;
  }
}

// Consumes any whitespace at the current position (excluding newlines when
// newline reporting is enabled).  Returns true if the whitespace should be
// reported to the caller as a kUpb_TokenType_Whitespace token.
static bool TryConsumeWhitespace(upb_Tokenizer* t) {
  if (t->options & kUpb_TokenizerOption_ReportNewlines) {
    if (TryConsumeOne(t, upb_Tokenizer_IsWhitespaceNoNewline)) {
      ConsumeZeroOrMore(t, upb_Tokenizer_IsWhitespaceNoNewline);
      t->token_type = kUpb_TokenType_Whitespace;
      return true;
    }
    return false;
  }
  if (TryConsumeOne(t, upb_Tokenizer_IsWhitespace)) {
    ConsumeZeroOrMore(t, upb_Tokenizer_IsWhitespace);
    t->token_type = kUpb_TokenType_Whitespace;
    return (t->options & kUpb_TokenizerOption_ReportWhitespace) != 0;
  }
  return false;
}

// If a newline is present and the kUpb_TokenizerOption_ReportNewlines option
// is set, consume it and return true.
static bool TryConsumeNewline(upb_Tokenizer* t) {
  if (t->options & kUpb_TokenizerOption_ReportNewlines) {
    if (TryConsume(t, '\n')) {
      t->token_type = kUpb_TokenType_Newline;
      return true;
    }
  }
  return false;
}

// -------------------------------------------------------------------

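// Accessors for the position, text, and type of the current token.
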
int upb_Tokenizer_Column(const upb_Tokenizer* t) { return t->token_column; }

int upb_Tokenizer_EndColumn(const upb_Tokenizer* t) {
  return t->token_end_column;
}

int upb_Tokenizer_Line(const upb_Tokenizer* t) { return t->token_line; }

int upb_Tokenizer_TextSize(const upb_Tokenizer* t) {
  return t->token_text.size_;
}

const char* upb_Tokenizer_TextData(const upb_Tokenizer* t) {
  return t->token_text.data_;
}

upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t) {
  return t->token_type;
}

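// Advances to the next token.  Returns true if a token was read.  Returns
// false at end of input (with `status` cleared) or on a tokenization error
// (with `status` set to describe the error).
//
// A minimal usage sketch (illustrative only, error handling omitted):
//
//   upb_Status status;
//   upb_Tokenizer* t = upb_Tokenizer_New(buf, len, stream, 0, arena);
//   while (upb_Tokenizer_Next(t, &status)) {
//     // Examine upb_Tokenizer_Type(t), upb_Tokenizer_TextData(t), etc.
//   }
//   upb_Tokenizer_Fini(t);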
bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status) {
  t->status = status;
  t->previous_type = t->token_type;
  t->previous_line = t->token_line;
  t->previous_column = t->token_column;
  t->previous_end_column = t->token_end_column;

  if (UPB_SETJMP(t->err)) return false;

  while (!t->read_error) {
    StartToken(t);
    bool report_token = TryConsumeWhitespace(t) || TryConsumeNewline(t);
    EndToken(t);
    if (report_token) return true;

    switch (TryConsumeCommentStart(t)) {
      case kUpb_CommentType_Line:
        ConsumeLineComment(t, NULL);
        continue;
      case kUpb_CommentType_Block:
        ConsumeBlockComment(t, NULL);
        continue;
      case kUpb_CommentType_SlashNot:
        return true;
      case kUpb_CommentType_None:
        break;
    }

    // Check for EOF before continuing.
    if (t->read_error) break;

    if (LookingAt(t, upb_Tokenizer_IsUnprintable) || t->current_char == '\0') {
      ReportError(t, "Invalid control characters encountered in text.");
    }

    // Reading some sort of token.
    StartToken(t);

    if (TryConsumeOne(t, upb_Tokenizer_IsLetter)) {
      ConsumeZeroOrMore(t, upb_Tokenizer_IsAlphanumeric);
      t->token_type = kUpb_TokenType_Identifier;
    } else if (TryConsume(t, '0')) {
      t->token_type = ConsumeNumber(t, true, false);
    } else if (TryConsume(t, '.')) {
      // This could be the beginning of a floating-point number, or it could
      // just be a '.' symbol.

      if (TryConsumeOne(t, upb_Tokenizer_IsDigit)) {
        // It's a floating-point number.
        if (t->previous_type == kUpb_TokenType_Identifier &&
            t->token_line == t->previous_line &&
            t->token_column == t->previous_end_column) {
          // We don't accept syntax like "blah.123".
          t->column -= 2;
          ReportError(t, "Need space between identifier and decimal point.");
        }
        t->token_type = ConsumeNumber(t, false, true);
      } else {
        t->token_type = kUpb_TokenType_Symbol;
      }
    } else if (TryConsumeOne(t, upb_Tokenizer_IsDigit)) {
      t->token_type = ConsumeNumber(t, false, false);
    } else if (TryConsume(t, '\"')) {
      ConsumeString(t, '\"');
      t->token_type = kUpb_TokenType_String;
    } else if (TryConsume(t, '\'')) {
      ConsumeString(t, '\'');
      t->token_type = kUpb_TokenType_String;
    } else {
      // Check if the high order bit is set.
      if (t->current_char & 0x80) {
        ReportErrorFormat(t, "Interpreting non ascii codepoint %d.",
                          (uint8_t)t->current_char);
      }
      NextChar(t);
      t->token_type = kUpb_TokenType_Symbol;
    }

    EndToken(t);
    return true;
  }

  // EOF
  t->token_type = kUpb_TokenType_End;
  upb_String_Clear(&t->token_text);
  t->token_line = t->line;
  t->token_column = t->column;
  t->token_end_column = t->column;
  upb_Status_Clear(status);
  return false;
}

// -------------------------------------------------------------------
// Token-parsing helpers.  Remember that these don't need to report
// errors since any errors should already have been reported while
// tokenizing.  Also, these can assume that whatever text they
// are given is text that the tokenizer actually parsed as a token
// of the given type.

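// Parses an integer token.  Accepts decimal, hex ("0x" prefix), and octal
// (leading zero) forms.  Returns false if the token is malformed or if the
// value would exceed max_value.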
bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output) {
  // We can't just use strtoull() because (a) it accepts negative numbers,
  // (b) we want additional range checks, and (c) it reports overflows via
  // errno.

  const char* ptr = text;
  int base = 10;
  uint64_t overflow_if_mul_base = (UINT64_MAX / 10) + 1;
  if (ptr[0] == '0') {
    if (ptr[1] == 'x' || ptr[1] == 'X') {
      // This is hex.
      base = 16;
      overflow_if_mul_base = (UINT64_MAX / 16) + 1;
      ptr += 2;
    } else {
      // This is octal.
      base = 8;
      overflow_if_mul_base = (UINT64_MAX / 8) + 1;
    }
  }

  uint64_t result = 0;
  // For all the leading '0's, and also the first non-zero character, we
  // don't need to multiply.
  while (*ptr != '\0') {
    int digit = DigitValue(*ptr++);
    if (digit >= base) {
      // The token provided by the tokenizer is invalid, e.g. 099 is an
      // invalid token, but the tokenizer still thinks it is an integer.
      return false;
    }
    if (digit != 0) {
      result = digit;
      break;
    }
  }
  for (; *ptr != '\0'; ptr++) {
    int digit = DigitValue(*ptr);
    if (digit < 0 || digit >= base) {
      // The token provided by the tokenizer is invalid, e.g. 099 is an
      // invalid token, but the tokenizer still thinks it is an integer.
      return false;
    }
    if (result >= overflow_if_mul_base) {
      // We know the multiply we're about to do will overflow, so exit now.
      return false;
    }
    // We know that result * base won't overflow, but adding digit might...
    result = result * base + digit;
    // C guarantees defined "wrap" semantics when unsigned integer
    // operations overflow, making this a fast way to check if adding
    // digit made result overflow, and thus, wrap around.
    if (result < (uint64_t)base) return false;
  }
  if (result > max_value) return false;

  *output = result;
  return true;
}

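// Parses a float token using a locale-independent strtod.  The text must be
// something the tokenizer could have produced as a float or integer token;
// anything else trips the assertion below.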
double upb_Parse_Float(const char* text) {
  char* end;
  double result = _upb_NoLocaleStrtod(text, &end);

  // "1e" is not a valid float, but if the tokenizer reads it, it will
  // report an error but still return it as a valid token.  We need to
  // accept anything the tokenizer could possibly return, error or not.
  if (*end == 'e' || *end == 'E') {
    ++end;
    if (*end == '-' || *end == '+') ++end;
  }

  // If the tokenizer was created with kUpb_TokenizerOption_AllowFAfterFloat,
  // the float may be suffixed with the letter 'f'.
  if (*end == 'f' || *end == 'F') {
    ++end;
  }

  if ((end - text) != strlen(text) || *text == '-') {
    fprintf(stderr,
            "upb_Parse_Float() passed text that could not have"
            " been tokenized as a float: %s\n",
            text);
    UPB_ASSERT(0);
  }
  return result;
}

// Append a Unicode code point to a string as UTF8.
static void AppendUTF8(uint32_t code_point, upb_String* output) {
  char temp[24];
  int len = upb_Unicode_ToUTF8(code_point, temp);
  if (len == 0) {
    // ConsumeString permits hex values up to 0x1FFFFF,
    // and FetchUnicodePoint doesn't perform a range check.
    // Unicode code points end at 0x10FFFF, so this is out-of-range.
    len = snprintf(temp, sizeof temp, "\\U%08x", code_point);
  }
  upb_String_Append(output, temp, len);
}

// Try to read <len> hex digits from ptr, and stuff the numeric result into
// *result. Returns true if that many digits were successfully consumed.
static bool ReadHexDigits(const char* ptr, int len, uint32_t* result) {
  *result = 0;
  if (len == 0) return false;
  for (const char* end = ptr + len; ptr < end; ++ptr) {
    if (*ptr == '\0') return false;
    *result = (*result << 4) + DigitValue(*ptr);
  }
  return true;
}

// Convert the escape sequence parameter to a number of expected hex digits.
static int UnicodeLength(char key) {
  if (key == 'u') return 4;
  if (key == 'U') return 8;
  return 0;
}

// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
// to parse that sequence. On success, returns a pointer to the first char
// beyond that sequence, and fills in *code_point. On failure, returns ptr
// itself.
static const char* FetchUnicodePoint(const char* ptr, uint32_t* code_point) {
  const char* p = ptr;
  // Fetch the code point.
  const int len = UnicodeLength(*p++);
  if (!ReadHexDigits(p, len, code_point)) return ptr;
  p += len;

  // Check if the code point we read is a "head surrogate." If so, then we
  // expect it to be immediately followed by another code point which is a valid
  // "trail surrogate," and together they form a UTF-16 pair which decodes into
  // a single Unicode point. Trail surrogates may only use \u, not \U.
  if (upb_Unicode_IsHigh(*code_point) && *p == '\\' && *(p + 1) == 'u') {
    uint32_t trail_surrogate;
    if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
        upb_Unicode_IsLow(trail_surrogate)) {
      *code_point = upb_Unicode_FromPair(*code_point, trail_surrogate);
      p += 6;
    }
    // If this failed, then we just emit the head surrogate as a code point.
    // It's bogus, but so is the string.
  }

  return p;
}

// The text string must begin and end with single or double quote characters.
upb_StringView upb_Parse_String(const char* text, upb_Arena* arena) {
  const size_t size = strlen(text);

  upb_String output;
  upb_String_Init(&output, arena);

  // Reminder: text[0] is always a quote character.
  // (If text is empty, it's invalid, so we'll just return).
  if (size == 0) {
    fprintf(stderr,
            "upb_Parse_String() passed text that could not"
            " have been tokenized as a string: %s",
            text);
    UPB_ASSERT(0);
    return upb_StringView_FromDataAndSize(NULL, 0);
  }

  // Reserve room for new string.
  const size_t new_len = size + upb_String_Size(&output);
  upb_String_Reserve(&output, new_len);

  // Loop through the string copying characters to "output" and
  // interpreting escape sequences.  Note that any invalid escape
  // sequences or other errors were already reported while tokenizing.
  // In this case we do not need to produce valid results.
  for (const char* ptr = text + 1; *ptr != '\0'; ptr++) {
    if (*ptr == '\\' && ptr[1] != '\0') {
      // An escape sequence.
      ++ptr;

      if (upb_Tokenizer_IsOctalDigit(*ptr)) {
        // An octal escape.  May be one, two, or three digits.
        int code = DigitValue(*ptr);
        if (upb_Tokenizer_IsOctalDigit(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        if (upb_Tokenizer_IsOctalDigit(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        upb_String_PushBack(&output, (char)code);

      } else if (*ptr == 'x') {
        // A hex escape.  May have zero, one, or two digits.  (The zero case
        // will have been caught as an error earlier.)
        int code = 0;
        if (upb_Tokenizer_IsHexDigit(ptr[1])) {
          ++ptr;
          code = DigitValue(*ptr);
        }
        if (upb_Tokenizer_IsHexDigit(ptr[1])) {
          ++ptr;
          code = code * 16 + DigitValue(*ptr);
        }
        upb_String_PushBack(&output, (char)code);

      } else if (*ptr == 'u' || *ptr == 'U') {
        uint32_t unicode;
        const char* end = FetchUnicodePoint(ptr, &unicode);
        if (end == ptr) {
          // Failure: Just dump out what we saw, don't try to parse it.
          upb_String_PushBack(&output, *ptr);
        } else {
          AppendUTF8(unicode, &output);
          ptr = end - 1;  // Because we're about to ++ptr.
        }
      } else {
        // Some other escape code.
        upb_String_PushBack(&output, TranslateEscape(*ptr));
      }

    } else if (*ptr == text[0] && ptr[1] == '\0') {
      // Ignore final quote matching the starting quote.
    } else {
      upb_String_PushBack(&output, *ptr);
    }
  }

  return upb_StringView_FromDataAndSize(upb_String_Data(&output),
                                        upb_String_Size(&output));
}

static bool AllInClass(bool (*f)(char), const char* text, int size) {
  for (int i = 0; i < size; i++) {
    if (!f(text[i])) return false;
  }
  return true;
}

bool upb_Tokenizer_IsIdentifier(const char* data, int size) {
  // Mirrors the identifier case in upb_Tokenizer_Next() above.
  if (size == 0) return false;
  if (!upb_Tokenizer_IsLetter(data[0])) return false;
  if (!AllInClass(upb_Tokenizer_IsAlphanumeric, data + 1, size - 1))
    return false;
  return true;
}

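// Creates a tokenizer that reads first from the initial buffer (data, size)
// and then from `input` once that buffer is exhausted.  `options` is a
// bitmask of kUpb_TokenizerOption_* flags.  Returns NULL if arena allocation
// fails.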
upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
                                 upb_ZeroCopyInputStream* input, int options,
                                 upb_Arena* arena) {
  upb_Tokenizer* t = upb_Arena_Malloc(arena, sizeof(upb_Tokenizer));
  if (!t) return NULL;

  t->input = input;
  t->arena = arena;
  t->buffer = data;
  t->buffer_size = size;
  t->buffer_pos = 0;
  t->read_error = false;
  t->line = 0;
  t->column = 0;
  t->record_target = NULL;
  t->record_start = -1;

  // ReportNewlines implies ReportWhitespace.
  if (options & kUpb_TokenizerOption_ReportNewlines) {
    options |= kUpb_TokenizerOption_ReportWhitespace;
  }
  t->options = options;

  upb_String_Init(&t->token_text, arena);
  t->token_type = kUpb_TokenType_Start;
  t->token_line = 0;
  t->token_column = 0;
  t->token_end_column = 0;

  t->previous_type = kUpb_TokenType_Start;
  t->previous_line = 0;
  t->previous_column = 0;
  t->previous_end_column = 0;

  if (size) {
    t->current_char = t->buffer[0];
  } else {
    Refresh(t);
  }
  return t;
}

void upb_Tokenizer_Fini(upb_Tokenizer* t) {
  // If we had any buffer left unread, return it to the underlying stream
  // so that someone else can read it.
  if (t->buffer_size > t->buffer_pos) {
    upb_ZeroCopyInputStream_BackUp(t->input, t->buffer_size - t->buffer_pos);
  }
}