1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2023 Google LLC. All rights reserved. 3 // 4 // Use of this source code is governed by a BSD-style 5 // license that can be found in the LICENSE file or at 6 // https://developers.google.com/open-source/licenses/bsd 7 8 // Class for parsing tokenized text from a ZeroCopyInputStream. 9 10 #ifndef UPB_IO_TOKENIZER_H_ 11 #define UPB_IO_TOKENIZER_H_ 12 13 #include "upb/base/status.h" 14 #include "upb/base/string_view.h" 15 #include "upb/io/zero_copy_input_stream.h" 16 #include "upb/mem/arena.h" 17 18 // Must be included last. 19 #include "upb/port/def.inc" 20 21 #ifdef __cplusplus 22 extern "C" { 23 #endif 24 25 typedef enum { 26 kUpb_TokenType_Start, // Next() has not yet been called. 27 kUpb_TokenType_End, // End of input reached. "text" is empty. 28 29 // A sequence of letters, digits, and underscores, not starting with a digit. 30 // It is an error for a number to be followed by an identifier with no space 31 // in between. 32 kUpb_TokenType_Identifier, 33 34 // A sequence of digits representing an integer. Normally the digits are 35 // decimal, but a prefix of "0x" indicates a hex number and a leading zero 36 // indicates octal, just like with C numeric literals. A leading negative 37 // sign is NOT included in the token; it's up to the parser to interpret the 38 // unary minus operator on its own. 39 kUpb_TokenType_Integer, 40 41 // A floating point literal, with a fractional part and/or an exponent. 42 // Always in decimal. Again, never negative. 43 kUpb_TokenType_Float, 44 45 // A quoted sequence of escaped characters. 46 // Either single or double quotes can be used, but they must match. 47 // A string literal cannot cross a line break. 48 kUpb_TokenType_String, 49 50 // Any other printable character, like '!' or '+'. 51 // Symbols are always a single character, so "!+$%" is four tokens. 52 kUpb_TokenType_Symbol, 53 54 // A sequence of whitespace. 55 // This token type is only produced if report_whitespace() is true. 56 // It is not reported for whitespace within comments or strings. 57 kUpb_TokenType_Whitespace, 58 59 // A newline ('\n'). This token type is only produced if report_whitespace() 60 // is true and report_newlines() is also true. 61 // It is not reported for newlines in comments or strings. 62 kUpb_TokenType_Newline, 63 } upb_TokenType; 64 65 typedef enum { 66 // Set to allow floats to be suffixed with the letter 'f'. Tokens which would 67 // otherwise be integers but which have the 'f' suffix will be forced to be 68 // interpreted as floats. For all other purposes, the 'f' is ignored. 69 kUpb_TokenizerOption_AllowFAfterFloat = 1 << 0, 70 71 // If set, whitespace tokens are reported by Next(). 72 kUpb_TokenizerOption_ReportWhitespace = 1 << 1, 73 74 // If set, newline tokens are reported by Next(). 75 // This is a superset of ReportWhitespace. 76 kUpb_TokenizerOption_ReportNewlines = 1 << 2, 77 78 // By default the tokenizer expects C-style (/* */) comments. 79 // If set, it expects shell-style (#) comments instead. 80 kUpb_TokenizerOption_CommentStyleShell = 1 << 3, 81 } upb_Tokenizer_Option; 82 83 typedef struct upb_Tokenizer upb_Tokenizer; 84 85 // Can be passed a flat array and/or a ZCIS as input. 86 // The array will be read first (if non-NULL), then the stream (if non-NULL). 87 upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size, 88 upb_ZeroCopyInputStream* input, int options, 89 upb_Arena* arena); 90 91 void upb_Tokenizer_Fini(upb_Tokenizer* t); 92 93 // Advance the tokenizer to the next input token. Returns True on success. 94 // Returns False and (clears *status on EOF, sets *status on error). 95 bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status); 96 97 // Accessors for inspecting current/previous parse tokens, 98 // which are opaque to the tokenizer (to reduce copying). 99 100 upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t); 101 int upb_Tokenizer_Column(const upb_Tokenizer* t); 102 int upb_Tokenizer_EndColumn(const upb_Tokenizer* t); 103 int upb_Tokenizer_Line(const upb_Tokenizer* t); 104 int upb_Tokenizer_TextSize(const upb_Tokenizer* t); 105 const char* upb_Tokenizer_TextData(const upb_Tokenizer* t); 106 107 // External helper: validate an identifier. 108 bool upb_Tokenizer_IsIdentifier(const char* data, int size); 109 110 // Parses a TYPE_INTEGER token. Returns false if the result would be 111 // greater than max_value. Otherwise, returns true and sets *output to the 112 // result. If the text is not from a Token of type TYPE_INTEGER originally 113 // parsed by a Tokenizer, the result is undefined (possibly an assert 114 // failure). 115 bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output); 116 117 // Parses a TYPE_FLOAT token. This never fails, so long as the text actually 118 // comes from a TYPE_FLOAT token parsed by Tokenizer. If it doesn't, the 119 // result is undefined (possibly an assert failure). 120 double upb_Parse_Float(const char* text); 121 122 // Parses a TYPE_STRING token. This never fails, so long as the text actually 123 // comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the 124 // result is undefined (possibly an assert failure). 125 upb_StringView upb_Parse_String(const char* text, upb_Arena* arena); 126 127 #ifdef __cplusplus 128 } /* extern "C" */ 129 #endif 130 131 #include "upb/port/undef.inc" 132 133 #endif // UPB_IO_TOKENIZER_H_ 134