• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2023 Google LLC.  All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7 
8 // Tokenizer for text read from a flat array and/or a upb_ZeroCopyInputStream.
9 
10 #ifndef UPB_IO_TOKENIZER_H_
11 #define UPB_IO_TOKENIZER_H_
12 
13 #include "upb/base/status.h"
14 #include "upb/base/string_view.h"
15 #include "upb/io/zero_copy_input_stream.h"
16 #include "upb/mem/arena.h"
17 
18 // Must be included last.
19 #include "upb/port/def.inc"
20 
21 #ifdef __cplusplus
22 extern "C" {
23 #endif
24 
25 typedef enum {  // Token kinds, as returned by upb_Tokenizer_Type().
26   kUpb_TokenType_Start,  // upb_Tokenizer_Next() has not yet been called.
27   kUpb_TokenType_End,    // End of input reached. "text" is empty.
28 
29   // A sequence of letters, digits, and underscores, not starting with a digit.
30   // It is an error for a number to be followed by an identifier with no space
31   // in between.
32   kUpb_TokenType_Identifier,
33 
34   // A sequence of digits representing an integer. Normally the digits are
35   // decimal, but a prefix of "0x" indicates a hex number and a leading zero
36   // indicates octal, just like with C numeric literals. A leading negative
37   // sign is NOT included in the token; it's up to the parser to interpret the
38   // unary minus operator on its own.
39   kUpb_TokenType_Integer,
40 
41   // A floating point literal, with a fractional part and/or an exponent.
42   // Always in decimal. Again, never negative.
43   kUpb_TokenType_Float,
44 
45   // A quoted sequence of escaped characters.
46   // Either single or double quotes can be used, but they must match.
47   // A string literal cannot cross a line break.
48   kUpb_TokenType_String,
49 
50   // Any other printable character, like '!' or '+'.
51   // Symbols are always a single character, so "!+$%" is four tokens.
52   kUpb_TokenType_Symbol,
53 
54   // A sequence of whitespace.
55   // Only produced if kUpb_TokenizerOption_ReportWhitespace is set.
56   // It is not reported for whitespace within comments or strings.
57   kUpb_TokenType_Whitespace,
58 
59   // A newline ('\n'). Only produced if kUpb_TokenizerOption_ReportNewlines
60   // is set (that option is a superset of ReportWhitespace).
61   // It is not reported for newlines in comments or strings.
62   kUpb_TokenType_Newline,
63 } upb_TokenType;
64 
65 typedef enum {  // Tokenizer option bit flags (each value is a distinct bit).
66   // Set to allow floats to be suffixed with the letter 'f'. Tokens which would
67   // otherwise be integers but which have the 'f' suffix will be forced to be
68   // interpreted as floats. For all other purposes, the 'f' is ignored.
69   kUpb_TokenizerOption_AllowFAfterFloat = 1 << 0,
70 
71   // If set, whitespace tokens are reported by upb_Tokenizer_Next().
72   kUpb_TokenizerOption_ReportWhitespace = 1 << 1,
73 
74   // If set, newline tokens are reported by upb_Tokenizer_Next().
75   // This is a superset of kUpb_TokenizerOption_ReportWhitespace.
76   kUpb_TokenizerOption_ReportNewlines = 1 << 2,
77 
78   // By default the tokenizer expects C-style (/* */) comments.
79   // If set, it expects shell-style (#) comments instead.
80   kUpb_TokenizerOption_CommentStyleShell = 1 << 3,
81 } upb_Tokenizer_Option;
82 
83 typedef struct upb_Tokenizer upb_Tokenizer;
84 
85 // Can be passed a flat array and/or a upb_ZeroCopyInputStream as input.
86 // The array will be read first (if non-NULL), then the stream (if non-NULL).
87 upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
88                                  upb_ZeroCopyInputStream* input, int options,
89                                  upb_Arena* arena);
90 
91 void upb_Tokenizer_Fini(upb_Tokenizer* t);
92 
93 // Advances the tokenizer to the next input token. Returns true on success.
94 // Returns false otherwise: *status is cleared on EOF and set on error.
95 bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status);
96 
97 // Accessors for inspecting current/previous parse tokens. The tokens are
98 // kept inside the opaque upb_Tokenizer rather than copied out to the caller.
99 
100 upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t);
101 int upb_Tokenizer_Column(const upb_Tokenizer* t);
102 int upb_Tokenizer_EndColumn(const upb_Tokenizer* t);
103 int upb_Tokenizer_Line(const upb_Tokenizer* t);
104 int upb_Tokenizer_TextSize(const upb_Tokenizer* t);
105 const char* upb_Tokenizer_TextData(const upb_Tokenizer* t);
106 
107 // External helper: validates an identifier (see kUpb_TokenType_Identifier).
108 bool upb_Tokenizer_IsIdentifier(const char* data, int size);
109 
110 // Parses a kUpb_TokenType_Integer token. Returns false if the result would
111 // be greater than max_value. Otherwise, returns true and sets *output to the
112 // result. If the text is not from a token of type kUpb_TokenType_Integer
113 // originally parsed by a upb_Tokenizer, the result is undefined (possibly an
114 // assert failure).
115 bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output);
116 
117 // Parses a kUpb_TokenType_Float token. This never fails, so long as the text
118 // actually comes from a kUpb_TokenType_Float token parsed by a upb_Tokenizer.
119 // If it doesn't, the result is undefined (possibly an assert failure).
120 double upb_Parse_Float(const char* text);
121 
122 // Parses a kUpb_TokenType_String token. This never fails, so long as the text
123 // actually comes from a kUpb_TokenType_String token parsed by a upb_Tokenizer.
124 // If it doesn't, the result is undefined (possibly an assert failure).
125 upb_StringView upb_Parse_String(const char* text, upb_Arena* arena);
126 
127 #ifdef __cplusplus
128 } /* extern "C" */
129 #endif
130 
131 #include "upb/port/undef.inc"
132 
133 #endif  // UPB_IO_TOKENIZER_H_
134